Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 28 additions & 22 deletions diffbot.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Diffbot API wrapper."""
"""Diffbot API wrapper. Edited Lee H"""
import argparse
import json
import os
Expand Down Expand Up @@ -30,10 +30,13 @@ def __init__(self, token, version=API_VERSION):
self._version = version

@staticmethod
def _get(url, headers=None, params=None):
    """HTTP GET request.

    :param url: endpoint URL to fetch.
    :param headers: optional dict of extra HTTP headers.
    :param params: optional dict of query-string parameters.
    :returns: decoded JSON payload, or raw text when the body is not
        JSON (e.g. when downloading CSV job logs).
    """
    try:
        # Fix: the headerless branch called ``response.get`` on a name that
        # is not yet bound (NameError at runtime), so every headerless GET
        # silently fell through to the urllib2 fallback below -- which does
        # not exist on Python 3.  ``requests.get(..., headers=None)`` is
        # equivalent to sending no extra headers, so a single call suffices.
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        # If JSON fails, return raw data
        # (e.g. when downloading CSV job logs).
        try:
            return response.json()
        except ValueError:
            return response.text
    except NameError:
        # ``requests`` is not installed -- fall back to urllib2 (Python 2).
        # NOTE(review): this fallback ignores ``headers``; body reconstructed
        # from a collapsed diff hunk -- confirm against the full file.
        if params:
            url = '{0}?{1}'.format(url, urllib.urlencode(params))
        return json.loads(urllib2.urlopen(url).read().decode(ENCODING))

@staticmethod
def _post(url, data, headers=None, params=None):
    """HTTP POST request.

    :param url: endpoint URL.
    :param data: request body (the text or HTML payload).
    :param headers: optional dict of HTTP headers (e.g. ``Content-Type``).
    :param params: optional dict of query-string parameters.
    :returns: decoded JSON response.
    """
    try:
        response = requests.post(url, params=params, data=data,
                                 headers=headers)
        response.raise_for_status()
        return response.json()
    except NameError:
        # ``requests`` is not installed -- fall back to urllib2 (Python 2).
        url = '{0}?{1}'.format(url, urllib.urlencode(params))
        # Fix: urllib2.Request requires a dict for ``headers``; the new
        # default of ``None`` would raise AttributeError inside Request.
        req = urllib2.Request(url, data.encode(ENCODING), headers or {})
        return json.loads(urllib2.urlopen(req).read().decode(ENCODING))

def endpoint(self, name, no_render=False):
    """Build the full URL for the named Diffbot API.

    :param name: API name, e.g. ``'article'`` or ``'analyze'``.
    :param no_render: when True, append ``?norender`` to turn off
        JavaScript rendering and speed up processing.
    :returns: the endpoint URL string.
    """
    suffix = '?norender' if no_render else ''
    return '{0}/v{1}/{2}'.format(API_ROOT, self._version, name) + suffix

def api(self, name, url, **kwargs):
"""Generic API method."""
Expand All @@ -74,6 +77,7 @@ def api(self, name, url, **kwargs):
timeout = kwargs.get('timeout')
text = kwargs.get('text')
html = kwargs.get('html')
headers = kwargs.get('headers')
if text and html:
raise ValueError(u'Both `text` and `html` arguments provided!')
params = {'url': url, 'token': self._token}
Expand All @@ -86,8 +90,12 @@ def api(self, name, url, **kwargs):
url = self.endpoint(name)
if text or html:
content_type = html and 'text/html' or 'text/plain'
return self._post(url, text or html, content_type, params=params)
return self._get(url, params=params)
headers_cust = {'Content-Type': content_type}
if headers:
headers_cust = headers.copy()
headers_cust['Content-Type'] = content_type
return self._post(url, text or html, headers=headers_cust, params=params)
return self._get(url, headers=headers, params=params)

def article(self, url, **kwargs):
"""Article API."""
Expand Down Expand Up @@ -122,13 +130,11 @@ def crawl(self, urls, name='crawl', api='analyze', **kwargs):
if isinstance(urls, list):
urls = ' '.join(urls)
url = self.endpoint('crawl')
process_url = self.endpoint(api)
params = {
'token': self._token,
'seeds': urls,
'name': name,
'apiUrl': process_url,
}
process_url = self.endpoint(api, no_render=kwargs.get('no_render', False))
params = {'token': self._token,
'seeds': urls,
'name': name,
'apiUrl': process_url,}

# Add any additional named parameters as accepted by Crawlbot
params['maxToCrawl'] = 10
Expand All @@ -153,7 +159,7 @@ def __init__(self, token, name, version=API_VERSION):
def control(self, **kwargs):
    """Issue a control request for this crawl job and return its record.

    :param kwargs: extra query parameters merged into the request.
    :returns: the job dict whose ``name`` matches this job's name.
    :raises StopIteration: if the response lists no job with that name.
    """
    query = {'token': self._token, 'name': self._name}
    for key, value in kwargs.items():
        query[key] = value
    payload = self._get(self._url, params=query)
    matches = (job for job in payload['jobs'] if job['name'] == self._name)
    return next(matches)

Expand Down