Web service API for Green Spider
https://github.com/netzbegruenung/green-spider
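
"""
green-spider-api: web service API for Green Spider.

Serves spider results from Google Cloud Datastore and an
Elasticsearch index through a Falcon WSGI application.
"""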
import collections.abc
import csv
import io
from datetime import datetime
from os import getenv
from wsgiref import simple_server

import falcon
from falcon import media
import jsonhandler

from google.cloud import datastore
from elasticsearch import Elasticsearch, NotFoundError

credentials_path = getenv('GCLOUD_DATASTORE_CREDENTIALS_PATH')
datastore_client = datastore.Client.from_service_account_json(credentials_path)

# The Elasticsearch host name "elasticsearch" is hard-coded and is
# expected to resolve within the deployment environment.
es = Elasticsearch([{'host': 'elasticsearch', 'port': 9200}])

es_doc_type = 'result'
spider_results_kind = 'spider-results'
webscreenshots_kind = 'webscreenshot'

# Spider results are indexed in Elasticsearch under the same name
# as the Datastore kind.
es_index_name = spider_results_kind


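# A quick illustration of the converter below (values are examples only):
# 1514764800000000 (int microseconds), '1514764800000000' (str), and
# datetime(2018, 1, 1) all map to 2018-01-01 00:00:00 UTC.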
def convert_datastore_datetime(field):
    """
    Returns a datetime, regardless of whether the datastore client
    hands the field over as a str, int (microsecond timestamp), or
    datetime.datetime.
    """
    dt = ''
    if isinstance(field, datetime):
        dt = field
    elif isinstance(field, int):
        dt = datetime.utcfromtimestamp(field / 1000000)
    elif isinstance(field, str):
        dt = datetime.utcfromtimestamp(int(field) / 1000000)
    return dt


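# Example of the flattening behavior (illustrative input):
#   flatten({'a': {'b': 1}, 'c': 2}) -> {'a.b': 1, 'c': 2}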
def flatten(d, parent_key='', sep='.'):
    """
    Flattens a nested dict, joining nested keys with `sep`.
    """
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, collections.abc.MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)


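# Example (hypothetical keys): a flattened rating dict like
#   {'rating.X.score': 1, 'rating.X.max_score': 1, 'rating.X.type': 'boolean'}
# is reduced to {'rating.X.score': 1}.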
def simplify_rating(d):
    """
    Removes some keys from a flattened rating dict
    """
    keys_to_delete = []
    for key in d.keys():
        if key.endswith(".type") or key.endswith(".max_score"):
            keys_to_delete.append(key)

    for key in keys_to_delete:
        del d[key]

    return d


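# The checks dict is reduced to two flat, table-friendly fields:
# the detected CMS generators and the list of URLs actually crawled.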
def tablelize_checks(d):
    """
    Returns a dict with the check details that should be contained
    in a table export.
    """
    out = {}

    # CMS names, separated by spaces
    out['generator'] = " ".join(list(set([i for i in d['generator'].values() if i is not None])))

    # List of actual URLs crawled
    out['resulting_urls'] = ""
    if 'url_canonicalization' in d:
        out['resulting_urls'] = " ".join([i for i in d['url_canonicalization'] if i is not None])

    return out


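# Builds one flat record per spider result entity. Each record combines
# the entity key (the input URL), creation time, and total score with the
# flattened meta/rating/checks details prepared above.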
def get_table_result(client):
    query = client.query(kind=spider_results_kind)

    out = []
    for entity in query.fetch(eventual=True):
        created = convert_datastore_datetime(entity.get('created'))

        record = {
            'input_url': entity.key.name,
            'created': created.isoformat(),
            'score': entity.get('score'),
        }

        record.update(flatten(entity.get('meta'), parent_key='meta'))
        record.update(simplify_rating(flatten(entity.get('rating'), parent_key='rating')))
        record.update(tablelize_checks(entity.get('checks')))

        out.append(record)
    return out


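# Usage (illustrative): GET /api/v1/spider-results/last-updated/
# responds with {"last_updated": "<ISO 8601 timestamp>"}.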
class LastUpdated(object):

    def on_get(self, req, resp):
        """
        Informs about the most recent update to the spider results data
        """
        res = es.search(index=es_index_name,
                        _source_include=['created'],
                        body={"query": {"match_all": {}}},
                        sort='created:desc',
                        size=1)

        resp.media = {
            "last_updated": res['hits']['hits'][0]['_source']['created']
        }


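# Usage (illustrative): GET /api/v1/spider-results/table/ returns all
# records as JSON; with the request header "Accept: text/csv" the same
# data is returned as CSV.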
class TableResults(object):

    def on_get(self, req, resp):
        """
        Returns the results for all sites as one big table
        """
        out = get_table_result(datastore_client)

        maxage = 48 * 60 * 60  # two days in seconds
        resp.cache_control = ["max-age=%d" % maxage]
        if req.accept == 'text/csv':
            # return CSV
            headers = sorted(out[0].keys())

            with io.StringIO(newline='\n') as csvfile:
                writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
                writer.writerow(headers)
                for row in out:
                    o = []
                    for f in headers:
                        # row.get guards against records missing a column
                        o.append(str(row.get(f, '')))
                    writer.writerow(o)
                resp.body = csvfile.getvalue()

            resp.content_type = 'text/csv'
            resp.status = falcon.HTTP_200
        else:
            resp.media = out


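# Usage (illustrative): GET /api/v1/spider-results/query/?q=<term>&from=20
# returns up to 20 hits, sorted by score in descending order.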
class SpiderResultsQuery(object):

    def on_get(self, req, resp):
        """
        Queries the ES index for sites matching a term
        """
        query_term = req.get_param('q', default='')
        from_num = req.get_param('from', default='0')

        try:
            from_num = int(from_num)
        except Exception:
            raise falcon.HTTPError(falcon.HTTP_400,
                                   'Bad request',
                                   'The parameter "from" must be an integer.')

        res = es.search(index=es_index_name,
                        _source_include=['created', 'meta', 'rating', 'score', 'url'],
                        body={
                            "query": {
                                "query_string": {
                                    "query": query_term,
                                    "default_operator": "AND",
                                }
                            }
                        },
                        from_=from_num,
                        size=20,
                        sort='score:desc')
        resp.media = {
            "hits": res['hits']
        }


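# Usage (illustrative): GET /api/v1/spider-results/count/ returns the
# total number of indexed results; an optional ?q=<term> restricts the
# count to matching documents.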
class SpiderResultsCount(object):

    def on_get(self, req, resp):
        """
        Returns the number of items in the spider-results ES index
        """
        query_term = req.get_param('q')
        body = {"query": {"match_all": {}}}
        if query_term is not None:
            body = {
                "query": {
                    "bool": {
                        "must": {
                            "query_string": {
                                "query": query_term
                            }
                        }
                    }
                }
            }

        res = es.search(index=es_index_name, body=body, size=0)

        maxage = 5 * 60  # 5 minutes in seconds
        resp.cache_control = ["max-age=%d" % maxage]
        resp.media = {
            "count": res['hits']['total']
        }


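# Usage (illustrative): GET /api/v1/spider-results/site?url=<input URL>
# returns the stored result document for that site.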
class SiteDetails(object):

    def on_get(self, req, resp):
        """
        Returns details for one URL
        """

        url = req.get_param('url')
        if url is None or url == '':
            raise falcon.HTTPError(falcon.HTTP_400,
                                   'Bad request',
                                   'The parameter url must not be empty')

        # es.get raises NotFoundError for an unknown ID rather than
        # returning None, so a missing document is handled here.
        try:
            entity = es.get(index=es_index_name, doc_type=es_doc_type, id=url)
        except NotFoundError:
            raise falcon.HTTPError(falcon.HTTP_404,
                                   'Not found',
                                   'A site with this URL does not exist')

        entity['_source']['url'] = entity['_id']

        maxage = 5 * 60  # 5 minutes in seconds
        resp.cache_control = ["max-age=%d" % maxage]
        resp.media = entity['_source']


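# Usage (illustrative): GET /api/v1/screenshots/site?url=<input URL>
# returns the screenshot records stored for that site.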
class SiteScreenshots(object):

    def on_get(self, req, resp):
        """
        Returns screenshots for one URL
        """

        url = req.get_param('url')
        if url is None or url == '':
            raise falcon.HTTPError(falcon.HTTP_400,
                                   'Bad request',
                                   'The parameter url must not be empty')

        query = datastore_client.query(kind=webscreenshots_kind)
        query.add_filter('url', '=', url)
        entities = list(query.fetch())

        # Empty results are cached for a shorter time, presumably so
        # that newly created screenshots show up sooner.
        maxage = 24 * 60 * 60  # 24 hours in seconds
        if len(entities) == 0:
            maxage = 3 * 60 * 60  # 3 hours in seconds

        resp.cache_control = ["max-age=%d" % maxage]
        resp.media = entities


class Index(object):
    def on_get(self, req, resp):
        resp.media = {
            "message": "This is green-spider-api",
            "url": "https://github.com/netzbegruenung/green-spider-api",
            "endpoints": [
                "/api/v1/spider-results/count/",
                "/api/v1/spider-results/last-updated/",
                "/api/v1/spider-results/table/",
                "/api/v1/spider-results/site",
                "/api/v1/screenshots/site",
            ]
        }


handlers = media.Handlers({
    'application/json': jsonhandler.JSONHandler(),
    # The CSV branch sets resp.body and resp.content_type directly, so
    # this entry only registers the content type with falcon.
    'text/csv': media.BaseHandler,
})

app = falcon.API()

app.req_options.media_handlers = handlers
app.resp_options.media_handlers = handlers

app.add_route('/api/v1/spider-results/last-updated/', LastUpdated())
app.add_route('/api/v1/spider-results/table/', TableResults())
app.add_route('/api/v1/spider-results/query/', SpiderResultsQuery())
app.add_route('/api/v1/spider-results/count/', SpiderResultsCount())
app.add_route('/api/v1/spider-results/site', SiteDetails())
app.add_route('/api/v1/screenshots/site', SiteScreenshots())
app.add_route('/', Index())


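# Entry point for local development: serves the app on 127.0.0.1:5000
# via wsgiref. In a deployment, the `app` object would typically be run
# by a dedicated WSGI server instead (assumption; not specified here).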
if __name__ == '__main__':

    httpd = simple_server.make_server('127.0.0.1', 5000, app)
    httpd.serve_forever()