green-spider-api/main.py

import collections
import collections.abc
import csv
import io
import sys

from datetime import datetime
from os import getenv
from wsgiref import simple_server

import falcon
from falcon import media
import jsonhandler

from google.cloud import datastore
from elasticsearch import Elasticsearch

credentials_path = getenv('GCLOUD_DATASTORE_CREDENTIALS_PATH')
datastore_client = datastore.Client.from_service_account_json(credentials_path)

es = Elasticsearch([{'host': 'elasticsearch', 'port': 9200}])
es_doc_type = 'result'

spider_results_kind = 'spider-results'
webscreenshots_kind = 'webscreenshot'

es_index_name = spider_results_kind


def convert_datastore_datetime(field):
    """
    Returns a datetime for the given field, which the Datastore library
    may hand back as a str, int, or datetime.datetime.
    """
    dt = ''
    if type(field) == datetime:
        dt = field
    elif type(field) == int:
        dt = datetime.utcfromtimestamp(field / 1000000)
    elif type(field) == str:
        dt = datetime.utcfromtimestamp(int(field) / 1000000)
    return dt
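
# Illustrative example (made-up value, not from real data): an integer field
# of 1600000000000000 microseconds would be converted by the helper above to
# datetime(2020, 9, 13, 12, 26, 40) UTC, i.e. the value divided by 1e6 and
# interpreted as a Unix epoch timestamp.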


def flatten(d, parent_key='', sep='.'):
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        # collections.abc.MutableMapping (rather than collections.MutableMapping,
        # which was removed in Python 3.10) covers dict-like values.
        if isinstance(v, collections.abc.MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)
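
# For illustration: flatten({'a': {'b': 1}, 'c': 2}) returns {'a.b': 1, 'c': 2}.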


def simplify_rating(d):
    """
    Removes some keys from a flattened rating dict
    """
    keys_to_delete = []
    for key in d.keys():
        if key.endswith(".type") or key.endswith(".max_score"):
            keys_to_delete.append(key)
    for key in keys_to_delete:
        del d[key]
    return d
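
# For illustration (the criterion name is hypothetical): flattened keys such as
# 'rating.SOME_CRITERION.type' or 'rating.SOME_CRITERION.max_score' are dropped,
# while all other sub-keys are kept.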


def tablelize_checks(d):
    """
    Returns a dict with the check details we want to be contained
    in a table export.
    """
    out = {}

    # CMS names separated by space
    out['generator'] = " ".join(list(set([i for i in d['generator'].values() if i is not None])))

    # List of actual URLs crawled
    out['resulting_urls'] = ""
    if 'url_canonicalization' in d:
        out['resulting_urls'] = " ".join([i for i in d['url_canonicalization'] if i is not None])

    return out
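
# Sketch of the expected input shape (field values made up for illustration):
# given checks like {'generator': {'https://example.org/': 'wordpress'},
# 'url_canonicalization': ['https://example.org/']}, the function above yields
# {'generator': 'wordpress', 'resulting_urls': 'https://example.org/'}.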


def get_table_result(client):
    query = client.query(kind=spider_results_kind)

    out = []
    for entity in query.fetch(eventual=True):
        created = convert_datastore_datetime(entity.get('created'))

        record = {
            'input_url': entity.key.name,
            'created': created.isoformat(),
            'score': entity.get('score'),
        }

        record.update(flatten(entity.get('meta'), parent_key='meta'))
        record.update(simplify_rating(flatten(entity.get('rating'), parent_key='rating')))
        record.update(tablelize_checks(entity.get('checks')))

        out.append(record)

    return out
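
# Each record assembled above is one flat row per site: 'input_url', 'created'
# and 'score', plus flattened 'meta.*' and 'rating.*' keys and the 'generator'
# and 'resulting_urls' columns produced by tablelize_checks().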


class LastUpdated(object):

    def on_get(self, req, resp):
        """
        Informs about the most recent update to the spider results data
        """
        res = es.search(index=es_index_name,
                        _source_include=['created'],
                        body={"query": {"match_all": {}}},
                        sort='created:desc',
                        size=1)

        resp.media = {
            "last_updated": res['hits']['hits'][0]['_source']['created']
        }


class TableResults(object):

    def on_get(self, req, resp):
        """
        Returns results for all sites in tabular form (JSON or CSV)
        """
        out = get_table_result(datastore_client)

        maxage = 48 * 60 * 60  # two days
        resp.cache_control = ["max_age=%d" % maxage]

        if req.accept == 'text/csv':
            # return CSV
            headers = sorted(out[0].keys())
            with io.StringIO(newline='\n') as csvfile:
                writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
                writer.writerow(headers)
                for row in out:
                    o = []
                    for f in headers:
                        o.append(str(row[f]))
                    writer.writerow(o)

                resp.body = csvfile.getvalue()
                resp.content_type = 'text/csv'
                resp.status = falcon.HTTP_200
        else:
            resp.media = out


class SpiderResultsQuery(object):

    def on_get(self, req, resp):
        """
        Queries the ES index for sites matching a term
        """
        query_term = req.get_param('q', default='')
        from_num = req.get_param('from', default='0')

        try:
            from_num = int(from_num)
        except Exception:
            raise falcon.HTTPError(falcon.HTTP_400,
                                   'Bad request',
                                   'The parameter "from" must be an integer.')

        res = es.search(index=es_index_name,
                        _source_include=['created', 'meta', 'rating', 'score', 'url'],
                        body={
                            "query": {
                                "query_string": {
                                    "query": query_term,
                                    "default_operator": "AND",
                                }
                            }
                        },
                        from_=from_num,
                        size=20,
                        sort='score:desc')

        resp.media = {
            "hits": res['hits']
        }


class SpiderResultsCount(object):

    def on_get(self, req, resp):
        """
        Returns the number of items in the spider-results ES index
        """
        query_term = req.get_param('q')

        body = {"query": {"match_all": {}}}
        if query_term is not None:
            body = {
                "query": {
                    "bool": {
                        "must": {
                            "query_string": {
                                "query": query_term
                            }
                        }
                    }
                }
            }

        res = es.search(index=es_index_name, body=body, size=0)

        maxage = 5 * 60  # 5 minutes in seconds
        resp.cache_control = ["max_age=%d" % maxage]

        resp.media = {
            "count": res['hits']['total']
        }


class SiteDetails(object):

    def on_get(self, req, resp):
        """
        Returns details for one URL
        """
        url = req.get_param('url')
        if url is None or url == '':
            raise falcon.HTTPError(falcon.HTTP_400,
                                   'Bad request',
                                   'The parameter url must not be empty')

        entity = es.get(index=es_index_name, doc_type=es_doc_type, id=url)
        if entity is None:
            raise falcon.HTTPError(falcon.HTTP_404,
                                   'Not found',
                                   'A site with this URL does not exist')

        entity['_source']['url'] = entity['_id']

        maxage = 5 * 60  # 5 minutes in seconds
        resp.cache_control = ["max_age=%d" % maxage]

        resp.media = entity['_source']


class SiteScreenshots(object):

    def on_get(self, req, resp):
        """
        Returns screenshots for one URL
        """
        url = req.get_param('url')
        if url is None or url == '':
            raise falcon.HTTPError(falcon.HTTP_400,
                                   'Bad request',
                                   'The parameter url must not be empty')

        query = datastore_client.query(kind=webscreenshots_kind)
        query.add_filter('url', '=', req.get_param('url'))
        entities = list(query.fetch())

        maxage = 24 * 60 * 60  # 24 hours in seconds
        if len(entities) == 0:
            maxage = 3 * 60 * 60  # 3 hours in seconds

        resp.cache_control = ["max_age=%d" % maxage]
        resp.media = entities


class Index(object):

    def on_get(self, req, resp):
        resp.media = {
            "message": "This is green-spider-api",
            "url": "https://github.com/netzbegruenung/green-spider-api",
            "endpoints": [
                "/api/v1/spider-results/count/",
                "/api/v1/spider-results/last-updated/",
                "/api/v1/spider-results/table/",
                "/api/v1/spider-results/site",
                "/api/v1/screenshots/site",
            ]
        }


handlers = media.Handlers({
    'application/json': jsonhandler.JSONHandler(),
    'text/csv': media.BaseHandler,
})

app = falcon.API()
app.req_options.media_handlers = handlers
app.resp_options.media_handlers = handlers

app.add_route('/api/v1/spider-results/last-updated/', LastUpdated())
app.add_route('/api/v1/spider-results/table/', TableResults())
app.add_route('/api/v1/spider-results/query/', SpiderResultsQuery())
app.add_route('/api/v1/spider-results/count/', SpiderResultsCount())
app.add_route('/api/v1/spider-results/site', SiteDetails())
app.add_route('/api/v1/screenshots/site', SiteScreenshots())
app.add_route('/', Index())
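
# Example requests against a local instance (started via the __main__ block
# below; the query term and example URL are illustrative):
#
#   curl http://127.0.0.1:5000/api/v1/spider-results/last-updated/
#   curl http://127.0.0.1:5000/api/v1/spider-results/count/?q=example
#   curl "http://127.0.0.1:5000/api/v1/spider-results/site?url=https://example.org/"
#   curl -H 'Accept: text/csv' http://127.0.0.1:5000/api/v1/spider-results/table/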


if __name__ == '__main__':
    httpd = simple_server.make_server('127.0.0.1', 5000, app)
    httpd.serve_forever()