Mirror of https://github.com/netzbegruenung/green-spider-api.git
Synced 2024-05-08 03:33:41 +02:00
305 lines · 8.7 KiB · Python
import collections.abc
import csv
import io
from datetime import datetime
from os import getenv
from wsgiref import simple_server

import falcon
from falcon import media
import jsonhandler

from google.cloud import datastore
from elasticsearch import Elasticsearch, NotFoundError

# Google Cloud Datastore client; the service account JSON path
# comes from the environment.
credentials_path = getenv('GCLOUD_DATASTORE_CREDENTIALS_PATH')
datastore_client = datastore.Client.from_service_account_json(credentials_path)

es = Elasticsearch([{'host': 'elasticsearch', 'port': 9200}])

es_doc_type = 'result'
spider_results_kind = 'spider-results'
webscreenshots_kind = 'webscreenshot'

es_index_name = spider_results_kind


def convert_datastore_datetime(field):
    """
    Returns a datetime, regardless of whether the datastore client
    delivers the field as str, int, or datetime.datetime. Numeric
    values are interpreted as microseconds since the epoch.
    """
    dt = ''
    if isinstance(field, datetime):
        dt = field
    elif isinstance(field, int):
        dt = datetime.utcfromtimestamp(field / 1000000)
    elif isinstance(field, str):
        dt = datetime.utcfromtimestamp(int(field) / 1000000)
    return dt
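
# Example: convert_datastore_datetime(1514764800000000) and
# convert_datastore_datetime('1514764800000000') both yield
# datetime(2018, 1, 1, 0, 0).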


def flatten(d, parent_key='', sep='.'):
    """
    Flattens a nested dict into a single level, joining keys with sep.
    """
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, collections.abc.MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)
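
# Example: flatten({'a': {'b': 1}, 'c': 2}) returns {'a.b': 1, 'c': 2}.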


def simplify_rating(d):
    """
    Removes some keys from a flattened rating dict
    """
    keys_to_delete = []
    for key in d.keys():
        if key.endswith(".type") or key.endswith(".max_score"):
            keys_to_delete.append(key)

    for key in keys_to_delete:
        del d[key]

    return d
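
# Illustrative example (hypothetical criterion name 'FEEDS'):
#   simplify_rating({'FEEDS.score': 1, 'FEEDS.type': 'boolean',
#                    'FEEDS.max_score': 1})
# returns {'FEEDS.score': 1}.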


def tablelize_checks(d):
    """
    Returns a dict with the check details we want to be contained
    in a table export.
    """
    out = {}

    # CMS names separated by space
    out['generator'] = " ".join(list(set([i for i in d['generator'].values() if i is not None])))

    # List of actual URLs crawled
    out['resulting_urls'] = ""
    if 'url_canonicalization' in d:
        out['resulting_urls'] = " ".join([i for i in d['url_canonicalization'] if i is not None])

    return out
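
# Illustrative example (hypothetical check values):
#   tablelize_checks({
#       'generator': {'https://example.com/': 'wordpress',
#                     'https://example.com/start': None},
#       'url_canonicalization': ['https://example.com/start'],
#   })
# returns {'generator': 'wordpress',
#          'resulting_urls': 'https://example.com/start'}.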


def get_table_result(client):
    query = client.query(kind=spider_results_kind)

    out = []
    # eventual=True trades strong consistency for faster reads.
    for entity in query.fetch(eventual=True):
        created = convert_datastore_datetime(entity.get('created'))

        record = {
            'input_url': entity.key.name,
            'created': created.isoformat(),
            'score': entity.get('score'),
        }

        record.update(flatten(entity.get('meta'), parent_key='meta'))
        record.update(simplify_rating(flatten(entity.get('rating'), parent_key='rating')))
        record.update(tablelize_checks(entity.get('checks')))

        out.append(record)
    return out
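
# Illustrative record shape (values hypothetical; the flattened 'meta.*' and
# 'rating.*' keys depend on what the spider stored):
# {
#     'input_url': 'https://example.com/',
#     'created': '2018-01-01T00:00:00',
#     'score': 10.5,
#     'meta.title': '...',
#     'rating.FEEDS.score': 1,
#     'generator': 'wordpress',
#     'resulting_urls': 'https://example.com/start',
# }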


class LastUpdated(object):

    def on_get(self, req, resp):
        """
        Informs about the most recent update to the spider results data
        """
        res = es.search(index=es_index_name,
                        _source_include=['created'],
                        body={"query": {"match_all": {}}},
                        sort='created:desc',
                        size=1)

        resp.media = {
            "last_updated": res['hits']['hits'][0]['_source']['created']
        }
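
# Response shape: {"last_updated": <'created' value of the newest document>}.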


class TableResults(object):

    def on_get(self, req, resp):
        """
        Returns the results for all sites as one big table
        """
        out = get_table_result(datastore_client)

        maxage = 48 * 60 * 60  # two days in seconds
        resp.cache_control = ["max-age=%d" % maxage]
        if req.accept == 'text/csv':
            # return CSV
            # Collect the union of all field names, since not every
            # record has every field.
            headers = sorted({key for row in out for key in row.keys()})

            with io.StringIO(newline='\n') as csvfile:
                writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
                writer.writerow(headers)
                for row in out:
                    o = []
                    for f in headers:
                        o.append(str(row.get(f, '')))
                    writer.writerow(o)
                resp.body = csvfile.getvalue()
            resp.content_type = 'text/csv'
            resp.status = falcon.HTTP_200
        else:
            resp.media = out
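
# Example (illustrative, assuming a local instance on port 5000):
#   curl -H "Accept: text/csv" http://localhost:5000/api/v1/spider-results/table/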


class SpiderResultsQuery(object):

    def on_get(self, req, resp):
        """
        Queries the ES index for sites matching a term
        """
        query_term = req.get_param('q', default='')
        from_num = req.get_param('from', default='0')

        try:
            from_num = int(from_num)
        except Exception:
            raise falcon.HTTPError(falcon.HTTP_400,
                                   'Bad request',
                                   'The parameter "from" must be an integer.')

        res = es.search(index=es_index_name,
                        _source_include=['created', 'meta', 'rating', 'score', 'url'],
                        body={
                            "query": {
                                "query_string": {
                                    "query": query_term,
                                    "default_operator": "AND",
                                }
                            }
                        },
                        from_=from_num,
                        size=20,
                        sort='score:desc')
        resp.media = {
            "hits": res['hits']
        }
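
# The 'q' parameter takes Elasticsearch query_string syntax. Example
# (illustrative): GET /api/v1/spider-results/query/?q=green&from=20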


class SpiderResultsCount(object):

    def on_get(self, req, resp):
        """
        Returns the number of items in the spider-results ES index
        """
        query_term = req.get_param('q')
        body = {"query": {"match_all": {}}}
        if query_term is not None:
            body = {
                "query": {
                    "bool": {
                        "must": {
                            "query_string": {
                                "query": query_term
                            }
                        }
                    }
                }
            }

        res = es.search(index=es_index_name, body=body, size=0)

        maxage = 5 * 60  # 5 minutes in seconds
        resp.cache_control = ["max-age=%d" % maxage]
        resp.media = {
            "count": res['hits']['total']
        }
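
# Response shape: {"count": <number of matching documents>}.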


class SiteDetails(object):

    def on_get(self, req, resp):
        """
        Returns details for one URL
        """
        url = req.get_param('url')
        if url is None or url == '':
            raise falcon.HTTPError(falcon.HTTP_400,
                                   'Bad request',
                                   'The parameter url must not be empty')

        # es.get raises NotFoundError for a missing document instead of
        # returning None, so map that to a 404 explicitly.
        try:
            entity = es.get(index=es_index_name, doc_type=es_doc_type, id=url)
        except NotFoundError:
            raise falcon.HTTPError(falcon.HTTP_404,
                                   'Not found',
                                   'A site with this URL does not exist')

        entity['_source']['url'] = entity['_id']

        maxage = 5 * 60  # 5 minutes in seconds
        resp.cache_control = ["max-age=%d" % maxage]
        resp.media = entity['_source']


class SiteScreenshots(object):

    def on_get(self, req, resp):
        """
        Returns screenshots for one URL
        """
        url = req.get_param('url')
        if url is None or url == '':
            raise falcon.HTTPError(falcon.HTTP_400,
                                   'Bad request',
                                   'The parameter url must not be empty')

        query = datastore_client.query(kind=webscreenshots_kind)
        query.add_filter('url', '=', url)
        entities = list(query.fetch())

        maxage = 24 * 60 * 60  # 24 hours in seconds
        if len(entities) == 0:
            maxage = 3 * 60 * 60  # 3 hours in seconds

        resp.cache_control = ["max-age=%d" % maxage]
        resp.media = entities


class Index(object):

    def on_get(self, req, resp):
        resp.media = {
            "message": "This is green-spider-api",
            "url": "https://github.com/netzbegruenung/green-spider-api",
            "endpoints": [
                "/api/v1/spider-results/count/",
                "/api/v1/spider-results/last-updated/",
                "/api/v1/spider-results/query/",
                "/api/v1/spider-results/table/",
                "/api/v1/spider-results/site",
                "/api/v1/screenshots/site",
            ]
        }


handlers = media.Handlers({
    'application/json': jsonhandler.JSONHandler(),
    'text/csv': media.BaseHandler,
})

app = falcon.API()

app.req_options.media_handlers = handlers
app.resp_options.media_handlers = handlers

app.add_route('/api/v1/spider-results/last-updated/', LastUpdated())
app.add_route('/api/v1/spider-results/table/', TableResults())
app.add_route('/api/v1/spider-results/query/', SpiderResultsQuery())
app.add_route('/api/v1/spider-results/count/', SpiderResultsCount())
app.add_route('/api/v1/spider-results/site', SiteDetails())
app.add_route('/api/v1/screenshots/site', SiteScreenshots())
app.add_route('/', Index())
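
# Example usage (illustrative, assuming the development server below):
#   curl http://localhost:5000/
#   curl http://localhost:5000/api/v1/spider-results/count/
#   curl "http://localhost:5000/api/v1/spider-results/site?url=https://example.com/"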


if __name__ == '__main__':
    # wsgiref's simple_server is fine for development, not for production.
    httpd = simple_server.make_server('127.0.0.1', 5000, app)
    httpd.serve_forever()