diff --git a/Dockerfile b/Dockerfile
index 67ed079..8f3681c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,8 @@
 FROM python:3.6.7-slim-jessie
+
 ADD requirements.txt /
 
+RUN pip install --upgrade pip
 RUN pip install --no-cache-dir -r requirements.txt
 
 ADD jsonhandler.py /
diff --git a/main.py b/main.py
index 0860cff..ab2e7bc 100644
--- a/main.py
+++ b/main.py
@@ -9,13 +9,18 @@ from falcon import media
 import jsonhandler
 
 from google.cloud import datastore
+from elasticsearch import Elasticsearch
 
 credentials_path = getenv('GCLOUD_DATASTORE_CREDENTIALS_PATH')
 datastore_client = datastore.Client.from_service_account_json(credentials_path)
 
+es = Elasticsearch([{'host': 'elasticsearch', 'port': 9200}])
+
+es_doc_type = 'result'
 spider_results_kind = 'spider-results'
 webscreenshots_kind = 'webscreenshot'
+es_index_name = spider_results_kind
 
 
 def convert_datastore_datetime(field):
     """
@@ -43,25 +48,6 @@ def flatten(d, parent_key='', sep='.'):
     return dict(items)
 
 
-def get_compact_results(client):
-    query = client.query(kind=spider_results_kind,
-                         order=['-created'],
-                         #projection=['created', 'meta', 'score'],
-                         )
-
-    out = []
-    for entity in query.fetch(eventual=True):
-        created = convert_datastore_datetime(entity.get('created'))
-
-        out.append({
-            'input_url': entity.key.name,
-            'created': created.isoformat(),
-            'meta': entity.get('meta'),
-            'score': entity.get('score'),
-        })
-    return out
-
-
 def simplify_rating(d):
     """
     Removes some keys from a flattened rating dict
     """
@@ -122,33 +108,17 @@ class LastUpdated(object):
         """
         Informs about the most recent update to the spider results data
         """
-        query = datastore_client.query(kind=spider_results_kind,
-                                       order=['-created'],
-                                       projection=['created'])
-        items = list(query.fetch(limit=1, eventual=True))
-        ts = int(items[0].get('created')) / 1000000
-        dt = datetime.utcfromtimestamp(ts).isoformat()
+        res = es.search(index=es_index_name,
+                        _source_include=['created'],
+                        body={"query": {"match_all": {}}},
+                        sort='created:desc',
+                        size=1)
 
-        maxage = 60 * 60 # one hour in seconds
-        resp.cache_control = ["max_age=%d" % maxage]
         resp.media = {
-            "last_updated": dt
+            "last_updated": res['hits']['hits'][0]['_source']['created']
         }
 
 
-class CompactResults(object):
-
-    def on_get(self, req, resp):
-        """
-        Returns compact sites overview and score
-        """
-        out = get_compact_results(datastore_client)
-
-        maxage = 6 * 60 * 60 # six hours in seconds
-        resp.cache_control = ["max_age=%d" % maxage]
-        resp.media = out
-
-
 class TableResults(object):
 
     def on_get(self, req, resp):
@@ -162,6 +132,70 @@ class TableResults(object):
         resp.media = out
 
 
+class SpiderResultsQuery(object):
+
+    def on_get(self, req, resp):
+        """
+        Queries the ES index for sites matching a term
+        """
+        query_term = req.get_param('q', default='')
+        from_num = req.get_param('from', default='0')
+
+        try:
+            from_num = int(from_num)
+        except Exception:
+            raise falcon.HTTPError(falcon.HTTP_400,
+                                   'Bad request',
+                                   'The parameter "from" must be an integer.')
+
+        res = es.search(index=es_index_name,
+                        _source_include=['created', 'meta', 'rating', 'score', 'url'],
+                        body={
+                            "query": {
+                                "query_string": {
+                                    "query": query_term,
+                                    "default_operator": "AND",
+                                }
+                            }
+                        },
+                        from_=from_num,
+                        size=20,
+                        sort='score:desc')
+        resp.media = {
+            "hits": res['hits']
+        }
+
+
+class SpiderResultsCount(object):
+
+    def on_get(self, req, resp):
+        """
+        Returns the number of items in the spider-results ES index
+        """
+        query_term = req.get_param('q')
+        body = {"query": {"match_all" : {}}}
+        if query_term is not None:
+            body = {
+                "query": {
+                    "bool" : {
+                        "must" : {
+                            "query_string" : {
+                                "query" : query_term
+                            }
+                        }
+                    }
+                }
+            }
+
+        res = es.search(index=es_index_name, body=body, size=0)
+
+        maxage = 5 * 60 # 5 minutes in seconds
+        resp.cache_control = ["max_age=%d" % maxage]
+        resp.media = {
+            "count": res['hits']['total']
+        }
+
+
 class SiteDetails(object):
 
     def on_get(self, req, resp):
@@ -175,16 +209,17 @@ class SiteDetails(object):
                                    'Bad request',
                                    'The parameter url must not be empty')
 
-        key = datastore_client.key(spider_results_kind, req.get_param('url'))
-        entity = datastore_client.get(key)
+        entity = es.get(index=es_index_name, doc_type=es_doc_type, id=url)
         if entity is None:
             raise falcon.HTTPError(falcon.HTTP_404,
                                    'Not found',
                                    'A site with this URL does not exist')
 
-        maxage = 24 * 60 * 60 # 24 hours in seconds
+        entity['_source']['url'] = entity['_id']
+
+        maxage = 5 * 60 # 5 minutes in seconds
         resp.cache_control = ["max_age=%d" % maxage]
-        resp.media = dict(entity)
+        resp.media = entity['_source']
 
 
 class SiteScreenshots(object):
@@ -218,9 +253,9 @@ class Index(object):
             "message": "This is green-spider-api",
             "url": "https://github.com/netzbegruenung/green-spider-api",
             "endpoints": [
+                "/api/v1/spider-results/count/",
                 "/api/v1/spider-results/last-updated/",
-                "/api/v1/spider-results/big/",
-                "/api/v1/spider-results/compact/",
+                "/api/v1/spider-results/table/",
                 "/api/v1/spider-results/site",
                 "/api/v1/screenshots/site",
             ]
@@ -236,8 +271,9 @@ app.req_options.media_handlers = handlers
 app.resp_options.media_handlers = handlers
 
 app.add_route('/api/v1/spider-results/last-updated/', LastUpdated())
-app.add_route('/api/v1/spider-results/compact/', CompactResults())
 app.add_route('/api/v1/spider-results/table/', TableResults())
+app.add_route('/api/v1/spider-results/query/', SpiderResultsQuery())
+app.add_route('/api/v1/spider-results/count/', SpiderResultsCount())
 app.add_route('/api/v1/spider-results/site', SiteDetails())
 app.add_route('/api/v1/screenshots/site', SiteScreenshots())
 app.add_route('/', Index())
diff --git a/requirements.txt b/requirements.txt
index ead59c1..218555f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ astroid==2.0.4
 cachetools==2.1.0
 certifi==2018.10.15
 chardet==3.0.4
+elasticsearch==6.3.1
 falcon==1.4.1
 google-api-core==1.5.1
 google-auth==1.5.1
@@ -15,8 +16,8 @@ isort==4.3.4
 lazy-object-proxy==1.3.1
 mccabe==0.6.1
 protobuf==3.6.1
-pyasn1==0.4.4
 pyasn1-modules==0.2.2
+pyasn1==0.4.4
 pylint==2.1.1
 python-mimeparse==1.6.0
 pytz==2018.7