green-spider-api/main.py

import collections
import collections.abc
import csv
import io
import sys

from datetime import datetime
from os import getenv
from wsgiref import simple_server

import falcon
from falcon import media
import jsonhandler

from google.cloud import datastore
from elasticsearch import Elasticsearch

credentials_path = getenv('GCLOUD_DATASTORE_CREDENTIALS_PATH')
datastore_client = datastore.Client.from_service_account_json(credentials_path)

es = Elasticsearch([{'host': 'elasticsearch', 'port': 9200}])
es_doc_type = 'result'

spider_results_kind = 'spider-results'
webscreenshots_kind = 'webscreenshot'

es_index_name = spider_results_kind


def convert_datastore_datetime(field):
    """
    Returns a datetime for the given field, which the Datastore library
    may hand back as a str, int, or datetime.datetime.
    """
    dt = ''
    if type(field) == datetime:
        dt = field
    elif type(field) == int:
        dt = datetime.utcfromtimestamp(field / 1000000)
    elif type(field) == str:
        dt = datetime.utcfromtimestamp(int(field) / 1000000)
    return dt
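
# Illustrative example (made-up value, not from real data): an integer field
# of 1600000000000000 microseconds would be converted by the helper above to
# datetime(2020, 9, 13, 12, 26, 40) UTC, i.e. the value divided by 1e6 and
# interpreted as a Unix epoch timestamp.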


def flatten(d, parent_key='', sep='.'):
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        # collections.abc.MutableMapping (rather than collections.MutableMapping,
        # which was removed in Python 3.10) covers dict-like values.
        if isinstance(v, collections.abc.MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)
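
# For illustration: flatten({'a': {'b': 1}, 'c': 2}) returns {'a.b': 1, 'c': 2}.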


def simplify_rating(d):
    """
    Removes some keys from a flattened rating dict
    """
    keys_to_delete = []
    for key in d.keys():
        if key.endswith(".type") or key.endswith(".max_score"):
            keys_to_delete.append(key)
    for key in keys_to_delete:
        del d[key]
    return d
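
# For illustration (the criterion name is hypothetical): flattened keys such as
# 'rating.SOME_CRITERION.type' or 'rating.SOME_CRITERION.max_score' are dropped,
# while all other sub-keys are kept.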


def tablelize_checks(d):
    """
    Returns a dict with the check details we want to be contained
    in a table export.
    """
    out = {}

    # CMS names separated by space
    out['generator'] = " ".join(list(set([i for i in d['generator'].values() if i is not None])))

    # List of actual URLs crawled
    out['resulting_urls'] = ""
    if 'url_canonicalization' in d:
        out['resulting_urls'] = " ".join([i for i in d['url_canonicalization'] if i is not None])

    return out
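
# Sketch of the expected input shape (field values made up for illustration):
# given checks like {'generator': {'https://example.org/': 'wordpress'},
# 'url_canonicalization': ['https://example.org/']}, the function above yields
# {'generator': 'wordpress', 'resulting_urls': 'https://example.org/'}.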


def get_table_result(client):
    query = client.query(kind=spider_results_kind)

    out = []
    for entity in query.fetch(eventual=True):
        created = convert_datastore_datetime(entity.get('created'))

        record = {
            'input_url': entity.key.name,
            'created': created.isoformat(),
            'score': entity.get('score'),
        }

        record.update(flatten(entity.get('meta'), parent_key='meta'))
        record.update(simplify_rating(flatten(entity.get('rating'), parent_key='rating')))
        record.update(tablelize_checks(entity.get('checks')))

        out.append(record)

    return out
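
# Each record assembled above is one flat row per site: 'input_url', 'created'
# and 'score', plus flattened 'meta.*' and 'rating.*' keys and the 'generator'
# and 'resulting_urls' columns produced by tablelize_checks().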


class LastUpdated(object):

    def on_get(self, req, resp):
        """
        Informs about the most recent update to the spider results data
        """
        res = es.search(index=es_index_name,
                        _source_include=['created'],
                        body={"query": {"match_all": {}}},
                        sort='created:desc',
                        size=1)

        resp.media = {
            "last_updated": res['hits']['hits'][0]['_source']['created']
        }


class TableResults(object):

    def on_get(self, req, resp):
        """
        Returns results for all sites in tabular form (JSON or CSV)
        """
        out = get_table_result(datastore_client)

        maxage = 48 * 60 * 60  # two days
        resp.cache_control = ["max_age=%d" % maxage]

        if req.accept == 'text/csv':
            # return CSV
            headers = sorted(out[0].keys())
            with io.StringIO(newline='\n') as csvfile:
                writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
                writer.writerow(headers)
                for row in out:
                    o = []
                    for f in headers:
                        o.append(str(row[f]))
                    writer.writerow(o)

                resp.body = csvfile.getvalue()
                resp.content_type = 'text/csv'
                resp.status = falcon.HTTP_200
        else:
            resp.media = out


class SpiderResultsQuery(object):

    def on_get(self, req, resp):
        """
        Queries the ES index for sites matching a term
        """
        query_term = req.get_param('q', default='')
        from_num = req.get_param('from', default='0')

        try:
            from_num = int(from_num)
        except Exception:
            raise falcon.HTTPError(falcon.HTTP_400,
                                   'Bad request',
                                   'The parameter "from" must be an integer.')

        res = es.search(index=es_index_name,
                        _source_include=['created', 'meta', 'rating', 'score', 'url'],
                        body={
                            "query": {
                                "query_string": {
                                    "query": query_term,
                                    "default_operator": "AND",
                                }
                            }
                        },
                        from_=from_num,
                        size=20,
                        sort='score:desc')

        resp.media = {
            "hits": res['hits']
        }


class SpiderResultsCount(object):

    def on_get(self, req, resp):
        """
        Returns the number of items in the spider-results ES index
        """
        query_term = req.get_param('q')

        body = {"query": {"match_all": {}}}
        if query_term is not None:
            body = {
                "query": {
                    "bool": {
                        "must": {
                            "query_string": {
                                "query": query_term
                            }
                        }
                    }
                }
            }

        res = es.search(index=es_index_name, body=body, size=0)

        maxage = 5 * 60  # 5 minutes in seconds
        resp.cache_control = ["max_age=%d" % maxage]

        resp.media = {
            "count": res['hits']['total']
        }


class SiteDetails(object):

    def on_get(self, req, resp):
        """
        Returns details for one URL
        """
        url = req.get_param('url')
        if url is None or url == '':
            raise falcon.HTTPError(falcon.HTTP_400,
                                   'Bad request',
                                   'The parameter url must not be empty')

        entity = es.get(index=es_index_name, doc_type=es_doc_type, id=url)
        if entity is None:
            raise falcon.HTTPError(falcon.HTTP_404,
                                   'Not found',
                                   'A site with this URL does not exist')

        entity['_source']['url'] = entity['_id']

        maxage = 5 * 60  # 5 minutes in seconds
        resp.cache_control = ["max_age=%d" % maxage]

        resp.media = entity['_source']


class SiteScreenshots(object):

    def on_get(self, req, resp):
        """
        Returns screenshots for one URL
        """
        url = req.get_param('url')
        if url is None or url == '':
            raise falcon.HTTPError(falcon.HTTP_400,
                                   'Bad request',
                                   'The parameter url must not be empty')

        query = datastore_client.query(kind=webscreenshots_kind)
        query.add_filter('url', '=', req.get_param('url'))
        entities = list(query.fetch())

        maxage = 24 * 60 * 60  # 24 hours in seconds
        if len(entities) == 0:
            maxage = 3 * 60 * 60  # 3 hours in seconds

        resp.cache_control = ["max_age=%d" % maxage]
        resp.media = entities


class Index(object):

    def on_get(self, req, resp):
        resp.media = {
            "message": "This is green-spider-api",
            "url": "https://github.com/netzbegruenung/green-spider-api",
            "endpoints": [
                "/api/v1/spider-results/count/",
                "/api/v1/spider-results/last-updated/",
                "/api/v1/spider-results/table/",
                "/api/v1/spider-results/site",
                "/api/v1/screenshots/site",
            ]
        }


handlers = media.Handlers({
    'application/json': jsonhandler.JSONHandler(),
    'text/csv': media.BaseHandler,
})

app = falcon.API()
app.req_options.media_handlers = handlers
app.resp_options.media_handlers = handlers

app.add_route('/api/v1/spider-results/last-updated/', LastUpdated())
app.add_route('/api/v1/spider-results/table/', TableResults())
app.add_route('/api/v1/spider-results/query/', SpiderResultsQuery())
app.add_route('/api/v1/spider-results/count/', SpiderResultsCount())
app.add_route('/api/v1/spider-results/site', SiteDetails())
app.add_route('/api/v1/screenshots/site', SiteScreenshots())
app.add_route('/', Index())
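
# Example requests against a local instance (started via the __main__ block
# below; the query term and example URL are illustrative):
#
#   curl http://127.0.0.1:5000/api/v1/spider-results/last-updated/
#   curl http://127.0.0.1:5000/api/v1/spider-results/count/?q=example
#   curl "http://127.0.0.1:5000/api/v1/spider-results/site?url=https://example.org/"
#   curl -H 'Accept: text/csv' http://127.0.0.1:5000/api/v1/spider-results/table/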


if __name__ == '__main__':
    httpd = simple_server.make_server('127.0.0.1', 5000, app)
    httpd.serve_forever()