mirror of
https://github.com/netzbegruenung/green-spider-api.git
synced 2024-05-04 09:43:41 +02:00
commit
7b197f649a
127
main.py
127
main.py
|
@ -1,5 +1,7 @@
|
|||
import collections
|
||||
from datetime import datetime
|
||||
from os import getenv
|
||||
import sys
|
||||
from wsgiref import simple_server
|
||||
|
||||
import falcon
|
||||
|
@ -8,7 +10,6 @@ import jsonhandler
|
|||
|
||||
from google.cloud import datastore
|
||||
|
||||
|
||||
credentials_path = getenv('GCLOUD_DATASTORE_CREDENTIALS_PATH')
|
||||
datastore_client = datastore.Client.from_service_account_json(credentials_path)
|
||||
|
||||
|
@ -16,6 +17,32 @@ spider_results_kind = 'spider-results'
|
|||
webscreenshots_kind = 'webscreenshot'
|
||||
|
||||
|
||||
def convert_datastore_datetime(field):
    """
    Return a datetime for a Datastore timestamp field.

    Depending on what the client library hands back, the value may be a
    datetime.datetime (or a subclass of it), an int holding microseconds
    since the Unix epoch, or a str containing such an int.

    A datetime is returned unchanged. An int or str is converted to a naive
    datetime expressing UTC. Any other value (for example None) yields an
    empty string, as before.

    Raises ValueError if a str value cannot be parsed as an integer.
    """
    # isinstance() instead of an exact type comparison, so that datetime
    # subclasses are accepted rather than silently mapped to ''.
    if isinstance(field, datetime):
        return field

    # bool is a subclass of int but never represents a timestamp.
    if isinstance(field, bool):
        return ''

    if isinstance(field, (int, str)):
        # The stored value is in microseconds; utcfromtimestamp() expects
        # seconds.
        return datetime.utcfromtimestamp(int(field) / 1000000)

    return ''
|
||||
|
||||
|
||||
def flatten(d, parent_key='', sep='.'):
    """
    Flatten a nested mapping into a single-level dict.

    The keys of nested mappings are joined to their parent's key with
    ``sep``. If ``parent_key`` is given, it is used as the prefix for all
    top-level keys. Nested mappings that are empty contribute no keys.

    ``None`` is treated like an empty mapping and yields an empty dict.
    """
    # collections.MutableMapping was removed in Python 3.10; the abstract
    # base class has to be taken from collections.abc.
    from collections.abc import MutableMapping

    if d is None:
        return {}

    flat = {}
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            flat.update(flatten(v, new_key, sep=sep))
        else:
            flat[new_key] = v
    return flat
|
||||
|
||||
|
||||
def get_compact_results(client):
|
||||
query = client.query(kind=spider_results_kind,
|
||||
order=['-created'],
|
||||
|
@ -24,27 +51,71 @@ def get_compact_results(client):
|
|||
|
||||
out = []
|
||||
for entity in query.fetch(eventual=True):
|
||||
|
||||
# handle creation date in different ways, depending on whether the lib returns
|
||||
# a str, int, or datetime.datetime
|
||||
created = entity.get('created')
|
||||
dt = ''
|
||||
if type(created) == datetime:
|
||||
dt = created
|
||||
elif type(created) == int:
|
||||
dt = datetime.utcfromtimestamp(created / 1000000)
|
||||
elif type(created) == str:
|
||||
dt = datetime.utcfromtimestamp(int(created) / 1000000)
|
||||
created = convert_datastore_datetime(entity.get('created'))
|
||||
|
||||
out.append({
|
||||
'input_url': entity.key.name,
|
||||
'created': dt.isoformat(),
|
||||
'created': created.isoformat(),
|
||||
'meta': entity.get('meta'),
|
||||
'score': entity.get('score'),
|
||||
})
|
||||
return out
|
||||
|
||||
|
||||
def simplify_rating(d):
    """
    Drop the entries of a flattened rating dict whose key ends in
    ".type" or ".max_score".

    The dict is modified in place and is also returned.
    """
    dropped_suffixes = (".type", ".max_score")

    # Collect the matching keys first: a dict must not change size while it
    # is being iterated.
    for name in [k for k in d if k.endswith(dropped_suffixes)]:
        d.pop(name)

    return d
|
||||
|
||||
|
||||
def tablelize_checks(d):
    """
    Returns a dict with the check details we want to be contained
    in a table export.

    The result always has two string entries:

    - 'generator': the distinct non-None values of d['generator'],
      sorted and separated by spaces
    - 'resulting_urls': the non-None items of d['url_canonicalization'],
      in their original order, separated by spaces

    A missing or None ``d``, 'generator' or 'url_canonicalization' yields
    an empty string for the corresponding entry.
    """
    checks = d or {}

    # CMS names separated by space. Duplicates are removed via a set; the
    # names are sorted because set iteration order is not stable between
    # runs, which would make the exported value vary.
    generators = checks.get('generator') or {}
    cms_names = {name for name in generators.values() if name is not None}

    # List of actual URLs crawled
    urls = checks.get('url_canonicalization') or []

    return {
        'generator': " ".join(sorted(cms_names)),
        'resulting_urls': " ".join(url for url in urls if url is not None),
    }
|
||||
|
||||
|
||||
def get_table_result(client):
    """
    Return one flat record per spider result entity, for table export.

    Every record holds the input URL (the entity's key name), the creation
    date in ISO format and the score, followed by the flattened 'meta'
    entries, the flattened and simplified 'rating' entries, and the check
    details selected by tablelize_checks().
    """
    records = []

    for entity in client.query(kind=spider_results_kind).fetch(eventual=True):
        created = convert_datastore_datetime(entity.get('created'))

        record = dict(
            input_url=entity.key.name,
            created=created.isoformat(),
            score=entity.get('score'),
        )

        # Nested structures become additional columns, merged in this order.
        extra_columns = (
            flatten(entity.get('meta'), parent_key='meta'),
            simplify_rating(flatten(entity.get('rating'), parent_key='rating')),
            tablelize_checks(entity.get('checks')),
        )
        for columns in extra_columns:
            record.update(columns)

        records.append(record)

    return records
|
||||
|
||||
|
||||
class LastUpdated(object):
|
||||
|
||||
def on_get(self, req, resp):
|
||||
|
@ -78,6 +149,19 @@ class CompactResults(object):
|
|||
resp.media = out
|
||||
|
||||
|
||||
class TableResults(object):

    def on_get(self, req, resp):
        """
        Returns the spider results as flat records for table export
        """
        out = get_table_result(datastore_client)

        maxage = 48 * 60 * 60  # two days
        # The Cache-Control directive is spelled "max-age" (with a hyphen);
        # "max_age" is not a known directive and would be ignored by caches.
        resp.cache_control = ["max-age=%d" % maxage]
        resp.media = out
|
||||
|
||||
|
||||
class SiteDetails(object):
|
||||
|
||||
def on_get(self, req, resp):
|
||||
|
@ -128,6 +212,20 @@ class SiteScreenshots(object):
|
|||
resp.media = entities
|
||||
|
||||
|
||||
class Index(object):

    def on_get(self, req, resp):
        """
        Returns a short description of this API and its endpoints
        """
        resp.media = {
            "message": "This is green-spider-api",
            "url": "https://github.com/netzbegruenung/green-spider-api",
            "endpoints": [
                "/api/v1/spider-results/last-updated/",
                # Must match the route registered for TableResults.
                "/api/v1/spider-results/table/",
                "/api/v1/spider-results/compact/",
                "/api/v1/spider-results/site",
                "/api/v1/screenshots/site",
            ]
        }
|
||||
|
||||
handlers = media.Handlers({
|
||||
'application/json': jsonhandler.JSONHandler(),
|
||||
})
|
||||
|
@ -139,10 +237,13 @@ app.resp_options.media_handlers = handlers
|
|||
|
||||
# Register one resource instance per URL path, in this order.
for route_path, resource_class in (
    ('/api/v1/spider-results/last-updated/', LastUpdated),
    ('/api/v1/spider-results/compact/', CompactResults),
    ('/api/v1/spider-results/table/', TableResults),
    ('/api/v1/spider-results/site', SiteDetails),
    ('/api/v1/screenshots/site', SiteScreenshots),
    ('/', Index),
):
    app.add_route(route_path, resource_class())
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # When run as a script, serve the app with the wsgiref server
    # on 127.0.0.1, port 5000, until the process is stopped.
    simple_server.make_server('127.0.0.1', 5000, app).serve_forever()
|
||||
|
|
31
main_test.py
Normal file
31
main_test.py
Normal file
|
@ -0,0 +1,31 @@
|
|||
import unittest
|
||||
from main import flatten
|
||||
|
||||
class TestFlattenDict(unittest.TestCase):

    def test_flatten(self):
        """Nested dicts are collapsed into a flat dict with dotted keys."""
        expected = {
            "foo.bar.one": 1,
            "foo.bar.two": 2,
            "bar.one": 1,
            "bar.two": 2,
        }
        nested = {
            "foo": {
                "bar": {
                    "one": 1,
                    "two": 2,
                },
            },
            "bar": {
                "one": 1,
                "two": 2,
            },
        }

        actual = flatten(nested)

        self.assertEqual(actual, expected)
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
Loading…
Reference in a new issue