Mirror of https://github.com/netzbegruenung/green-spider.git, synced 2024-04-29 23:54:51 +02:00
Merge pull request #58 from netzbegruenung/write-to-db
Use job queue, write to database
This commit is contained in commit bf23478265.
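The core of this change is a simple work queue built on Google Cloud Datastore: the `jobs` command writes one entity per URL into the `spider-jobs` kind, and each spider worker claims a job by reading and deleting a single entity inside a transaction before writing its result to `spider-results`. The following is a condensed sketch of that pattern, extracted from the spider.py changes further down; the helper names `enqueue` and `claim_job` are invented for the sketch, and the credentials path is the one used in the Makefile.

```python
from datetime import datetime
from google.cloud import datastore

client = datastore.Client.from_service_account_json("/secrets/datastore-writer.json")

def enqueue(url, meta):
    # One job entity per URL; the URL doubles as the key name, so re-adding
    # the same URL overwrites the existing job instead of duplicating it.
    entity = datastore.Entity(key=client.key("spider-jobs", url))
    entity.update({"created": datetime.utcnow(), **meta})
    client.put(entity)

def claim_job():
    # Fetch one job and delete it in the same transaction, intended to keep
    # two workers from picking up the same URL (mirrors get_job_from_queue).
    with client.transaction():
        for entity in client.query(kind="spider-jobs").fetch(limit=1):
            job = dict(entity)
            job["url"] = entity.key.name
            client.delete(entity.key)
            return job
    return None
```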
Dockerfile

@@ -12,7 +12,7 @@ RUN apt-get update \
    && wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
    && dpkg -i google-chrome-stable_current_amd64.deb \
    && rm google-chrome-stable_current_amd64.deb \
    && pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 certifi==2018.1.18 \
    && pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 \
    && wget https://chromedriver.storage.googleapis.com/2.38/chromedriver_linux64.zip \
    && unzip chromedriver_linux64.zip \
    && rm chromedriver_linux64.zip \

@@ -25,6 +25,7 @@ RUN wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x
ADD spider.py /
ADD spider_test.py /
ADD data_export.py /

ENTRYPOINT ["python3"]
CMD ["/spider.py"]
Makefile (22 changed lines)

@@ -7,24 +7,40 @@ dockerimage:
    docker pull debian:stretch-slim
    docker build -t spider .

# Create spider job queue
spiderjobs: dockerimage
    docker run --rm -ti \
        -v $(PWD)/secrets:/secrets \
        spider spider.py \
        --credentials-path /secrets/datastore-writer.json \
        --loglevel debug \
        jobs

# Run spider in docker image
spider: dockerimage
    docker run --rm -ti \
        -v $(PWD)/webapp/dist/data:/out \
        -v $(PWD)/docs/siteicons:/icons \
        -v $(PWD)/secrets:/secrets \
        spider spider.py \
        --credentials-path /secrets/datastore-writer.json \
        --loglevel debug \
        spider

test: dockerimage
    docker run --rm -ti spider /spider_test.py

screenshots: venv
    venv/bin/python ./screenshots.py secrets/screenshot-reader.json

export:
    docker run --rm -ti \
        -v $(PWD)/webapp/dist/data:/out \
        -v $(PWD)/secrets:/secrets \
        spider data_export.py /secrets/datastore-reader.json

webapp/node_modules:
    cd webapp && npm install

# Build webapp
webapp: webapp/node_modules screenshots
webapp: webapp/node_modules
    cd webapp && npx webpack --config webpack.config.js
    cp -r webapp/dist/* ./docs/
    cp webapp/node_modules/tooltipster/dist/css/tooltipster.bundle.min.css ./docs/css/
README.md (36 changed lines)

@@ -1,11 +1,15 @@
# Green Spider

Initiative and tools to promote a user-friendly web presence for Bündnis 90/Die Grünen
Initiative and tools to promote a user-friendly web presence for Bündnis 90/Die Grünen.

For the results, see: [https://green-spider.netzbegruenung.de/](https://green-spider.netzbegruenung.de/)

## Tools

- Spider: collects information about websites of B90/GRÜNE chapters
- Screenshotter: creates page screenshots. See [netzbegruenung/green-spider-screenshotter](https://github.com/netzbegruenung/green-spider-screenshotter/)
- Webapp: presents the spider results at [green-spider.netzbegruenung.de](https://green-spider.netzbegruenung.de/)

## Activities

@@ -22,21 +26,28 @@ Communication happens in the Chatbegrünung channel [#green-spider](https://chatbeg

### Running the spider

This checks all known WWW addresses from [netzbegruenung/green-directory](https://github.com/netzbegruenung/green-directory) and collects data about them.

Prerequisites:

- GNU make
- Python 3
- virtualenv
- Docker
- a key with write permission for the results database

Start the process with this command:
To spider all sites from [netzbegruenung/green-directory](https://github.com/netzbegruenung/green-directory):

```nohighlight
make spiderjobs
make spider
```

The result is the file `webapp/dist/data/spider_result.json`. If you would like to contribute the new data to the repository, please create a pull request.
Alternatively, spidering a single URL can be triggered as shown in the example below. The URL does not necessarily have to be part of the `green-directory`.

```nohighlight
docker run --rm -ti \
  -v $PWD/secrets:/secrets spider \
  spider.py --credentials-path /secrets/datastore-writer.json \
  jobs --url https://www.trittin.de/

make spider
```

### Creating screenshots

@@ -44,16 +55,17 @@ See [green-spider-screenshotter](https://github.com/netzbegruenung/green-spide

### Updating the webapp

The webapp published at https://netzbegruenung.github.io/green-spider/ shows the content of the [docs](https://github.com/netzbegruenung/green-spider/tree/master/docs) directory for the `master` branch of this repository. It can be regenerated automatically.

Prerequisites:

- npm
- a service account JSON file with read access to the screenshot data
- Docker
- a key with read permission for the screenshot and results databases

To update the content of the docs directory, the Makefile provides this command:
The two commands below generate the JSON exports of the spider results
and screenshots and update the webapp.

```nohighlight
make export
make webapp
```
data_export.py

@@ -1,17 +1,35 @@
"""
Exports data from the database to JSON files for use in a static webapp
"""

from google.cloud import datastore
import json
import sys
import os


def main():
    if len(sys.argv) == 1:
        print("Error: please provide path to Google Storage API system account JSON file as argument")
        sys.exit(1)
    client = None

    key_path = sys.argv[1]
    client = datastore.Client.from_service_account_json(key_path)
def export_results():
    """
    Export of the main results data
    """
    out = []

    query = client.query(kind='spider-results')
    for entity in query.fetch():
        print(entity.key.name)
        out.append(dict(entity)["results"])

    output_filename = "/out/spider_result.json"
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)


def export_screenshots():
    """
    Export of screenshot meta data
    """
    out = {}

    query = client.query(kind='webscreenshot')

@@ -19,10 +37,18 @@ def main():
        print(item['url'], os.path.basename(item['screenshot_url']))
        out[item['url']] = os.path.basename(item['screenshot_url'])

    output_filename = "./webapp/dist/data/screenshots.json"
    output_filename = "/out/screenshots.json"
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)


if __name__ == "__main__":
    main()
    if len(sys.argv) == 1:
        print("Error: please provide path to Google Storage API system account JSON file as argument")
        sys.exit(1)

    key_path = sys.argv[1]
    client = datastore.Client.from_service_account_json(key_path)

    export_screenshots()
    export_results()
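For orientation, this is roughly how the exported `spider_result.json` can be consumed. The file is a list of the `results` dicts written by `check_site` (see spider.py below), each carrying `input_url`, `meta`, `details` and per-criterion `result` entries; the path follows from the Makefile mounting `/out` to `webapp/dist/data`. The per-site score summation is only an illustration, not part of this PR.

```python
import json

# The Makefile mounts /out to webapp/dist/data, so the export ends up here.
with open("webapp/dist/data/spider_result.json", encoding="utf8") as f:
    sites = json.load(f)

for site in sites:
    # Each criterion in 'result' is a dict with 'value' and 'score' keys.
    total = sum(criterion.get("score", 0) for criterion in site["result"].values())
    print(site["input_url"], total)
```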
spider.py (551 changed lines)

@@ -1,68 +1,175 @@
# coding: utf8
"""
Provides the spider functionality (website checks).
"""

from bs4 import BeautifulSoup
from git import Repo
from multiprocessing import Pool
from selenium import webdriver
from socket import gethostbyname_ex
from urllib.parse import urljoin
from urllib.parse import urlparse
import certifi
import argparse
import hashlib
import json
import logging
import os
import random
import re
import requests
import shutil
import statistics
import sys
from datetime import datetime
from socket import gethostbyname_ex
from urllib.parse import urljoin
from urllib.parse import urlparse

import requests
import yaml

from bs4 import BeautifulSoup
from git import Repo
from selenium import webdriver
from google.cloud import datastore
from google.api_core.exceptions import InvalidArgument


# configuration

# number of parallel processes to use for crawling
concurrency = 1

# connection timeout for website checks (seconds)
connect_timeout = 5
CONNECT_TIMEOUT = 5

# response timeout for website checks
read_timeout = 10
READ_TIMEOUT = 10

# Git repo for our data
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git'
# folder in that repo that holds the data
green_direcory_data_path = 'data/countries/de'
green_directory_local_path = './cache/green-directory'
GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'

result_path = '/out'
RESULT_PATH = '/out'

siteicons_path = '/icons'
SITEICONS_PATH = '/icons'

# IP address of the newthinking GCMS server
gcms_ip = "91.102.13.20"
GCMS_IP = "91.102.13.20"

JOB_DATASTORE_KIND = 'spider-jobs'
RESULTS_DATASTORE_KIND = 'spider-results'

# end configuration

DATASTORE_CLIENT = None


def chunks(the_list, size):
    """
    Yield successive n-sized chunks from list the_list
    where n = size.
    """
    for i in range(0, len(the_list), size):
        yield the_list[i:i + size]


def create_jobs(url=None):
    """
    Read all URLs from green directory and fill a job database
    with one job per URL.

    Alternatively, if the url argument is given, only the given URL
    will be added as a spider job.
    """

    # refresh our local clone of the green directory
    logging.info("Refreshing green-directory clone")
    get_green_directory()

    # build the list of website URLs to run checks for
    logging.info("Processing green-directory")
    input_entries = []

    count = 0

    for entry in dir_entries():

        if 'type' not in entry:
            logging.error("Entry without type")
            continue
        if 'urls' not in entry:
            logging.debug("Entry %s does not have any URLs.", repr_entry(entry))
            continue

        website_url = None
        for index in range(len(entry['urls'])):
            try:
                if entry['urls'][index]['type'] == "WEBSITE":
                    website_url = entry['urls'][index]['url']
                    if website_url:
                        if url is not None and website_url != url:
                            continue
                        input_entries.append({
                            "url": website_url,
                            "level": entry.get("level"),
                            "state": entry.get("state"),
                            "district": entry.get("district"),
                            "city": entry.get("city"),
                        })
                        count += 1
            except NameError:
                logging.error("Error in %s: 'url' key missing (%s)",
                              repr_entry(entry), entry['urls'][index])

    # ensure the passed URL argument is really there, even if not part
    # of the directory.
    if url and count == 0:
        logging.info("Adding job for URL %s which is not part of green-directory", url)
        input_entries.append({
            "url": url,
            "level": None,
            "state": None,
            "district": None,
            "city": None,
        })

    # randomize order, to distribute requests over servers
    logging.debug("Shuffling input URLs")
    random.seed()
    random.shuffle(input_entries)

    count = 0
    logging.info("Writing jobs")

    entities = []

    for entry in input_entries:
        key = DATASTORE_CLIENT.key(JOB_DATASTORE_KIND, entry["url"])
        entity = datastore.Entity(key=key)
        entity.update({
            "created": datetime.utcnow(),
            "level": entry["level"],
            "state": entry["state"],
            "district": entry["district"],
            "city": entry["city"],
        })
        entities.append(entity)

    # commmit to DB
    for chunk in chunks(entities, 300):
        logging.debug("Writing jobs chunk of length %d", len(chunk))
        DATASTORE_CLIENT.put_multi(chunk)
        count += len(chunk)

    logging.info("Writing jobs done, %s jobs added", count)


def get_green_directory():
    """
    Clones the source of website URLs, the green directory,
    into the local file system using git
    """
    if os.path.exists(green_directory_local_path):
        shutil.rmtree(green_directory_local_path)
    Repo.clone_from(green_directory_repo, green_directory_local_path)
    if os.path.exists(GREEN_DIRECTORY_LOCAL_PATH):
        shutil.rmtree(GREEN_DIRECTORY_LOCAL_PATH)
    Repo.clone_from(GREEN_DIRECTORY_REPO, GREEN_DIRECTORY_LOCAL_PATH)


def dir_entries():
    """
    Iterator over all data files in the cloned green directory
    """
    path = os.path.join(green_directory_local_path, green_direcory_data_path)
    path = os.path.join(GREEN_DIRECTORY_LOCAL_PATH, GREEN_DIRECTORY_DATA_PATH)
    for root, _, files in os.walk(path):
        for fname in files:
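The `chunks()` helper above exists because a single Datastore commit accepts at most 500 entities, so `create_jobs()` writes the job entities with `put_multi()` in batches of 300 to stay safely below that limit. A quick illustration of the batching; the input list is made up and merely stands in for the job entities:

```python
# Stand-in for 1,000 job entities; shows how chunks() splits the put_multi batches.
jobs = list(range(1000))
print([len(batch) for batch in chunks(jobs, 300)])
# -> [300, 300, 300, 100]
```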
@@ -80,14 +187,14 @@ def repr_entry(entry):
    Return string representation of a directory entry,
    for logging/debugging purposes
    """
    r = entry['type']
    ret = entry['type']
    if 'level' in entry:
        r += "/" + entry['level']
        ret += "/" + entry['level']
    if 'state' in entry:
        r += "/" + entry['state']
        ret += "/" + entry['state']
    if 'district' in entry:
        r += "/" + entry['district']
    return r
        ret += "/" + entry['district']
    return ret


def derive_test_hostnames(hostname):

@@ -117,24 +224,25 @@ def reduce_urls(urllist):
    that either don't work or lead somewhere else
    """
    targets = set()
    for u in urllist:
        if u['error'] is not None:
    for url in urllist:
        if url['error'] is not None:
            continue
        if u['redirects_to'] is not None:
            targets.add(u['redirects_to'])
        if url['redirects_to'] is not None:
            targets.add(url['redirects_to'])
        else:
            targets.add(u['url'])
            targets.add(url['url'])
    return sorted(list(targets))


def normalize_title(s):
def normalize_title(title):
    """
    Removes garbage from HTML page titles
    """
    s = s.replace('\u00a0', ' ')
    s = s.replace(' ', ' ')
    s = s.strip()
    return s
    title = title.replace(u'\u00a0', ' ')
    title = title.replace(' ', ' ')
    title = title.strip()
    return title


def download_icon(icon_url):
    """

@@ -150,10 +258,10 @@ def download_icon(icon_url):
    }

    # Download the icon
    r = requests.get(icon_url)
    r.raise_for_status()
    req = requests.get(icon_url)
    req.raise_for_status()

    content_hash = hashlib.md5(r.content).hexdigest()
    content_hash = hashlib.md5(req.content).hexdigest()
    extension = ""

    file_name = os.path.basename(icon_url)[-1]

@@ -161,24 +269,25 @@ def download_icon(icon_url):
    ext = file_name.split(".")[-1]
    if ext != "":
        extension = ext

    if extension == "":
        # derive from content type
        t = r.headers.get('content-type')
        ctype = req.headers.get('content-type')
        try:
            extension = default_endings[t]
            extension = default_endings[ctype]
        except KeyError:
            logging.error("No file ending defined for icon type '%s'" % t)
            logging.error("No file ending defined for icon type '%s'", ctype)
            return None

    filename = content_hash + "." + extension.lower()

    path = siteicons_path + os.path.sep + filename
    path = SITEICONS_PATH + os.path.sep + filename
    with open(path, 'wb') as iconfile:
        iconfile.write(r.content)
        iconfile.write(req.content)

    return filename


def check_responsiveness(url):
    """
    Checks

@@ -193,9 +302,9 @@ def check_responsiveness(url):

    # sizes we check for (width, height)
    sizes = (
        (320,480), # old smartphone
        (768,1024), # older tablet or newer smartphone
        (1024,768), # older desktop or horiz. tablet
        (320, 480), # old smartphone
        (768, 1024), # older tablet or newer smartphone
        (1024, 768), # older desktop or horiz. tablet
        (1920, 1080), # Full HD horizontal
    )

@@ -218,7 +327,8 @@ def check_responsiveness(url):

    return details

def check_content(r):

def check_content(req):
    """
    Adds details to check regarding content of the page

@@ -227,10 +337,10 @@ def check_content(r):
    """
    result = {}

    result['encoding'] = r.encoding.lower()
    soup = BeautifulSoup(r.text, 'html.parser')
    result['encoding'] = req.encoding.lower()
    soup = BeautifulSoup(req.text, 'html.parser')

    result['html'] = r.text
    result['html'] = req.text

    # page title
    result['title'] = None

@@ -245,47 +355,47 @@ def check_content(r):
    result['canonical_link'] = None
    link = soup.find('link', rel='canonical')
    if link:
        result['canonical_link'] = urljoin(r.url, link.get('href'))
        result['canonical_link'] = urljoin(req.url, link.get('href'))

    # icon
    result['icon'] = None
    link = soup.find('link', rel=lambda x: x and x.lower()=='icon')
    link = soup.find('link', rel=lambda x: x and x.lower() == 'icon')
    if link:
        result['icon'] = urljoin(r.url, link.get('href'))
        result['icon'] = urljoin(req.url, link.get('href'))
    else:
        link = soup.find('link', rel=lambda x: x and x.lower()=='shortcut icon')
        link = soup.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
        if link:
            result['icon'] = urljoin(r.url, link.get('href'))
            result['icon'] = urljoin(req.url, link.get('href'))

    # feed links
    result['feeds'] = []
    rss_links = soup.find_all('link', type='application/rss+xml')
    atom_links = soup.find_all('link', type='application/atom+xml')

    if len(rss_links) > 0:
        for l in rss_links:
            result['feeds'].append(urljoin(r.url, l.get('href')))
    if len(atom_links) > 0:
        for l in rss_links:
            result['feeds'].append(urljoin(r.url, l.get('href')))
    if rss_links:
        for link in rss_links:
            result['feeds'].append(urljoin(req.url, link.get('href')))
    if atom_links:
        for link in rss_links:
            result['feeds'].append(urljoin(req.url, link.get('href')))

    # generator meta tag
    result['generator'] = None
    if head is not None:
        generator = head.select('[name=generator]')
        if len(generator):
        if generator:
            result['generator'] = generator[0].get('content')

    # opengraph meta tags
    result['opengraph'] = None
    og = set()
    opengraph = set()
    if head is not None:
        for item in head.find_all(property=re.compile('^og:')):
            og.add(item.get('property'))
            opengraph.add(item.get('property'))
        for item in head.find_all(itemprop=re.compile('^og:')):
            og.add(item.get('itemprop'))
    if len(og):
        result['opengraph'] = sorted(list(og))
            opengraph.add(item.get('itemprop'))
    if opengraph:
        result['opengraph'] = sorted(list(opengraph))

    return result

@@ -298,8 +408,8 @@ def collect_ipv4_addresses(hostname_dict):
    for item in hostname_dict.values():
        if 'ip_addresses' not in item:
            continue
        for ip in item['ip_addresses']:
            ips.add(ip)
        for ip_addr in item['ip_addresses']:
            ips.add(ip_addr)
    return sorted(list(ips))


@@ -310,11 +420,11 @@ def parse_generator(generator):
    generator = generator.lower()
    if 'typo3' in generator:
        return "typo3"
    elif 'wordpress' in generator:
    if 'wordpress' in generator:
        return "wordpress"
    elif 'drupal' in generator:
    if 'drupal' in generator:
        return "drupal"
    elif 'joomla' in generator:
    if 'joomla' in generator:
        return "joomla"
    return generator
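To make the CMS detection concrete, this is how `parse_generator()` maps typical generator meta tag values; the sample strings are invented for illustration and are not taken from real sites:

```python
for value in ("WordPress 4.9.5", "TYPO3 CMS 8.7", "Drupal 7 (https://www.drupal.org)", "Hugo 0.40"):
    print(parse_generator(value))
# wordpress
# typo3
# drupal
# hugo 0.40   (unknown generators fall through and are returned lowercased)
```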
@@ -328,7 +438,9 @@ def check_site(entry):
    4. Run full check on canonical URL
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) ' +
                      'AppleWebKit/537.36 (KHTML, like Gecko) ' +
                      'Chrome/65.0.3325.181 green-spider/0.1'
    }

    # all the info we'll return for the site

@@ -337,12 +449,13 @@ def check_site(entry):
        'input_url': entry['url'],
        # Meta: Regional and type metadata for the site
        'meta': {
            'level': entry['level'],
            'state': entry['state'],
            'district': entry['district'],
            'city': entry['city'],
            'level': entry.get('level'),
            'state': entry.get('state'),
            'district': entry.get('district'),
            'city': entry.get('city'),
        },
        # Details: All details we collected about the site (which aren't directly related to the report criteria)
        # Details: All details we collected about the site (which aren't directly
        # related to the report criteria)
        'details': {
            'hostnames': {},
            'ipv4_addresses': [],

@@ -375,18 +488,18 @@ def check_site(entry):

    # try to resolve hostnames
    processed_hostnames = {}
    for hn in hostnames:
    for hostname in hostnames:

        processed_hostnames[hn] = {
        processed_hostnames[hostname] = {
            'resolvable': False,
        }

        try:
            hostname, aliases, ip_addresses = gethostbyname_ex(hn)
            processed_hostnames[hn]['resolvable'] = True
            processed_hostnames[hn]['resolved_hostname'] = hostname
            processed_hostnames[hn]['aliases'] = aliases
            processed_hostnames[hn]['ip_addresses'] = ip_addresses
            hostname, aliases, ip_addresses = gethostbyname_ex(hostname)
            processed_hostnames[hostname]['resolvable'] = True
            processed_hostnames[hostname]['resolved_hostname'] = hostname
            processed_hostnames[hostname]['aliases'] = aliases
            processed_hostnames[hostname]['ip_addresses'] = ip_addresses
        except:
            pass

@@ -398,9 +511,9 @@ def check_site(entry):
    checked_urls = []
    checked_urls_set = set()

    for hn in processed_hostnames.keys():
    for hostname in processed_hostnames.keys():

        item = processed_hostnames[hn]
        item = processed_hostnames[hostname]

        if not item['resolvable']:
            continue

@@ -421,18 +534,19 @@ def check_site(entry):
        }

        try:
            r = requests.head(record['url'], headers=headers, allow_redirects=True)
            if r.url == url:
                logging.info("URL: %s - status %s" % (record['url'], r.status_code))
            req = requests.head(record['url'], headers=headers, allow_redirects=True)
            if req.url == url:
                logging.info("URL: %s - status %s", record['url'], req.status_code)
            else:
                logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
                record['redirects_to'] = r.url
        except Exception as e:
                logging.info("URL: %s - status %s - redirects to %s", record['url'],
                             req.status_code, req.url)
                record['redirects_to'] = req.url
        except Exception as exc:
            record['error'] = {
                'type': str(type(e)),
                'message': str(e),
                'type': str(type(exc)),
                'message': str(exc),
            }
            logging.info("URL %s: %s %s" % (url, str(type(e)), e))
            logging.info("URL %s: %s %s", url, str(type(exc)), exc)

        checked_urls.append(record)

@@ -442,7 +556,7 @@ def check_site(entry):
    # Deeper test for the remaining (canonical) URL(s)
    for check_url in result['details']['canonical_urls']:

        logging.info("Downloading URL %s" % check_url)
        logging.info("Downloading URL %s", check_url)

        check = {
            'url': check_url,

@@ -454,37 +568,38 @@ def check_site(entry):
        }

        try:
            r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
            check['status_code'] = r.status_code
            check['duration'] = round(r.elapsed.microseconds / 1000)
            req = requests.get(check_url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
            check['status_code'] = req.status_code
            check['duration'] = round(req.elapsed.microseconds / 1000)

            # Content checks
            if r.status_code < 300:
                check['content'] = check_content(r)
            if req.status_code < 300:
                check['content'] = check_content(req)

                # Responsiveness check
                try:
                    check['responsive'] = check_responsiveness(check_url)
                except Exception as e:
                    logging.error("Error when checking responsiveness for '%s': %s" % (check_url, e))
                except Exception as exc:
                    logging.error("Error when checking responsiveness for '%s': %s", check_url, exc)

        except requests.exceptions.ConnectionError as e:
            logging.error(str(e) + " " + check_url)
        except requests.exceptions.ConnectionError as exc:
            logging.error(str(exc) + " " + check_url)
            check['error'] = "connection"
        except requests.exceptions.ReadTimeout as e:
            logging.error(str(e) + " " + check_url)
        except requests.exceptions.ReadTimeout as exc:
            logging.error(str(exc) + " " + check_url)
            check['error'] = "read_timeout"
        except requests.exceptions.Timeout as e:
            logging.error(str(e) + " " + check_url)
        except requests.exceptions.Timeout as exc:
            logging.error(str(exc) + " " + check_url)
            check['error'] = "connection_timeout"
        except Exception as e:
            logging.error(str(e) + " " + check_url)
        except Exception as exc:
            logging.error(str(exc) + " " + check_url)
            check['error'] = "unknown"

        result['details']['urlchecks'].append(check)

    result['details']['urlchecks'] = sorted(result['details']['urlchecks'], key=lambda url: url['url'])
    result['details']['urlchecks'] = sorted(result['details']['urlchecks'],
                                             key=lambda url: url['url'])

    # collect icons
    icons = set()

@@ -492,24 +607,24 @@ def check_site(entry):
        if 'content' not in c:
            continue
        if c['content'] is None:
            logging.warning("No content for %s" % entry['url'])
            logging.warning("No content for %s", entry['url'])
            continue
        if c['content']['icon'] is not None:
            icons.add(c['content']['icon'])
    downloaded_icons = set()
    for icon_url in icons:
        logging.info("Getting icon %s" % icon_url)
        logging.info("Getting icon %s", icon_url)
        try:
            downloaded_icons.add(download_icon(icon_url))
        except Exception as e:
            logging.error("Could not download icon: %s" % e)
            logging.error("Could not download icon: %s", e)
    result['details']['icons'] = sorted(list(downloaded_icons))

    # collect feeds
    feeds = set()
    for c in result['details']['urlchecks']:
        if c['content'] is None:
            logging.warning("No content for %s" % entry['url'])
            logging.warning("No content for %s", entry['url'])
            continue
        if 'feeds' in c['content'] and len(c['content']['feeds']):
            for feed in c['content']['feeds']:

@@ -543,7 +658,7 @@ def check_site(entry):
            result['details']['cms'] = parse_generator(c['content']['generator'])
            # Qualify certain CMS flavours in more detail
            if result['details']['cms'] == "typo3":
                if gcms_ip in result['details']['ipv4_addresses']:
                if GCMS_IP in result['details']['ipv4_addresses']:
                    result['details']['cms'] = "typo3-gcms"
                elif 'typo3-gruene.de' in c['content']['html']:
                    result['details']['cms'] = "typo3-gruene"

@@ -555,7 +670,8 @@ def check_site(entry):
            # No generator Tag. Use HTML content.
            if 'Urwahl3000' in c['content']['html']:
                result['details']['cms'] = "wordpress-urwahl"
            elif 'josephknowsbest' in c['content']['html'] or 'Joseph-knows-best' in c['content']['html']:
            elif ('josephknowsbest' in c['content']['html'] or
                  'Joseph-knows-best' in c['content']['html']):
                result['details']['cms'] = "wordpress-josephknowsbest"
            elif 'wordpress' in c['content']['html']:
                result['details']['cms'] = "wordpress"

@@ -567,7 +683,7 @@ def check_site(entry):
    ### Derive criteria

    # DNS_RESOLVABLE_IPV4
    if len(result['details']['ipv4_addresses']):
    if result['details']['ipv4_addresses']:
        result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1}

    # SITE_REACHABLE

@@ -584,8 +700,8 @@ def check_site(entry):

    # WWW_OPTIONAL
    num_hostnames = 0
    for hn in result['details']['hostnames'].keys():
        item = result['details']['hostnames'][hn]
    for hostname in result['details']['hostnames'].keys():
        item = result['details']['hostnames'][hostname]
        if not item['resolvable']:
            continue
        num_hostnames += 1

@@ -600,20 +716,20 @@ def check_site(entry):
    else:
        links = set()
        if result['details']['urlchecks'] is None:
            logging.warning("No urlchecks for %s" % entry['url'])
            logging.warning("No urlchecks for %s", entry['url'])
        else:
            for item in result['details']['urlchecks']:
                if item['content']['canonical_link'] is not None:
                if item['content'] is not None and item['content']['canonical_link'] is not None:
                    links.add(item['content']['canonical_link'])
        if len(links) == 1:
            result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}

    # FAVICON
    if len(result['details']['icons']):
    if result['details']['icons']:
        result['result']['FAVICON'] = {'value': True, 'score': 1}

    # FEEDS
    if len(result['details']['feeds']):
    if result['details']['feeds']:
        result['result']['FEEDS'] = {'value': True, 'score': 1}

    # HTTP_RESPONSE_DURATION

@@ -621,17 +737,18 @@ def check_site(entry):
    for item in result['details']['urlchecks']:
        if item['error'] is None:
            durations.append(item['duration'])
    val = round(statistics.mean(durations))
    result['result']['HTTP_RESPONSE_DURATION']['value'] = val
    if val < 100:
        result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
    elif val < 1000:
        result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
    if durations:
        val = round(statistics.mean(durations))
        result['result']['HTTP_RESPONSE_DURATION']['value'] = val
        if val < 100:
            result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
        elif val < 1000:
            result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5

    # RESPONSIVE
    if result['details']['responsive'] is not None:
        if (result['details']['responsive']['min_width'] < 500 and
            len(result['details']['responsive']['viewport_meta_tag']) > 0):
                len(result['details']['responsive']['viewport_meta_tag']) > 0):
            result['result']['RESPONSIVE']['value'] = True
            result['result']['RESPONSIVE']['score'] = 1
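A worked example of the HTTP_RESPONSE_DURATION scoring in the hunk above: only URL checks without an error contribute, the mean duration becomes the value, and the score is 1 below 100 ms, 0.5 below 1000 ms, and otherwise keeps its default. The hostnames and durations here are invented for illustration.

```python
import statistics

# Invented sample: two successful checks (85 ms, 140 ms) and one failed one.
urlchecks = [
    {"url": "https://example-gruene.de/", "error": None, "duration": 85},
    {"url": "https://www.example-gruene.de/", "error": None, "duration": 140},
    {"url": "http://example-gruene.de/", "error": "connection", "duration": 0},
]
durations = [c["duration"] for c in urlchecks if c["error"] is None]
val = round(statistics.mean(durations))
print(val)  # 112 -> score 0.5, since 100 <= 112 < 1000
```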
@@ -649,87 +766,91 @@ def check_site(entry):
    return result


def main():
def get_job_from_queue():
    """
    Bringing it all together
    Returns a URL from the queue
    """
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("urllib3").setLevel(logging.CRITICAL)
    out = None

    # refresh our local clone of the green directory
    get_green_directory()
    with DATASTORE_CLIENT.transaction():
        query = DATASTORE_CLIENT.query(kind=JOB_DATASTORE_KIND)
        for entity in query.fetch(limit=1):
            logging.debug("Got job: %s", entity)
            out = dict(entity)
            out["url"] = entity.key.name
            DATASTORE_CLIENT.delete(entity.key)

    # build the list of website URLs to run checks for
    logging.info("Processing green-directory")
    input_entries = []
    return out

    for entry in dir_entries():

def work_of_queue():
    """
    Take job from queue and finish it until there are no more jobs
    """
    while True:
        job = get_job_from_queue()
        if job is None:
            logging.info("No more jobs. Exiting.")
            break

        if 'type' not in entry:
            logging.error("Entry without type")
            continue
        if 'urls' not in entry:
            logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
            continue
        logging.info("Starting job %s", job["url"])
        result = check_site(entry=job)
        #logging.debug(result)
        logging.info("Job %s finished checks", job["url"])
        logging.info("Job %s writing to DB", job["url"])

        website_url = None
        for n in range(len(entry['urls'])):
            try:
                if entry['urls'][n]['type'] == "WEBSITE":
                    website_url = entry['urls'][n]['url']
                    if website_url:
                        input_entries.append({
                            "url": website_url,
                            "level": entry.get("level"),
                            "state": entry.get("state"),
                            "district": entry.get("district"),
                            "city": entry.get("city"),
                        })
            except NameError:
                logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))

    # randomize order, to distribute requests over servers
    logging.info("Shuffling input URLs")
    random.seed()
    random.shuffle(input_entries)

    # run checks
    logging.info("Starting checks")
    results = {}

    pool = Pool(concurrency)
    for ientry in input_entries:
        logging.info("Submitting %s to job pool" % ientry['url'])
        results[ientry['url']] = pool.apply_async(check_site, kwds={'entry': ientry})
    pool.close()
    pool.join()

    logging.info("Checks are finished")

    # Restructure result from dict of ApplyResult
    # to list of dicts and sort in stable way
    json_result = []
    done = set()

    logging.info("Restructuring results")

    # convert results from ApplyResult to dict
    for url in sorted(results.keys()):
        if url not in done:
            logging.info("Getting result for %s" % url)
            try:
                resultsitem = results[url].get()
                json_result.append(resultsitem)
            except Exception as e:
                logging.error("Error getting result for '%s': %s" % (url, e))
            done.add(url)

    # Write result as JSON
    output_filename = os.path.join(result_path, "spider_result.json")
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(json_result, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
        key = DATASTORE_CLIENT.key(RESULTS_DATASTORE_KIND, job["url"])
        entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
        record = {
            "created": datetime.utcnow(),
            "results": result,
        }
        entity.update(record)
        try:
            DATASTORE_CLIENT.put(entity)
        except InvalidArgument as ex:
            logging.error("Could not write result: %s", ex)
        except Exception as ex:
            logging.error("Could not write result: %s", ex)


if __name__ == "__main__":
    main()
    """
    Bringing it all together
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--credentials-path', dest='credentials_path',
                        help='Path to the service account credentials JSON file',
                        default='/secrets/service-account.json')
    parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
                        default='info')

    subparsers = parser.add_subparsers(help='sub-command help', dest='command')

    subparsers.add_parser('spider', help='Take jobs off the queue and spider')

    jobs_parser = subparsers.add_parser('jobs', help='Create jobs for the queue')

    jobs_parser.add_argument('--url', help='Add a job to spider a URL')
    args = parser.parse_args()

    loglevel = args.loglevel.lower()
    if loglevel == 'error':
        logging.basicConfig(level=logging.ERROR)
    elif loglevel == 'warn':
        logging.basicConfig(level=logging.WARN)
    elif loglevel == 'debug':
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
        loglevel = 'info'

    logging.getLogger("urllib3").setLevel(logging.CRITICAL)

    DATASTORE_CLIENT = datastore.Client.from_service_account_json(args.credentials_path)

    logging.debug("Called command %s", args.command)

    if args.command == 'jobs':
        create_jobs(args.url)
    else:
        work_of_queue()
@@ -55,7 +55,7 @@ $(function(){

    // IPs
    var ips = _.join(item.details.ipv4_addresses, ', ');
    row.append('<td class="text '+ (ips === '' ? 'bad' : 'good') +' text-center" data-order="' + (ips === '' ? no : ips) + '"><span class="tt" title="IPv4-Adresse(n) des Servers bzw. der Server">' + (ips === '' ? no : ips) + '</span></td>');
    row.append('<td class="text '+ (ips === '' ? 'bad' : 'good') +' text-center" data-order="' + ips + '"><span class="tt" title="IPv4-Adresse(n) des Servers bzw. der Server">' + (ips === '' ? no : ips) + '</span></td>');

    // SITE_REACHABLE
    var reachable = '<span class="tt" title="Die Site war beim Check erreichbar.">' + yes + '</span>';

@@ -65,10 +65,14 @@ $(function(){
    row.append('<td class="'+ (item.result.SITE_REACHABLE.value ? 'good' : 'bad') +' text-center" data-order="'+ (item.result.SITE_REACHABLE.value ? '1' : '0') +'">' + reachable + '</td>');

    // HTTP_RESPONSE_DURATION
    var durationClass = 'bad';
    if (item.result.HTTP_RESPONSE_DURATION.score > 0) { durationClass = 'medium'; }
    if (item.result.HTTP_RESPONSE_DURATION.score > 0.5) { durationClass = 'good'; }
    row.append('<td class="text '+ durationClass +' text-center" data-order="' + item.result.HTTP_RESPONSE_DURATION.value + '"><span class="tt" title="Dauer, bis der Server die Seitenanfrage beantwortet. Unter 100 ms ist sehr gut. Unter 1 Sekunde ist okay.">' + item.result.HTTP_RESPONSE_DURATION.value + ' ms</span></td>');
    if (!item.result.SITE_REACHABLE.value || item.result.HTTP_RESPONSE_DURATION.value === null) {
      row.append('<td class="text bad text-center" data-order="99999999"><span class="tt" title="Nicht anwendbar">' + no + '</span></td>');
    } else {
      var durationClass = 'bad';
      if (item.result.HTTP_RESPONSE_DURATION.score > 0) { durationClass = 'medium'; }
      if (item.result.HTTP_RESPONSE_DURATION.score > 0.5) { durationClass = 'good'; }
      row.append('<td class="text '+ durationClass +' text-center" data-order="' + item.result.HTTP_RESPONSE_DURATION.value + '"><span class="tt" title="Dauer, bis der Server die Seitenanfrage beantwortet. Unter 100 ms ist sehr gut. Unter 1 Sekunde ist okay.">' + item.result.HTTP_RESPONSE_DURATION.value + ' ms</span></td>');
    }

    // FAVICON
    var icon = item.result.FAVICON.value && (item.details.icons[0] != null);

@@ -103,7 +107,7 @@ $(function(){

    // screenshots
    var screenshot = false;
    if (item.details.canonical_urls.length > 0) {
    if (item.details.canonical_urls && item.details.canonical_urls.length > 0) {
      if (typeof screenshots[item.details.canonical_urls[0]] !== 'undefined') {
        var surl = 'http://green-spider-screenshots.sendung.de/320x640/'+screenshots[item.details.canonical_urls[0]];
        var lurl = 'http://green-spider-screenshots.sendung.de/1500x1500/'+screenshots[item.details.canonical_urls[0]];