From 88ec1f63f71d9f7e8c1e9258968ecdc7cf14fc54 Mon Sep 17 00:00:00 2001 From: Marian Steinbach Date: Thu, 23 Aug 2018 09:36:33 +0200 Subject: [PATCH] Big spider.py overhaul --- spider.py | 531 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 316 insertions(+), 215 deletions(-) diff --git a/spider.py b/spider.py index 3b33732..93f376c 100644 --- a/spider.py +++ b/spider.py @@ -1,68 +1,155 @@ -# coding: utf8 +""" +Provides the spider functionality (website checks). +""" -from bs4 import BeautifulSoup -from git import Repo -from multiprocessing import Pool -from selenium import webdriver -from socket import gethostbyname_ex -from urllib.parse import urljoin -from urllib.parse import urlparse -import certifi +import argparse import hashlib import json import logging import os import random import re -import requests import shutil import statistics -import sys +from datetime import datetime +from socket import gethostbyname_ex +from urllib.parse import urljoin +from urllib.parse import urlparse + +import requests import yaml +from bs4 import BeautifulSoup +from git import Repo +from selenium import webdriver +from google.cloud import datastore +from google.api_core.exceptions import InvalidArgument + # configuration -# number of parallel processes to use for crawling -concurrency = 1 - # connection timeout for website checks (seconds) -connect_timeout = 5 +CONNECT_TIMEOUT = 5 # response timeout for website checks -read_timeout = 10 +READ_TIMEOUT = 10 # Git repo for our data -green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git' +GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git' # folder in that repo that holds the data -green_direcory_data_path = 'data/countries/de' -green_directory_local_path = './cache/green-directory' +GREEN_DIRECTORY_DATA_PATH = 'data/countries/de' +GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory' -result_path = '/out' +RESULT_PATH = '/out' -siteicons_path = '/icons' +SITEICONS_PATH = '/icons' # IP address of the newthinking GCMS server -gcms_ip = "91.102.13.20" +GCMS_IP = "91.102.13.20" + +JOB_DATASTORE_KIND = 'spider-jobs' +RESULTS_DATASTORE_KIND = 'spider-results' # end configuration +DATASTORE_CLIENT = None + + +def chunks(the_list, size): + """ + Yield successive n-sized chunks from list the_list + where n = size. 
+ """ + for i in range(0, len(the_list), size): + yield the_list[i:i + size] + + +def create_jobs(): + """ + Read all URLs from green directory and fill a job database + with one job per URL + """ + + # refresh our local clone of the green directory + logging.info("Refreshing green-directory clone") + get_green_directory() + + # build the list of website URLs to run checks for + logging.info("Processing green-directory") + input_entries = [] + + for entry in dir_entries(): + + if 'type' not in entry: + logging.error("Entry without type") + continue + if 'urls' not in entry: + logging.debug("Entry %s does not have any URLs.", repr_entry(entry)) + continue + + website_url = None + for index in range(len(entry['urls'])): + try: + if entry['urls'][index]['type'] == "WEBSITE": + website_url = entry['urls'][index]['url'] + if website_url: + input_entries.append({ + "url": website_url, + "level": entry.get("level"), + "state": entry.get("state"), + "district": entry.get("district"), + "city": entry.get("city"), + }) + except NameError: + logging.error("Error in %s: 'url' key missing (%s)", + repr_entry(entry), entry['urls'][index]) + + # randomize order, to distribute requests over servers + logging.debug("Shuffling input URLs") + random.seed() + random.shuffle(input_entries) + + count = 0 + logging.info("Writing jobs") + + entities = [] + + for entry in input_entries: + key = DATASTORE_CLIENT.key(JOB_DATASTORE_KIND, entry["url"]) + entity = datastore.Entity(key=key) + entity.update({ + "created": datetime.utcnow(), + "level": entry["level"], + "state": entry["state"], + "district": entry["district"], + "city": entry["city"], + }) + entities.append(entity) + + # commmit to DB + for chunk in chunks(entities, 300): + logging.debug("Writing jobs chunk of length %d", len(chunk)) + DATASTORE_CLIENT.put_multi(chunk) + count += len(chunk) + + logging.info("Writing jobs done, %s jobs added", count) + def get_green_directory(): """ Clones the source of website URLs, the green directory, into the local file system using git """ - if os.path.exists(green_directory_local_path): - shutil.rmtree(green_directory_local_path) - Repo.clone_from(green_directory_repo, green_directory_local_path) + if os.path.exists(GREEN_DIRECTORY_LOCAL_PATH): + shutil.rmtree(GREEN_DIRECTORY_LOCAL_PATH) + Repo.clone_from(GREEN_DIRECTORY_REPO, GREEN_DIRECTORY_LOCAL_PATH) def dir_entries(): """ Iterator over all data files in the cloned green directory """ - path = os.path.join(green_directory_local_path, green_direcory_data_path) + path = os.path.join(GREEN_DIRECTORY_LOCAL_PATH, GREEN_DIRECTORY_DATA_PATH) for root, _, files in os.walk(path): for fname in files: @@ -80,14 +167,14 @@ def repr_entry(entry): Return string representation of a directory entry, for logging/debugging purposes """ - r = entry['type'] + ret = entry['type'] if 'level' in entry: - r += "/" + entry['level'] + ret += "/" + entry['level'] if 'state' in entry: - r += "/" + entry['state'] + ret += "/" + entry['state'] if 'district' in entry: - r += "/" + entry['district'] - return r + ret += "/" + entry['district'] + return ret def derive_test_hostnames(hostname): @@ -117,24 +204,25 @@ def reduce_urls(urllist): that either don't work or lead somewhere else """ targets = set() - for u in urllist: - if u['error'] is not None: + for url in urllist: + if url['error'] is not None: continue - if u['redirects_to'] is not None: - targets.add(u['redirects_to']) + if url['redirects_to'] is not None: + targets.add(url['redirects_to']) else: - targets.add(u['url']) + 
targets.add(url['url']) return sorted(list(targets)) -def normalize_title(s): +def normalize_title(title): """ Removes garbage from HTML page titles """ - s = s.replace('\u00a0', ' ') - s = s.replace(' ', ' ') - s = s.strip() - return s + title = title.replace(u'\u00a0', ' ') + title = title.replace(' ', ' ') + title = title.strip() + return title + def download_icon(icon_url): """ @@ -150,10 +238,10 @@ def download_icon(icon_url): } # Download the icon - r = requests.get(icon_url) - r.raise_for_status() + req = requests.get(icon_url) + req.raise_for_status() - content_hash = hashlib.md5(r.content).hexdigest() + content_hash = hashlib.md5(req.content).hexdigest() extension = "" file_name = os.path.basename(icon_url)[-1] @@ -161,24 +249,25 @@ def download_icon(icon_url): ext = file_name.split(".")[-1] if ext != "": extension = ext - + if extension == "": # derive from content type - t = r.headers.get('content-type') + ctype = req.headers.get('content-type') try: - extension = default_endings[t] + extension = default_endings[ctype] except KeyError: - logging.error("No file ending defined for icon type '%s'" % t) + logging.error("No file ending defined for icon type '%s'", ctype) return None - + filename = content_hash + "." + extension.lower() - path = siteicons_path + os.path.sep + filename + path = SITEICONS_PATH + os.path.sep + filename with open(path, 'wb') as iconfile: - iconfile.write(r.content) + iconfile.write(req.content) return filename + def check_responsiveness(url): """ Checks @@ -193,9 +282,9 @@ def check_responsiveness(url): # sizes we check for (width, height) sizes = ( - (320,480), # old smartphone - (768,1024), # older tablet or newer smartphone - (1024,768), # older desktop or horiz. tablet + (320, 480), # old smartphone + (768, 1024), # older tablet or newer smartphone + (1024, 768), # older desktop or horiz. 
tablet
         (1920, 1080), # Full HD horizontal
     )
 
@@ -218,7 +307,8 @@
     return details
 
 
-def check_content(r):
+
+def check_content(req):
     """
     Adds details to check regarding content of the page
 
@@ -227,10 +317,10 @@
     """
     result = {}
 
-    result['encoding'] = r.encoding.lower()
-    soup = BeautifulSoup(r.text, 'html.parser')
+    result['encoding'] = req.encoding.lower()
+    soup = BeautifulSoup(req.text, 'html.parser')
 
-    result['html'] = r.text
+    result['html'] = req.text
 
     # page title
     result['title'] = None
@@ -245,47 +335,47 @@
     result['canonical_link'] = None
     link = soup.find('link', rel='canonical')
     if link:
-        result['canonical_link'] = urljoin(r.url, link.get('href'))
+        result['canonical_link'] = urljoin(req.url, link.get('href'))
 
     # icon
     result['icon'] = None
-    link = soup.find('link', rel=lambda x: x and x.lower()=='icon')
+    link = soup.find('link', rel=lambda x: x and x.lower() == 'icon')
     if link:
-        result['icon'] = urljoin(r.url, link.get('href'))
+        result['icon'] = urljoin(req.url, link.get('href'))
     else:
-        link = soup.find('link', rel=lambda x: x and x.lower()=='shortcut icon')
+        link = soup.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
         if link:
-            result['icon'] = urljoin(r.url, link.get('href'))
+            result['icon'] = urljoin(req.url, link.get('href'))
 
     # feed links
     result['feeds'] = []
     rss_links = soup.find_all('link', type='application/rss+xml')
     atom_links = soup.find_all('link', type='application/atom+xml')
-    if len(rss_links) > 0:
-        for l in rss_links:
-            result['feeds'].append(urljoin(r.url, l.get('href')))
-    if len(atom_links) > 0:
-        for l in rss_links:
-            result['feeds'].append(urljoin(r.url, l.get('href')))
+    if rss_links:
+        for link in rss_links:
+            result['feeds'].append(urljoin(req.url, link.get('href')))
+    if atom_links:
+        for link in atom_links:
+            result['feeds'].append(urljoin(req.url, link.get('href')))
 
     # generator meta tag
     result['generator'] = None
     if head is not None:
         generator = head.select('[name=generator]')
-        if len(generator):
+        if generator:
             result['generator'] = generator[0].get('content')
 
     # opengraph meta tags
     result['opengraph'] = None
-    og = set()
+    opengraph = set()
     if head is not None:
         for item in head.find_all(property=re.compile('^og:')):
-            og.add(item.get('property'))
+            opengraph.add(item.get('property'))
         for item in head.find_all(itemprop=re.compile('^og:')):
-            og.add(item.get('itemprop'))
-        if len(og):
-            result['opengraph'] = sorted(list(og))
+            opengraph.add(item.get('itemprop'))
+        if opengraph:
+            result['opengraph'] = sorted(list(opengraph))
 
     return result
 
 
@@ -298,8 +388,8 @@
     for item in hostname_dict.values():
         if 'ip_addresses' not in item:
             continue
-        for ip in item['ip_addresses']:
-            ips.add(ip)
+        for ip_addr in item['ip_addresses']:
+            ips.add(ip_addr)
     return sorted(list(ips))
 
 
@@ -310,11 +400,11 @@
     generator = generator.lower()
    if 'typo3' in generator:
         return "typo3"
-    elif 'wordpress' in generator:
+    if 'wordpress' in generator:
        return "wordpress"
-    elif 'drupal' in generator:
+    if 'drupal' in generator:
         return "drupal"
-    elif 'joomla' in generator:
+    if 'joomla' in generator:
         return "joomla"
     return generator
 
@@ -328,7 +418,9 @@ def check_site(entry):
     4. Run full check on canonical URL
     """
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) ' +
+                      'AppleWebKit/537.36 (KHTML, like Gecko) ' +
+                      'Chrome/65.0.3325.181 green-spider/0.1'
     }
 
     # all the info we'll return for the site
@@ -337,12 +429,13 @@
         'input_url': entry['url'],
         # Meta: Regional and type metadata for the site
         'meta': {
-            'level': entry['level'],
-            'state': entry['state'],
-            'district': entry['district'],
-            'city': entry['city'],
+            'level': entry.get('level'),
+            'state': entry.get('state'),
+            'district': entry.get('district'),
+            'city': entry.get('city'),
         },
-        # Details: All details we collected about the site (which aren't directly related to the report criteria)
+        # Details: All details we collected about the site (which aren't directly
+        # related to the report criteria)
         'details': {
             'hostnames': {},
             'ipv4_addresses': [],
@@ -375,18 +468,18 @@
 
     # try to resolve hostnames
     processed_hostnames = {}
-    for hn in hostnames:
+    for hostname in hostnames:
 
-        processed_hostnames[hn] = {
+        processed_hostnames[hostname] = {
             'resolvable': False,
         }
 
         try:
-            hostname, aliases, ip_addresses = gethostbyname_ex(hn)
-            processed_hostnames[hn]['resolvable'] = True
-            processed_hostnames[hn]['resolved_hostname'] = hostname
-            processed_hostnames[hn]['aliases'] = aliases
-            processed_hostnames[hn]['ip_addresses'] = ip_addresses
+            resolved_hostname, aliases, ip_addresses = gethostbyname_ex(hostname)
+            processed_hostnames[hostname]['resolvable'] = True
+            processed_hostnames[hostname]['resolved_hostname'] = resolved_hostname
+            processed_hostnames[hostname]['aliases'] = aliases
+            processed_hostnames[hostname]['ip_addresses'] = ip_addresses
         except:
             pass
 
@@ -398,9 +491,9 @@
     checked_urls = []
     checked_urls_set = set()
 
-    for hn in processed_hostnames.keys():
+    for hostname in processed_hostnames.keys():
 
-        item = processed_hostnames[hn]
+        item = processed_hostnames[hostname]
 
         if not item['resolvable']:
             continue
@@ -421,18 +514,19 @@
             }
 
             try:
-                r = requests.head(record['url'], headers=headers, allow_redirects=True)
-                if r.url == url:
-                    logging.info("URL: %s - status %s" % (record['url'], r.status_code))
+                req = requests.head(record['url'], headers=headers, allow_redirects=True)
+                if req.url == url:
+                    logging.info("URL: %s - status %s", record['url'], req.status_code)
                 else:
-                    logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
-                    record['redirects_to'] = r.url
-            except Exception as e:
+                    logging.info("URL: %s - status %s - redirects to %s", record['url'],
+                                 req.status_code, req.url)
+                    record['redirects_to'] = req.url
+            except Exception as exc:
                 record['error'] = {
-                    'type': str(type(e)),
-                    'message': str(e),
+                    'type': str(type(exc)),
+                    'message': str(exc),
                 }
-                logging.info("URL %s: %s %s" % (url, str(type(e)), e))
+                logging.info("URL %s: %s %s", url, str(type(exc)), exc)
 
             checked_urls.append(record)
 
@@ -442,7 +536,7 @@
     # Deeper test for the remaining (canonical) URL(s)
     for check_url in result['details']['canonical_urls']:
 
-        logging.info("Downloading URL %s" % check_url)
+        logging.info("Downloading URL %s", check_url)
 
         check = {
             'url': check_url,
@@ -454,37 +548,38 @@
         }
 
        try:
-            r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
-            check['status_code'] = r.status_code
-            
check['duration'] = round(r.elapsed.microseconds / 1000) + req = requests.get(check_url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT)) + check['status_code'] = req.status_code + check['duration'] = round(req.elapsed.microseconds / 1000) # Content checks - if r.status_code < 300: - check['content'] = check_content(r) + if req.status_code < 300: + check['content'] = check_content(req) # Responsiveness check try: check['responsive'] = check_responsiveness(check_url) - except Exception as e: - logging.error("Error when checking responsiveness for '%s': %s" % (check_url, e)) + except Exception as exc: + logging.error("Error when checking responsiveness for '%s': %s", check_url, exc) - except requests.exceptions.ConnectionError as e: - logging.error(str(e) + " " + check_url) + except requests.exceptions.ConnectionError as exc: + logging.error(str(exc) + " " + check_url) check['error'] = "connection" - except requests.exceptions.ReadTimeout as e: - logging.error(str(e) + " " + check_url) + except requests.exceptions.ReadTimeout as exc: + logging.error(str(exc) + " " + check_url) check['error'] = "read_timeout" - except requests.exceptions.Timeout as e: - logging.error(str(e) + " " + check_url) + except requests.exceptions.Timeout as exc: + logging.error(str(exc) + " " + check_url) check['error'] = "connection_timeout" - except Exception as e: - logging.error(str(e) + " " + check_url) + except Exception as exc: + logging.error(str(exc) + " " + check_url) check['error'] = "unknown" result['details']['urlchecks'].append(check) - result['details']['urlchecks'] = sorted(result['details']['urlchecks'], key=lambda url: url['url']) + result['details']['urlchecks'] = sorted(result['details']['urlchecks'], + key=lambda url: url['url']) # collect icons icons = set() @@ -492,24 +587,24 @@ def check_site(entry): if 'content' not in c: continue if c['content'] is None: - logging.warning("No content for %s" % entry['url']) + logging.warning("No content for %s", entry['url']) continue if c['content']['icon'] is not None: icons.add(c['content']['icon']) downloaded_icons = set() for icon_url in icons: - logging.info("Getting icon %s" % icon_url) + logging.info("Getting icon %s", icon_url) try: downloaded_icons.add(download_icon(icon_url)) except Exception as e: - logging.error("Could not download icon: %s" % e) + logging.error("Could not download icon: %s", e) result['details']['icons'] = sorted(list(downloaded_icons)) # collect feeds feeds = set() for c in result['details']['urlchecks']: if c['content'] is None: - logging.warning("No content for %s" % entry['url']) + logging.warning("No content for %s", entry['url']) continue if 'feeds' in c['content'] and len(c['content']['feeds']): for feed in c['content']['feeds']: @@ -543,7 +638,7 @@ def check_site(entry): result['details']['cms'] = parse_generator(c['content']['generator']) # Qualify certain CMS flavours in more detail if result['details']['cms'] == "typo3": - if gcms_ip in result['details']['ipv4_addresses']: + if GCMS_IP in result['details']['ipv4_addresses']: result['details']['cms'] = "typo3-gcms" elif 'typo3-gruene.de' in c['content']['html']: result['details']['cms'] = "typo3-gruene" @@ -555,7 +650,8 @@ def check_site(entry): # No generator Tag. Use HTML content. 
if 'Urwahl3000' in c['content']['html']: result['details']['cms'] = "wordpress-urwahl" - elif 'josephknowsbest' in c['content']['html'] or 'Joseph-knows-best' in c['content']['html']: + elif ('josephknowsbest' in c['content']['html'] or + 'Joseph-knows-best' in c['content']['html']): result['details']['cms'] = "wordpress-josephknowsbest" elif 'wordpress' in c['content']['html']: result['details']['cms'] = "wordpress" @@ -567,7 +663,7 @@ def check_site(entry): ### Derive criteria # DNS_RESOLVABLE_IPV4 - if len(result['details']['ipv4_addresses']): + if result['details']['ipv4_addresses']: result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1} # SITE_REACHABLE @@ -584,8 +680,8 @@ def check_site(entry): # WWW_OPTIONAL num_hostnames = 0 - for hn in result['details']['hostnames'].keys(): - item = result['details']['hostnames'][hn] + for hostname in result['details']['hostnames'].keys(): + item = result['details']['hostnames'][hostname] if not item['resolvable']: continue num_hostnames += 1 @@ -600,20 +696,20 @@ def check_site(entry): else: links = set() if result['details']['urlchecks'] is None: - logging.warning("No urlchecks for %s" % entry['url']) + logging.warning("No urlchecks for %s", entry['url']) else: for item in result['details']['urlchecks']: - if item['content']['canonical_link'] is not None: + if item['content'] is not None and item['content']['canonical_link'] is not None: links.add(item['content']['canonical_link']) if len(links) == 1: result['result']['CANONICAL_URL'] = {'value': True, 'score': 1} # FAVICON - if len(result['details']['icons']): + if result['details']['icons']: result['result']['FAVICON'] = {'value': True, 'score': 1} # FEEDS - if len(result['details']['feeds']): + if result['details']['feeds']: result['result']['FEEDS'] = {'value': True, 'score': 1} # HTTP_RESPONSE_DURATION @@ -621,17 +717,18 @@ def check_site(entry): for item in result['details']['urlchecks']: if item['error'] is None: durations.append(item['duration']) - val = round(statistics.mean(durations)) - result['result']['HTTP_RESPONSE_DURATION']['value'] = val - if val < 100: - result['result']['HTTP_RESPONSE_DURATION']['score'] = 1 - elif val < 1000: - result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5 + if durations: + val = round(statistics.mean(durations)) + result['result']['HTTP_RESPONSE_DURATION']['value'] = val + if val < 100: + result['result']['HTTP_RESPONSE_DURATION']['score'] = 1 + elif val < 1000: + result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5 # RESPONSIVE if result['details']['responsive'] is not None: if (result['details']['responsive']['min_width'] < 500 and - len(result['details']['responsive']['viewport_meta_tag']) > 0): + len(result['details']['responsive']['viewport_meta_tag']) > 0): result['result']['RESPONSIVE']['value'] = True result['result']['RESPONSIVE']['score'] = 1 @@ -649,87 +746,91 @@ def check_site(entry): return result -def main(): +def get_job_from_queue(): """ - Bringing it all together + Returns a URL from the queue """ - logging.basicConfig(level=logging.INFO) - logging.getLogger("urllib3").setLevel(logging.CRITICAL) + out = None - # refresh our local clone of the green directory - get_green_directory() + with DATASTORE_CLIENT.transaction(): + query = DATASTORE_CLIENT.query(kind=JOB_DATASTORE_KIND) + for entity in query.fetch(limit=1): + logging.debug("Got job: %s", entity) + out = dict(entity) + out["url"] = entity.key.name + DATASTORE_CLIENT.delete(entity.key) - # build the list of website URLs to run checks for - 
logging.info("Processing green-directory")
-    input_entries = []
+    return out
 
-    for entry in dir_entries():
+def work_of_queue():
+    """
+    Take job from queue and finish it until there are no more jobs
+    """
+    while True:
+        job = get_job_from_queue()
+        if job is None:
+            logging.info("No more jobs. Exiting.")
+            break
 
-        if 'type' not in entry:
-            logging.error("Entry without type")
-            continue
-        if 'urls' not in entry:
-            logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
-            continue
+        logging.info("Starting job %s", job["url"])
+        result = check_site(entry=job)
+        #logging.debug(result)
+        logging.info("Job %s finished checks", job["url"])
+        logging.info("Job %s writing to DB", job["url"])
 
-        website_url = None
-        for n in range(len(entry['urls'])):
-            try:
-                if entry['urls'][n]['type'] == "WEBSITE":
-                    website_url = entry['urls'][n]['url']
-                    if website_url:
-                        input_entries.append({
-                            "url": website_url,
-                            "level": entry.get("level"),
-                            "state": entry.get("state"),
-                            "district": entry.get("district"),
-                            "city": entry.get("city"),
-                        })
-            except NameError:
-                logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))
-
-
-    # randomize order, to distribute requests over servers
-    logging.info("Shuffling input URLs")
-    random.seed()
-    random.shuffle(input_entries)
-
-    # run checks
-    logging.info("Starting checks")
-    results = {}
-
-    pool = Pool(concurrency)
-    for ientry in input_entries:
-        logging.info("Submitting %s to job pool" % ientry['url'])
-        results[ientry['url']] = pool.apply_async(check_site, kwds={'entry': ientry})
-    pool.close()
-    pool.join()
-
-    logging.info("Checks are finished")
-
-    # Restructure result from dict of ApplyResult
-    # to list of dicts and sort in stable way
-    json_result = []
-    done = set()
-
-    logging.info("Restructuring results")
-
-    # convert results from ApplyResult to dict
-    for url in sorted(results.keys()):
-        if url not in done:
-            logging.info("Getting result for %s" % url)
-            try:
-                resultsitem = results[url].get()
-                json_result.append(resultsitem)
-            except Exception as e:
-                logging.error("Error getting result for '%s': %s" % (url, e))
-            done.add(url)
-
-    # Write result as JSON
-    output_filename = os.path.join(result_path, "spider_result.json")
-    with open(output_filename, 'w', encoding="utf8") as jsonfile:
-        json.dump(json_result, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
+        key = DATASTORE_CLIENT.key(RESULTS_DATASTORE_KIND, job["url"])
+        entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
+        record = {
+            "created": datetime.utcnow(),
+            "results": result,
+        }
+        entity.update(record)
+        try:
+            DATASTORE_CLIENT.put(entity)
+        except InvalidArgument as ex:
+            logging.error("Could not write result: %s", ex)
+        except Exception as ex:
+            logging.error("Could not write result: %s", ex)
 
 if __name__ == "__main__":
-    main()
+    """
+    Bringing it all together
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--credentials-path', dest='credentials_path',
+                        help='Path to the service account credentials JSON file',
+                        default='/secrets/service-account.json')
+    parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
+                        default='info')
+
+    subparsers = parser.add_subparsers(help='sub-command help', dest='command')
+
+    subparsers.add_parser('spider', help='Take jobs off the queue and spider')
+
+    jobs_parser = subparsers.add_parser('jobs', help='Create jobs for the queue')
+
+    jobs_parser.add_argument('--url', help='Add a job to spider a URL')
+    args = parser.parse_args()
+
+    loglevel 
= args.loglevel.lower() + if loglevel == 'error': + logging.basicConfig(level=logging.ERROR) + elif loglevel == 'warn': + logging.basicConfig(level=logging.WARN) + elif loglevel == 'debug': + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + loglevel = 'info' + + logging.getLogger("urllib3").setLevel(logging.CRITICAL) + + DATASTORE_CLIENT = datastore.Client.from_service_account_json(args.credentials_path) + + logging.debug("Called command %s", args.command) + + if args.command == 'jobs': + create_jobs() + else: + work_of_queue()