Big spider.py overhaul

2024-04-29 07:44:50 +02:00 · 2018-08-23 09:36:33 +02:00 · 2018-08-23 09:36:33 +02:00 · 88ec1f63f7
parent b61172445b
commit 88ec1f63f7
1 changed files with 316 additions and 215 deletions
--- a/spider.py
+++ b/spider.py
@ -1,68 +1,155 @@
-# coding: utf8
+"""
 Provides the spider functionality (website checks).
 """
-from bs4 import BeautifulSoup
+import argparse
 from git import Repo
 from multiprocessing import Pool
 from selenium import webdriver
 from socket import gethostbyname_ex
 from urllib.parse import urljoin
 from urllib.parse import urlparse
 import certifi
 import hashlib
 import json
 import logging
 import os
 import random
 import re
 import requests
 import shutil
 import statistics
-import sys
+from datetime import datetime
 from socket import gethostbyname_ex
 from urllib.parse import urljoin
 from urllib.parse import urlparse
 import requests
 import yaml
 from bs4 import BeautifulSoup
 from git import Repo
 from selenium import webdriver
 from google.cloud import datastore
 from google.api_core.exceptions import InvalidArgument
 # configuration
 # number of parallel processes to use for crawling
 concurrency = 1
 # connection timeout for website checks (seconds)
-connect_timeout = 5
+CONNECT_TIMEOUT = 5
 # response timeout for website checks
-read_timeout = 10
+READ_TIMEOUT = 10
 # Git repo for our data
-green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
+GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git'
 # folder in that repo that holds the data
-green_direcory_data_path = 'data/countries/de'
+GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
-green_directory_local_path = './cache/green-directory'
+GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
-result_path = '/out'
+RESULT_PATH = '/out'
-siteicons_path = '/icons'
+SITEICONS_PATH = '/icons'
 # IP address of the newthinking GCMS server
-gcms_ip = "91.102.13.20"
+GCMS_IP = "91.102.13.20"
 JOB_DATASTORE_KIND = 'spider-jobs'
 RESULTS_DATASTORE_KIND = 'spider-results'
 # end configuration
 DATASTORE_CLIENT = None
 def chunks(the_list, size):
    """
    Yield successive n-sized chunks from list the_list
    where n = size.
    """
    for i in range(0, len(the_list), size):
        yield the_list[i:i + size]
 def create_jobs():
    """
    Read all URLs from green directory and fill a job database
    with one job per URL
    """
    # refresh our local clone of the green directory
    logging.info("Refreshing green-directory clone")
    get_green_directory()
    # build the list of website URLs to run checks for
    logging.info("Processing green-directory")
    input_entries = []
    for entry in dir_entries():
        if 'type' not in entry:
            logging.error("Entry without type")
            continue
        if 'urls' not in entry:
            logging.debug("Entry %s does not have any URLs.", repr_entry(entry))
            continue
        website_url = None
        for index in range(len(entry['urls'])):
            try:
                if entry['urls'][index]['type'] == "WEBSITE":
                    website_url = entry['urls'][index]['url']
                    if website_url:
                        input_entries.append({
                            "url": website_url,
                            "level": entry.get("level"),
                            "state": entry.get("state"),
                            "district": entry.get("district"),
                            "city": entry.get("city"),
                        })
            except NameError:
                logging.error("Error in %s: 'url' key missing (%s)",
                              repr_entry(entry), entry['urls'][index])
    # randomize order, to distribute requests over servers
    logging.debug("Shuffling input URLs")
    random.seed()
    random.shuffle(input_entries)
    count = 0
    logging.info("Writing jobs")
    entities = []
    for entry in input_entries:
        key = DATASTORE_CLIENT.key(JOB_DATASTORE_KIND, entry["url"])
        entity = datastore.Entity(key=key)
        entity.update({
            "created": datetime.utcnow(),
            "level": entry["level"],
            "state": entry["state"],
            "district": entry["district"],
            "city": entry["city"],
        })
        entities.append(entity)
    # commmit to DB
    for chunk in chunks(entities, 300):
        logging.debug("Writing jobs chunk of length %d", len(chunk))
        DATASTORE_CLIENT.put_multi(chunk)
        count += len(chunk)
    logging.info("Writing jobs done, %s jobs added", count)
 def get_green_directory():
    """
    Clones the source of website URLs, the green directory,
    into the local file system using git
    """
-    if os.path.exists(green_directory_local_path):
+    if os.path.exists(GREEN_DIRECTORY_LOCAL_PATH):
-        shutil.rmtree(green_directory_local_path)
+        shutil.rmtree(GREEN_DIRECTORY_LOCAL_PATH)
-    Repo.clone_from(green_directory_repo, green_directory_local_path)
+    Repo.clone_from(GREEN_DIRECTORY_REPO, GREEN_DIRECTORY_LOCAL_PATH)
 def dir_entries():
    """
    Iterator over all data files in the cloned green directory
    """
-    path = os.path.join(green_directory_local_path, green_direcory_data_path)
+    path = os.path.join(GREEN_DIRECTORY_LOCAL_PATH, GREEN_DIRECTORY_DATA_PATH)
    for root, _, files in os.walk(path):
        for fname in files:
@ -80,14 +167,14 @@ def repr_entry(entry):
    Return string representation of a directory entry,
    for logging/debugging purposes
    """
-    r = entry['type']
+    ret = entry['type']
    if 'level' in entry:
-        r += "/" + entry['level']
+        ret += "/" + entry['level']
    if 'state' in entry:
-        r += "/" + entry['state']
+        ret += "/" + entry['state']
    if 'district' in entry:
-        r += "/" + entry['district']
+        ret += "/" + entry['district']
-    return r
+    return ret
 def derive_test_hostnames(hostname):
@ -117,24 +204,25 @@ def reduce_urls(urllist):
    that either don't work or lead somewhere else
    """
    targets = set()
-    for u in urllist:
+    for url in urllist:
-        if u['error'] is not None:
+        if url['error'] is not None:
            continue
-        if u['redirects_to'] is not None:
+        if url['redirects_to'] is not None:
-            targets.add(u['redirects_to'])
+            targets.add(url['redirects_to'])
        else:
-            targets.add(u['url'])
+            targets.add(url['url'])
    return sorted(list(targets))
-def normalize_title(s):
+def normalize_title(title):
    """
    Removes garbage from HTML page titles
    """
-    s = s.replace('\u00a0', ' ')
+    title = title.replace(u'\u00a0', ' ')
-    s = s.replace('  ', ' ')
+    title = title.replace('  ', ' ')
-    s = s.strip()
+    title = title.strip()
-    return s
+    return title
 def download_icon(icon_url):
    """
@ -150,10 +238,10 @@ def download_icon(icon_url):
    }
    # Download the icon
-    r = requests.get(icon_url)
+    req = requests.get(icon_url)
-    r.raise_for_status()
+    req.raise_for_status()
-    content_hash = hashlib.md5(r.content).hexdigest()
+    content_hash = hashlib.md5(req.content).hexdigest()
    extension = ""
    file_name = os.path.basename(icon_url)[-1]
@ -161,24 +249,25 @@ def download_icon(icon_url):
        ext = file_name.split(".")[-1]
        if ext != "":
            extension = ext
-    
+
    if extension == "":
        # derive from content type
-        t = r.headers.get('content-type')
+        ctype = req.headers.get('content-type')
        try:
-            extension = default_endings[t]
+            extension = default_endings[ctype]
        except KeyError:
-            logging.error("No file ending defined for icon type '%s'" % t)
+            logging.error("No file ending defined for icon type '%s'", ctype)
            return None
-    
+
    filename = content_hash + "." + extension.lower()
-    path = siteicons_path + os.path.sep + filename
+    path = SITEICONS_PATH + os.path.sep + filename
    with open(path, 'wb') as iconfile:
-        iconfile.write(r.content)
+        iconfile.write(req.content)
    return filename
 def check_responsiveness(url):
    """
    Checks
@ -193,9 +282,9 @@ def check_responsiveness(url):
    # sizes we check for (width, height)
    sizes = (
-        (320,480), # old smartphone
+        (320, 480), # old smartphone
-        (768,1024), # older tablet or newer smartphone
+        (768, 1024), # older tablet or newer smartphone
-        (1024,768), # older desktop or horiz. tablet
+        (1024, 768), # older desktop or horiz. tablet
        (1920, 1080), # Full HD horizontal
    )
@ -218,7 +307,8 @@ def check_responsiveness(url):
    return details
-def check_content(r):
+
 def check_content(req):
    """
    Adds details to check regarding content of the page
@ -227,10 +317,10 @@ def check_content(r):
    """
    result = {}
-    result['encoding'] = r.encoding.lower()
+    result['encoding'] = req.encoding.lower()
-    soup = BeautifulSoup(r.text, 'html.parser')
+    soup = BeautifulSoup(req.text, 'html.parser')
-    result['html'] = r.text
+    result['html'] = req.text
    # page title
    result['title'] = None
@ -245,47 +335,47 @@ def check_content(r):
    result['canonical_link'] = None
    link = soup.find('link', rel='canonical')
    if link:
-        result['canonical_link'] = urljoin(r.url, link.get('href'))
+        result['canonical_link'] = urljoin(req.url, link.get('href'))
    # icon
    result['icon'] = None
-    link = soup.find('link', rel=lambda x: x and x.lower()=='icon')
+    link = soup.find('link', rel=lambda x: x and x.lower() == 'icon')
    if link:
-        result['icon'] = urljoin(r.url, link.get('href'))
+        result['icon'] = urljoin(req.url, link.get('href'))
    else:
-        link = soup.find('link', rel=lambda x: x and x.lower()=='shortcut icon')
+        link = soup.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
        if link:
-            result['icon'] = urljoin(r.url, link.get('href'))
+            result['icon'] = urljoin(req.url, link.get('href'))
    # feed links
    result['feeds'] = []
    rss_links = soup.find_all('link', type='application/rss+xml')
    atom_links = soup.find_all('link', type='application/atom+xml')
-    if len(rss_links) > 0:
+    if rss_links:
-        for l in rss_links:
+        for link in rss_links:
-            result['feeds'].append(urljoin(r.url, l.get('href')))
+            result['feeds'].append(urljoin(req.url, link.get('href')))
-    if len(atom_links) > 0:
+    if atom_links:
-        for l in rss_links:
+        for link in rss_links:
-            result['feeds'].append(urljoin(r.url, l.get('href')))
+            result['feeds'].append(urljoin(req.url, link.get('href')))
    # generator meta tag
    result['generator'] = None
    if head is not None:
        generator = head.select('[name=generator]')
-        if len(generator):
+        if generator:
            result['generator'] = generator[0].get('content')
    # opengraph meta tags
    result['opengraph'] = None
-    og = set()
+    opengraph = set()
    if head is not None:
        for item in head.find_all(property=re.compile('^og:')):
-            og.add(item.get('property'))
+            opengraph.add(item.get('property'))
        for item in head.find_all(itemprop=re.compile('^og:')):
-            og.add(item.get('itemprop'))
+            opengraph.add(item.get('itemprop'))
-        if len(og):
+        if opengraph:
-            result['opengraph'] = sorted(list(og))
+            result['opengraph'] = sorted(list(opengraph))
    return result
@ -298,8 +388,8 @@ def collect_ipv4_addresses(hostname_dict):
    for item in hostname_dict.values():
        if 'ip_addresses' not in item:
            continue
-        for ip in item['ip_addresses']:
+        for ip_addr in item['ip_addresses']:
-            ips.add(ip)
+            ips.add(ip_addr)
    return sorted(list(ips))
@ -310,11 +400,11 @@ def parse_generator(generator):
    generator = generator.lower()
    if 'typo3' in generator:
        return "typo3"
-    elif 'wordpress' in generator:
+    if 'wordpress' in generator:
        return "wordpress"
-    elif 'drupal' in generator:
+    if 'drupal' in generator:
        return "drupal"
-    elif 'joomla' in generator:
+    if 'joomla' in generator:
        return "joomla"
    return generator
@ -328,7 +418,9 @@ def check_site(entry):
    4. Run full check on canonical URL
    """
    headers = {
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) ' +
                      'AppleWebKit/537.36 (KHTML, like Gecko) ' +
                      'Chrome/65.0.3325.181 green-spider/0.1'
    }
    # all the info we'll return for the site
@ -337,12 +429,13 @@ def check_site(entry):
        'input_url': entry['url'],
        # Meta: Regional and type metadata for the site
        'meta': {
-            'level': entry['level'],
+            'level': entry.get('level'),
-            'state': entry['state'],
+            'state': entry.get('state'),
-            'district': entry['district'],
+            'district': entry.get('district'),
-            'city': entry['city'],
+            'city': entry.get('city'),
        },
-        # Details: All details we collected about the site (which aren't directly related to the report criteria)
+        # Details: All details we collected about the site (which aren't directly
        # related to the report criteria)
        'details': {
            'hostnames': {},
            'ipv4_addresses': [],
@ -375,18 +468,18 @@ def check_site(entry):
    # try to resolve hostnames
    processed_hostnames = {}
-    for hn in hostnames:
+    for hostname in hostnames:
-        processed_hostnames[hn] = {
+        processed_hostnames[hostname] = {
            'resolvable': False,
        }
        try:
-            hostname, aliases, ip_addresses = gethostbyname_ex(hn)
+            hostname, aliases, ip_addresses = gethostbyname_ex(hostname)
-            processed_hostnames[hn]['resolvable'] = True
+            processed_hostnames[hostname]['resolvable'] = True
-            processed_hostnames[hn]['resolved_hostname'] = hostname
+            processed_hostnames[hostname]['resolved_hostname'] = hostname
-            processed_hostnames[hn]['aliases'] = aliases
+            processed_hostnames[hostname]['aliases'] = aliases
-            processed_hostnames[hn]['ip_addresses'] = ip_addresses
+            processed_hostnames[hostname]['ip_addresses'] = ip_addresses
        except:
            pass
@ -398,9 +491,9 @@ def check_site(entry):
    checked_urls = []
    checked_urls_set = set()
-    for hn in processed_hostnames.keys():
+    for hostname in processed_hostnames.keys():
-        item = processed_hostnames[hn]
+        item = processed_hostnames[hostname]
        if not item['resolvable']:
            continue
@ -421,18 +514,19 @@ def check_site(entry):
            }
            try:
-                r = requests.head(record['url'], headers=headers, allow_redirects=True)
+                req = requests.head(record['url'], headers=headers, allow_redirects=True)
-                if r.url == url:
+                if req.url == url:
-                    logging.info("URL: %s - status %s" % (record['url'], r.status_code))
+                    logging.info("URL: %s - status %s", record['url'], req.status_code)
                else:
-                    logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
+                    logging.info("URL: %s - status %s - redirects to %s", record['url'],
-                    record['redirects_to'] = r.url
+                                 req.status_code, req.url)
-            except Exception as e:
+                    record['redirects_to'] = req.url
            except Exception as exc:
                record['error'] = {
-                    'type': str(type(e)),
+                    'type': str(type(exc)),
-                    'message': str(e),
+                    'message': str(exc),
                }
-                logging.info("URL %s: %s %s" % (url, str(type(e)), e))
+                logging.info("URL %s: %s %s", url, str(type(exc)), exc)
            checked_urls.append(record)
@ -442,7 +536,7 @@ def check_site(entry):
    # Deeper test for the remaining (canonical) URL(s)
    for check_url in result['details']['canonical_urls']:
-        logging.info("Downloading URL %s" % check_url)
+        logging.info("Downloading URL %s", check_url)
        check = {
            'url': check_url,
@ -454,37 +548,38 @@ def check_site(entry):
        }
        try:
-            r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
+            req = requests.get(check_url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
-            check['status_code'] = r.status_code
+            check['status_code'] = req.status_code
-            check['duration'] = round(r.elapsed.microseconds / 1000)
+            check['duration'] = round(req.elapsed.microseconds / 1000)
            # Content checks
-            if r.status_code < 300:
+            if req.status_code < 300:
-                check['content'] = check_content(r)
+                check['content'] = check_content(req)
            # Responsiveness check
            try:
                check['responsive'] = check_responsiveness(check_url)
-            except Exception as e:
+            except Exception as exc:
-                logging.error("Error when checking responsiveness for '%s': %s" % (check_url, e))
+                logging.error("Error when checking responsiveness for '%s': %s", check_url, exc)
-        except requests.exceptions.ConnectionError as e:
+        except requests.exceptions.ConnectionError as exc:
-            logging.error(str(e) + " " + check_url)
+            logging.error(str(exc) + " " + check_url)
            check['error'] = "connection"
-        except requests.exceptions.ReadTimeout as e:
+        except requests.exceptions.ReadTimeout as exc:
-            logging.error(str(e) + " " + check_url)
+            logging.error(str(exc) + " " + check_url)
            check['error'] = "read_timeout"
-        except requests.exceptions.Timeout as e:
+        except requests.exceptions.Timeout as exc:
-            logging.error(str(e) + " " + check_url)
+            logging.error(str(exc) + " " + check_url)
            check['error'] = "connection_timeout"
-        except Exception as e:
+        except Exception as exc:
-            logging.error(str(e) + " " + check_url)
+            logging.error(str(exc) + " " + check_url)
            check['error'] = "unknown"
        result['details']['urlchecks'].append(check)
-    result['details']['urlchecks'] = sorted(result['details']['urlchecks'], key=lambda url: url['url'])
+    result['details']['urlchecks'] = sorted(result['details']['urlchecks'],
                                            key=lambda url: url['url'])
    # collect icons
    icons = set()
@ -492,24 +587,24 @@ def check_site(entry):
        if 'content' not in c:
            continue
        if c['content'] is None:
-            logging.warning("No content for %s" % entry['url'])
+            logging.warning("No content for %s", entry['url'])
            continue
        if c['content']['icon'] is not None:
            icons.add(c['content']['icon'])
    downloaded_icons = set()
    for icon_url in icons:
-        logging.info("Getting icon %s" % icon_url)
+        logging.info("Getting icon %s", icon_url)
        try:
            downloaded_icons.add(download_icon(icon_url))
        except Exception as e:
-            logging.error("Could not download icon: %s" % e)
+            logging.error("Could not download icon: %s", e)
    result['details']['icons'] = sorted(list(downloaded_icons))
    # collect feeds
    feeds = set()
    for c in result['details']['urlchecks']:
        if c['content'] is None:
-            logging.warning("No content for %s" % entry['url'])
+            logging.warning("No content for %s", entry['url'])
            continue
        if 'feeds' in c['content'] and len(c['content']['feeds']):
            for feed in c['content']['feeds']:
@ -543,7 +638,7 @@ def check_site(entry):
            result['details']['cms'] = parse_generator(c['content']['generator'])
            # Qualify certain CMS flavours in more detail
            if result['details']['cms'] == "typo3":
-                if gcms_ip in result['details']['ipv4_addresses']:
+                if GCMS_IP in result['details']['ipv4_addresses']:
                    result['details']['cms'] = "typo3-gcms"
                elif 'typo3-gruene.de' in c['content']['html']:
                    result['details']['cms'] = "typo3-gruene"
@ -555,7 +650,8 @@ def check_site(entry):
            # No generator Tag. Use HTML content.
            if 'Urwahl3000' in c['content']['html']:
                result['details']['cms'] = "wordpress-urwahl"
-            elif 'josephknowsbest' in c['content']['html'] or 'Joseph-knows-best' in c['content']['html']:
+            elif ('josephknowsbest' in c['content']['html'] or
                  'Joseph-knows-best' in c['content']['html']):
                result['details']['cms'] = "wordpress-josephknowsbest"
            elif 'wordpress' in c['content']['html']:
                result['details']['cms'] = "wordpress"
@ -567,7 +663,7 @@ def check_site(entry):
    ### Derive criteria
    # DNS_RESOLVABLE_IPV4
-    if len(result['details']['ipv4_addresses']):
+    if result['details']['ipv4_addresses']:
        result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1}
    # SITE_REACHABLE
@ -584,8 +680,8 @@ def check_site(entry):
    # WWW_OPTIONAL
    num_hostnames = 0
-    for hn in result['details']['hostnames'].keys():
+    for hostname in result['details']['hostnames'].keys():
-        item = result['details']['hostnames'][hn]
+        item = result['details']['hostnames'][hostname]
        if not item['resolvable']:
            continue
        num_hostnames += 1
@ -600,20 +696,20 @@ def check_site(entry):
    else:
        links = set()
        if result['details']['urlchecks'] is None:
-            logging.warning("No urlchecks for %s" % entry['url'])
+            logging.warning("No urlchecks for %s", entry['url'])
        else:
            for item in result['details']['urlchecks']:
-                if item['content']['canonical_link'] is not None:
+                if item['content'] is not None and item['content']['canonical_link'] is not None:
                    links.add(item['content']['canonical_link'])
        if len(links) == 1:
            result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
    # FAVICON
-    if len(result['details']['icons']):
+    if result['details']['icons']:
        result['result']['FAVICON'] = {'value': True, 'score': 1}
    # FEEDS
-    if len(result['details']['feeds']):
+    if result['details']['feeds']:
        result['result']['FEEDS'] = {'value': True, 'score': 1}
    # HTTP_RESPONSE_DURATION
@ -621,17 +717,18 @@ def check_site(entry):
    for item in result['details']['urlchecks']:
        if item['error'] is None:
            durations.append(item['duration'])
-    val = round(statistics.mean(durations))
+    if durations:
-    result['result']['HTTP_RESPONSE_DURATION']['value'] = val
+        val = round(statistics.mean(durations))
-    if val < 100:
+        result['result']['HTTP_RESPONSE_DURATION']['value'] = val
-        result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
+        if val < 100:
-    elif val < 1000:
+            result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
-        result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
+        elif val < 1000:
            result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
    # RESPONSIVE
    if result['details']['responsive'] is not None:
        if (result['details']['responsive']['min_width'] < 500 and
-            len(result['details']['responsive']['viewport_meta_tag']) > 0):
+                len(result['details']['responsive']['viewport_meta_tag']) > 0):
            result['result']['RESPONSIVE']['value'] = True
            result['result']['RESPONSIVE']['score'] = 1
@ -649,87 +746,91 @@ def check_site(entry):
    return result
-def main():
+def get_job_from_queue():
    """
-    Bringing it all together
+    Returns a URL from the queue
    """
-    logging.basicConfig(level=logging.INFO)
+    out = None
    logging.getLogger("urllib3").setLevel(logging.CRITICAL)
-    # refresh our local clone of the green directory
+    with DATASTORE_CLIENT.transaction():
-    get_green_directory()
+        query = DATASTORE_CLIENT.query(kind=JOB_DATASTORE_KIND)
        for entity in query.fetch(limit=1):
            logging.debug("Got job: %s", entity)
            out = dict(entity)
            out["url"] = entity.key.name
            DATASTORE_CLIENT.delete(entity.key)
-    # build the list of website URLs to run checks for
+    return out
    logging.info("Processing green-directory")
    input_entries = []
-    for entry in dir_entries():
+def work_of_queue():
    """
    Take job from queue and finish it until there are no more jobs
    """
    while True:
        job = get_job_from_queue()
        if job is None:
            logging.info("No more jobs. Exiting.")
            break
-        if 'type' not in entry:
+        logging.info("Starting job %s", job["url"])
-            logging.error("Entry without type")
+        result = check_site(entry=job)
-            continue
+        #logging.debug(result)
-        if 'urls' not in entry:
+        logging.info("Job %s finished checks", job["url"])
-            logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
+        logging.info("Job %s writing to DB", job["url"])
            continue
-        website_url = None
+        key = DATASTORE_CLIENT.key(RESULTS_DATASTORE_KIND, job["url"])
-        for n in range(len(entry['urls'])):
+        entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
-            try:
+        record = {
-                if entry['urls'][n]['type'] == "WEBSITE":
+            "created": datetime.utcnow(),
-                    website_url = entry['urls'][n]['url']
+            "results": result,
-                    if website_url:
+        }
-                        input_entries.append({
+        entity.update(record)
-                            "url": website_url,
+        try:
-                            "level": entry.get("level"),
+            DATASTORE_CLIENT.put(entity)
-                            "state": entry.get("state"),
+        except InvalidArgument as ex:
-                            "district": entry.get("district"),
+            logging.error("Could not write result: %s", ex)
-                            "city": entry.get("city"),
+        except ex:
-                        })
+            logging.error("Could not write result: %s", ex)
            except NameError:
                logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))
    # randomize order, to distribute requests over servers
    logging.info("Shuffling input URLs")
    random.seed()
    random.shuffle(input_entries)
    # run checks
    logging.info("Starting checks")
    results = {}
    pool = Pool(concurrency)
    for ientry in input_entries:
        logging.info("Submitting %s to job pool" % ientry['url'])
        results[ientry['url']] = pool.apply_async(check_site, kwds={'entry': ientry})
    pool.close()
    pool.join()
    logging.info("Checks are finished")
    # Restructure result from dict of ApplyResult
    # to list of dicts and sort in stable way
    json_result = []
    done = set()
    logging.info("Restructuring results")
    # convert results from ApplyResult to dict
    for url in sorted(results.keys()):
        if url not in done:
            logging.info("Getting result for %s" % url)
            try:
                resultsitem = results[url].get()
                json_result.append(resultsitem)
            except Exception as e:
                logging.error("Error getting result for '%s': %s" % (url, e))
        done.add(url)
    # Write result as JSON
    output_filename = os.path.join(result_path, "spider_result.json")
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(json_result, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
 if __name__ == "__main__":
-    main()
+    """
    Bringing it all together
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--credentials-path', dest='credentials_path',
                        help='Path to the service account credentials JSON file',
                        default='/secrets/service-account.json')
    parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
                        default='info')
    subparsers = parser.add_subparsers(help='sub-command help', dest='command')
    subparsers.add_parser('spider', help='Take jobs off the queue and spider')
    jobs_parser = subparsers.add_parser('jobs', help='Create jobs for the queue')
    jobs_parser.add_argument('--url', help='Add a job to spider a URL')
    args = parser.parse_args()
    loglevel = args.loglevel.lower()
    if loglevel == 'error':
        logging.basicConfig(level=logging.ERROR)
    elif loglevel == 'warn':
        logging.basicConfig(level=logging.WARN)
    elif loglevel == 'debug':
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
        loglevel = 'info'
    logging.getLogger("urllib3").setLevel(logging.CRITICAL)
    DATASTORE_CLIENT = datastore.Client.from_service_account_json(args.credentials_path)
    logging.debug("Called command %s", args.command)
    if args.command == 'jobs':
        create_jobs()
    else:
        work_of_queue()