From 88ec1f63f71d9f7e8c1e9258968ecdc7cf14fc54 Mon Sep 17 00:00:00 2001 From: Marian Steinbach Date: Thu, 23 Aug 2018 09:36:33 +0200 Subject: [PATCH] Big spider.py overhaul --- spider.py | 531 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 316 insertions(+), 215 deletions(-) diff --git a/spider.py b/spider.py index 3b33732..93f376c 100644 --- a/spider.py +++ b/spider.py @@ -1,68 +1,155 @@ -# coding: utf8 +""" +Provides the spider functionality (website checks). +""" -from bs4 import BeautifulSoup -from git import Repo -from multiprocessing import Pool -from selenium import webdriver -from socket import gethostbyname_ex -from urllib.parse import urljoin -from urllib.parse import urlparse -import certifi +import argparse import hashlib import json import logging import os import random import re -import requests import shutil import statistics -import sys +from datetime import datetime +from socket import gethostbyname_ex +from urllib.parse import urljoin +from urllib.parse import urlparse + +import requests import yaml +from bs4 import BeautifulSoup +from git import Repo +from selenium import webdriver +from google.cloud import datastore +from google.api_core.exceptions import InvalidArgument + # configuration -# number of parallel processes to use for crawling -concurrency = 1 - # connection timeout for website checks (seconds) -connect_timeout = 5 +CONNECT_TIMEOUT = 5 # response timeout for website checks -read_timeout = 10 +READ_TIMEOUT = 10 # Git repo for our data -green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git' +GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git' # folder in that repo that holds the data -green_direcory_data_path = 'data/countries/de' -green_directory_local_path = './cache/green-directory' +GREEN_DIRECTORY_DATA_PATH = 'data/countries/de' +GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory' -result_path = '/out' +RESULT_PATH = '/out' -siteicons_path = '/icons' +SITEICONS_PATH = '/icons' # IP address of the newthinking GCMS server -gcms_ip = "91.102.13.20" +GCMS_IP = "91.102.13.20" + +JOB_DATASTORE_KIND = 'spider-jobs' +RESULTS_DATASTORE_KIND = 'spider-results' # end configuration +DATASTORE_CLIENT = None + + +def chunks(the_list, size): + """ + Yield successive n-sized chunks from list the_list + where n = size. 
+ """ + for i in range(0, len(the_list), size): + yield the_list[i:i + size] + + +def create_jobs(): + """ + Read all URLs from green directory and fill a job database + with one job per URL + """ + + # refresh our local clone of the green directory + logging.info("Refreshing green-directory clone") + get_green_directory() + + # build the list of website URLs to run checks for + logging.info("Processing green-directory") + input_entries = [] + + for entry in dir_entries(): + + if 'type' not in entry: + logging.error("Entry without type") + continue + if 'urls' not in entry: + logging.debug("Entry %s does not have any URLs.", repr_entry(entry)) + continue + + website_url = None + for index in range(len(entry['urls'])): + try: + if entry['urls'][index]['type'] == "WEBSITE": + website_url = entry['urls'][index]['url'] + if website_url: + input_entries.append({ + "url": website_url, + "level": entry.get("level"), + "state": entry.get("state"), + "district": entry.get("district"), + "city": entry.get("city"), + }) + except NameError: + logging.error("Error in %s: 'url' key missing (%s)", + repr_entry(entry), entry['urls'][index]) + + # randomize order, to distribute requests over servers + logging.debug("Shuffling input URLs") + random.seed() + random.shuffle(input_entries) + + count = 0 + logging.info("Writing jobs") + + entities = [] + + for entry in input_entries: + key = DATASTORE_CLIENT.key(JOB_DATASTORE_KIND, entry["url"]) + entity = datastore.Entity(key=key) + entity.update({ + "created": datetime.utcnow(), + "level": entry["level"], + "state": entry["state"], + "district": entry["district"], + "city": entry["city"], + }) + entities.append(entity) + + # commmit to DB + for chunk in chunks(entities, 300): + logging.debug("Writing jobs chunk of length %d", len(chunk)) + DATASTORE_CLIENT.put_multi(chunk) + count += len(chunk) + + logging.info("Writing jobs done, %s jobs added", count) + def get_green_directory(): """ Clones the source of website URLs, the green directory, into the local file system using git """ - if os.path.exists(green_directory_local_path): - shutil.rmtree(green_directory_local_path) - Repo.clone_from(green_directory_repo, green_directory_local_path) + if os.path.exists(GREEN_DIRECTORY_LOCAL_PATH): + shutil.rmtree(GREEN_DIRECTORY_LOCAL_PATH) + Repo.clone_from(GREEN_DIRECTORY_REPO, GREEN_DIRECTORY_LOCAL_PATH) def dir_entries(): """ Iterator over all data files in the cloned green directory """ - path = os.path.join(green_directory_local_path, green_direcory_data_path) + path = os.path.join(GREEN_DIRECTORY_LOCAL_PATH, GREEN_DIRECTORY_DATA_PATH) for root, _, files in os.walk(path): for fname in files: @@ -80,14 +167,14 @@ def repr_entry(entry): Return string representation of a directory entry, for logging/debugging purposes """ - r = entry['type'] + ret = entry['type'] if 'level' in entry: - r += "/" + entry['level'] + ret += "/" + entry['level'] if 'state' in entry: - r += "/" + entry['state'] + ret += "/" + entry['state'] if 'district' in entry: - r += "/" + entry['district'] - return r + ret += "/" + entry['district'] + return ret def derive_test_hostnames(hostname): @@ -117,24 +204,25 @@ def reduce_urls(urllist): that either don't work or lead somewhere else """ targets = set() - for u in urllist: - if u['error'] is not None: + for url in urllist: + if url['error'] is not None: continue - if u['redirects_to'] is not None: - targets.add(u['redirects_to']) + if url['redirects_to'] is not None: + targets.add(url['redirects_to']) else: - targets.add(u['url']) + 
targets.add(url['url']) return sorted(list(targets)) -def normalize_title(s): +def normalize_title(title): """ Removes garbage from HTML page titles """ - s = s.replace('\u00a0', ' ') - s = s.replace(' ', ' ') - s = s.strip() - return s + title = title.replace(u'\u00a0', ' ') + title = title.replace(' ', ' ') + title = title.strip() + return title + def download_icon(icon_url): """ @@ -150,10 +238,10 @@ def download_icon(icon_url): } # Download the icon - r = requests.get(icon_url) - r.raise_for_status() + req = requests.get(icon_url) + req.raise_for_status() - content_hash = hashlib.md5(r.content).hexdigest() + content_hash = hashlib.md5(req.content).hexdigest() extension = "" file_name = os.path.basename(icon_url)[-1] @@ -161,24 +249,25 @@ def download_icon(icon_url): ext = file_name.split(".")[-1] if ext != "": extension = ext - + if extension == "": # derive from content type - t = r.headers.get('content-type') + ctype = req.headers.get('content-type') try: - extension = default_endings[t] + extension = default_endings[ctype] except KeyError: - logging.error("No file ending defined for icon type '%s'" % t) + logging.error("No file ending defined for icon type '%s'", ctype) return None - + filename = content_hash + "." + extension.lower() - path = siteicons_path + os.path.sep + filename + path = SITEICONS_PATH + os.path.sep + filename with open(path, 'wb') as iconfile: - iconfile.write(r.content) + iconfile.write(req.content) return filename + def check_responsiveness(url): """ Checks @@ -193,9 +282,9 @@ def check_responsiveness(url): # sizes we check for (width, height) sizes = ( - (320,480), # old smartphone - (768,1024), # older tablet or newer smartphone - (1024,768), # older desktop or horiz. tablet + (320, 480), # old smartphone + (768, 1024), # older tablet or newer smartphone + (1024, 768), # older desktop or horiz. 
tablet
         (1920, 1080), # Full HD horizontal
     )
 
@@ -218,7 +307,8 @@
     return details
 
 
-def check_content(r):
+
+def check_content(req):
     """
     Adds details to check regarding content of the page
 
@@ -227,10 +317,10 @@
     """
     result = {}
 
-    result['encoding'] = r.encoding.lower()
-    soup = BeautifulSoup(r.text, 'html.parser')
+    result['encoding'] = req.encoding.lower()
+    soup = BeautifulSoup(req.text, 'html.parser')
 
-    result['html'] = r.text
+    result['html'] = req.text
 
     # page title
     result['title'] = None
@@ -245,47 +335,47 @@
     result['canonical_link'] = None
     link = soup.find('link', rel='canonical')
     if link:
-        result['canonical_link'] = urljoin(r.url, link.get('href'))
+        result['canonical_link'] = urljoin(req.url, link.get('href'))
 
     # icon
     result['icon'] = None
-    link = soup.find('link', rel=lambda x: x and x.lower()=='icon')
+    link = soup.find('link', rel=lambda x: x and x.lower() == 'icon')
     if link:
-        result['icon'] = urljoin(r.url, link.get('href'))
+        result['icon'] = urljoin(req.url, link.get('href'))
     else:
-        link = soup.find('link', rel=lambda x: x and x.lower()=='shortcut icon')
+        link = soup.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
         if link:
-            result['icon'] = urljoin(r.url, link.get('href'))
+            result['icon'] = urljoin(req.url, link.get('href'))
 
     # feed links
     result['feeds'] = []
     rss_links = soup.find_all('link', type='application/rss+xml')
     atom_links = soup.find_all('link', type='application/atom+xml')
-    if len(rss_links) > 0:
-        for l in rss_links:
-            result['feeds'].append(urljoin(r.url, l.get('href')))
-    if len(atom_links) > 0:
-        for l in rss_links:
-            result['feeds'].append(urljoin(r.url, l.get('href')))
+    if rss_links:
+        for link in rss_links:
+            result['feeds'].append(urljoin(req.url, link.get('href')))
+    if atom_links:
+        for link in atom_links:
+            result['feeds'].append(urljoin(req.url, link.get('href')))
 
     # generator meta tag
     result['generator'] = None
     if head is not None:
         generator = head.select('[name=generator]')
-        if len(generator):
+        if generator:
             result['generator'] = generator[0].get('content')
 
     # opengraph meta tags
     result['opengraph'] = None
-    og = set()
+    opengraph = set()
     if head is not None:
         for item in head.find_all(property=re.compile('^og:')):
-            og.add(item.get('property'))
+            opengraph.add(item.get('property'))
         for item in head.find_all(itemprop=re.compile('^og:')):
-            og.add(item.get('itemprop'))
-        if len(og):
-            result['opengraph'] = sorted(list(og))
+            opengraph.add(item.get('itemprop'))
+        if opengraph:
+            result['opengraph'] = sorted(list(opengraph))
 
     return result
 
 
@@ -298,8 +388,8 @@
     for item in hostname_dict.values():
         if 'ip_addresses' not in item:
             continue
-        for ip in item['ip_addresses']:
-            ips.add(ip)
+        for ip_addr in item['ip_addresses']:
+            ips.add(ip_addr)
     return sorted(list(ips))
 
 
@@ -310,11 +400,11 @@
     generator = generator.lower()
    if 'typo3' in generator:
         return "typo3"
-    elif 'wordpress' in generator:
+    if 'wordpress' in generator:
        return "wordpress"
-    elif 'drupal' in generator:
+    if 'drupal' in generator:
         return "drupal"
-    elif 'joomla' in generator:
+    if 'joomla' in generator:
         return "joomla"
     return generator
 
@@ -328,7 +418,9 @@ def check_site(entry):
     4. Run full check on canonical URL
     """
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) ' +
+                      'AppleWebKit/537.36 (KHTML, like Gecko) ' +
+                      'Chrome/65.0.3325.181 green-spider/0.1'
     }
 
     # all the info we'll return for the site
@@ -337,12 +429,13 @@
         'input_url': entry['url'],
         # Meta: Regional and type metadata for the site
         'meta': {
-            'level': entry['level'],
-            'state': entry['state'],
-            'district': entry['district'],
-            'city': entry['city'],
+            'level': entry.get('level'),
+            'state': entry.get('state'),
+            'district': entry.get('district'),
+            'city': entry.get('city'),
         },
-        # Details: All details we collected about the site (which aren't directly related to the report criteria)
+        # Details: All details we collected about the site (which aren't directly
+        # related to the report criteria)
         'details': {
             'hostnames': {},
             'ipv4_addresses': [],
@@ -375,18 +468,18 @@
 
     # try to resolve hostnames
     processed_hostnames = {}
-    for hn in hostnames:
+    for hostname in hostnames:
 
-        processed_hostnames[hn] = {
+        processed_hostnames[hostname] = {
             'resolvable': False,
         }
 
         try:
-            hostname, aliases, ip_addresses = gethostbyname_ex(hn)
-            processed_hostnames[hn]['resolvable'] = True
-            processed_hostnames[hn]['resolved_hostname'] = hostname
-            processed_hostnames[hn]['aliases'] = aliases
-            processed_hostnames[hn]['ip_addresses'] = ip_addresses
+            resolved_hostname, aliases, ip_addresses = gethostbyname_ex(hostname)
+            processed_hostnames[hostname]['resolvable'] = True
+            processed_hostnames[hostname]['resolved_hostname'] = resolved_hostname
+            processed_hostnames[hostname]['aliases'] = aliases
+            processed_hostnames[hostname]['ip_addresses'] = ip_addresses
         except:
             pass
 
@@ -398,9 +491,9 @@
     checked_urls = []
     checked_urls_set = set()
 
-    for hn in processed_hostnames.keys():
+    for hostname in processed_hostnames.keys():
 
-        item = processed_hostnames[hn]
+        item = processed_hostnames[hostname]
 
         if not item['resolvable']:
             continue
@@ -421,18 +514,19 @@
             }
 
             try:
-                r = requests.head(record['url'], headers=headers, allow_redirects=True)
-                if r.url == url:
-                    logging.info("URL: %s - status %s" % (record['url'], r.status_code))
+                req = requests.head(record['url'], headers=headers, allow_redirects=True)
+                if req.url == url:
+                    logging.info("URL: %s - status %s", record['url'], req.status_code)
                 else:
-                    logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
-                    record['redirects_to'] = r.url
-            except Exception as e:
+                    logging.info("URL: %s - status %s - redirects to %s", record['url'],
+                                 req.status_code, req.url)
+                    record['redirects_to'] = req.url
+            except Exception as exc:
                 record['error'] = {
-                    'type': str(type(e)),
-                    'message': str(e),
+                    'type': str(type(exc)),
+                    'message': str(exc),
                 }
-                logging.info("URL %s: %s %s" % (url, str(type(e)), e))
+                logging.info("URL %s: %s %s", url, str(type(exc)), exc)
 
             checked_urls.append(record)
 
@@ -442,7 +536,7 @@
     # Deeper test for the remaining (canonical) URL(s)
     for check_url in result['details']['canonical_urls']:
 
-        logging.info("Downloading URL %s" % check_url)
+        logging.info("Downloading URL %s", check_url)
 
         check = {
             'url': check_url,
@@ -454,37 +548,38 @@
         }
 
        try:
-            r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
-            check['status_code'] = r.status_code
-            
check['duration'] = round(r.elapsed.microseconds / 1000) + req = requests.get(check_url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT)) + check['status_code'] = req.status_code + check['duration'] = round(req.elapsed.microseconds / 1000) # Content checks - if r.status_code < 300: - check['content'] = check_content(r) + if req.status_code < 300: + check['content'] = check_content(req) # Responsiveness check try: check['responsive'] = check_responsiveness(check_url) - except Exception as e: - logging.error("Error when checking responsiveness for '%s': %s" % (check_url, e)) + except Exception as exc: + logging.error("Error when checking responsiveness for '%s': %s", check_url, exc) - except requests.exceptions.ConnectionError as e: - logging.error(str(e) + " " + check_url) + except requests.exceptions.ConnectionError as exc: + logging.error(str(exc) + " " + check_url) check['error'] = "connection" - except requests.exceptions.ReadTimeout as e: - logging.error(str(e) + " " + check_url) + except requests.exceptions.ReadTimeout as exc: + logging.error(str(exc) + " " + check_url) check['error'] = "read_timeout" - except requests.exceptions.Timeout as e: - logging.error(str(e) + " " + check_url) + except requests.exceptions.Timeout as exc: + logging.error(str(exc) + " " + check_url) check['error'] = "connection_timeout" - except Exception as e: - logging.error(str(e) + " " + check_url) + except Exception as exc: + logging.error(str(exc) + " " + check_url) check['error'] = "unknown" result['details']['urlchecks'].append(check) - result['details']['urlchecks'] = sorted(result['details']['urlchecks'], key=lambda url: url['url']) + result['details']['urlchecks'] = sorted(result['details']['urlchecks'], + key=lambda url: url['url']) # collect icons icons = set() @@ -492,24 +587,24 @@ def check_site(entry): if 'content' not in c: continue if c['content'] is None: - logging.warning("No content for %s" % entry['url']) + logging.warning("No content for %s", entry['url']) continue if c['content']['icon'] is not None: icons.add(c['content']['icon']) downloaded_icons = set() for icon_url in icons: - logging.info("Getting icon %s" % icon_url) + logging.info("Getting icon %s", icon_url) try: downloaded_icons.add(download_icon(icon_url)) except Exception as e: - logging.error("Could not download icon: %s" % e) + logging.error("Could not download icon: %s", e) result['details']['icons'] = sorted(list(downloaded_icons)) # collect feeds feeds = set() for c in result['details']['urlchecks']: if c['content'] is None: - logging.warning("No content for %s" % entry['url']) + logging.warning("No content for %s", entry['url']) continue if 'feeds' in c['content'] and len(c['content']['feeds']): for feed in c['content']['feeds']: @@ -543,7 +638,7 @@ def check_site(entry): result['details']['cms'] = parse_generator(c['content']['generator']) # Qualify certain CMS flavours in more detail if result['details']['cms'] == "typo3": - if gcms_ip in result['details']['ipv4_addresses']: + if GCMS_IP in result['details']['ipv4_addresses']: result['details']['cms'] = "typo3-gcms" elif 'typo3-gruene.de' in c['content']['html']: result['details']['cms'] = "typo3-gruene" @@ -555,7 +650,8 @@ def check_site(entry): # No generator Tag. Use HTML content. 
if 'Urwahl3000' in c['content']['html']: result['details']['cms'] = "wordpress-urwahl" - elif 'josephknowsbest' in c['content']['html'] or 'Joseph-knows-best' in c['content']['html']: + elif ('josephknowsbest' in c['content']['html'] or + 'Joseph-knows-best' in c['content']['html']): result['details']['cms'] = "wordpress-josephknowsbest" elif 'wordpress' in c['content']['html']: result['details']['cms'] = "wordpress" @@ -567,7 +663,7 @@ def check_site(entry): ### Derive criteria # DNS_RESOLVABLE_IPV4 - if len(result['details']['ipv4_addresses']): + if result['details']['ipv4_addresses']: result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1} # SITE_REACHABLE @@ -584,8 +680,8 @@ def check_site(entry): # WWW_OPTIONAL num_hostnames = 0 - for hn in result['details']['hostnames'].keys(): - item = result['details']['hostnames'][hn] + for hostname in result['details']['hostnames'].keys(): + item = result['details']['hostnames'][hostname] if not item['resolvable']: continue num_hostnames += 1 @@ -600,20 +696,20 @@ def check_site(entry): else: links = set() if result['details']['urlchecks'] is None: - logging.warning("No urlchecks for %s" % entry['url']) + logging.warning("No urlchecks for %s", entry['url']) else: for item in result['details']['urlchecks']: - if item['content']['canonical_link'] is not None: + if item['content'] is not None and item['content']['canonical_link'] is not None: links.add(item['content']['canonical_link']) if len(links) == 1: result['result']['CANONICAL_URL'] = {'value': True, 'score': 1} # FAVICON - if len(result['details']['icons']): + if result['details']['icons']: result['result']['FAVICON'] = {'value': True, 'score': 1} # FEEDS - if len(result['details']['feeds']): + if result['details']['feeds']: result['result']['FEEDS'] = {'value': True, 'score': 1} # HTTP_RESPONSE_DURATION @@ -621,17 +717,18 @@ def check_site(entry): for item in result['details']['urlchecks']: if item['error'] is None: durations.append(item['duration']) - val = round(statistics.mean(durations)) - result['result']['HTTP_RESPONSE_DURATION']['value'] = val - if val < 100: - result['result']['HTTP_RESPONSE_DURATION']['score'] = 1 - elif val < 1000: - result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5 + if durations: + val = round(statistics.mean(durations)) + result['result']['HTTP_RESPONSE_DURATION']['value'] = val + if val < 100: + result['result']['HTTP_RESPONSE_DURATION']['score'] = 1 + elif val < 1000: + result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5 # RESPONSIVE if result['details']['responsive'] is not None: if (result['details']['responsive']['min_width'] < 500 and - len(result['details']['responsive']['viewport_meta_tag']) > 0): + len(result['details']['responsive']['viewport_meta_tag']) > 0): result['result']['RESPONSIVE']['value'] = True result['result']['RESPONSIVE']['score'] = 1 @@ -649,87 +746,91 @@ def check_site(entry): return result -def main(): +def get_job_from_queue(): """ - Bringing it all together + Returns a URL from the queue """ - logging.basicConfig(level=logging.INFO) - logging.getLogger("urllib3").setLevel(logging.CRITICAL) + out = None - # refresh our local clone of the green directory - get_green_directory() + with DATASTORE_CLIENT.transaction(): + query = DATASTORE_CLIENT.query(kind=JOB_DATASTORE_KIND) + for entity in query.fetch(limit=1): + logging.debug("Got job: %s", entity) + out = dict(entity) + out["url"] = entity.key.name + DATASTORE_CLIENT.delete(entity.key) - # build the list of website URLs to run checks for - 
logging.info("Processing green-directory")
-    input_entries = []
+    return out
 
-    for entry in dir_entries():
+def work_of_queue():
+    """
+    Take job from queue and finish it until there are no more jobs
+    """
+    while True:
+        job = get_job_from_queue()
+        if job is None:
+            logging.info("No more jobs. Exiting.")
+            break
 
-        if 'type' not in entry:
-            logging.error("Entry without type")
-            continue
-        if 'urls' not in entry:
-            logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
-            continue
+        logging.info("Starting job %s", job["url"])
+        result = check_site(entry=job)
+        #logging.debug(result)
+        logging.info("Job %s finished checks", job["url"])
+        logging.info("Job %s writing to DB", job["url"])
 
-        website_url = None
-        for n in range(len(entry['urls'])):
-            try:
-                if entry['urls'][n]['type'] == "WEBSITE":
-                    website_url = entry['urls'][n]['url']
-                    if website_url:
-                        input_entries.append({
-                            "url": website_url,
-                            "level": entry.get("level"),
-                            "state": entry.get("state"),
-                            "district": entry.get("district"),
-                            "city": entry.get("city"),
-                        })
-            except NameError:
-                logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))
-
-
-    # randomize order, to distribute requests over servers
-    logging.info("Shuffling input URLs")
-    random.seed()
-    random.shuffle(input_entries)
-
-    # run checks
-    logging.info("Starting checks")
-    results = {}
-
-    pool = Pool(concurrency)
-    for ientry in input_entries:
-        logging.info("Submitting %s to job pool" % ientry['url'])
-        results[ientry['url']] = pool.apply_async(check_site, kwds={'entry': ientry})
-    pool.close()
-    pool.join()
-
-    logging.info("Checks are finished")
-
-    # Restructure result from dict of ApplyResult
-    # to list of dicts and sort in stable way
-    json_result = []
-    done = set()
-
-    logging.info("Restructuring results")
-
-    # convert results from ApplyResult to dict
-    for url in sorted(results.keys()):
-        if url not in done:
-            logging.info("Getting result for %s" % url)
-            try:
-                resultsitem = results[url].get()
-                json_result.append(resultsitem)
-            except Exception as e:
-                logging.error("Error getting result for '%s': %s" % (url, e))
-            done.add(url)
-
-    # Write result as JSON
-    output_filename = os.path.join(result_path, "spider_result.json")
-    with open(output_filename, 'w', encoding="utf8") as jsonfile:
-        json.dump(json_result, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
+        key = DATASTORE_CLIENT.key(RESULTS_DATASTORE_KIND, job["url"])
+        entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
+        record = {
+            "created": datetime.utcnow(),
+            "results": result,
+        }
+        entity.update(record)
+        try:
+            DATASTORE_CLIENT.put(entity)
+        except InvalidArgument as ex:
+            logging.error("Could not write result: %s", ex)
+        except Exception as ex:
+            logging.error("Could not write result: %s", ex)
 
 if __name__ == "__main__":
-    main()
+    """
+    Bringing it all together
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--credentials-path', dest='credentials_path',
+                        help='Path to the service account credentials JSON file',
+                        default='/secrets/service-account.json')
+    parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
+                        default='info')
+
+    subparsers = parser.add_subparsers(help='sub-command help', dest='command')
+
+    subparsers.add_parser('spider', help='Take jobs off the queue and spider')
+
+    jobs_parser = subparsers.add_parser('jobs', help='Create jobs for the queue')
+
+    jobs_parser.add_argument('--url', help='Add a job to spider a URL')
+    args = parser.parse_args()
+
+    loglevel 
= args.loglevel.lower() + if loglevel == 'error': + logging.basicConfig(level=logging.ERROR) + elif loglevel == 'warn': + logging.basicConfig(level=logging.WARN) + elif loglevel == 'debug': + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + loglevel = 'info' + + logging.getLogger("urllib3").setLevel(logging.CRITICAL) + + DATASTORE_CLIENT = datastore.Client.from_service_account_json(args.credentials_path) + + logging.debug("Called command %s", args.command) + + if args.command == 'jobs': + create_jobs() + else: + work_of_queue()