Big spider.py overhaul

Marian Steinbach 2018-08-23 09:36:33 +02:00
parent b61172445b
commit 88ec1f63f7
1 changed file with 316 additions and 215 deletions

spider.py

@@ -1,68 +1,155 @@
# coding: utf8
"""
Provides the spider functionality (website checks).
"""
from bs4 import BeautifulSoup
from git import Repo
from multiprocessing import Pool
from selenium import webdriver
from socket import gethostbyname_ex
from urllib.parse import urljoin
from urllib.parse import urlparse
import certifi
import argparse
import hashlib
import json
import logging
import os
import random
import re
import requests
import shutil
import statistics
import sys
from datetime import datetime
from socket import gethostbyname_ex
from urllib.parse import urljoin
from urllib.parse import urlparse
import requests
import yaml
from bs4 import BeautifulSoup
from git import Repo
from selenium import webdriver
from google.cloud import datastore
from google.api_core.exceptions import InvalidArgument
# configuration
# number of parallel processes to use for crawling
concurrency = 1
# connection timeout for website checks (seconds)
connect_timeout = 5
CONNECT_TIMEOUT = 5
# response timeout for website checks
read_timeout = 10
READ_TIMEOUT = 10
# Git repo for our data
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git'
# folder in that repo that holds the data
green_direcory_data_path = 'data/countries/de'
green_directory_local_path = './cache/green-directory'
GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
result_path = '/out'
RESULT_PATH = '/out'
siteicons_path = '/icons'
SITEICONS_PATH = '/icons'
# IP address of the newthinking GCMS server
gcms_ip = "91.102.13.20"
GCMS_IP = "91.102.13.20"
JOB_DATASTORE_KIND = 'spider-jobs'
RESULTS_DATASTORE_KIND = 'spider-results'
# end configuration
DATASTORE_CLIENT = None
def chunks(the_list, size):
"""
Yield successive n-sized chunks from list the_list
where n = size.
"""
for i in range(0, len(the_list), size):
yield the_list[i:i + size]
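# Illustrative usage sketch (added for clarity, not part of this commit):
# create_jobs() below feeds chunks() with the list of job entities so that
# put_multi() receives at most 300 entities per call.
def example_batched_write(entities, client):
    # hypothetical helper, mirrors the write loop in create_jobs()
    for batch in chunks(entities, 300):
        client.put_multi(batch)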
def create_jobs():
"""
Read all URLs from green directory and fill a job database
with one job per URL
"""
# refresh our local clone of the green directory
logging.info("Refreshing green-directory clone")
get_green_directory()
# build the list of website URLs to run checks for
logging.info("Processing green-directory")
input_entries = []
for entry in dir_entries():
if 'type' not in entry:
logging.error("Entry without type")
continue
if 'urls' not in entry:
logging.debug("Entry %s does not have any URLs.", repr_entry(entry))
continue
website_url = None
for index in range(len(entry['urls'])):
try:
if entry['urls'][index]['type'] == "WEBSITE":
website_url = entry['urls'][index]['url']
if website_url:
input_entries.append({
"url": website_url,
"level": entry.get("level"),
"state": entry.get("state"),
"district": entry.get("district"),
"city": entry.get("city"),
})
except NameError:
logging.error("Error in %s: 'url' key missing (%s)",
repr_entry(entry), entry['urls'][index])
# randomize order, to distribute requests over servers
logging.debug("Shuffling input URLs")
random.seed()
random.shuffle(input_entries)
count = 0
logging.info("Writing jobs")
entities = []
for entry in input_entries:
key = DATASTORE_CLIENT.key(JOB_DATASTORE_KIND, entry["url"])
entity = datastore.Entity(key=key)
entity.update({
"created": datetime.utcnow(),
"level": entry["level"],
"state": entry["state"],
"district": entry["district"],
"city": entry["city"],
})
entities.append(entity)
# commmit to DB
for chunk in chunks(entities, 300):
logging.debug("Writing jobs chunk of length %d", len(chunk))
DATASTORE_CLIENT.put_multi(chunk)
count += len(chunk)
logging.info("Writing jobs done, %s jobs added", count)
def get_green_directory():
"""
Clones the source of website URLs, the green directory,
into the local file system using git
"""
if os.path.exists(green_directory_local_path):
shutil.rmtree(green_directory_local_path)
Repo.clone_from(green_directory_repo, green_directory_local_path)
if os.path.exists(GREEN_DIRECTORY_LOCAL_PATH):
shutil.rmtree(GREEN_DIRECTORY_LOCAL_PATH)
Repo.clone_from(GREEN_DIRECTORY_REPO, GREEN_DIRECTORY_LOCAL_PATH)
def dir_entries():
"""
Iterator over all data files in the cloned green directory
"""
path = os.path.join(green_directory_local_path, green_direcory_data_path)
path = os.path.join(GREEN_DIRECTORY_LOCAL_PATH, GREEN_DIRECTORY_DATA_PATH)
for root, _, files in os.walk(path):
for fname in files:
@@ -80,14 +167,14 @@ def repr_entry(entry):
Return string representation of a directory entry,
for logging/debugging purposes
"""
r = entry['type']
ret = entry['type']
if 'level' in entry:
r += "/" + entry['level']
ret += "/" + entry['level']
if 'state' in entry:
r += "/" + entry['state']
ret += "/" + entry['state']
if 'district' in entry:
r += "/" + entry['district']
return r
ret += "/" + entry['district']
return ret
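# Illustrative example (added for clarity, not part of this commit); the field
# values below are made up, but the keys match the green-directory entries used above.
def example_repr_entry():
    sample = {'type': 'REGIONAL_CHAPTER', 'level': 'DE:KREISVERBAND', 'state': 'Hamburg'}
    return repr_entry(sample)  # -> 'REGIONAL_CHAPTER/DE:KREISVERBAND/Hamburg'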
def derive_test_hostnames(hostname):
@@ -117,24 +204,25 @@ def reduce_urls(urllist):
that either don't work or lead somewhere else
"""
targets = set()
for u in urllist:
if u['error'] is not None:
for url in urllist:
if url['error'] is not None:
continue
if u['redirects_to'] is not None:
targets.add(u['redirects_to'])
if url['redirects_to'] is not None:
targets.add(url['redirects_to'])
else:
targets.add(u['url'])
targets.add(url['url'])
return sorted(list(targets))
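# Illustrative example (added for clarity, not part of this commit): reduce_urls()
# consumes the `checked_urls` records built in check_site() below, each shaped like
# {'url': ..., 'error': ..., 'redirects_to': ...}. The URLs here are made up.
def example_reduce_urls():
    checked = [
        {'url': 'http://example.de/', 'error': None, 'redirects_to': 'https://example.de/'},
        {'url': 'https://example.de/', 'error': None, 'redirects_to': None},
        {'url': 'http://www.example.de/', 'error': {'type': 'ConnectionError'}, 'redirects_to': None},
    ]
    return reduce_urls(checked)  # -> ['https://example.de/']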
def normalize_title(s):
def normalize_title(title):
"""
Removes garbage from HTML page titles
"""
s = s.replace('\u00a0', ' ')
s = s.replace('  ', ' ')
s = s.strip()
return s
title = title.replace(u'\u00a0', ' ')
title = title.replace('  ', ' ')
title = title.strip()
return title
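# Illustrative example (added for clarity, not part of this commit): non-breaking
# spaces and surrounding whitespace are removed from a page title.
def example_normalize_title():
    return normalize_title('\u00a0Grüne Musterstadt ')  # -> 'Grüne Musterstadt'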
def download_icon(icon_url):
"""
@@ -150,10 +238,10 @@ def download_icon(icon_url):
}
# Download the icon
r = requests.get(icon_url)
r.raise_for_status()
req = requests.get(icon_url)
req.raise_for_status()
content_hash = hashlib.md5(r.content).hexdigest()
content_hash = hashlib.md5(req.content).hexdigest()
extension = ""
file_name = os.path.basename(icon_url)
@@ -161,24 +249,25 @@ def download_icon(icon_url):
ext = file_name.split(".")[-1]
if ext != "":
extension = ext
if extension == "":
# derive from content type
t = r.headers.get('content-type')
ctype = req.headers.get('content-type')
try:
extension = default_endings[t]
extension = default_endings[ctype]
except KeyError:
logging.error("No file ending defined for icon type '%s'" % t)
logging.error("No file ending defined for icon type '%s'", ctype)
return None
filename = content_hash + "." + extension.lower()
path = siteicons_path + os.path.sep + filename
path = SITEICONS_PATH + os.path.sep + filename
with open(path, 'wb') as iconfile:
iconfile.write(r.content)
iconfile.write(req.content)
return filename
def check_responsiveness(url):
"""
Checks
@@ -193,9 +282,9 @@ def check_responsiveness(url):
# sizes we check for (width, height)
sizes = (
(320,480), # old smartphone
(768,1024), # older tablet or newer smartphone
(1024,768), # older desktop or horiz. tablet
(320, 480), # old smartphone
(768, 1024), # older tablet or newer smartphone
(1024, 768), # older desktop or horiz. tablet
(1920, 1080), # Full HD horizontal
)
@@ -218,7 +307,8 @@ def check_responsiveness(url):
return details
def check_content(r):
def check_content(req):
"""
Adds details to check regarding content of the page
@@ -227,10 +317,10 @@ def check_content(r):
"""
result = {}
result['encoding'] = r.encoding.lower()
soup = BeautifulSoup(r.text, 'html.parser')
result['encoding'] = req.encoding.lower()
soup = BeautifulSoup(req.text, 'html.parser')
result['html'] = r.text
result['html'] = req.text
# page title
result['title'] = None
@@ -245,47 +335,47 @@ def check_content(r):
result['canonical_link'] = None
link = soup.find('link', rel='canonical')
if link:
result['canonical_link'] = urljoin(r.url, link.get('href'))
result['canonical_link'] = urljoin(req.url, link.get('href'))
# icon
result['icon'] = None
link = soup.find('link', rel=lambda x: x and x.lower()=='icon')
link = soup.find('link', rel=lambda x: x and x.lower() == 'icon')
if link:
result['icon'] = urljoin(r.url, link.get('href'))
result['icon'] = urljoin(req.url, link.get('href'))
else:
link = soup.find('link', rel=lambda x: x and x.lower()=='shortcut icon')
link = soup.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
if link:
result['icon'] = urljoin(r.url, link.get('href'))
result['icon'] = urljoin(req.url, link.get('href'))
# feed links
result['feeds'] = []
rss_links = soup.find_all('link', type='application/rss+xml')
atom_links = soup.find_all('link', type='application/atom+xml')
if len(rss_links) > 0:
for l in rss_links:
result['feeds'].append(urljoin(r.url, l.get('href')))
if len(atom_links) > 0:
for l in rss_links:
result['feeds'].append(urljoin(r.url, l.get('href')))
if rss_links:
for link in rss_links:
result['feeds'].append(urljoin(req.url, link.get('href')))
if atom_links:
for link in atom_links:
result['feeds'].append(urljoin(req.url, link.get('href')))
# generator meta tag
result['generator'] = None
if head is not None:
generator = head.select('[name=generator]')
if len(generator):
if generator:
result['generator'] = generator[0].get('content')
# opengraph meta tags
result['opengraph'] = None
og = set()
opengraph = set()
if head is not None:
for item in head.find_all(property=re.compile('^og:')):
og.add(item.get('property'))
opengraph.add(item.get('property'))
for item in head.find_all(itemprop=re.compile('^og:')):
og.add(item.get('itemprop'))
if len(og):
result['opengraph'] = sorted(list(og))
opengraph.add(item.get('itemprop'))
if opengraph:
result['opengraph'] = sorted(list(opengraph))
return result
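# Illustrative sketch (added for clarity, not part of this commit): shape of the
# dict returned by check_content() above; all values here are made-up placeholders.
def example_check_content_shape():
    return {
        'encoding': 'utf-8',
        'html': '<!DOCTYPE html>...',
        'title': 'Musterstadt',
        'canonical_link': 'https://example.de/',
        'icon': 'https://example.de/favicon.ico',
        'feeds': ['https://example.de/feed.xml'],
        'generator': 'WordPress 4.9.8',
        'opengraph': ['og:title', 'og:type'],
    }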
@@ -298,8 +388,8 @@ def collect_ipv4_addresses(hostname_dict):
for item in hostname_dict.values():
if 'ip_addresses' not in item:
continue
for ip in item['ip_addresses']:
ips.add(ip)
for ip_addr in item['ip_addresses']:
ips.add(ip_addr)
return sorted(list(ips))
@@ -310,11 +400,11 @@ def parse_generator(generator):
generator = generator.lower()
if 'typo3' in generator:
return "typo3"
elif 'wordpress' in generator:
if 'wordpress' in generator:
return "wordpress"
elif 'drupal' in generator:
if 'drupal' in generator:
return "drupal"
elif 'joomla' in generator:
if 'joomla' in generator:
return "joomla"
return generator
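# Illustrative examples (added for clarity, not part of this commit):
def example_parse_generator():
    assert parse_generator('WordPress 4.9.8') == 'wordpress'
    assert parse_generator('TYPO3 CMS') == 'typo3'
    # unknown generators fall through lower-cased
    assert parse_generator('Hugo 0.46') == 'hugo 0.46'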
@@ -328,7 +418,9 @@ def check_site(entry):
4. Run full check on canonical URL
"""
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) ' +
'AppleWebKit/537.36 (KHTML, like Gecko) ' +
'Chrome/65.0.3325.181 green-spider/0.1'
}
# all the info we'll return for the site
@@ -337,12 +429,13 @@ def check_site(entry):
'input_url': entry['url'],
# Meta: Regional and type metadata for the site
'meta': {
'level': entry['level'],
'state': entry['state'],
'district': entry['district'],
'city': entry['city'],
'level': entry.get('level'),
'state': entry.get('state'),
'district': entry.get('district'),
'city': entry.get('city'),
},
# Details: All details we collected about the site (which aren't directly related to the report criteria)
# Details: All details we collected about the site (which aren't directly
# related to the report criteria)
'details': {
'hostnames': {},
'ipv4_addresses': [],
@@ -375,18 +468,18 @@ def check_site(entry):
# try to resolve hostnames
processed_hostnames = {}
for hn in hostnames:
for hostname in hostnames:
processed_hostnames[hn] = {
processed_hostnames[hostname] = {
'resolvable': False,
}
try:
hostname, aliases, ip_addresses = gethostbyname_ex(hn)
processed_hostnames[hn]['resolvable'] = True
processed_hostnames[hn]['resolved_hostname'] = hostname
processed_hostnames[hn]['aliases'] = aliases
processed_hostnames[hn]['ip_addresses'] = ip_addresses
hostname, aliases, ip_addresses = gethostbyname_ex(hostname)
processed_hostnames[hostname]['resolvable'] = True
processed_hostnames[hostname]['resolved_hostname'] = hostname
processed_hostnames[hostname]['aliases'] = aliases
processed_hostnames[hostname]['ip_addresses'] = ip_addresses
except:
pass
@@ -398,9 +491,9 @@ def check_site(entry):
checked_urls = []
checked_urls_set = set()
for hn in processed_hostnames.keys():
for hostname in processed_hostnames.keys():
item = processed_hostnames[hn]
item = processed_hostnames[hostname]
if not item['resolvable']:
continue
@@ -421,18 +514,19 @@ def check_site(entry):
}
try:
r = requests.head(record['url'], headers=headers, allow_redirects=True)
if r.url == url:
logging.info("URL: %s - status %s" % (record['url'], r.status_code))
req = requests.head(record['url'], headers=headers, allow_redirects=True)
if req.url == url:
logging.info("URL: %s - status %s", record['url'], req.status_code)
else:
logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
record['redirects_to'] = r.url
except Exception as e:
logging.info("URL: %s - status %s - redirects to %s", record['url'],
req.status_code, req.url)
record['redirects_to'] = req.url
except Exception as exc:
record['error'] = {
'type': str(type(e)),
'message': str(e),
'type': str(type(exc)),
'message': str(exc),
}
logging.info("URL %s: %s %s" % (url, str(type(e)), e))
logging.info("URL %s: %s %s", url, str(type(exc)), exc)
checked_urls.append(record)
@@ -442,7 +536,7 @@ def check_site(entry):
# Deeper test for the remaining (canonical) URL(s)
for check_url in result['details']['canonical_urls']:
logging.info("Downloading URL %s" % check_url)
logging.info("Downloading URL %s", check_url)
check = {
'url': check_url,
@@ -454,37 +548,38 @@ def check_site(entry):
}
try:
r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
check['status_code'] = r.status_code
check['duration'] = round(r.elapsed.microseconds / 1000)
req = requests.get(check_url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
check['status_code'] = req.status_code
check['duration'] = round(req.elapsed.microseconds / 1000)
# Content checks
if r.status_code < 300:
check['content'] = check_content(r)
if req.status_code < 300:
check['content'] = check_content(req)
# Responsiveness check
try:
check['responsive'] = check_responsiveness(check_url)
except Exception as e:
logging.error("Error when checking responsiveness for '%s': %s" % (check_url, e))
except Exception as exc:
logging.error("Error when checking responsiveness for '%s': %s", check_url, exc)
except requests.exceptions.ConnectionError as e:
logging.error(str(e) + " " + check_url)
except requests.exceptions.ConnectionError as exc:
logging.error(str(exc) + " " + check_url)
check['error'] = "connection"
except requests.exceptions.ReadTimeout as e:
logging.error(str(e) + " " + check_url)
except requests.exceptions.ReadTimeout as exc:
logging.error(str(exc) + " " + check_url)
check['error'] = "read_timeout"
except requests.exceptions.Timeout as e:
logging.error(str(e) + " " + check_url)
except requests.exceptions.Timeout as exc:
logging.error(str(exc) + " " + check_url)
check['error'] = "connection_timeout"
except Exception as e:
logging.error(str(e) + " " + check_url)
except Exception as exc:
logging.error(str(exc) + " " + check_url)
check['error'] = "unknown"
result['details']['urlchecks'].append(check)
result['details']['urlchecks'] = sorted(result['details']['urlchecks'], key=lambda url: url['url'])
result['details']['urlchecks'] = sorted(result['details']['urlchecks'],
key=lambda url: url['url'])
# collect icons
icons = set()
@@ -492,24 +587,24 @@ def check_site(entry):
if 'content' not in c:
continue
if c['content'] is None:
logging.warning("No content for %s" % entry['url'])
logging.warning("No content for %s", entry['url'])
continue
if c['content']['icon'] is not None:
icons.add(c['content']['icon'])
downloaded_icons = set()
for icon_url in icons:
logging.info("Getting icon %s" % icon_url)
logging.info("Getting icon %s", icon_url)
try:
downloaded_icons.add(download_icon(icon_url))
except Exception as e:
logging.error("Could not download icon: %s" % e)
logging.error("Could not download icon: %s", e)
result['details']['icons'] = sorted(list(downloaded_icons))
# collect feeds
feeds = set()
for c in result['details']['urlchecks']:
if c['content'] is None:
logging.warning("No content for %s" % entry['url'])
logging.warning("No content for %s", entry['url'])
continue
if 'feeds' in c['content'] and len(c['content']['feeds']):
for feed in c['content']['feeds']:
@@ -543,7 +638,7 @@ def check_site(entry):
result['details']['cms'] = parse_generator(c['content']['generator'])
# Qualify certain CMS flavours in more detail
if result['details']['cms'] == "typo3":
if gcms_ip in result['details']['ipv4_addresses']:
if GCMS_IP in result['details']['ipv4_addresses']:
result['details']['cms'] = "typo3-gcms"
elif 'typo3-gruene.de' in c['content']['html']:
result['details']['cms'] = "typo3-gruene"
@@ -555,7 +650,8 @@ def check_site(entry):
# No generator Tag. Use HTML content.
if 'Urwahl3000' in c['content']['html']:
result['details']['cms'] = "wordpress-urwahl"
elif 'josephknowsbest' in c['content']['html'] or 'Joseph-knows-best' in c['content']['html']:
elif ('josephknowsbest' in c['content']['html'] or
'Joseph-knows-best' in c['content']['html']):
result['details']['cms'] = "wordpress-josephknowsbest"
elif 'wordpress' in c['content']['html']:
result['details']['cms'] = "wordpress"
@@ -567,7 +663,7 @@ def check_site(entry):
### Derive criteria
# DNS_RESOLVABLE_IPV4
if len(result['details']['ipv4_addresses']):
if result['details']['ipv4_addresses']:
result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1}
# SITE_REACHABLE
@@ -584,8 +680,8 @@ def check_site(entry):
# WWW_OPTIONAL
num_hostnames = 0
for hn in result['details']['hostnames'].keys():
item = result['details']['hostnames'][hn]
for hostname in result['details']['hostnames'].keys():
item = result['details']['hostnames'][hostname]
if not item['resolvable']:
continue
num_hostnames += 1
@@ -600,20 +696,20 @@ def check_site(entry):
else:
links = set()
if result['details']['urlchecks'] is None:
logging.warning("No urlchecks for %s" % entry['url'])
logging.warning("No urlchecks for %s", entry['url'])
else:
for item in result['details']['urlchecks']:
if item['content']['canonical_link'] is not None:
if item['content'] is not None and item['content']['canonical_link'] is not None:
links.add(item['content']['canonical_link'])
if len(links) == 1:
result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
# FAVICON
if len(result['details']['icons']):
if result['details']['icons']:
result['result']['FAVICON'] = {'value': True, 'score': 1}
# FEEDS
if len(result['details']['feeds']):
if result['details']['feeds']:
result['result']['FEEDS'] = {'value': True, 'score': 1}
# HTTP_RESPONSE_DURATION
@@ -621,17 +717,18 @@ def check_site(entry):
for item in result['details']['urlchecks']:
if item['error'] is None:
durations.append(item['duration'])
val = round(statistics.mean(durations))
result['result']['HTTP_RESPONSE_DURATION']['value'] = val
if val < 100:
result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
elif val < 1000:
result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
if durations:
val = round(statistics.mean(durations))
result['result']['HTTP_RESPONSE_DURATION']['value'] = val
if val < 100:
result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
elif val < 1000:
result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
# RESPONSIVE
if result['details']['responsive'] is not None:
if (result['details']['responsive']['min_width'] < 500 and
len(result['details']['responsive']['viewport_meta_tag']) > 0):
len(result['details']['responsive']['viewport_meta_tag']) > 0):
result['result']['RESPONSIVE']['value'] = True
result['result']['RESPONSIVE']['score'] = 1
@@ -649,87 +746,91 @@ def check_site(entry):
return result
def main():
def get_job_from_queue():
"""
Bringing it all together
Returns a URL from the queue
"""
logging.basicConfig(level=logging.INFO)
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
out = None
# refresh our local clone of the green directory
get_green_directory()
with DATASTORE_CLIENT.transaction():
query = DATASTORE_CLIENT.query(kind=JOB_DATASTORE_KIND)
for entity in query.fetch(limit=1):
logging.debug("Got job: %s", entity)
out = dict(entity)
out["url"] = entity.key.name
DATASTORE_CLIENT.delete(entity.key)
# build the list of website URLs to run checks for
logging.info("Processing green-directory")
input_entries = []
return out
for entry in dir_entries():
def work_of_queue():
"""
Take job from queue and finish it until there are no more jobs
"""
while True:
job = get_job_from_queue()
if job is None:
logging.info("No more jobs. Exiting.")
break
if 'type' not in entry:
logging.error("Entry without type")
continue
if 'urls' not in entry:
logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
continue
logging.info("Starting job %s", job["url"])
result = check_site(entry=job)
#logging.debug(result)
logging.info("Job %s finished checks", job["url"])
logging.info("Job %s writing to DB", job["url"])
website_url = None
for n in range(len(entry['urls'])):
try:
if entry['urls'][n]['type'] == "WEBSITE":
website_url = entry['urls'][n]['url']
if website_url:
input_entries.append({
"url": website_url,
"level": entry.get("level"),
"state": entry.get("state"),
"district": entry.get("district"),
"city": entry.get("city"),
})
except NameError:
logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))
# randomize order, to distribute requests over servers
logging.info("Shuffling input URLs")
random.seed()
random.shuffle(input_entries)
# run checks
logging.info("Starting checks")
results = {}
pool = Pool(concurrency)
for ientry in input_entries:
logging.info("Submitting %s to job pool" % ientry['url'])
results[ientry['url']] = pool.apply_async(check_site, kwds={'entry': ientry})
pool.close()
pool.join()
logging.info("Checks are finished")
# Restructure result from dict of ApplyResult
# to list of dicts and sort in stable way
json_result = []
done = set()
logging.info("Restructuring results")
# convert results from ApplyResult to dict
for url in sorted(results.keys()):
if url not in done:
logging.info("Getting result for %s" % url)
try:
resultsitem = results[url].get()
json_result.append(resultsitem)
except Exception as e:
logging.error("Error getting result for '%s': %s" % (url, e))
done.add(url)
# Write result as JSON
output_filename = os.path.join(result_path, "spider_result.json")
with open(output_filename, 'w', encoding="utf8") as jsonfile:
json.dump(json_result, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
key = DATASTORE_CLIENT.key(RESULTS_DATASTORE_KIND, job["url"])
entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
record = {
"created": datetime.utcnow(),
"results": result,
}
entity.update(record)
try:
DATASTORE_CLIENT.put(entity)
except InvalidArgument as ex:
logging.error("Could not write result: %s", ex)
except Exception as ex:
logging.error("Could not write result: %s", ex)
if __name__ == "__main__":
main()
"""
Bringing it all together
"""
parser = argparse.ArgumentParser()
parser.add_argument('--credentials-path', dest='credentials_path',
help='Path to the service account credentials JSON file',
default='/secrets/service-account.json')
parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
default='info')
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
subparsers.add_parser('spider', help='Take jobs off the queue and spider')
jobs_parser = subparsers.add_parser('jobs', help='Create jobs for the queue')
jobs_parser.add_argument('--url', help='Add a job to spider a URL')
args = parser.parse_args()
loglevel = args.loglevel.lower()
if loglevel == 'error':
logging.basicConfig(level=logging.ERROR)
elif loglevel == 'warn':
logging.basicConfig(level=logging.WARN)
elif loglevel == 'debug':
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
loglevel = 'info'
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
DATASTORE_CLIENT = datastore.Client.from_service_account_json(args.credentials_path)
logging.debug("Called command %s", args.command)
if args.command == 'jobs':
create_jobs()
else:
work_of_queue()
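# Illustrative usage sketch (added for clarity, not part of this commit), based
# on the argparse setup above:
#
#   # create one job per green-directory website URL
#   python spider.py --credentials-path /secrets/service-account.json jobs
#
#   # take jobs off the queue and run the checks until the queue is empty
#   python spider.py --loglevel debug spider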