green-spider/spider.py

"""
Provides the spider functionality (website checks).
"""
import argparse
import hashlib
import json
import logging
import os
import random
import re
import shutil
import statistics
from datetime import datetime
from socket import gethostbyname_ex
from urllib.parse import urljoin
from urllib.parse import urlparse
import requests
import yaml
import tenacity
from bs4 import BeautifulSoup
from git import Repo
from selenium import webdriver
from google.cloud import datastore
from google.api_core.exceptions import Aborted
from google.api_core.exceptions import InvalidArgument
# configuration
# connection timeout for website checks (seconds)
CONNECT_TIMEOUT = 5
# response timeout for website checks
READ_TIMEOUT = 10
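# Both values are passed to requests as a (connect, read) timeout tuple.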
# Git repo for our data
GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git'
# folder in that repo that holds the data
GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
RESULT_PATH = '/out'
# IP address of the newthinking GCMS server
GCMS_IP = "91.102.13.20"
JOB_DATASTORE_KIND = 'spider-jobs'
RESULTS_DATASTORE_KIND = 'spider-results'
# end configuration
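
# Global Datastore client; set in the __main__ block below from the service
# account credentials file passed via --credentials-path.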
DATASTORE_CLIENT = None
def chunks(the_list, size):
"""
Yield successive n-sized chunks from list the_list
where n = size.
"""
for i in range(0, len(the_list), size):
yield the_list[i:i + size]
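# For illustration: list(chunks([1, 2, 3, 4, 5], 2)) yields [[1, 2], [3, 4], [5]].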
def create_jobs(url=None):
"""
Read all URLs from green directory and fill a job database
with one job per URL.
Alternatively, if the url argument is given, only the given URL
will be added as a spider job.
"""
# refresh our local clone of the green directory
logging.info("Refreshing green-directory clone")
get_green_directory()
# build the list of website URLs to run checks for
logging.info("Processing green-directory")
input_entries = []
count = 0
for entry in dir_entries():
if 'type' not in entry:
logging.error("Entry without type")
continue
if 'urls' not in entry:
logging.debug("Entry %s does not have any URLs.", repr_entry(entry))
continue
website_url = None
for index in range(len(entry['urls'])):
try:
if entry['urls'][index]['type'] == "WEBSITE":
website_url = entry['urls'][index]['url']
if website_url:
if url is not None and website_url != url:
continue
input_entries.append({
"url": website_url,
"level": entry.get("level"),
"state": entry.get("state"),
"district": entry.get("district"),
"city": entry.get("city"),
})
count += 1
            except KeyError:
logging.error("Error in %s: 'url' key missing (%s)",
repr_entry(entry), entry['urls'][index])
# ensure the passed URL argument is really there, even if not part
# of the directory.
if url and count == 0:
logging.info("Adding job for URL %s which is not part of green-directory", url)
input_entries.append({
"url": url,
"level": None,
"state": None,
"district": None,
"city": None,
})
# randomize order, to distribute requests over servers
logging.debug("Shuffling input URLs")
random.seed()
random.shuffle(input_entries)
count = 0
logging.info("Writing jobs")
entities = []
for entry in input_entries:
key = DATASTORE_CLIENT.key(JOB_DATASTORE_KIND, entry["url"])
entity = datastore.Entity(key=key)
entity.update({
"created": datetime.utcnow(),
"level": entry["level"],
"state": entry["state"],
"district": entry["district"],
"city": entry["city"],
})
entities.append(entity)
    # commit to DB in chunks, as Datastore limits the number of entities per write
for chunk in chunks(entities, 300):
logging.debug("Writing jobs chunk of length %d", len(chunk))
DATASTORE_CLIENT.put_multi(chunk)
count += len(chunk)
logging.info("Writing jobs done, %s jobs added", count)
def get_green_directory():
"""
Clones the source of website URLs, the green directory,
into the local file system using git
"""
if os.path.exists(GREEN_DIRECTORY_LOCAL_PATH):
shutil.rmtree(GREEN_DIRECTORY_LOCAL_PATH)
Repo.clone_from(GREEN_DIRECTORY_REPO, GREEN_DIRECTORY_LOCAL_PATH)
def dir_entries():
"""
Iterator over all data files in the cloned green directory
"""
path = os.path.join(GREEN_DIRECTORY_LOCAL_PATH, GREEN_DIRECTORY_DATA_PATH)
for root, _, files in os.walk(path):
for fname in files:
filepath = os.path.join(root, fname)
if not filepath.endswith(".yaml"):
continue
with open(filepath, 'r', encoding='utf8') as yamlfile:
                for doc in yaml.load_all(yamlfile, Loader=yaml.SafeLoader):
yield doc
def repr_entry(entry):
"""
Return string representation of a directory entry,
for logging/debugging purposes
"""
ret = entry['type']
if 'level' in entry:
ret += "/" + entry['level']
if 'state' in entry:
ret += "/" + entry['state']
if 'district' in entry:
ret += "/" + entry['district']
return ret
def derive_test_hostnames(hostname):
"""
Derives the hostnames variants to test for a given host name.
From 'gruene-x.de' or 'www.gruene-x.de' it makes
['gruene-x.de', 'www.gruene-x.de']
which are both plausible web URLs to be used for a domain.
"""
hostnames = set()
hostnames.add(hostname)
if hostname.startswith('www.'):
hostnames.add(hostname[4:])
else:
hostnames.add('www.' + hostname)
return sorted(list(hostnames))
def reduce_urls(urllist):
"""
Reduce a list of urls with metadata by eliminating those
that either don't work or lead somewhere else
"""
targets = set()
for url in urllist:
if url['error'] is not None:
continue
if url['redirects_to'] is not None:
targets.add(url['redirects_to'])
else:
targets.add(url['url'])
return sorted(list(targets))
def normalize_title(title):
"""
    Normalizes an HTML page title by replacing non-breaking spaces,
    collapsing double spaces and stripping surrounding whitespace
"""
title = title.replace(u'\u00a0', ' ')
title = title.replace(' ', ' ')
title = title.strip()
return title
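# For illustration: normalize_title('Start\xa0 ') returns 'Start'.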
def check_responsiveness(url):
"""
Checks
- whether a page adapts to different viewport sizes
- whether a viewport meta tag exists
and returns details
"""
details = {
'document_width': {},
'viewport_meta_tag': None,
}
# sizes we check for (width, height)
sizes = (
(320, 480), # old smartphone
(768, 1024), # older tablet or newer smartphone
(1024, 768), # older desktop or horiz. tablet
(1920, 1080), # Full HD horizontal
)
# Our selenium user agent using PhantomJS/Webkit as an engine
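    # Note: this assumes the phantomjs binary is available on the PATH;
    # PhantomJS support has since been deprecated in Selenium.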
driver = webdriver.PhantomJS()
driver.set_window_size(sizes[0][0], sizes[0][1])
driver.get(url)
    for (width, height) in sizes:
        driver.set_window_size(width, height)
        key = "%sx%s" % (width, height)
        document_width = driver.execute_script("return document.body.scrollWidth")
        details['document_width'][key] = int(document_width)

    try:
        element = driver.find_element_by_xpath("//meta[@name='viewport']")
        details['viewport_meta_tag'] = element.get_attribute('content')
    except Exception:
        # no viewport meta tag found
        pass
return details
def check_content(req):
"""
Adds details to check regarding content of the page
check: the dict containing details for this URL
r: requests request/response object
"""
result = {}
    result['encoding'] = req.encoding.lower() if req.encoding else None
soup = BeautifulSoup(req.text, 'html.parser')
result['html'] = req.text
# page title
result['title'] = None
title = None
head = soup.find('head')
if head is not None:
title = head.find('title')
if title is not None:
result['title'] = normalize_title(title.get_text())
# canonical link
result['canonical_link'] = None
link = soup.find('link', rel='canonical')
if link:
result['canonical_link'] = urljoin(req.url, link.get('href'))
# icon
result['icon'] = None
link = soup.find('link', rel=lambda x: x and x.lower() == 'icon')
if link:
result['icon'] = urljoin(req.url, link.get('href'))
else:
link = soup.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
if link:
result['icon'] = urljoin(req.url, link.get('href'))
# feed links
result['feeds'] = []
rss_links = soup.find_all('link', type='application/rss+xml')
atom_links = soup.find_all('link', type='application/atom+xml')
    if rss_links:
        for link in rss_links:
            result['feeds'].append(urljoin(req.url, link.get('href')))
    if atom_links:
        for link in atom_links:
            result['feeds'].append(urljoin(req.url, link.get('href')))
# generator meta tag
result['generator'] = None
if head is not None:
generator = head.select('[name=generator]')
if generator:
result['generator'] = generator[0].get('content')
# opengraph meta tags
result['opengraph'] = None
opengraph = set()
if head is not None:
for item in head.find_all(property=re.compile('^og:')):
opengraph.add(item.get('property'))
for item in head.find_all(itemprop=re.compile('^og:')):
opengraph.add(item.get('itemprop'))
if opengraph:
result['opengraph'] = sorted(list(opengraph))
return result
def collect_ipv4_addresses(hostname_dict):
"""
Return list of unique IPv4 addresses
"""
ips = set()
for item in hostname_dict.values():
if 'ip_addresses' not in item:
continue
for ip_addr in item['ip_addresses']:
ips.add(ip_addr)
return sorted(list(ips))
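# For illustration:
# collect_ipv4_addresses({'www.example.de': {'ip_addresses': ['1.2.3.4']}}) -> ['1.2.3.4']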
def parse_generator(generator):
"""
Return well known CMS names from generator
"""
generator = generator.lower()
if 'typo3' in generator:
return "typo3"
if 'wordpress' in generator:
return "wordpress"
if 'drupal' in generator:
return "drupal"
if 'joomla' in generator:
return "joomla"
return generator
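# For illustration: parse_generator('WordPress 4.9.8') returns 'wordpress';
# strings that match no known CMS are returned lowercased as-is.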
def check_site(entry):
"""
Performs our site check and returns results as a dict.
1. Normalize the input URL and derive the URLs to check for
2. HEAD the check urls
3. Determine the canonical URL
4. Run full check on canonical URL
"""
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) ' +
'AppleWebKit/537.36 (KHTML, like Gecko) ' +
'Chrome/65.0.3325.181 green-spider/0.1'
}
# all the info we'll return for the site
result = {
# input_url: The URL we derived all checks from
'input_url': entry['url'],
# Meta: Regional and type metadata for the site
'meta': {
'level': entry.get('level'),
'state': entry.get('state'),
'district': entry.get('district'),
'city': entry.get('city'),
},
# Details: All details we collected about the site (which aren't directly
# related to the report criteria)
'details': {
'hostnames': {},
'ipv4_addresses': [],
'resolvable_urls': [],
'canonical_urls': [],
'urlchecks': [],
'icons': [],
'feeds': [],
'cms': None,
'responsive': None,
},
# The actual report criteria
'result': {
'DNS_RESOLVABLE_IPV4': {'type': 'boolean', 'value': False, 'score': 0},
'SITE_REACHABLE': {'type': 'boolean', 'value': False, 'score': 0},
'HTTPS': {'type': 'boolean', 'value': False, 'score': 0},
'WWW_OPTIONAL': {'type': 'boolean', 'value': False, 'score': 0},
'CANONICAL_URL': {'type': 'boolean', 'value': False, 'score': 0},
'FAVICON': {'type': 'boolean', 'value': False, 'score': 0},
'FEEDS': {'type': 'boolean', 'value': False, 'score': 0},
'HTTP_RESPONSE_DURATION': {'type': 'number', 'value': None, 'score': 0},
'RESPONSIVE': {'type': 'boolean', 'value': False, 'score': 0},
},
'score': 0.0,
}
# derive hostnames to test (with/without www.)
parsed = urlparse(entry['url'])
hostnames = derive_test_hostnames(parsed.hostname)
# try to resolve hostnames
processed_hostnames = {}
for hostname in hostnames:
processed_hostnames[hostname] = {
'resolvable': False,
}
try:
            resolved_hostname, aliases, ip_addresses = gethostbyname_ex(hostname)
            processed_hostnames[hostname]['resolvable'] = True
            processed_hostnames[hostname]['resolved_hostname'] = resolved_hostname
            processed_hostnames[hostname]['aliases'] = aliases
            processed_hostnames[hostname]['ip_addresses'] = ip_addresses
        except Exception:
            # hostname could not be resolved
            pass
result['details']['hostnames'] = processed_hostnames
result['details']['ipv4_addresses'] = collect_ipv4_addresses(processed_hostnames)
# check basic HTTP(S) reachability
checked_urls = []
checked_urls_set = set()
for hostname in processed_hostnames.keys():
item = processed_hostnames[hostname]
if not item['resolvable']:
continue
for scheme in ('http', 'https'):
url = scheme + '://' + item['resolved_hostname'] + '/'
if url in checked_urls_set:
continue
checked_urls_set.add(url)
record = {
'url': url,
'error': None,
'redirects_to': None,
}
try:
                req = requests.head(record['url'], headers=headers, allow_redirects=True,
                                    timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
if req.url == url:
logging.info("URL: %s - status %s", record['url'], req.status_code)
else:
logging.info("URL: %s - status %s - redirects to %s", record['url'],
req.status_code, req.url)
record['redirects_to'] = req.url
except Exception as exc:
record['error'] = {
'type': str(type(exc)),
'message': str(exc),
}
logging.info("URL %s: %s %s", url, str(type(exc)), exc)
checked_urls.append(record)
result['details']['resolvable_urls'] = sorted(checked_urls, key=lambda url: url['url'])
result['details']['canonical_urls'] = sorted(reduce_urls(checked_urls))
# Deeper test for the remaining (canonical) URL(s)
for check_url in result['details']['canonical_urls']:
logging.info("Downloading URL %s", check_url)
check = {
'url': check_url,
'status_code': None,
'duration': None,
'error': None,
'content': None,
'responsive': None,
}
try:
req = requests.get(check_url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
check['status_code'] = req.status_code
            check['duration'] = round(req.elapsed.total_seconds() * 1000)
# Content checks
if req.status_code < 300:
check['content'] = check_content(req)
# Responsiveness check
try:
check['responsive'] = check_responsiveness(check_url)
except Exception as exc:
logging.error("Error when checking responsiveness for '%s': %s", check_url, exc)
except requests.exceptions.ConnectionError as exc:
logging.error(str(exc) + " " + check_url)
check['error'] = "connection"
except requests.exceptions.ReadTimeout as exc:
logging.error(str(exc) + " " + check_url)
check['error'] = "read_timeout"
except requests.exceptions.Timeout as exc:
logging.error(str(exc) + " " + check_url)
check['error'] = "connection_timeout"
except Exception as exc:
logging.error(str(exc) + " " + check_url)
check['error'] = "unknown"
result['details']['urlchecks'].append(check)
result['details']['urlchecks'] = sorted(result['details']['urlchecks'],
key=lambda url: url['url'])
# collect icons
icons = set()
for c in result['details']['urlchecks']:
if 'content' not in c:
continue
if c['content'] is None:
logging.warning("No content for %s", entry['url'])
continue
if c['content']['icon'] is not None:
icons.add(c['content']['icon'])
result['details']['icons'] = sorted(list(icons))
# collect feeds
feeds = set()
for c in result['details']['urlchecks']:
if c['content'] is None:
logging.warning("No content for %s", entry['url'])
continue
if 'feeds' in c['content'] and len(c['content']['feeds']):
for feed in c['content']['feeds']:
feeds.add(feed)
result['details']['feeds'] = sorted(list(feeds))
# detect responsive
viewports = set()
min_width = 2000
for c in result['details']['urlchecks']:
if c['responsive'] is None:
continue
if c['responsive']['viewport_meta_tag'] is not None:
viewports.add(c['responsive']['viewport_meta_tag'])
widths = c['responsive']['document_width'].values()
if min(widths) < min_width:
min_width = min(widths)
result['details']['responsive'] = {
'viewport_meta_tag': list(viewports),
'min_width': min_width,
}
# detect CMS
for c in result['details']['urlchecks']:
if c['content'] is None:
continue
if 'generator' not in c['content']:
continue
if c['content']['generator'] != "" and c['content']['generator'] is not None:
result['details']['cms'] = parse_generator(c['content']['generator'])
# Qualify certain CMS flavours in more detail
if result['details']['cms'] == "typo3":
if GCMS_IP in result['details']['ipv4_addresses']:
result['details']['cms'] = "typo3-gcms"
elif 'typo3-gruene.de' in c['content']['html']:
result['details']['cms'] = "typo3-gruene"
elif result['details']['cms'] == "wordpress":
if 'Urwahl3000' in c['content']['html']:
result['details']['cms'] = "wordpress-urwahl"
else:
# No generator Tag. Use HTML content.
if 'Urwahl3000' in c['content']['html']:
result['details']['cms'] = "wordpress-urwahl"
elif ('josephknowsbest' in c['content']['html'] or
'Joseph-knows-best' in c['content']['html']):
result['details']['cms'] = "wordpress-josephknowsbest"
elif 'wordpress' in c['content']['html']:
result['details']['cms'] = "wordpress"
# we can stop here
break
### Derive criteria
# DNS_RESOLVABLE_IPV4
if result['details']['ipv4_addresses']:
result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1}
# SITE_REACHABLE
for item in result['details']['resolvable_urls']:
if item['error'] is None:
result['result']['SITE_REACHABLE'] = {'value': True, 'score': 1}
break
# HTTPS
for item in result['details']['urlchecks']:
if item['error'] is None and item['url'].startswith('https://'):
result['result']['HTTPS'] = {'value': True, 'score': 2}
break
# WWW_OPTIONAL
num_hostnames = 0
for hostname in result['details']['hostnames'].keys():
item = result['details']['hostnames'][hostname]
if not item['resolvable']:
continue
num_hostnames += 1
if num_hostnames > 1:
result['result']['WWW_OPTIONAL'] = {'value': True, 'score': 1}
# CANONICAL_URL
# - either there is only one canonical URL (through redirects)
# - or several pages have identical rel=canonical links
if len(result['details']['canonical_urls']) == 1:
result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
else:
links = set()
if result['details']['urlchecks'] is None:
logging.warning("No urlchecks for %s", entry['url'])
else:
for item in result['details']['urlchecks']:
if item['content'] is not None and item['content']['canonical_link'] is not None:
links.add(item['content']['canonical_link'])
if len(links) == 1:
result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
# FAVICON
if result['details']['icons']:
result['result']['FAVICON'] = {'value': True, 'score': 1}
# FEEDS
if result['details']['feeds']:
result['result']['FEEDS'] = {'value': True, 'score': 1}
# HTTP_RESPONSE_DURATION
durations = []
for item in result['details']['urlchecks']:
if item['error'] is None:
durations.append(item['duration'])
if durations:
val = round(statistics.mean(durations))
result['result']['HTTP_RESPONSE_DURATION']['value'] = val
if val < 100:
result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
elif val < 1000:
result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
# RESPONSIVE
if result['details']['responsive'] is not None:
if (result['details']['responsive']['min_width'] < 500 and
len(result['details']['responsive']['viewport_meta_tag']) > 0):
result['result']['RESPONSIVE']['value'] = True
result['result']['RESPONSIVE']['score'] = 1
# Overall score
for item in result['result'].keys():
result['score'] += result['result'][item]['score']
# clean up - remove full HTML
for item in result['details']['urlchecks']:
        try:
            del item['content']['html']
        except (KeyError, TypeError):
            # no content was collected for this URL
            pass
return result
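
# Datastore transactions raise Aborted when they conflict with a concurrent
# transaction; tenacity retries the fetch below with exponential backoff.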
@tenacity.retry(wait=tenacity.wait_exponential(),
retry=tenacity.retry_if_exception_type(Aborted))
def get_job_from_queue():
"""
    Fetches a single job from the queue, removes it from the queue and
    returns it as a dict, or returns None if the queue is empty
"""
out = None
with DATASTORE_CLIENT.transaction():
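        # Fetching and deleting the entity within one transaction is meant to keep
        # concurrently running spider instances from picking up the same job.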
query = DATASTORE_CLIENT.query(kind=JOB_DATASTORE_KIND)
for entity in query.fetch(limit=1):
logging.debug("Got job: %s", entity)
out = dict(entity)
out["url"] = entity.key.name
DATASTORE_CLIENT.delete(entity.key)
return out
def work_of_queue():
"""
    Take jobs from the queue and process them until the queue is empty
"""
while True:
job = get_job_from_queue()
if job is None:
logging.info("No more jobs. Exiting.")
break
logging.info("Starting job %s", job["url"])
result = check_site(entry=job)
#logging.debug(result)
logging.info("Job %s finished checks", job["url"])
logging.info("Job %s writing to DB", job["url"])
key = DATASTORE_CLIENT.key(RESULTS_DATASTORE_KIND, job["url"])
entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
record = {
"created": datetime.utcnow(),
"results": result,
}
entity.update(record)
try:
DATASTORE_CLIENT.put(entity)
except InvalidArgument as ex:
logging.error("Could not write result: %s", ex)
        except Exception as ex:
            logging.error("Could not write result: %s", ex)
if __name__ == "__main__":
"""
Bringing it all together
"""
parser = argparse.ArgumentParser()
parser.add_argument('--credentials-path', dest='credentials_path',
help='Path to the service account credentials JSON file',
default='/secrets/service-account.json')
parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
default='info')
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
subparsers.add_parser('spider', help='Take jobs off the queue and spider')
jobs_parser = subparsers.add_parser('jobs', help='Create jobs for the queue')
jobs_parser.add_argument('--url', help='Add a job to spider a URL')
args = parser.parse_args()
loglevel = args.loglevel.lower()
if loglevel == 'error':
logging.basicConfig(level=logging.ERROR)
elif loglevel == 'warn':
logging.basicConfig(level=logging.WARN)
elif loglevel == 'debug':
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
loglevel = 'info'
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
DATASTORE_CLIENT = datastore.Client.from_service_account_json(args.credentials_path)
logging.debug("Called command %s", args.command)
if args.command == 'jobs':
create_jobs(args.url)
else:
work_of_queue()