Move test criteria logic into spider, add scoring

Marian Steinbach 2018-04-19 00:51:59 +02:00
parent d4ac1fa9f5
commit ea05bf5192
7 changed files with 286 additions and 100 deletions

KRITERIEN.md (new file, 19 lines added)

@@ -0,0 +1,19 @@
# Quality criteria

We check sites against the following criteria (a sketch of how each criterion is reported follows the list):

- `DNS_RESOLVABLE_IPV4`: The URL's hostname can be resolved to an IPv4 address.
- `SITE_REACHABLE`: The site is reachable via HTTP(S) (status code 200).
- `HTTPS`: The site is reachable via HTTPS. The server certificate is valid and issued by a trusted CA.
- `WWW_OPTIONAL`: The `www.` prefix at the start of the home page URL is optional; the site is reachable both with and without it in the hostname.
- `CANONICAL_URL`: If the site can be accessed via several URLs, they redirect to one canonical URL and/or point to it via a `rel=canonical` link.
- `FAVICON`: The site has a favorite icon (favicon).
- `FEEDS`: The site points to RSS or Atom feeds via `rel=alternate` link tags.
- `HTTP_RESPONSE_DURATION`: The time elapsed between sending the HTTP request and receiving the response headers.
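
As a minimal sketch (not part of this commit's files, mirroring the result structure introduced in spider.py below; all values hypothetical), each criterion ends up in a per-site report like this:

```python
# One site's report entry, mirroring the structure added in spider.py below.
result = {
    'DNS_RESOLVABLE_IPV4':    {'type': 'boolean', 'value': True, 'score': 1},
    'SITE_REACHABLE':         {'type': 'boolean', 'value': True, 'score': 1},
    'HTTP_RESPONSE_DURATION': {'type': 'number',  'value': 500,  'score': 1.0},
}
score = sum(item['score'] for item in result.values())  # 3.0
```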

File diff suppressed because one or more lines are too long


@@ -43,8 +43,14 @@
     <table class="table">
       <thead>
         <tr>
+          <th scope="col">Typ</th>
+          <th scope="col">Land</th>
+          <th scope="col">Kreis</th>
+          <th scope="col">Stadt</th>
           <th scope="col">URL</th>
+          <th scope="col">Score</th>
           <th scope="col">IP-Adresse</th>
           <th scope="col">Erreichbar</th>
           <th scope="col">Icon</th>
           <th scope="col"><abbr title="Site ist sowohl mit www. als auch ohne www. in URL erreichbar">www. optional</abbr></th>
           <th scope="col"><abbr title="URL-Varianten leiten auf eine einzige Startseiten-URL weiter">Kanonische URL</abbr></th>

spider.py (230 lines changed)

@@ -14,9 +14,11 @@ import random
 import re
 import requests
 import shutil
+import statistics
 import sys
 import yaml

 # configuration

 # number of parallel processes to use for crawling
@@ -31,7 +33,7 @@ read_timeout = 10
 # Git repo for our data
 green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
 # folder in that repo that holds the data
-green_direcory_data_path = 'data'
+green_direcory_data_path = 'data/countries/de'
 green_directory_local_path = './cache/green-directory'

 result_path = './webapp/dist/data'
@@ -153,11 +155,11 @@ def check_content(r):
     # icon
     result['icon'] = None
-    link = soup.find('link', rel='icon')
+    link = soup.find('link', rel=lambda x: x and x.lower()=='icon')
     if link:
         result['icon'] = urljoin(r.url, link.get('href'))
     else:
-        link = soup.find('link', rel='shortcut icon')
+        link = soup.find('link', rel=lambda x: x and x.lower()=='shortcut icon')
         if link:
             result['icon'] = urljoin(r.url, link.get('href'))
@@ -192,7 +194,20 @@ def check_content(r):
     return result

-def check_site(url):
+def collect_ipv4_addresses(hostname_dict):
+    """
+    Return list of unique IPv4 addresses
+    """
+    ips = set()
+    for item in hostname_dict.values():
+        if 'ip_addresses' not in item:
+            continue
+        for ip in item['ip_addresses']:
+            ips.add(ip)
+    return sorted(list(ips))
+
+
+def check_site(entry):
     """
     Performs our site check and returns results as a dict.
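
A quick usage sketch for the new `collect_ipv4_addresses` helper (hostnames and addresses are hypothetical):

```python
hostnames = {
    'gruene-example.de': {'resolvable': True, 'ip_addresses': ['192.0.2.10']},
    'www.gruene-example.de': {'resolvable': False},  # no 'ip_addresses' key, skipped
}
print(collect_ipv4_addresses(hostnames))  # ['192.0.2.10']
```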
@@ -205,49 +220,88 @@ def check_site(url):
         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
     }

     # all the info we'll return for the site
     result = {
-        'input_url': url,
-        'hostnames': [],
-        'resolvable_urls': [],
-        'canonical_urls': [],
-        'urlchecks': [],
+        # input_url: The URL we derived all checks from
+        'input_url': entry['url'],
+        # Meta: Regional and type metadata for the site
+        'meta': {
+            'level': entry['level'],
+            'state': entry['state'],
+            'district': entry['district'],
+            'city': entry['city'],
+        },
+        # Details: All details we collected about the site (which aren't directly related to the report criteria)
+        'details': {
+            'hostnames': {},
+            'ipv4_addresses': [],
+            'resolvable_urls': [],
+            'canonical_urls': [],
+            'urlchecks': [],
+            'icons': [],
+            'feeds': [],
+        },
+        # The actual report criteria
+        'result': {
+            'DNS_RESOLVABLE_IPV4': {'type': 'boolean', 'value': False, 'score': 0},
+            'SITE_REACHABLE': {'type': 'boolean', 'value': False, 'score': 0},
+            'HTTPS': {'type': 'boolean', 'value': False, 'score': 0},
+            'WWW_OPTIONAL': {'type': 'boolean', 'value': False, 'score': 0},
+            'CANONICAL_URL': {'type': 'boolean', 'value': False, 'score': 0},
+            'FAVICON': {'type': 'boolean', 'value': False, 'score': 0},
+            'FEEDS': {'type': 'boolean', 'value': False, 'score': 0},
+            'HTTP_RESPONSE_DURATION': {'type': 'number', 'value': None, 'score': 0},
+        },
+        'score': 0.0,
     }

-    # derive hostnames to test
-    parsed = urlparse(url)
+    # derive hostnames to test (with/without www.)
+    parsed = urlparse(entry['url'])
     hostnames = derive_test_hostnames(parsed.hostname)

-    processed_hostnames = []
+    # try to resolve hostnames
+    processed_hostnames = {}
     for hn in hostnames:
-        record = {
-            'input_hostname': hn,
+        processed_hostnames[hn] = {
             'resolvable': False,
         }
         try:
             hostname, aliases, ip_addresses = gethostbyname_ex(hn)
-            record['resolvable'] = True
-            record['resolved_hostname'] = hostname
-            record['aliases'] = aliases
-            record['ip_addresses'] = ip_addresses
+            processed_hostnames[hn]['resolvable'] = True
+            processed_hostnames[hn]['resolved_hostname'] = hostname
+            processed_hostnames[hn]['aliases'] = aliases
+            processed_hostnames[hn]['ip_addresses'] = ip_addresses
         except:
             pass

-        processed_hostnames.append(record)
+    result['details']['hostnames'] = processed_hostnames
-    result['hostnames'] = sorted(processed_hostnames, key=lambda hn: hn['input_hostname'])
+    result['details']['ipv4_addresses'] = collect_ipv4_addresses(processed_hostnames)

     # check basic HTTP(S) reachability
     checked_urls = []
-    for item in processed_hostnames:
+    checked_urls_set = set()
+
+    for hn in processed_hostnames.keys():
+        item = processed_hostnames[hn]
+
         if not item['resolvable']:
             continue

         for scheme in ('http', 'https'):
+            url = scheme + '://' + item['resolved_hostname'] + '/'
+
+            if url in checked_urls_set:
+                continue
+
+            checked_urls_set.add(url)
+
             record = {
-                'url': scheme + '://' + item['resolved_hostname'] + '/',
+                'url': url,
                 'error': None,
                 'redirects_to': None,
             }
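
`derive_test_hostnames` is not part of this diff. A minimal sketch of what it plausibly does, assuming it only toggles the `www.` prefix (which is all the `WWW_OPTIONAL` criterion needs):

```python
def derive_test_hostnames(hostname):
    # Toggle the "www." prefix so both variants get tested.
    hostnames = {hostname}
    if hostname.startswith('www.'):
        hostnames.add(hostname[len('www.'):])
    else:
        hostnames.add('www.' + hostname)
    return sorted(hostnames)
```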
@@ -255,7 +309,7 @@ def check_site(url):
             try:
                 r = requests.head(record['url'], headers=headers, allow_redirects=True)
                 if r.url == url:
-                    logging.info("URL: %s - status %s - no redirect" % (record['url'], r.status_code))
+                    logging.info("URL: %s - status %s" % (record['url'], r.status_code))
                 else:
                     logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
                     record['redirects_to'] = r.url
@@ -268,13 +322,13 @@ def check_site(url):
             checked_urls.append(record)

-    result['resolvable_urls'] = sorted(checked_urls, key=lambda url: url['url'])
-    result['canonical_urls'] = sorted(reduce_urls(checked_urls))
+    result['details']['resolvable_urls'] = sorted(checked_urls, key=lambda url: url['url'])
+    result['details']['canonical_urls'] = sorted(reduce_urls(checked_urls))

     # Deeper test for the remaining (canonical) URL(s)
-    for check_url in result['canonical_urls']:
+    for check_url in result['details']['canonical_urls']:

-        logging.info("Checking URL %s" % check_url)
+        logging.info("Downloading URL %s" % check_url)

         check = {
             'url': check_url,
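
`reduce_urls` (used above) is also outside this diff. A plausible reduction, assuming it collapses each reachable URL to its final redirect target and deduplicates:

```python
def reduce_urls(url_records):
    # Keep the redirect target if there is one, otherwise the URL itself;
    # skip URLs that produced an error.
    targets = set()
    for record in url_records:
        if record['error'] is not None:
            continue
        targets.add(record['redirects_to'] or record['url'])
    return list(targets)
```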
@@ -306,27 +360,111 @@ def check_site(url):
             logging.error(str(e) + " " + check_url)
             check['error'] = "unknown"

-        result['urlchecks'].append(check)
+        result['details']['urlchecks'].append(check)

-    result['urlchecks'] = sorted(result['urlchecks'], key=lambda url: url['url'])
+    result['details']['urlchecks'] = sorted(result['details']['urlchecks'], key=lambda url: url['url'])

+    # collect icons
+    for c in result['details']['urlchecks']:
+        if 'icon' in c['content'] and c['content']['icon']:
+            if c['content']['icon'] in result['details']['icons']:
+                continue
+            result['details']['icons'].append(c['content']['icon'])
+
+    # collect feeds
+    for c in result['details']['urlchecks']:
+        if 'feeds' in c['content'] and len(c['content']['feeds']):
+            for feed in c['content']['feeds']:
+                if feed in result['details']['feeds']:
+                    continue
+                result['details']['feeds'].append(feed)
+    ### Derive criteria
+
+    # DNS_RESOLVABLE_IPV4
+    if len(result['details']['ipv4_addresses']):
+        result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1}
+
+    # SITE_REACHABLE
+    for item in result['details']['resolvable_urls']:
+        if item['error'] is None:
+            result['result']['SITE_REACHABLE'] = {'value': True, 'score': 1}
+            break
+
+    # HTTPS
+    for item in result['details']['urlchecks']:
+        if item['error'] is None and item['url'].startswith('https://'):
+            result['result']['HTTPS'] = {'value': True, 'score': 1}
+            break
+
+    # WWW_OPTIONAL
+    num_hostnames = 0
+    for hn in result['details']['hostnames'].keys():
+        item = result['details']['hostnames'][hn]
+        if not item['resolvable']:
+            continue
+        num_hostnames += 1
+    if num_hostnames > 1:
+        result['result']['WWW_OPTIONAL'] = {'value': True, 'score': 1}
+
+    # CANONICAL_URL
+    # - either there is only one canonical URL (through redirects)
+    # - or several pages have identical rel=canonical links
+    if len(result['details']['canonical_urls']) == 1:
+        result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
+    else:
+        links = set()
+        for item in result['details']['urlchecks']:
+            if item['content']['canonical_link'] is not None:
+                links.add(item['content']['canonical_link'])
+        if len(links) == 1:
+            result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
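
A worked example for the two branches above, with hypothetical URLs:

```python
# Branch 1: every reachable variant redirects to one start page.
canonical_urls = ['https://www.gruene-example.de/']
# len(canonical_urls) == 1 -> CANONICAL_URL passes

# Branch 2: two variants stay distinct, but both pages declare the same
# <link rel="canonical"> target, so the criterion still passes.
links = {'https://www.gruene-example.de/'}
# len(links) == 1 -> CANONICAL_URL passes
```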
+    # FAVICON
+    if len(result['details']['icons']):
+        result['result']['FAVICON'] = {'value': True, 'score': 1}
+
+    # FEEDS
+    if len(result['details']['feeds']):
+        result['result']['FEEDS'] = {'value': True, 'score': 1}
+
+    # HTTP_RESPONSE_DURATION
+    durations = []
+    for item in result['details']['urlchecks']:
+        if item['error'] is None:
+            durations.append(item['duration'])
+    result['result']['HTTP_RESPONSE_DURATION'] = {
+        'value': round(statistics.mean(durations)),
+        'score': 1.0/statistics.mean(durations) * 500
+    }
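
The duration score is inversely proportional to the mean response time, so a 500 ms mean is worth exactly one point (this sketch assumes `duration` is measured in milliseconds):

```python
import statistics

durations = [400, 600]             # hypothetical samples, in ms
mean = statistics.mean(durations)  # 500.0
value = round(mean)                # 500
score = 1.0 / mean * 500           # 1.0; a 250 ms mean would score 2.0
```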
+    # Overall score
+    for item in result['result'].keys():
+        result['score'] += result['result'][item]['score']

     return result
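
The total is the plain sum of the per-criterion scores: seven boolean criteria worth one point each, plus the unbounded duration score. A hypothetical example:

```python
scores = [1, 1, 1, 1, 1, 1, 1, 1.25]  # hypothetical per-criterion scores
total = sum(scores)                    # 9.25, shown in the webapp's Score column
```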

 def main():
     """
     Bringing it all together
     """
     logging.basicConfig(level=logging.INFO)
     logging.getLogger("urllib3").setLevel(logging.CRITICAL)

     # refresh our local clone of the green directory
     get_green_directory()

-    urls = []
+    # build the list of website URLs to run checks for
+    input_entries = []

     for entry in dir_entries():

         if 'type' not in entry:
             logging.error("Entry without type")
             continue

         if 'urls' not in entry:
             logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
             continue
@@ -339,23 +477,29 @@ def main():
             except NameError as ne:
                 logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))

         if website_url:
-            urls.append(website_url)
+            input_entries.append({
+                "url": website_url,
+                "level": entry.get("level"),
+                "state": entry.get("state"),
+                "district": entry.get("district"),
+                "city": entry.get("city"),
+            })

     # randomize order, to distribute requests over servers
     random.seed()
-    random.shuffle(urls)
+    random.shuffle(input_entries)

     # run checks
     results = {}

-    if concurrency > 1:
-        pool = Pool(concurrency)
-        for url in urls:
-            results[url] = pool.apply_async(check_site, kwds={"url": url})
-        pool.close()
-        pool.join()
-    else:
-        for url in urls:
-            results[url] = check_site(url)
+    pool = Pool(concurrency)
+    for entry in input_entries:
+        results[entry['url']] = pool.apply_async(check_site, kwds={'entry': entry})
+    pool.close()
+    pool.join()

+    # Restructure result from dict of ApplyResult
+    # to list of dicts and sort in stable way
+    results2 = []
+    done = set()
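
The hunk is cut off after `done = set()`; presumably each `ApplyResult` still has to be resolved via `get()`. A minimal sketch of that continuation, under that assumption:

```python
# Hypothetical continuation: resolve every AsyncResult exactly once,
# in a stable (sorted) order.
for url in sorted(results.keys()):
    if url in done:
        continue
    done.add(url)
    results2.append(results[url].get())
```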

webapp/dist/bundle.js (vendored, 26 lines changed; diff suppressed because one or more lines are too long)


@@ -43,8 +43,14 @@
     <table class="table">
       <thead>
         <tr>
+          <th scope="col">Typ</th>
+          <th scope="col">Land</th>
+          <th scope="col">Kreis</th>
+          <th scope="col">Stadt</th>
           <th scope="col">URL</th>
+          <th scope="col">Score</th>
           <th scope="col">IP-Adresse</th>
           <th scope="col">Erreichbar</th>
           <th scope="col">Icon</th>
           <th scope="col"><abbr title="Site ist sowohl mit www. als auch ohne www. in URL erreichbar">www. optional</abbr></th>
           <th scope="col"><abbr title="URL-Varianten leiten auf eine einzige Startseiten-URL weiter">Kanonische URL</abbr></th>


@@ -18,54 +18,65 @@ $(function(){

         var row = $(document.createElement('tr'));

+        // typ
+        var level = null;
+        if (item.meta.level === 'DE:ORTSVERBAND') {
+            level = 'OV';
+        } else if (item.meta.level === 'DE:KREISVERBAND') {
+            level = 'KV';
+        } else if (item.meta.level === 'DE:LANDESVERBAND') {
+            level = 'LV';
+        }
+        row.append('<td>' + (level === null ? '' : level) + '</td>');
+
+        // land
+        row.append('<td>' + (item.meta.state === null ? '' : item.meta.state) + '</td>');
+
+        // kreis
+        row.append('<td>' + (item.meta.district === null ? '' : item.meta.district) + '</td>');
+
+        // stadt
+        row.append('<td>' + (item.meta.city === null ? '' : item.meta.city) + '</td>');
+
         // input URL
         row.append('<td><a href="' + item.input_url + '">' + punycode.toUnicode(item.input_url) + '</a></td>');

+        // score
+        row.append('<td>' + item.score.toFixed(1) + '</td>');
+
         // IPs
-        var ips = _.join(_.uniq(_.flatten(_.map(item.hostnames, 'ip_addresses'))), ', ');
-        row.append('<td class="'+ (ips === '' ? 'bad' : 'good') +' text-center">' + (ips === '' ? '❌ Keine' : ips) + '</td>');
+        var ips = _.join(item.details.ipv4_addresses, ', ');
+        row.append('<td class="'+ (ips === '' ? 'bad' : 'good') +' text-center">' + (ips === '' ? '❌' : ips) + '</td>');

-        // icon
-        var icons = [];
-        var icon = false;
-        icons = _.uniq(_.map(item.urlchecks, 'content.icon'));
-        if (icons.length > 0 && icons[0]) {
-            icon = icons[0];
-        }
-        row.append('<td class="' + (icon ? 'good' : 'bad') + ' text-center">' + (icon ? ('<img src="' + icon + '" class="icon"/>') : '❌') + '</td>');
+        // SITE_REACHABLE
+        row.append('<td class="'+ (item.result.SITE_REACHABLE.value ? 'good' : 'bad') +' text-center">' + (item.result.SITE_REACHABLE.value ? '✅' : '❌') + '</td>');

-        // hostnames
-        var twoHostnames = false;
-        if (_.filter(item.hostnames, {'resolvable': true}).length === 2) {
-            twoHostnames = true;
-        };
-        row.append('<td class="'+ (twoHostnames ? 'good' : 'bad') +' text-center">' + (twoHostnames ? '✅' : '❌') + '</td>');
+        // FAVICON
+        var icon = item.result.FAVICON.value;
+        row.append('<td class="' + (icon ? 'good' : 'bad') + ' text-center">' + (icon ? ('<img src="' + item.details.icons[0] + '" class="icon">') : '❌') + '</td>');
+
+        // WWW_OPTIONAL
+        var wwwOptional = item.result.WWW_OPTIONAL.value;
+        row.append('<td class="'+ (wwwOptional ? 'good' : 'bad') +' text-center">' + (wwwOptional ? '✅' : '❌') + '</td>');

         // one canonical URL
-        var canonical = false;
-        if (item.canonical_urls.length === 1) canonical = true;
-        var canonical_links = _.uniq(_.map(item.urlchecks, 'content.canonical_link'));
-        if (canonical_links.length === 1) canonical = true;
+        var canonical = item.result.CANONICAL_URL.value;
         row.append('<td class="'+ (canonical ? 'good' : 'bad') +' text-center">' + (canonical ? '✅' : '❌') + '</td>');

         // https
-        var hasHTTPS = false;
-        hasHTTPS = _.find(item.canonical_urls, function(o){
-            return o.indexOf('https://') !== -1;
-        });
+        var hasHTTPS = item.result.HTTPS.value;
         row.append('<td class="'+ (hasHTTPS ? 'good' : 'bad') +' text-center">' + (hasHTTPS ? '✅' : '❌') + '</td>');

         // feeds
-        var feeds = false;
-        feeds = _.uniq(_.flatten(_.map(item.urlchecks, 'content.feeds')));
-        row.append('<td class="'+ (feeds.length ? 'good' : 'bad') +' text-center">' + (feeds.length ? '✅' : '❌') + '</td>');
+        var feeds = item.result.FEEDS.value;
+        row.append('<td class="'+ (feeds ? 'good' : 'bad') +' text-center">' + (feeds ? '✅' : '❌') + '</td>');

         // screenshots
         var screenshot = false;
-        if (item.canonical_urls.length > 0) {
-            if (typeof screenshots[item.canonical_urls[0]] !== 'undefined') {
-                var surl = 'http://green-spider-screenshots.sendung.de/320x640/'+screenshots[item.canonical_urls[0]];
-                var lurl = 'http://green-spider-screenshots.sendung.de/1500x1500/'+screenshots[item.canonical_urls[0]];
+        if (item.details.canonical_urls.length > 0) {
+            if (typeof screenshots[item.details.canonical_urls[0]] !== 'undefined') {
+                var surl = 'http://green-spider-screenshots.sendung.de/320x640/'+screenshots[item.details.canonical_urls[0]];
+                var lurl = 'http://green-spider-screenshots.sendung.de/1500x1500/'+screenshots[item.details.canonical_urls[0]];
                 screenshot = '<a class="screenshot" href="'+ surl +'" target="_blank" title="Mobile">M</a>';
                 screenshot += '<a class="screenshot" href="'+ lurl +'" target="_blank" title="Desktop">D</a>';
             }