Mirror of https://github.com/netzbegruenung/green-spider.git (synced 2024-04-26 14:24:51 +02:00)
Move test criteria logic into spider, add scoring
commit ea05bf5192 (parent d4ac1fa9f5)
KRITERIEN.md | 19 (new file)
@@ -0,0 +1,19 @@
+# Quality criteria
+
+We check sites against the following criteria:
+
+- `DNS_RESOLVABLE_IPV4`: The hostname of the URL resolves to an IPv4 address
+
+- `SITE_REACHABLE`: The site is reachable via HTTP(S) (status code 200)
+
+- `HTTPS`: The site is reachable via HTTPS. The server certificate is valid and issued by a trusted CA.
+
+- `WWW_OPTIONAL`: The `www.` prefix at the start of the start page URL is optional. The site is reachable both with and without this prefix in the hostname.
+
+- `CANONICAL_URL`: If the site can be accessed via several possible URLs, those redirect to one canonical URL or reference it via a `rel=canonical` link.
+
+- `FAVICON`: The site has a favicon.
+
+- `FEEDS`: The site references RSS or Atom feeds via `rel=alternate` link tags.
+
+- `HTTP_RESPONSE_DURATION`: Time elapsed between sending the HTTP request and receiving the response headers.
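Each criterion above maps to one entry in the spider's per-site result, and the overall score is simply the sum of the per-criterion scores (see the spider.py diff below). A minimal sketch of that aggregation, with made-up example values:

# Minimal sketch of the scoring aggregation introduced in spider.py below;
# the criterion values shown here are invented for illustration.
result = {
    'DNS_RESOLVABLE_IPV4': {'type': 'boolean', 'value': True, 'score': 1},
    'HTTPS': {'type': 'boolean', 'value': False, 'score': 0},
    'HTTP_RESPONSE_DURATION': {'type': 'number', 'value': 250, 'score': 2.0},
}

# Overall score: sum of the individual criterion scores.
score = sum(item['score'] for item in result.values())
print(score)  # 3.0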
spider.py | 230
@@ -14,9 +14,11 @@ import random
 import re
 import requests
 import shutil
+import statistics
 import sys
 import yaml
+

 # configuration

 # number of parallel processes to use for crawling
@@ -31,7 +33,7 @@ read_timeout = 10
 # Git repo for our data
 green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
 # folder in that repo that holds the data
-green_direcory_data_path = 'data'
+green_direcory_data_path = 'data/countries/de'
 green_directory_local_path = './cache/green-directory'

 result_path = './webapp/dist/data'
@@ -153,11 +155,11 @@ def check_content(r):

     # icon
     result['icon'] = None
-    link = soup.find('link', rel='icon')
+    link = soup.find('link', rel=lambda x: x and x.lower()=='icon')
     if link:
         result['icon'] = urljoin(r.url, link.get('href'))
     else:
-        link = soup.find('link', rel='shortcut icon')
+        link = soup.find('link', rel=lambda x: x and x.lower()=='shortcut icon')
         if link:
             result['icon'] = urljoin(r.url, link.get('href'))

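The switch to a callable `rel` filter makes the icon lookup case-insensitive, so markup such as rel="SHORTCUT ICON" is found as well. A small self-contained illustration (assuming beautifulsoup4 with the stdlib html.parser):

from bs4 import BeautifulSoup

html = '<html><head><link rel="SHORTCUT ICON" href="/favicon.ico"></head></html>'
soup = BeautifulSoup(html, 'html.parser')

# The exact string match misses the upper-case variant ...
print(soup.find('link', rel='shortcut icon'))  # None

# ... while the callable normalizes case before comparing. The 'x and'
# guard skips link tags that carry no rel attribute at all.
link = soup.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
print(link.get('href'))  # /favicon.ico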
@@ -192,7 +194,20 @@ def check_content(r):
     return result


-def check_site(url):
+def collect_ipv4_addresses(hostname_dict):
+    """
+    Return list of unique IPv4 addresses
+    """
+    ips = set()
+    for item in hostname_dict.values():
+        if 'ip_addresses' not in item:
+            continue
+        for ip in item['ip_addresses']:
+            ips.add(ip)
+    return sorted(list(ips))
+
+
+def check_site(entry):
     """
     Performs our site check and returns results as a dict.

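For illustration, a hypothetical hostname dict in the shape that check_site() builds further down, and what the new helper returns for it (assuming the collect_ipv4_addresses() definition from the hunk above is in scope; names and addresses are invented):

# Input mirrors the 'processed_hostnames' structure built in check_site() below.
processed_hostnames = {
    'beispiel-gruene.de': {'resolvable': True, 'ip_addresses': ['192.0.2.10']},
    'www.beispiel-gruene.de': {'resolvable': True, 'ip_addresses': ['192.0.2.10']},
    'kaputt.example': {'resolvable': False},  # no 'ip_addresses' key at all
}

print(collect_ipv4_addresses(processed_hostnames))
# ['192.0.2.10'] -- deduplicated, unresolvable entries skipped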
@@ -205,49 +220,88 @@ def check_site(url):
         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
     }

     # all the info we'll return for the site
     result = {
-        'input_url': url,
-        'hostnames': [],
-        'resolvable_urls': [],
-        'canonical_urls': [],
-        'urlchecks': [],
+        # input_url: The URL we derived all checks from
+        'input_url': entry['url'],
+        # Meta: Regional and type metadata for the site
+        'meta': {
+            'level': entry['level'],
+            'state': entry['state'],
+            'district': entry['district'],
+            'city': entry['city'],
+        },
+        # Details: All details we collected about the site (which aren't directly related to the report criteria)
+        'details': {
+            'hostnames': {},
+            'ipv4_addresses': [],
+            'resolvable_urls': [],
+            'canonical_urls': [],
+            'urlchecks': [],
+            'icons': [],
+            'feeds': [],
+        },
+        # The actual report criteria
+        'result': {
+            'DNS_RESOLVABLE_IPV4': {'type': 'boolean', 'value': False, 'score': 0},
+            'SITE_REACHABLE': {'type': 'boolean', 'value': False, 'score': 0},
+            'HTTPS': {'type': 'boolean', 'value': False, 'score': 0},
+            'WWW_OPTIONAL': {'type': 'boolean', 'value': False, 'score': 0},
+            'CANONICAL_URL': {'type': 'boolean', 'value': False, 'score': 0},
+            'FAVICON': {'type': 'boolean', 'value': False, 'score': 0},
+            'FEEDS': {'type': 'boolean', 'value': False, 'score': 0},
+            'HTTP_RESPONSE_DURATION': {'type': 'number', 'value': None, 'score': 0},
+        },
+        'score': 0.0,
     }

-    # derive hostnames to test
-    parsed = urlparse(url)
+    # derive hostnames to test (with/without www.)
+    parsed = urlparse(entry['url'])
     hostnames = derive_test_hostnames(parsed.hostname)

-    processed_hostnames = []
+    # try to resolve hostnames
+    processed_hostnames = {}
     for hn in hostnames:

-        record = {
-            'input_hostname': hn,
+        processed_hostnames[hn] = {
             'resolvable': False,
         }

         try:
             hostname, aliases, ip_addresses = gethostbyname_ex(hn)
-            record['resolvable'] = True
-            record['resolved_hostname'] = hostname
-            record['aliases'] = aliases
-            record['ip_addresses'] = ip_addresses
+            processed_hostnames[hn]['resolvable'] = True
+            processed_hostnames[hn]['resolved_hostname'] = hostname
+            processed_hostnames[hn]['aliases'] = aliases
+            processed_hostnames[hn]['ip_addresses'] = ip_addresses
         except:
             pass

-        processed_hostnames.append(record)
+    result['details']['hostnames'] = processed_hostnames

-    result['hostnames'] = sorted(processed_hostnames, key=lambda hn: hn['input_hostname'])
+    result['details']['ipv4_addresses'] = collect_ipv4_addresses(processed_hostnames)

     # check basic HTTP(S) reachability
     checked_urls = []
-    for item in processed_hostnames:
+    checked_urls_set = set()
+
+    for hn in processed_hostnames.keys():
+
+        item = processed_hostnames[hn]
+
         if not item['resolvable']:
             continue

         for scheme in ('http', 'https'):
+
+            url = scheme + '://' + item['resolved_hostname'] + '/'
+
+            if url in checked_urls_set:
+                continue
+
+            checked_urls_set.add(url)

             record = {
-                'url': scheme + '://' + item['resolved_hostname'] + '/',
+                'url': url,
                 'error': None,
                 'redirects_to': None,
             }
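For reference, gethostbyname_ex is the stdlib resolver call used in the loop above; a standalone sketch (the result naturally depends on DNS at run time):

from socket import gethostbyname_ex

# Returns (canonical hostname, list of alias names, list of IPv4 addresses)
# and raises socket.gaierror for unresolvable names -- which is why the
# spider wraps the call in try/except and leaves 'resolvable' at False.
hostname, aliases, ip_addresses = gethostbyname_ex('www.gruene.de')
print(hostname, aliases, ip_addresses)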
@@ -255,7 +309,7 @@ def check_site(url):
             try:
                 r = requests.head(record['url'], headers=headers, allow_redirects=True)
                 if r.url == url:
-                    logging.info("URL: %s - status %s - no redirect" % (record['url'], r.status_code))
+                    logging.info("URL: %s - status %s" % (record['url'], r.status_code))
                 else:
                     logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
                     record['redirects_to'] = r.url
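requests.head() with allow_redirects=True follows the whole redirect chain, so r.url ends up holding the final URL; comparing it to the requested URL is what distinguishes the two log lines above. A standalone sketch, with example.com as a stand-in:

import requests

r = requests.head('http://example.com/', allow_redirects=True)
# r.url is the final URL after any redirects were followed.
if r.url == 'http://example.com/':
    print("no redirect, status %s" % r.status_code)
else:
    print("redirects to %s" % r.url)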
@@ -268,13 +322,13 @@ def check_site(url):

             checked_urls.append(record)

-    result['resolvable_urls'] = sorted(checked_urls, key=lambda url: url['url'])
-    result['canonical_urls'] = sorted(reduce_urls(checked_urls))
+    result['details']['resolvable_urls'] = sorted(checked_urls, key=lambda url: url['url'])
+    result['details']['canonical_urls'] = sorted(reduce_urls(checked_urls))

     # Deeper test for the remaining (canonical) URL(s)
-    for check_url in result['canonical_urls']:
+    for check_url in result['details']['canonical_urls']:

-        logging.info("Checking URL %s" % check_url)
+        logging.info("Downloading URL %s" % check_url)

         check = {
             'url': check_url,
@@ -306,27 +360,111 @@ def check_site(url):
             logging.error(str(e) + " " + check_url)
             check['error'] = "unknown"

-        result['urlchecks'].append(check)
+        result['details']['urlchecks'].append(check)


-    result['urlchecks'] = sorted(result['urlchecks'], key=lambda url: url['url'])
+    result['details']['urlchecks'] = sorted(result['details']['urlchecks'], key=lambda url: url['url'])
+
+    # collect icons
+    for c in result['details']['urlchecks']:
+        if 'icon' in c['content'] and c['content']['icon']:
+            if c['content']['icon'] in result['details']['icons']:
+                continue
+            result['details']['icons'].append(c['content']['icon'])
+
+    # collect feeds
+    for c in result['details']['urlchecks']:
+        if 'feeds' in c['content'] and len(c['content']['feeds']):
+            for feed in c['content']['feeds']:
+                if feed in result['details']['feeds']:
+                    continue
+                result['details']['feeds'].append(feed)
+
+
+    ### Derive criteria
+
+    # DNS_RESOLVABLE_IPV4
+    if len(result['details']['ipv4_addresses']):
+        result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1}
+
+    # SITE_REACHABLE
+    for item in result['details']['resolvable_urls']:
+        if item['error'] is None:
+            result['result']['SITE_REACHABLE'] = {'value': True, 'score': 1}
+            break
+
+    # HTTPS
+    for item in result['details']['urlchecks']:
+        if item['error'] is None and item['url'].startswith('https://'):
+            result['result']['HTTPS'] = {'value': True, 'score': 1}
+            break
+
+    # WWW_OPTIONAL
+    num_hostnames = 0
+    for hn in result['details']['hostnames'].keys():
+        item = result['details']['hostnames'][hn]
+        if not item['resolvable']:
+            continue
+        num_hostnames += 1
+    if num_hostnames > 1:
+        result['result']['WWW_OPTIONAL'] = {'value': True, 'score': 1}
+
+    # CANONICAL_URL
+    # - either there is only one canonical URL (through redirects)
+    # - or several pages have identical rel=canonical links
+    if len(result['details']['canonical_urls']) == 1:
+        result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
+    else:
+        links = set()
+        for item in result['details']['urlchecks']:
+            if item['content']['canonical_link'] is not None:
+                links.add(item['content']['canonical_link'])
+        if len(links) == 1:
+            result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
+
+    # FAVICON
+    if len(result['details']['icons']):
+        result['result']['FAVICON'] = {'value': True, 'score': 1}
+
+    # FEEDS
+    if len(result['details']['feeds']):
+        result['result']['FEEDS'] = {'value': True, 'score': 1}
+
+    # HTTP_RESPONSE_DURATION
+    durations = []
+    for item in result['details']['urlchecks']:
+        if item['error'] is None:
+            durations.append(item['duration'])
+    result['result']['HTTP_RESPONSE_DURATION'] = {
+        'value': round(statistics.mean(durations)),
+        'score': 1.0/statistics.mean(durations) * 500
+    }
+
+    # Overall score
+    for item in result['result'].keys():
+        result['score'] += result['result'][item]['score']

     return result


 def main():
     """
     Bringing it all together
     """
     logging.basicConfig(level=logging.INFO)
     logging.getLogger("urllib3").setLevel(logging.CRITICAL)

     # refresh our local clone of the green directory
     get_green_directory()

-    urls = []
+    # build the list of website URLs to run checks for
+    input_entries = []

     for entry in dir_entries():

         if 'type' not in entry:
             logging.error("Entry without type")
             continue

         if 'urls' not in entry:
             logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
             continue
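HTTP_RESPONSE_DURATION is the only non-boolean criterion: assuming durations are measured in milliseconds, 1.0 / mean * 500 yields 2.0 points for a 250 ms mean and 0.5 points for a 1000 ms mean. A worked sketch; the empty-list guard is an addition of this sketch, since statistics.mean([]) raises StatisticsError and the committed code has no such guard:

import statistics

def duration_score(durations):
    # Sketch of the formula above, plus a guard the commit does not have:
    # statistics.mean([]) raises StatisticsError if every URL check failed.
    if not durations:
        return None, 0
    mean = statistics.mean(durations)
    return round(mean), 1.0 / mean * 500

print(duration_score([200, 300]))  # (250, 2.0)
print(duration_score([1000]))      # (1000, 0.5)
print(duration_score([]))          # (None, 0)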
@@ -339,23 +477,29 @@ def main():
             except NameError as ne:
                 logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))

         if website_url:
-            urls.append(website_url)
+            input_entries.append({
+                "url": website_url,
+                "level": entry.get("level"),
+                "state": entry.get("state"),
+                "district": entry.get("district"),
+                "city": entry.get("city"),
+            })

     # randomize order, to distribute requests over servers
     random.seed()
-    random.shuffle(urls)
+    random.shuffle(input_entries)

     # run checks
     results = {}

-    if concurrency > 1:
-        pool = Pool(concurrency)
-        for url in urls:
-            results[url] = pool.apply_async(check_site, kwds={"url": url})
-        pool.close()
-        pool.join()
-    else:
-        for url in urls:
-            results[url] = check_site(url)
+    pool = Pool(concurrency)
+    for entry in input_entries:
+        results[entry['url']] = pool.apply_async(check_site, kwds={'entry': entry})
+    pool.close()
+    pool.join()

     # Restructure result from dict of ApplyResult
     # to list of dicts and sort in stable way
     results2 = []
     done = set()
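Since every check now goes through the pool, each value stored in results is an AsyncResult handle rather than the check's return value, hence the restructuring step that follows in the commit. A minimal sketch of the same pattern, with a placeholder worker:

from multiprocessing import Pool

def check_site(entry):
    # Placeholder standing in for the real check in spider.py.
    return {'input_url': entry['url'], 'score': 1.0}

if __name__ == '__main__':
    input_entries = [{'url': 'https://example.com/'}, {'url': 'https://example.org/'}]

    results = {}
    pool = Pool(2)
    for entry in input_entries:
        results[entry['url']] = pool.apply_async(check_site, kwds={'entry': entry})
    pool.close()
    pool.join()

    # Each stored value is an AsyncResult; .get() unwraps the worker's dict.
    results2 = [results[url].get() for url in sorted(results.keys())]
    print(results2)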
webapp/dist/bundle.js | 26 (vendored)
File diff suppressed because one or more lines are too long
webapp/dist/index.html | 6 (vendored)
@@ -43,8 +43,14 @@
 <table class="table">
   <thead>
     <tr>
+      <th scope="col">Typ</th>
+      <th scope="col">Land</th>
+      <th scope="col">Kreis</th>
+      <th scope="col">Stadt</th>
       <th scope="col">URL</th>
+      <th scope="col">Score</th>
       <th scope="col">IP-Adresse</th>
+      <th scope="col">Erreichbar</th>
       <th scope="col">Icon</th>
       <th scope="col"><abbr title="Site ist sowohl mit www. als auch ohne www. in URL erreichbar">www. optional</abbr></th>
       <th scope="col"><abbr title="URL-Varianten leiten auf eine einzige Startseiten-URL weiter">Kanonische URL</abbr></th>
@@ -18,54 +18,65 @@ $(function(){

         var row = $(document.createElement('tr'));

+        // typ
+        var level = null;
+        if (item.meta.level === 'DE:ORTSVERBAND') {
+            level = 'OV';
+        } else if (item.meta.level === 'DE:KREISVERBAND') {
+            level = 'KV';
+        } else if (item.meta.level === 'DE:LANDESVERBAND') {
+            level = 'LV';
+        }
+        row.append('<td>' + (level === null ? '' : level) + '</td>');
+
+        // land
+        row.append('<td>' + (item.meta.state === null ? '' : item.meta.state) + '</td>');
+
+        // kreis
+        row.append('<td>' + (item.meta.district === null ? '' : item.meta.district) + '</td>');
+
+        // stadt
+        row.append('<td>' + (item.meta.city === null ? '' : item.meta.city) + '</td>');
+
         // input URL
         row.append('<td><a href="' + item.input_url + '">' + punycode.toUnicode(item.input_url) + '</a></td>');

+        // score
+        row.append('<td>' + item.score.toFixed(1) + '</td>');
+
         // IPs
-        var ips = _.join(_.uniq(_.flatten(_.map(item.hostnames, 'ip_addresses'))), ', ');
-        row.append('<td class="'+ (ips === '' ? 'bad' : 'good') +' text-center">' + (ips === '' ? '❌ Keine' : ips) + '</td>');
+        var ips = _.join(item.details.ipv4_addresses, ', ');
+        row.append('<td class="'+ (ips === '' ? 'bad' : 'good') +' text-center">' + (ips === '' ? '❌' : ips) + '</td>');

-        // icon
-        var icons = [];
-        var icon = false;
-        icons = _.uniq(_.map(item.urlchecks, 'content.icon'));
-        if (icons.length > 0 && icons[0]) {
-            icon = icons[0];
-        }
-        row.append('<td class="' + (icon ? 'good' : 'bad') + ' text-center">' + (icon ? ('<img src="' + icon + '" class="icon"/>') : '❌') + '</td>');
+        // SITE_REACHABLE
+        row.append('<td class="'+ (item.result.SITE_REACHABLE.value ? 'good' : 'bad') +' text-center">' + (item.result.SITE_REACHABLE.value ? '✅' : '❌') + '</td>');

-        // hostnames
-        var twoHostnames = false;
-        if (_.filter(item.hostnames, {'resolvable': true}).length === 2) {
-            twoHostnames = true;
-        };
-        row.append('<td class="'+ (twoHostnames ? 'good' : 'bad') +' text-center">' + (twoHostnames ? '✅' : '❌') + '</td>');
+        // FAVICON
+        var icon = item.result.FAVICON.value;
+        row.append('<td class="' + (icon ? 'good' : 'bad') + ' text-center">' + (icon ? ('<img src="' + item.details.icons[0] + '" class="icon">') : '❌') + '</td>');
+
+        // WWW_OPTIONAL
+        var wwwOptional = item.result.WWW_OPTIONAL.value;
+        row.append('<td class="'+ (wwwOptional ? 'good' : 'bad') +' text-center">' + (wwwOptional ? '✅' : '❌') + '</td>');

         // one canonical URL
-        var canonical = false;
-        if (item.canonical_urls.length === 1) canonical = true;
-        var canonical_links = _.uniq(_.map(item.urlchecks, 'content.canonical_link'));
-        if (canonical_links.length === 1) canonical = true;
+        var canonical = item.result.CANONICAL_URL.value;
         row.append('<td class="'+ (canonical ? 'good' : 'bad') +' text-center">' + (canonical ? '✅' : '❌') + '</td>');

         // https
-        var hasHTTPS = false;
-        hasHTTPS = _.find(item.canonical_urls, function(o){
-            return o.indexOf('https://') !== -1;
-        });
+        var hasHTTPS = item.result.HTTPS.value;
         row.append('<td class="'+ (hasHTTPS ? 'good' : 'bad') +' text-center">' + (hasHTTPS ? '✅' : '❌') + '</td>');

         // feeds
-        var feeds = false;
-        feeds = _.uniq(_.flatten(_.map(item.urlchecks, 'content.feeds')));
-        row.append('<td class="'+ (feeds.length ? 'good' : 'bad') +' text-center">' + (feeds.length ? '✅' : '❌') + '</td>');
+        var feeds = item.result.FEEDS.value;
+        row.append('<td class="'+ (feeds ? 'good' : 'bad') +' text-center">' + (feeds ? '✅' : '❌') + '</td>');

         // screenshots
         var screenshot = false;
-        if (item.canonical_urls.length > 0) {
-            if (typeof screenshots[item.canonical_urls[0]] !== 'undefined') {
-                var surl = 'http://green-spider-screenshots.sendung.de/320x640/'+screenshots[item.canonical_urls[0]];
-                var lurl = 'http://green-spider-screenshots.sendung.de/1500x1500/'+screenshots[item.canonical_urls[0]];
+        if (item.details.canonical_urls.length > 0) {
+            if (typeof screenshots[item.details.canonical_urls[0]] !== 'undefined') {
+                var surl = 'http://green-spider-screenshots.sendung.de/320x640/'+screenshots[item.details.canonical_urls[0]];
+                var lurl = 'http://green-spider-screenshots.sendung.de/1500x1500/'+screenshots[item.details.canonical_urls[0]];
                 screenshot = '<a class="screenshot" href="'+ surl +'" target="_blank" title="Mobile">M</a>';
                 screenshot += '<a class="screenshot" href="'+ lurl +'" target="_blank" title="Desktop">D</a>';
             }