Move test criteria logic into spider, add scoring

Marian Steinbach 2018-04-19 00:51:59 +02:00
parent d4ac1fa9f5
commit ea05bf5192
7 changed files with 286 additions and 100 deletions

KRITERIEN.md (new file, 19 lines added)

@@ -0,0 +1,19 @@
# Quality criteria

We check sites against the following criteria (a sketch of how each criterion is reported follows the list):

- `DNS_RESOLVABLE_IPV4`: The URL's hostname can be resolved to an IPv4 address.
- `SITE_REACHABLE`: The site is reachable via HTTP(S) (status code 200).
- `HTTPS`: The site is reachable via HTTPS. The server certificate is valid and issued by a trusted CA.
- `WWW_OPTIONAL`: The `www.` prefix at the start of the home page URL is optional; the site is reachable both with and without it in the hostname.
- `CANONICAL_URL`: If the site can be accessed via several URLs, they redirect to one canonical URL and/or point to it via a `rel=canonical` link.
- `FAVICON`: The site has a favorite icon (favicon).
- `FEEDS`: The site points to RSS or Atom feeds via `rel=alternate` link tags.
- `HTTP_RESPONSE_DURATION`: The time elapsed between sending the HTTP request and receiving the response headers.
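
As a minimal sketch (not part of this commit's files, mirroring the result structure introduced in spider.py below; all values hypothetical), each criterion ends up in a per-site report like this:

```python
# One site's report entry, mirroring the structure added in spider.py below.
result = {
    'DNS_RESOLVABLE_IPV4':    {'type': 'boolean', 'value': True, 'score': 1},
    'SITE_REACHABLE':         {'type': 'boolean', 'value': True, 'score': 1},
    'HTTP_RESPONSE_DURATION': {'type': 'number',  'value': 500,  'score': 1.0},
}
score = sum(item['score'] for item in result.values())  # 3.0
```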

File diff suppressed because one or more lines are too long


@@ -43,8 +43,14 @@
     <table class="table">
       <thead>
         <tr>
+          <th scope="col">Typ</th>
+          <th scope="col">Land</th>
+          <th scope="col">Kreis</th>
+          <th scope="col">Stadt</th>
           <th scope="col">URL</th>
+          <th scope="col">Score</th>
           <th scope="col">IP-Adresse</th>
           <th scope="col">Erreichbar</th>
           <th scope="col">Icon</th>
           <th scope="col"><abbr title="Site ist sowohl mit www. als auch ohne www. in URL erreichbar">www. optional</abbr></th>
           <th scope="col"><abbr title="URL-Varianten leiten auf eine einzige Startseiten-URL weiter">Kanonische URL</abbr></th>

spider.py (230 lines changed)

@@ -14,9 +14,11 @@ import random
 import re
 import requests
 import shutil
+import statistics
 import sys
 import yaml

 # configuration

 # number of parallel processes to use for crawling
@@ -31,7 +33,7 @@ read_timeout = 10
 # Git repo for our data
 green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
 # folder in that repo that holds the data
-green_direcory_data_path = 'data'
+green_direcory_data_path = 'data/countries/de'
 green_directory_local_path = './cache/green-directory'

 result_path = './webapp/dist/data'
@@ -153,11 +155,11 @@ def check_content(r):
     # icon
     result['icon'] = None
-    link = soup.find('link', rel='icon')
+    link = soup.find('link', rel=lambda x: x and x.lower()=='icon')
     if link:
         result['icon'] = urljoin(r.url, link.get('href'))
     else:
-        link = soup.find('link', rel='shortcut icon')
+        link = soup.find('link', rel=lambda x: x and x.lower()=='shortcut icon')
         if link:
             result['icon'] = urljoin(r.url, link.get('href'))
@@ -192,7 +194,20 @@ def check_content(r):
     return result

-def check_site(url):
+def collect_ipv4_addresses(hostname_dict):
+    """
+    Return list of unique IPv4 addresses
+    """
+    ips = set()
+    for item in hostname_dict.values():
+        if 'ip_addresses' not in item:
+            continue
+        for ip in item['ip_addresses']:
+            ips.add(ip)
+    return sorted(list(ips))
+
+
+def check_site(entry):
     """
     Performs our site check and returns results as a dict.
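
A quick usage sketch for the new `collect_ipv4_addresses` helper (hostnames and addresses are hypothetical):

```python
hostnames = {
    'gruene-example.de': {'resolvable': True, 'ip_addresses': ['192.0.2.10']},
    'www.gruene-example.de': {'resolvable': False},  # no 'ip_addresses' key, skipped
}
print(collect_ipv4_addresses(hostnames))  # ['192.0.2.10']
```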
@@ -205,49 +220,88 @@ def check_site(url):
         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
     }

     # all the info we'll return for the site
     result = {
-        'input_url': url,
-        'hostnames': [],
-        'resolvable_urls': [],
-        'canonical_urls': [],
-        'urlchecks': [],
+        # input_url: The URL we derived all checks from
+        'input_url': entry['url'],
+        # Meta: Regional and type metadata for the site
+        'meta': {
+            'level': entry['level'],
+            'state': entry['state'],
+            'district': entry['district'],
+            'city': entry['city'],
+        },
+        # Details: All details we collected about the site (which aren't directly related to the report criteria)
+        'details': {
+            'hostnames': {},
+            'ipv4_addresses': [],
+            'resolvable_urls': [],
+            'canonical_urls': [],
+            'urlchecks': [],
+            'icons': [],
+            'feeds': [],
+        },
+        # The actual report criteria
+        'result': {
+            'DNS_RESOLVABLE_IPV4': {'type': 'boolean', 'value': False, 'score': 0},
+            'SITE_REACHABLE': {'type': 'boolean', 'value': False, 'score': 0},
+            'HTTPS': {'type': 'boolean', 'value': False, 'score': 0},
+            'WWW_OPTIONAL': {'type': 'boolean', 'value': False, 'score': 0},
+            'CANONICAL_URL': {'type': 'boolean', 'value': False, 'score': 0},
+            'FAVICON': {'type': 'boolean', 'value': False, 'score': 0},
+            'FEEDS': {'type': 'boolean', 'value': False, 'score': 0},
+            'HTTP_RESPONSE_DURATION': {'type': 'number', 'value': None, 'score': 0},
+        },
+        'score': 0.0,
     }

-    # derive hostnames to test
-    parsed = urlparse(url)
+    # derive hostnames to test (with/without www.)
+    parsed = urlparse(entry['url'])
     hostnames = derive_test_hostnames(parsed.hostname)

-    processed_hostnames = []
+    # try to resolve hostnames
+    processed_hostnames = {}
     for hn in hostnames:
-        record = {
-            'input_hostname': hn,
+        processed_hostnames[hn] = {
             'resolvable': False,
         }
         try:
             hostname, aliases, ip_addresses = gethostbyname_ex(hn)
-            record['resolvable'] = True
-            record['resolved_hostname'] = hostname
-            record['aliases'] = aliases
-            record['ip_addresses'] = ip_addresses
+            processed_hostnames[hn]['resolvable'] = True
+            processed_hostnames[hn]['resolved_hostname'] = hostname
+            processed_hostnames[hn]['aliases'] = aliases
+            processed_hostnames[hn]['ip_addresses'] = ip_addresses
         except:
             pass

-        processed_hostnames.append(record)
+    result['details']['hostnames'] = processed_hostnames
-    result['hostnames'] = sorted(processed_hostnames, key=lambda hn: hn['input_hostname'])
+    result['details']['ipv4_addresses'] = collect_ipv4_addresses(processed_hostnames)

     # check basic HTTP(S) reachability
     checked_urls = []
-    for item in processed_hostnames:
+    checked_urls_set = set()
+
+    for hn in processed_hostnames.keys():
+        item = processed_hostnames[hn]
+
         if not item['resolvable']:
             continue

         for scheme in ('http', 'https'):
+            url = scheme + '://' + item['resolved_hostname'] + '/'
+
+            if url in checked_urls_set:
+                continue
+
+            checked_urls_set.add(url)
+
             record = {
-                'url': scheme + '://' + item['resolved_hostname'] + '/',
+                'url': url,
                 'error': None,
                 'redirects_to': None,
             }
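
`derive_test_hostnames` is not part of this diff. A minimal sketch of what it plausibly does, assuming it only toggles the `www.` prefix (which is all the `WWW_OPTIONAL` criterion needs):

```python
def derive_test_hostnames(hostname):
    # Toggle the "www." prefix so both variants get tested.
    hostnames = {hostname}
    if hostname.startswith('www.'):
        hostnames.add(hostname[len('www.'):])
    else:
        hostnames.add('www.' + hostname)
    return sorted(hostnames)
```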
@@ -255,7 +309,7 @@ def check_site(url):
             try:
                 r = requests.head(record['url'], headers=headers, allow_redirects=True)
                 if r.url == url:
-                    logging.info("URL: %s - status %s - no redirect" % (record['url'], r.status_code))
+                    logging.info("URL: %s - status %s" % (record['url'], r.status_code))
                 else:
                     logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
                     record['redirects_to'] = r.url
@@ -268,13 +322,13 @@ def check_site(url):
             checked_urls.append(record)

-    result['resolvable_urls'] = sorted(checked_urls, key=lambda url: url['url'])
-    result['canonical_urls'] = sorted(reduce_urls(checked_urls))
+    result['details']['resolvable_urls'] = sorted(checked_urls, key=lambda url: url['url'])
+    result['details']['canonical_urls'] = sorted(reduce_urls(checked_urls))

     # Deeper test for the remaining (canonical) URL(s)
-    for check_url in result['canonical_urls']:
+    for check_url in result['details']['canonical_urls']:

-        logging.info("Checking URL %s" % check_url)
+        logging.info("Downloading URL %s" % check_url)

         check = {
             'url': check_url,
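
`reduce_urls` (used above) is also outside this diff. A plausible reduction, assuming it collapses each reachable URL to its final redirect target and deduplicates:

```python
def reduce_urls(url_records):
    # Keep the redirect target if there is one, otherwise the URL itself;
    # skip URLs that produced an error.
    targets = set()
    for record in url_records:
        if record['error'] is not None:
            continue
        targets.add(record['redirects_to'] or record['url'])
    return list(targets)
```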
@@ -306,27 +360,111 @@ def check_site(url):
             logging.error(str(e) + " " + check_url)
             check['error'] = "unknown"

-        result['urlchecks'].append(check)
+        result['details']['urlchecks'].append(check)

-    result['urlchecks'] = sorted(result['urlchecks'], key=lambda url: url['url'])
+    result['details']['urlchecks'] = sorted(result['details']['urlchecks'], key=lambda url: url['url'])

+    # collect icons
+    for c in result['details']['urlchecks']:
+        if 'icon' in c['content'] and c['content']['icon']:
+            if c['content']['icon'] in result['details']['icons']:
+                continue
+            result['details']['icons'].append(c['content']['icon'])
+
+    # collect feeds
+    for c in result['details']['urlchecks']:
+        if 'feeds' in c['content'] and len(c['content']['feeds']):
+            for feed in c['content']['feeds']:
+                if feed in result['details']['feeds']:
+                    continue
+                result['details']['feeds'].append(feed)
+    ### Derive criteria
+
+    # DNS_RESOLVABLE_IPV4
+    if len(result['details']['ipv4_addresses']):
+        result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1}
+
+    # SITE_REACHABLE
+    for item in result['details']['resolvable_urls']:
+        if item['error'] is None:
+            result['result']['SITE_REACHABLE'] = {'value': True, 'score': 1}
+            break
+
+    # HTTPS
+    for item in result['details']['urlchecks']:
+        if item['error'] is None and item['url'].startswith('https://'):
+            result['result']['HTTPS'] = {'value': True, 'score': 1}
+            break
+
+    # WWW_OPTIONAL
+    num_hostnames = 0
+    for hn in result['details']['hostnames'].keys():
+        item = result['details']['hostnames'][hn]
+        if not item['resolvable']:
+            continue
+        num_hostnames += 1
+    if num_hostnames > 1:
+        result['result']['WWW_OPTIONAL'] = {'value': True, 'score': 1}
+
+    # CANONICAL_URL
+    # - either there is only one canonical URL (through redirects)
+    # - or several pages have identical rel=canonical links
+    if len(result['details']['canonical_urls']) == 1:
+        result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
+    else:
+        links = set()
+        for item in result['details']['urlchecks']:
+            if item['content']['canonical_link'] is not None:
+                links.add(item['content']['canonical_link'])
+        if len(links) == 1:
+            result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
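
A worked example for the two branches above, with hypothetical URLs:

```python
# Branch 1: every reachable variant redirects to one start page.
canonical_urls = ['https://www.gruene-example.de/']
# len(canonical_urls) == 1 -> CANONICAL_URL passes

# Branch 2: two variants stay distinct, but both pages declare the same
# <link rel="canonical"> target, so the criterion still passes.
links = {'https://www.gruene-example.de/'}
# len(links) == 1 -> CANONICAL_URL passes
```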
+    # FAVICON
+    if len(result['details']['icons']):
+        result['result']['FAVICON'] = {'value': True, 'score': 1}
+
+    # FEEDS
+    if len(result['details']['feeds']):
+        result['result']['FEEDS'] = {'value': True, 'score': 1}
+
+    # HTTP_RESPONSE_DURATION
+    durations = []
+    for item in result['details']['urlchecks']:
+        if item['error'] is None:
+            durations.append(item['duration'])
+    result['result']['HTTP_RESPONSE_DURATION'] = {
+        'value': round(statistics.mean(durations)),
+        'score': 1.0/statistics.mean(durations) * 500
+    }
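
The duration score is inversely proportional to the mean response time, so a 500 ms mean is worth exactly one point (this sketch assumes `duration` is measured in milliseconds):

```python
import statistics

durations = [400, 600]             # hypothetical samples, in ms
mean = statistics.mean(durations)  # 500.0
value = round(mean)                # 500
score = 1.0 / mean * 500           # 1.0; a 250 ms mean would score 2.0
```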
+    # Overall score
+    for item in result['result'].keys():
+        result['score'] += result['result'][item]['score']

     return result
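
The total is the plain sum of the per-criterion scores: seven boolean criteria worth one point each, plus the unbounded duration score. A hypothetical example:

```python
scores = [1, 1, 1, 1, 1, 1, 1, 1.25]  # hypothetical per-criterion scores
total = sum(scores)                    # 9.25, shown in the webapp's Score column
```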

 def main():
     """
     Bringing it all together
     """
     logging.basicConfig(level=logging.INFO)
     logging.getLogger("urllib3").setLevel(logging.CRITICAL)

     # refresh our local clone of the green directory
     get_green_directory()

-    urls = []
+    # build the list of website URLs to run checks for
+    input_entries = []

     for entry in dir_entries():

         if 'type' not in entry:
             logging.error("Entry without type")
             continue

         if 'urls' not in entry:
             logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
             continue
@@ -339,23 +477,29 @@ def main():
             except NameError as ne:
                 logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))

         if website_url:
-            urls.append(website_url)
+            input_entries.append({
+                "url": website_url,
+                "level": entry.get("level"),
+                "state": entry.get("state"),
+                "district": entry.get("district"),
+                "city": entry.get("city"),
+            })

     # randomize order, to distribute requests over servers
     random.seed()
-    random.shuffle(urls)
+    random.shuffle(input_entries)

     # run checks
     results = {}

-    if concurrency > 1:
-        pool = Pool(concurrency)
-        for url in urls:
-            results[url] = pool.apply_async(check_site, kwds={"url": url})
-        pool.close()
-        pool.join()
-    else:
-        for url in urls:
-            results[url] = check_site(url)
+    pool = Pool(concurrency)
+    for entry in input_entries:
+        results[entry['url']] = pool.apply_async(check_site, kwds={'entry': entry})
+    pool.close()
+    pool.join()

+    # Restructure result from dict of ApplyResult
+    # to list of dicts and sort in stable way
+    results2 = []
+    done = set()
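
The hunk is cut off after `done = set()`; presumably each `ApplyResult` still has to be resolved via `get()`. A minimal sketch of that continuation, under that assumption:

```python
# Hypothetical continuation: resolve every AsyncResult exactly once,
# in a stable (sorted) order.
for url in sorted(results.keys()):
    if url in done:
        continue
    done.add(url)
    results2.append(results[url].get())
```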

webapp/dist/bundle.js (vendored, 26 lines changed; diff suppressed because one or more lines are too long)


@@ -43,8 +43,14 @@
     <table class="table">
       <thead>
         <tr>
+          <th scope="col">Typ</th>
+          <th scope="col">Land</th>
+          <th scope="col">Kreis</th>
+          <th scope="col">Stadt</th>
           <th scope="col">URL</th>
+          <th scope="col">Score</th>
           <th scope="col">IP-Adresse</th>
           <th scope="col">Erreichbar</th>
           <th scope="col">Icon</th>
           <th scope="col"><abbr title="Site ist sowohl mit www. als auch ohne www. in URL erreichbar">www. optional</abbr></th>
           <th scope="col"><abbr title="URL-Varianten leiten auf eine einzige Startseiten-URL weiter">Kanonische URL</abbr></th>


@@ -18,54 +18,65 @@ $(function(){

         var row = $(document.createElement('tr'));

+        // typ
+        var level = null;
+        if (item.meta.level === 'DE:ORTSVERBAND') {
+            level = 'OV';
+        } else if (item.meta.level === 'DE:KREISVERBAND') {
+            level = 'KV';
+        } else if (item.meta.level === 'DE:LANDESVERBAND') {
+            level = 'LV';
+        }
+        row.append('<td>' + (level === null ? '' : level) + '</td>');
+
+        // land
+        row.append('<td>' + (item.meta.state === null ? '' : item.meta.state) + '</td>');
+
+        // kreis
+        row.append('<td>' + (item.meta.district === null ? '' : item.meta.district) + '</td>');
+
+        // stadt
+        row.append('<td>' + (item.meta.city === null ? '' : item.meta.city) + '</td>');
+
         // input URL
         row.append('<td><a href="' + item.input_url + '">' + punycode.toUnicode(item.input_url) + '</a></td>');

+        // score
+        row.append('<td>' + item.score.toFixed(1) + '</td>');
+
         // IPs
-        var ips = _.join(_.uniq(_.flatten(_.map(item.hostnames, 'ip_addresses'))), ', ');
-        row.append('<td class="'+ (ips === '' ? 'bad' : 'good') +' text-center">' + (ips === '' ? '❌ Keine' : ips) + '</td>');
+        var ips = _.join(item.details.ipv4_addresses, ', ');
+        row.append('<td class="'+ (ips === '' ? 'bad' : 'good') +' text-center">' + (ips === '' ? '❌' : ips) + '</td>');

-        // icon
-        var icons = [];
-        var icon = false;
-        icons = _.uniq(_.map(item.urlchecks, 'content.icon'));
-        if (icons.length > 0 && icons[0]) {
-            icon = icons[0];
-        }
-        row.append('<td class="' + (icon ? 'good' : 'bad') + ' text-center">' + (icon ? ('<img src="' + icon + '" class="icon"/>') : '❌') + '</td>');
+        // SITE_REACHABLE
+        row.append('<td class="'+ (item.result.SITE_REACHABLE.value ? 'good' : 'bad') +' text-center">' + (item.result.SITE_REACHABLE.value ? '✅' : '❌') + '</td>');

-        // hostnames
-        var twoHostnames = false;
-        if (_.filter(item.hostnames, {'resolvable': true}).length === 2) {
-            twoHostnames = true;
-        };
-        row.append('<td class="'+ (twoHostnames ? 'good' : 'bad') +' text-center">' + (twoHostnames ? '✅' : '❌') + '</td>');
+        // FAVICON
+        var icon = item.result.FAVICON.value;
+        row.append('<td class="' + (icon ? 'good' : 'bad') + ' text-center">' + (icon ? ('<img src="' + item.details.icons[0] + '" class="icon">') : '❌') + '</td>');
+
+        // WWW_OPTIONAL
+        var wwwOptional = item.result.WWW_OPTIONAL.value;
+        row.append('<td class="'+ (wwwOptional ? 'good' : 'bad') +' text-center">' + (wwwOptional ? '✅' : '❌') + '</td>');

         // one canonical URL
-        var canonical = false;
-        if (item.canonical_urls.length === 1) canonical = true;
-        var canonical_links = _.uniq(_.map(item.urlchecks, 'content.canonical_link'));
-        if (canonical_links.length === 1) canonical = true;
+        var canonical = item.result.CANONICAL_URL.value;
         row.append('<td class="'+ (canonical ? 'good' : 'bad') +' text-center">' + (canonical ? '✅' : '❌') + '</td>');

         // https
-        var hasHTTPS = false;
-        hasHTTPS = _.find(item.canonical_urls, function(o){
-            return o.indexOf('https://') !== -1;
-        });
+        var hasHTTPS = item.result.HTTPS.value;
         row.append('<td class="'+ (hasHTTPS ? 'good' : 'bad') +' text-center">' + (hasHTTPS ? '✅' : '❌') + '</td>');

         // feeds
-        var feeds = false;
-        feeds = _.uniq(_.flatten(_.map(item.urlchecks, 'content.feeds')));
-        row.append('<td class="'+ (feeds.length ? 'good' : 'bad') +' text-center">' + (feeds.length ? '✅' : '❌') + '</td>');
+        var feeds = item.result.FEEDS.value;
+        row.append('<td class="'+ (feeds ? 'good' : 'bad') +' text-center">' + (feeds ? '✅' : '❌') + '</td>');

         // screenshots
         var screenshot = false;
-        if (item.canonical_urls.length > 0) {
-            if (typeof screenshots[item.canonical_urls[0]] !== 'undefined') {
-                var surl = 'http://green-spider-screenshots.sendung.de/320x640/'+screenshots[item.canonical_urls[0]];
-                var lurl = 'http://green-spider-screenshots.sendung.de/1500x1500/'+screenshots[item.canonical_urls[0]];
+        if (item.details.canonical_urls.length > 0) {
+            if (typeof screenshots[item.details.canonical_urls[0]] !== 'undefined') {
+                var surl = 'http://green-spider-screenshots.sendung.de/320x640/'+screenshots[item.details.canonical_urls[0]];
+                var lurl = 'http://green-spider-screenshots.sendung.de/1500x1500/'+screenshots[item.details.canonical_urls[0]];
                 screenshot = '<a class="screenshot" href="'+ surl +'" target="_blank" title="Mobile">M</a>';
                 screenshot += '<a class="screenshot" href="'+ lurl +'" target="_blank" title="Desktop">D</a>';
             }