Merge pull request #6 from netzbegruenung/content-checks

New checks based on content
Marian Steinbach, 2018-04-09 23:29:21 +02:00, committed by GitHub
commit 12e2770ffb
9 changed files with 24987 additions and 2110 deletions
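
In short: spider.py gains a check_content() step that parses every successfully fetched page with BeautifulSoup and records its encoding, normalized title, canonical link, favicon, RSS/Atom feed links, generator meta tag and OpenGraph properties. check_site() stores this under a new 'content' key, and the JSON export now keeps non-ASCII characters readable (ensure_ascii=False). The webapp picks the new data up with additional "Icon" and "Feed" columns and uses content.canonical_link as a second signal for the canonical-URL check.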

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@@ -21,6 +21,11 @@
       background-color: #cfeaa8;
       font-size: 1rem;
     }
+    .icon {
+      width: 20px;
+      height: 20px;
+    }
+
   </style>
 </head>
 <body>
@@ -34,9 +39,11 @@
         <tr>
           <th scope="col">URL</th>
           <th scope="col">IP-Adresse</th>
+          <th scope="col">Icon</th>
           <th scope="col"><abbr title="Site ist sowohl mit www. als auch ohne www. in URL erreichbar">www. optional</abbr></th>
           <th scope="col"><abbr title="URL-Varianten leiten auf eine einzige Startseiten-URL weiter">Kanonische URL</abbr></th>
           <th scope="col"><abbr title="Site nutzt HTTP-Verschlüsselung">HTTPS</abbr></th>
+          <th scope="col">Feed</th>
         </tr>
       </thead>
       <tbody>


@@ -1,6 +1,10 @@
+beautifulsoup4==4.6.0
 certifi==2018.1.18
 chardet==3.0.4
+gitdb2==2.0.3
+GitPython==2.1.9
 idna==2.6
+PyYAML==3.12
 requests==2.18.4
+smmap2==2.0.3
 urllib3==1.22
-pyyaml==3.12
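
The added packages track the new functionality: beautifulsoup4 powers the HTML content checks, while gitdb2, GitPython and smmap2 presumably support the "from git import Repo" import in spider.py; the pyyaml entry is merely renamed to its canonical PyYAML spelling. Updating an existing environment should only require re-running pip install -r requirements.txt.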


@@ -1,14 +1,17 @@
 # coding: utf8
+from bs4 import BeautifulSoup
 from git import Repo
 from multiprocessing import Pool
 from socket import gethostbyname_ex
+from urllib.parse import urljoin
 from urllib.parse import urlparse
 import certifi
 import json
 import logging
 import os
 import random
+import re
 import requests
 import shutil
 import sys
@@ -115,6 +118,80 @@ def reduce_urls(urllist):
     return list(targets)
 
 
+def normalize_title(s):
+    """
+    Removes garbage from HTML page titles
+    """
+    s = s.replace('\u00a0', ' ')
+    s = s.replace('  ', ' ')
+    s = s.strip()
+    return s
+
+
+def check_content(r):
+    """
+    Adds content-related details to the check result for one URL
+
+    r: requests response object
+    """
+    result = {}
+
+    result['encoding'] = r.encoding
+    soup = BeautifulSoup(r.text, 'html.parser')
+
+    # page title
+    result['title'] = None
+    title = soup.find('head').find('title')
+    if title is not None:
+        result['title'] = normalize_title(title.get_text())
+
+    # canonical link
+    result['canonical_link'] = None
+    link = soup.find('link', rel='canonical')
+    if link:
+        result['canonical_link'] = urljoin(r.url, link.get('href'))
+
+    # icon
+    result['icon'] = None
+    link = soup.find('link', rel='icon')
+    if link:
+        result['icon'] = urljoin(r.url, link.get('href'))
+    else:
+        link = soup.find('link', rel='shortcut icon')
+        if link:
+            result['icon'] = urljoin(r.url, link.get('href'))
+
+    # feed links
+    result['feeds'] = []
+    rss_links = soup.find_all('link', type='application/rss+xml')
+    atom_links = soup.find_all('link', type='application/atom+xml')
+    if len(rss_links) > 0:
+        for l in rss_links:
+            result['feeds'].append(urljoin(r.url, l.get('href')))
+    if len(atom_links) > 0:
+        for l in atom_links:  # fixed: this loop previously iterated rss_links again
+            result['feeds'].append(urljoin(r.url, l.get('href')))
+
+    # generator meta tag
+    result['generator'] = None
+    generator = soup.head.select('[name=generator]')
+    if len(generator):
+        result['generator'] = generator[0].get('content')
+
+    # opengraph meta tags
+    result['opengraph'] = None
+    og = set()
+    for item in soup.head.find_all(property=re.compile('^og:')):
+        og.add(item.get('property'))
+    for item in soup.head.find_all(itemprop=re.compile('^og:')):
+        og.add(item.get('itemprop'))
+    if len(og):
+        result['opengraph'] = list(og)
+
+    return result
+
+
 def check_site(url):
     """
     Performs our site check and returns results as a dict.
@@ -204,12 +281,18 @@ def check_site(url):
             'status_code': None,
             'duration': None,
             'error': None,
+            'content': None,
         }
 
         try:
             r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
             check['status_code'] = r.status_code
             check['duration'] = round(r.elapsed.microseconds / 1000)
+
+            # Content checks
+            if r.status_code < 300:
+                check['content'] = check_content(r)
+
         except requests.exceptions.ConnectionError as e:
             logging.error(str(e) + " " + check_url)
             check['error'] = "connection"
@@ -285,7 +368,7 @@ def main():
     # Write result as JSON
     output_filename = os.path.join(result_path, "spider_result.json")
     with open(output_filename, 'w', encoding="utf8") as jsonfile:
-        json.dump(results2, jsonfile, indent=2, sort_keys=True)
+        json.dump(results2, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
 
 
 if __name__ == "__main__":
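
To see what the new content checks yield, here is a minimal standalone sketch of the same extraction logic, assuming requests and beautifulsoup4 are installed; the URL is an arbitrary example and not part of this commit:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Fetch a page and extract the same signals check_content() records.
r = requests.get('https://www.gruene.de/', timeout=10)
soup = BeautifulSoup(r.text, 'html.parser')

title = soup.find('head').find('title')
canonical = soup.find('link', rel='canonical')
feeds = [urljoin(r.url, l.get('href'))
         for l in soup.find_all('link', type='application/rss+xml')]

print('encoding:', r.encoding)
print('title:', title.get_text().strip() if title else None)
print('canonical_link:', urljoin(r.url, canonical.get('href')) if canonical else None)
print('feeds:', feeds)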

webapp/dist/bundle.js (vendored)

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@@ -21,6 +21,11 @@
       background-color: #cfeaa8;
       font-size: 1rem;
     }
+    .icon {
+      width: 20px;
+      height: 20px;
+    }
+
   </style>
 </head>
 <body>
@@ -34,9 +39,11 @@
         <tr>
           <th scope="col">URL</th>
           <th scope="col">IP-Adresse</th>
+          <th scope="col">Icon</th>
           <th scope="col"><abbr title="Site ist sowohl mit www. als auch ohne www. in URL erreichbar">www. optional</abbr></th>
           <th scope="col"><abbr title="URL-Varianten leiten auf eine einzige Startseiten-URL weiter">Kanonische URL</abbr></th>
           <th scope="col"><abbr title="Site nutzt HTTP-Verschlüsselung">HTTPS</abbr></th>
+          <th scope="col">Feed</th>
         </tr>
       </thead>
       <tbody>


@@ -20,6 +20,15 @@ $(function(){
     var ips = _.join(_.uniq(_.flatten(_.map(item.hostnames, 'ip_addresses'))), ', ');
     row.append('<td class="'+ (ips === '' ? 'bad' : 'good') +' text-center">' + (ips === '' ? '❌ Keine' : ips) + '</td>');
 
+    // icon
+    var icons = [];
+    var icon = false;
+    icons = _.uniq(_.map(item.urlchecks, 'content.icon'));
+    if (icons.length > 0 && icons[0]) {
+      icon = icons[0];
+    }
+    row.append('<td class="' + (icon ? 'good' : 'bad') + ' text-center">' + (icon ? ('<img src="' + icon + '" class="icon"/>') : '❌') + '</td>');
+
     // hostnames
     var twoHostnames = false;
     if (_.filter(item.hostnames, {'resolvable': true}).length === 2) {
@@ -30,6 +39,8 @@ $(function(){
     // one canonical URL
     var canonical = false;
     if (item.canonical_urls.length === 1) canonical = true;
+    var canonical_links = _.uniq(_.map(item.urlchecks, 'content.canonical_link'));
+    if (canonical_links.length === 1) canonical = true;
     row.append('<td class="'+ (canonical ? 'good' : 'bad') +' text-center">' + (canonical ? '✅' : '❌') + '</td>');
 
     // https
@@ -39,6 +50,11 @@ $(function(){
     });
     row.append('<td class="'+ (hasHTTPS ? 'good' : 'bad') +' text-center">' + (hasHTTPS ? '✅' : '❌') + '</td>');
 
+    // feeds
+    var feeds = false;
+    feeds = _.uniq(_.flatten(_.map(item.urlchecks, 'content.feeds')));
+    row.append('<td class="'+ (feeds.length ? 'good' : 'bad') +' text-center">' + (feeds.length ? '✅' : '❌') + '</td>');
+
     tbody.append(row);
   });
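
For orientation, the table code above assumes each entry in spider_result.json now carries the content block written by check_content(). A reduced, hypothetical example of the shape being read, with all values invented for illustration:

# One invented spider_result.json entry, trimmed to the fields the
# table rendering above actually reads.
item = {
    'canonical_urls': ['https://www.example-gruene.de/'],
    'hostnames': [
        {'hostname': 'example-gruene.de', 'resolvable': True,
         'ip_addresses': ['192.0.2.10']},
        {'hostname': 'www.example-gruene.de', 'resolvable': True,
         'ip_addresses': ['192.0.2.10']},
    ],
    'urlchecks': [
        {'url': 'https://www.example-gruene.de/',
         'status_code': 200,
         'content': {
             'icon': 'https://www.example-gruene.de/favicon.ico',
             'canonical_link': 'https://www.example-gruene.de/',
             'feeds': ['https://www.example-gruene.de/feed/'],
         }},
    ],
}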