Merge pull request #6 from netzbegruenung/content-checks

New checks based on content
Marian Steinbach, 2018-04-09 23:29:21 +02:00, committed by GitHub
commit 12e2770ffb
9 changed files with 24987 additions and 2110 deletions
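
In short: spider.py gains a check_content() step that parses every successfully fetched page with BeautifulSoup and records its encoding, normalized title, canonical link, favicon, RSS/Atom feed links, generator meta tag and OpenGraph properties. check_site() stores this under a new 'content' key, and the JSON export now keeps non-ASCII characters readable (ensure_ascii=False). The webapp picks the new data up with additional "Icon" and "Feed" columns and uses content.canonical_link as a second signal for the canonical-URL check.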

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@@ -21,6 +21,11 @@
       background-color: #cfeaa8;
       font-size: 1rem;
     }
+    .icon {
+      width: 20px;
+      height: 20px;
+    }
+
   </style>
 </head>
 <body>
@@ -34,9 +39,11 @@
         <tr>
           <th scope="col">URL</th>
           <th scope="col">IP-Adresse</th>
+          <th scope="col">Icon</th>
           <th scope="col"><abbr title="Site ist sowohl mit www. als auch ohne www. in URL erreichbar">www. optional</abbr></th>
           <th scope="col"><abbr title="URL-Varianten leiten auf eine einzige Startseiten-URL weiter">Kanonische URL</abbr></th>
           <th scope="col"><abbr title="Site nutzt HTTP-Verschlüsselung">HTTPS</abbr></th>
+          <th scope="col">Feed</th>
         </tr>
       </thead>
       <tbody>


@@ -1,6 +1,10 @@
+beautifulsoup4==4.6.0
 certifi==2018.1.18
 chardet==3.0.4
+gitdb2==2.0.3
+GitPython==2.1.9
 idna==2.6
+PyYAML==3.12
 requests==2.18.4
+smmap2==2.0.3
 urllib3==1.22
-pyyaml==3.12
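
The added packages track the new functionality: beautifulsoup4 powers the HTML content checks, while gitdb2, GitPython and smmap2 presumably support the "from git import Repo" import in spider.py; the pyyaml entry is merely renamed to its canonical PyYAML spelling. Updating an existing environment should only require re-running pip install -r requirements.txt.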


@@ -1,14 +1,17 @@
 # coding: utf8
+from bs4 import BeautifulSoup
 from git import Repo
 from multiprocessing import Pool
 from socket import gethostbyname_ex
+from urllib.parse import urljoin
 from urllib.parse import urlparse
 import certifi
 import json
 import logging
 import os
 import random
+import re
 import requests
 import shutil
 import sys
@@ -115,6 +118,80 @@ def reduce_urls(urllist):
     return list(targets)
 
 
+def normalize_title(s):
+    """
+    Removes garbage from HTML page titles
+    """
+    s = s.replace('\u00a0', ' ')
+    s = s.replace('  ', ' ')
+    s = s.strip()
+    return s
+
+
+def check_content(r):
+    """
+    Adds content-related details to the check result for one URL
+
+    r: requests response object
+    """
+    result = {}
+
+    result['encoding'] = r.encoding
+    soup = BeautifulSoup(r.text, 'html.parser')
+
+    # page title
+    result['title'] = None
+    title = soup.find('head').find('title')
+    if title is not None:
+        result['title'] = normalize_title(title.get_text())
+
+    # canonical link
+    result['canonical_link'] = None
+    link = soup.find('link', rel='canonical')
+    if link:
+        result['canonical_link'] = urljoin(r.url, link.get('href'))
+
+    # icon
+    result['icon'] = None
+    link = soup.find('link', rel='icon')
+    if link:
+        result['icon'] = urljoin(r.url, link.get('href'))
+    else:
+        link = soup.find('link', rel='shortcut icon')
+        if link:
+            result['icon'] = urljoin(r.url, link.get('href'))
+
+    # feed links
+    result['feeds'] = []
+    rss_links = soup.find_all('link', type='application/rss+xml')
+    atom_links = soup.find_all('link', type='application/atom+xml')
+    if len(rss_links) > 0:
+        for l in rss_links:
+            result['feeds'].append(urljoin(r.url, l.get('href')))
+    if len(atom_links) > 0:
+        for l in atom_links:  # fixed: this loop previously iterated rss_links again
+            result['feeds'].append(urljoin(r.url, l.get('href')))
+
+    # generator meta tag
+    result['generator'] = None
+    generator = soup.head.select('[name=generator]')
+    if len(generator):
+        result['generator'] = generator[0].get('content')
+
+    # opengraph meta tags
+    result['opengraph'] = None
+    og = set()
+    for item in soup.head.find_all(property=re.compile('^og:')):
+        og.add(item.get('property'))
+    for item in soup.head.find_all(itemprop=re.compile('^og:')):
+        og.add(item.get('itemprop'))
+    if len(og):
+        result['opengraph'] = list(og)
+
+    return result
+
+
 def check_site(url):
     """
     Performs our site check and returns results as a dict.
@@ -204,12 +281,18 @@ def check_site(url):
             'status_code': None,
             'duration': None,
             'error': None,
+            'content': None,
         }
 
         try:
             r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
             check['status_code'] = r.status_code
             check['duration'] = round(r.elapsed.microseconds / 1000)
+
+            # Content checks
+            if r.status_code < 300:
+                check['content'] = check_content(r)
+
         except requests.exceptions.ConnectionError as e:
             logging.error(str(e) + " " + check_url)
             check['error'] = "connection"
@@ -285,7 +368,7 @@ def main():
     # Write result as JSON
     output_filename = os.path.join(result_path, "spider_result.json")
     with open(output_filename, 'w', encoding="utf8") as jsonfile:
-        json.dump(results2, jsonfile, indent=2, sort_keys=True)
+        json.dump(results2, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
 
 
 if __name__ == "__main__":
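
To see what the new content checks yield, here is a minimal standalone sketch of the same extraction logic, assuming requests and beautifulsoup4 are installed; the URL is an arbitrary example and not part of this commit:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Fetch a page and extract the same signals check_content() records.
r = requests.get('https://www.gruene.de/', timeout=10)
soup = BeautifulSoup(r.text, 'html.parser')

title = soup.find('head').find('title')
canonical = soup.find('link', rel='canonical')
feeds = [urljoin(r.url, l.get('href'))
         for l in soup.find_all('link', type='application/rss+xml')]

print('encoding:', r.encoding)
print('title:', title.get_text().strip() if title else None)
print('canonical_link:', urljoin(r.url, canonical.get('href')) if canonical else None)
print('feeds:', feeds)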

webapp/dist/bundle.js (vendored)

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@@ -21,6 +21,11 @@
       background-color: #cfeaa8;
       font-size: 1rem;
     }
+    .icon {
+      width: 20px;
+      height: 20px;
+    }
+
   </style>
 </head>
 <body>
@@ -34,9 +39,11 @@
         <tr>
           <th scope="col">URL</th>
           <th scope="col">IP-Adresse</th>
+          <th scope="col">Icon</th>
           <th scope="col"><abbr title="Site ist sowohl mit www. als auch ohne www. in URL erreichbar">www. optional</abbr></th>
           <th scope="col"><abbr title="URL-Varianten leiten auf eine einzige Startseiten-URL weiter">Kanonische URL</abbr></th>
           <th scope="col"><abbr title="Site nutzt HTTP-Verschlüsselung">HTTPS</abbr></th>
+          <th scope="col">Feed</th>
         </tr>
       </thead>
       <tbody>


@@ -20,6 +20,15 @@ $(function(){
     var ips = _.join(_.uniq(_.flatten(_.map(item.hostnames, 'ip_addresses'))), ', ');
     row.append('<td class="'+ (ips === '' ? 'bad' : 'good') +' text-center">' + (ips === '' ? '❌ Keine' : ips) + '</td>');
 
+    // icon
+    var icons = [];
+    var icon = false;
+    icons = _.uniq(_.map(item.urlchecks, 'content.icon'));
+    if (icons.length > 0 && icons[0]) {
+      icon = icons[0];
+    }
+    row.append('<td class="' + (icon ? 'good' : 'bad') + ' text-center">' + (icon ? ('<img src="' + icon + '" class="icon"/>') : '❌') + '</td>');
+
     // hostnames
     var twoHostnames = false;
     if (_.filter(item.hostnames, {'resolvable': true}).length === 2) {
@@ -30,6 +39,8 @@ $(function(){
     // one canonical URL
     var canonical = false;
     if (item.canonical_urls.length === 1) canonical = true;
+    var canonical_links = _.uniq(_.map(item.urlchecks, 'content.canonical_link'));
+    if (canonical_links.length === 1) canonical = true;
     row.append('<td class="'+ (canonical ? 'good' : 'bad') +' text-center">' + (canonical ? '✅' : '❌') + '</td>');
 
     // https
@@ -39,6 +50,11 @@ $(function(){
     });
     row.append('<td class="'+ (hasHTTPS ? 'good' : 'bad') +' text-center">' + (hasHTTPS ? '✅' : '❌') + '</td>');
 
+    // feeds
+    var feeds = false;
+    feeds = _.uniq(_.flatten(_.map(item.urlchecks, 'content.feeds')));
+    row.append('<td class="'+ (feeds.length ? 'good' : 'bad') +' text-center">' + (feeds.length ? '✅' : '❌') + '</td>');
+
     tbody.append(row);
   });
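
For orientation, the table code above assumes each entry in spider_result.json now carries the content block written by check_content(). A reduced, hypothetical example of the shape being read, with all values invented for illustration:

# One invented spider_result.json entry, trimmed to the fields the
# table rendering above actually reads.
item = {
    'canonical_urls': ['https://www.example-gruene.de/'],
    'hostnames': [
        {'hostname': 'example-gruene.de', 'resolvable': True,
         'ip_addresses': ['192.0.2.10']},
        {'hostname': 'www.example-gruene.de', 'resolvable': True,
         'ip_addresses': ['192.0.2.10']},
    ],
    'urlchecks': [
        {'url': 'https://www.example-gruene.de/',
         'status_code': 200,
         'content': {
             'icon': 'https://www.example-gruene.de/favicon.ico',
             'canonical_link': 'https://www.example-gruene.de/',
             'feeds': ['https://www.example-gruene.de/feed/'],
         }},
    ],
}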