mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-04-29 23:54:51 +02:00
Add content checks
This commit is contained in:
parent
0ecf7e306e
commit
48ee1ab8a1
71
spider.py
71
spider.py
|
@ -1,5 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from git import Repo
|
from git import Repo
|
||||||
from multiprocessing import Pool
|
from multiprocessing import Pool
|
||||||
from socket import gethostbyname_ex
|
from socket import gethostbyname_ex
|
||||||
|
@ -9,6 +10,7 @@ import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
import re
|
||||||
import requests
|
import requests
|
||||||
import shutil
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
|
@ -115,6 +117,69 @@ def reduce_urls(urllist):
|
||||||
return list(targets)
|
return list(targets)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_title(s):
    """
    Removes garbage from HTML page titles.

    Replaces non-breaking spaces (U+00A0) with normal spaces,
    collapses runs of multiple spaces down to a single space,
    and strips leading/trailing whitespace.

    s: the raw title string extracted from the HTML <title> tag
    Returns the cleaned-up title string.
    """
    s = s.replace('\u00a0', ' ')
    # A single replace('  ', ' ') pass does not fully collapse longer
    # runs (e.g. 4 spaces -> 2 spaces), so repeat until stable.
    while '  ' in s:
        s = s.replace('  ', ' ')
    s = s.strip()
    return s
||||||
|
|
||||||
|
def check_content(r):
    """
    Adds details to check regarding content of the page.

    r: requests response object (only .text and .encoding are read)

    Returns a dict with the keys:
      encoding        - encoding reported by requests
      title           - normalized page title, or None
      canonical_link  - href of <link rel="canonical">, or None
      feeds           - list of RSS/Atom feed URLs (may be empty)
      generator       - content of <meta name="generator">, or None
      opengraph       - list of og:* property/itemprop names, or None
    """
    result = {}

    result['encoding'] = r.encoding
    soup = BeautifulSoup(r.text, 'html.parser')

    # Guard: pages without a <head> would otherwise raise AttributeError
    # on soup.find('head').find(...) / soup.head.select(...).
    head = soup.find('head')

    # page title
    result['title'] = None
    title = head.find('title') if head is not None else None
    if title is not None:
        result['title'] = normalize_title(title.get_text())

    # canonical link
    result['canonical_link'] = None
    link = soup.find('link', rel='canonical')
    if link:
        result['canonical_link'] = link.get('href')

    # feed links (RSS and Atom)
    result['feeds'] = []
    rss_links = soup.find_all('link', type='application/rss+xml')
    atom_links = soup.find_all('link', type='application/atom+xml')
    for l in rss_links:
        result['feeds'].append(l.get('href'))
    # BUG FIX: the original iterated rss_links again in this branch,
    # so Atom feeds were never collected (and RSS feeds duplicated).
    for l in atom_links:
        result['feeds'].append(l.get('href'))

    # generator meta tag
    result['generator'] = None
    if head is not None:
        generator = head.select('[name=generator]')
        if len(generator):
            result['generator'] = generator[0].get('content')

    # opengraph meta tags (both property= and itemprop= variants)
    result['opengraph'] = None
    og = set()
    if head is not None:
        for item in head.find_all(property=re.compile('^og:')):
            og.add(item.get('property'))
        for item in head.find_all(itemprop=re.compile('^og:')):
            og.add(item.get('itemprop'))
    if len(og):
        result['opengraph'] = list(og)

    return result
|
||||||
|
|
||||||
def check_site(url):
|
def check_site(url):
|
||||||
"""
|
"""
|
||||||
Performs our site check and returns results as a dict.
|
Performs our site check and returns results as a dict.
|
||||||
|
@ -204,12 +269,18 @@ def check_site(url):
|
||||||
'status_code': None,
|
'status_code': None,
|
||||||
'duration': None,
|
'duration': None,
|
||||||
'error': None,
|
'error': None,
|
||||||
|
'content': None,
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
|
r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
|
||||||
check['status_code'] = r.status_code
|
check['status_code'] = r.status_code
|
||||||
check['duration'] = round(r.elapsed.microseconds / 1000)
|
check['duration'] = round(r.elapsed.microseconds / 1000)
|
||||||
|
|
||||||
|
# Content checks
|
||||||
|
if r.status_code < 300:
|
||||||
|
check['content'] = check_content(r)
|
||||||
|
|
||||||
except requests.exceptions.ConnectionError as e:
|
except requests.exceptions.ConnectionError as e:
|
||||||
logging.error(str(e) + " " + check_url)
|
logging.error(str(e) + " " + check_url)
|
||||||
check['error'] = "connection"
|
check['error'] = "connection"
|
||||||
|
|
Loading…
Reference in a new issue