First working code and results
parent 47b468b2a0
commit 220a6ba629
@@ -0,0 +1,2 @@
venv
cache

23 README.md
@@ -1,2 +1,25 @@
# green-spider

Collects data on green websites and checks for things like SEO, performance, TLS.

Written and tested in Python 3

### Ideas

- If the URL does not start with `www.`, will entering `www.<url>` also work? (See the sketch after this list.)
- If the URL is HTTP, is it possible to access the site via HTTPS (recommended)?
- If the URL is HTTPS, is it possible to access the site via HTTP (recommended: redirect to HTTPS)?
- Check which cookies are set and with what settings (expiry, domain)
- Submit the URL to a service like Google Page Speed and retrieve the score
- Check against our own webpagetest.org instance
- Detect which of the well-known CMSes is used
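
As a rough illustration of the first two ideas, here is a minimal sketch of what such checks could look like. The function names are made up for this example and are not part of the spider yet; the timeouts mirror the defaults used in `spider.py`:

```python
# Sketch only, not part of spider.py: illustrates the "www." and HTTPS ideas above.
from socket import gethostbyname
from urllib.parse import urlparse

import requests


def www_variant_resolves(url):
    """Return True if prefixing the hostname with 'www.' still resolves via DNS."""
    hostname = urlparse(url).hostname
    if hostname is None or hostname.startswith("www."):
        return False
    try:
        gethostbyname("www." + hostname)
        return True
    except OSError:
        return False


def https_reachable(url):
    """Return True if the HTTPS variant of an http:// URL responds successfully."""
    https_url = url.replace("http://", "https://", 1)
    try:
        return requests.get(https_url, timeout=(5, 10)).ok
    except requests.exceptions.RequestException:
        return False
```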

### Usage

```nohighlight
virtualenv -p python3 venv
source venv/bin/activate
pip install -r requirements.txt

python spider.py
```
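
The spider writes its results to `result.json` in the current directory.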

@@ -0,0 +1,6 @@
certifi==2018.1.18
chardet==3.0.4
idna==2.6
requests==2.18.4
urllib3==1.22
pyyaml==3.12
File diff suppressed because it is too large

@@ -0,0 +1,180 @@
# coding: utf8

from git import Repo
from multiprocessing import Pool
from urllib.parse import urlparse
from socket import gethostbyname_ex

import json
import logging
import os
import random
import requests
import shutil
import sys
import yaml

# configuration

# number of parallel processes to use for crawling
concurrency = 6

# connection timeout for website checks (seconds)
connect_timeout = 5

# response timeout for website checks (seconds)
read_timeout = 10

# Git repo for our data
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
# folder in that repo that holds the data
green_direcory_data_path = 'data'
green_directory_local_path = './cache/green-directory'


# end configuration


def get_green_directory():
    """
    Clones the green directory into the local file system
    """
    if os.path.exists(green_directory_local_path):
        shutil.rmtree(green_directory_local_path)
    Repo.clone_from(green_directory_repo, green_directory_local_path)


def dir_entries():
    """
    Iterates over the YAML files in the green directory data folder
    and yields the documents they contain
    """
    path = os.path.join(green_directory_local_path, green_direcory_data_path)
    for root, dirs, files in os.walk(path):
        for fname in files:

            filepath = os.path.join(root, fname)
            if not filepath.endswith(".yaml"):
                continue

            with open(filepath, 'r') as yamlfile:
                for doc in yaml.load_all(yamlfile):
                    yield doc


def repr_entry(entry):
    """
    Return string representation of an entry
    """
    r = entry['type']
    if 'level' in entry:
        r += "/" + entry['level']
    if 'state' in entry:
        r += "/" + entry['state']
    if 'district' in entry:
        r += "/" + entry['district']
    return r


def resolve_hostname(url):
    """
    Resolves the URL's hostname and returns the scheme, canonical hostname,
    aliases and IP addresses
    """
    parsed = urlparse(url)
    hostname, aliaslist, ipaddrlist = gethostbyname_ex(parsed.hostname)
    return (parsed.scheme, hostname, aliaslist, ipaddrlist)


def check_site(url):
    """
    Performs our site check and returns results as a dict
    """
    result = {
        'status_code': 0,
        'error': None,
        'redirects': 0,
        'final_url': None,
        'hostname': None,
        'scheme': None,
        'aliases': None,
        'ip_addresses': None,
        'duration': 0,
    }

    try:
        (scheme, hostname, aliases, ip_addresses) = resolve_hostname(url)
        result['scheme'] = scheme
        result['hostname'] = hostname
        result['aliases'] = aliases
        result['ip_addresses'] = ip_addresses
    except Exception as e:
        logging.error(str(e) + " " + url)

    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
    }

    try:
        r = requests.get(url, headers=headers, timeout=(connect_timeout, read_timeout))
        result['status_code'] = r.status_code
        if len(r.history) > 0:
            result['redirects'] = len(r.history)
            result['final_url'] = r.url
        # request duration in milliseconds
        result['duration'] = r.elapsed.total_seconds() * 1000
    except requests.exceptions.ConnectionError as e:
        logging.error(str(e) + " " + url)
        result['error'] = "connection"
    # ReadTimeout is a subclass of Timeout, so it must be caught first
    except requests.exceptions.ReadTimeout as e:
        logging.error(str(e) + " " + url)
        result['error'] = "read_timeout"
    except requests.exceptions.Timeout as e:
        logging.error(str(e) + " " + url)
        result['error'] = "connection_timeout"
    except Exception as e:
        logging.error(str(e) + " " + url)
        result['error'] = "unknown"

    logging.info("%s done" % url)
    return result


def main():
    """
    Fetches the green directory, collects WEBSITE URLs, checks them
    (in parallel if configured) and writes the results to result.json
    """
    logging.basicConfig(level=logging.INFO)

    get_green_directory()

    urls = []
    for entry in dir_entries():

        if 'type' not in entry:
            logging.error("Entry without type")
            continue

        if 'urls' not in entry:
            logging.info("Entry %s does not have any URLs." % repr_entry(entry))
            continue

        website_url = None
        for n in range(len(entry['urls'])):
            try:
                if entry['urls'][n]['type'] == "WEBSITE":
                    website_url = entry['urls'][n]['url']
            except KeyError:
                logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))
        if website_url:
            urls.append(website_url)

    random.seed()
    random.shuffle(urls)

    results = {}

    if concurrency > 1:
        pool = Pool(concurrency)
        for url in urls:
            results[url] = pool.apply_async(check_site, kwds={"url": url})
        pool.close()
        pool.join()
    else:
        for url in urls:
            results[url] = check_site(url)

    results2 = {}

    for url in results.keys():
        # in the concurrent case the values are AsyncResult objects and need .get()
        if concurrency > 1:
            results2[url] = results[url].get()
        else:
            results2[url] = results[url]

    with open('result.json', 'w', encoding="utf8") as jsonfile:
        json.dump(results2, jsonfile, indent=2, sort_keys=True)


if __name__ == "__main__":
    main()