mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-06 02:43:42 +02:00
Marian Steinbach
618e29d763
* CLI: remove 'jobs' command, add 'manager' * Add job definition * Move jobs to manage folder * Rename jobs to manager * Add rq and redis dependencies * Add docker-compose YAML * Downgrade to alpine 3.8 * Adjust paths in Dockerfile, remove entrypoint * Rename 'make spiderjobs' to 'make jobs' * Fix docker exectution * Adapt 'make jobs' * Fix metadata scheme * Add docker dependency * Rendomize queue (a bit) * Use latest image, remove debug output * Make docker-compose file downwards-compatible * Use latest instead of dev image tag * Update docker-compose.yaml * Adapt job start script * Fix redis connection in manager * Add support for increasing timeout via environment variable * Adapt load_in_browser to cookies table schema change * Fix execution * Mitigate yaml warning * Bump some dependency versions * Report resource usage stats for each job * checks/load_in_browser: Return DOM size, prevent multiple page loads * Update .dockerignore * Code update * Script update * Update README.md * WIP * WIP commit * Update Dockerfile to alpine:edge and chromium v90 * Update TestCertificateChecker * Set defaults for __init__ function * Detect sunflower theme * Update unit test for new datetime (zero-basing) * Set logging prefs from Chromium in a new way * Move datastore client instantiation As it is not needed for all commands * Change green-directory repository URL * Add git settings for cloning green-directory * Pin alpine version 3.14, fix py3-cryptography * Use plain docker build progress output * Add volumes to 'make test' docker run command * Fix bug * Update example command in README * Update dependencies * Add creation of Kubernetes jobs
139 lines
3.7 KiB
Python
139 lines
3.7 KiB
Python
"""
|
|
Provides the spider functionality (website checks).
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import re
|
|
import statistics
|
|
import time
|
|
from datetime import datetime
|
|
from pprint import pprint
|
|
|
|
from google.api_core.exceptions import InvalidArgument
|
|
from google.cloud import datastore
|
|
|
|
import checks
|
|
import config
|
|
import manager
|
|
import rating
|
|
|
|
def check_and_rate_site(entry):
    """
    Performs our site checks, calculates the score
    and returns results as a dict.

    Args:
        entry: Mapping describing the site. Must contain 'url'. The optional
            keys 'type', 'level', 'state', 'district' and 'city' are copied
            into the 'meta' section of the result (None when absent).

    Returns:
        dict with the keys 'input_url', 'meta', 'checks', 'rating'
        and 'score'.

    Raises:
        KeyError: If entry has no 'url' key.
    """
    url = entry['url']

    # all the info we'll return for the site
    result = {
        # input_url: The URL we derived all checks from
        'input_url': url,
        # Meta: Regional and type metadata for the site
        'meta': {
            'type': entry.get('type'),
            'level': entry.get('level'),
            'state': entry.get('state'),
            'district': entry.get('district'),
            'city': entry.get('city'),
        },
        # checks: Results from our checks
        'checks': {},
        # The actual report scoring criteria
        'rating': {},
        # resulting score
        'score': 0.0,
    }

    # Results from our next generation checkers
    result['checks'] = checks.perform_checks(url)

    result['rating'] = rating.calculate_rating(result['checks'])

    # Overall score is the sum of the individual scores
    for key in result['rating']:
        result['score'] += result['rating'][key]['score']

    # Remove bigger result portions to save some storage.
    _strip_bulky_results(result['checks'])

    return result


def _strip_bulky_results(check_results):
    """
    Remove bulky payloads from the check results, in place:

    - HTML page content
    - Hyperlinks
    - Performance log

    Every removal is independent and tolerant of missing keys, so one
    absent or oddly shaped entry no longer prevents the remaining payloads
    from being dropped.
    """
    if not isinstance(check_results, dict):
        return

    # (check name, bulky field stored per URL within that check)
    bulky_fields = (
        ('page_content', 'content'),
        ('load_in_browser', 'performance_log'),
    )

    for check_name, field in bulky_fields:
        per_url = check_results.get(check_name)
        if not isinstance(per_url, dict):
            continue
        for url_result in per_url.values():
            if isinstance(url_result, dict):
                url_result.pop(field, None)

    check_results.pop('hyperlinks', None)
|
|
|
|
|
|
def test_url(url):
    """
    Crawl one single URL with the spider and pretty-print the outcome.

    Nothing is written to the database; the result is only printed.
    """
    logging.info("Crawling URL %s", url)

    # A minimal stand-in for a queued job: only the URL is set.
    pprint(check_and_rate_site(entry={"url": url}))
|
|
|
|
|
|
def execute_single_job(datastore_client, job, entity_kind):
    """
    Run the spider for exactly one job and store the outcome.

    The job is validated first, then checked and rated. The resulting
    record is written to the datastore under a key built from
    entity_kind and the job's URL. Write failures are logged, not raised.
    """
    validate_job(job)

    job_url = job["url"]

    logging.info("Starting job %s", job_url)
    result = check_and_rate_site(entry=job)

    logging.debug("Full JSON representation of returned result: %s", json.dumps(result, default=str))

    logging.info("Job %s finished checks", job_url)
    logging.info("Job %s writing to DB", job_url)

    # The job URL serves as the entity's key name.
    entity = datastore.Entity(key=datastore_client.key(entity_kind, job_url))
    entity.update({
        'created': datetime.utcnow(),
        'meta': result['meta'],
        'checks': result['checks'],
        'rating': result['rating'],
        'score': result['score'],
    })

    try:
        datastore_client.put(entity)
    except Exception as ex:
        # Also covers InvalidArgument, which is an Exception subclass and
        # was handled in exactly the same way.
        logging.error("Could not write result: %s", ex)
    else:
        logging.debug("Successfully wrote record to database")
|
|
|
|
def work_of_queue(datastore_client, entity_kind):
    """
    Work through the job queue, one job at a time, until the
    manager hands out no further job (signalled by None).
    """
    next_job = manager.get_job_from_queue(datastore_client)

    while next_job is not None:
        execute_single_job(datastore_client, next_job, entity_kind)
        next_job = manager.get_job_from_queue(datastore_client)

    logging.info("No more jobs. Exiting.")
|
|
|
|
def validate_job(jobdict):
    """
    Ensure the given job carries the mandatory 'url' attribute.

    Returns None for a valid job; raises Exception when 'url' is missing.
    """
    if "url" in jobdict:
        return

    raise Exception("Job does not have required 'url' attribute")
|