green-spider/manager/__init__.py

"""
The manager module allows to fill the RQ job queue.
"""
import json
import logging
import math
import os
import random
import shutil
import time
from datetime import datetime
from hashlib import sha256

import redis
import yaml
from git import Repo
from rq import Queue
from yaml import Loader

import config

REDIS_URL = os.environ.get("REDIS_URL", "redis://redis:6379/0")


def clone_data_directory():
    """
    Clones the source of website URLs, the green directory,
    into the local file system using git.
    """
    if os.path.exists(config.GREEN_DIRECTORY_LOCAL_PATH):
        shutil.rmtree(config.GREEN_DIRECTORY_LOCAL_PATH)
    Repo.clone_from(config.GREEN_DIRECTORY_REPO, config.GREEN_DIRECTORY_LOCAL_PATH)


def directory_entries():
    """
    Iterator over all data files in the cloned green directory.
    """
    path = os.path.join(config.GREEN_DIRECTORY_LOCAL_PATH, config.GREEN_DIRECTORY_DATA_PATH)
    for root, _, files in os.walk(path):
        for fname in files:
            filepath = os.path.join(root, fname)
            if not filepath.endswith(".yaml"):
                continue
            with open(filepath, 'r', encoding='utf8') as yamlfile:
                for doc in yaml.load_all(yamlfile, Loader=Loader):
                    yield doc
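

# Illustrative sketch of an entry as yielded by directory_entries(),
# inferred from the keys that create_jobs() accesses below. The actual
# green-directory schema may contain more fields, and the values shown
# here are hypothetical:
#
#   {
#       "type": "REGIONAL_CHAPTER",
#       "level": "DE:KREISVERBAND",
#       "state": "Berlin",
#       "district": "Pankow",
#       "urls": [
#           {"type": "WEBSITE", "url": "https://example.org/"},
#           {"type": "FACEBOOK", "url": "https://www.facebook.com/example"},
#       ],
#   }
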

def chunks(the_list, size):
    """
    Yield successive chunks of length `size` from the list `the_list`.
    """
    for i in range(0, len(the_list), size):
        yield the_list[i:i + size]
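

# Worked example: list(chunks([1, 2, 3, 4, 5], 2)) returns
# [[1, 2], [3, 4], [5]], since range(0, 5, 2) yields the start
# indices 0, 2 and 4. (The helper is not used within this module.)
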

def create_jobs(url=None):
    """
    Read all URLs from the green directory and fill the job queue
    with one spider job per URL. Alternatively, if the url argument
    is given, only that URL is added as a spider job.
    """
    logging.info("Waiting for redis at %s", REDIS_URL)
    redis_success = False
    while not redis_success:
        try:
            redis_conn = redis.from_url(REDIS_URL)
            # from_url() connects lazily, so ping() to verify that the
            # server is actually reachable before leaving the loop.
            redis_conn.ping()
            redis_success = True
        except Exception as ex:
            logging.error(ex)
            time.sleep(5)

    queue = Queue('low', connection=redis_conn)

    # Refresh our local clone of the green directory.
    logging.info("Refreshing green-directory clone")
    clone_data_directory()

    # Build the list of website URLs to run checks for.
    logging.info("Processing green-directory")
    input_entries = []
    count = 0
    random.seed()

    for entry in directory_entries():
        if 'type' not in entry:
            logging.error("Entry without type")
            continue
        if 'urls' not in entry:
            logging.debug("Entry %s does not have any URLs.", repr_entry(entry))
            continue

        for url_entry in entry['urls']:
            try:
                if url_entry['type'] != "WEBSITE":
                    continue
                website_url = url_entry['url']
                if url is not None and website_url != url:
                    continue
                input_entries.append({
                    "url": website_url,
                    "type": entry.get("type"),
                    "level": entry.get("level"),
                    "state": entry.get("state"),
                    "district": entry.get("district"),
                    "city": entry.get("city"),
                })
                count += 1
            except KeyError:
                logging.error("Error in %s: 'url' key missing (%s)",
                              repr_entry(entry), url_entry)

    # Ensure the passed URL argument is really there, even if not part
    # of the directory.
    if url and count == 0:
        logging.info("Adding job for URL %s which is not part of green-directory", url)
        input_entries.append({
            "url": url,
            "type": None,
            "level": None,
            "state": None,
            "district": None,
            "city": None,
        })

    count = 0
    errorcount = 0
    logging.info("Writing jobs")

    for entry in input_entries:
        try:
            queue.enqueue('job.run',
                          job_timeout='300s',
                          at_front=random.choice([True, False]),
                          # Keyword arguments passed on to the job function.
                          kwargs={
                              'job': entry,
                          })
            # Print the job for debugging purposes.
            print(json.dumps(entry))
        except Exception as e:
            errorcount += 1
            logging.error("Error adding job for URL %s: %s", entry['url'], e)

        # Write a Kubernetes Job manifest.
        make_k8s_job(entry, count)
        count += 1

    logging.info("Writing jobs done, %s jobs added", count)
    logging.info("%d errors while writing jobs", errorcount)

def make_k8s_job(job_data, count):
    """
    Write a Kubernetes Job manifest for the given spider job to the
    K8S_JOBS_PATH folder, batched into subfolders of
    K8S_JOB_BATCH_SIZE manifests each.
    """
    now = datetime.utcnow().strftime('%Y%m%d%H%M')
    urlhash = sha256(job_data['url'].encode('utf-8')).hexdigest()[0:12]
    job_name = f'gs-{now}-{urlhash}'
    filename = f'{job_name}.yaml'

    batch_folder = math.floor(count / config.K8S_JOB_BATCH_SIZE)
    output_dir = os.path.join(config.K8S_JOBS_PATH, str(batch_folder))
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, filename)

    job_json = json.dumps(job_data)
    job_flag = f'\'--job={job_json}\''

    with open(config.K8S_JOB_TEMPLATE, "r") as template_file:
        template = template_file.read()

    template = template.replace('JOB_NAME', job_name)
    template = template.replace('POD_NAME', job_name)
    template = template.replace('JOB_FLAG', job_flag)

    with open(output_path, "w") as output:
        output.write(template)
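

# Minimal sketch of what the template at config.K8S_JOB_TEMPLATE could
# look like. This is hypothetical; only the JOB_NAME, POD_NAME and
# JOB_FLAG placeholders are known from the code above, and the image
# name is a stand-in:
#
#   apiVersion: batch/v1
#   kind: Job
#   metadata:
#     name: JOB_NAME
#   spec:
#     template:
#       metadata:
#         name: POD_NAME
#       spec:
#         containers:
#           - name: spider
#             image: green-spider:latest
#             args: [JOB_FLAG]
#         restartPolicy: Never
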

def repr_entry(entry):
    """
    Return a string representation of a directory entry,
    for logging/debugging purposes.
    """
    ret = entry['type']
    if 'level' in entry:
        ret += "/" + entry['level']
    if 'state' in entry:
        ret += "/" + entry['state']
    if 'district' in entry:
        ret += "/" + entry['district']
    return ret