Remove worker functions from spider code

This commit is contained in:
Marian Steinbach 2024-03-04 09:27:40 +01:00
parent 3f4ea41e86
commit 75964011c2

View file

@@ -88,51 +88,6 @@ def test_url(url):
pprint(result)
def execute_single_job(datastore_client, job, entity_kind):
    """
    Run the spider for exactly one job and store the outcome.

    The job is validated first, then passed to ``check_and_rate_site``.
    The returned 'meta', 'checks', 'rating' and 'score' sections are
    written, together with a creation timestamp, to an entity keyed by
    ``entity_kind`` and the job's URL. Errors raised while writing are
    logged and not re-raised.
    """
    validate_job(job)

    logging.info("Starting job %s", job["url"])
    outcome = check_and_rate_site(entry=job)
    logging.debug(
        "Full JSON representation of returned result: %s",
        json.dumps(outcome, default=str),
    )
    logging.info("Job %s finished checks", job["url"])

    logging.info("Job %s writing to DB", job["url"])
    db_entity = datastore.Entity(
        key=datastore_client.key(entity_kind, job["url"]),
    )

    # Timestamp goes in first, then the result sections are copied as-is.
    fields = {'created': datetime.utcnow()}
    for section in ('meta', 'checks', 'rating', 'score'):
        fields[section] = outcome[section]
    db_entity.update(fields)

    try:
        datastore_client.put(db_entity)
        logging.debug("Successfully wrote record to database")
    except InvalidArgument as err:
        logging.error("Could not write result: %s", err)
    except Exception as err:
        # Any other failure is reported the same way; the job is not retried here.
        logging.error("Could not write result: %s", err)
def work_of_queue(datastore_client, entity_kind):
    """
    Process queued jobs one after another until the queue yields None.

    Each job fetched via ``manager.get_job_from_queue`` is handed to
    ``execute_single_job``; once no job is returned, this is logged and
    the function returns.
    """
    next_job = manager.get_job_from_queue(datastore_client)
    while next_job is not None:
        execute_single_job(datastore_client, next_job, entity_kind)
        next_job = manager.get_job_from_queue(datastore_client)

    logging.info("No more jobs. Exiting.")
def validate_job(jobdict):
    """
    Check that a job carries the attributes required for processing.

    Raises an Exception when the 'url' key is absent from ``jobdict``.
    """
    url_present = "url" in jobdict
    if not url_present:
        raise Exception("Job does not have required 'url' attribute")