Bring back function to execute a single spider job

Marian Steinbach 2024-03-06 23:42:59 +01:00
parent 73e847d025
commit e1cf2f3719
3 changed files with 43 additions and 1 deletion

cli.py (10 lines changed)

@@ -34,6 +34,10 @@ if __name__ == "__main__":
    # subcommands
    subparsers = parser.add_subparsers(help='sub-command help', dest='command')

    # 'spider' subcommand to execute a job from the queue and store the result.
    spider_parser = subparsers.add_parser('spider', help='Execute a spider job from the queue and store the result.')
    spider_parser.add_argument('--job', help='JSON job data')

    # 'dryrun' subcommand to spider one URL without writing results back.
    dryrun_parser = subparsers.add_parser('dryrun', help='Spider an arbitrary URL without storing results.')
    dryrun_parser.add_argument('url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
@@ -82,6 +86,12 @@ if __name__ == "__main__":
        result = spider.check_and_rate_site({"url": args.url, "type": "REGIONAL_CHAPTER", "level": "DE:KREISVERBAND", "state": "Unnamed", "district": "Unnamed"})
        print(json.dumps(result, indent=2, sort_keys=True, ensure_ascii=False, cls=DateTimeEncoder))

    elif args.command == 'spider':
        from spider import spider
        datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
        job = json.loads(args.job)
        spider.execute_single_job(datastore_client, job, "spider-results")

    else:
        parser.print_help()
        sys.exit(1)

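A minimal usage sketch for the new subcommand (not part of this commit): the job is passed as a JSON string via --job. The job fields mirror the dryrun branch above; the --credentials-path flag name and the credentials file path are assumptions, not taken from this diff.

import json
import subprocess

# Build a job payload shaped like the dict the dryrun branch constructs.
job = json.dumps({
    "url": "https://example-kreisverband.de/",  # placeholder URL
    "type": "REGIONAL_CHAPTER",
    "level": "DE:KREISVERBAND",
    "state": "Unnamed",
    "district": "Unnamed",
})

# Hypothetical invocation; --credentials-path is assumed from args.credentials_path.
subprocess.run(
    ["python", "cli.py", "--credentials-path", "credentials.json", "spider", "--job", job],
    check=True,
)
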
job.py (2 lines changed)

@@ -12,7 +12,7 @@ import logging

import docker
from google.cloud import datastore

-# Maximum oper-job runtime in seconds. This can be increased for second, third attempt
+# Maximum per-job runtime in seconds. This can be increased for second, third attempt
# via the environment JOB_TIMEOUT variable.
TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))

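Since TIMEOUT is read from the environment once at module import, a retrying caller can raise the per-attempt limit by setting JOB_TIMEOUT before job is imported. A small sketch, not from this commit; the attempt counter is illustrative:

import os

attempt = 2  # illustrative retry counter, not part of job.py
os.environ["JOB_TIMEOUT"] = str(50 * attempt)  # 50 s default, scaled per attempt

import job  # job.TIMEOUT now reads 100
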
spider/spider.py (32 lines changed)

@@ -72,6 +72,38 @@ def check_and_rate_site(entry):
    return result


def execute_single_job(datastore_client, job, entity_kind):
    """
    Executes spider for one single job
    """

    validate_job(job)

    logging.info("Starting job %s", job["url"])
    result = check_and_rate_site(entry=job)
    logging.info("Job %s finished checks", job["url"])

    logging.info("Job %s writing to DB", job["url"])
    key = datastore_client.key(entity_kind, job["url"])
    entity = datastore.Entity(key=key)
    record = {
        'created': datetime.utcnow(),
        'meta': result['meta'],
        'checks': result['checks'],
        'rating': result['rating'],
        'score': result['score'],
    }
    entity.update(record)

    try:
        datastore_client.put(entity)
        logging.debug("Successfully wrote record to database")
    except InvalidArgument as ex:
        logging.error("Could not write result: %s", ex)
    except Exception as ex:
        logging.error("Could not write result: %s", ex)


def test_url(url):
    """
    Run the spider for a single URL and print the result.
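
For completeness, a hedged sketch (not part of the commit) of reading a stored result back: the entity kind "spider-results" and the URL-as-key mirror the write path in execute_single_job, while the credentials path and URL below are placeholders.

from google.cloud import datastore

# Placeholder credentials path and URL; the kind and key scheme follow execute_single_job.
client = datastore.Client.from_service_account_json("credentials.json")
key = client.key("spider-results", "https://example-kreisverband.de/")
entity = client.get(key)

if entity is None:
    print("no stored result for this URL")
else:
    print(entity["score"], entity["created"])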