mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-14 06:26:07 +02:00
Bring back function to execute a single spider job
This commit is contained in:
parent
73e847d025
commit
e1cf2f3719
10
cli.py
10
cli.py
|
@ -34,6 +34,10 @@ if __name__ == "__main__":
|
|||
# subcommands
|
||||
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
|
||||
|
||||
# 'spider' subcommand to execute a job from the queue and store the result.
|
||||
spider_parser = subparsers.add_parser('spider', help='Execute a spider job from the queue and store the result.')
|
||||
spider_parser.add_argument('--job', help='JSON job data')
|
||||
|
||||
# 'dryrun' subcommand to spider one URL without writing results back.
|
||||
dryrun_parser = subparsers.add_parser('dryrun', help='Spider an arbitrary URL without storing results. ')
|
||||
dryrun_parser.add_argument('url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
|
||||
|
@ -82,6 +86,12 @@ if __name__ == "__main__":
|
|||
result = spider.check_and_rate_site({"url": args.url, "type": "REGIONAL_CHAPTER", "level": "DE:KREISVERBAND", "state": "Unnamed", "district": "Unnamed"})
|
||||
print(json.dumps(result, indent=2, sort_keys=True, ensure_ascii=False, cls=DateTimeEncoder))
|
||||
|
||||
elif args.command == 'spider':
|
||||
from spider import spider
|
||||
datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
|
||||
job = json.loads(args.job)
|
||||
spider.execute_single_job(datastore_client, job, "spider-results")
|
||||
|
||||
else:
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
|
2
job.py
2
job.py
|
@ -12,7 +12,7 @@ import logging
|
|||
import docker
|
||||
from google.cloud import datastore
|
||||
|
||||
# Maximum oper-job runtime in seconds. This can be increased for second, third attempt
|
||||
# Maximum per-job runtime in seconds. This can be increased for second, third attempt
|
||||
# via the environment JOB_TIMEOUT variable.
|
||||
TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))
|
||||
|
||||
|
|
|
@ -72,6 +72,38 @@ def check_and_rate_site(entry):
|
|||
return result
|
||||
|
||||
|
||||
def execute_single_job(datastore_client, job, entity_kind):
    """
    Executes spider for one single job

    The job is validated, checked and rated, and the outcome is written to
    the datastore as an entity of kind ``entity_kind`` keyed by the job URL.

    Args:
        datastore_client: Client used to build the entity key and to write
            the entity.
        job: Job description. Must provide a "url" entry; it is handed
            unchanged to validate_job() and check_and_rate_site().
        entity_kind: Datastore kind under which the result record is stored.

    A failing database write is logged together with its traceback and is
    not re-raised, so the caller is not interrupted by storage errors.
    """
    validate_job(job)

    logging.info("Starting job %s", job["url"])
    result = check_and_rate_site(entry=job)

    logging.info("Job %s finished checks", job["url"])
    logging.info("Job %s writing to DB", job["url"])

    key = datastore_client.key(entity_kind, job["url"])
    entity = datastore.Entity(key=key)
    entity.update({
        'created': datetime.utcnow(),
        'meta': result['meta'],
        'checks': result['checks'],
        'rating': result['rating'],
        'score': result['score'],
    })

    try:
        datastore_client.put(entity)
    except Exception as ex:
        # Deliberately best-effort: report the failure and carry on.
        # logging.exception keeps the traceback, which a plain
        # logging.error call would drop.
        logging.exception("Could not write result: %s", ex)
    else:
        logging.debug("Successfully wrote record to database")
|
||||
|
||||
|
||||
def test_url(url):
|
||||
"""
|
||||
Run the spider for a single URL and print the result.
|
||||
|
|
Loading…
Reference in a new issue