mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-02 17:14:51 +02:00
0c0bcbf54e
* Use UTC for feed item age calculation * Improvements in run-job.sh script * Prevent output buffering in job creation * Remove unused environment variable references * Print more detailed results count * Bring back function to execute a single spider job * Fix 'make spider' command * Upgrade docker to 5.0.3
98 lines
3.5 KiB
Python
98 lines
3.5 KiB
Python
"""
|
|
Command line utility for spider, export etc.
|
|
"""
|
|
|
|
import argparse
|
|
import logging
|
|
import signal
|
|
import sys
|
|
import json
|
|
|
|
from google.cloud import datastore
|
|
|
|
def handle_sigint(signum, frame):
    """
    Signal handler for SIGINT (Ctrl-C).

    Prints a short notice and terminates the process by raising
    SystemExit without an explicit exit code. Both arguments are
    supplied by the signal machinery and are not used here.
    """
    notice = "\nInterrupted by SIGINT\n"
    print(notice)
    raise SystemExit
|
|
|
|
|
|
if __name__ == "__main__":
|
|
signal.signal(signal.SIGINT, handle_sigint)
|
|
|
|
parser = argparse.ArgumentParser()
|
|
|
|
# global flags
|
|
parser.add_argument('--credentials-path', dest='credentials_path',
|
|
help='Path to the service account credentials JSON file',
|
|
default='/secrets/service-account.json')
|
|
|
|
parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
|
|
default='info')
|
|
|
|
# subcommands
|
|
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
|
|
|
|
# 'spider' subcommand to execute a job from the queue and store the result.
|
|
spider_parser = subparsers.add_parser('spider', help='Execute a spider job from the queue and store the result.')
|
|
spider_parser.add_argument('--job', help='JSON job data')
|
|
|
|
# 'dryrun' subcommand to spider one URL without writing results back.
|
|
dryrun_parser = subparsers.add_parser('dryrun', help='Spider an arbitrary URL without storing results. ')
|
|
dryrun_parser.add_argument('url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
|
|
|
|
# manager subcommand
|
|
manager_parser = subparsers.add_parser('manager', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
|
|
manager_parser.add_argument('--url', help='Add a job to spider a specific URL')
|
|
|
|
# export subcommand
|
|
export_parser = subparsers.add_parser('export', help='Export JSON data')
|
|
export_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to export (default: spider-results)')
|
|
|
|
|
|
args = parser.parse_args()
|
|
|
|
# set log level
|
|
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
|
|
|
|
loglevel = args.loglevel.lower()
|
|
if loglevel == 'error':
|
|
logging.basicConfig(level=logging.ERROR)
|
|
elif loglevel == 'warn':
|
|
logging.basicConfig(level=logging.WARN)
|
|
elif loglevel == 'debug':
|
|
logging.basicConfig(level=logging.DEBUG)
|
|
logging.getLogger("selenium").setLevel(logging.INFO)
|
|
else:
|
|
logging.basicConfig(level=logging.INFO)
|
|
loglevel = 'info'
|
|
|
|
logging.debug("Called command %s", args.command)
|
|
|
|
if args.command == 'manager':
|
|
import manager
|
|
manager.create_jobs(args.url)
|
|
|
|
elif args.command == 'export':
|
|
import export
|
|
datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
|
|
export.export_results(datastore_client, args.kind)
|
|
|
|
elif args.command == 'dryrun':
|
|
from spider import spider
|
|
from export.datetimeencoder import DateTimeEncoder
|
|
|
|
result = spider.check_and_rate_site({"url": args.url, "type": "REGIONAL_CHAPTER", "level": "DE:KREISVERBAND", "state": "Unnamed", "district": "Unnamed"})
|
|
print(json.dumps(result, indent=2, sort_keys=True, ensure_ascii=False, cls=DateTimeEncoder))
|
|
|
|
elif args.command == 'spider':
|
|
from spider import spider
|
|
datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
|
|
job = json.loads(args.job)
|
|
spider.execute_single_job(datastore_client, job, "spider-results")
|
|
|
|
else:
|
|
parser.print_help()
|
|
sys.exit(1)
|