Mirror of https://github.com/netzbegruenung/green-spider.git, synced 2024-05-05 10:33:39 +02:00
Marian Steinbach · 618e29d763
* CLI: remove 'jobs' command, add 'manager'
* Add job definition
* Move jobs to manage folder
* Rename jobs to manager
* Add rq and redis dependencies
* Add docker-compose YAML
* Downgrade to alpine 3.8
* Adjust paths in Dockerfile, remove entrypoint
* Rename 'make spiderjobs' to 'make jobs'
* Fix docker execution
* Adapt 'make jobs'
* Fix metadata scheme
* Add docker dependency
* Randomize queue (a bit)
* Use latest image, remove debug output
* Make docker-compose file downwards-compatible
* Use latest instead of dev image tag
* Update docker-compose.yaml
* Adapt job start script
* Fix redis connection in manager
* Add support for increasing timeout via environment variable
* Adapt load_in_browser to cookies table schema change
* Fix execution
* Mitigate yaml warning
* Bump some dependency versions
* Report resource usage stats for each job
* checks/load_in_browser: Return DOM size, prevent multiple page loads
* Update .dockerignore
* Code update
* Script update
* Update README.md
* WIP
* WIP commit
* Update Dockerfile to alpine:edge and chromium v90
* Update TestCertificateChecker
* Set defaults for __init__ function
* Detect sunflower theme
* Update unit test for new datetime (zero-basing)
* Set logging prefs from Chromium in a new way
* Move datastore client instantiation, as it is not needed for all commands
* Change green-directory repository URL
* Add git settings for cloning green-directory
* Pin alpine version 3.14, fix py3-cryptography
* Use plain docker build progress output
* Add volumes to 'make test' docker run command
* Fix bug
* Update example command in README
* Update dependencies
* Add creation of Kubernetes jobs
94 lines
3.2 KiB
Python
"""
|
|
Command line utility for spider, export etc.
|
|
"""
|
|
|
|
import argparse
|
|
import logging
|
|
import signal
|
|
import sys
|
|
import json
|
|
|
|
from google.cloud import datastore
|
|
|
|
def handle_sigint(signum, frame):
|
|
"""
|
|
Handles SIGINT, which occurs on Ctrl-C
|
|
"""
|
|
print("\nInterrupted by SIGINT\n")
|
|
sys.exit()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
signal.signal(signal.SIGINT, handle_sigint)
|
|
|
|
parser = argparse.ArgumentParser()
|
|
|
|
# global flags
|
|
parser.add_argument('--credentials-path', dest='credentials_path',
|
|
help='Path to the service account credentials JSON file',
|
|
default='/secrets/service-account.json')
|
|
|
|
parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
|
|
default='info')
|
|
|
|
# subcommands
|
|
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
|
|
|
|
# spider subcommand
|
|
spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
|
|
spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
|
|
spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
|
|
spider_parser.add_argument('--job', help='Job JSON object. To spider one URL, write the result back and exit.')
|
|
|
|
# manager subcommand
|
|
manager_parser = subparsers.add_parser('manager', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
|
|
manager_parser.add_argument('--url', help='Add a job to spider a specific URL')
|
|
|
|
# export subcommand
|
|
export_parser = subparsers.add_parser('export', help='Export JSON data')
|
|
export_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to export (default: spider-results)')
|
|
|
|
|
|
args = parser.parse_args()
|
|
|
|
# set log level
|
|
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
|
|
|
|
loglevel = args.loglevel.lower()
|
|
if loglevel == 'error':
|
|
logging.basicConfig(level=logging.ERROR)
|
|
elif loglevel == 'warn':
|
|
logging.basicConfig(level=logging.WARN)
|
|
elif loglevel == 'debug':
|
|
logging.basicConfig(level=logging.DEBUG)
|
|
logging.getLogger("selenium").setLevel(logging.INFO)
|
|
else:
|
|
logging.basicConfig(level=logging.INFO)
|
|
loglevel = 'info'
|
|
|
|
logging.debug("Called command %s", args.command)
|
|
|
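    # Note (editorial comment): subcommand modules are imported lazily inside
    # each branch below, so a command only pulls in its own dependencies, and
    # the Datastore client is only created for the commands that actually need
    # it ('export' and 'spider').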
    if args.command == 'manager':

        import manager
        manager.create_jobs(args.url)

    elif args.command == 'export':

        import export
        datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
        export.export_results(datastore_client, args.kind)

    else:
        from spider import spider
        datastore_client = datastore.Client.from_service_account_json(args.credentials_path)

        if args.url:
            # spider one URL for diagnostic purposes
            spider.test_url(args.url)
        elif args.job:
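            # --job is expected to be a JSON-encoded job object, presumably
            # containing at least the URL to spider; its exact fields are
            # whatever manager.create_jobs() enqueues (not visible in this file).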
            job = json.loads(args.job)
            spider.execute_single_job(datastore_client, job, args.kind)

        else:
            spider.work_of_queue(datastore_client, args.kind)
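For reference, a few example invocations (a sketch, not taken from the repository; the file name cli.py, the example URL, and the trailing comments are assumptions based on the argument definitions above):

    python cli.py manager                              # enqueue spider jobs for all green-directory URLs
    python cli.py manager --url https://example.org/   # enqueue a job for a single URL
    python cli.py --loglevel debug spider              # take jobs off the queue and spider, with verbose logging
    python cli.py spider --url https://example.org/    # spider one URL directly (testing/debugging only)
    python cli.py export --kind spider-results         # export Datastore results as JSON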