diff --git a/.dockerignore b/.dockerignore index e5d5555..3d3c549 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,6 +1,6 @@ .git -webapp docs +/screenshots secrets temp venv diff --git a/.gitignore b/.gitignore index e694cdc..8dbd94d 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ temp __pycache__ .vscode/settings.json kubernetes/green-spider-secret.yaml -/volumes \ No newline at end of file +/volumes +/screenshots diff --git a/Dockerfile b/Dockerfile index f05abd1..1fc3374 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,23 +1,26 @@ -FROM python:3.7-alpine3.9 +FROM alpine:3.14 WORKDIR /workdir ADD requirements.txt /workdir/ -RUN echo "http://dl-4.alpinelinux.org/alpine/v3.8/main" >> /etc/apk/repositories && \ - echo "http://dl-4.alpinelinux.org/alpine/v3.8/community" >> /etc/apk/repositories && \ - apk update && \ - apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \ - pip3 install --upgrade pip && \ - pip3 install -r requirements.txt && \ - apk del python3-dev build-base +RUN echo "http://dl-4.alpinelinux.org/alpine/edge/main" >> /etc/apk/repositories && \ + echo "http://dl-4.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories && \ + apk --update --no-cache add ca-certificates chromium chromium-chromedriver \ + python3-dev py3-grpcio py3-wheel py3-pip py3-lxml \ + build-base git libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev cargo && \ + pip install -r requirements.txt && \ + apk del build-base -ADD cli.py / -ADD config /config -ADD jobs /jobs -ADD checks /checks -ADD rating /rating -ADD spider /spider -ADD export /export +# As alpine's py3-cryptography did not work as of alpine v3.14, we use this hack from +# https://github.com/pyca/cryptography/issues/3344#issuecomment-650845512 +RUN LDFLAGS="-L/opt/openssl/lib -Wl,-rpath,/opt/openssl/lib" CFLAGS="-I/opt/openssl/include" pip3 install -U cryptography -ENTRYPOINT ["python3", "/cli.py"] +ADD cli.py /workdir/ +ADD manager /workdir/manager +ADD config /workdir/config +ADD checks /workdir/checks +ADD rating /workdir/rating +ADD spider /workdir/spider +ADD export /workdir/export +ADD job.py /workdir/ diff --git a/Makefile b/Makefile index cf76950..33cac49 100644 --- a/Makefile +++ b/Makefile @@ -6,16 +6,17 @@ DB_ENTITY := spider-results # Build docker image dockerimage: - docker build -t $(IMAGE) . + docker build --progress plain -t $(IMAGE) . -# Create spider job queue -spiderjobs: +# Fill the queue with spider jobs, one for each site. +jobs: docker run --rm -ti \ -v $(PWD)/secrets:/secrets \ $(IMAGE) \ - --credentials-path /secrets/datastore-writer.json \ - --loglevel debug \ - jobs + python cli.py \ + --credentials-path /secrets/datastore-writer.json \ + --loglevel debug \ + manager # Run spider in docker image spider: @@ -41,6 +42,9 @@ export: # run spider tests test: docker run --rm -ti \ + -v $(PWD)/volumes/dev-shm:/dev/shm \ + -v $(PWD)/secrets:/secrets \ + -v $(PWD)/screenshots:/screenshots \ -v $(PWD)/volumes/chrome-userdir:/opt/chrome-userdir \ --entrypoint "python3" \ $(IMAGE) \ diff --git a/README.md b/README.md index cadb2bc..e47b09b 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ Alle Informationen zum Betrieb befinden sich im Verzeichnis [devops](https://git Green Spider ist in Python 3 geschrieben und wird aktuell unter 3.6 getestet und ausgeführt. 
-Aufgrund zahlreicher Dependencies empfiehlt es sich, den Spider Code lokal in Docker +Aufgrund zahlreicher Abhängigkeiten empfiehlt es sich, den Spider Code lokal in Docker auszuführen. Das Image wird über den folgenden Befehl erzeugt: @@ -57,18 +57,19 @@ Am einfachsten geht das über den `make spider` Befehl, so: make spider ARGS="--url http://www.example.com/" ``` -Ohne `ARGS` aufgerufen, arbeitet der Spider eine Jobliste ab. Dies erfordert Zugriff auf die entsprechende Datenank. +Ohne `ARGS` aufgerufen, arbeitet der Spider eine Jobliste ab. Dies erfordert Zugriff auf die entsprechende Datenbank. Wenn nur eine einzelne Site gespidert werden soll, die Ergebnisse aber in die Datenbank geschrieben werden sollen, kann der Spider so mit `--job` und einem JSON-Object aufgerufen werden (Beispiel): -``` +```nohighlight docker run --rm -ti \ -v $(pwd)/volumes/dev-shm:/dev/shm \ -v $(pwd)/secrets:/secrets \ + -v $(pwd)/screenshots:/screenshots \ -v $(pwd)/volumes/chrome-userdir:/opt/chrome-userdir \ --shm-size=2g \ - quay.io/netzbegruenung/green-spider:latest \ + quay.io/netzbegruenung/green-spider:latest python3 cli.py \ --credentials-path /secrets/datastore-writer.json \ --loglevel debug \ - spider --job '{"url": "https://xn--grne-porta-westfalica-9hc.de/", "meta": {"city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}}' + spider --job '{"url": "https://gruene-porta-westfalica.de/", "city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}' ``` diff --git a/checks/__init__.py b/checks/__init__.py index 9761025..3d1d5a5 100644 --- a/checks/__init__.py +++ b/checks/__init__.py @@ -54,16 +54,27 @@ def perform_checks(input_url): results = {} + # TODO: + # Set screenshot_bucket_name and storage_credentials_path + # based on flags. config = Config(urls=[input_url], user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 ' + - 'Safari/537.36 green-spider/0.2') + 'Safari/537.36 green-spider/0.2', + screenshot_bucket_name='green-spider-screenshots.sendung.de', + screenshot_datastore_kind='webscreenshot', + storage_credentials_path='/secrets/screenshots-uploader.json', + datastore_credentials_path='/secrets/datastore-writer.json') + # Iterate over all checks. for check_name, check in check_modules: + + # checker is the individual test/assertion handler we instantiate + # for each check step. checker = check.Checker(config=config, previous_results=results) - # see if dependencies are met + # Ensure that dependencies are met for the checker. dependencies = checker.depends_on_results() if dependencies != []: for dep in dependencies: @@ -71,10 +82,16 @@ def perform_checks(input_url): logging.debug("Skipping check %s as dependency %s is not met" % (check_name, dep)) continue + # Execute the checker's main function. result = checker.run() results[check_name] = result - # update config for the next check + # Execute any cleanup/aftermath function (if given) for the checker. + modified_results = checker.post_hook(result) + if modified_results is not None: + results[check_name] = modified_results + + # Update config for the next check(s) in the sequence. 
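For orientation, a minimal checker that follows the run()/post_hook() contract used by this loop could look roughly like the sketch below (the class body and its result keys are illustrative and not part of this changeset):

```python
from checks.abstract_checker import AbstractChecker

class Checker(AbstractChecker):
    """Hypothetical check illustrating the run()/post_hook() contract."""

    def depends_on_results(self):
        # Only run once the page_content check has produced results.
        return ['page_content']

    def run(self):
        # One result entry per configured URL.
        return {url: {'example_metric': 42} for url in self.config.urls}

    def post_hook(self, result):
        # Optional post-processing; returning None keeps the run() result as-is.
        return None
```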
config = checker.config logging.debug("config after check %s: %r" % (check_name, config)) diff --git a/checks/abstract_checker.py b/checks/abstract_checker.py index c0d7964..9eb8b9c 100644 --- a/checks/abstract_checker.py +++ b/checks/abstract_checker.py @@ -21,6 +21,20 @@ class AbstractChecker(object): """Executes the check routine, returns result dict""" raise NotImplementedError() + def post_hook(self, result): + """ + Optional function to execute after run(). Can be used to post-process + results data. Should be defined by the implementing checker. + + Params: + result: Result data from the run() function. + + Returns: + Dict: Modified results data + None: Means that nothing has been done, so should be ignored. + """ + return None + @property def config(self): return self._config diff --git a/checks/certificate_test.py b/checks/certificate_test.py index b7f2ef2..3a3b643 100644 --- a/checks/certificate_test.py +++ b/checks/certificate_test.py @@ -14,7 +14,7 @@ class TestCertificateChecker(unittest.TestCase): result = checker.run() self.assertIn(url, result) self.assertIsNone(result[url]['exception']) - self.assertEqual(result[url]['issuer']['O'], 'Google Trust Services') + self.assertEqual(result[url]['issuer']['O'], 'Google Trust Services LLC') def test_kaarst(self): """Real-workd example""" @@ -24,7 +24,7 @@ class TestCertificateChecker(unittest.TestCase): result = checker.run() self.assertIn(url, result) self.assertIsNone(result[url]['exception']) - self.assertEqual(result[url]['issuer']['O'], 'Sectigo Limited') + self.assertEqual(result[url]['issuer']['O'], 'DigiCert Inc') def test_tls_v_1_0(self): """Load a certificate for a TLS v1.0 server""" diff --git a/checks/config.py b/checks/config.py index 1fff550..5330622 100644 --- a/checks/config.py +++ b/checks/config.py @@ -3,9 +3,19 @@ class Config(object): Our configuration to be passed to checks """ - def __init__(self, urls, user_agent='green-spider/1.0'): + def __init__(self, + urls, + screenshot_bucket_name='', + screenshot_datastore_kind='', + storage_credentials_path='', + datastore_credentials_path='', + user_agent='green-spider/1.0'): self._urls = set(urls) self._user_agent = user_agent + self._screenshot_bucket_name = screenshot_bucket_name + self._screenshot_datastore_kind = screenshot_datastore_kind + self._storage_credentials_path = storage_credentials_path + self._datastore_credentials_path = datastore_credentials_path def __repr__(self): return "Config(urls=%r)" % self._urls @@ -27,3 +37,19 @@ class Config(object): @property def user_agent(self): return self._user_agent + + @property + def screenshot_bucket_name(self): + return self._screenshot_bucket_name + + @property + def storage_credentials_path(self): + return self._storage_credentials_path + + @property + def datastore_credentials_path(self): + return self._datastore_credentials_path + + @property + def screenshot_datastore_kind(self): + return self._screenshot_datastore_kind diff --git a/checks/generator.py b/checks/generator.py index 10d9cbd..0fae75f 100644 --- a/checks/generator.py +++ b/checks/generator.py @@ -75,6 +75,9 @@ class Checker(AbstractChecker): elif ('Urwahl3000' in page_content['content'] or '/themes/urwahl3000' in page_content['content']): generator = 'wordpress-urwahl' + + elif ('/themes/sunflower' in page_content['content']): + generator = 'wordpress-sunflower' elif ('/themes/sunflower' in page_content['content']): generator = 'wordpress-sunflower' diff --git a/checks/load_feeds_test.py b/checks/load_feeds_test.py index 3e8dfc2..3850aac 100644 
--- a/checks/load_feeds_test.py +++ b/checks/load_feeds_test.py @@ -60,15 +60,13 @@ class TestFeed(unittest.TestCase): result = checker.run() pprint(result) - self.assertEqual(result, { - 'http://example.com/feed.xml': { - 'exception': None, - 'title': 'Liftoff News', - 'latest_entry': datetime(2003, 6, 3, 9, 39, 21), - 'first_entry': datetime(2003, 5, 30, 11, 6, 42), - 'average_interval': 340359, - 'num_entries': 2, - } + self.assertEqual(result['http://example.com/feed.xml'], { + 'exception': None, + 'average_interval': 340359, + 'first_entry': datetime(2003, 5, 30, 11, 6, 42), + 'latest_entry': datetime(2003, 6, 3, 9, 39, 21), + 'num_entries': 2, + 'title': 'Liftoff News', }) diff --git a/checks/load_in_browser.py b/checks/load_in_browser.py index 0b7ceb2..9b4cd31 100644 --- a/checks/load_in_browser.py +++ b/checks/load_in_browser.py @@ -9,29 +9,38 @@ Information includes: - what cookies are set during loading the page """ +from datetime import datetime +import hashlib import logging import math +import os import shutil import time import sqlite3 +import json from selenium import webdriver from selenium.common.exceptions import StaleElementReferenceException from selenium.common.exceptions import TimeoutException +from selenium.webdriver.common.desired_capabilities import DesiredCapabilities import tenacity +from google.cloud import storage +from google.cloud import datastore + from checks.abstract_checker import AbstractChecker class Checker(AbstractChecker): - page_load_timeout = 30 + page_load_timeout = 120 # sizes we check for (width, height) sizes = ( - (360, 640), # rather old smartphone - (768, 1024), # older tablet or newer smartphone - (1024, 768), # older desktop or horiz. tablet (1920, 1080), # Full HD horizontal + (1500, 1500), # useful window size we also use for the main screenshot + (1024, 768), # older desktop or horiz. 
tablet + (768, 1024), # older tablet or newer smartphone + (360, 640), # rather old smartphone ) def __init__(self, config, previous_results=None): @@ -39,22 +48,50 @@ class Checker(AbstractChecker): # Our selenium user agent using Chrome headless as an engine chrome_options = webdriver.ChromeOptions() + chrome_options.add_argument('enable-automation') chrome_options.add_argument('--headless') chrome_options.add_argument('--disable-gpu') chrome_options.add_argument('--no-sandbox') + chrome_options.add_argument('--dns-prefetch-disable') chrome_options.add_argument('--disable-extensions') + chrome_options.add_argument('--disk-cache-size=0') + chrome_options.add_argument('--disable-dev-shm-usage') + chrome_options.add_argument('--verbose') + chrome_options.page_load_strategy = 'normal' # path where to get cookies from chrome_options.add_argument("--user-data-dir=/opt/chrome-userdir") + # mobile_emulation = { + # "deviceMetrics": { "width": 360, "height": 640, "pixelRatio": 3.0 }, + # "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19" + # } + #mobile_emulation = { "deviceName": "Nexus 5" } + #chrome_options.add_experimental_option("mobileEmulation", mobile_emulation) + # empty /opt/chrome-userdir shutil.rmtree('/opt/chrome-userdir', ignore_errors=True) - self.driver = webdriver.Chrome(options=chrome_options) + # activate performance logging (includes network logging) + capabilities = DesiredCapabilities.CHROME + capabilities['goog:loggingPrefs'] = {'performance': 'ALL'} + + # TODO: also do this + # (from https://stackoverflow.com/questions/60375633/capture-logs-from-chrome-during-test-is-running-python#comment106827817_60385493) + capabilities['loggingPrefs'] = {'performance': 'ALL'} + + + self.driver = webdriver.Chrome(options=chrome_options, desired_capabilities=capabilities) self.driver.set_page_load_timeout(self.page_load_timeout) - def run(self): + # We capture the browser engine's user agent string + # for the record. + self.user_agent = self.driver.execute_script("return navigator.userAgent;") + def run(self): + """ + Main function of this check. + """ results = {} for url in self.config.urls: @@ -64,15 +101,22 @@ class Checker(AbstractChecker): 'min_document_width': None, 'logs': None, 'font_families': None, + 'performance_log': [], + 'screenshots': [], } - # responsive check + self.driver.get(url) + + # Responsive layout check and screenshots. try: - sizes = self.check_responsiveness(url) + check_responsiveness_results = self.check_responsiveness(url) results[url] = { - 'sizes': sizes, - 'min_document_width': min([s['document_width'] for s in sizes]), + 'sizes': check_responsiveness_results['sizes'], + 'min_document_width': min([s['document_width'] for s in check_responsiveness_results['sizes']]), + 'dom_size': self.get_dom_size(), 'logs': self.capture_log(), + 'performance_log': [], + 'screenshots': check_responsiveness_results['screenshots'], } except TimeoutException as e: logging.warn("TimeoutException when checking responsiveness for %s: %s" % (url, e)) @@ -81,6 +125,7 @@ class Checker(AbstractChecker): logging.warn("RetryError when checking responsiveness for %s: %s" % (url, re)) pass + # Scroll page to bottom, to load all lazy-loading resources. try: self.scroll_to_bottom() except TimeoutException as e: @@ -112,6 +157,7 @@ class Checker(AbstractChecker): logging.warn("TimeoutException when collecting CSS elements for %s: %s" % (url, e)) pass + # Process cookies. 
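For reference, the per-URL result assembled by run() roughly takes the following shape once all steps have succeeded (a sketch with made-up values; the screenshots entry is consumed and removed again by post_hook()):

```python
# Illustrative per-URL result of the load_in_browser check (values are made up).
results_for_url = {
    'sizes': [{'viewport_width': 1500, 'document_width': 1500}],
    'min_document_width': 1500,
    'dom_size': 847,
    'logs': [],              # browser log entries with level SEVERE/WARNING
    'font_families': ['arial', 'sans-serif'],
    'cookies': [],           # filled in by get_cookies() below
    'performance_log': [],   # decoded Chrome performance log entries
    'screenshots': [],       # metadata for files saved under /screenshots
}
```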
try: results[url]['cookies'] = self.get_cookies() except TimeoutException as e: @@ -120,10 +166,79 @@ class Checker(AbstractChecker): except tenacity.RetryError as re: logging.warn("RetryError when collecting cookies for %s: %s" % (url, re)) pass + + for logentry in self.driver.get_log('performance'): + decoded_logentry = json.loads(logentry['message']) + results[url]['performance_log'].append(decoded_logentry) self.driver.quit() return results + + def post_hook(self, result): + """ + Logic executed after run() is done. + Used to upload screenshots and metadata to cloud storage and datastore. + """ + # Upload screenshots and metadata + + logging.debug("load_in_browser post_hook 1 - Creating client") + + storage_client = storage.Client.from_service_account_json(self.config.storage_credentials_path) + bucket = storage_client.get_bucket(self.config.screenshot_bucket_name) + + datastore_client = datastore.Client.from_service_account_json(self.config.datastore_credentials_path) + exclude_from_indexes = ['size', 'screenshot_url', 'user_agent'] + + for url in result.keys(): + for screenshot in result[url]['screenshots']: + # Upload one screenshot + try: + local_file = '%s/%s' % (screenshot['folder'], screenshot['filename']) + + logging.debug("Handling screenshot file %s" % local_file) + + if not os.path.exists(screenshot['local_path']): + logging.warning("No screenshot created: size=%s, url='%s'" % (screenshot['size'], screenshot['url'])) + continue + + logging.debug("Uploading %s to %s/%s" % (screenshot['local_path'], screenshot['folder'], screenshot['filename'])) + with open(screenshot['local_path'], 'rb') as my_file: + # Create new blob in remote bucket + blob = bucket.blob(local_file) + blob.upload_from_file(my_file, content_type="image/png") + blob.make_public() + except Exception as e: + logging.warn("Error uploading screenshot for %s: %s" % (screenshot['url'], e)) + continue + + try: + os.remove(screenshot['local_path']) + except: + pass + + # Write metadata for one screenshot + data = { + 'url': screenshot['url'], + 'size': screenshot['size'], + 'screenshot_url': screenshot['screenshot_url'], + 'user_agent': screenshot['user_agent'], + 'created': screenshot['created'], + } + try: + key = datastore_client.key(self.config.screenshot_datastore_kind, screenshot['screenshot_url']) + entity = datastore.Entity(key=key, exclude_from_indexes=exclude_from_indexes) + entity.update(data) + datastore_client.put(entity) + logging.debug("Successfully stored screenshot metadata for %s" % screenshot['screenshot_url']) + except Exception as e: + logging.warn("Error in %s: %s" % (screenshot['url'], e)) + + + # Remove screenshots part from results + del result[url]['screenshots'] + + return result def get_cookies(self): # read cookie DB to get 3rd party cookies, too @@ -131,7 +246,7 @@ class Checker(AbstractChecker): db = sqlite3.connect('/opt/chrome-userdir/Default/Cookies') db.row_factory = sqlite3.Row c = db.cursor() - c.execute("SELECT creation_utc, host_key, name, path, expires_utc, is_secure, is_httponly, has_expires, is_persistent, firstpartyonly FROM cookies") + c.execute("SELECT creation_utc, host_key, name, path, expires_utc, is_secure, is_httponly, has_expires, is_persistent FROM cookies") for row in c.fetchall(): cookies.append(dict(row)) c.close() @@ -142,11 +257,13 @@ class Checker(AbstractChecker): @tenacity.retry(stop=tenacity.stop_after_attempt(3), retry=tenacity.retry_if_exception_type(TimeoutException)) def check_responsiveness(self, url): - result = [] + result = { + 'sizes': [], + 
'screenshots': [], + } # set window to the first size initially self.driver.set_window_size(self.sizes[0][0], self.sizes[0][1]) - self.driver.get(url) for (width, height) in self.sizes: self.driver.set_window_size(width, height) @@ -155,13 +272,44 @@ class Checker(AbstractChecker): time.sleep(1.0) doc_width = self.driver.execute_script("return document.body.scrollWidth") - result.append({ + result['sizes'].append({ 'viewport_width': width, 'document_width': int(doc_width), }) + # Make screenshot + urlhash = hashlib.md5(bytearray(url, 'utf-8')).hexdigest() + folder = "%sx%s" % (width, height) + abs_folder = "/screenshots/%s" % folder + os.makedirs(abs_folder, exist_ok=True) + filename = urlhash + '.png' + abs_filepath = "%s/%s" % (abs_folder, filename) + created = datetime.utcnow() + + success = self.driver.save_screenshot(abs_filepath) + + if not success: + logging.warn("Failed to create screenshot %s" % abs_filepath) + continue + + result['screenshots'].append({ + 'local_path': abs_filepath, + 'folder': folder, + 'filename': filename, + 'url': url, + 'size': [width, height], + 'screenshot_url': 'http://%s/%s/%s' % ( + self.config.screenshot_bucket_name, folder, filename), + 'user_agent': self.user_agent, + 'created': created, + }) + return result + def get_dom_size(self): + dom_length = self.driver.execute_script("return document.getElementsByTagName('*').length") + return int(dom_length) + def capture_log(self): """ Returns log elements with level "SEVERE" or "WARNING" diff --git a/cli.py b/cli.py index 0dcb236..3c4ee6f 100644 --- a/cli.py +++ b/cli.py @@ -19,7 +19,7 @@ def handle_sigint(signum, frame): if __name__ == "__main__": - signal.signal(signal.SIGINT,handle_sigint) + signal.signal(signal.SIGINT, handle_sigint) parser = argparse.ArgumentParser() @@ -40,9 +40,9 @@ if __name__ == "__main__": spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.') spider_parser.add_argument('--job', help='Job JSON object. To spider one URL, write the result back and exit.') - # jobs subcommand - jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.') - jobs_parser.add_argument('--url', help='Add a job to spider a specific URL') + # manager subcommand + manager_parser = subparsers.add_parser('manager', help='Adds spider jobs to the queue. 
By default, all green-directory URLs are added.') + manager_parser.add_argument('--url', help='Add a job to spider a specific URL') # export subcommand export_parser = subparsers.add_parser('export', help='Export JSON data') @@ -68,20 +68,21 @@ if __name__ == "__main__": logging.debug("Called command %s", args.command) - datastore_client = datastore.Client.from_service_account_json(args.credentials_path) + if args.command == 'manager': - if args.command == 'jobs': - - import jobs - jobs.create_jobs(datastore_client, args.url) + import manager + manager.create_jobs(args.url) elif args.command == 'export': import export + datastore_client = datastore.Client.from_service_account_json(args.credentials_path) export.export_results(datastore_client, args.kind) else: from spider import spider + datastore_client = datastore.Client.from_service_account_json(args.credentials_path) + if args.url: # spider one URL for diagnostic purposes spider.test_url(args.url) diff --git a/config/__init__.py b/config/__init__.py index 9b173cb..78d772d 100644 --- a/config/__init__.py +++ b/config/__init__.py @@ -7,7 +7,7 @@ CONNECT_TIMEOUT = 5 READ_TIMEOUT = 10 # Git repo for our data -GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git' +GREEN_DIRECTORY_REPO = 'https://git.verdigado.com/NB-Public/green-directory.git' # folder in that repo that holds the data GREEN_DIRECTORY_DATA_PATH = 'data/countries/de' @@ -15,9 +15,12 @@ GREEN_DIRECTORY_DATA_PATH = 'data/countries/de' # folder we use locally to clone the repo GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory' -# IP address of the newthinking GCMS server +# IP address of the verdigado GCMS server GCMS_IP = "194.29.234.123" # kind name of the spider job key datastore entities JOB_DATASTORE_KIND = 'spider-jobs' +K8S_JOBS_PATH = './k8s-jobs' +K8S_JOB_TEMPLATE = './manager/job_template.yaml' +K8S_JOB_BATCH_SIZE = 10 diff --git a/devops/README.md b/devops/README.md index c22f213..7d3a0f2 100644 --- a/devops/README.md +++ b/devops/README.md @@ -50,7 +50,7 @@ devops/ssh.sh Hostname: `green-spider.netzbegruenung.de` -``` +```shell docker-compose stop webapp docker run -it --rm -p 443:443 -p 80:80 --name certbot \ -v /etc/letsencrypt:/etc/letsencrypt \ diff --git a/devops/run-job.sh b/devops/run-job.sh index 9de244d..c229ce9 100755 --- a/devops/run-job.sh +++ b/devops/run-job.sh @@ -127,13 +127,11 @@ ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP << EOF echo "" echo "Install docker" - apt-get install -y docker-ce + apt-get install -y docker-ce docker-compose mkdir /root/secrets EOF -echo "Done with remote setup." 
- if [[ $1 == "screenshotter" ]]; then ### screenshotter @@ -149,6 +147,41 @@ if [[ $1 == "screenshotter" ]]; then -v /root/secrets:/secrets \ quay.io/netzbegruenung/green-spider-screenshotter +elif [[ $1 == "spider-new" ]] +then + # Some dependencies specific to this task + ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP apt-get install -y python3-pip build-essential + + # Upload some files + scp -o StrictHostKeyChecking=no -q secrets/datastore-writer.json root@$SERVER_IP:/root/secrets/datastore-writer.json + scp -o StrictHostKeyChecking=no -q docker-compose.yaml root@$SERVER_IP:/root/docker-compose.yaml + scp -o StrictHostKeyChecking=no -q requirements.txt root@$SERVER_IP:/root/requirements.txt + scp -o StrictHostKeyChecking=no -q job.py root@$SERVER_IP:/root/job.py + + ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP pip3 install -r requirements.txt + + # Bring up redis for the queue + ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker-compose pull redis + ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker-compose up -d redis + sleep 5 + + # Bring up queue manager + ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker-compose pull manager + ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker-compose up manager + + ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq info --url redis://localhost:6379/0 + + # Start worker and work off the queue once + ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq worker --burst high default low --url redis://localhost:6379/0 + + # Re-queue failed jobs once, then re-execute. + ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq requeue --queue low -u redis://localhost:6379 --all + ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq info --url redis://localhost:6379/0 + + ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq worker --burst high default low --url redis://localhost:6379/0 + + echo "Done with queued jobs." + else ### spider diff --git a/devops/ssh.sh b/devops/ssh.sh deleted file mode 100755 index ddba4ce..0000000 --- a/devops/ssh.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -# Log in to webapp server via SSH - -API_TOKEN_SECRET="secrets/hetzner-api-token.sh" -test -f $API_TOKEN_SECRET || { echo >&2 "File $API_TOKEN_SECRET does not exist."; exit 1; } -source $API_TOKEN_SECRET - -source devops/functions.bash - -get_ip - -echo "Use this command for SSH access:" -echo "ssh -o StrictHostKeyChecking=no root@${IP_IP}" - -ssh -o StrictHostKeyChecking=no root@${IP_IP} diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..173b4eb --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,51 @@ +version: "2" +services: + + redis: + image: redis:5-alpine + command: redis-server --save "" --appendonly no + volumes: + - ${PWD}/volumes/redis-data:/data + restart: unless-stopped + networks: + - internal_network + - external_network + ports: + - "6379:6379" + + # manager manages the job queue. 
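The `rq worker --burst high default low` calls above drain the queue that the manager service below fills. A rough Python equivalent of that worker invocation, shown only as a sketch and assuming the Redis instance from this compose file:

```python
import redis
from rq import Queue, Worker

# Connect to the redis service defined in docker-compose.yaml.
conn = redis.from_url("redis://localhost:6379/0")

# Work off the 'high', 'default' and 'low' queues once, then exit (burst mode).
queues = [Queue(name, connection=conn) for name in ("high", "default", "low")]
Worker(queues, connection=conn).work(burst=True)
```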
+  manager:
+    image: quay.io/netzbegruenung/green-spider:latest
+    command: >
+      python3 cli.py
+        --credentials-path /secrets/datastore-writer.json
+        --loglevel debug manager
+    environment:
+      REDIS_URL: redis://redis:6379/0
+      GIT_USERNAME: ${GIT_USERNAME}
+      GIT_PASSWORD: ${GIT_PASSWORD}
+    volumes:
+      - ${PWD}/secrets:/secrets
+    networks:
+      - internal_network
+      - external_network
+    depends_on:
+      - redis
+
+  dashboard:
+    image: eoranged/rq-dashboard:v0.6.1
+    environment:
+      RQ_DASHBOARD_REDIS_URL: redis://redis:6379/0
+    networks:
+      - internal_network
+      - external_network
+    ports:
+      - "9181:9181"
+    depends_on:
+      - redis
+
+networks:
+  internal_network:
+    internal: true
+  external_network:
+    internal: false
\ No newline at end of file
diff --git a/job.py b/job.py
new file mode 100644
index 0000000..05edb7e
--- /dev/null
+++ b/job.py
@@ -0,0 +1,147 @@
+"""
+Dieses Script wird vom RQ worker ausgeführt, um einen einzelnen Job aus der
+Spider-Warteschlange abzuarbeiten.
+"""
+
+import json
+import os
+from datetime import datetime
+import time
+import logging
+
+import docker
+from google.cloud import datastore
+
+# Maximum per-job runtime in seconds. This can be increased for the second and
+# third attempt via the JOB_TIMEOUT environment variable.
+TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))
+
+DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:latest'
+
+CREDENTIALS_PATH = '/secrets/datastore-writer.json'
+
+client = docker.from_env()
+low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock')
+
+datastore_client = datastore.Client.from_service_account_json("." + CREDENTIALS_PATH)
+
+pwd = os.path.abspath(".")
+secrets_path = pwd + "/secrets"
+chromedir_path = pwd + "/volumes/chrome-userdir"
+screenshots_path = pwd + "/screenshots"
+
+volumes = {}
+volumes[secrets_path] = {'bind': '/secrets', 'mode': 'ro'}
+volumes[chromedir_path] = {'bind': '/opt/chrome-userdir', 'mode': 'rw'}
+volumes[screenshots_path] = {'bind': '/screenshots', 'mode': 'rw'}

+logger = logging.getLogger('rq.worker')
+logger.setLevel(logging.DEBUG)
+
+def run(job):
+    """
+    Runs a spider container with the given job.
+
+    Returns statistics about the container run. If the execution takes longer
+    than the duration defined by the JOB_TIMEOUT environment variable
+    (in seconds), the container gets killed.
+ """ + cmd_template = ("python cli.py --credentials-path={path} " + " --loglevel=debug " + " spider " + " --job='{job_json}'") + + cmd = cmd_template.format(path=CREDENTIALS_PATH, + job_json=json.dumps(job)) + + container = client.containers.run(image=DOCKER_IMAGE, + command=cmd, + detach=True, + remove=True, + shm_size='2G', + stdout=True, + stderr=True, + tty=False, + volumes=volumes) + + id = container.id + + # Data about this spider run, to be written to datastore + key = datastore_client.key('spider-runs') + entity = datastore.Entity(key=key) + results = { + 'datetime': datetime.utcnow(), + 'url': job['url'], + 'success': True, + 'error': '', + 'duration_seconds': 0, + 'cpu_usage_seconds': 0, + 'network_received_bytes': 0, + 'network_transmitted_bytes': 0, + 'memory_max_bytes': 0, + } + + # wait for finish + start = datetime.utcnow() + while True: + time.sleep(1) + + clist = client.containers.list(filters={'id': id}) + if len(clist) == 0: + break + + for c in clist: + + # Collect stats + try: + stats = low_level_client.stats(id, stream=False) + + cpu_usage = stats['cpu_stats']['cpu_usage']['total_usage'] / 1000000000.0 + if 'networks' in stats: + network_received_bytes = stats['networks']['eth0']['rx_bytes'] + network_transmitted_bytes = stats['networks']['eth0']['tx_bytes'] + + memory_max_bytes = 0 + if 'max_usage' in stats['memory_stats']: + memory_max_bytes = stats['memory_stats']['max_usage'] + results['memory_max_bytes'] = memory_max_bytes + + #logger.debug("Stats: CPU time %d Sec, RX %d KB, Mem %d MB" % (cpu_usage, network_received_bytes/1000, memory_max_bytes/1000000)) + + if cpu_usage > 0: + results['cpu_usage_seconds'] = round(cpu_usage) + + if network_received_bytes > 0: + results['network_received_bytes'] = network_received_bytes + + if network_transmitted_bytes > 0: + results['network_transmitted_bytes'] = network_transmitted_bytes + + + except docker.errors.APIError as e: + logger.error("Could not get stats: %s" % e) + except json.decoder.JSONDecodeError: + # This means we didn't get proper stats + pass + + runtime = (datetime.utcnow() - start).seconds + results['duration_seconds'] = round(runtime) + + #if c.status != "running": + # logger.info("Container %s status: %s" % (c.id, c.status)) + + if c.status == "exited": + logger.debug("Container %s is exited." % c.id) + break + + if runtime > TIMEOUT: + c.kill() + results['success'] = False + results['error'] = 'TIMEOUT' + entity.update(results) + datastore_client.put(entity) + raise Exception("Execution took too long. Killed container after %s seconds." 
% TIMEOUT) + + entity.update(results) + datastore_client.put(entity) + return results diff --git a/k8s-job-manager.py b/k8s-job-manager.py new file mode 100644 index 0000000..0f1cf9b --- /dev/null +++ b/k8s-job-manager.py @@ -0,0 +1,67 @@ +import config + +import os +from datetime import datetime +import time +import random +from pathlib import Path + +import kubernetes + +PENDING_LIMIT = 2 +RUNNING_LIMIT = 4 + +INTERVAL = 10 # Seconds + +def main(): + + # Get jobs + jobs = list(Path("./k8s-jobs").rglob("*.yaml")) + random.seed() + random.shuffle(jobs) + + kubernetes.config.load_kube_config(context='giantswarm-5jka7') + v1client = kubernetes.client.CoreV1Api() + k8sclient = kubernetes.client.ApiClient() + + start = datetime.utcnow() + jobs_queued = 0 + + while len(jobs) > 0: + # Check whether there are pods pending + pending_pods = v1client.list_pod_for_all_namespaces( + watch=False, + field_selector='status.phase=Pending', + label_selector='app=green-spider') + pending = list(pending_pods.items) + + # Get running pods + running_pods = v1client.list_pod_for_all_namespaces( + watch=False, + field_selector='status.phase=Running', + label_selector='app=green-spider') + running = list(running_pods.items) + + now = datetime.utcnow() + duration = now - start + + # Add new job to the queue + if len(pending) < PENDING_LIMIT and len(running) < RUNNING_LIMIT: + to_be_queued = RUNNING_LIMIT - len(running) + for _ in range(to_be_queued): + job_path = jobs.pop(0) + jobs_queued += 1 + + duration_per_job = duration / jobs_queued + jobs_remaining = len(jobs) + + print(f'{jobs_queued} jobs queued in {duration} - {jobs_remaining} jobs (estimated {duration_per_job * jobs_remaining}) remaining at {int(duration_per_job.total_seconds())} seconds per job on average') + kubernetes.utils.create_from_yaml(k8sclient, job_path) + os.remove(job_path) + + time.sleep(INTERVAL) + + print('No more jobs left. 
Done.') + +if __name__ == '__main__': + main() diff --git a/kubernetes/job-example.yaml b/kubernetes/job-example.yaml new file mode 100644 index 0000000..cd8c6bd --- /dev/null +++ b/kubernetes/job-example.yaml @@ -0,0 +1,67 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: green-spider-job-1 + namespace: marian + labels: + app: green-spider +spec: + activeDeadlineSeconds: 120 + ttlSecondsAfterFinished: 600 + completions: 1 + backoffLimit: 3 + + # Pod template + template: + metadata: + name: green-spider-job + namespace: marian + labels: + app: green-spider + spec: + restartPolicy: Never + nodeSelector: + giantswarm.io/machine-pool: 5n27k + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: app + operator: In + values: + - green-spider + topologyKey: topology.kubernetes.io/region + containers: + - name: spider + image: quay.io/netzbegruenung/green-spider:kubernetes + imagePullPolicy: IfNotPresent + command: + - python + - cli.py + - --credentials-path=/secrets/datastore-writer.json + - --loglevel=debug + - spider + - '--job={"url":"https://www.gruene.de/","type":"PARTY","level":"DE:BUNDESVERBAND","state":null,"district":null,"city":null}' + volumeMounts: + - name: secrets + mountPath: "/secrets" + readOnly: true + - name: shared + mountPath: /dev/shm + resources: + requests: + cpu: 1000m + memory: 5000M + volumes: + - name: secrets + secret: + secretName: green-spider + items: + - key: datastore-writer.json + path: datastore-writer.json + - key: screenshots-uploader.json + path: screenshots-uploader.json + - name: shared + emptyDir: {} diff --git a/kubernetes/psp.yaml b/kubernetes/psp.yaml new file mode 100644 index 0000000..45cc61b --- /dev/null +++ b/kubernetes/psp.yaml @@ -0,0 +1,18 @@ +apiVersion: policy/v1beta1 +kind: PodSecurityPolicy +metadata: + name: green-spider-job-psp + namespace: marian +spec: + privileged: false + seLinux: + rule: RunAsAny + supplementalGroups: + rule: RunAsAny + runAsUser: + rule: RunAsAny + fsGroup: + rule: RunAsAny + volumes: + - emptyDir + - secret diff --git a/jobs/__init__.py b/manager/__init__.py similarity index 60% rename from jobs/__init__.py rename to manager/__init__.py index 3e125d5..6ce0501 100644 --- a/jobs/__init__.py +++ b/manager/__init__.py @@ -1,21 +1,26 @@ """ -The jobs module allows to create jobs for the queue and take jobs off the queue +The manager module allows to fill the RQ job queue. """ -from datetime import datetime import logging +import math import os import random import shutil +import time +import json +from datetime import datetime from git import Repo -import tenacity +from rq import Queue +import redis import yaml -from google.api_core.exceptions import Aborted -from google.cloud import datastore +from yaml import Loader +from hashlib import sha256 import config +REDIS_URL = os.environ.get("REDIS_URL", "redis://redis:6379/0") def clone_data_directory(): """ @@ -40,7 +45,7 @@ def directory_entries(): continue with open(filepath, 'r', encoding='utf8') as yamlfile: - for doc in yaml.load_all(yamlfile): + for doc in yaml.load_all(yamlfile, Loader=Loader): yield doc @@ -53,7 +58,7 @@ def chunks(the_list, size): yield the_list[i:i + size] -def create_jobs(datastore_client, url=None): +def create_jobs(url=None): """ Read all URLs from green directory and fill a job database with one job per URL. @@ -62,6 +67,18 @@ def create_jobs(datastore_client, url=None): will be added as a spider job. 
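Concretely, for each directory entry the manager enqueues a call to `job.run` via RQ. A minimal sketch of that step, with a placeholder URL and the job fields used below:

```python
import redis
from rq import Queue

redis_conn = redis.from_url("redis://redis:6379/0")
queue = Queue("low", connection=redis_conn)

# One spider job; an RQ worker imports job.py and calls run(job=...).
entry = {
    "url": "https://www.example.com/",
    "type": "REGIONAL_CHAPTER",
    "level": "DE:ORTSVERBAND",
    "state": None,
    "district": None,
    "city": None,
}
queue.enqueue("job.run", job_timeout="300s", kwargs={"job": entry})
```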
""" + logging.info('Waiting for redis at %s' % REDIS_URL) + redis_success = False + while not redis_success: + try: + redis_conn = redis.from_url(REDIS_URL) + redis_success = True + except Exception as ex: + logging.error(ex) + time.sleep(5) + + queue = Queue('low', connection=redis_conn) + # refresh our local clone of the green directory logging.info("Refreshing green-directory clone") clone_data_directory() @@ -104,7 +121,7 @@ def create_jobs(datastore_client, url=None): logging.error("Error in %s: 'url' key missing (%s)", repr_entry(entry), entry['urls'][index]) - # ensure the passed URL argument is really there, even if not part + # Ensure the passed URL argument is really there, even if not part # of the directory. if url and count == 0: logging.info("Adding job for URL %s which is not part of green-directory", url) @@ -115,55 +132,63 @@ def create_jobs(datastore_client, url=None): "state": None, "district": None, "city": None, - "index": int(random.uniform(1000000, 9999999)), }) count = 0 + errorcount = 0 logging.info("Writing jobs") - entities = [] - + count = 0 for entry in input_entries: - key = datastore_client.key(config.JOB_DATASTORE_KIND, entry["url"]) - entity = datastore.Entity(key=key) - entity.update({ - "created": datetime.utcnow(), - "type": entry["type"], - "level": entry["level"], - "state": entry["state"], - "district": entry["district"], - "city": entry["city"], - "index": int(random.uniform(1000000, 9999999)), - }) - entities.append(entity) + try: + _ = queue.enqueue('job.run', + job_timeout='300s', + at_front=random.choice([True, False]), + # keywords args passes on the job function + kwargs={ + 'job': entry, + }) - # commmit to DB - for chunk in chunks(entities, 300): - logging.debug("Writing jobs chunk of length %d", len(chunk)) - datastore_client.put_multi(chunk) - count += len(chunk) + # Print job for debugging purposes + print(json.dumps(entry)) + + #logging.debug("Added job with ID %s for URL %s" % (enqueued_job.id, entry['url'])) + count += 1 + except Exception as e: + errorcount += 1 + logging.error("Error adding job for URL %s: %s" % (entry['url'], e)) + + # Write kubernetes Job + make_k8s_job(entry, count) + + count += 1 logging.info("Writing jobs done, %s jobs added", count) + logging.info("%d errors while writing jobs", errorcount) -@tenacity.retry(wait=tenacity.wait_exponential(), - retry=tenacity.retry_if_exception_type(Aborted)) -def get_job_from_queue(datastore_client): - """ - Returns a URL from the queue - """ - out = None +def make_k8s_job(job_data, count): + now = datetime.utcnow().strftime('%Y%m%d%H%M') + urlhash = sha256(job_data['url'].encode('utf-8')).hexdigest()[0:12] + job_name = f'gs-{now}-{urlhash}' + filename = f'{job_name}.yaml' + batch_folder = math.floor(count / config.K8S_JOB_BATCH_SIZE) + output_dir = os.path.join(config.K8S_JOBS_PATH, str(batch_folder)) + os.makedirs(output_dir, exist_ok=True) + output_path = os.path.join(output_dir, filename) + job_json = json.dumps(job_data) + job_flag = f'\'--job={job_json}\'' - with datastore_client.transaction(): - query = datastore_client.query(kind=config.JOB_DATASTORE_KIND, - order=['index']) - for entity in query.fetch(limit=1): - logging.debug("Got job: %s", entity) - out = dict(entity) - out["url"] = entity.key.name - datastore_client.delete(entity.key) + with open(config.K8S_JOB_TEMPLATE, "r") as template_file: + template = template_file.read() + + template = template.replace('JOB_NAME', job_name) + template = template.replace('POD_NAME', job_name) + template = 
template.replace('JOB_FLAG', job_flag) + + with open(output_path, "w") as output: + output.write(template) - return out def repr_entry(entry): """ diff --git a/manager/job_template.yaml b/manager/job_template.yaml new file mode 100644 index 0000000..98cf395 --- /dev/null +++ b/manager/job_template.yaml @@ -0,0 +1,67 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: JOB_NAME + namespace: marian + labels: + app: green-spider +spec: + activeDeadlineSeconds: 600 + ttlSecondsAfterFinished: 600 + completions: 1 + backoffLimit: 3 + + # Pod template + template: + metadata: + name: POD_NAME + namespace: marian + labels: + app: green-spider + spec: + restartPolicy: Never + nodeSelector: + giantswarm.io/machine-pool: 5n27k + # affinity: + # podAntiAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # - labelSelector: + # matchExpressions: + # - key: app + # operator: In + # values: + # - green-spider + # topologyKey: topology.kubernetes.io/region + containers: + - name: spider + image: quay.io/netzbegruenung/green-spider:20211031-chromium93 + imagePullPolicy: IfNotPresent + command: + - python3 + - cli.py + - --credentials-path=/secrets/datastore-writer.json + - --loglevel=debug + - spider + - JOB_FLAG + volumeMounts: + - name: secrets + mountPath: "/secrets" + readOnly: true + - name: shared + mountPath: /dev/shm + resources: + requests: + cpu: 1000m + memory: 5000M + volumes: + - name: secrets + secret: + secretName: green-spider + items: + - key: datastore-writer.json + path: datastore-writer.json + - key: screenshots-uploader.json + path: screenshots-uploader.json + - name: shared + emptyDir: {} diff --git a/rating/__init__.py b/rating/__init__.py index 2071c23..4d247b5 100644 --- a/rating/__init__.py +++ b/rating/__init__.py @@ -10,6 +10,8 @@ from rating import contact_link from rating import favicon from rating import feeds from rating import https +from rating import network_payload +from rating import network_requests from rating import no_network_errors from rating import no_script_errors from rating import no_third_party_cookies @@ -39,6 +41,8 @@ def calculate_rating(results): 'FEEDS': feeds, 'HTTPS': https, 'HTTP_RESPONSE_DURATION': response_duration, + 'NETWORK_PAYLOAD': network_payload, + 'NETWORK_REQUESTS': network_requests, 'NO_NETWORK_ERRORS': no_network_errors, 'NO_SCRIPT_ERRORS': no_script_errors, 'NO_THIRD_PARTY_COOKIES': no_third_party_cookies, diff --git a/rating/network_payload.py b/rating/network_payload.py new file mode 100644 index 0000000..0fb6e65 --- /dev/null +++ b/rating/network_payload.py @@ -0,0 +1,57 @@ +""" +This rater evaluates the amount of data transferred for a page load. + +Currently no score is given. The plan is however to reward site that +cause smaller transfers. + +The rater uses Chrome performance log messages of type +'Network.loadingFinished'. 
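For reference, one decoded entry from the Chrome performance log, as stored in `performance_log` by the load_in_browser check, roughly has the shape sketched below; the rater sums `encodedDataLength` over all `Network.loadingFinished` messages (field values are illustrative):

```python
# Illustrative decoded performance log entry (values are made up).
log_entry = {
    "message": {
        "method": "Network.loadingFinished",
        "params": {
            "requestId": "1000.42",
            "timestamp": 123456.789,
            "encodedDataLength": 14823,  # bytes transferred for this request
        },
    },
    "webview": "...",
}
```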
+""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'number' + default_value = 0 + depends_on_checks = ['load_in_browser'] + max_score = 1.0 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + payloads_for_urls = [] + + for url in self.check_results['load_in_browser']: + payload = 0 + + if (self.check_results['load_in_browser'][url]['performance_log'] == [] or + self.check_results['load_in_browser'][url]['performance_log'] is None): + continue + + for lentry in self.check_results['load_in_browser'][url]['performance_log']: + if lentry['message']['method'] == 'Network.loadingFinished': + payload += lentry['message']['params']['encodedDataLength'] + + payloads_for_urls.append(payload) + + # Calculate score based on the largest value found for a URL. + # See https://github.com/netzbegruenung/green-spider/issues/11#issuecomment-600307544 + # for details. + if len(payloads_for_urls) > 0: + value = max(payloads_for_urls) + if value < 994000: + score = 1 + elif value < 1496000: + score = .5 + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/network_requests.py b/rating/network_requests.py new file mode 100644 index 0000000..b3a0229 --- /dev/null +++ b/rating/network_requests.py @@ -0,0 +1,57 @@ +""" +This rater evaluates the number of network requests made. + +Currently no score is given. The plan is however to reward site that +use only few requests. + +The rater uses Chrome performance log messages of type +'Network.requestWillBeSent'. +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'number' + default_value = 0 + depends_on_checks = ['load_in_browser'] + max_score = 1.0 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + num_requests_for_urls = [] + + for url in self.check_results['load_in_browser']: + num_requests = 0 + + if (self.check_results['load_in_browser'][url]['performance_log'] == [] or + self.check_results['load_in_browser'][url]['performance_log'] is None): + continue + + for lentry in self.check_results['load_in_browser'][url]['performance_log']: + if lentry['message']['method'] == 'Network.requestWillBeSent': + num_requests += 1 + + num_requests_for_urls.append(num_requests) + + # Calculate score based on the largest value found for a URL. + # See https://github.com/netzbegruenung/green-spider/issues/11#issuecomment-600307544 + # for details. 
+ if len(num_requests_for_urls) > 0: + value = max(num_requests_for_urls) + if value <= 28: + score = 1.0 + elif value <= 38: + score = 0.5 + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/response_duration.py b/rating/response_duration.py index 6f22d84..f5cb8f2 100644 --- a/rating/response_duration.py +++ b/rating/response_duration.py @@ -9,7 +9,7 @@ from rating.abstract_rater import AbstractRater class Rater(AbstractRater): rating_type = 'number' - default_value = False + default_value = 0 depends_on_checks = ['page_content'] max_score = 1.0 diff --git a/requirements.txt b/requirements.txt index c4c8eab..9aed040 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,42 @@ -beautifulsoup4==4.8.1 +beautifulsoup4==4.9.3 +cachetools==4.2.2 +certifi==2020.12.5 +cffi==1.14.5 +chardet==3.0.4 +click==7.1.2 +cssselect==1.1.0 dnspython==1.16.0 -feedparser==5.2.1 -GitPython -google-cloud-datastore==1.10.0 -html-similarity==0.3.2 +docker==4.4.1 +feedparser==6.0.8 +gitdb==4.0.7 +GitPython==3.1.14 +google-api-core==1.26.3 +google-auth==1.30.0 +google-cloud-core==1.6.0 +google-cloud-datastore==1.15.3 +google-cloud-storage==1.38.0 +googleapis-common-protos==1.53.0 +html-similarity==0.3.3 httpretty==0.9.7 -pyopenssl==18.0.0 -PyYAML -requests==2.22.0 -responses==0.10.15 -# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296 -selenium==3.8.0 +idna==2.10 +parsel==1.6.0 +protobuf==3.15.8 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pycparser==2.20 +pyOpenSSL==20.0.1 +pytz==2021.1 +PyYAML==5.4.1 +redis==3.5.3 +requests==2.25.1 +responses==0.13.3 +rq==1.8.0 +rsa==4.7.2 +selenium==3.141.0 +smmap==3.0.4 smmap2==2.0.5 -tenacity==5.0.2 -urllib3==1.25.9 +soupsieve==2.2.1 +tenacity==5.1.5 +urllib3==1.26.4 +w3lib==1.22.0 +websocket-client==0.59.0 diff --git a/spider/spider.py b/spider/spider.py index 2b08251..c2912d0 100644 --- a/spider/spider.py +++ b/spider/spider.py @@ -16,7 +16,7 @@ from google.cloud import datastore import checks import config -import jobs +import manager import rating def check_and_rate_site(entry): @@ -54,10 +54,17 @@ def check_and_rate_site(entry): for key in result['rating']: result['score'] += result['rating'][key]['score'] - # remove full HTML page content and hyperlinks to safe some storage + # Remove bigger result portions to safe some storage: + # - HTML page content + # - Hyperlinks + # - Performnance log try: for url in result['checks']['page_content']: del result['checks']['page_content'][url]['content'] + + for url in result['checks']['load_in_browser']: + del result['checks']['load_in_browser'][url]['performance_log'] + del result['checks']['hyperlinks'] except: pass @@ -80,6 +87,7 @@ def test_url(url): result = check_and_rate_site(entry=job) pprint(result) + def execute_single_job(datastore_client, job, entity_kind): """ Executes spider for one single job @@ -103,9 +111,11 @@ def execute_single_job(datastore_client, job, entity_kind): 'rating': result['rating'], 'score': result['score'], } + entity.update(record) try: datastore_client.put(entity) + logging.debug("Successfully wrote record to database") except InvalidArgument as ex: logging.error("Could not write result: %s", ex) except Exception as ex: @@ -116,7 +126,7 @@ def work_of_queue(datastore_client, entity_kind): Take job from queue and finish it until there are no more jobs """ while True: - job = jobs.get_job_from_queue(datastore_client) + job = manager.get_job_from_queue(datastore_client) if 
job is None:
            logging.info("No more jobs. Exiting.")
            break
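To try the refactored pipeline end to end for a single site, the Python-level equivalent of `cli.py spider --url ...` is a thin wrapper around `spider.test_url()`. A minimal sketch, meant to run inside the spider container where Chrome is available; the URL is a placeholder:

```python
# Minimal diagnostic run for one URL, mirroring `cli.py spider --url ...`.
import logging

from spider import spider

logging.basicConfig(level=logging.DEBUG)

# Builds a one-off job for the URL, runs all checks and raters,
# and pretty-prints the result instead of writing it to the datastore.
spider.test_url("https://www.example.com/")
```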