mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-02 17:14:51 +02:00
Make and use a versioned docker image (#279)
* Revert redis module to 4.1.0
* Revert dnspython to 2.1.0
* Revert click to 8.0.3
* Specify alpine 3.16.2, reorganize into multiple steps
* Replace 'latest' with 'main' everywhere
* Fix deprecation warnings
* Add Google root certificates
* Re-order APK packages, write list after installing
* Create VERSION file during docker image build
* Pin chromium version
This commit is contained in:
parent
024ef118dd
commit
5e723c94db
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -8,4 +8,5 @@ kubernetes/green-spider-secret.yaml
|
|||
/volumes
|
||||
/screenshots
|
||||
/k8s-jobs
|
||||
/VERSION
|
||||
.env
|
||||
|
|
33
Dockerfile
33
Dockerfile
|
@ -1,16 +1,30 @@
|
|||
FROM alpine:3.16
|
||||
FROM alpine:3.16.2
|
||||
|
||||
ENV CHROMIUM_VERSION=106.0.5249.119-r1
|
||||
|
||||
RUN echo "http://dl-cdn.alpinelinux.org/alpine/edge/main" >> /etc/apk/repositories && \
|
||||
echo "http://dl-cdn.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories && \
|
||||
apk --update --no-cache add ca-certificates \
|
||||
chromium=$CHROMIUM_VERSION \
|
||||
chromium-chromedriver=$CHROMIUM_VERSION \
|
||||
py3-cryptography python3-dev py3-grpcio py3-wheel py3-pip py3-lxml py3-yaml \
|
||||
build-base git icu-libs libssl1.1 libssl3 libxml2 libxml2-dev libxslt libxslt-dev \
|
||||
libffi-dev openssl-dev cargo
|
||||
|
||||
RUN apk info -v | sort
|
||||
|
||||
WORKDIR /workdir
|
||||
|
||||
ADD requirements.txt /workdir/
|
||||
# Execute time consuming compilations in a separate step
|
||||
RUN python3 -m pip install libcst==0.4.7 sgmllib3k==1.0.0
|
||||
|
||||
RUN echo "http://dl-4.alpinelinux.org/alpine/edge/main/" >> /etc/apk/repositories && \
|
||||
echo "http://dl-4.alpinelinux.org/alpine/edge/community/" >> /etc/apk/repositories && \
|
||||
apk --update --no-cache add ca-certificates chromium chromium-chromedriver py3-cryptography \
|
||||
python3-dev py3-grpcio py3-wheel py3-pip py3-lxml py3-yaml \
|
||||
build-base git icu-libs libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev cargo && \
|
||||
pip install -r requirements.txt && \
|
||||
apk del build-base
|
||||
ADD https://pki.google.com/roots.pem /google_roots.pem
|
||||
ENV GRPC_DEFAULT_SSL_ROOTS_FILE_PATH=/google_roots.pem
|
||||
|
||||
ADD requirements.txt /workdir/
|
||||
RUN pip install -r requirements.txt
|
||||
|
||||
RUN python3 -m pip freeze
|
||||
|
||||
ADD cli.py /workdir/
|
||||
ADD manager /workdir/manager
|
||||
|
@ -20,3 +34,4 @@ ADD rating /workdir/rating
|
|||
ADD spider /workdir/spider
|
||||
ADD export /workdir/export
|
||||
ADD job.py /workdir/
|
||||
ADD VERSION /workdir/VERSION
|
||||
|
|
8
Makefile
8
Makefile
|
@ -1,11 +1,13 @@
|
|||
IMAGE := quay.io/netzbegruenung/green-spider:latest
|
||||
IMAGE := quay.io/netzbegruenung/green-spider:main
|
||||
|
||||
DB_ENTITY := spider-results
|
||||
|
||||
VERSION = $(shell git describe --exact-match --tags 2> /dev/null || git rev-parse HEAD)
|
||||
|
||||
.PHONY: dockerimage spider export
|
||||
|
||||
# Build docker image
|
||||
dockerimage:
|
||||
dockerimage: VERSION
|
||||
docker build --progress plain -t $(IMAGE) .
|
||||
|
||||
# Fill the queue with spider jobs, one for each site.
|
||||
|
@ -50,3 +52,5 @@ test:
|
|||
$(IMAGE) \
|
||||
-m unittest discover -p '*_test.py' -v
|
||||
|
||||
VERSION:
|
||||
@echo $(VERSION) > VERSION
|
||||
|
|
|
@ -68,8 +68,8 @@ docker run --rm -ti \
|
|||
-v $(pwd)/screenshots:/screenshots \
|
||||
-v $(pwd)/volumes/chrome-userdir:/opt/chrome-userdir \
|
||||
--shm-size=2g \
|
||||
quay.io/netzbegruenung/green-spider:latest python3 cli.py \
|
||||
quay.io/netzbegruenung/green-spider:main python3 cli.py \
|
||||
--credentials-path /secrets/datastore-writer.json \
|
||||
--loglevel debug \
|
||||
spider --job '{"url": "https://gruene-porta-westfalica.de/", "city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}'
|
||||
spider --job '{"url": "https://gruene-porta-westfalica.de/home/", "city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}'
|
||||
```
|
||||
|
|
|
@ -49,7 +49,7 @@ class Checker(AbstractChecker):
|
|||
|
||||
# IPv4
|
||||
try:
|
||||
answers = dns.resolver.query(hostname, "A")
|
||||
answers = dns.resolver.resolve(hostname, "A")
|
||||
result['resolvable_ipv4'] = True
|
||||
for rdata in answers:
|
||||
result['ipv4_addresses'].append(rdata.address)
|
||||
|
@ -58,7 +58,7 @@ class Checker(AbstractChecker):
|
|||
|
||||
# IPv6
|
||||
try:
|
||||
answers = dns.resolver.query(hostname, "AAAA")
|
||||
answers = dns.resolver.resolve(hostname, "AAAA")
|
||||
result['resolvable_ipv6'] = True
|
||||
for rdata in answers:
|
||||
result['ipv6_addresses'].append(rdata.address)
|
||||
|
|
|
@ -36,7 +36,7 @@ class Checker(AbstractChecker):
|
|||
page_content = self.previous_results['page_content'][url]
|
||||
|
||||
if page_content['content'] is None:
|
||||
logging.warn("Content for URL %s is None" % url)
|
||||
logging.warning("Content for URL %s is None" % url)
|
||||
|
||||
content[url] = page_content['content']
|
||||
|
||||
|
|
|
@ -119,20 +119,20 @@ class Checker(AbstractChecker):
|
|||
'screenshots': check_responsiveness_results['screenshots'],
|
||||
}
|
||||
except TimeoutException as e:
|
||||
logging.warn("TimeoutException when checking responsiveness for %s: %s" % (url, e))
|
||||
logging.warning("TimeoutException when checking responsiveness for %s: %s" % (url, e))
|
||||
pass
|
||||
except tenacity.RetryError as re:
|
||||
logging.warn("RetryError when checking responsiveness for %s: %s" % (url, re))
|
||||
logging.warning("RetryError when checking responsiveness for %s: %s" % (url, re))
|
||||
pass
|
||||
|
||||
# Scroll page to bottom, to load all lazy-loading resources.
|
||||
try:
|
||||
self.scroll_to_bottom()
|
||||
except TimeoutException as e:
|
||||
logging.warn("TimeoutException in scroll_to_bottom for %s: %s" % (url, e))
|
||||
logging.warning("TimeoutException in scroll_to_bottom for %s: %s" % (url, e))
|
||||
pass
|
||||
except tenacity.RetryError as re:
|
||||
logging.warn("RetryError in scroll_to_bottom for %s: %s" % (url, re))
|
||||
logging.warning("RetryError in scroll_to_bottom for %s: %s" % (url, re))
|
||||
pass
|
||||
|
||||
# CSS collection
|
||||
|
@ -148,23 +148,23 @@ class Checker(AbstractChecker):
|
|||
continue
|
||||
font_families.add(font_family.lower())
|
||||
except StaleElementReferenceException as e:
|
||||
logging.warn("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e))
|
||||
logging.warning("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e))
|
||||
continue
|
||||
|
||||
results[url]['font_families'] = sorted(list(font_families))
|
||||
|
||||
except TimeoutException as e:
|
||||
logging.warn("TimeoutException when collecting CSS elements for %s: %s" % (url, e))
|
||||
logging.warning("TimeoutException when collecting CSS elements for %s: %s" % (url, e))
|
||||
pass
|
||||
|
||||
# Process cookies.
|
||||
try:
|
||||
results[url]['cookies'] = self.get_cookies()
|
||||
except TimeoutException as e:
|
||||
logging.warn("TimeoutException when collecting cookies %s: %s" % (url, e))
|
||||
logging.warning("TimeoutException when collecting cookies %s: %s" % (url, e))
|
||||
pass
|
||||
except tenacity.RetryError as re:
|
||||
logging.warn("RetryError when collecting cookies for %s: %s" % (url, re))
|
||||
logging.warning("RetryError when collecting cookies for %s: %s" % (url, re))
|
||||
pass
|
||||
|
||||
for logentry in self.driver.get_log('performance'):
|
||||
|
@ -209,7 +209,7 @@ class Checker(AbstractChecker):
|
|||
blob.upload_from_file(my_file, content_type="image/png")
|
||||
blob.make_public()
|
||||
except Exception as e:
|
||||
logging.warn("Error uploading screenshot for %s: %s" % (screenshot['url'], e))
|
||||
logging.warning("Error uploading screenshot for %s: %s" % (screenshot['url'], e))
|
||||
continue
|
||||
|
||||
try:
|
||||
|
@ -232,7 +232,7 @@ class Checker(AbstractChecker):
|
|||
datastore_client.put(entity)
|
||||
logging.debug("Successfully stored screenshot metadata for %s" % screenshot['screenshot_url'])
|
||||
except Exception as e:
|
||||
logging.warn("Error in %s: %s" % (screenshot['url'], e))
|
||||
logging.warning("Error in %s: %s" % (screenshot['url'], e))
|
||||
|
||||
|
||||
# Remove screenshots part from results
|
||||
|
@ -289,7 +289,7 @@ class Checker(AbstractChecker):
|
|||
success = self.driver.save_screenshot(abs_filepath)
|
||||
|
||||
if not success:
|
||||
logging.warn("Failed to create screenshot %s" % abs_filepath)
|
||||
logging.warning("Failed to create screenshot %s" % abs_filepath)
|
||||
continue
|
||||
|
||||
result['screenshots'].append({
|
||||
|
|
|
@ -15,7 +15,7 @@ services:
|
|||
|
||||
# manager manages the job queue.
|
||||
manager:
|
||||
image: quay.io/netzbegruenung/green-spider:latest
|
||||
image: quay.io/netzbegruenung/green-spider:main
|
||||
command: >
|
||||
python3 cli.py
|
||||
--credentials-path /secrets/datastore-writer.json
|
||||
|
|
2
job.py
2
job.py
|
@ -16,7 +16,7 @@ from google.cloud import datastore
|
|||
# via the environment JOB_TIMEOUT variable.
|
||||
TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))
|
||||
|
||||
DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:latest'
|
||||
DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:main'
|
||||
|
||||
CREDENTIALS_PATH = '/secrets/datastore-writer.json'
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: spider
|
||||
image: quay.io/netzbegruenung/green-spider:latest
|
||||
image: quay.io/netzbegruenung/green-spider:main
|
||||
imagePullPolicy: Always
|
||||
args:
|
||||
- "--credentials-path=/secrets/datastore-writer.json"
|
||||
|
|
|
@ -1,36 +0,0 @@
|
|||
apiVersion: batch/v1beta1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: green-spider-screenshotter
|
||||
spec:
|
||||
# Saturday at 1:05 UTC
|
||||
schedule: "5 1 * * 6"
|
||||
jobTemplate:
|
||||
spec:
|
||||
parallelism: 1
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: screenshotter
|
||||
image: quay.io/netzbegruenung/green-spider-screenshotter:latest
|
||||
imagePullPolicy: Always
|
||||
volumeMounts:
|
||||
- name: secrets
|
||||
mountPath: "/secrets"
|
||||
readOnly: true
|
||||
resources:
|
||||
requests:
|
||||
cpu: 800m
|
||||
memory: 4000M
|
||||
# No restarts, as this would mean to start over.
|
||||
# TODO: Maintain a queue and change this.
|
||||
restartPolicy: Never
|
||||
volumes:
|
||||
- name: secrets
|
||||
secret:
|
||||
secretName: green-spider
|
||||
items:
|
||||
- key: datastore-writer.json
|
||||
path: datastore-writer.json
|
||||
- key: screenshots-uploader.json
|
||||
path: screenshots-uploader.json
|
|
@ -12,7 +12,7 @@ spec:
|
|||
spec:
|
||||
containers:
|
||||
- name: spider
|
||||
image: quay.io/netzbegruenung/green-spider:latest
|
||||
image: quay.io/netzbegruenung/green-spider:main
|
||||
imagePullPolicy: Always
|
||||
args:
|
||||
- "--credentials-path=/secrets/datastore-writer.json"
|
||||
|
|
|
@ -3,19 +3,19 @@ cachetools==4.2.4
|
|||
certifi==2021.10.8
|
||||
cffi==1.15.1
|
||||
chardet==3.0.4
|
||||
click==8.1.3
|
||||
click==8.0.3
|
||||
cssselect==1.1.0
|
||||
dnspython==2.2.1
|
||||
dnspython==2.1.0
|
||||
docker==4.4.1
|
||||
feedparser==6.0.8
|
||||
gitdb==4.0.9
|
||||
GitPython==3.1.24
|
||||
google-api-core==2.2.2
|
||||
google-auth==2.3.3
|
||||
google-cloud-core==2.2.1
|
||||
google-cloud-datastore==2.4.0
|
||||
google-cloud-storage==1.43.0
|
||||
googleapis-common-protos==1.53.0
|
||||
google-api-core==2.10.2
|
||||
google-auth==2.13.0
|
||||
google-cloud-core==2.3.2
|
||||
google-cloud-datastore==2.9.0
|
||||
google-cloud-storage==2.5.0
|
||||
googleapis-common-protos==1.56.4
|
||||
html-similarity==0.3.3
|
||||
httpretty==1.1.4
|
||||
idna==2.10
|
||||
|
@ -25,9 +25,9 @@ protobuf==4.21.8
|
|||
pyasn1==0.4.8
|
||||
pyasn1-modules==0.2.8
|
||||
pycparser==2.21
|
||||
pyOpenSSL==22.1.0
|
||||
pyOpenSSL==22.0.0
|
||||
pytz==2021.3
|
||||
redis==4.3.4
|
||||
redis==4.1.0
|
||||
requests==2.26.0
|
||||
responses==0.22.0
|
||||
rq==1.8.0
|
||||
|
|
Loading…
Reference in a new issue