mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-11 21:16:07 +02:00
Compare commits
10 commits
97cca76af2
...
6ade3514e0
Author | SHA1 | Date | |
---|---|---|---|
6ade3514e0 | |||
5e183dc34a | |||
99d0980686 | |||
ee167db02f | |||
a84c2244bd | |||
e2bc62e51a | |||
f2a4be1b9d | |||
1136637ed8 | |||
ab58152b8e | |||
0c0bcbf54e |
|
@ -1,6 +1,6 @@
|
|||
FROM alpine:3.19@sha256:c5b1261d6d3e43071626931fc004f70149baeba2c8ec672bd4f27761f8e1ad6b
|
||||
|
||||
ENV CHROMIUM_VERSION=122.0.6261.94-r0
|
||||
ENV CHROMIUM_VERSION=123.0.6312.86-r0
|
||||
|
||||
RUN echo "http://dl-cdn.alpinelinux.org/alpine/v3.19/community" >> /etc/apk/repositories && \
|
||||
apk --update --no-cache add ca-certificates \
|
||||
|
|
2
Makefile
2
Makefile
|
@ -34,7 +34,7 @@ dryrun:
|
|||
# Run the spider.
|
||||
# OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES is a workaround for mac OS.
|
||||
spider:
|
||||
OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES venv/bin/rq --verbose --burst high default low
|
||||
OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES venv/bin/rq worker --burst high default low
|
||||
|
||||
export:
|
||||
docker run --rm -ti \
|
||||
|
|
|
@ -5,6 +5,7 @@ Loads feeds linked from pages and collects information on the contained content
|
|||
import logging
|
||||
from time import mktime
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
|
||||
import feedparser
|
||||
|
||||
|
@ -102,7 +103,7 @@ class Checker(AbstractChecker):
|
|||
max_date = timestamp
|
||||
|
||||
if max_date is not None:
|
||||
return datetime.fromtimestamp(max_date)
|
||||
return datetime.fromtimestamp(max_date).replace(tzinfo=timezone.utc)
|
||||
|
||||
|
||||
def find_first_entry(self, entries):
|
||||
|
@ -117,4 +118,4 @@ class Checker(AbstractChecker):
|
|||
min_date = timestamp
|
||||
|
||||
if min_date is not None:
|
||||
return datetime.fromtimestamp(min_date)
|
||||
return datetime.fromtimestamp(min_date).replace(tzinfo=timezone.utc)
|
||||
|
|
|
@ -6,6 +6,7 @@ from checks import html_head, page_content
|
|||
from checks import load_feeds
|
||||
from checks.config import Config
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
|
||||
from pprint import pprint
|
||||
|
||||
|
@ -63,8 +64,8 @@ class TestFeed(unittest.TestCase):
|
|||
self.assertEqual(result['http://example.com/feed.xml'], {
|
||||
'exception': None,
|
||||
'average_interval': 340359,
|
||||
'first_entry': datetime(2003, 5, 30, 11, 6, 42),
|
||||
'latest_entry': datetime(2003, 6, 3, 9, 39, 21),
|
||||
'first_entry': datetime(2003, 5, 30, 11, 6, 42, tzinfo=timezone.utc),
|
||||
'latest_entry': datetime(2003, 6, 3, 9, 39, 21, tzinfo=timezone.utc),
|
||||
'num_entries': 2,
|
||||
'title': 'Liftoff News',
|
||||
})
|
||||
|
|
10
cli.py
10
cli.py
|
@ -34,6 +34,10 @@ if __name__ == "__main__":
|
|||
# subcommands
|
||||
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
|
||||
|
||||
# 'spider' subcommand to execute a job from the queue and store the result.
|
||||
spider_parser = subparsers.add_parser('spider', help='Execute a spider job from the queue and store the result.')
|
||||
spider_parser.add_argument('--job', help='JSON job data')
|
||||
|
||||
# 'dryrun' subcommand to spider one URL without writing results back.
|
||||
dryrun_parser = subparsers.add_parser('dryrun', help='Spider an arbitrary URL without storing results. ')
|
||||
dryrun_parser.add_argument('url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
|
||||
|
@ -82,6 +86,12 @@ if __name__ == "__main__":
|
|||
result = spider.check_and_rate_site({"url": args.url, "type": "REGIONAL_CHAPTER", "level": "DE:KREISVERBAND", "state": "Unnamed", "district": "Unnamed"})
|
||||
print(json.dumps(result, indent=2, sort_keys=True, ensure_ascii=False, cls=DateTimeEncoder))
|
||||
|
||||
elif args.command == 'spider':
|
||||
from spider import spider
|
||||
datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
|
||||
job = json.loads(args.job)
|
||||
spider.execute_single_job(datastore_client, job, "spider-results")
|
||||
|
||||
else:
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
|
|
@ -135,7 +135,7 @@ wait_for_server
|
|||
echo "Executing remote commands..."
|
||||
|
||||
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP << EOF
|
||||
DEBIAN_FRONTEND=noninteractive
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
echo ""
|
||||
echo "Update package sources"
|
||||
|
|
|
@ -98,21 +98,22 @@ function wait_for_server()
|
|||
create_server $1
|
||||
wait_for_server
|
||||
|
||||
echo "\nExecuting remote commands..."
|
||||
echo ""
|
||||
echo "Executing remote commands..."
|
||||
|
||||
SSHCMD="ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP"
|
||||
SCPCMD="scp -o StrictHostKeyChecking=no -q"
|
||||
|
||||
$SSHCMD << EOF
|
||||
DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
echo ""
|
||||
echo "Update package sources"
|
||||
apt-get update -q
|
||||
|
||||
echo ""
|
||||
echo "Install dependencies"
|
||||
apt-get install -y apt-transport-https ca-certificates curl git gnupg2 lsb-release software-properties-common
|
||||
apt-get install -y apt-transport-https git gnupg2 software-properties-common
|
||||
|
||||
echo ""
|
||||
echo "Add Docker key"
|
||||
|
@ -140,55 +141,69 @@ $SSHCMD << EOF
|
|||
|
||||
echo ""
|
||||
echo "Test docker"
|
||||
docker pull hello-world
|
||||
docker run --rm hello-world
|
||||
|
||||
mkdir /root/secrets
|
||||
EOF
|
||||
|
||||
echo "\nCopying files to server"
|
||||
echo ""
|
||||
echo "Copying files to server"
|
||||
$SCPCMD secrets/datastore-writer.json root@$SERVER_IP:/root/secrets/datastore-writer.json
|
||||
$SCPCMD docker-compose.yaml root@$SERVER_IP:/root/docker-compose.yaml
|
||||
$SCPCMD job.py root@$SERVER_IP:/root/job.py
|
||||
$SCPCMD requirements.txt root@$SERVER_IP:/root/requirements.txt
|
||||
|
||||
echo "\nInstalling Python dependencies"
|
||||
$SSHCMD apt-get install -y python3-pip build-essential
|
||||
echo ""
|
||||
echo "Installing Python dependencies"
|
||||
$SSHCMD DEBIAN_FRONTEND=noninteractive apt-get install -y python3-pip build-essential
|
||||
$SSHCMD pip3 install -r requirements.txt
|
||||
|
||||
echo "\nCloning green-directory"
|
||||
echo ""
|
||||
echo "Cloning green-directory"
|
||||
$SSHCMD git clone --progress --depth 1 https://$GIT_TOKEN@git.verdigado.com/NB-Public/green-directory.git /root/cache/green-directory
|
||||
|
||||
echo "\nPulling Docker images"
|
||||
echo ""
|
||||
echo "Pulling container images"
|
||||
$SSHCMD docker compose pull --quiet redis manager
|
||||
|
||||
echo "\nStarting redis in background"
|
||||
$SSHCMD docker compose up -d redis
|
||||
echo ""
|
||||
echo "Starting redis in background"
|
||||
$SSHCMD docker compose up --detach redis
|
||||
sleep 5
|
||||
|
||||
echo "\nCreating jobs"
|
||||
echo ""
|
||||
echo "Creating jobs"
|
||||
$SSHCMD docker compose up manager
|
||||
|
||||
echo "\nQueue status:"
|
||||
$SSHCMD rq info --url redis://localhost:6379/0
|
||||
echo ""
|
||||
echo "Queue status"
|
||||
$SSHCMD rq info --url redis://localhost:6379
|
||||
|
||||
echo "\nStarting worker (first run)"
|
||||
$SSHCMD rq worker --burst high default low --url redis://localhost:6379/0
|
||||
echo ""
|
||||
echo "Starting worker for first run"
|
||||
$SSHCMD rq worker --burst high default low --url redis://localhost:6379
|
||||
|
||||
echo "\nRe-queuing failed jobs"
|
||||
echo ""
|
||||
echo "Re-queuing failed jobs"
|
||||
$SSHCMD rq requeue --queue low --all --url redis://localhost:6379
|
||||
|
||||
echo "\nQueue status:"
|
||||
$SSHCMD rq info --url redis://localhost:6379/0
|
||||
echo ""
|
||||
echo "Queue status:"
|
||||
$SSHCMD rq info --url redis://localhost:6379
|
||||
|
||||
echo "\nStarting worker (second run)"
|
||||
$SSHCMD rq worker --burst high default low --url redis://localhost:6379/0
|
||||
echo ""
|
||||
echo "Starting worker for second run"
|
||||
$SSHCMD JOB_TIMEOUT=100 rq worker --burst high default low --url redis://localhost:6379
|
||||
|
||||
echo "\nDone."
|
||||
echo ""
|
||||
echo "Done."
|
||||
|
||||
|
||||
|
||||
# Delete the box
|
||||
echo "\nDeleting server $SERVERNAME with ID $SERVER_ID"
|
||||
echo ""
|
||||
echo "Deleting server $SERVERNAME with ID $SERVER_ID"
|
||||
curl -s -X DELETE -H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $API_TOKEN" \
|
||||
https://api.hetzner.cloud/v1/servers/$SERVER_ID
|
||||
|
|
|
@ -17,14 +17,12 @@ services:
|
|||
manager:
|
||||
image: ghcr.io/netzbegruenung/green-spider:latest
|
||||
command: >
|
||||
python3 cli.py
|
||||
python3 -u cli.py
|
||||
--credentials-path /secrets/datastore-writer.json
|
||||
--loglevel debug
|
||||
manager
|
||||
environment:
|
||||
REDIS_URL: redis://redis:6379/0
|
||||
GIT_USERNAME: ${GIT_USERNAME}
|
||||
GIT_PASSWORD: ${GIT_PASSWORD}
|
||||
volumes:
|
||||
- ${PWD}/secrets:/secrets
|
||||
- ${PWD}/cache/green-directory:/workdir/cache/green-directory
|
||||
|
|
2
job.py
2
job.py
|
@ -12,7 +12,7 @@ import logging
|
|||
import docker
|
||||
from google.cloud import datastore
|
||||
|
||||
# Maximum oper-job runtime in seconds. This can be increased for second, third attempt
|
||||
# Maximum per-job runtime in seconds. This can be increased for second, third attempt
|
||||
# via the environment JOB_TIMEOUT variable.
|
||||
TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))
|
||||
|
||||
|
|
|
@ -126,10 +126,11 @@ def create_jobs(url=None):
|
|||
|
||||
count = 0
|
||||
errorcount = 0
|
||||
jobscount = 0
|
||||
logging.info("Writing jobs")
|
||||
|
||||
count = 0
|
||||
for entry in input_entries:
|
||||
count += 1
|
||||
try:
|
||||
_ = queue.enqueue('job.run',
|
||||
job_timeout=JOB_TTL,
|
||||
|
@ -141,7 +142,7 @@ def create_jobs(url=None):
|
|||
|
||||
# Print job for debugging purposes
|
||||
logging.debug(f"Created job: {json.dumps(entry)}")
|
||||
count += 1
|
||||
jobscount += 1
|
||||
except Exception as e:
|
||||
errorcount += 1
|
||||
logging.error("Error adding job for URL %s: %s" % (entry['url'], e))
|
||||
|
@ -149,10 +150,9 @@ def create_jobs(url=None):
|
|||
# Write kubernetes Job
|
||||
make_k8s_job(entry, count)
|
||||
|
||||
count += 1
|
||||
|
||||
logging.info("Writing jobs done, %s jobs added", count)
|
||||
logging.info("%d errors while writing jobs", errorcount)
|
||||
logging.info("Processed %s entries", count)
|
||||
logging.info("Created %s jobs", jobscount)
|
||||
logging.info("%d errors", errorcount)
|
||||
|
||||
|
||||
def make_k8s_job(job_data, count):
|
||||
|
|
|
@ -1,42 +1,42 @@
|
|||
beautifulsoup4==4.12.3
|
||||
cachetools==5.3.3
|
||||
certifi==2023.7.22
|
||||
cffi==1.15.1
|
||||
certifi==2024.2.2
|
||||
cffi==1.16.0
|
||||
chardet==5.2.0
|
||||
click>=7,<8
|
||||
cssselect==1.2.0
|
||||
dnspython==2.6.1
|
||||
docker==4.4.1
|
||||
feedparser==6.0.8
|
||||
gitdb==4.0.9
|
||||
GitPython==3.1.41
|
||||
google-api-core==2.10.2
|
||||
google-auth==2.28.1
|
||||
google-cloud-core==2.3.2
|
||||
google-cloud-datastore==2.9.0
|
||||
google-cloud-storage==2.5.0
|
||||
googleapis-common-protos==1.56.4
|
||||
docker==5.0.3
|
||||
feedparser==6.0.11
|
||||
gitdb==4.0.11
|
||||
GitPython==3.1.43
|
||||
google-api-core==2.18.0
|
||||
google-auth==2.29.0
|
||||
google-cloud-core==2.4.1
|
||||
google-cloud-datastore==2.19.0
|
||||
google-cloud-storage==2.16.0
|
||||
googleapis-common-protos==1.63.0
|
||||
html-similarity==0.3.3
|
||||
httpretty==1.1.4
|
||||
idna==3.6
|
||||
kubernetes==29.0.0
|
||||
parsel==1.6.0
|
||||
protobuf==4.25.3
|
||||
pyasn1==0.4.8
|
||||
pyasn1-modules==0.2.8
|
||||
pycparser==2.21
|
||||
pyOpenSSL==23.0.0
|
||||
pytz==2021.3
|
||||
redis==4.1.0
|
||||
parsel==1.6.0 # html-similarity 0.3.3 depends on parsel==1.6.0
|
||||
protobuf==4.25.3 # <5.0.0 requested by google-api-core 2.18.0
|
||||
pyasn1==0.6.0
|
||||
pyasn1-modules==0.4.0
|
||||
pycparser==2.22
|
||||
pyOpenSSL==24.1.0
|
||||
pytz==2024.1
|
||||
redis==5.0.3
|
||||
requests==2.31.0
|
||||
responses==0.22.0
|
||||
rq==1.16.0
|
||||
responses==0.25.0
|
||||
rq==1.16.1
|
||||
rsa==4.9
|
||||
selenium==3.141.0
|
||||
smmap==5.0.1
|
||||
smmap2==3.0.1
|
||||
soupsieve==2.2.1
|
||||
soupsieve==2.5
|
||||
tenacity==8.2.3
|
||||
urllib3==1.26.7
|
||||
w3lib==1.22.0
|
||||
urllib3==1.26.18 # v2 requires code changes in timeout handling
|
||||
w3lib==2.1.2
|
||||
websocket-client==1.7.0
|
||||
|
|
|
@ -72,6 +72,38 @@ def check_and_rate_site(entry):
|
|||
return result
|
||||
|
||||
|
||||
def execute_single_job(datastore_client, job, entity_kind):
    """
    Executes spider for one single job and writes the result to the database.

    Args:
        datastore_client: Client object used to build the entity key
            (``key()``) and to store the entity (``put()``).
        job: Job data. Must contain a ``url`` entry; it is checked with
            ``validate_job`` and passed to ``check_and_rate_site`` as the
            entry to spider.
        entity_kind: Kind under which the result entity is stored. The job
            URL is used as the entity name.

    Returns:
        None. Errors raised while writing the entity are logged and not
        re-raised; errors from validation or from the checks themselves are
        not handled here and propagate to the caller.
    """
    # Local import keeps this block self-contained; the module-level imports
    # are not guaranteed to include `timezone`.
    from datetime import timezone

    validate_job(job)

    logging.info("Starting job %s", job["url"])
    result = check_and_rate_site(entry=job)

    logging.info("Job %s finished checks", job["url"])
    logging.info("Job %s writing to DB", job["url"])

    key = datastore_client.key(entity_kind, job["url"])
    entity = datastore.Entity(key=key)
    record = {
        # Timezone-aware UTC timestamp. datetime.utcnow() is deprecated since
        # Python 3.12 and returns a naive value.
        # NOTE(review): the datastore client is expected to store the same
        # instant for a naive-UTC and an aware-UTC datetime - confirm.
        'created': datetime.now(timezone.utc),
        'meta': result['meta'],
        'checks': result['checks'],
        'rating': result['rating'],
        'score': result['score'],
    }

    entity.update(record)
    try:
        datastore_client.put(entity)
        logging.debug("Successfully wrote record to database")
    except Exception as ex:
        # Writing is best effort: a failed write is logged and the job still
        # counts as executed. A separate InvalidArgument handler with the very
        # same body used to precede this one; it is folded in here.
        # NOTE(review): assumes InvalidArgument derives from Exception - confirm.
        logging.error("Could not write result: %s", ex)
|
||||
|
||||
|
||||
def test_url(url):
|
||||
"""
|
||||
Run the spider for a single URL and print the result.
|
||||
|
|
Loading…
Reference in a new issue