mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-04 10:03:40 +02:00
Merge pull request #34 from netzbegruenung/dockerize
Spider in Docker-container laufen lassen
This commit is contained in:
commit
f44ed61bd6
6
.dockerignore
Normal file
6
.dockerignore
Normal file
|
@ -0,0 +1,6 @@
|
|||
# Paths kept out of the Docker build context.
# The Dockerfile only copies spider.py into the image, so none of these
# directories are needed by the build.

# Version control metadata
.git

# Project directories that are not part of the spider image
docs
webapp

# Local-only material
secrets
temp
venv
|
31
Dockerfile
Normal file
31
Dockerfile
Normal file
|
@ -0,0 +1,31 @@
|
|||
# Image that runs spider.py with Google Chrome, chromedriver and PhantomJS
# installed alongside Python 3.
#
# NOTE(review): debian stretch is an old release; confirm that its apt mirrors
# still resolve before relying on fresh rebuilds of this image.
FROM debian:stretch-slim

# OS packages: download/unpack tools, the shared libraries installed for
# Google Chrome, and Python 3 with pip.
# - bzip2 is listed explicitly because the PhantomJS archive below is a
#   .tar.bz2 and `tar xjf` needs it.
# - Chrome is installed through apt-get (not `dpkg -i`) so that any dependency
#   missing from the list above is resolved instead of failing the build.
# - Chrome is installed in this same layer so the apt package lists can be
#   removed before the layer is committed.
RUN apt-get update \
    && apt-get install -y \
        bzip2 \
        fonts-liberation \
        git \
        gnupg \
        libappindicator3-1 \
        libasound2 \
        libatk-bridge2.0-0 \
        libatk1.0-0 \
        libcairo2 \
        libcups2 \
        libdbus-1-3 \
        libexpat1 \
        libgdk-pixbuf2.0-0 \
        libglib2.0-0 \
        libgtk-3-0 \
        libnspr4 \
        libnss3 \
        libpango-1.0-0 \
        libpangocairo-1.0-0 \
        libx11-6 \
        libx11-xcb1 \
        libxcb1 \
        libxcomposite1 \
        libxcursor1 \
        libxdamage1 \
        libxext6 \
        libxfixes3 \
        libxi6 \
        libxrandr2 \
        libxrender1 \
        libxss1 \
        libxtst6 \
        lsb-release \
        python3 \
        python3-pip \
        unzip \
        wget \
        xdg-utils \
    && wget -O /tmp/google-chrome-stable_current_amd64.deb \
        https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
    && apt-get install -y /tmp/google-chrome-stable_current_amd64.deb \
    && rm /tmp/google-chrome-stable_current_amd64.deb \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies of the spider, installed in a single step.
# --no-cache-dir keeps pip's download cache out of the image layer.
# NOTE(review): GitPython, idna and PyYAML are unpinned, as in the original;
# pin them once the versions in use are known.
RUN pip3 install --no-cache-dir \
        GitPython \
        PyYAML \
        beautifulsoup4==4.6.0 \
        certifi==2018.1.18 \
        idna \
        requests==2.18.4 \
        selenium==3.11.0 \
        smmap2==2.0.3 \
        urllib3==1.22

# chromedriver 2.38 is unpacked into / (the location the original build used,
# since it unzipped in the default working directory).
RUN wget -O /tmp/chromedriver_linux64.zip \
        https://chromedriver.storage.googleapis.com/2.38/chromedriver_linux64.zip \
    && unzip /tmp/chromedriver_linux64.zip -d / \
    && rm /tmp/chromedriver_linux64.zip

# PhantomJS 2.1.1: only the binary is kept; the archive and the unpacked tree
# are removed in the same layer so they do not end up in the image.
RUN wget -O /tmp/phantomjs-2.1.1-linux-x86_64.tar.bz2 \
        https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2 \
    && tar xjf /tmp/phantomjs-2.1.1-linux-x86_64.tar.bz2 -C /tmp \
    && mv /tmp/phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/local/bin/ \
    && rm -rf /tmp/phantomjs-2.1.1-linux-x86_64 /tmp/phantomjs-2.1.1-linux-x86_64.tar.bz2

# Plain local file copy: COPY rather than ADD.
COPY spider.py /

ENTRYPOINT ["python3", "/spider.py"]
|
12
Makefile
12
Makefile
|
@ -2,13 +2,11 @@
|
|||
|
||||
.PHONY: webapp
|
||||
|
||||
# Python venv for running the spider locally
|
||||
venv:
|
||||
virtualenv -p python3 venv
|
||||
venv/bin/pip3 install -r requirements.txt
|
||||
|
||||
spider: venv
|
||||
venv/bin/python ./spider.py
|
||||
# Build docker image and run spider in Docker container
|
||||
spider:
|
||||
docker pull debian:stretch-slim
|
||||
docker build -t spider .
|
||||
docker run --rm -ti -v $(PWD)/webapp/dist/data:/out spider
|
||||
|
||||
screenshots: venv
|
||||
docker pull netzbegruenung/green-spider-screenshotter:latest
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -37,7 +37,7 @@ green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
|
|||
green_direcory_data_path = 'data/countries/de'
|
||||
green_directory_local_path = './cache/green-directory'
|
||||
|
||||
result_path = './webapp/dist/data'
|
||||
result_path = '/out'
|
||||
|
||||
# IP address of the newthinking GCMS server
|
||||
gcms_ip = "91.102.13.20"
|
||||
|
@ -67,7 +67,7 @@ def dir_entries():
|
|||
if not filepath.endswith(".yaml"):
|
||||
continue
|
||||
|
||||
with open(filepath, 'r') as yamlfile:
|
||||
with open(filepath, 'r', encoding='utf8') as yamlfile:
|
||||
for doc in yaml.load_all(yamlfile):
|
||||
yield doc
|
||||
|
||||
|
|
16387
webapp/dist/data/spider_result.json
vendored
16387
webapp/dist/data/spider_result.json
vendored
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue