mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-02 00:54:52 +02:00
Add creation of a job for a particular URL
This commit is contained in:
parent
01155b58a5
commit
3a7a553534
26
spider.py
26
spider.py
|
@ -64,10 +64,13 @@ def chunks(the_list, size):
|
||||||
yield the_list[i:i + size]
|
yield the_list[i:i + size]
|
||||||
|
|
||||||
|
|
||||||
def create_jobs():
|
def create_jobs(url=None):
|
||||||
"""
|
"""
|
||||||
Read all URLs from green directory and fill a job database
|
Read all URLs from green directory and fill a job database
|
||||||
with one job per URL
|
with one job per URL.
|
||||||
|
|
||||||
|
Alternatively, if the url argument is given, only the given URL
|
||||||
|
will be added as a spider job.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# refresh our local clone of the green directory
|
# refresh our local clone of the green directory
|
||||||
|
@ -78,6 +81,8 @@ def create_jobs():
|
||||||
logging.info("Processing green-directory")
|
logging.info("Processing green-directory")
|
||||||
input_entries = []
|
input_entries = []
|
||||||
|
|
||||||
|
count = 0
|
||||||
|
|
||||||
for entry in dir_entries():
|
for entry in dir_entries():
|
||||||
|
|
||||||
if 'type' not in entry:
|
if 'type' not in entry:
|
||||||
|
@ -93,6 +98,8 @@ def create_jobs():
|
||||||
if entry['urls'][index]['type'] == "WEBSITE":
|
if entry['urls'][index]['type'] == "WEBSITE":
|
||||||
website_url = entry['urls'][index]['url']
|
website_url = entry['urls'][index]['url']
|
||||||
if website_url:
|
if website_url:
|
||||||
|
if url is not None and website_url != url:
|
||||||
|
continue
|
||||||
input_entries.append({
|
input_entries.append({
|
||||||
"url": website_url,
|
"url": website_url,
|
||||||
"level": entry.get("level"),
|
"level": entry.get("level"),
|
||||||
|
@ -100,10 +107,23 @@ def create_jobs():
|
||||||
"district": entry.get("district"),
|
"district": entry.get("district"),
|
||||||
"city": entry.get("city"),
|
"city": entry.get("city"),
|
||||||
})
|
})
|
||||||
|
count += 1
|
||||||
except NameError:
|
except NameError:
|
||||||
logging.error("Error in %s: 'url' key missing (%s)",
|
logging.error("Error in %s: 'url' key missing (%s)",
|
||||||
repr_entry(entry), entry['urls'][index])
|
repr_entry(entry), entry['urls'][index])
|
||||||
|
|
||||||
|
# ensure the passed URL argument is really there, even if not part
|
||||||
|
# of the directory.
|
||||||
|
if url and count == 0:
|
||||||
|
logging.info("Adding job for URL %s which is not part of green-directory", url)
|
||||||
|
input_entries.append({
|
||||||
|
"url": url,
|
||||||
|
"level": None,
|
||||||
|
"state": None,
|
||||||
|
"district": None,
|
||||||
|
"city": None,
|
||||||
|
})
|
||||||
|
|
||||||
# randomize order, to distribute requests over servers
|
# randomize order, to distribute requests over servers
|
||||||
logging.debug("Shuffling input URLs")
|
logging.debug("Shuffling input URLs")
|
||||||
random.seed()
|
random.seed()
|
||||||
|
@ -831,6 +851,6 @@ if __name__ == "__main__":
|
||||||
logging.debug("Called command %s", args.command)
|
logging.debug("Called command %s", args.command)
|
||||||
|
|
||||||
if args.command == 'jobs':
|
if args.command == 'jobs':
|
||||||
create_jobs()
|
create_jobs(args.url)
|
||||||
else:
|
else:
|
||||||
work_of_queue()
|
work_of_queue()
|
||||||
|
|
Loading…
Reference in a new issue