From 3a7a5535342718ad86b0875ffcb3159b96e70d3a Mon Sep 17 00:00:00 2001 From: Marian Steinbach Date: Thu, 23 Aug 2018 10:01:22 +0200 Subject: [PATCH] Add creating of a job for a particular URL --- spider.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/spider.py b/spider.py index 93f376c..e334915 100644 --- a/spider.py +++ b/spider.py @@ -64,10 +64,13 @@ def chunks(the_list, size): yield the_list[i:i + size] -def create_jobs(): +def create_jobs(url=None): """ Read all URLs from green directory and fill a job database - with one job per URL + with one job per URL. + + Alternatively, if the url argument is given, only the given URL + will be added as a spider job. """ # refresh our local clone of the green directory @@ -78,6 +81,8 @@ def create_jobs(): logging.info("Processing green-directory") input_entries = [] + count = 0 + for entry in dir_entries(): if 'type' not in entry: @@ -93,6 +98,8 @@ def create_jobs(): if entry['urls'][index]['type'] == "WEBSITE": website_url = entry['urls'][index]['url'] if website_url: + if url is not None and website_url != url: + continue input_entries.append({ "url": website_url, "level": entry.get("level"), @@ -100,10 +107,23 @@ def create_jobs(): "district": entry.get("district"), "city": entry.get("city"), }) + count += 1 except NameError: logging.error("Error in %s: 'url' key missing (%s)", repr_entry(entry), entry['urls'][index]) + # ensure the passed URL argument is really there, even if not part + # of the directory. + if url and count == 0: + logging.info("Adding job for URL %s which is not part of green-directory", url) + input_entries.append({ + "url": url, + "level": None, + "state": None, + "district": None, + "city": None, + }) + # randomize order, to distribute requests over servers logging.debug("Shuffling input URLs") random.seed() @@ -831,6 +851,6 @@ if __name__ == "__main__": logging.debug("Called command %s", args.command) if args.command == 'jobs': - create_jobs() + create_jobs(args.url) else: work_of_queue()