mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-04-29 15:44:52 +02:00
Some stability improvements
This commit is contained in:
parent
ea05bf5192
commit
278261db26
67
spider.py
67
spider.py
|
@ -22,7 +22,7 @@ import yaml
|
||||||
# configuration
|
# configuration
|
||||||
|
|
||||||
# number of parallel processes to use for crawling
|
# number of parallel processes to use for crawling
|
||||||
concurrency = 4
|
concurrency = 3
|
||||||
|
|
||||||
# connection timeout for website checks (seconds)
|
# connection timeout for website checks (seconds)
|
||||||
connect_timeout = 5
|
connect_timeout = 5
|
||||||
|
@ -366,19 +366,27 @@ def check_site(entry):
|
||||||
result['details']['urlchecks'] = sorted(result['details']['urlchecks'], key=lambda url: url['url'])
|
result['details']['urlchecks'] = sorted(result['details']['urlchecks'], key=lambda url: url['url'])
|
||||||
|
|
||||||
# collect icons
|
# collect icons
|
||||||
|
icons = set()
|
||||||
for c in result['details']['urlchecks']:
|
for c in result['details']['urlchecks']:
|
||||||
if 'icon' in c['content'] and c['content']['icon']:
|
if 'content' not in c:
|
||||||
if c['content']['icon'] in result['details']['icons']:
|
continue
|
||||||
continue
|
if c['content'] is None:
|
||||||
result['details']['icons'].append(c['content']['icon'])
|
logging.warning("No content for %s" % entry['url'])
|
||||||
|
continue
|
||||||
|
if c['content']['icon'] is not None:
|
||||||
|
icons.add(c['content']['icon'])
|
||||||
|
result['details']['icons'] = sorted(list(icons))
|
||||||
|
|
||||||
# collect feeds
|
# collect feeds
|
||||||
|
feeds = set()
|
||||||
for c in result['details']['urlchecks']:
|
for c in result['details']['urlchecks']:
|
||||||
|
if c['content'] is None:
|
||||||
|
logging.warning("No content for %s" % entry['url'])
|
||||||
|
continue
|
||||||
if 'feeds' in c['content'] and len(c['content']['feeds']):
|
if 'feeds' in c['content'] and len(c['content']['feeds']):
|
||||||
for feed in c['content']['feeds']:
|
for feed in c['content']['feeds']:
|
||||||
if feed in result['details']['feeds']:
|
feeds.add(feed)
|
||||||
continue
|
result['details']['feeds'] = sorted(list(feeds))
|
||||||
result['details']['feeds'].append(feed)
|
|
||||||
|
|
||||||
|
|
||||||
### Derive criteria
|
### Derive criteria
|
||||||
|
@ -416,9 +424,12 @@ def check_site(entry):
|
||||||
result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
|
result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
|
||||||
else:
|
else:
|
||||||
links = set()
|
links = set()
|
||||||
for item in result['details']['urlchecks']:
|
if result['details']['urlchecks'] is None:
|
||||||
if item['content']['canonical_link'] is not None:
|
logging.warning("No urlchecks for %s" % entry['url'])
|
||||||
links.add(item['content']['canonical_link'])
|
else:
|
||||||
|
for item in result['details']['urlchecks']:
|
||||||
|
if item['content']['canonical_link'] is not None:
|
||||||
|
links.add(item['content']['canonical_link'])
|
||||||
if len(links) == 1:
|
if len(links) == 1:
|
||||||
result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
|
result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
|
||||||
|
|
||||||
|
@ -435,10 +446,12 @@ def check_site(entry):
|
||||||
for item in result['details']['urlchecks']:
|
for item in result['details']['urlchecks']:
|
||||||
if item['error'] is None:
|
if item['error'] is None:
|
||||||
durations.append(item['duration'])
|
durations.append(item['duration'])
|
||||||
result['result']['HTTP_RESPONSE_DURATION'] = {
|
val = round(statistics.mean(durations))
|
||||||
'value': round(statistics.mean(durations)),
|
result['result']['HTTP_RESPONSE_DURATION']['value'] = val
|
||||||
'score': 1.0/statistics.mean(durations) * 500
|
if val < 100:
|
||||||
}
|
result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
|
||||||
|
elif val < 1000:
|
||||||
|
result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
|
||||||
|
|
||||||
# Overall score
|
# Overall score
|
||||||
for item in result['result'].keys():
|
for item in result['result'].keys():
|
||||||
|
@ -458,6 +471,7 @@ def main():
|
||||||
get_green_directory()
|
get_green_directory()
|
||||||
|
|
||||||
# build the list of website URLs to run checks for
|
# build the list of website URLs to run checks for
|
||||||
|
logging.info("Processing green-directory")
|
||||||
input_entries = []
|
input_entries = []
|
||||||
|
|
||||||
for entry in dir_entries():
|
for entry in dir_entries():
|
||||||
|
@ -485,34 +499,47 @@ def main():
|
||||||
"city": entry.get("city"),
|
"city": entry.get("city"),
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
# randomize order, to distribute requests over servers
|
# randomize order, to distribute requests over servers
|
||||||
|
logging.info("Shuffling input URLs")
|
||||||
random.seed()
|
random.seed()
|
||||||
random.shuffle(input_entries)
|
random.shuffle(input_entries)
|
||||||
|
|
||||||
# run checks
|
# run checks
|
||||||
|
logging.info("Starting checks")
|
||||||
results = {}
|
results = {}
|
||||||
|
|
||||||
pool = Pool(concurrency)
|
pool = Pool(concurrency)
|
||||||
for entry in input_entries:
|
for ientry in input_entries:
|
||||||
results[entry['url']] = pool.apply_async(check_site, kwds={'entry': entry})
|
logging.info("Submitting %s to job pool" % ientry['url'])
|
||||||
|
results[ientry['url']] = pool.apply_async(check_site, kwds={'entry': ientry})
|
||||||
pool.close()
|
pool.close()
|
||||||
pool.join()
|
pool.join()
|
||||||
|
|
||||||
|
logging.info("Checks are finished")
|
||||||
|
|
||||||
# Restructure result from dict of ApplyResult
|
# Restructure result from dict of ApplyResult
|
||||||
# to list of dicts and sort in stable way
|
# to list of dicts and sort in stable way
|
||||||
results2 = []
|
json_result = []
|
||||||
done = set()
|
done = set()
|
||||||
|
|
||||||
|
logging.info("Restructuring results")
|
||||||
|
|
||||||
# convert results from ApplyResult to dict
|
# convert results from ApplyResult to dict
|
||||||
for url in sorted(results.keys()):
|
for url in sorted(results.keys()):
|
||||||
if url not in done:
|
if url not in done:
|
||||||
results2.append(results[url].get())
|
logging.info("Getting result for %s" % url)
|
||||||
|
try:
|
||||||
|
resultsitem = results[url].get()
|
||||||
|
json_result.append(resultsitem)
|
||||||
|
except Exception as e:
|
||||||
|
logging.error("Error ehn getting result for '%s': %s" % (url, e))
|
||||||
done.add(url)
|
done.add(url)
|
||||||
|
|
||||||
# Write result as JSON
|
# Write result as JSON
|
||||||
output_filename = os.path.join(result_path, "spider_result.json")
|
output_filename = os.path.join(result_path, "spider_result.json")
|
||||||
with open(output_filename, 'w', encoding="utf8") as jsonfile:
|
with open(output_filename, 'w', encoding="utf8") as jsonfile:
|
||||||
json.dump(results2, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
|
json.dump(json_result, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
Loading…
Reference in a new issue