mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-02 09:04:51 +02:00
Add icon retrieval
This commit is contained in:
parent
313f56f39e
commit
9672e41dba
18
spider.py
18
spider.py
|
@ -4,6 +4,7 @@ from bs4 import BeautifulSoup
|
||||||
from git import Repo
|
from git import Repo
|
||||||
from multiprocessing import Pool
|
from multiprocessing import Pool
|
||||||
from socket import gethostbyname_ex
|
from socket import gethostbyname_ex
|
||||||
|
from urllib.parse import urljoin
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
import certifi
|
import certifi
|
||||||
import json
|
import json
|
||||||
|
@ -148,7 +149,17 @@ def check_content(r):
|
||||||
result['canonical_link'] = None
|
result['canonical_link'] = None
|
||||||
link = soup.find('link', rel='canonical')
|
link = soup.find('link', rel='canonical')
|
||||||
if link:
|
if link:
|
||||||
result['canonical_link'] = link.get('href')
|
result['canonical_link'] = urljoin(r.url, link.get('href'))
|
||||||
|
|
||||||
|
# icon
|
||||||
|
result['icon'] = None
|
||||||
|
link = soup.find('link', rel='icon')
|
||||||
|
if link:
|
||||||
|
result['icon'] = urljoin(r.url, link.get('href'))
|
||||||
|
else:
|
||||||
|
link = soup.find('link', rel='shortcut icon')
|
||||||
|
if link:
|
||||||
|
result['icon'] = urljoin(r.url, link.get('href'))
|
||||||
|
|
||||||
# feed links
|
# feed links
|
||||||
result['feeds'] = []
|
result['feeds'] = []
|
||||||
|
@ -157,10 +168,10 @@ def check_content(r):
|
||||||
|
|
||||||
if len(rss_links) > 0:
|
if len(rss_links) > 0:
|
||||||
for l in rss_links:
|
for l in rss_links:
|
||||||
result['feeds'].append(l.get('href'))
|
result['feeds'].append(urljoin(r.url, l.get('href')))
|
||||||
if len(atom_links) > 0:
|
if len(atom_links) > 0:
|
||||||
for l in rss_links:
|
for l in rss_links:
|
||||||
result['feeds'].append(l.get('href'))
|
result['feeds'].append(urljoin(r.url, l.get('href')))
|
||||||
|
|
||||||
# generator meta tag
|
# generator meta tag
|
||||||
result['generator'] = None
|
result['generator'] = None
|
||||||
|
@ -180,6 +191,7 @@ def check_content(r):
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def check_site(url):
|
def check_site(url):
|
||||||
"""
|
"""
|
||||||
Performs our site check and returns results as a dict.
|
Performs our site check and returns results as a dict.
|
||||||
|
|
Loading…
Reference in a new issue