Mirror of http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/darknet-lantern.git (synced 2025-05-16 20:26:58 +00:00)
add new col for hostname, check for scheme and add http:// by default
Parent: c6518363c1
Commit: e764c5e6f3
1 changed file with 12 additions and 7 deletions
@@ -68,7 +68,7 @@ def get_output_file():
     try:
         return pd.read_csv(args.output)
     except FileNotFoundError:
-        return pd.DataFrame(columns=["URL","Name"])
+        return pd.DataFrame(columns=["Hostname","URL","Name"])
 
 def get_blacklist_file():
     try:
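With the new "Hostname" column the output DataFrame carries three columns instead of two, which is why every output_file.loc[-1] = [...] assignment further down in this diff now supplies three values. A minimal sketch of the resulting shape, assuming the same append pattern the script uses (the sample row is invented):

    import pandas as pd

    # Empty output frame with the three-column layout introduced by this commit
    output_file = pd.DataFrame(columns=["Hostname", "URL", "Name"])

    # Hypothetical row: hostname, full URL, page title
    output_file.loc[-1] = ["example.onion", "http://example.onion/", "Example"]
    output_file.index += 1
    output_file = output_file.sort_index()
    print(output_file)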
@@ -94,7 +94,7 @@ def add_urls(urls):
     global output_file
     for url in urls:
         parsed_url = urllib.parse.urlparse(url)
-        if (output_file['URL'] == url).any() or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
+        if (output_file['Hostname'] == parsed_url.hostname).any() or parsed_url.hostname is None or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
             continue
 
         # Get information about the URL
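The added "parsed_url.hostname is None" guard matters because urllib.parse.urlparse() only fills in hostname when the string carries a scheme (or at least a leading //); a bare entry such as example.onion is parsed entirely into .path. Without the guard, the later .endswith(".onion") call would raise AttributeError on None. A quick illustration (the sample strings are assumptions, not taken from the lantern's CSV files):

    import urllib.parse

    # With a scheme the hostname is extracted as expected
    print(urllib.parse.urlparse("http://example.onion/page").hostname)   # example.onion

    # Without a scheme everything lands in .path and hostname is None,
    # so hostname.endswith(".onion") would raise AttributeError
    print(urllib.parse.urlparse("example.onion/page").hostname)          # None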
@@ -103,13 +103,13 @@ def add_urls(urls):
             req = session.get(url)
         except requests.exceptions.ConnectionError:
             print_colors(f'[E] Dead 3rd party link: {url}. Adding anyway')
-            output_file.loc[-1] = [url, ""]
+            output_file.loc[-1] = [parsed_url.hostname, url, ""]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
 
         if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
-            output_file.loc[-1] = [url, ""]
+            output_file.loc[-1] = [parsed_url.hostname, url, ""]
         else:
             forbidden = False
             html = BeautifulSoup(req.text, features="lxml")
@@ -121,7 +121,7 @@ def add_urls(urls):
                 forbidden = True
             if forbidden:
                 continue
-            output_file.loc[-1] = [url, title]
+            output_file.loc[-1] = [parsed_url.hostname, url, repr(title)[1:-1]]
             output_file.index += 1
             output_file = output_file.sort_index()
 
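Storing repr(title)[1:-1] instead of title keeps the title on a single escaped line: repr() returns a quoted Python literal with control characters escaped, and slicing off the first and last character removes the surrounding quotes, presumably so embedded newlines cannot break a CSV row. A small illustration of the effect (the sample title is invented):

    title = "My onion\nservice"       # hypothetical multi-line <title> text
    print(repr(title))                # 'My onion\nservice'  (quoted, escaped)
    print(repr(title)[1:-1])          # My onion\nservice    (outer quotes stripped)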
@@ -217,8 +217,13 @@ def crawl_url(url):
 crawler_file = crawler_file.sort_index()
 
 for row in verified_csv_file.itertuples():
-    vcsv_urls.append(row.URL)
-    vcsv_hostnames.append(urllib.parse.urlparse(row.URL).hostname)
+    if urllib.parse.urlparse(row.URL).scheme:
+        vcsv_urls.append(row.URL)
+        vcsv_hostnames.append(urllib.parse.urlparse(row.URL).hostname)
+    else:
+        url = "http://" + row.URL
+        vcsv_urls.append(url)
+        vcsv_hostnames.append(urllib.parse.urlparse(url).hostname)
 
 for i, url in enumerate(vcsv_urls):
     if not crawler_file[crawler_file['URL'] == url].empty:
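The scheme check amounts to a small normalisation step: if a verified.csv entry has no scheme, prepend http:// by default so urlparse() can see the hostname at all. A standalone sketch of the same logic, assuming it were factored into a helper (the function name and sample inputs are mine, not from the repository):

    import urllib.parse

    def normalize_url(raw):
        # Keep entries that already carry a scheme; otherwise default to
        # http:// so urlparse() extracts a hostname instead of a bare path.
        if urllib.parse.urlparse(raw).scheme:
            return raw
        return "http://" + raw

    print(normalize_url("example.onion"))           # http://example.onion
    print(normalize_url("http://example.onion"))    # http://example.onion
    print(urllib.parse.urlparse(normalize_url("example.onion")).hostname)   # example.onion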