diff --git a/scripts/crawler.py b/scripts/crawler.py
index 3b5178c..053c388 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -68,7 +68,7 @@ def get_output_file():
     try:
         return pd.read_csv(args.output)
     except FileNotFoundError:
-        return pd.DataFrame(columns=["URL","Name"])
+        return pd.DataFrame(columns=["Hostname","URL","Name"])
 
 def get_blacklist_file():
     try:
@@ -94,7 +94,7 @@ def add_urls(urls):
     global output_file
     for url in urls:
         parsed_url = urllib.parse.urlparse(url)
-        if (output_file['URL'] == url).any() or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
+        if (output_file['Hostname'] == parsed_url.hostname).any() or parsed_url.hostname is None or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
             continue
 
         # Get information about the URL
@@ -103,13 +103,13 @@ def add_urls(urls):
             req = session.get(url)
         except requests.exceptions.ConnectionError:
             print_colors(f'[E] Dead 3rd party link: {url}. Adding anyway')
-            output_file.loc[-1] = [url, ""]
+            output_file.loc[-1] = [parsed_url.hostname, url, ""]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
 
         if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
-            output_file.loc[-1] = [url, ""]
+            output_file.loc[-1] = [parsed_url.hostname, url, ""]
         else:
             forbidden = False
             html = BeautifulSoup(req.text, features="lxml")
@@ -121,7 +121,7 @@
                     forbidden = True
             if forbidden:
                 continue
-            output_file.loc[-1] = [url, title]
+            output_file.loc[-1] = [parsed_url.hostname, url, repr(title)[1:-1]]
         output_file.index += 1
         output_file = output_file.sort_index()
 
@@ -217,8 +217,13 @@ def crawl_url(url):
     crawler_file = crawler_file.sort_index()
 
 for row in verified_csv_file.itertuples():
-    vcsv_urls.append(row.URL)
-    vcsv_hostnames.append(urllib.parse.urlparse(row.URL).hostname)
+    if urllib.parse.urlparse(row.URL).scheme:
+        vcsv_urls.append(row.URL)
+        vcsv_hostnames.append(urllib.parse.urlparse(row.URL).hostname)
+    else:
+        url = "http://" + row.URL
+        vcsv_urls.append(url)
+        vcsv_hostnames.append(urllib.parse.urlparse(url).hostname)
 
 for i, url in enumerate(vcsv_urls):
     if not crawler_file[crawler_file['URL'] == url].empty:
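
Reviewer note (not part of the patch): a minimal sketch of the standard-library behaviour the new checks rely on, using made-up example.onion and title values. urlparse() only yields a hostname when a scheme is present, which is why scheme-less verified.csv entries get an "http://" prefix and why the hostname-is-None guard is needed before calling .endswith(".onion"); repr(title)[1:-1] escapes embedded newlines so a multi-line page title is stored on a single CSV row.

import urllib.parse

# Scheme-less URLs parse with hostname == None (would otherwise break .endswith(".onion")).
assert urllib.parse.urlparse("example.onion/page").hostname is None
assert urllib.parse.urlparse("http://example.onion/page").hostname == "example.onion"

# repr(...)[1:-1] turns a real newline into the two characters backslash + n.
assert repr("Some\nTitle")[1:-1] == "Some\\nTitle"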