add new col for hostname, check for scheme and add http:// by default

cynthia 2025-04-05 11:24:35 +00:00
parent c6518363c1
commit e764c5e6f3


@@ -68,7 +68,7 @@ def get_output_file():
     try:
         return pd.read_csv(args.output)
     except FileNotFoundError:
-        return pd.DataFrame(columns=["URL","Name"])
+        return pd.DataFrame(columns=["Hostname","URL","Name"])
 
 def get_blacklist_file():
     try:
@@ -94,7 +94,7 @@ def add_urls(urls):
     global output_file
     for url in urls:
         parsed_url = urllib.parse.urlparse(url)
-        if (output_file['URL'] == url).any() or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
+        if (output_file['Hostname'] == parsed_url.hostname).any() or parsed_url.hostname is None or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
             continue
 
         # Get information about the URL
@@ -103,13 +103,13 @@ def add_urls(urls):
             req = session.get(url)
         except requests.exceptions.ConnectionError:
             print_colors(f'[E] Dead 3rd party link: {url}. Adding anyway')
-            output_file.loc[-1] = [url, ""]
+            output_file.loc[-1] = [parsed_url.hostname, url, ""]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
 
         if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
-            output_file.loc[-1] = [url, ""]
+            output_file.loc[-1] = [parsed_url.hostname, url, ""]
         else:
             forbidden = False
             html = BeautifulSoup(req.text, features="lxml")
@@ -121,7 +121,7 @@ def add_urls(urls):
                     forbidden = True
             if forbidden:
                 continue
-            output_file.loc[-1] = [url, title]
+            output_file.loc[-1] = [parsed_url.hostname, url, repr(title)[1:-1]]
             output_file.index += 1
             output_file = output_file.sort_index()
@@ -217,8 +217,13 @@ def crawl_url(url):
         crawler_file = crawler_file.sort_index()
 
 for row in verified_csv_file.itertuples():
-    vcsv_urls.append(row.URL)
-    vcsv_hostnames.append(urllib.parse.urlparse(row.URL).hostname)
+    if urllib.parse.urlparse(row.URL).scheme:
+        vcsv_urls.append(row.URL)
+        vcsv_hostnames.append(urllib.parse.urlparse(row.URL).hostname)
+    else:
+        url = "http://" + row.URL
+        vcsv_urls.append(url)
+        vcsv_hostnames.append(urllib.parse.urlparse(url).hostname)
 
 for i, url in enumerate(vcsv_urls):
     if not crawler_file[crawler_file['URL'] == url].empty:
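
The scheme check added in the last hunk matters because urllib.parse only populates .hostname when the URL carries a netloc: a bare CSV entry like "example.onion" parses entirely into .path and yields hostname None. A minimal standalone sketch of that normalization, using only the standard library (the function name here is illustrative, not taken from the repo):

import urllib.parse

def normalize_verified_url(raw_url):
    # A scheme-less entry such as "example.onion" parses entirely into .path,
    # leaving .hostname as None, so prepend "http://" before extracting the host.
    if not urllib.parse.urlparse(raw_url).scheme:
        raw_url = "http://" + raw_url
    return raw_url, urllib.parse.urlparse(raw_url).hostname

print(normalize_verified_url("example.onion"))          # ('http://example.onion', 'example.onion')
print(normalize_verified_url("http://example.onion/x")) # ('http://example.onion/x', 'example.onion')

This mirrors the else branch added for the verified CSV rows, so every entry ends up with a usable URL and a non-empty hostname for the new Hostname column.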