Mirror of http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/darknet-lantern.git (synced 2025-05-16 20:26:58 +00:00)
categorize simplex links and tor onion links
This commit is contained in:
parent bf40730292
commit 8ef1bb0910
1 changed file with 11 additions and 8 deletions
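In short, the diff below adds a fourth "Category" column to the crawler's output (values "SimpleX Chatroom", "SimpleX Server", and "Tor Hidden Service") and a -n/--no-create-crawler-dir flag so the crawler directory is no longer created unconditionally. A hypothetical invocation of the new flag, assuming the changed file is the crawler script (its filename is not shown on this page; crawler.py is only a placeholder):

    python3 crawler.py --no-create-crawler-dir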
@@ -27,8 +27,6 @@ instancepath=rootpath+'www/participants/'+instance
 verifiedcsvfile=instancepath+'/verified.csv'
 blcsvfile=instancepath+'/blacklist.csv'
 crawlerdir=rootpath+'/crawler'
-if not os.path.exists(crawlerdir):
-    os.makedirs(crawlerdir)
 
 parser = argparse.ArgumentParser(
     prog='Lantern crawler',
@@ -45,8 +43,13 @@ parser.add_argument('-c', '--crawler-file',
 parser.add_argument('-b', '--blacklist-file', help='Blacklist CSV files to filter out sites with forbidden words in them',
     type=str, default=blcsvfile)
 parser.add_argument('-V', '--verified-file', help='Input file to read for .onion links to crawl', type=str, default=verifiedcsvfile)
+parser.add_argument('-n', '--no-create-crawler-dir', help='Don\'t create the crawler directory automatically.', action='store_true')
 args = parser.parse_args()
+
+if not args.no_create_crawler_dir:
+    if not os.path.exists(crawlerdir):
+        os.makedirs(crawlerdir)
 
 script_abs_path = os.path.dirname(os.path.abspath(__file__))
 env_path = os.path.join(script_abs_path+"/.env")
 default_env_path = os.path.join(script_abs_path+"/.env.sample")
@@ -78,7 +81,7 @@ def get_output_file():
     try:
         return pd.read_csv(args.output)
     except FileNotFoundError:
-        return pd.DataFrame(columns=["Hostname","URL","Name"])
+        return pd.DataFrame(columns=["Hostname","URL","Name", "Category"])
 
 def get_blacklist_file():
     try:
@@ -105,12 +108,12 @@ def add_urls(urls):
     for url in urls:
         parsed_url = urllib.parse.urlparse(url)
         if IsSimpleXChatroomValid(url) and not (output_file['URL'] == url).any():
-            output_file.loc[-1] = ["", url, "SimpleX Chatroom"]
+            output_file.loc[-1] = ["", url, "", "SimpleX Chatroom"]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
         elif IsSimpleXServerValid(url) and not (output_file['URL'] == url).any():
-            output_file.loc[-1] = ["", url, "SimpleX Server"]
+            output_file.loc[-1] = ["", url, "", "SimpleX Server"]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
@@ -123,13 +126,13 @@ def add_urls(urls):
             req = session.get(url)
         except requests.exceptions.ConnectionError:
             print_colors(f'[E] Dead 3rd party link: {url}. Adding anyway')
-            output_file.loc[-1] = [parsed_url.hostname, url, ""]
+            output_file.loc[-1] = [parsed_url.hostname, url, "", "Tor Hidden Service"]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
 
         if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
-            output_file.loc[-1] = [parsed_url.hostname, url, ""]
+            output_file.loc[-1] = [parsed_url.hostname, url, "", "Tor Hidden Service"]
         else:
             forbidden = False
             html = BeautifulSoup(req.text, features="lxml")
@@ -141,7 +144,7 @@ def add_urls(urls):
                     forbidden = True
             if forbidden:
                 continue
-            output_file.loc[-1] = [parsed_url.hostname, url, repr(title)[1:-1]]
+            output_file.loc[-1] = [parsed_url.hostname, url, repr(title)[1:-1], "Tor Hidden Service"]
         output_file.index += 1
         output_file = output_file.sort_index()
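For illustration, here is a minimal standalone sketch of the prepend idiom used throughout the diff, now carrying the extra "Category" field. The prepend_row helper, the sample URLs, and the sample rows are hypothetical and not part of the repository:

    import pandas as pd

    # Output schema after this commit: a fourth "Category" column.
    output_file = pd.DataFrame(columns=["Hostname", "URL", "Name", "Category"])

    def prepend_row(df, hostname, url, name, category):
        # Same idiom as in the diff: write the row at index -1, shift every
        # index up by one, then sort so the new row sits at the top.
        df.loc[-1] = [hostname, url, name, category]
        df.index += 1
        return df.sort_index()

    # Hypothetical rows for the three categories assigned in the diff.
    output_file = prepend_row(output_file, "", "<simplex chatroom link>", "", "SimpleX Chatroom")
    output_file = prepend_row(output_file, "", "<simplex server link>", "", "SimpleX Server")
    output_file = prepend_row(output_file, "example.onion", "http://example.onion/", "Example", "Tor Hidden Service")
    print(output_file)

After each prepend the newest entry ends up at index 0, so the most recently crawled link appears first when the frame is written out.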