From 8ef1bb0910055d4fa0f813e591523cf199a54b5f Mon Sep 17 00:00:00 2001
From: cynthia
Date: Fri, 18 Apr 2025 10:04:19 +0100
Subject: [PATCH] categorize simplex links and tor onion links

---
 scripts/crawler.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index d5042e5..c7446ea 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -27,8 +27,6 @@ instancepath=rootpath+'www/participants/'+instance
 verifiedcsvfile=instancepath+'/verified.csv'
 blcsvfile=instancepath+'/blacklist.csv'
 crawlerdir=rootpath+'/crawler'
-if not os.path.exists(crawlerdir):
-    os.makedirs(crawlerdir)
 
 parser = argparse.ArgumentParser(
     prog='Lantern crawler',
@@ -45,8 +43,13 @@ parser.add_argument('-c', '--crawler-file',
 parser.add_argument('-b', '--blacklist-file', help='Blacklist CSV files to filter out sites with forbidden words in them',
     type=str, default=blcsvfile)
 parser.add_argument('-V', '--verified-file', help='Input file to read for .onion links to crawl', type=str, default=verifiedcsvfile)
+parser.add_argument('-n', '--no-create-crawler-dir', help='Don\'t create the crawler directory automatically.', action='store_true')
 args = parser.parse_args()
 
+if not args.no_create_crawler_dir:
+    if not os.path.exists(crawlerdir):
+        os.makedirs(crawlerdir)
+
 script_abs_path = os.path.dirname(os.path.abspath(__file__))
 env_path = os.path.join(script_abs_path+"/.env")
 default_env_path = os.path.join(script_abs_path+"/.env.sample")
@@ -78,7 +81,7 @@ def get_output_file():
     try:
         return pd.read_csv(args.output)
     except FileNotFoundError:
-        return pd.DataFrame(columns=["Hostname","URL","Name"])
+        return pd.DataFrame(columns=["Hostname","URL","Name", "Category"])
 
 def get_blacklist_file():
     try:
@@ -105,12 +108,12 @@ def add_urls(urls):
     for url in urls:
         parsed_url = urllib.parse.urlparse(url)
         if IsSimpleXChatroomValid(url) and not (output_file['URL'] == url).any():
-            output_file.loc[-1] = ["", url, "SimpleX Chatroom"]
+            output_file.loc[-1] = ["", url, "", "SimpleX Chatroom"]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
         elif IsSimpleXServerValid(url) and not (output_file['URL'] == url).any():
-            output_file.loc[-1] = ["", url, "SimpleX Server"]
+            output_file.loc[-1] = ["", url, "", "SimpleX Server"]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
@@ -123,13 +126,13 @@
             req = session.get(url)
         except requests.exceptions.ConnectionError:
             print_colors(f'[E] Dead 3rd party link: {url}. Adding anyway')
-            output_file.loc[-1] = [parsed_url.hostname, url, ""]
+            output_file.loc[-1] = [parsed_url.hostname, url, "", "Tor Hidden Service"]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
 
         if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
-            output_file.loc[-1] = [parsed_url.hostname, url, ""]
+            output_file.loc[-1] = [parsed_url.hostname, url, "", "Tor Hidden Service"]
         else:
             forbidden = False
             html = BeautifulSoup(req.text, features="lxml")
@@ -141,7 +144,7 @@
                     forbidden = True
             if forbidden:
                 continue
-            output_file.loc[-1] = [parsed_url.hostname, url, repr(title)[1:-1]]
+            output_file.loc[-1] = [parsed_url.hostname, url, repr(title)[1:-1], "Tor Hidden Service"]
             output_file.index += 1
             output_file = output_file.sort_index()
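
Reviewer note, not part of the patch: below is a minimal, self-contained sketch of the row layout this change introduces, i.e. the four-column ["Hostname", "URL", "Name", "Category"] frame and the loc[-1] / index += 1 / sort_index() prepend idiom that add_urls() uses. The categorize() helper and the sample URLs are hypothetical stand-ins for the script's IsSimpleXChatroomValid / IsSimpleXServerValid checks and the HTTP fetch; only pandas is assumed.

import urllib.parse

import pandas as pd


def categorize(url):
    # Hypothetical stand-in for IsSimpleXChatroomValid / IsSimpleXServerValid;
    # the real checks live in the crawler's helpers, not in this sketch.
    if "#/?v=" in url and "smp=" in url:
        return "SimpleX Chatroom"
    if url.startswith("smp://"):
        return "SimpleX Server"
    return "Tor Hidden Service"


# Same four columns the patch adds to get_output_file().
output_file = pd.DataFrame(columns=["Hostname", "URL", "Name", "Category"])

for url in ["smp://example-fingerprint@example.onion", "http://example.onion/"]:
    parsed_url = urllib.parse.urlparse(url)
    category = categorize(url)
    # SimpleX rows leave Hostname empty, as in the patched add_urls().
    hostname = parsed_url.hostname if category == "Tor Hidden Service" else ""
    # Prepend idiom used throughout crawler.py: write at index -1, shift, re-sort.
    output_file.loc[-1] = [hostname, url, "", category]
    output_file.index += 1
    output_file = output_file.sort_index()

print(output_file)

With every row carrying an explicit Category, consumers of the crawler's CSV can filter SimpleX chatrooms and servers from Tor hidden services without inferring the type from the URL shape.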