From e989f299d235cf4c40997033b217e299c71b5870 Mon Sep 17 00:00:00 2001
From: cynthia
Date: Wed, 2 Apr 2025 12:09:31 +0000
Subject: [PATCH 01/12] add beautifulsoup4 in requirements.txt

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index cbf4a08..a887ce5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+beautifulsoup4==4.13.3
 certifi==2024.12.14
 charset-normalizer==3.4.1
 idna==3.10
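The new dependency is what the crawler introduced in the next patch uses to parse fetched pages and read their <title>. A minimal, self-contained sketch of that usage (the HTML string is made up for illustration, and the lxml parser is assumed to be installed alongside beautifulsoup4):

    from bs4 import BeautifulSoup

    # Hypothetical HTML standing in for a fetched .onion page.
    sample_html = "<html><head><title>Example hidden service</title></head><body></body></html>"

    # Parse with the lxml backend, as crawler.py does, and fall back to an
    # empty title when the page has no <title> element.
    html = BeautifulSoup(sample_html, features="lxml")
    title = html.title.string if html.title is not None else ""
    print(title)  # -> Example hidden service
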
From 0a0233a8a23efa693972bff25f9ea2e2210c5ebd Mon Sep 17 00:00:00 2001
From: cynthia
Date: Thu, 3 Apr 2025 16:43:22 +0000
Subject: [PATCH 02/12] add crawler.py

---
 scripts/crawler.py | 195 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 195 insertions(+)
 create mode 100644 scripts/crawler.py

diff --git a/scripts/crawler.py b/scripts/crawler.py
new file mode 100644
index 0000000..0724324
--- /dev/null
+++ b/scripts/crawler.py
@@ -0,0 +1,195 @@
+from bs4 import BeautifulSoup
+import pandas as pd
+import requests
+import urllib.parse
+import argparse
+import os
+import re
+from utils import print_colors
+
+parser = argparse.ArgumentParser(
+    prog='Lantern crawler',
+    description='Crawls .onion sites for links to more .onion sites')
+
+parser.add_argument('-l', '--limit',
+    help='Page crawl limit per .onion link.', type=int, default=10)
+parser.add_argument('-o', '--output',
+    help='Output CSV file for found .onion links', type=str, default='onion_crawler.csv')
+parser.add_argument('-c', '--crawler-file',
+    help='Crawler CSV file to log .onion sites and the amount crawled', type=str, default='crawler.csv')
+parser.add_argument('verified_csv', help='Input file to read for .onion links to crawl', type=str, default='verified.csv')
+args = parser.parse_args()
+
+tor_host = os.getenv("TOR_HOST")
+tor_port = os.getenv("TOR_PORT")
+session = requests.session()
+session.proxies = {
+    'http': f'socks5h://{tor_host}:{tor_port}',
+    'https': f'socks5h://{tor_host}:{tor_port}'
+}
+# Set user agent too for the crawler
+session.headers.update({'User-Agent': 'LanternCrawler'})
+
+def get_crawler_file():
+    try:
+        # try to read the CSV file
+        return pd.read_csv(args.crawler_file)
+    except FileNotFoundError:
+        # make a new empty crawler file
+        return pd.DataFrame(columns=["URL","Counter"])
+
+def get_output_file():
+    try:
+        return pd.read_csv(args.output)
+    except FileNotFoundError:
+        return pd.DataFrame(columns=["URL","Name"])
+
+# get list of .onion links from the verified.csv file
+verified_csv_file = pd.read_csv(args.verified_csv)
+crawler_file = get_crawler_file()
+output_file = get_output_file()
+vcsv_urls = []
+vcsv_hostnames = []
+crawled_urls = []
+
+class CrawlerResult:
+    def __init__(self, firstp_urls, thirdp_urls):
+        self.firstp_urls = firstp_urls
+        self.thirdp_urls = thirdp_urls
+
+def add_urls(urls):
+    global output_file
+    for url in urls:
+        parsed_url = urllib.parse.urlparse(url)
+        if (output_file['URL'] == url).any() or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
+            continue
+
+        # Get information about the URL
+        print_colors(f'[+] Querying {url} for information.')
+        try:
+            req = session.get(url)
+        except requests.exceptions.ConnectionError:
+            print_colors(f'[E] Dead 3rd party link: {url}. Adding anyway')
+            output_file.loc[-1] = [url, ""]
+            output_file.index += 1
+            output_file = output_file.sort_index()
+            continue
+
+        if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
+            output_file.loc[-1] = [url, ""]
+        else:
+            html = BeautifulSoup(req.text, features="lxml")
+            title = html.title.string if html.title is not None else ""
+            output_file.loc[-1] = [url, title]
+        output_file.index += 1
+        output_file = output_file.sort_index()
+
+def extract_urls_html(url, text):
+    html = BeautifulSoup(text, features="lxml")
+    hostname_url = urllib.parse.urlparse(url).hostname
+    result = CrawlerResult([], [])
+
+    for link in html.find_all('a'):
+        if not link.has_attr("href"):
+            print_colors('[E] Could not find link in element, Skipping link')
+            continue
+
+        joined_url = urllib.parse.urljoin(url, link["href"])
+        jurl_parsed = urllib.parse.urlparse(joined_url)
+
+        print_colors(f'[D] Joined URL: {joined_url}')
+        # Check if the URL is a .onion link or not even a web link
+        if jurl_parsed.scheme != 'http':
+            continue
+        if not jurl_parsed.hostname.endswith('.onion'):
+            continue
+
+        print_colors(f'[+] Found url: {joined_url}')
+
+        # Check if the URL is a first-party link
+        if jurl_parsed.hostname == hostname_url:
+            if joined_url not in result.firstp_urls:
+                result.firstp_urls.append(joined_url)
+        else:
+            if joined_url not in result.thirdp_urls:
+                result.thirdp_urls.append(joined_url)
+    return result
+
+def extract_urls_txt(url, text):
+    hostname_url = urllib.parse.urlparse(url).hostname
+    result = CrawlerResult([], [])
+    url_regex = re.compile(r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.onion\b(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*)")
+
+    # TODO: It won't find first party URL here, since the regex doesn't capture the path.
+    for found_url in url_regex.findall(text):
+        if hostname_url != urllib.parse.urlparse(found_url):
+            if found_url not in result.thirdp_urls:
+                result.thirdp_urls.append(found_url)
+    return result
+
+def crawl_url(url):
+    global crawler_file
+    # List of urls to crawl.
+    urls = [url]
+    counter = 0
+    counter = crawler_file[crawler_file['URL'] == url]['Counter'].any()
+
+    while counter < args.limit and len(urls) != 0:
+        cur_url = urls.pop()
+        cur_url_scheme = urllib.parse.urlparse(cur_url).scheme
+
+        if cur_url in crawled_urls:
+            # Remove already-crawled urls
+            print_colors(f'[D] {cur_url} has already been crawled. Skipping')
+            continue
+        elif cur_url_scheme != 'http':
+            print_colors(f'[D] Unknown scheme found, Skipping: {cur_url}')
+            continue
+
+        print_colors(f'[+] Crawling {cur_url}')
+        try:
+            req = session.get(cur_url)
+        except requests.exceptions.ConnectionError:
+            print_colors(f'[E] Failed to connect to {cur_url}')
+            continue
+
+        crawled_urls.append(cur_url)
+        result = CrawlerResult([], [])
+
+        # Determine the type of response from the headers
+        if "Content-Type" not in req.headers:
+            print_colors('[+] No content type found, Not extracting links.')
+        elif "text/plain" in req.headers["Content-Type"]:
+            result = extract_urls_txt(cur_url, req.text)
+        elif "text/html" in req.headers["Content-Type"]:
+            result = extract_urls_html(cur_url, req.text)
+        else:
+            print_colors(f'[+] Unknown content type encountered: {req.headers["Content-Type"]}')
+        [urls.append(x) for x in result.firstp_urls if x not in urls]
+        add_urls(result.thirdp_urls)
+        counter += 1
+
+    # Refresh counter in CSV file
+    if not crawler_file[crawler_file['URL'] == url].empty:
+        crawler_file.loc[crawler_file['URL'] == url, 'Counter'] = counter
+    else:
+        crawler_file.loc[-1] = [url, counter]
+        crawler_file.index += 1
+        crawler_file = crawler_file.sort_index()
+
+for row in verified_csv_file.itertuples():
+    vcsv_urls.append(row.URL)
+    vcsv_hostnames.append(urllib.parse.urlparse(row.URL).hostname)
+
+for i, url in enumerate(vcsv_urls):
+    if not crawler_file[crawler_file['URL'] == url].empty:
+        print_colors(f'[+] {url} has already been crawled. Skipping')
+        continue
+
+    if vcsv_hostnames[i] is None or not vcsv_hostnames[i].endswith('.onion'):
+        print_colors(f'[+] Skipping over non-onion link in verified.csv file: {url}')
+        continue
+
+    crawl_url(url)
+    crawler_file.to_csv(args.crawler_file, index=False)
+    output_file.to_csv(args.output, index=False)
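crawler.py keeps all of its state in plain CSV files through pandas: a file is read if it exists or started as an empty DataFrame otherwise, new rows are prepended with the loc[-1] / index-shift / sort_index idiom, and the frame is written back with to_csv. A compressed sketch of that pattern on its own (the file name and row values below are placeholders, not taken from a real Lantern instance):

    import pandas as pd

    def load_state(path, columns):
        # Read an existing CSV, or start a fresh DataFrame with the expected columns.
        try:
            return pd.read_csv(path)
        except FileNotFoundError:
            return pd.DataFrame(columns=columns)

    crawler_state = load_state("crawler.csv", ["URL", "Counter"])

    # Prepend a row the same way crawler.py does: label it -1, shift every
    # existing index up by one, then sort so the new row ends up at the top.
    crawler_state.loc[-1] = ["http://example.onion", 0]
    crawler_state.index = crawler_state.index + 1
    crawler_state = crawler_state.sort_index()

    crawler_state.to_csv("crawler.csv", index=False)
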
From 08e2b5ab2091bd4a4f23f223b8979657709ec90d Mon Sep 17 00:00:00 2001
From: cynthia
Date: Thu, 3 Apr 2025 21:53:37 +0000
Subject: [PATCH 03/12] switch to IsUrlValid

---
 scripts/crawler.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index 0724324..fcdf416 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -5,7 +5,7 @@ import urllib.parse
 import argparse
 import os
 import re
-from utils import print_colors
+from utils import print_colors, IsUrlValid
 
 parser = argparse.ArgumentParser(
     prog='Lantern crawler',
     description='Crawls .onion sites for links to more .onion sites')
@@ -99,9 +99,7 @@ def extract_urls_html(url, text):
 
         print_colors(f'[D] Joined URL: {joined_url}')
         # Check if the URL is a .onion link or not even a web link
-        if jurl_parsed.scheme != 'http':
-            continue
-        if not jurl_parsed.hostname.endswith('.onion'):
+        if not IsUrlValid(joined_url):
             continue
 
         print_colors(f'[+] Found url: {joined_url}')

From 6cc297fe66874d29be57a97ffcb8c1d2aa12f8c1 Mon Sep 17 00:00:00 2001
From: cynthia
Date: Fri, 4 Apr 2025 18:10:35 +0000
Subject: [PATCH 04/12] add dotenv

---
 scripts/crawler.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index fcdf416..b4cc6d0 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -7,6 +7,8 @@ import os
 import re
 from utils import print_colors, IsUrlValid
 
+from dotenv import load_dotenv
+
 parser = argparse.ArgumentParser(
     prog='Lantern crawler',
     description='Crawls .onion sites for links to more .onion sites')
@@ -20,12 +22,21 @@ parser.add_argument('-c', '--crawler-file',
     help='Crawler CSV file to log .onion sites and the amount crawled', type=str, default='crawler.csv')
 parser.add_argument('verified_csv', help='Input file to read for .onion links to crawl', type=str, default='verified.csv')
 args = parser.parse_args()
 
+script_abs_path = os.path.dirname(os.path.abspath(__file__))
+env_path = os.path.join(script_abs_path+"/.env")
+default_env_path = os.path.join(script_abs_path+"/.env.sample")
+
+if os.path.exists(env_path):
+    load_dotenv(dotenv_path=env_path)
+else:
+    load_dotenv(dotenv_path=default_env_path)
+
 tor_host = os.getenv("TOR_HOST")
 tor_port = os.getenv("TOR_PORT")
 session = requests.session()
 session.proxies = {
-    'http': f'socks5h://{tor_host}:{tor_port}',
-    'https': f'socks5h://{tor_host}:{tor_port}'
+    'http': f'{tor_host}:{tor_port}',
+    'https': f'{tor_host}:{tor_port}'
 }
 # Set user agent too for the crawler
 session.headers.update({'User-Agent': 'LanternCrawler'})
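With this patch the proxy scheme is no longer hard-coded, so the TOR_HOST value in the .env file is presumably expected to carry it (for example TOR_HOST=socks5h://127.0.0.1 with TOR_PORT=9050); that is an assumption read off the diff, not something stated in the series. A small sketch of the load-or-fallback pattern and the proxy URL it produces:

    import os
    from dotenv import load_dotenv

    # Prefer a real .env next to the script, fall back to the tracked sample file.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    env_path = os.path.join(script_dir, ".env")
    sample_path = os.path.join(script_dir, ".env.sample")
    load_dotenv(dotenv_path=env_path if os.path.exists(env_path) else sample_path)

    # Assumed .env contents: TOR_HOST=socks5h://127.0.0.1 and TOR_PORT=9050,
    # which yields socks5h://127.0.0.1:9050 for both proxy entries.
    proxy = f'{os.getenv("TOR_HOST")}:{os.getenv("TOR_PORT")}'
    print(proxy)

Note that requests only honours socks5h:// proxies when the PySocks extra (requests[socks]) is installed.
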
From c041e5df199dc5234b56b146fa68c5ba826a3497 Mon Sep 17 00:00:00 2001
From: cynthia
Date: Sat, 5 Apr 2025 00:00:47 +0000
Subject: [PATCH 05/12] replace default parameters with paths to the current
 lantern instance paths

---
 scripts/crawler.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index b4cc6d0..783d714 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -4,11 +4,24 @@ import requests
 import urllib.parse
 import argparse
 import os
+import pwd
 import re
 from utils import print_colors, IsUrlValid
 
 from dotenv import load_dotenv
 
+# Make default parameters for arguments
+rootpath='/srv/darknet-lantern/'
+urlpath=pwd.getpwuid(os.getuid()).pw_dir+"/.darknet_participant_url"
+instance = ""
+if os.path.isfile(urlpath):
+    with open(urlpath) as f:
+        instance = f.read().rstrip()
+
+instancepath=rootpath+'www/participants/'+instance
+verifiedcsvfile=instancepath+'/verified.csv'
+blcsvfile=instancepath+'/blacklist.csv'
+
 parser = argparse.ArgumentParser(
     prog='Lantern crawler',
     description='Crawls .onion sites for links to more .onion sites')
@@ -19,7 +32,9 @@ parser.add_argument('-o', '--output',
     help='Output CSV file for found .onion links', type=str, default='onion_crawler.csv')
 parser.add_argument('-c', '--crawler-file',
     help='Crawler CSV file to log .onion sites and the amount crawled', type=str, default='crawler.csv')
-parser.add_argument('verified_csv', help='Input file to read for .onion links to crawl', type=str, default='verified.csv')
+parser.add_argument('-b', '--blacklist-file', help='Blacklist CSV files to filter out sites with forbidden words in them',
+    type=str, default=blcsvfile)
+parser.add_argument('verified_csv', help='Input file to read for .onion links to crawl', type=str, default=verifiedcsvfile)
 args = parser.parse_args()
 
 script_abs_path = os.path.dirname(os.path.abspath(__file__))

From fe6b826027c2be7af9d53d8ee094fee8f601109e Mon Sep 17 00:00:00 2001
From: cynthia
Date: Sat, 5 Apr 2025 00:27:54 +0000
Subject: [PATCH 06/12] add blacklist

---
 scripts/crawler.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index 783d714..7a52d9f 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -70,10 +70,17 @@ def get_output_file():
     except FileNotFoundError:
         return pd.DataFrame(columns=["URL","Name"])
 
+def get_blacklist_file():
+    try:
+        return pd.read_csv(args.blacklist_file)
+    except FileNotFoundError:
+        return None
+
 # get list of .onion links from the verified.csv file
 verified_csv_file = pd.read_csv(args.verified_csv)
 crawler_file = get_crawler_file()
 output_file = get_output_file()
+blacklist_file = get_blacklist_file()
 vcsv_urls = []
 vcsv_hostnames = []
 crawled_urls = []
@@ -104,8 +111,16 @@ def add_urls(urls):
             output_file.loc[-1] = [url, ""]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
 
         if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
             output_file.loc[-1] = [url, ""]
         else:
+            forbidden = False
             html = BeautifulSoup(req.text, features="lxml")
             title = html.title.string if html.title is not None else ""
+            if blacklist_file is not None:
+                for word in blacklist_file.loc["blacklisted-words"]:
+                    if word in title:
+                        print_colors('[+] Forbidden word found. Rejecting.')
+                        forbidden = True
+            if forbidden:
+                continue
             output_file.loc[-1] = [url, title]
         output_file.index += 1
         output_file = output_file.sort_index()
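The intent of the new check, stripped of the DataFrame plumbing: a third-party result is dropped whenever any word from the participant's blacklist.csv appears in the page title. The word list and title below are invented for illustration, and the comparison is written case-insensitively, which is where patches 07 and 09 eventually land:

    # Hypothetical stand-ins for blacklist.csv contents and a fetched page title.
    blacklisted_words = ["casino", "scam-market"]
    title = "Welcome to the Example Casino"

    # Reject the page when any blacklisted word occurs in its title.
    forbidden = any(word.lower() in title.lower() for word in blacklisted_words)
    if forbidden:
        print('[+] Forbidden word found. Rejecting.')
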
"Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]: output_file.loc[-1] = [url, ""] else: + forbidden = False html = BeautifulSoup(req.text, features="lxml") title = html.title.string if html.title is not None else "" + if blacklist_file is not None: + for word in blacklist_file.loc["blacklisted-words"]: + if word in title: + print_colors('[+] Forbidden word found. Rejecting.') + forbidden = True + if forbidden: + continue output_file.loc[-1] = [url, title] output_file.index += 1 output_file = output_file.sort_index() From c6518363c1c83613ea424206b5a24526d2769ab1 Mon Sep 17 00:00:00 2001 From: cynthia Date: Sat, 5 Apr 2025 08:56:35 +0000 Subject: [PATCH 07/12] fix blacklist --- scripts/crawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/crawler.py b/scripts/crawler.py index 7a52d9f..3b5178c 100644 --- a/scripts/crawler.py +++ b/scripts/crawler.py @@ -115,8 +115,8 @@ def add_urls(urls): html = BeautifulSoup(req.text, features="lxml") title = html.title.string if html.title is not None else "" if blacklist_file is not None: - for word in blacklist_file.loc["blacklisted-words"]: - if word in title: + for row in blacklist_file.itertuples(): + if row[1] in title: print_colors('[+] Forbidden word found. Rejecting.') forbidden = True if forbidden: From e764c5e6f33919c5cb059d678a0ba7571242f0c5 Mon Sep 17 00:00:00 2001 From: cynthia Date: Sat, 5 Apr 2025 11:24:35 +0000 Subject: [PATCH 08/12] add new col for hostname, check for scheme and add http:// by default --- scripts/crawler.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/scripts/crawler.py b/scripts/crawler.py index 3b5178c..053c388 100644 --- a/scripts/crawler.py +++ b/scripts/crawler.py @@ -68,7 +68,7 @@ def get_output_file(): try: return pd.read_csv(args.output) except FileNotFoundError: - return pd.DataFrame(columns=["URL","Name"]) + return pd.DataFrame(columns=["Hostname","URL","Name"]) def get_blacklist_file(): try: @@ -94,7 +94,7 @@ def add_urls(urls): global output_file for url in urls: parsed_url = urllib.parse.urlparse(url) - if (output_file['URL'] == url).any() or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"): + if (output_file['Hostname'] == parsed_url.hostname).any() or parsed_url.hostname is None or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"): continue # Get information about the URL @@ -103,13 +103,13 @@ def add_urls(urls): req = session.get(url) except requests.exceptions.ConnectionError: print_colors(f'[E] Dead 3rd party link: {url}. 
From 323a47dd4636e0b4461dfdb321061d9433a8c415 Mon Sep 17 00:00:00 2001
From: cynthia
Date: Sat, 5 Apr 2025 11:30:50 +0000
Subject: [PATCH 09/12] fix blacklist matching

---
 scripts/crawler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index 053c388..9bc35a3 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -116,7 +116,7 @@ def add_urls(urls):
             title = html.title.string if html.title is not None else ""
             if blacklist_file is not None:
                 for row in blacklist_file.itertuples():
-                    if row[1] in title:
+                    if row[1].lower() in title.lower():
                         print_colors('[+] Forbidden word found. Rejecting.')
                         forbidden = True
             if forbidden:
From e76d29807d9c9aa788c239b23afc47d0888d1269 Mon Sep 17 00:00:00 2001
From: cynthia
Date: Sat, 5 Apr 2025 12:15:49 +0000
Subject: [PATCH 10/12] make the verified csv file optional argument

---
 scripts/crawler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index 9bc35a3..5d898c2 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -34,7 +34,7 @@ parser.add_argument('-c', '--crawler-file',
     help='Crawler CSV file to log .onion sites and the amount crawled', type=str, default='crawler.csv')
 parser.add_argument('-b', '--blacklist-file', help='Blacklist CSV files to filter out sites with forbidden words in them',
     type=str, default=blcsvfile)
-parser.add_argument('verified_csv', help='Input file to read for .onion links to crawl', type=str, default=verifiedcsvfile)
+parser.add_argument('-V', '--verified-file', help='Input file to read for .onion links to crawl', type=str, default=verifiedcsvfile)
 args = parser.parse_args()
 
 script_abs_path = os.path.dirname(os.path.abspath(__file__))
@@ -77,7 +77,7 @@ def get_blacklist_file():
         return None
 
 # get list of .onion links from the verified.csv file
-verified_csv_file = pd.read_csv(args.verified_csv)
+verified_csv_file = pd.read_csv(args.verified_file)
 crawler_file = get_crawler_file()
 output_file = get_output_file()
 blacklist_file = get_blacklist_file()

From 63d89b9b8b0e77c47db41e9c67e38542a1097f18 Mon Sep 17 00:00:00 2001
From: cynthia
Date: Sat, 5 Apr 2025 14:41:26 +0000
Subject: [PATCH 11/12] add crawler dir for default crawler outputs

---
 .gitignore         | 3 ++-
 scripts/crawler.py | 9 +++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 73b66b1..f039455 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 .git
 www/participants/**
+crawler/**
 scripts/__pycache__/**
 .env
-env/
\ No newline at end of file
+env/
diff --git a/scripts/crawler.py b/scripts/crawler.py
index 5d898c2..07858de 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -21,6 +21,9 @@ if os.path.isfile(urlpath):
 instancepath=rootpath+'www/participants/'+instance
 verifiedcsvfile=instancepath+'/verified.csv'
 blcsvfile=instancepath+'/blacklist.csv'
+crawlerdir=instancepath+'/crawler'
+if not os.path.exists(crawlerdir):
+    os.makedirs(crawlerdir)
 
 parser = argparse.ArgumentParser(
     prog='Lantern crawler',
@@ -29,9 +32,11 @@ parser = argparse.ArgumentParser(
 parser.add_argument('-l', '--limit',
     help='Page crawl limit per .onion link.', type=int, default=10)
 parser.add_argument('-o', '--output',
-    help='Output CSV file for found .onion links', type=str, default='onion_crawler.csv')
+    help='Output CSV file for found .onion links', type=str,
+    default=os.path.join(crawlerdir, 'onion_crawler.csv'))
 parser.add_argument('-c', '--crawler-file',
-    help='Crawler CSV file to log .onion sites and the amount crawled', type=str, default='crawler.csv')
+    help='Crawler CSV file to log .onion sites and the amount crawled', type=str,
+    default=os.path.join(crawlerdir, 'crawler.csv'))
 parser.add_argument('-b', '--blacklist-file', help='Blacklist CSV files to filter out sites with forbidden words in them',
     type=str, default=blcsvfile)
 parser.add_argument('-V', '--verified-file', help='Input file to read for .onion links to crawl', type=str, default=verifiedcsvfile)
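How the new defaults fit together, using a throwaway base directory instead of a real Lantern instance (the path below is illustrative only; in this patch the base is the participant's instance directory, and the next patch moves it to the repository root):

    import os

    # Illustrative base directory; crawler.py derives its own at import time.
    basepath = "/tmp/darknet-lantern-example"
    crawlerdir = os.path.join(basepath, "crawler")

    # Same effect as the patch's os.path.exists() / os.makedirs() pair.
    os.makedirs(crawlerdir, exist_ok=True)

    # The argparse defaults for -o and -c now point into that directory.
    default_output = os.path.join(crawlerdir, "onion_crawler.csv")
    default_state = os.path.join(crawlerdir, "crawler.csv")
    print(default_output, default_state)
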
From e7abd258574855c3f31f1cab17f6a861ab71fb98 Mon Sep 17 00:00:00 2001
From: cynthia
Date: Sat, 5 Apr 2025 15:32:01 +0000
Subject: [PATCH 12/12] switch instancepath to rootpath

---
 scripts/crawler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index 07858de..5ac1452 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -21,7 +21,7 @@ if os.path.isfile(urlpath):
 instancepath=rootpath+'www/participants/'+instance
 verifiedcsvfile=instancepath+'/verified.csv'
 blcsvfile=instancepath+'/blacklist.csv'
-crawlerdir=instancepath+'/crawler'
+crawlerdir=rootpath+'/crawler'
 if not os.path.exists(crawlerdir):
     os.makedirs(crawlerdir)
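
Looking back over the finished series, the text/plain branch of the crawl loop rests entirely on the .onion regular expression introduced in patch 02; run on its own it behaves as below (the sample text is invented):

    import re

    # The same pattern crawler.py compiles in extract_urls_txt().
    url_regex = re.compile(r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.onion\b(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*)")

    sample_text = "mirror list: http://example.onion/index and https://other.onion (clearnet: https://example.org)"
    print(url_regex.findall(sample_text))
    # -> ['http://example.onion/index', 'https://other.onion']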