From e989f299d235cf4c40997033b217e299c71b5870 Mon Sep 17 00:00:00 2001
From: cynthia
Date: Wed, 2 Apr 2025 12:09:31 +0000
Subject: [PATCH 01/12] add beautifulsoup4 in requirements.txt

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index cbf4a08..a887ce5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+beautifulsoup4==4.13.3
 certifi==2024.12.14
 charset-normalizer==3.4.1
 idna==3.10
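The new dependency is what the crawler introduced in the next patch uses to parse fetched pages and read their <title>. A minimal, self-contained sketch of that usage (the HTML string is made up for illustration, and the lxml parser is assumed to be installed alongside beautifulsoup4):

    from bs4 import BeautifulSoup

    # Hypothetical HTML standing in for a fetched .onion page.
    sample_html = "<html><head><title>Example hidden service</title></head><body></body></html>"

    # Parse with the lxml backend, as crawler.py does, and fall back to an
    # empty title when the page has no <title> element.
    html = BeautifulSoup(sample_html, features="lxml")
    title = html.title.string if html.title is not None else ""
    print(title)  # -> Example hidden service
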
From 0a0233a8a23efa693972bff25f9ea2e2210c5ebd Mon Sep 17 00:00:00 2001
From: cynthia
Date: Thu, 3 Apr 2025 16:43:22 +0000
Subject: [PATCH 02/12] add crawler.py

---
 scripts/crawler.py | 195 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 195 insertions(+)
 create mode 100644 scripts/crawler.py

diff --git a/scripts/crawler.py b/scripts/crawler.py
new file mode 100644
index 0000000..0724324
--- /dev/null
+++ b/scripts/crawler.py
@@ -0,0 +1,195 @@
+from bs4 import BeautifulSoup
+import pandas as pd
+import requests
+import urllib.parse
+import argparse
+import os
+import re
+from utils import print_colors
+
+parser = argparse.ArgumentParser(
+    prog='Lantern crawler',
+    description='Crawls .onion sites for links to more .onion sites')
+
+parser.add_argument('-l', '--limit',
+    help='Page crawl limit per .onion link.', type=int, default=10)
+parser.add_argument('-o', '--output',
+    help='Output CSV file for found .onion links', type=str, default='onion_crawler.csv')
+parser.add_argument('-c', '--crawler-file',
+    help='Crawler CSV file to log .onion sites and the amount crawled', type=str, default='crawler.csv')
+parser.add_argument('verified_csv', help='Input file to read for .onion links to crawl', type=str, default='verified.csv')
+args = parser.parse_args()
+
+tor_host = os.getenv("TOR_HOST")
+tor_port = os.getenv("TOR_PORT")
+session = requests.session()
+session.proxies = {
+    'http': f'socks5h://{tor_host}:{tor_port}',
+    'https': f'socks5h://{tor_host}:{tor_port}'
+}
+# Set user agent too for the crawler
+session.headers.update({'User-Agent': 'LanternCrawler'})
+
+def get_crawler_file():
+    try:
+        # try to read the CSV file
+        return pd.read_csv(args.crawler_file)
+    except FileNotFoundError:
+        # make a new empty crawler file
+        return pd.DataFrame(columns=["URL","Counter"])
+
+def get_output_file():
+    try:
+        return pd.read_csv(args.output)
+    except FileNotFoundError:
+        return pd.DataFrame(columns=["URL","Name"])
+
+# get list of .onion links from the verified.csv file
+verified_csv_file = pd.read_csv(args.verified_csv)
+crawler_file = get_crawler_file()
+output_file = get_output_file()
+vcsv_urls = []
+vcsv_hostnames = []
+crawled_urls = []
+
+class CrawlerResult:
+    def __init__(self, firstp_urls, thirdp_urls):
+        self.firstp_urls = firstp_urls
+        self.thirdp_urls = thirdp_urls
+
+def add_urls(urls):
+    global output_file
+    for url in urls:
+        parsed_url = urllib.parse.urlparse(url)
+        if (output_file['URL'] == url).any() or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
+            continue
+
+        # Get information about the URL
+        print_colors(f'[+] Querying {url} for information.')
+        try:
+            req = session.get(url)
+        except requests.exceptions.ConnectionError:
+            print_colors(f'[E] Dead 3rd party link: {url}. Adding anyway')
+            output_file.loc[-1] = [url, ""]
+            output_file.index += 1
+            output_file = output_file.sort_index()
+            continue
+
+        if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
+            output_file.loc[-1] = [url, ""]
+        else:
+            html = BeautifulSoup(req.text, features="lxml")
+            title = html.title.string if html.title is not None else ""
+            output_file.loc[-1] = [url, title]
+        output_file.index += 1
+        output_file = output_file.sort_index()
+
+def extract_urls_html(url, text):
+    html = BeautifulSoup(text, features="lxml")
+    hostname_url = urllib.parse.urlparse(url).hostname
+    result = CrawlerResult([], [])
+
+    for link in html.find_all('a'):
+        if not link.has_attr("href"):
+            print_colors('[E] Could not find link in element, Skipping link')
+            continue
+
+        joined_url = urllib.parse.urljoin(url, link["href"])
+        jurl_parsed = urllib.parse.urlparse(joined_url)
+
+        print_colors(f'[D] Joined URL: {joined_url}')
+        # Check if the URL is a .onion link or not even a web link
+        if jurl_parsed.scheme != 'http':
+            continue
+        if not jurl_parsed.hostname.endswith('.onion'):
+            continue
+
+        print_colors(f'[+] Found url: {joined_url}')
+
+        # Check if the URL is a first-party link
+        if jurl_parsed.hostname == hostname_url:
+            if joined_url not in result.firstp_urls:
+                result.firstp_urls.append(joined_url)
+        else:
+            if joined_url not in result.thirdp_urls:
+                result.thirdp_urls.append(joined_url)
+    return result
+
+def extract_urls_txt(url, text):
+    hostname_url = urllib.parse.urlparse(url).hostname
+    result = CrawlerResult([], [])
+    url_regex = re.compile(r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.onion\b(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*)")
+
+    # TODO: It won't find first party URL here, since the regex doesn't capture the path.
+    for found_url in url_regex.findall(text):
+        if hostname_url != urllib.parse.urlparse(found_url):
+            if found_url not in result.thirdp_urls:
+                result.thirdp_urls.append(found_url)
+    return result
+
+def crawl_url(url):
+    global crawler_file
+    # List of urls to crawl.
+    urls = [url]
+    counter = 0
+    counter = crawler_file[crawler_file['URL'] == url]['Counter'].any()
+
+    while counter < args.limit and len(urls) != 0:
+        cur_url = urls.pop()
+        cur_url_scheme = urllib.parse.urlparse(cur_url).scheme
+
+        if cur_url in crawled_urls:
+            # Remove already-crawled urls
+            print_colors(f'[D] {cur_url} has already been crawled. Skipping')
+            continue
+        elif cur_url_scheme != 'http':
+            print_colors(f'[D] Unknown scheme found, Skipping: {cur_url}')
+            continue
+
+        print_colors(f'[+] Crawling {cur_url}')
+        try:
+            req = session.get(cur_url)
+        except requests.exceptions.ConnectionError:
+            print_colors(f'[E] Failed to connect to {cur_url}')
+            continue
+
+        crawled_urls.append(cur_url)
+        result = CrawlerResult([], [])
+
+        # Determine the type of response from the headers
+        if "Content-Type" not in req.headers:
+            print_colors('[+] No content type found, Not extracting links.')
+        elif "text/plain" in req.headers["Content-Type"]:
+            result = extract_urls_txt(cur_url, req.text)
+        elif "text/html" in req.headers["Content-Type"]:
+            result = extract_urls_html(cur_url, req.text)
+        else:
+            print_colors(f'[+] Unknown content type encountered: {req.headers["Content-Type"]}')
+        [urls.append(x) for x in result.firstp_urls if x not in urls]
+        add_urls(result.thirdp_urls)
+        counter += 1
+
+    # Refresh counter in CSV file
+    if not crawler_file[crawler_file['URL'] == url].empty:
+        crawler_file.loc[crawler_file['URL'] == url, 'Counter'] = counter
+    else:
+        crawler_file.loc[-1] = [url, counter]
+        crawler_file.index += 1
+        crawler_file = crawler_file.sort_index()
+
+for row in verified_csv_file.itertuples():
+    vcsv_urls.append(row.URL)
+    vcsv_hostnames.append(urllib.parse.urlparse(row.URL).hostname)
+
+for i, url in enumerate(vcsv_urls):
+    if not crawler_file[crawler_file['URL'] == url].empty:
+        print_colors(f'[+] {url} has already been crawled. Skipping')
+        continue
+
+    if vcsv_hostnames[i] is None or not vcsv_hostnames[i].endswith('.onion'):
+        print_colors(f'[+] Skipping over non-onion link in verified.csv file: {url}')
+        continue
+
+    crawl_url(url)
+    crawler_file.to_csv(args.crawler_file, index=False)
+    output_file.to_csv(args.output, index=False)
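crawler.py keeps all of its state in plain CSV files through pandas: a file is read if it exists or started as an empty DataFrame otherwise, new rows are prepended with the loc[-1] / index-shift / sort_index idiom, and the frame is written back with to_csv. A compressed sketch of that pattern on its own (the file name and row values below are placeholders, not taken from a real Lantern instance):

    import pandas as pd

    def load_state(path, columns):
        # Read an existing CSV, or start a fresh DataFrame with the expected columns.
        try:
            return pd.read_csv(path)
        except FileNotFoundError:
            return pd.DataFrame(columns=columns)

    crawler_state = load_state("crawler.csv", ["URL", "Counter"])

    # Prepend a row the same way crawler.py does: label it -1, shift every
    # existing index up by one, then sort so the new row ends up at the top.
    crawler_state.loc[-1] = ["http://example.onion", 0]
    crawler_state.index = crawler_state.index + 1
    crawler_state = crawler_state.sort_index()

    crawler_state.to_csv("crawler.csv", index=False)
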
From 08e2b5ab2091bd4a4f23f223b8979657709ec90d Mon Sep 17 00:00:00 2001
From: cynthia
Date: Thu, 3 Apr 2025 21:53:37 +0000
Subject: [PATCH 03/12] switch to IsUrlValid

---
 scripts/crawler.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index 0724324..fcdf416 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -5,7 +5,7 @@ import urllib.parse
 import argparse
 import os
 import re
-from utils import print_colors
+from utils import print_colors, IsUrlValid
 
 parser = argparse.ArgumentParser(
     prog='Lantern crawler',
     description='Crawls .onion sites for links to more .onion sites')
@@ -99,9 +99,7 @@ def extract_urls_html(url, text):
 
         print_colors(f'[D] Joined URL: {joined_url}')
         # Check if the URL is a .onion link or not even a web link
-        if jurl_parsed.scheme != 'http':
-            continue
-        if not jurl_parsed.hostname.endswith('.onion'):
+        if not IsUrlValid(joined_url):
             continue
 
         print_colors(f'[+] Found url: {joined_url}')

From 6cc297fe66874d29be57a97ffcb8c1d2aa12f8c1 Mon Sep 17 00:00:00 2001
From: cynthia
Date: Fri, 4 Apr 2025 18:10:35 +0000
Subject: [PATCH 04/12] add dotenv

---
 scripts/crawler.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index fcdf416..b4cc6d0 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -7,6 +7,8 @@ import os
 import re
 from utils import print_colors, IsUrlValid
 
+from dotenv import load_dotenv
+
 parser = argparse.ArgumentParser(
     prog='Lantern crawler',
     description='Crawls .onion sites for links to more .onion sites')
@@ -20,12 +22,21 @@ parser.add_argument('-c', '--crawler-file',
     help='Crawler CSV file to log .onion sites and the amount crawled', type=str, default='crawler.csv')
 parser.add_argument('verified_csv', help='Input file to read for .onion links to crawl', type=str, default='verified.csv')
 args = parser.parse_args()
 
+script_abs_path = os.path.dirname(os.path.abspath(__file__))
+env_path = os.path.join(script_abs_path+"/.env")
+default_env_path = os.path.join(script_abs_path+"/.env.sample")
+
+if os.path.exists(env_path):
+    load_dotenv(dotenv_path=env_path)
+else:
+    load_dotenv(dotenv_path=default_env_path)
+
 tor_host = os.getenv("TOR_HOST")
 tor_port = os.getenv("TOR_PORT")
 session = requests.session()
 session.proxies = {
-    'http': f'socks5h://{tor_host}:{tor_port}',
-    'https': f'socks5h://{tor_host}:{tor_port}'
+    'http': f'{tor_host}:{tor_port}',
+    'https': f'{tor_host}:{tor_port}'
 }
 # Set user agent too for the crawler
 session.headers.update({'User-Agent': 'LanternCrawler'})
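With this patch the proxy scheme is no longer hard-coded, so the TOR_HOST value in the .env file is presumably expected to carry it (for example TOR_HOST=socks5h://127.0.0.1 with TOR_PORT=9050); that is an assumption read off the diff, not something stated in the series. A small sketch of the load-or-fallback pattern and the proxy URL it produces:

    import os
    from dotenv import load_dotenv

    # Prefer a real .env next to the script, fall back to the tracked sample file.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    env_path = os.path.join(script_dir, ".env")
    sample_path = os.path.join(script_dir, ".env.sample")
    load_dotenv(dotenv_path=env_path if os.path.exists(env_path) else sample_path)

    # Assumed .env contents: TOR_HOST=socks5h://127.0.0.1 and TOR_PORT=9050,
    # which yields socks5h://127.0.0.1:9050 for both proxy entries.
    proxy = f'{os.getenv("TOR_HOST")}:{os.getenv("TOR_PORT")}'
    print(proxy)

Note that requests only honours socks5h:// proxies when the PySocks extra (requests[socks]) is installed.
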
From c041e5df199dc5234b56b146fa68c5ba826a3497 Mon Sep 17 00:00:00 2001
From: cynthia
Date: Sat, 5 Apr 2025 00:00:47 +0000
Subject: [PATCH 05/12] replace default parameters with paths to the current
 lantern instance paths

---
 scripts/crawler.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index b4cc6d0..783d714 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -4,11 +4,24 @@ import requests
 import urllib.parse
 import argparse
 import os
+import pwd
 import re
 from utils import print_colors, IsUrlValid
 
 from dotenv import load_dotenv
 
+# Make default parameters for arguments
+rootpath='/srv/darknet-lantern/'
+urlpath=pwd.getpwuid(os.getuid()).pw_dir+"/.darknet_participant_url"
+instance = ""
+if os.path.isfile(urlpath):
+    with open(urlpath) as f:
+        instance = f.read().rstrip()
+
+instancepath=rootpath+'www/participants/'+instance
+verifiedcsvfile=instancepath+'/verified.csv'
+blcsvfile=instancepath+'/blacklist.csv'
+
 parser = argparse.ArgumentParser(
     prog='Lantern crawler',
     description='Crawls .onion sites for links to more .onion sites')
@@ -19,7 +32,9 @@ parser.add_argument('-o', '--output',
     help='Output CSV file for found .onion links', type=str, default='onion_crawler.csv')
 parser.add_argument('-c', '--crawler-file',
     help='Crawler CSV file to log .onion sites and the amount crawled', type=str, default='crawler.csv')
-parser.add_argument('verified_csv', help='Input file to read for .onion links to crawl', type=str, default='verified.csv')
+parser.add_argument('-b', '--blacklist-file', help='Blacklist CSV files to filter out sites with forbidden words in them',
+    type=str, default=blcsvfile)
+parser.add_argument('verified_csv', help='Input file to read for .onion links to crawl', type=str, default=verifiedcsvfile)
 args = parser.parse_args()
 
 script_abs_path = os.path.dirname(os.path.abspath(__file__))

From fe6b826027c2be7af9d53d8ee094fee8f601109e Mon Sep 17 00:00:00 2001
From: cynthia
Date: Sat, 5 Apr 2025 00:27:54 +0000
Subject: [PATCH 06/12] add blacklist

---
 scripts/crawler.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index 783d714..7a52d9f 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -70,10 +70,17 @@ def get_output_file():
     except FileNotFoundError:
         return pd.DataFrame(columns=["URL","Name"])
 
+def get_blacklist_file():
+    try:
+        return pd.read_csv(args.blacklist_file)
+    except FileNotFoundError:
+        return None
+
 # get list of .onion links from the verified.csv file
 verified_csv_file = pd.read_csv(args.verified_csv)
 crawler_file = get_crawler_file()
 output_file = get_output_file()
+blacklist_file = get_blacklist_file()
 vcsv_urls = []
 vcsv_hostnames = []
 crawled_urls = []
@@ -104,8 +111,16 @@ def add_urls(urls):
             output_file.loc[-1] = [url, ""]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
 
         if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
             output_file.loc[-1] = [url, ""]
         else:
+            forbidden = False
             html = BeautifulSoup(req.text, features="lxml")
             title = html.title.string if html.title is not None else ""
+            if blacklist_file is not None:
+                for word in blacklist_file.loc["blacklisted-words"]:
+                    if word in title:
+                        print_colors('[+] Forbidden word found. Rejecting.')
+                        forbidden = True
+            if forbidden:
+                continue
             output_file.loc[-1] = [url, title]
         output_file.index += 1
         output_file = output_file.sort_index()
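The intent of the new check, stripped of the DataFrame plumbing: a third-party result is dropped whenever any word from the participant's blacklist.csv appears in the page title. The word list and title below are invented for illustration, and the comparison is written case-insensitively, which is where patches 07 and 09 eventually land:

    # Hypothetical stand-ins for blacklist.csv contents and a fetched page title.
    blacklisted_words = ["casino", "scam-market"]
    title = "Welcome to the Example Casino"

    # Reject the page when any blacklisted word occurs in its title.
    forbidden = any(word.lower() in title.lower() for word in blacklisted_words)
    if forbidden:
        print('[+] Forbidden word found. Rejecting.')
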
"Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]: output_file.loc[-1] = [url, ""] else: + forbidden = False html = BeautifulSoup(req.text, features="lxml") title = html.title.string if html.title is not None else "" + if blacklist_file is not None: + for word in blacklist_file.loc["blacklisted-words"]: + if word in title: + print_colors('[+] Forbidden word found. Rejecting.') + forbidden = True + if forbidden: + continue output_file.loc[-1] = [url, title] output_file.index += 1 output_file = output_file.sort_index() From c6518363c1c83613ea424206b5a24526d2769ab1 Mon Sep 17 00:00:00 2001 From: cynthia Date: Sat, 5 Apr 2025 08:56:35 +0000 Subject: [PATCH 07/12] fix blacklist --- scripts/crawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/crawler.py b/scripts/crawler.py index 7a52d9f..3b5178c 100644 --- a/scripts/crawler.py +++ b/scripts/crawler.py @@ -115,8 +115,8 @@ def add_urls(urls): html = BeautifulSoup(req.text, features="lxml") title = html.title.string if html.title is not None else "" if blacklist_file is not None: - for word in blacklist_file.loc["blacklisted-words"]: - if word in title: + for row in blacklist_file.itertuples(): + if row[1] in title: print_colors('[+] Forbidden word found. Rejecting.') forbidden = True if forbidden: From e764c5e6f33919c5cb059d678a0ba7571242f0c5 Mon Sep 17 00:00:00 2001 From: cynthia Date: Sat, 5 Apr 2025 11:24:35 +0000 Subject: [PATCH 08/12] add new col for hostname, check for scheme and add http:// by default --- scripts/crawler.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/scripts/crawler.py b/scripts/crawler.py index 3b5178c..053c388 100644 --- a/scripts/crawler.py +++ b/scripts/crawler.py @@ -68,7 +68,7 @@ def get_output_file(): try: return pd.read_csv(args.output) except FileNotFoundError: - return pd.DataFrame(columns=["URL","Name"]) + return pd.DataFrame(columns=["Hostname","URL","Name"]) def get_blacklist_file(): try: @@ -94,7 +94,7 @@ def add_urls(urls): global output_file for url in urls: parsed_url = urllib.parse.urlparse(url) - if (output_file['URL'] == url).any() or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"): + if (output_file['Hostname'] == parsed_url.hostname).any() or parsed_url.hostname is None or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"): continue # Get information about the URL @@ -103,13 +103,13 @@ def add_urls(urls): req = session.get(url) except requests.exceptions.ConnectionError: print_colors(f'[E] Dead 3rd party link: {url}. 
From 323a47dd4636e0b4461dfdb321061d9433a8c415 Mon Sep 17 00:00:00 2001
From: cynthia
Date: Sat, 5 Apr 2025 11:30:50 +0000
Subject: [PATCH 09/12] fix blacklist matching

---
 scripts/crawler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index 053c388..9bc35a3 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -116,7 +116,7 @@ def add_urls(urls):
             title = html.title.string if html.title is not None else ""
             if blacklist_file is not None:
                 for row in blacklist_file.itertuples():
-                    if row[1] in title:
+                    if row[1].lower() in title.lower():
                         print_colors('[+] Forbidden word found. Rejecting.')
                         forbidden = True
             if forbidden:
From e76d29807d9c9aa788c239b23afc47d0888d1269 Mon Sep 17 00:00:00 2001
From: cynthia
Date: Sat, 5 Apr 2025 12:15:49 +0000
Subject: [PATCH 10/12] make the verified csv file optional argument

---
 scripts/crawler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index 9bc35a3..5d898c2 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -34,7 +34,7 @@ parser.add_argument('-c', '--crawler-file',
     help='Crawler CSV file to log .onion sites and the amount crawled', type=str, default='crawler.csv')
 parser.add_argument('-b', '--blacklist-file', help='Blacklist CSV files to filter out sites with forbidden words in them',
     type=str, default=blcsvfile)
-parser.add_argument('verified_csv', help='Input file to read for .onion links to crawl', type=str, default=verifiedcsvfile)
+parser.add_argument('-V', '--verified-file', help='Input file to read for .onion links to crawl', type=str, default=verifiedcsvfile)
 args = parser.parse_args()
 
 script_abs_path = os.path.dirname(os.path.abspath(__file__))
@@ -77,7 +77,7 @@ def get_blacklist_file():
         return None
 
 # get list of .onion links from the verified.csv file
-verified_csv_file = pd.read_csv(args.verified_csv)
+verified_csv_file = pd.read_csv(args.verified_file)
 crawler_file = get_crawler_file()
 output_file = get_output_file()
 blacklist_file = get_blacklist_file()

From 63d89b9b8b0e77c47db41e9c67e38542a1097f18 Mon Sep 17 00:00:00 2001
From: cynthia
Date: Sat, 5 Apr 2025 14:41:26 +0000
Subject: [PATCH 11/12] add crawler dir for default crawler outputs

---
 .gitignore         | 3 ++-
 scripts/crawler.py | 9 +++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 73b66b1..f039455 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 .git
 www/participants/**
+crawler/**
 scripts/__pycache__/**
 .env
-env/
\ No newline at end of file
+env/
diff --git a/scripts/crawler.py b/scripts/crawler.py
index 5d898c2..07858de 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -21,6 +21,9 @@ if os.path.isfile(urlpath):
 instancepath=rootpath+'www/participants/'+instance
 verifiedcsvfile=instancepath+'/verified.csv'
 blcsvfile=instancepath+'/blacklist.csv'
+crawlerdir=instancepath+'/crawler'
+if not os.path.exists(crawlerdir):
+    os.makedirs(crawlerdir)
 
 parser = argparse.ArgumentParser(
     prog='Lantern crawler',
@@ -29,9 +32,11 @@ parser = argparse.ArgumentParser(
 parser.add_argument('-l', '--limit',
     help='Page crawl limit per .onion link.', type=int, default=10)
 parser.add_argument('-o', '--output',
-    help='Output CSV file for found .onion links', type=str, default='onion_crawler.csv')
+    help='Output CSV file for found .onion links', type=str,
+    default=os.path.join(crawlerdir, 'onion_crawler.csv'))
 parser.add_argument('-c', '--crawler-file',
-    help='Crawler CSV file to log .onion sites and the amount crawled', type=str, default='crawler.csv')
+    help='Crawler CSV file to log .onion sites and the amount crawled', type=str,
+    default=os.path.join(crawlerdir, 'crawler.csv'))
 parser.add_argument('-b', '--blacklist-file', help='Blacklist CSV files to filter out sites with forbidden words in them',
     type=str, default=blcsvfile)
 parser.add_argument('-V', '--verified-file', help='Input file to read for .onion links to crawl', type=str, default=verifiedcsvfile)
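How the new defaults fit together, using a throwaway base directory instead of a real Lantern instance (the path below is illustrative only; in this patch the base is the participant's instance directory, and the next patch moves it to the repository root):

    import os

    # Illustrative base directory; crawler.py derives its own at import time.
    basepath = "/tmp/darknet-lantern-example"
    crawlerdir = os.path.join(basepath, "crawler")

    # Same effect as the patch's os.path.exists() / os.makedirs() pair.
    os.makedirs(crawlerdir, exist_ok=True)

    # The argparse defaults for -o and -c now point into that directory.
    default_output = os.path.join(crawlerdir, "onion_crawler.csv")
    default_state = os.path.join(crawlerdir, "crawler.csv")
    print(default_output, default_state)
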
From e7abd258574855c3f31f1cab17f6a861ab71fb98 Mon Sep 17 00:00:00 2001
From: cynthia
Date: Sat, 5 Apr 2025 15:32:01 +0000
Subject: [PATCH 12/12] switch instancepath to rootpath

---
 scripts/crawler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index 07858de..5ac1452 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -21,7 +21,7 @@ if os.path.isfile(urlpath):
 instancepath=rootpath+'www/participants/'+instance
 verifiedcsvfile=instancepath+'/verified.csv'
 blcsvfile=instancepath+'/blacklist.csv'
-crawlerdir=instancepath+'/crawler'
+crawlerdir=rootpath+'/crawler'
 if not os.path.exists(crawlerdir):
     os.makedirs(crawlerdir)
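
Looking back over the finished series, the text/plain branch of the crawl loop rests entirely on the .onion regular expression introduced in patch 02; run on its own it behaves as below (the sample text is invented):

    import re

    # The same pattern crawler.py compiles in extract_urls_txt().
    url_regex = re.compile(r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.onion\b(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*)")

    sample_text = "mirror list: http://example.onion/index and https://other.onion (clearnet: https://example.org)"
    print(url_regex.findall(sample_text))
    # -> ['http://example.onion/index', 'https://other.onion']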