From 0a0233a8a23efa693972bff25f9ea2e2210c5ebd Mon Sep 17 00:00:00 2001
From: cynthia
Date: Thu, 3 Apr 2025 16:43:22 +0000
Subject: [PATCH] add crawler.py

---
 scripts/crawler.py | 195 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 195 insertions(+)
 create mode 100644 scripts/crawler.py

diff --git a/scripts/crawler.py b/scripts/crawler.py
new file mode 100644
index 0000000..0724324
--- /dev/null
+++ b/scripts/crawler.py
@@ -0,0 +1,195 @@
+from bs4 import BeautifulSoup
+import pandas as pd
+import requests
+import urllib.parse
+import argparse
+import os
+import re
+from utils import print_colors
+
+parser = argparse.ArgumentParser(
+    prog='Lantern crawler',
+    description='Crawls .onion sites for links to more .onion sites')
+
+parser.add_argument('-l', '--limit',
+                    help='Page crawl limit per .onion link.', type=int, default=10)
+parser.add_argument('-o', '--output',
+                    help='Output CSV file for found .onion links', type=str, default='onion_crawler.csv')
+parser.add_argument('-c', '--crawler-file',
+                    help='Crawler CSV file to log .onion sites and the amount crawled', type=str, default='crawler.csv')
+parser.add_argument('verified_csv', help='Input file to read for .onion links to crawl', type=str, nargs='?', default='verified.csv')
+args = parser.parse_args()
+
+tor_host = os.getenv("TOR_HOST")
+tor_port = os.getenv("TOR_PORT")
+session = requests.session()
+session.proxies = {
+    'http': f'socks5h://{tor_host}:{tor_port}',
+    'https': f'socks5h://{tor_host}:{tor_port}'
+}
+# Also set a user agent for the crawler
+session.headers.update({'User-Agent': 'LanternCrawler'})
+
+def get_crawler_file():
+    try:
+        # try to read the CSV file
+        return pd.read_csv(args.crawler_file)
+    except FileNotFoundError:
+        # make a new empty crawler file
+        return pd.DataFrame(columns=["URL","Counter"])
+
+def get_output_file():
+    try:
+        return pd.read_csv(args.output)
+    except FileNotFoundError:
+        return pd.DataFrame(columns=["URL","Name"])
+
+# get list of .onion links from the verified.csv file
+verified_csv_file = pd.read_csv(args.verified_csv)
+crawler_file = get_crawler_file()
+output_file = get_output_file()
+vcsv_urls = []
+vcsv_hostnames = []
+crawled_urls = []
+
+class CrawlerResult:
+    def __init__(self, firstp_urls, thirdp_urls):
+        self.firstp_urls = firstp_urls
+        self.thirdp_urls = thirdp_urls
+
+def add_urls(urls):
+    global output_file
+    for url in urls:
+        parsed_url = urllib.parse.urlparse(url)
+        if (output_file['URL'] == url).any() or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
+            continue
+
+        # Get information about the URL
+        print_colors(f'[+] Querying {url} for information.')
+        try:
+            req = session.get(url)
+        except requests.exceptions.ConnectionError:
+            print_colors(f'[E] Dead 3rd party link: {url}. Adding anyway')
+            output_file.loc[-1] = [url, ""]
+            output_file.index += 1
+            output_file = output_file.sort_index()
+            continue
+
+        if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
+            output_file.loc[-1] = [url, ""]
+        else:
+            html = BeautifulSoup(req.text, features="lxml")
+            title = html.title.string if html.title is not None else ""
+            output_file.loc[-1] = [url, title]
+        output_file.index += 1
+        output_file = output_file.sort_index()
+
+def extract_urls_html(url, text):
+    html = BeautifulSoup(text, features="lxml")
+    hostname_url = urllib.parse.urlparse(url).hostname
+    result = CrawlerResult([], [])
+
+    for link in html.find_all('a'):
+        if not link.has_attr("href"):
+            print_colors('[E] Could not find link in element, skipping link')
+            continue
+
+        joined_url = urllib.parse.urljoin(url, link["href"])
+        jurl_parsed = urllib.parse.urlparse(joined_url)
+
+        print_colors(f'[D] Joined URL: {joined_url}')
+        # Check if the URL is a .onion link or not even a web link
+        if jurl_parsed.scheme != 'http':
+            continue
+        if jurl_parsed.hostname is None or not jurl_parsed.hostname.endswith('.onion'):
+            continue
+
+        print_colors(f'[+] Found url: {joined_url}')
+
+        # Check if the URL is a first-party link
+        if jurl_parsed.hostname == hostname_url:
+            if joined_url not in result.firstp_urls:
+                result.firstp_urls.append(joined_url)
+        else:
+            if joined_url not in result.thirdp_urls:
+                result.thirdp_urls.append(joined_url)
+    return result
+
+def extract_urls_txt(url, text):
+    hostname_url = urllib.parse.urlparse(url).hostname
+    result = CrawlerResult([], [])
+    url_regex = re.compile(r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.onion\b(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*)")
+
+    # TODO: It won't find first party URL here, since the regex doesn't capture the path.
+    for found_url in url_regex.findall(text):
+        if hostname_url != urllib.parse.urlparse(found_url).hostname:
+            if found_url not in result.thirdp_urls:
+                result.thirdp_urls.append(found_url)
+    return result
+
+def crawl_url(url):
+    global crawler_file
+    # List of urls to crawl.
+    urls = [url]
+    counter_row = crawler_file[crawler_file['URL'] == url]['Counter']
+    counter = int(counter_row.iloc[0]) if not counter_row.empty else 0
+
+    while counter < args.limit and len(urls) != 0:
+        cur_url = urls.pop()
+        cur_url_scheme = urllib.parse.urlparse(cur_url).scheme
+
+        if cur_url in crawled_urls:
+            # Skip already-crawled urls
+            print_colors(f'[D] {cur_url} has already been crawled. Skipping')
+            continue
+        elif cur_url_scheme != 'http':
+            print_colors(f'[D] Unknown scheme found, skipping: {cur_url}')
+            continue
+
+        print_colors(f'[+] Crawling {cur_url}')
+        try:
+            req = session.get(cur_url)
+        except requests.exceptions.ConnectionError:
+            print_colors(f'[E] Failed to connect to {cur_url}')
+            continue
+
+        crawled_urls.append(cur_url)
+        result = CrawlerResult([], [])
+
+        # Determine the type of response from the headers
+        if "Content-Type" not in req.headers:
+            print_colors('[+] No content type found, not extracting links.')
+        elif "text/plain" in req.headers["Content-Type"]:
+            result = extract_urls_txt(cur_url, req.text)
+        elif "text/html" in req.headers["Content-Type"]:
+            result = extract_urls_html(cur_url, req.text)
+        else:
+            print_colors(f'[+] Unknown content type encountered: {req.headers["Content-Type"]}')
+        urls += [x for x in result.firstp_urls if x not in urls]
+        add_urls(result.thirdp_urls)
+        counter += 1
+
+        # Refresh counter in CSV file
+        if not crawler_file[crawler_file['URL'] == url].empty:
+            crawler_file.loc[crawler_file['URL'] == url, 'Counter'] = counter
+        else:
+            crawler_file.loc[-1] = [url, counter]
+            crawler_file.index += 1
+            crawler_file = crawler_file.sort_index()
+
+for row in verified_csv_file.itertuples():
+    vcsv_urls.append(row.URL)
+    vcsv_hostnames.append(urllib.parse.urlparse(row.URL).hostname)
+
+for i, url in enumerate(vcsv_urls):
+    if not crawler_file[crawler_file['URL'] == url].empty:
+        print_colors(f'[+] {url} has already been crawled. Skipping')
+        continue
+
+    if vcsv_hostnames[i] is None or not vcsv_hostnames[i].endswith('.onion'):
+        print_colors(f'[+] Skipping over non-onion link in verified.csv file: {url}')
+        continue
+    crawl_url(url)
+    crawler_file.to_csv(args.crawler_file, index=False)
+    output_file.to_csv(args.output, index=False)
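
A minimal usage sketch for the new script, under a few assumptions that are not taken from the patch: a Tor SOCKS proxy is reachable at the host and port exported below (127.0.0.1:9050 is a placeholder), utils.print_colors is importable from the working directory, and the input CSV has a URL column. The flags match the argparse options defined in crawler.py:

    export TOR_HOST=127.0.0.1
    export TOR_PORT=9050
    python3 scripts/crawler.py verified.csv --limit 10 --output onion_crawler.csv --crawler-file crawler.csv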