# Lantern crawler: crawls verified .onion sites over Tor and records newly
# discovered third-party .onion links into a CSV file.
from bs4 import BeautifulSoup
import pandas as pd
import requests
import urllib.parse
import argparse
import os
import pwd
import re

# To have the ability to load the SimpleX module
import sys
sys.path.append("..")
from utils import print_colors, IsUrlValid
from SimpleX.regex_simplexlinks import IsSimpleXChatroomValid, IsSimpleXServerValid
from dotenv import load_dotenv

# Make default parameters for arguments
rootpath = '/srv/darknet-lantern/'
urlpath = pwd.getpwuid(os.getuid()).pw_dir + "/.darknet_participant_url"
instance = ""
if os.path.isfile(urlpath):
    with open(urlpath) as f:
        instance = f.read().rstrip()

instancepath = rootpath + 'www/participants/' + instance
verifiedcsvfile = instancepath + '/verified.csv'
blcsvfile = instancepath + '/blacklist.csv'
crawlerdir = rootpath + '/crawler'
if not os.path.exists(crawlerdir):
    os.makedirs(crawlerdir)

parser = argparse.ArgumentParser(
    prog='Lantern crawler',
    description='Crawls .onion sites for links to more .onion sites')
parser.add_argument('-l', '--limit',
                    help='Page crawl limit per .onion link.',
                    type=int, default=10)
parser.add_argument('-o', '--output',
                    help='Output CSV file for found .onion links',
                    type=str, default=os.path.join(crawlerdir, 'onion_crawler.csv'))
parser.add_argument('-c', '--crawler-file',
                    help='Crawler CSV file to log .onion sites and the amount crawled',
                    type=str, default=os.path.join(crawlerdir, 'crawler.csv'))
parser.add_argument('-b', '--blacklist-file',
                    help='Blacklist CSV file to filter out sites with forbidden words in them',
                    type=str, default=blcsvfile)
parser.add_argument('-V', '--verified-file',
                    help='Input file to read for .onion links to crawl',
                    type=str, default=verifiedcsvfile)
args = parser.parse_args()

script_abs_path = os.path.dirname(os.path.abspath(__file__))
env_path = os.path.join(script_abs_path, ".env")
default_env_path = os.path.join(script_abs_path, ".env.sample")

if os.path.exists(env_path):
    load_dotenv(dotenv_path=env_path)
else:
    load_dotenv(dotenv_path=default_env_path)

tor_host = os.getenv("TOR_HOST")
tor_port = os.getenv("TOR_PORT")

# Route all requests through the Tor proxy configured in the .env file
session = requests.session()
session.proxies = {
    'http': f'{tor_host}:{tor_port}',
    'https': f'{tor_host}:{tor_port}'
}
# Set user agent too for the crawler
session.headers.update({'User-Agent': 'LanternCrawler'})


def get_crawler_file():
    try:
        # try to read the CSV file
        return pd.read_csv(args.crawler_file)
    except FileNotFoundError:
        # make a new empty crawler file
        return pd.DataFrame(columns=["URL", "Counter"])


def get_output_file():
    try:
        return pd.read_csv(args.output)
    except FileNotFoundError:
        return pd.DataFrame(columns=["Hostname", "URL", "Name"])


def get_blacklist_file():
    try:
        return pd.read_csv(args.blacklist_file)
    except FileNotFoundError:
        return None


# get list of .onion links from the verified.csv file
verified_csv_file = pd.read_csv(args.verified_file)
crawler_file = get_crawler_file()
output_file = get_output_file()
blacklist_file = get_blacklist_file()
vcsv_urls = []
vcsv_hostnames = []
crawled_urls = []


# Links found on a crawled page, split into first-party and third-party URLs
class CrawlerResult:
    def __init__(self, firstp_urls, thirdp_urls):
        self.firstp_urls = firstp_urls
        self.thirdp_urls = thirdp_urls


# Add newly discovered third-party URLs to the output file, skipping known
# hostnames, non-onion links and pages whose title contains a blacklisted word
def add_urls(urls):
    global output_file
    for url in urls:
        parsed_url = urllib.parse.urlparse(url)
        if IsSimpleXChatroomValid(url) and not (output_file['URL'] == url).any():
            output_file.loc[-1] = ["", url, "SimpleX Chatroom"]
            output_file.index += 1
            output_file = output_file.sort_index()
            continue
        elif IsSimpleXServerValid(url) and not (output_file['URL'] == url).any():
            output_file.loc[-1] = ["", url, "SimpleX Server"]
            output_file.index += 1
            output_file = output_file.sort_index()
            continue
        elif ((output_file['Hostname'] == parsed_url.hostname).any()
              or parsed_url.hostname is None
              or parsed_url.hostname in vcsv_hostnames
              or not parsed_url.hostname.endswith(".onion")):
            continue

        # Get information about the URL
        print_colors(f'[+] Querying {url} for information.')
        try:
            req = session.get(url)
        except requests.exceptions.ConnectionError:
            print_colors(f'[E] Dead 3rd party link: {url}. Adding anyway')
            output_file.loc[-1] = [parsed_url.hostname, url, ""]
            output_file.index += 1
            output_file = output_file.sort_index()
            continue

        if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
            output_file.loc[-1] = [parsed_url.hostname, url, ""]
        else:
            forbidden = False
            html = BeautifulSoup(req.text, features="lxml")
            title = (html.title.string or "") if html.title is not None else ""
            if blacklist_file is not None:
                for row in blacklist_file.itertuples():
                    if row[1].lower() in title.lower():
                        print_colors('[+] Forbidden word found. Rejecting.')
                        forbidden = True
            if forbidden:
                continue
            output_file.loc[-1] = [parsed_url.hostname, url, repr(title)[1:-1]]
        output_file.index += 1
        output_file = output_file.sort_index()


# Extract first-party and third-party links from an HTML page
def extract_urls_html(url, text):
    html = BeautifulSoup(text, features="lxml")
    hostname_url = urllib.parse.urlparse(url).hostname
    result = CrawlerResult([], [])
    for link in html.find_all('a'):
        if not link.has_attr("href"):
            print_colors('[E] Could not find link in element, Skipping link')
            continue
        joined_url = urllib.parse.urljoin(url, link["href"])
        jurl_parsed = urllib.parse.urlparse(joined_url)
        print_colors(f'[D] Joined URL: {joined_url}')

        # Capture SimpleX URLs
        if IsSimpleXChatroomValid(joined_url) or IsSimpleXServerValid(joined_url):
            if joined_url not in result.thirdp_urls:
                result.thirdp_urls.append(joined_url)
            continue

        # Check if the URL is a .onion link or not even a web link
        if not IsUrlValid(joined_url):
            continue

        print_colors(f'[+] Found url: {joined_url}')

        # Check if the URL is a first-party link
        if jurl_parsed.hostname == hostname_url:
            if joined_url not in result.firstp_urls:
                result.firstp_urls.append(joined_url)
        else:
            if joined_url not in result.thirdp_urls:
                result.thirdp_urls.append(joined_url)
    return result


# Extract third-party .onion links from a plain-text page
def extract_urls_txt(url, text):
    hostname_url = urllib.parse.urlparse(url).hostname
    result = CrawlerResult([], [])
    url_regex = re.compile(r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.onion\b(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*)")

    # TODO: It won't find first party URL here, since the regex doesn't capture the path.
    for found_url in url_regex.findall(text):
        if hostname_url != urllib.parse.urlparse(found_url).hostname:
            if found_url not in result.thirdp_urls:
                result.thirdp_urls.append(found_url)
    return result


# Crawl a verified .onion site up to the configured page limit and record
# any third-party links it points to
def crawl_url(url):
    global crawler_file
    # List of urls to crawl.
    urls = [url]
    # Resume from a previously stored crawl counter for this URL, if any
    stored_counter = crawler_file[crawler_file['URL'] == url]['Counter']
    counter = int(stored_counter.iloc[0]) if not stored_counter.empty else 0

    while counter < args.limit and len(urls) != 0:
        cur_url = urls.pop()
        cur_url_scheme = urllib.parse.urlparse(cur_url).scheme

        if cur_url in crawled_urls:
            # Skip already-crawled urls
            print_colors(f'[D] {cur_url} has already been crawled. Skipping')
            continue
        elif cur_url_scheme != 'http':
            print_colors(f'[D] Unknown scheme found, Skipping: {cur_url}')
            continue

        print_colors(f'[+] Crawling {cur_url}')
        try:
            req = session.get(cur_url)
        except requests.exceptions.ConnectionError:
            print_colors(f'[E] Failed to connect to {cur_url}')
            continue

        crawled_urls.append(cur_url)
        result = CrawlerResult([], [])

        # Determine the type of response from the headers
        if "Content-Type" not in req.headers:
            print_colors('[+] No content type found, Not extracting links.')
        elif "text/plain" in req.headers["Content-Type"]:
            result = extract_urls_txt(cur_url, req.text)
        elif "text/html" in req.headers["Content-Type"]:
            result = extract_urls_html(cur_url, req.text)
        else:
            print_colors(f'[+] Unknown content type encountered: {req.headers["Content-Type"]}')

        # Queue unseen first-party links for crawling and record third-party links
        for found_url in result.firstp_urls:
            if found_url not in urls:
                urls.append(found_url)
        add_urls(result.thirdp_urls)
        counter += 1

    # Refresh counter in CSV file
    if not crawler_file[crawler_file['URL'] == url].empty:
        crawler_file.loc[crawler_file['URL'] == url, 'Counter'] = counter
    else:
        crawler_file.loc[-1] = [url, counter]
        crawler_file.index += 1
        crawler_file = crawler_file.sort_index()


# Normalize the verified.csv URLs and remember their hostnames so the crawler
# does not re-add links that are already listed there
for row in verified_csv_file.itertuples():
    if urllib.parse.urlparse(row.URL).scheme:
        vcsv_urls.append(row.URL)
        vcsv_hostnames.append(urllib.parse.urlparse(row.URL).hostname)
    else:
        url = "http://" + row.URL
        vcsv_urls.append(url)
        vcsv_hostnames.append(urllib.parse.urlparse(url).hostname)

for i, url in enumerate(vcsv_urls):
    if not crawler_file[crawler_file['URL'] == url].empty:
        print_colors(f'[+] {url} has already been crawled. Skipping')
        continue
    if vcsv_hostnames[i] is None or not vcsv_hostnames[i].endswith('.onion'):
        print_colors(f'[+] Skipping over non-onion link in verified.csv file: {url}')
        continue
    crawl_url(url)

crawler_file.to_csv(args.crawler_file, index=False)
output_file.to_csv(args.output, index=False)
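
# Example invocation (a sketch, not part of the script): the filename crawler.py
# and the flag values below are assumptions for illustration, and TOR_HOST/TOR_PORT
# are expected to be defined in .env (e.g. pointing at a local Tor SOCKS proxy):
#
#   python3 crawler.py --limit 5 --output /srv/darknet-lantern/crawler/onion_crawler.csv
#
# Discovered third-party links accumulate in the output CSV, while crawler.csv
# keeps a per-URL count of how many pages have been crawled so far.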