From 8ef1bb0910055d4fa0f813e591523cf199a54b5f Mon Sep 17 00:00:00 2001
From: cynthia
Date: Fri, 18 Apr 2025 10:04:19 +0100
Subject: [PATCH] categorize simplex links and tor onion links

---
 scripts/crawler.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index d5042e5..c7446ea 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -27,8 +27,6 @@ instancepath=rootpath+'www/participants/'+instance
 verifiedcsvfile=instancepath+'/verified.csv'
 blcsvfile=instancepath+'/blacklist.csv'
 crawlerdir=rootpath+'/crawler'
-if not os.path.exists(crawlerdir):
-    os.makedirs(crawlerdir)
 
 parser = argparse.ArgumentParser(
     prog='Lantern crawler',
@@ -45,8 +43,13 @@ parser.add_argument('-c', '--crawler-file',
 parser.add_argument('-b', '--blacklist-file', help='Blacklist CSV files to filter out sites with forbidden words in them',
     type=str, default=blcsvfile)
 parser.add_argument('-V', '--verified-file', help='Input file to read for .onion links to crawl', type=str, default=verifiedcsvfile)
+parser.add_argument('-n', '--no-create-crawler-dir', help='Don\'t create the crawler directory automatically.', action='store_true')
 args = parser.parse_args()
 
+if not args.no_create_crawler_dir:
+    if not os.path.exists(crawlerdir):
+        os.makedirs(crawlerdir)
+
 script_abs_path = os.path.dirname(os.path.abspath(__file__))
 env_path = os.path.join(script_abs_path+"/.env")
 default_env_path = os.path.join(script_abs_path+"/.env.sample")
@@ -78,7 +81,7 @@ def get_output_file():
     try:
         return pd.read_csv(args.output)
     except FileNotFoundError:
-        return pd.DataFrame(columns=["Hostname","URL","Name"])
+        return pd.DataFrame(columns=["Hostname","URL","Name", "Category"])
 
 def get_blacklist_file():
     try:
@@ -105,12 +108,12 @@ def add_urls(urls):
     for url in urls:
         parsed_url = urllib.parse.urlparse(url)
         if IsSimpleXChatroomValid(url) and not (output_file['URL'] == url).any():
-            output_file.loc[-1] = ["", url, "SimpleX Chatroom"]
+            output_file.loc[-1] = ["", url, "", "SimpleX Chatroom"]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
         elif IsSimpleXServerValid(url) and not (output_file['URL'] == url).any():
-            output_file.loc[-1] = ["", url, "SimpleX Server"]
+            output_file.loc[-1] = ["", url, "", "SimpleX Server"]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
@@ -123,13 +126,13 @@
             req = session.get(url)
         except requests.exceptions.ConnectionError:
             print_colors(f'[E] Dead 3rd party link: {url}. Adding anyway')
-            output_file.loc[-1] = [parsed_url.hostname, url, ""]
+            output_file.loc[-1] = [parsed_url.hostname, url, "", "Tor Hidden Service"]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
 
         if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
-            output_file.loc[-1] = [parsed_url.hostname, url, ""]
+            output_file.loc[-1] = [parsed_url.hostname, url, "", "Tor Hidden Service"]
         else:
             forbidden = False
             html = BeautifulSoup(req.text, features="lxml")
@@ -141,7 +144,7 @@
                     forbidden = True
             if forbidden:
                 continue
-            output_file.loc[-1] = [parsed_url.hostname, url, repr(title)[1:-1]]
+            output_file.loc[-1] = [parsed_url.hostname, url, repr(title)[1:-1], "Tor Hidden Service"]
             output_file.index += 1
             output_file = output_file.sort_index()
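
Reviewer note, not part of the patch: below is a minimal, self-contained sketch of the row layout this change introduces, i.e. the four-column ["Hostname", "URL", "Name", "Category"] frame and the loc[-1] / index += 1 / sort_index() prepend idiom that add_urls() uses. The categorize() helper and the sample URLs are hypothetical stand-ins for the script's IsSimpleXChatroomValid / IsSimpleXServerValid checks and the HTTP fetch; only pandas is assumed.

import urllib.parse

import pandas as pd


def categorize(url):
    # Hypothetical stand-in for IsSimpleXChatroomValid / IsSimpleXServerValid;
    # the real checks live in the crawler's helpers, not in this sketch.
    if "#/?v=" in url and "smp=" in url:
        return "SimpleX Chatroom"
    if url.startswith("smp://"):
        return "SimpleX Server"
    return "Tor Hidden Service"


# Same four columns the patch adds to get_output_file().
output_file = pd.DataFrame(columns=["Hostname", "URL", "Name", "Category"])

for url in ["smp://example-fingerprint@example.onion", "http://example.onion/"]:
    parsed_url = urllib.parse.urlparse(url)
    category = categorize(url)
    # SimpleX rows leave Hostname empty, as in the patched add_urls().
    hostname = parsed_url.hostname if category == "Tor Hidden Service" else ""
    # Prepend idiom used throughout crawler.py: write at index -1, shift, re-sort.
    output_file.loc[-1] = [hostname, url, "", category]
    output_file.index += 1
    output_file = output_file.sort_index()

print(output_file)

With every row carrying an explicit Category, consumers of the crawler's CSV can filter SimpleX chatrooms and servers from Tor hidden services without inferring the type from the URL shape.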