From bf407302921e56004f186cac1556aae4653db834 Mon Sep 17 00:00:00 2001
From: cynthia
Date: Tue, 15 Apr 2025 00:34:57 +0100
Subject: [PATCH] make special case for simplex links

---
 scripts/crawler.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index 5ac1452..d5042e5 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -6,8 +6,13 @@
 import argparse
 import os
 import pwd
 import re
-from utils import print_colors, IsUrlValid
+# To have the ability to load the SimpleX module
+import sys
+sys.path.append("..")
+
+from utils import print_colors, IsUrlValid
+from SimpleX.regex_simplexlinks import IsSimpleXChatroomValid, IsSimpleXServerValid
 from dotenv import load_dotenv
 
 # Make default parameters for arguments
@@ -99,7 +104,17 @@ def add_urls(urls):
     global output_file
     for url in urls:
         parsed_url = urllib.parse.urlparse(url)
-        if (output_file['Hostname'] == parsed_url.hostname).any() or parsed_url.hostname is None or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
+        if IsSimpleXChatroomValid(url) and not (output_file['URL'] == url).any():
+            output_file.loc[-1] = ["", url, "SimpleX Chatroom"]
+            output_file.index += 1
+            output_file = output_file.sort_index()
+            continue
+        elif IsSimpleXServerValid(url) and not (output_file['URL'] == url).any():
+            output_file.loc[-1] = ["", url, "SimpleX Server"]
+            output_file.index += 1
+            output_file = output_file.sort_index()
+            continue
+        elif (output_file['Hostname'] == parsed_url.hostname).any() or parsed_url.hostname is None or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
             continue
 
         # Get information about the URL
@@ -144,6 +159,13 @@
         jurl_parsed = urllib.parse.urlparse(joined_url)
 
         print_colors(f'[D] Joined URL: {joined_url}')
+
+        # Capture SimpleX URLs
+        if IsSimpleXChatroomValid(joined_url) or IsSimpleXServerValid(joined_url):
+            if joined_url not in result.thirdp_urls:
+                result.thirdp_urls.append(joined_url)
+            continue
+
         # Check if the URL is a .onion link or not even a web link
         if not IsUrlValid(joined_url):
             continue
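
The patch imports IsSimpleXChatroomValid and IsSimpleXServerValid from SimpleX/regex_simplexlinks.py, a module that is not part of this diff. Below is a minimal sketch of what such helpers could look like, assuming they are plain regex checks for SimpleX invitation links and smp:// or xftp:// server addresses; the patterns, module layout, and test URLs are illustrative approximations, not the project's actual code.

import re

# Rough approximation of a SimpleX chatroom/contact invitation link
# (https://simplex.chat/contact#/?v=... or simplex:/contact#/?v=...).
# The real pattern lives in SimpleX/regex_simplexlinks.py (assumption).
SIMPLEX_CHATROOM_PATTERN = re.compile(
    r"(https?://(simplex\.chat|[a-z2-7]{56}\.onion)|simplex:)"
    r"/(contact|invitation)#/\?v=.+",
    re.IGNORECASE,
)

# Rough approximation of a SimpleX SMP/XFTP server address
# (smp://<fingerprint>@<host> or xftp://<fingerprint>@<host>).
SIMPLEX_SERVER_PATTERN = re.compile(
    r"(smp|xftp)://[A-Za-z0-9\-_+=]+(:[^@]+)?@[A-Za-z0-9.\-]+(,[a-z2-7]{56}\.onion)?",
    re.IGNORECASE,
)

def IsSimpleXChatroomValid(url: str) -> bool:
    """Return True if url looks like a SimpleX chatroom/contact invitation link."""
    return SIMPLEX_CHATROOM_PATTERN.match(url) is not None

def IsSimpleXServerValid(url: str) -> bool:
    """Return True if url looks like a SimpleX SMP or XFTP server address."""
    return SIMPLEX_SERVER_PATTERN.match(url) is not None

if __name__ == "__main__":
    # Made-up example links, for a quick smoke test only.
    print(IsSimpleXServerValid("smp://abcDEF123=@smp.example.com"))              # True
    print(IsSimpleXChatroomValid("https://simplex.chat/contact#/?v=2-7&smp=x"))  # True
    print(IsSimpleXChatroomValid("http://example.onion/index.html"))             # False

With helpers shaped like this, the crawler can classify a candidate link before its hostname check: chatroom links are stored as "SimpleX Chatroom", server addresses as "SimpleX Server", and everything else falls through to the existing .onion validation.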