diff --git a/SimpleX/regex_simplexlinks.py b/SimpleX/regex_simplexlinks.py
index 5bdf3a6..67247b2 100644
--- a/SimpleX/regex_simplexlinks.py
+++ b/SimpleX/regex_simplexlinks.py
@@ -1,5 +1,5 @@
 import re
-from utils import IsUrlValid
+from SimpleX.utils import IsUrlValid
 import urllib.parse
 
 #simplex:/contact#/?v=2-7&smp=smp%3A%2F%2FBD4qkVq8lJUgjHt0kUaxeQBYsKaxDejeecxm6-2vOwI%3D%40b6geeakpwskovltbesvy3b6ah3ewxfmnhnshojndmpp7wcv2df7bnead.onion%2FOReK0M4-3C5NeZyQx_yFuTHSknVVS-3h%23%2F%3Fv%3D1-3%26dh%3DMCowBQYDK2VuAyEANi5VHx-Q1mIKmgZEg2ls47NGSlntttvcgLLbfKBpym4%253D&data=%7B%22groupLinkId%22%3A%22ndniy85i4DjITgVhB-MXnQ%3D%3D%22%7D
@@ -19,13 +19,13 @@ hostname_pattern = re.compile(r'^(?:[a-zA-Z0-9.-]+|[0-9]{1,3}(?:\.[0-9]{1,3}){3}
 
 def IsSimpleXChatroomValid(url: str) -> bool:
     """Validate the SimpleX chatroom URL."""
-    REQUIRED_SUBSTRING = "contact#/?v=2-7&smp=smp%3A%2F"
+    REQUIRED_SUBSTRING = "#/?v=2-7&smp=smp%3A%2F"
 
     # Step 1: Check if it starts with http://, https://, or simplex:/
     if url.startswith(('http://', 'https://', 'simplex:/')):
         # Step 1.5: If http:// or https://, check for valid clearnet or onion domain
-        if url.startswith(('http://', 'https://')):
-            return IsUrlValid(url)
+        if url.startswith(('http://', 'https://')) and not IsUrlValid(url):
+            return False
     elif not url.startswith('simplex:/'):
         return False  # Must start with one of the valid protocols
 
diff --git a/SimpleX/utils.py b/SimpleX/utils.py
index 152571b..3164b8e 100644
--- a/SimpleX/utils.py
+++ b/SimpleX/utils.py
@@ -6,6 +6,7 @@ def IsOnionValid(url: str)-> bool:
     """
     try:
         pattern = re.compile(r"^[A-Za-z0-9:/._%-=#?&@]+(.onion)$")
+        url_pattern = re.compile(r"^(\w+:)?(?://)?(\w+\.)?[a-z2-7]{56}\.onion")
         url = url.strip().removesuffix('/')
         if url.startswith('http://'):
             domain = url.split('/')[2]
@@ -22,14 +23,14 @@ def IsOnionValid(url: str)-> bool:
             return False
         else:
             #TODO : edit the url to make sure it has http:// at the beginning, in case if it's missing? (problem is that it only returns true or false)
-            if pattern.fullmatch(url) is not None:
+            if url_pattern.match(url) is not None:
                 if len(url.split('.')) > 3:
                     return False
                 else:
                     if len(url) < 62:
                         return False
                     return True
-            elif pattern.fullmatch(url) is None:
+            elif url_pattern.match(url) is None:
                 return False
             else:
                 return False
@@ -41,10 +42,11 @@ def IsUrlValid(url:str)->bool:
     Check if url is valid both dark net end clearnet.
     """
     pattern = re.compile(r"^[A-Za-z0-9:/._%-=#?&@]+$")
+    onion_pattern = re.compile(r"^(\w+:)?(?://)?(\w+\.)?[a-z2-7]{56}\.onion")
     url = str(url)
     if len(url) < 4:
         return False
-    if url.endswith('.onion'):
+    if onion_pattern.match(url) is not None:
         return IsOnionValid(url)
     else:
         if not url.__contains__('.'):
diff --git a/scripts/crawler.py b/scripts/crawler.py
index 5ac1452..d5042e5 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -6,8 +6,13 @@ import argparse
 import os
 import pwd
 import re
-from utils import print_colors, IsUrlValid
+# To have the ability to load the SimpleX module
+import sys
+sys.path.append("..")
+
+from utils import print_colors, IsUrlValid
+from SimpleX.regex_simplexlinks import IsSimpleXChatroomValid, IsSimpleXServerValid
 from dotenv import load_dotenv
 
 # Make default parameters for arguments
 
@@ -99,7 +104,17 @@ def add_urls(urls):
     global output_file
     for url in urls:
         parsed_url = urllib.parse.urlparse(url)
-        if (output_file['Hostname'] == parsed_url.hostname).any() or parsed_url.hostname is None or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
+        if IsSimpleXChatroomValid(url) and not (output_file['URL'] == url).any():
+            output_file.loc[-1] = ["", url, "SimpleX Chatroom"]
+            output_file.index += 1
+            output_file = output_file.sort_index()
+            continue
+        elif IsSimpleXServerValid(url) and not (output_file['URL'] == url).any():
+            output_file.loc[-1] = ["", url, "SimpleX Server"]
+            output_file.index += 1
+            output_file = output_file.sort_index()
+            continue
+        elif (output_file['Hostname'] == parsed_url.hostname).any() or parsed_url.hostname is None or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
             continue
 
         # Get information about the URL
@@ -144,6 +159,13 @@ def extract_urls_html(url, text):
                 jurl_parsed = urllib.parse.urlparse(joined_url)
 
                 print_colors(f'[D] Joined URL: {joined_url}')
+
+                # Capture SimpleX URLs
+                if IsSimpleXChatroomValid(joined_url) or IsSimpleXServerValid(joined_url):
+                    if joined_url not in result.thirdp_urls:
+                        result.thirdp_urls.append(joined_url)
+                    continue
+
                 # Check if the URL is a .onion link or not even a web link
                 if not IsUrlValid(joined_url):
                     continue