From 013673fffbc51ca23a9468c21b02170c8128f724 Mon Sep 17 00:00:00 2001
From: cynthia <cynthia@noreply.localhost>
Date: Mon, 14 Apr 2025 23:26:46 +0100
Subject: [PATCH 1/3] fix simplex parsing

---
 SimpleX/regex_simplexlinks.py | 7 ++++---
 SimpleX/utils.py              | 8 +++++---
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/SimpleX/regex_simplexlinks.py b/SimpleX/regex_simplexlinks.py
index 5bdf3a6..3d69628 100644
--- a/SimpleX/regex_simplexlinks.py
+++ b/SimpleX/regex_simplexlinks.py
@@ -20,17 +20,18 @@ hostname_pattern = re.compile(r'^(?:[a-zA-Z0-9.-]+|[0-9]{1,3}(?:\.[0-9]{1,3}){3}
 def IsSimpleXChatroomValid(url: str) -> bool:
     """Validate the SimpleX chatroom URL."""
     REQUIRED_SUBSTRING = "contact#/?v=2-7&smp=smp%3A%2F"
+    REQUIRED_SUBSTRING2 = "contact/#/?v=2-7&smp=smp%3A%2F"
     
     # Step 1: Check if it starts with http://, https://, or simplex:/
     if url.startswith(('http://', 'https://', 'simplex:/')):
         # Step 1.5: If http:// or https://, check for valid clearnet or onion domain
-        if url.startswith(('http://', 'https://')):
-            return IsUrlValid(url)
+        if url.startswith(('http://', 'https://')) and not IsUrlValid(url):
+            return False
     elif not url.startswith('simplex:/'):
         return False  # Must start with one of the valid protocols
 
     # Step 2: Check for the presence of the required substring
-    if REQUIRED_SUBSTRING not in url:
+    if REQUIRED_SUBSTRING not in url and REQUIRED_SUBSTRING2 not in url:
         return False  # Required substring not found
 
     # Step 3: Extract the part after "smp=smp%3A%2F"
diff --git a/SimpleX/utils.py b/SimpleX/utils.py
index 152571b..3164b8e 100644
--- a/SimpleX/utils.py
+++ b/SimpleX/utils.py
@@ -6,6 +6,7 @@ def IsOnionValid(url: str)-> bool:
     """
     try:
         pattern = re.compile(r"^[A-Za-z0-9:/._%-=#?&@]+(.onion)$")
+        url_pattern = re.compile(r"^(\w+:)?(?://)?(\w+\.)?[a-z2-7]{56}\.onion")
         url = url.strip().removesuffix('/')
         if url.startswith('http://'):
             domain = url.split('/')[2]
@@ -22,14 +23,14 @@ def IsOnionValid(url: str)-> bool:
                 return False
         else:
                         #TODO : edit the url to make sure it has http:// at the beginning, in case if it's missing? (problem is that it only returns true or false)
-            if pattern.fullmatch(url) is not None:
+            if url_pattern.match(url) is not None:
                 if len(url.split('.')) > 3:
                     return False
                 else:
                     if len(url) < 62:
                         return False
                     return True
-            elif pattern.fullmatch(url) is None:
+            elif url_pattern.match(url) is None:
                 return False
             else:
                 return False
@@ -41,10 +42,11 @@ def IsUrlValid(url:str)->bool:
         Check if url is valid both dark net end clearnet.
         """
         pattern = re.compile(r"^[A-Za-z0-9:/._%-=#?&@]+$")
+        onion_pattern = re.compile(r"^(\w+:)?(?://)?(\w+\.)?[a-z2-7]{56}\.onion")
         url = str(url)
         if len(url) < 4:
                 return False
-        if url.endswith('.onion'):
+        if onion_pattern.match(url) is not None:
                 return IsOnionValid(url)
         else:
                 if not url.__contains__('.'):

From ca4d949175dc638c2c5750faccb705d195dedd79 Mon Sep 17 00:00:00 2001
From: cynthia <cynthia@noreply.localhost>
Date: Tue, 15 Apr 2025 00:34:42 +0100
Subject: [PATCH 2/3] fix even more parsing related stuff, including namespace
 conflicts

---
 SimpleX/regex_simplexlinks.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/SimpleX/regex_simplexlinks.py b/SimpleX/regex_simplexlinks.py
index 3d69628..67247b2 100644
--- a/SimpleX/regex_simplexlinks.py
+++ b/SimpleX/regex_simplexlinks.py
@@ -1,5 +1,5 @@
 import re
-from utils import IsUrlValid
+from SimpleX.utils import IsUrlValid
 import urllib.parse
 
 	#simplex:/contact#/?v=2-7&smp=smp%3A%2F%2FBD4qkVq8lJUgjHt0kUaxeQBYsKaxDejeecxm6-2vOwI%3D%40 b6geeakpwskovltbesvy3b6ah3ewxfmnhnshojndmpp7wcv2df7bnead.onion %2FOReK0M4-3C5NeZyQx_yFuTHSknVVS-3h%23%2F%3Fv%3D1-3%26dh%3DMCowBQYDK2VuAyEANi5VHx-Q1mIKmgZEg2ls47NGSlntttvcgLLbfKBpym4%253D&data=%7B%22groupLinkId%22%3A%22ndniy85i4DjITgVhB-MXnQ%3D%3D%22%7D
@@ -19,8 +19,7 @@ hostname_pattern = re.compile(r'^(?:[a-zA-Z0-9.-]+|[0-9]{1,3}(?:\.[0-9]{1,3}){3}
 
 def IsSimpleXChatroomValid(url: str) -> bool:
     """Validate the SimpleX chatroom URL."""
-    REQUIRED_SUBSTRING = "contact#/?v=2-7&smp=smp%3A%2F"
-    REQUIRED_SUBSTRING2 = "contact/#/?v=2-7&smp=smp%3A%2F"
+    REQUIRED_SUBSTRING = "#/?v=2-7&smp=smp%3A%2F"
     
     # Step 1: Check if it starts with http://, https://, or simplex:/
     if url.startswith(('http://', 'https://', 'simplex:/')):
@@ -31,7 +30,7 @@ def IsSimpleXChatroomValid(url: str) -> bool:
         return False  # Must start with one of the valid protocols
 
     # Step 2: Check for the presence of the required substring
-    if REQUIRED_SUBSTRING not in url and REQUIRED_SUBSTRING2 not in url:
+    if REQUIRED_SUBSTRING not in url:
         return False  # Required substring not found
 
     # Step 3: Extract the part after "smp=smp%3A%2F"

From bf407302921e56004f186cac1556aae4653db834 Mon Sep 17 00:00:00 2001
From: cynthia <cynthia@noreply.localhost>
Date: Tue, 15 Apr 2025 00:34:57 +0100
Subject: [PATCH 3/3] make special case for simplex links

---
 scripts/crawler.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/scripts/crawler.py b/scripts/crawler.py
index 5ac1452..d5042e5 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -6,8 +6,13 @@ import argparse
 import os
 import pwd
 import re
-from utils import print_colors, IsUrlValid
 
+# Make the parent directory of this script importable (regardless of the CWD) so the SimpleX package loads
+import sys
+sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
+
+from utils import print_colors, IsUrlValid
+from SimpleX.regex_simplexlinks import IsSimpleXChatroomValid, IsSimpleXServerValid
 from dotenv import load_dotenv
 
 # Make default parameters for arguments
@@ -99,7 +104,17 @@ def add_urls(urls):
     global output_file
     for url in urls:
         parsed_url = urllib.parse.urlparse(url)
-        if (output_file['Hostname'] == parsed_url.hostname).any() or parsed_url.hostname is None or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
+        if IsSimpleXChatroomValid(url) and not (output_file['URL'] == url).any():
+            output_file.loc[-1] = ["", url, "SimpleX Chatroom"]
+            output_file.index += 1
+            output_file = output_file.sort_index()
+            continue
+        elif IsSimpleXServerValid(url) and not (output_file['URL'] == url).any():
+            output_file.loc[-1] = ["", url, "SimpleX Server"]
+            output_file.index += 1
+            output_file = output_file.sort_index()
+            continue
+        elif (output_file['Hostname'] == parsed_url.hostname).any() or parsed_url.hostname is None or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
             continue
 
         # Get information about the URL
@@ -144,6 +159,13 @@ def extract_urls_html(url, text):
         jurl_parsed = urllib.parse.urlparse(joined_url)
 
         print_colors(f'[D] Joined URL: {joined_url}')
+
+        # Capture SimpleX chatroom/server links; de-duplicate on the joined URL that is actually stored
+        if IsSimpleXChatroomValid(joined_url) or IsSimpleXServerValid(joined_url):
+            if joined_url not in result.thirdp_urls:
+                result.thirdp_urls.append(joined_url)
+            continue
+
         # Check if the URL is a .onion link or not even a web link
         if not IsUrlValid(joined_url):
             continue