[untested] RecognizeURLType and IsURLValid functions are working

This commit is contained in:
oxeo0 2025-05-30 00:21:20 +02:00
parent 08697f5c40
commit c9a2fbcfdd
5 changed files with 179 additions and 320 deletions

View file

@ -18,28 +18,108 @@ RESET = '\033[m'
# name should contain only up to 64 alphanumeric characters
VALID_NAME_PATTERN = re.compile(r"^[A-Za-z0-9]{1,64}$")
# pattern for regular urls
# TODO: this is very simplified pattern
# NOTE(review): inside the character class "%-=" is parsed as a range
# (every character from "%" up to "="), not as three literal characters;
# confirm whether that is intended.
URL_PATTERN = re.compile(r"^[A-Za-z0-9:\/\._%-=#?&@]+$")
# pattern for regular urls (https://stackoverflow.com/a/3809435)
# NOTE: this pattern carries no ^/$ anchors, so .match() only pins the start
# of the string and tolerates trailing text.
CLEARNET_URL_PATTERN = re.compile(
r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]"
r"{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
)
# pattern for onion urls (56 bytes of base32 alphabet + .onion)
# NOTE(review): ONION_URL_PATTERN is assigned twice in a row; the second
# assignment (scheme required, anchored with ^...$) is the one left in effect.
ONION_URL_PATTERN = re.compile(r"^(\w+:)?(?:\/\/)?(\w+\.)?[a-z2-7]{56}\.onion")
ONION_URL_PATTERN = re.compile(
r"^https?:\/\/([a-zA-Z0-9-]+\.)*[a-z2-7-]{56}\.onion[^\s]*$"
)
# pattern for simplex chatroom links
# Accepts either an http(s) host or the "simplex:" scheme, followed by
# /contact or /invitation and a "#/?v=...&smp=..." fragment.
# NOTE: no ^/$ anchors here either.
SIMPLEX_CHATROOM_PATTERN = re.compile(
r"(?:https?:\/\/(?:simplex\.chat|[^\/]+)|simplex:)\/(?:contact|invitation)#\/\?v=[\d-]+"
r"&smp=[^&]+(?:&[^=]+=[^&]*)*(?:&data=\{[^}]*\})?"
)
def print_colors(s:str=' ', bold:bool=False, is_error:bool = False, default:bool=False):
# pattern for smp or xftp simplex server ((smp|xftp):// 44 byte key @ url [:port])
# Groups: 1 = protocol, 2 = key, 3 = host, 4 = host when it is not an onion.
# The whole port alternation sits behind the ":" so every branch requires it,
# and the branches together cover exactly 1-65535.
# The host group is matched once: repeating it added no extra accepted
# strings but allowed exponential backtracking on malformed input.
SIMPLEX_SERVER_PATTERN = re.compile(
    r"^(smp|xftp):\/\/([a-zA-Z0-9\-_+=]{44})@([a-z2-7]{56}\.onion|"
    r"([a-zA-Z0-9\-\.]+\.[a-zA-Z0-9\-\.]+))"
    r"(?::(?:[1-9][0-9]{0,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|"
    r"65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5]))?$"
)
def IsSimplexChatroomValid(url: str) -> bool:
    """
    Recognizes Simplex Chatroom link.

    The URL is tested against SIMPLEX_CHATROOM_PATTERN with ``match``,
    i.e. the pattern has to match starting at the first character.

    Returns True if URL is a SimpleX chatroom,
    False otherwise
    """
    # bool() so callers get a real boolean, as annotated, rather than a
    # re.Match object or None
    return bool(SIMPLEX_CHATROOM_PATTERN.match(url))
def RecognizeSimplexType(url: str) -> str:
    """
    Recognizes Simplex Server URL, returns smp, xftp or invalid

    The protocol is taken from the first capture group of
    SIMPLEX_SERVER_PATTERN; any URL the pattern rejects yields 'invalid'.
    """
    match = SIMPLEX_SERVER_PATTERN.match(url)
    if match is None:
        return 'invalid'
    # group 1 is the protocol prefix captured by the pattern
    return match.group(1)
# stub function
def IsXFTPServerValid(url: str) -> bool:
    """
    Tells whether URL is recognized as a SimpleX XFTP server.

    Delegates to RecognizeSimplexType and compares its answer with 'xftp'.
    """
    server_kind = RecognizeSimplexType(url)
    return server_kind == 'xftp'
# stub function
def IsSMPServerValid(url: str) -> bool:
    """
    Tells whether URL is recognized as a SimpleX SMP server.

    Delegates to RecognizeSimplexType and compares its answer with 'smp'.
    """
    server_kind = RecognizeSimplexType(url)
    return server_kind == 'smp'
def IsClearnetLinkValid(url: str) -> bool:
    """
    Returns True if URL is a valid clearnet URL
    False otherwise

    The URL is tested against CLEARNET_URL_PATTERN with ``match``, so the
    pattern has to match starting at the first character of the string.
    """
    # bool() so callers get a real boolean, as annotated, rather than a
    # re.Match object or None
    return bool(CLEARNET_URL_PATTERN.match(url))
def IsOnionLinkValid(url: str) -> bool:
    """
    Returns True if URL is a valid onion URL
    False otherwise

    The URL is tested against ONION_URL_PATTERN with ``match``.
    """
    # bool() so callers get a real boolean, as annotated, rather than a
    # re.Match object or None
    return bool(ONION_URL_PATTERN.match(url))
def RecognizeURLType(url: str) -> str:
    """
    Classifies a URL and returns one of these labels:
    - chatroom - SimpleX chatroom
    - xftp - XFTP SimpleX server
    - smp - SMP SimpleX server
    - onion - onion URL
    - clearnet - valid clearnet url
    - invalid - none of the above (probably invalid)
    """
    # The checkers run top to bottom and the first one that accepts the URL
    # decides the label, so the order matters
    # (ex. simplex chatroom is also valid clearnet link)
    ordered_checks = (
        ('chatroom', IsSimplexChatroomValid),
        ('xftp', IsXFTPServerValid),
        ('smp', IsSMPServerValid),
        ('onion', IsOnionLinkValid),
        ('clearnet', IsClearnetLinkValid),
    )
    for label, accepts in ordered_checks:
        if accepts(url):
            return label
    return 'invalid'
def IsURLValid(url: str) -> bool:
    """
    Tells whether RecognizeURLType assigns the URL any type other
    than 'invalid'.
    """
    url_kind = RecognizeURLType(url)
    return not (url_kind == 'invalid')
#### Checking Functions to validate that links are legit ####
@ -54,7 +134,7 @@ def CheckUrl(url):
}
try:
status = requests.get(url, proxies=proxies, timeout=5).status_code
return bool(status == 200)
return status == 200
except requests.ConnectionError:
return False
except requests.exceptions.ReadTimeout:
@ -82,131 +162,13 @@ def IsBannerValid(path: str) -> bool:
return True
def IsOnionValid(url: str) -> bool:
    """
    Checks if the domain(param) is a valid onion domain and return True else False.

    When the URL does not start with http:// or https://, it is stripped,
    loses one trailing '/', and gets 'http://' prepended, so the domain can
    always be taken as the third '/'-separated component.
    Any exception raised while processing the URL results in False.
    """
    try:
        # make sure the protocol is there
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url.strip().removesuffix('/')

        domain = url.split('/')[2]

        if ONION_URL_PATTERN.fullmatch(domain):
            parts_count = len(domain.split('.'))
            # TODO: we probably don't really need to check 62 char length
            # regex does that beforehand
            return (len(domain) == 62) and (parts_count <= 3)
    except Exception:
        return False
    # the domain did not match the onion pattern: answer with an explicit
    # False instead of falling off the end and returning None
    return False
def IsSimpleXChatroomValid(url: str) -> bool:
    """
    Validate the SimpleX chatroom URL.

    A URL is accepted when all of the following hold:
    - it starts with 'simplex:/', or with http:// / https:// and
      RecognizeUrlOnionClear does not classify it as 'invalid';
    - it contains the literal '#/?v=2-7&smp=smp%3A%2F';
    - the URL-decoded value following 'smp=smp%3A%2F' (up to the next '&'
      or the end of the string) contains an '@', and the part after the
      first '@' is not classified as 'invalid' by RecognizeUrlOnionClear.

    Returns True if every check passes, False otherwise.
    """
    REQUIRED_SUBSTRING = "#/?v=2-7&smp=smp%3A%2F"
    SMP_MARKER = "smp=smp%3A%2F"

    # Step 1: the URL must use one of the supported schemes; http(s) links
    # additionally need a recognizable clearnet or onion address
    if url.startswith(('http://', 'https://')):
        if RecognizeUrlOnionClear(url) == 'invalid':
            return False
    elif not url.startswith('simplex:/'):
        return False  # Must start with one of the valid protocols

    # Step 2: Check for the presence of the required substring
    # (this also guarantees that SMP_MARKER and "%2F" occur in the URL)
    if REQUIRED_SUBSTRING not in url:
        return False  # Required substring not found

    # Step 3: Extract the part after "smp=smp%3A%2F"
    smp_start = url.find(SMP_MARKER) + len(SMP_MARKER)
    smp_end = url.find("&", smp_start)
    if smp_end == -1:
        smp_end = len(url)  # Take until the end if no "&" is found

    smp_value = urllib.parse.unquote(url[smp_start:smp_end])  # Decode the URL-encoded string

    # Step 3.5: Check if the smp_value contains a valid hostname
    if '@' not in smp_value:
        return False  # Must contain '@' to separate fingerprint and hostname

    _fingerprint, hostname = smp_value.split('@', 1)
    # the link is valid only when the hostname is recognized
    return RecognizeUrlOnionClear(hostname) != 'invalid'
def RecognizeUrlOnionClear(url: str) -> str:
    """
    Classifies the URL as 'onion', 'clearnet' or 'invalid'.
    """
    # cheap rejections first: too short, contains ';' or has no dot at all
    too_short = len(url) < 4
    if too_short or ';' in url or '.' not in url:
        return 'invalid'

    # '.onion' in the text is only a hint; IsOnionValid does the real check
    if '.onion' in url and IsOnionValid(url):
        return 'onion'

    # anything that is not a valid onion still gets a chance as clearnet
    return 'clearnet' if URL_PATTERN.fullmatch(url) else 'invalid'
def RecognizeUrlFull(url: str) -> str:
    """
    Classifies the URL as 'chatroom', 'xftp', 'smp', 'onion', 'clearnet'
    or 'invalid'.
    Relies on RecognizeUrlOnionClear for http:// and https:// URLs.
    """
    if IsSimpleXChatroomValid(url):
        return 'chatroom'

    if url.startswith(('http://', 'https://')):
        return RecognizeUrlOnionClear(url)

    # SimpleX servers: the scheme decides the label, but only once the
    # server address itself validates
    for scheme in ('xftp', 'smp'):
        if url.startswith(scheme + '://') and IsSimpleXServerValid(url):
            return scheme

    return 'invalid'
#def IsUrlValid(url:str)->bool:
# """
# Check if url is valid for both darknet and clearnet.
# """
# pattern = re.compile("^[A-Za-z0-9:/.-]+$")
# url = str(url)
# if len(url) < 4:
# return False
# if url.endswith('.onion'):
# return IsOnionValid(url)
# else:
# if not url.__contains__('.'):
# return False
# if pattern.fullmatch(url) is None:
# return False
# return True
def IsStatusValid(status: str) -> bool:
"""
Checks if status contains only ['YES','NO']. Verbose only if False is returned
"""
pattern = ['YES','NO','✔️','','']
pattern = ['YES','NO','']
status = status.strip()
if status not in pattern:
return False
@ -230,7 +192,7 @@ def IsScoreValid(score: str) -> bool:
return True
def IsDescriptionValid(desc:str)->bool:
def IsDescriptionValid(desc: str) -> bool:
"""
Check the categories are only [a-zA-Z0-9.' ] with 256 max chars.
"""
@ -263,40 +225,6 @@ def IsCategoryValid(categories: list[str]) -> bool:
return True
def IsSimpleXServerValid(url: str) -> bool:
    """
    Validate a SimpleX server address of the form
    (smp|xftp)://<fingerprint>@<hostname>[,<second hostname>].

    Checks performed after stripping surrounding whitespace:
    - the URL starts with smp:// or xftp://;
    - the remainder contains exactly one '@';
    - the fingerprint is exactly 44 characters from [0-9A-Za-z-_+=];
    - the hostname before any comma is not classified as 'invalid' by
      RecognizeUrlOnionClear;
    - when a comma is present, the entry after the first comma is not
      classified as 'invalid' either.

    Returns True if every check passes. Any exception raised during the
    checks is printed and treated as False.
    """
    # fullmatch with a fixed length: a '*' pattern used with match() accepts
    # every string and therefore validates nothing
    fingerprint_pattern = re.compile(r'[0-9A-Za-z\-_+=]{44}')
    url = url.strip()
    try:
        if not url.startswith(('smp://', 'xftp://')):
            return False

        # Remove the protocol part
        proless = url.split('//', 1)[-1]
        # Split the fingerprint and hostname
        parts = proless.split('@')
        if len(parts) != 2:
            return False  # Must have exactly one '@' character

        fingerprint = parts[0]
        hostname = parts[1].split(',')[0]  # Get the hostname before any comma

        # Check fingerprint length and alphabet
        if not fingerprint_pattern.fullmatch(fingerprint):
            return False

        # Validate the hostname
        if RecognizeUrlOnionClear(hostname) == 'invalid':
            return False

        # An optional second address after a comma must be valid as well
        if ',' in proless:
            onion_part = proless.split(',')[1].strip()
            if RecognizeUrlOnionClear(onion_part) == 'invalid':
                return False

        return True
    except Exception as e:
        print(e)
        # Any error will be a false
        return False
def IsNameValid(name: str) -> bool:
"""
Check the parameter name only contains [a-zA-Z0-9] and is 64 chars long.
@ -325,3 +253,19 @@ def send_server_checks(url: str) -> tuple[str, str, str]:
failed_response = response['resp'].get('testFailure')
return (response, resp_type, failed_response)
def print_colors(s:str=' ', bold:bool=False, is_error:bool = False, default:bool=False):
    """
    Helper function to print with colors

    Colour selection, first match wins:
    - is_error and bold -> BOLD_RED
    - is_error          -> RED
    - bold              -> BOLD_PURPLE
    - default           -> no colour codes at all
    - otherwise         -> PURPLE
    Every coloured output is terminated with RESET.
    """
    # the combined case has to be tested before the single flags,
    # otherwise the plain is_error branch always wins and it is never reached
    if is_error and bold:
        print(f"{BOLD_RED}{s}{RESET}")
    elif is_error:
        print(f"{RED}{s}{RESET}")
    elif bold:
        print(f"{BOLD_PURPLE}{s}{RESET}")
    elif default:
        print(f'{s}')
    else:
        print(f"{PURPLE}{s}{RESET}")