[untested] RecognizeURLType and IsURLValid functions are working

This commit is contained in:
oxeo0 2025-05-30 00:21:20 +02:00
parent 08697f5c40
commit c9a2fbcfdd
5 changed files with 179 additions and 320 deletions

View file

@ -11,8 +11,10 @@ import re
import sys
sys.path.append("..")
from utils import print_colors, IsUrlValid
from SimpleX.regex_simplexlinks import IsSimpleXChatroomValid, IsSimpleXServerValid
from utils import (
print_colors, IsURLValid, IsSimplexChatroomValid, RecognizeURLType
)
#from SimpleX.regex_simplexlinks import IsSimpleXChatroomValid, IsSimpleXServerValid
from dotenv import load_dotenv
# Make default parameters for arguments
@ -107,12 +109,12 @@ def add_urls(urls):
global output_file
for url in urls:
parsed_url = urllib.parse.urlparse(url)
if IsSimpleXChatroomValid(url) and not (output_file['URL'] == url).any():
if IsSimplexChatroomValid(url) and not (output_file['URL'] == url).any():
output_file.loc[-1] = ["", url, "", "SimpleX Chatroom"]
output_file.index += 1
output_file = output_file.sort_index()
continue
elif IsSimpleXServerValid(url) and not (output_file['URL'] == url).any():
elif RecognizeURLType(url) in ('smp', 'xftp') and not (output_file['URL'] == url).any():
output_file.loc[-1] = ["", url, "", "SimpleX Server"]
output_file.index += 1
output_file = output_file.sort_index()
@ -164,13 +166,13 @@ def extract_urls_html(url, text):
print_colors(f'[D] Joined URL: {joined_url}')
# Capture SimpleX URLs
if IsSimpleXChatroomValid(joined_url) or IsSimpleXServerValid(joined_url):
if RecognizeURLType(joined_url) in ('smp', 'xftp', 'chatroom'):
if url not in result.thirdp_urls:
result.thirdp_urls.append(joined_url)
continue
# Skip URLs that fail validation (not a .onion link, or not a web link at all)
if not IsUrlValid(joined_url):
if not IsURLValid(joined_url):
continue
print_colors(f'[+] Found url: {joined_url}')
@ -266,4 +268,3 @@ for i, url in enumerate(vcsv_urls):
crawl_url(url)
crawler_file.to_csv(args.crawler_file, index=False)
output_file.to_csv(args.output, index=False)