mirror of
http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/darknet-lantern.git
synced 2025-05-17 04:36:57 +00:00
Merge pull request 'crawler simplex' (#53) from cynthia/darknet-lantern:main into main
Reviewed-on: http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/darknet-lantern/pulls/53
This commit is contained in:
commit
600663f2fc
3 changed files with 33 additions and 9 deletions
|
@ -1,5 +1,5 @@
|
||||||
import re
|
import re
|
||||||
from utils import IsUrlValid
|
from SimpleX.utils import IsUrlValid
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
|
|
||||||
#simplex:/contact#/?v=2-7&smp=smp%3A%2F%2FBD4qkVq8lJUgjHt0kUaxeQBYsKaxDejeecxm6-2vOwI%3D%40 b6geeakpwskovltbesvy3b6ah3ewxfmnhnshojndmpp7wcv2df7bnead.onion %2FOReK0M4-3C5NeZyQx_yFuTHSknVVS-3h%23%2F%3Fv%3D1-3%26dh%3DMCowBQYDK2VuAyEANi5VHx-Q1mIKmgZEg2ls47NGSlntttvcgLLbfKBpym4%253D&data=%7B%22groupLinkId%22%3A%22ndniy85i4DjITgVhB-MXnQ%3D%3D%22%7D
|
#simplex:/contact#/?v=2-7&smp=smp%3A%2F%2FBD4qkVq8lJUgjHt0kUaxeQBYsKaxDejeecxm6-2vOwI%3D%40 b6geeakpwskovltbesvy3b6ah3ewxfmnhnshojndmpp7wcv2df7bnead.onion %2FOReK0M4-3C5NeZyQx_yFuTHSknVVS-3h%23%2F%3Fv%3D1-3%26dh%3DMCowBQYDK2VuAyEANi5VHx-Q1mIKmgZEg2ls47NGSlntttvcgLLbfKBpym4%253D&data=%7B%22groupLinkId%22%3A%22ndniy85i4DjITgVhB-MXnQ%3D%3D%22%7D
|
||||||
|
@ -19,13 +19,13 @@ hostname_pattern = re.compile(r'^(?:[a-zA-Z0-9.-]+|[0-9]{1,3}(?:\.[0-9]{1,3}){3}
|
||||||
|
|
||||||
def IsSimpleXChatroomValid(url: str) -> bool:
|
def IsSimpleXChatroomValid(url: str) -> bool:
|
||||||
"""Validate the SimpleX chatroom URL."""
|
"""Validate the SimpleX chatroom URL."""
|
||||||
REQUIRED_SUBSTRING = "contact#/?v=2-7&smp=smp%3A%2F"
|
REQUIRED_SUBSTRING = "#/?v=2-7&smp=smp%3A%2F"
|
||||||
|
|
||||||
# Step 1: Check if it starts with http://, https://, or simplex:/
|
# Step 1: Check if it starts with http://, https://, or simplex:/
|
||||||
if url.startswith(('http://', 'https://', 'simplex:/')):
|
if url.startswith(('http://', 'https://', 'simplex:/')):
|
||||||
# Step 1.5: If http:// or https://, check for valid clearnet or onion domain
|
# Step 1.5: If http:// or https://, check for valid clearnet or onion domain
|
||||||
if url.startswith(('http://', 'https://')):
|
if url.startswith(('http://', 'https://')) and not IsUrlValid(url):
|
||||||
return IsUrlValid(url)
|
return False
|
||||||
elif not url.startswith('simplex:/'):
|
elif not url.startswith('simplex:/'):
|
||||||
return False # Must start with one of the valid protocols
|
return False # Must start with one of the valid protocols
|
||||||
|
|
||||||
|
|
|
@ -6,6 +6,7 @@ def IsOnionValid(url: str)-> bool:
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
pattern = re.compile(r"^[A-Za-z0-9:/._%-=#?&@]+(.onion)$")
|
pattern = re.compile(r"^[A-Za-z0-9:/._%-=#?&@]+(.onion)$")
|
||||||
|
url_pattern = re.compile(r"^(\w+:)?(?://)?(\w+\.)?[a-z2-7]{56}\.onion")
|
||||||
url = url.strip().removesuffix('/')
|
url = url.strip().removesuffix('/')
|
||||||
if url.startswith('http://'):
|
if url.startswith('http://'):
|
||||||
domain = url.split('/')[2]
|
domain = url.split('/')[2]
|
||||||
|
@ -22,14 +23,14 @@ def IsOnionValid(url: str)-> bool:
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
#TODO : edit the url to make sure it has http:// at the beginning, in case it's missing? (problem is that this function only returns true or false)
|
#TODO : edit the url to make sure it has http:// at the beginning, in case it's missing? (problem is that this function only returns true or false)
|
||||||
if pattern.fullmatch(url) is not None:
|
if url_pattern.match(url) is not None:
|
||||||
if len(url.split('.')) > 3:
|
if len(url.split('.')) > 3:
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
if len(url) < 62:
|
if len(url) < 62:
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
elif pattern.fullmatch(url) is None:
|
elif url_pattern.match(url) is None:
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
@ -41,10 +42,11 @@ def IsUrlValid(url:str)->bool:
|
||||||
Check if url is valid for both darknet and clearnet.
|
Check if url is valid for both darknet and clearnet.
|
||||||
"""
|
"""
|
||||||
pattern = re.compile(r"^[A-Za-z0-9:/._%-=#?&@]+$")
|
pattern = re.compile(r"^[A-Za-z0-9:/._%-=#?&@]+$")
|
||||||
|
onion_pattern = re.compile(r"^(\w+:)?(?://)?(\w+\.)?[a-z2-7]{56}\.onion")
|
||||||
url = str(url)
|
url = str(url)
|
||||||
if len(url) < 4:
|
if len(url) < 4:
|
||||||
return False
|
return False
|
||||||
if url.endswith('.onion'):
|
if onion_pattern.match(url) is not None:
|
||||||
return IsOnionValid(url)
|
return IsOnionValid(url)
|
||||||
else:
|
else:
|
||||||
if not url.__contains__('.'):
|
if not url.__contains__('.'):
|
||||||
|
|
|
@ -6,8 +6,13 @@ import argparse
|
||||||
import os
|
import os
|
||||||
import pwd
|
import pwd
|
||||||
import re
|
import re
|
||||||
from utils import print_colors, IsUrlValid
|
|
||||||
|
|
||||||
|
# To have the ability to load the SimpleX module
|
||||||
|
import sys
|
||||||
|
sys.path.append("..")
|
||||||
|
|
||||||
|
from utils import print_colors, IsUrlValid
|
||||||
|
from SimpleX.regex_simplexlinks import IsSimpleXChatroomValid, IsSimpleXServerValid
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
# Make default parameters for arguments
|
# Make default parameters for arguments
|
||||||
|
@ -99,7 +104,17 @@ def add_urls(urls):
|
||||||
global output_file
|
global output_file
|
||||||
for url in urls:
|
for url in urls:
|
||||||
parsed_url = urllib.parse.urlparse(url)
|
parsed_url = urllib.parse.urlparse(url)
|
||||||
if (output_file['Hostname'] == parsed_url.hostname).any() or parsed_url.hostname is None or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
|
if IsSimpleXChatroomValid(url) and not (output_file['URL'] == url).any():
|
||||||
|
output_file.loc[-1] = ["", url, "SimpleX Chatroom"]
|
||||||
|
output_file.index += 1
|
||||||
|
output_file = output_file.sort_index()
|
||||||
|
continue
|
||||||
|
elif IsSimpleXServerValid(url) and not (output_file['URL'] == url).any():
|
||||||
|
output_file.loc[-1] = ["", url, "SimpleX Server"]
|
||||||
|
output_file.index += 1
|
||||||
|
output_file = output_file.sort_index()
|
||||||
|
continue
|
||||||
|
elif (output_file['Hostname'] == parsed_url.hostname).any() or parsed_url.hostname is None or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Get information about the URL
|
# Get information about the URL
|
||||||
|
@ -144,6 +159,13 @@ def extract_urls_html(url, text):
|
||||||
jurl_parsed = urllib.parse.urlparse(joined_url)
|
jurl_parsed = urllib.parse.urlparse(joined_url)
|
||||||
|
|
||||||
print_colors(f'[D] Joined URL: {joined_url}')
|
print_colors(f'[D] Joined URL: {joined_url}')
|
||||||
|
|
||||||
|
# Capture SimpleX URLs
|
||||||
|
if IsSimpleXChatroomValid(joined_url) or IsSimpleXServerValid(joined_url):
|
||||||
|
if url not in result.thirdp_urls:
|
||||||
|
result.thirdp_urls.append(joined_url)
|
||||||
|
continue
|
||||||
|
|
||||||
# Check if the URL is a .onion link or not even a web link
|
# Check if the URL is a .onion link or not even a web link
|
||||||
if not IsUrlValid(joined_url):
|
if not IsUrlValid(joined_url):
|
||||||
continue
|
continue
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue