# Mirror of
# http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/darknet-lantern.git
# Synced 2025-05-17 12:46:58 +00:00
# 255 lines, 11 KiB, Python
import csv, json, pandas as pd, glob, re
|
|
|
|
def main():
    """Sanity-check unverified.csv and rewrite it in place.

    Reads verified.csv, unverified.csv, blacklist.csv and sensitive.csv from
    the current working directory, then for every row of unverified.csv:

    1. marks the row for deletion if any of its Instance, Category, Name,
       URL, Sensitive, Description, Status or Score cells fails its
       validator;
    2. marks the row for deletion if any cell contains a word from the
       'blacklisted-words' column of blacklist.csv;
    3. sets the Sensitive column to '✔️' if any cell contains a word from
       the 'sensitive-words' column of sensitive.csv.

    Marked rows are then dropped and unverified.csv is written back once.

    Returns:
        True once unverified.csv has been rewritten.
    """
    vcsvfile = "verified.csv"
    # NOTE(review): vdf is never used below. The read is kept so that a
    # missing or unreadable verified.csv still aborts the run as before;
    # confirm whether that is intended.
    vdf = pd.read_csv(vcsvfile)
    unverifiedcsvfile = "unverified.csv"
    uvdf = pd.read_csv(unverifiedcsvfile)
    blcsvfile = "blacklist.csv"
    bldf = pd.read_csv(blcsvfile)
    secsvfile = "sensitive.csv"
    sedf = pd.read_csv(secsvfile)

    # Extract the word lists once. NaN entries are dropped because
    # `nan in "some text"` raises TypeError.
    blwords = bldf['blacklisted-words'].dropna().astype(str).tolist()
    sewords = sedf['sensitive-words'].dropna().astype(str).tolist()

    # (validator, column) pairs, evaluated lazily in this order.
    validators = (
        (IsUrlValid, 'Instance'),
        (IsCategoryValid, 'Category'),
        (IsNameValid, 'Name'),
        (IsUrlValid, 'URL'),
        (IsStatusValid, 'Sensitive'),
        (IsDescriptionValid, 'Description'),
        (IsStatusValid, 'Status'),
        (IsScoreValid, 'Score'),
    )

    ########### SANITY CHECKS ON UNVERIFIED.CSV ##################
    rows2delete = []
    for i in uvdf.index:
        row = uvdf.loc[i, :].values.tolist()
        cells = [str(x) for x in row]

        ### SANITY CHECK 1: mark rows with incorrect formatting for deletion ###
        # `not result` (rather than `result is False`) so that a validator
        # returning None is treated as a failed check instead of a pass.
        if any(not check(uvdf.at[i, column]) for check, column in validators):
            if i not in rows2delete:
                print("Marking row", i,"for deletion, as it has invalid inputs")
                rows2delete.append(i)

        ### SANITY CHECK 2: mark rows matching a blacklisted word for deletion ###
        if i not in rows2delete and any(
            word in cell for word in blwords for cell in cells
        ):
            print("Marking row", i,"for deletion, as it matches with a blacklisted word")
            rows2delete.append(i)

        ### SANITY CHECK 3: flag rows matching a sensitive word ###
        if any(word in cell for word in sewords for cell in cells):
            if uvdf.at[i, 'Sensitive'] != '✔️':
                print("Marking row", i,"as sensitive, as it matches with a sensitive word")
                uvdf.at[i, 'Sensitive'] = '✔️'

    print('[-] Rows to delete: ',rows2delete)

    for i in rows2delete:
        row = uvdf.loc[i, :].values.tolist()
        print('[+] REMOVING ROW :',i,row)
        uvdf.drop(i, inplace=True)

    # Written exactly once, after all removals, so that sensitive flags set
    # above are persisted even when no row had to be deleted.
    uvdf.to_csv(unverifiedcsvfile, index=False)

    return True
|
|
##############################################################################
|
|
|
|
|
|
#### PROTECTIONS AGAINST MALICIOUS CSV INPUTS ####
|
|
|
|
def IsOnionValid(url: str) -> bool:
    """Check whether url is a syntactically valid onion domain.

    Surrounding whitespace and one trailing '/' are ignored. If the value
    starts with 'http://', only the host part is checked. The host must:

    * consist only of [A-Za-z0-9.] and end in '.onion';
    * have at most one subdomain ('url.onion' or 'subdomain.url.onion');
    * be at least 62 characters long (presumably a 56-character v3 address
      plus '.onion' -- confirm).

    Args:
        url: the domain or http:// URL to check. Non-string values are
            rejected.

    Returns:
        True if every check passes, False otherwise. Never raises and never
        returns None, so callers testing `is False` see failures correctly.
    """
    if not isinstance(url, str):
        return False

    url = url.strip().removesuffix('/')

    # 'http://host/...' splits into ['http:', '', 'host', ...].
    domain = url.split('/')[2] if url.startswith('http://') else url

    # The '.onion' suffix is mandatory; the character class alone would
    # accept any alphanumeric string.
    if re.fullmatch(r"[A-Za-z0-9.]+\.onion", domain) is None:
        return False

    # More than three dot-separated labels means more than one subdomain.
    if len(domain.split('.')) > 3:
        return False

    return len(domain) >= 62
|
|
|
|
|
|
|
|
def IsUrlValid(url: str) -> bool:
    """Check whether url is acceptable as a darknet (.onion) or clearnet URL.

    Values ending in '.onion' are delegated to IsOnionValid. Any other value
    must contain at least one '.' and consist only of the characters
    [A-Za-z0-9:/.].

    Args:
        url: the URL to check. Non-string values (for example the NaN that
            pandas yields for an empty CSV cell) are rejected instead of
            raising AttributeError.

    Returns:
        True if the URL passes the checks, False otherwise.
    """
    if not isinstance(url, str):
        return False

    if url.endswith('.onion'):
        # Normalise to a strict bool so a non-bool result from the onion
        # check can never be mistaken for "valid" by an `is False` test.
        return IsOnionValid(url) is True

    if '.' not in url:
        return False

    return re.fullmatch(r"[A-Za-z0-9:/.]+", url) is not None
|
|
|
|
def IsStatusValid(status: str) -> bool:
    """Check that status is one of the accepted status markers.

    The value is converted with str() and stripped of surrounding
    whitespace, then compared against the accepted markers: 'y', 'n', '✔️',
    '❌', the empty string, and 'nan' (the text form of the NaN pandas
    yields for an empty CSV cell).

    Args:
        status: the cell value to check; any type is accepted.

    Returns:
        True if the stripped text is an accepted marker, False otherwise.
    """
    allowed = ('y', 'n', '✔️', '❌', '', 'nan')
    # The stripped value must be used for the comparison; calling strip()
    # without keeping its result leaves the padding in place.
    return str(status).strip() in allowed
|
|
|
|
def IsScoreValid(score: str) -> bool:
    """Check that score is empty or a short numeric string.

    The value is converted with str() and stripped of surrounding
    whitespace. An empty score is accepted, as is 'nan' (the text form of
    the NaN pandas yields for an empty CSV cell). Otherwise the text must
    consist only of the characters [0-9.,] and be at most 8 characters long.

    Args:
        score: the cell value to check; any type is accepted.

    Returns:
        True if the score is empty or well-formed, False otherwise.
    """
    text = str(score).strip()

    # Empty score is fine.
    if text in ('', 'nan'):
        return True

    if re.fullmatch(r"[0-9.,]+", text) is None:
        return False

    return len(text) <= 8
|
|
|
|
|
|
def IsDescriptionValid(desc: str) -> bool:
    """Check that a description is empty or short, plain text.

    The description is optional: None, NaN (what pandas yields for an empty
    CSV cell) and empty or whitespace-only text are accepted. Otherwise the
    stripped text must not be the placeholder "DEFAULT", must be at most 256
    characters long, and must consist only of [A-Za-z0-9-.,' ].

    Args:
        desc: the cell value to check.

    Returns:
        True if the description is empty or well-formed, False otherwise.
    """
    # NaN is the only float that is not equal to itself.
    if desc is None or (isinstance(desc, float) and desc != desc):
        return True

    text = str(desc).strip()

    # Empty description is fine as it's optional.
    if not text:
        return True

    if text == "DEFAULT":
        return False

    if len(text) > 256:
        return False

    # '-' is placed last in the class so it is unambiguously a literal.
    return re.fullmatch(r"[A-Za-z0-9.,' -]+", text) is not None
|
|
|
|
def IsCategoryValid(categories: list) -> bool:
    """Check that every category is short, plain alphanumeric text.

    Each category, stripped of surrounding whitespace, must be non-empty,
    consist only of [A-Za-z0-9 ] and be at most 64 characters long.

    Args:
        categories: a list (or tuple) of category strings. A single string
            is also accepted and treated as one category, rather than being
            iterated character by character.

    Returns:
        True if there is at least one category and all of them are valid,
        False otherwise (including empty or non-list/non-string input such
        as NaN or None).
    """
    if isinstance(categories, str):
        categories = [categories]
    elif not isinstance(categories, (list, tuple)):
        return False

    # An empty list has nothing valid in it.
    if not categories:
        return False

    pattern = re.compile(r"[A-Za-z0-9 ]+")
    for category in categories:
        if not isinstance(category, str):
            return False
        category = category.strip()
        if pattern.fullmatch(category) is None:
            return False
        if len(category) > 64:
            return False

    # Reached only when every category passed.
    return True
|
|
|
|
def IsNameValid(name: str) -> bool:
    """Check that name is short, plain alphanumeric text.

    After stripping surrounding whitespace the name must be non-empty,
    consist only of [A-Za-z0-9 ] and be at most 64 characters long.

    Args:
        name: the value to check. Non-string values (for example the NaN
            that pandas yields for an empty CSV cell) are rejected instead
            of raising AttributeError.

    Returns:
        True if the name is valid, False otherwise.
    """
    if not isinstance(name, str):
        return False

    name = name.strip()

    if len(name) > 64:
        return False

    # The '+' quantifier also rejects the empty string.
    return re.fullmatch(r"[A-Za-z0-9 ]+", name) is not None
|
|
|
|
|
|
|
|
# Run the sanity checks only when this file is executed as a script.
if __name__ == "__main__":
    main()
|