WIP still, but good progress on option 4

root 2025-01-09 22:12:19 +01:00
parent d891df79a2
commit 9d81e7a779
10 changed files with 420 additions and 107 deletions


@@ -29,6 +29,8 @@ def main():
templatepath=rootpath+'templates/'
verifiedcsvfile=instancepath+'/verified.csv'
unverifiedcsvfile=instancepath+'/unverified.csv'
blcsvfile=instancepath+'/blacklist.csv'
secsvfile=instancepath+'/sensitive.csv'
# check if instancepath exists, if not then create the directory
if not os.path.exists(instancepath):
os.makedirs(instancepath)
@@ -43,6 +45,8 @@ def main():
# now that they exist, get vdf and uvdf
vdf = pd.read_csv(verifiedcsvfile)
uvdf = pd.read_csv(unverifiedcsvfile)
bldf = pd.read_csv(blcsvfile)
sedf = pd.read_csv(secsvfile)
print("[+] file exists, your Webring URL is", instance)
isitvalid = "y"
else:
@@ -226,8 +230,8 @@ Managing Wordlists:
webring_participant_url = input("What is the onion domain of the new webring participant? (ex: uptime.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion) ")
# check if the directory exists locally or not,
participantdir=rootpath+'www/participants/'+webring_participant_url
#if not os.path.isdir(participantdir): # to test on your own instance
if os.path.isdir(participantdir):
if not os.path.isdir(participantdir): # to test on your own instance
#if os.path.isdir(participantdir):
# if it does, it means that the webring is ALREADY added
print("[-] Webring Participant is already listed, skipping.")
return False
@@ -298,15 +302,63 @@ Managing Wordlists:
print("[+] file written, let's read it")
f = open(csvfilepath,"r")
print(f.read())
# TODO and remove all of the invalid entries !!!
#######################################################################
#newrow=[instance,category,name,url,sensi,desc,'','']
#print("[+] NEWROW=",newrow)
#uvdf.loc[-1] = newrow # adding a row
#uvdf.index = uvdf.index + 1 # shifting index
#uvdf = uvdf.sort_index() # sorting by index
#print("[+] New row added! now writing the csv file:")
#uvdf.to_csv(unverifiedcsvfile, index=False)
########### PERFORM SANITY CHECKS ON the webring participant's verified.csv and unverified.csv ##################
for w in ['verified.csv','unverified.csv']:
csvfilepath=participantdir+'/'+w
csvdf = pd.read_csv(csvfilepath)
#print(bldf[['blacklisted-words']])
bldf[['blacklisted-words']].iterrows()
rows2delete= [] # it is an empty list at first
for i,j in csvdf.iterrows():
#print("[+] Unverified.csv ROW=",i, uvdf.at[i, 'Instance'], uvdf.at[i, 'Category'], uvdf.at[i, 'Name'], uvdf.at[i, 'URL'], uvdf.at[i, 'Description'])
#print("[+] Unverified.csv ROW=",i, uvdf.iloc[[i]])
#row=uvdf.iloc[[i]] #it displays the index
row=csvdf.loc[i,:].values.tolist()
#print(i,row)
### SANITY CHECK 1: Mark all the rows that have incorrect formatting for deletion###
#print("[+] ROW=",i,"ROW CONTENTS=", IsUrlValid(uvdf.at[i, 'Instance']), IsCategoryValid(uvdf.at[i, 'Category']), IsNameValid(uvdf.at[i, 'Name']), IsUrlValid(uvdf.at[i, 'URL']), IsStatusValid(uvdf.at[i, 'Sensitive']), IsDescriptionValid(uvdf.at[i, 'Description']), IsStatusValid(uvdf.at[i, 'Status']), IsScoreValid(uvdf.at[i, 'Score']))
if IsUrlValid(csvdf.at[i, 'Instance']) is False or IsCategoryValid(csvdf.at[i, 'Category']) is False or IsNameValid(csvdf.at[i, 'Name']) is False or IsUrlValid(csvdf.at[i, 'URL']) is False or IsStatusValid(csvdf.at[i, 'Sensitive']) is False or IsDescriptionValid(csvdf.at[i, 'Description']) is False or IsStatusValid(csvdf.at[i, 'Status']) is False or IsScoreValid(csvdf.at[i, 'Score']) is False:
#mark the row for deletion as it has invalid inputs
if i not in rows2delete:
print("Marking row", i,"for deletion, as it has invalid inputs")
rows2delete.append(i) #mark the row for deletion if not already done
### SANITY CHECK 2: Mark all rows that are not allowed (blacklist) for deletion ###
for k,l in bldf.iterrows():
#print("[+] Blacklisted word=",k, bldf.at[k, 'blacklisted-words'])
blword=bldf.at[k, 'blacklisted-words']
if any(blword in str(x) for x in row):
#print("found blacklisted word! marking row for deletion")
if i not in rows2delete:
print("Marking row", i,"for deletion, as it matches with a blacklisted word")
rows2delete.append(i) #mark the row for deletion if not already done
### SANITY CHECK 3: Mark all the rows that are supposed to be sensitive ###
for k,l in sedf.iterrows():
#print("[+] Sensitive word=",k, sedf.at[k, 'sensitive-words'])
seword=sedf.at[k, 'sensitive-words']
if any(seword in str(x) for x in row):
if csvdf.at[i, 'Sensitive'] != '✔️':
print("Marking row", i,"as sensitive, as it matches with a sensitive word")
csvdf.at[i, 'Sensitive']='✔️'
print('[-] Rows to delete: ',rows2delete)
# TODO : MAKE SURE IT WORKS IN PROD
for i in rows2delete:
row=csvdf.loc[i,:].values.tolist()
print('[+] REMOVING ROW :',i,row)
csvdf.drop(i, inplace= True)
csvdf.to_csv(csvfilepath, index=False)
##############################################
case "5":
@@ -375,6 +427,8 @@ Managing Wordlists:
# TODO print("do you want to 1) add words or 2) remove words ?")
# TODO display the contents of blacklist.csv file
# TODO CASE 10 : cleanup all duplicates in unverified + verified.csv, based on the url (check if each url appears more than once, and if they do, remove them + write to csv file)
case _:
print("[-] Exiting")
return True
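The case 10 TODO above describes removing duplicate URLs from verified.csv and unverified.csv. A minimal sketch of that cleanup with pandas, assuming the URL column and the verifiedcsvfile/unverifiedcsvfile paths defined earlier in main():

import pandas as pd

# hypothetical duplicate cleanup for the case 10 TODO:
# keep the first occurrence of each URL, drop the rest, then write the file back
for csvfile in [verifiedcsvfile, unverifiedcsvfile]:
    df = pd.read_csv(csvfile)
    df = df.drop_duplicates(subset='URL', keep='first')
    df.to_csv(csvfile, index=False)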
@@ -420,26 +474,26 @@ def IsOnionValid(url: str)-> bool:
pattern = re.compile("^[A-Za-z0-9.]+(\.onion)?$")
url = url.strip().removesuffix('/')
if url.startswith('http://'):
print('URL starts with http')
#print('URL starts with http')
# Removes the http://
domain = url.split('/')[2]
if pattern.fullmatch(domain) is not None:
if len(domain.split('.')) > 3:
n_subdomains = len(domain.split('.'))
# Checks if there is more than 1 subdomain. "subdomain.url.onion" only
print(f"This domain has more than one subdomain. There are {n_subdomains} subdomains")
#print(f"This domain has more than one subdomain. There are {n_subdomains} subdomains")
return False
else:
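# note: a v3 onion address is 56 base32 characters plus ".onion", i.e. 62 characters in total, hence the length check below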
if len(domain) < 62:
print("Domain length is less than 62.")
#print("Domain length is less than 62.")
return False
return True
elif pattern.fullmatch(domain) is None:
print("Domain contains invalid character.")
print(domain)
#print("Domain contains invalid character.")
#print(domain)
return False
else:
print("Domain not valid")
#print("Domain not valid")
return False
else:
#print("URL doesn't start http")
@@ -447,25 +501,23 @@ def IsOnionValid(url: str)-> bool:
if len(url.split('.')) > 3:
n_subdomains = len(url.split('.'))
# Checks if there is more than 1 subdomain. "subdomain.url.onion" only
print(f"This domain has more than one subdomain. There are {n_subdomains - 1} subdomains")
#print(f"This domain has more than one subdomain. There are {n_subdomains - 1} subdomains")
return False
else:
if len(url) < 62:
print("Domain length is less than 62.")
#print("Domain length is less than 62.")
return False
return True
elif pattern.fullmatch(url) is None:
print("Domain contains invalid character.")
print(url)
#print("Domain contains invalid character.")
#print(url)
return False
else:
print("Domain not valid")
#print("Domain not valid")
return False
except Exception as e:
print(f"Error: {e}")
return False
def IsUrlValid(url:str)->bool:
"""
Check if url is valid for both darknet and clearnet.
@@ -475,34 +527,57 @@ def IsUrlValid(url:str)->bool:
# if OK return True
#if not : return False
pattern = re.compile("^[A-Za-z0-9:/.]+$")
url = str(url)
if url.endswith('.onion'):
return IsOnionValid(url)
else:
if '.' not in url:
print("No (DOT) in clearnet url")
#print("No (DOT) in clearnet url")
return False
if pattern.fullmatch(url) is None:
print('Url contains invalid chars')
#print('Url contains invalid chars')
return False
return True
def IsStatusValid(status: str)-> bool:
"""
Checks if status contains only [v,w]. Verbose only if False is returned
Checks if status contains only [y,n,✔️,❌]. Verbose only if False is returned
"""
# check if the status is one of [y,n,✔️,❌] with at most 4 chars
# if OK return True
#if not : return False
pattern = ['y','n']
if len(status) != 1:
print("Got more than one character or nothing.")
pattern = ['y','n','✔️','❌','','nan']
status = str(status).strip()
#print('[+] STATUS = ',status.splitlines())
if len(status) > 4:
#print("Status: Got more than one character or nothing.")
return False
elif (status not in pattern):
print("Got an invalid character it must be either y or n")
#print("Status: Got an invalid character it must be either y, n, ✔️, or ❌ ")
return False
return True
def IsScoreValid(score:str)->bool:
"""
Check that the Score matches "^[0-9.,]+$" with at most 8 chars.
"""
# check if the characters are only [0-9.,] with maximum 8 chars
# (careful with the ',' character: make sure it doesn't break the csv)
# if OK return True
#if not : return False
pattern = re.compile("^[0-9.,]+$")
score = str(score).strip()
if pattern.fullmatch(score) is None:
# the score must contain only digits, dots and commas
return False
elif len(score) > 8:
#print("score is greater than 8 chars")
return False
# empty score is fine
return True
def IsDescriptionValid(desc:str)->bool:
"""
Check that the description contains only [A-Za-z0-9-.,' ] with at most 256 chars.
@@ -512,12 +587,15 @@ def IsDescriptionValid(desc:str)->bool:
# if OK return True
#if not : return False
pattern = re.compile("^[A-Za-z0-9-.,' ]+$")
desc = str(desc).strip()
# empty description is fine as it's optional
if pattern.fullmatch(desc) is None:
return False
if desc == "DEFAULT":
return False
elif len(desc) > 256:
print("desc is greater than 256 chars")
#print("desc is greater than 256 chars")
return False
return True
@@ -536,7 +614,8 @@ def IsCategoryValid(categories: list)-> bool:
#print('Got an empty list or invalid chars')
return False
elif len(category) > 64:
print('Category is too long')
#print('Category is too long')
return False
else:
return True
@@ -554,12 +633,13 @@ def IsNameValid(name: str)->bool:
#print("Got an invalid character or nothing")
return False
elif len(name) > 64:
print(f'Got a name lenght greater than 64. {len(name)}')
#print(f'Got a name length greater than 64. {len(name)}')
return False
return True
if __name__ == '__main__':
main()
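For a quick sense of what the validators above accept, a few illustrative checks derived from reading the code as written (assuming they run inside this module; NaN and empty inputs may behave differently):

# illustrative checks only, based on the validator implementations shown above
assert IsUrlValid('example.com')            # clearnet URL with allowed characters
assert not IsUrlValid('bad url.com')        # space is not an allowed character
assert IsStatusValid('y')                   # 'y' is in the accepted pattern list
assert not IsStatusValid('maybe')           # longer than 4 characters
assert IsScoreValid('99.5')                 # digits, dots and commas only
assert not IsScoreValid('high')             # letters are rejected
assert not IsDescriptionValid('DEFAULT')    # the DEFAULT placeholder is rejected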