import csv
import glob
import json
import re

import pandas as pd

# Patterns are compiled once at import time; every validator uses fullmatch().
_ONION_HOST_RE = re.compile(r"^[A-Za-z0-9.]+(\.onion)?$")
_CLEARNET_URL_RE = re.compile(r"^[A-Za-z0-9:/.]+$")
_SCORE_RE = re.compile(r"^[0-9.,]+$")
_DESCRIPTION_RE = re.compile(r"^[A-Za-z0-9-.,' ]+$")
_PLAIN_TEXT_RE = re.compile(r"^[A-Za-z0-9 ]+$")

# Values accepted by IsStatusValid; 'nan' is what str() yields for an empty
# CSV cell loaded by pandas.
_ALLOWED_STATUS = ('y', 'n', '✔️', '❌', '', 'nan')


def _load_words(csvfile, column):
    """Return the non-empty entries of *column* in *csvfile* as strings."""
    values = pd.read_csv(csvfile)[column].dropna()
    # str() so numeric-looking words can still be used with the `in` operator;
    # blank words are skipped because they would match every cell.
    return [word for word in (str(v) for v in values) if word.strip()]


def main():
    """
    Sanitise unverified.csv in place.

    For every row of unverified.csv:
      1. mark the row for deletion when any field fails its validator,
      2. mark the row for deletion when any cell contains a word listed in
         the 'blacklisted-words' column of blacklist.csv,
      3. set the 'Sensitive' column to '✔️' when any cell contains a word
         listed in the 'sensitive-words' column of sensitive.csv.
    Marked rows are then dropped and the file is written back.

    Returns True once unverified.csv has been rewritten.
    """
    unverifiedcsvfile = "unverified.csv"
    blcsvfile = "blacklist.csv"
    secsvfile = "sensitive.csv"

    uvdf = pd.read_csv(unverifiedcsvfile)
    blacklist = _load_words(blcsvfile, 'blacklisted-words')
    sensitive = _load_words(secsvfile, 'sensitive-words')

    # An all-empty 'Sensitive' column is loaded as float64; cast to object so
    # the '✔️' string can be stored in it.
    uvdf['Sensitive'] = uvdf['Sensitive'].astype(object)

    # (validator, column) pairs, evaluated in this order for each row.
    checks = (
        (IsUrlValid, 'Instance'),
        (IsCategoryValid, 'Category'),
        (IsNameValid, 'Name'),
        (IsUrlValid, 'URL'),
        (IsStatusValid, 'Sensitive'),
        (IsDescriptionValid, 'Description'),
        (IsStatusValid, 'Status'),
        (IsScoreValid, 'Score'),
    )

    rows2delete = []
    for i in uvdf.index:
        cells = [str(x) for x in uvdf.loc[i, :].values.tolist()]
        marked = False

        ### SANITY CHECK 1: rows with incorrectly formatted fields ###
        # Truthiness (not `is False`) so that any non-True result rejects the row.
        if not all(check(uvdf.at[i, column]) for check, column in checks):
            print("Marking row", i, "for deletion, as it has invalid inputs")
            marked = True

        ### SANITY CHECK 2: rows containing a blacklisted word ###
        if not marked and any(word in cell for word in blacklist for cell in cells):
            print("Marking row", i, "for deletion, as it matches with a blacklisted word")
            marked = True

        if marked:
            rows2delete.append(i)

        ### SANITY CHECK 3: rows containing a sensitive word ###
        if (any(word in cell for word in sensitive for cell in cells)
                and uvdf.at[i, 'Sensitive'] != '✔️'):
            print("Marking row", i, "as sensitive, as it matches with a sensitive word")
            uvdf.at[i, 'Sensitive'] = '✔️'

    print('[-] Rows to delete: ', rows2delete)
    for i in rows2delete:
        print('[+] REMOVING ROW :', i, uvdf.loc[i, :].values.tolist())
    uvdf.drop(index=rows2delete, inplace=True)

    uvdf.to_csv(unverifiedcsvfile, index=False)
    return True


#### PROTECTIONS AGAINST MALICIOUS CSV INPUTS ####

def IsOnionValid(url: str) -> bool:
    """
    Check whether *url* is an acceptable onion address.

    Accepts either a bare host or an 'http://' URL (the host part is then
    validated); a single trailing '/' is ignored. The host must contain only
    [A-Za-z0-9.], end with '.onion', have at most one subdomain
    ("subdomain.url.onion") and be at least 62 characters long.

    Always returns a bool; non-string input returns False.
    """
    if not isinstance(url, str):
        return False

    host = url.strip().removesuffix('/')
    if host.startswith('http://'):
        # 'http://host/...'.split('/') -> ['http:', '', 'host', ...]
        host = host.split('/')[2]

    if _ONION_HOST_RE.fullmatch(host) is None:
        return False
    if not host.endswith('.onion'):
        # The character class above also matches hosts without the suffix.
        return False
    if len(host.split('.')) > 3:
        # More than one subdomain.
        return False
    return len(host) >= 62


def IsUrlValid(url: str) -> bool:
    """
    Check whether *url* is an acceptable darknet or clearnet URL.

    Values ending in '.onion' (ignoring surrounding whitespace and one
    trailing '/') are delegated to IsOnionValid. Anything else must contain a
    '.' and consist only of [A-Za-z0-9:/.].

    Non-string input (e.g. the NaN pandas yields for an empty cell) returns
    False.
    """
    if not isinstance(url, str):
        return False

    # Same normalisation IsOnionValid applies, so 'http://x.onion/' cannot
    # bypass the onion checks through the clearnet branch.
    if url.strip().removesuffix('/').endswith('.onion'):
        return IsOnionValid(url)

    return '.' in url and _CLEARNET_URL_RE.fullmatch(url) is not None


def IsStatusValid(status: str) -> bool:
    """
    Check that *status*, converted to str and stripped, is one of
    'y', 'n', '✔️', '❌', '' or 'nan'.
    """
    return str(status).strip() in _ALLOWED_STATUS


def IsScoreValid(score: str) -> bool:
    """
    Check that *score*, converted to str and stripped, matches "^[0-9.,]+$"
    and is at most 8 characters long.
    """
    # NOTE(review): the original comments said an empty score is fine, but the
    # pattern has always rejected '' and 'nan'; that behaviour is kept - confirm.
    score = str(score).strip()
    if _SCORE_RE.fullmatch(score) is None:
        return False
    return len(score) <= 8


def IsDescriptionValid(desc: str) -> bool:
    """
    Check that *desc*, once stripped, contains only [A-Za-z0-9-.,' ], is at
    most 256 characters long and is not the placeholder "DEFAULT".

    Empty and non-string input returns False.
    """
    if not isinstance(desc, str):
        return False

    desc = desc.strip()
    if desc == "DEFAULT":
        return False
    if len(desc) > 256:
        return False
    return _DESCRIPTION_RE.fullmatch(desc) is not None


def IsCategoryValid(categories: list) -> bool:
    """
    Check that every category contains only [A-Za-z0-9 ] and is at most 64
    characters long once stripped.

    *categories* may be an iterable of strings or a single string (treated as
    one category). Empty input, non-iterable input and non-string items
    return False.
    """
    if isinstance(categories, str):
        # A bare string is one category, not a sequence of characters.
        items = [categories]
    else:
        try:
            items = list(categories)
        except TypeError:
            return False

    if not items:
        return False

    for category in items:
        if not isinstance(category, str):
            return False
        category = category.strip()
        if _PLAIN_TEXT_RE.fullmatch(category) is None or len(category) > 64:
            return False
    return True


def IsNameValid(name: str) -> bool:
    """
    Check that *name*, once stripped, contains only [A-Za-z0-9 ] and is at
    most 64 characters long.

    Empty and non-string input returns False.
    """
    if not isinstance(name, str):
        return False

    name = name.strip()
    if _PLAIN_TEXT_RE.fullmatch(name) is None:
        return False
    return len(name) <= 64


if __name__ == '__main__':
    main()