WIP still, but good progress on option 4

root 2025-01-09 22:12:19 +01:00
parent d891df79a2
commit 9d81e7a779
10 changed files with 420 additions and 107 deletions


@@ -29,6 +29,8 @@ def main():
templatepath=rootpath+'templates/'
verifiedcsvfile=instancepath+'/verified.csv'
unverifiedcsvfile=instancepath+'/unverified.csv'
blcsvfile=instancepath+'/blacklist.csv'
secsvfile=instancepath+'/sensitive.csv'
# check if instancepath exists, if not then create the directory
if not os.path.exists(instancepath):
os.makedirs(instancepath)
@@ -43,6 +45,8 @@ def main():
# now that they exist, get vdf and uvdf
vdf = pd.read_csv(verifiedcsvfile)
uvdf = pd.read_csv(unverifiedcsvfile)
bldf = pd.read_csv(blcsvfile)
sedf = pd.read_csv(secsvfile)
print("[+] file exists, your Webring URL is", instance)
isitvalid = "y"
else:
@@ -226,8 +230,8 @@ Managing Wordlists:
webring_participant_url = input("What is the onion domain of the new webring participant? (ex: uptime.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion) ")
# check if the directory exists locally or not,
participantdir=rootpath+'www/participants/'+webring_participant_url
#if not os.path.isdir(participantdir): # to test on your own instance
if os.path.isdir(participantdir):
if not os.path.isdir(participantdir): # to test on your own instance
#if os.path.isdir(participantdir):
# if it does, it means that the webring is ALREADY added
print("[-] Webring Participant is already listed, skipping.")
return False
@@ -298,15 +302,63 @@ Managing Wordlists:
print("[+] file written, let's read it")
f = open(csvfilepath,"r")
print(f.read())
# TODO and remove all of the invalid entries !!!
#######################################################################
#newrow=[instance,category,name,url,sensi,desc,'','']
#print("[+] NEWROW=",newrow)
#uvdf.loc[-1] = newrow # adding a row
#uvdf.index = uvdf.index + 1 # shifting index
#uvdf = uvdf.sort_index() # sorting by index
#print("[+] New row added! now writing the csv file:")
#uvdf.to_csv(unverifiedcsvfile, index=False)
########### PERFORM SANITY CHECKS ON the webring participant's verified.csv and unverified.csv ##################
for w in ['verified.csv','unverified.csv']:
csvfilepath=participantdir+'/'+w
csvdf = pd.read_csv(csvfilepath)
#print(bldf[['blacklisted-words']])
bldf[['blacklisted-words']].iterrows()
rows2delete= [] # it is an empty list at first
for i,j in csvdf.iterrows():
#print("[+] Unverified.csv ROW=",i, uvdf.at[i, 'Instance'], uvdf.at[i, 'Category'], uvdf.at[i, 'Name'], uvdf.at[i, 'URL'], uvdf.at[i, 'Description'])
#print("[+] Unverified.csv ROW=",i, uvdf.iloc[[i]])
#row=uvdf.iloc[[i]] #it displays the index
row=csvdf.loc[i,:].values.tolist()
#print(i,row)
### SANITY CHECK 1: Mark all the rows that have incorrect formatting for deletion###
#print("[+] ROW=",i,"ROW CONTENTS=", IsUrlValid(uvdf.at[i, 'Instance']), IsCategoryValid(uvdf.at[i, 'Category']), IsNameValid(uvdf.at[i, 'Name']), IsUrlValid(uvdf.at[i, 'URL']), IsStatusValid(uvdf.at[i, 'Sensitive']), IsDescriptionValid(uvdf.at[i, 'Description']), IsStatusValid(uvdf.at[i, 'Status']), IsScoreValid(uvdf.at[i, 'Score']))
if IsUrlValid(csvdf.at[i, 'Instance']) is False or IsCategoryValid(csvdf.at[i, 'Category']) is False or IsNameValid(csvdf.at[i, 'Name']) is False or IsUrlValid(csvdf.at[i, 'URL']) is False or IsStatusValid(csvdf.at[i, 'Sensitive']) is False or IsDescriptionValid(csvdf.at[i, 'Description']) is False or IsStatusValid(csvdf.at[i, 'Status']) is False or IsScoreValid(csvdf.at[i, 'Score']) is False:
#mark the row for deletion as it has invalid inputs
if i not in rows2delete:
print("Marking row", i,"for deletion, as it has invalid inputs")
rows2delete.append(i) #mark the row for deletion if not already done
### SANITY CHECK 2: Mark all rows that are not allowed (blacklist) for deletion ###
for k,l in bldf.iterrows():
#print("[+] Blacklisted word=",k, bldf.at[k, 'blacklisted-words'])
blword=bldf.at[k, 'blacklisted-words']
if any(blword in str(x) for x in row):
#print("found blacklisted word! marking row for deletion")
if i not in rows2delete:
print("Marking row", i,"for deletion, as it matches with a blacklisted word")
rows2delete.append(i) #mark the row for deletion if not already done
### SANITY CHECK 3: Mark all the rows that are supposed to be sensitive ###
for k,l in sedf.iterrows():
#print("[+] Sensitive word=",k, sedf.at[k, 'sensitive-words'])
seword=sedf.at[k, 'sensitive-words']
if any(seword in str(x) for x in row):
if csvdf.at[i, 'Sensitive'] != '✔️':
print("Marking row", i,"as sensitive, as it matches with a sensitive word")
csvdf.at[i, 'Sensitive']='✔️'
print('[-] Rows to delete: ',rows2delete)
# TODO : MAKE SURE IT WORKS IN PROD
for i in rows2delete:
row=csvdf.loc[i,:].values.tolist()
print('[+] REMOVING ROW :',i,row)
csvdf.drop(i, inplace= True)
csvdf.to_csv(csvfilepath, index=False)
##############################################
case "5":
@@ -375,6 +427,8 @@ Managing Wordlists:
# TODO print("do you want to 1) add words or 2) remove words ?")
# TODO display the contents of blacklist.csv file
# TODO CASE 10 : cleanup all duplicates in unverified + verified.csv, based on the url (check if each url appears more than once, and if they do, remove them + write to csv file)
case _:
print("[-] Exiting")
return True
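The case 10 TODO above describes removing duplicate URLs from verified.csv and unverified.csv. A minimal sketch of that cleanup with pandas, assuming the URL column and the verifiedcsvfile/unverifiedcsvfile paths defined earlier in main():

import pandas as pd

# hypothetical duplicate cleanup for the case 10 TODO:
# keep the first occurrence of each URL, drop the rest, then write the file back
for csvfile in [verifiedcsvfile, unverifiedcsvfile]:
    df = pd.read_csv(csvfile)
    df = df.drop_duplicates(subset='URL', keep='first')
    df.to_csv(csvfile, index=False)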
@@ -420,26 +474,26 @@ def IsOnionValid(url: str)-> bool:
pattern = re.compile("^[A-Za-z0-9.]+(\.onion)?$")
url = url.strip().removesuffix('/')
if url.startswith('http://'):
print('URL starts with http')
#print('URL starts with http')
# Removes the http://
domain = url.split('/')[2]
if pattern.fullmatch(domain) is not None:
if len(domain.split('.')) > 3:
n_subdomains = len(domain.split('.'))
# Checks if there is more than 1 subdomain. "subdomain.url.onion" only
print(f"This domain has more than one subdomain. There are {n_subdomains} subdomains")
#print(f"This domain has more than one subdomain. There are {n_subdomains} subdomains")
return False
else:
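# note: a v3 onion address is 56 base32 characters plus ".onion", i.e. 62 characters in total, hence the length check below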
if len(domain) < 62:
print("Domain length is less than 62.")
#print("Domain length is less than 62.")
return False
return True
elif pattern.fullmatch(domain) is None:
print("Domain contains invalid character.")
print(domain)
#print("Domain contains invalid character.")
#print(domain)
return False
else:
print("Domain not valid")
#print("Domain not valid")
return False
else:
#print("URL doesn't start http")
@@ -447,25 +501,23 @@ def IsOnionValid(url: str)-> bool:
if len(url.split('.')) > 3:
n_subdomains = len(url.split('.'))
# Checks if there is more than 1 subdomain. "subdomain.url.onion" only
print(f"This domain has more than one subdomain. There are {n_subdomains - 1} subdomains")
#print(f"This domain has more than one subdomain. There are {n_subdomains - 1} subdomains")
return False
else:
if len(url) < 62:
print("Domain length is less than 62.")
#print("Domain length is less than 62.")
return False
return True
elif pattern.fullmatch(url) is None:
print("Domain contains invalid character.")
print(url)
#print("Domain contains invalid character.")
#print(url)
return False
else:
print("Domain not valid")
#print("Domain not valid")
return False
except Exception as e:
print(f"Error: {e}")
return False
def IsUrlValid(url:str)->bool:
"""
Check if url is valid for both darknet and clearnet.
@@ -475,34 +527,57 @@ def IsUrlValid(url:str)->bool:
# if OK return True
#if not : return False
pattern = re.compile("^[A-Za-z0-9:/.]+$")
url = str(url)
if url.endswith('.onion'):
return IsOnionValid(url)
else:
if '.' not in url:
print("No (DOT) in clearnet url")
#print("No (DOT) in clearnet url")
return False
if pattern.fullmatch(url) is None:
print('Url contains invalid chars')
#print('Url contains invalid chars')
return False
return True
def IsStatusValid(status: str)-> bool:
"""
Checks if status contains only [v,w]. Verbose only if False is returned
Checks if status contains only [y,n,✔️,❌]. Verbose only if False is returned
"""
# check if the status is one of [y,n,✔️,❌] with at most 4 chars
# if OK return True
#if not : return False
pattern = ['y','n']
if len(status) != 1:
print("Got more than one character or nothing.")
pattern = ['y','n','✔️','❌','','nan']
status = str(status).strip()
#print('[+] STATUS = ',status.splitlines())
if len(status) > 4:
#print("Status: Got more than one character or nothing.")
return False
elif (status not in pattern):
print("Got an invalid character it must be either y or n")
#print("Status: Got an invalid character it must be either y, n, ✔️, or ❌ ")
return False
return True
def IsScoreValid(score:str)->bool:
"""
Check that the Score matches "^[0-9.,]+$" with at most 8 chars.
"""
# check if the characters are only [0-9.,] with maximum 8 chars
# (careful with the ',' character: make sure it doesn't break the csv)
# if OK return True
#if not : return False
pattern = re.compile("^[0-9.,]+$")
score = str(score).strip()
if pattern.fullmatch(score) is None:
# the score must contain only digits, dots and commas
return False
elif len(score) > 8:
#print("score is greater than 8 chars")
return False
# empty score is fine
return True
def IsDescriptionValid(desc:str)->bool:
"""
Check that the description contains only [A-Za-z0-9-.,' ] with at most 256 chars.
@@ -512,12 +587,15 @@ def IsDescriptionValid(desc:str)->bool:
# if OK return True
#if not : return False
pattern = re.compile("^[A-Za-z0-9-.,' ]+$")
desc = str(desc).strip()
# empty description is fine as it's optional
if pattern.fullmatch(desc) is None:
return False
if desc == "DEFAULT":
return False
elif len(desc) > 256:
print("desc is greater than 256 chars")
#print("desc is greater than 256 chars")
return False
return True
@@ -536,7 +614,8 @@ def IsCategoryValid(categories: list)-> bool:
#print('Got an empty list or invalid chars')
return False
elif len(category) > 64:
print('Category is too long')
#print('Category is too long')
return False
else:
return True
@@ -554,12 +633,13 @@ def IsNameValid(name: str)->bool:
#print("Got an invalid character or nothing")
return False
elif len(name) > 64:
print(f'Got a name lenght greater than 64. {len(name)}')
#print(f'Got a name length greater than 64. {len(name)}')
return False
return True
if __name__ == '__main__':
main()
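For a quick sense of what the validators above accept, a few illustrative checks derived from reading the code as written (assuming they run inside this module; NaN and empty inputs may behave differently):

# illustrative checks only, based on the validator implementations shown above
assert IsUrlValid('example.com')            # clearnet URL with allowed characters
assert not IsUrlValid('bad url.com')        # space is not an allowed character
assert IsStatusValid('y')                   # 'y' is in the accepted pattern list
assert not IsStatusValid('maybe')           # longer than 4 characters
assert IsScoreValid('99.5')                 # digits, dots and commas only
assert not IsScoreValid('high')             # letters are rejected
assert not IsDescriptionValid('DEFAULT')    # the DEFAULT placeholder is rejected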