Finished options 9 and 10; only options 7 and 8 remain

2025-06-30 22:36:40 +00:00 · 2025-01-16 08:52:42 +01:00 · 2025-01-16 08:52:42 +01:00 · c863f71951
commit c863f71951
parent 1092755d26
4 changed files with 66 additions and 109 deletions
--- a/README.md
+++ b/README.md
@ -6,15 +6,14 @@ DONE:
 -py : option 6) Trust/Untrust/Blacklist a webring participant
 -php : make a search engine prompt that only accepts [a-zA-Z.://], it must refuse every other character 
 -py : fix uptimecheck.py to match the new csv format 
-
-DOING:
 -php : if valid make it filter your own verified.csv and unverified.csv files
+-py : option 9)  cleanup all duplicates in your own unverified.csv and verified.csv
+-py : option 10) perform sanity checks on all csv files (to mark them as sensitive or remove the ones that are blacklisted)
+
+TODO:
 -py : option 7) Add/Remove words in the sensitive list			(assigned to anon)
 -py : option 8) Add/Remove words in the blacklist				(assigned to anon)

-TODO:
-py : option 9)  cleanup all duplicates in your own unverified.csv and verified.csv
-py : option 10) perform sanity checks on all csv files (to mark them as sensitive or remove the ones that are blacklisted)

 ``` 

--- a/scripts/darknet_exploration.py
+++ b/scripts/darknet_exploration.py
@ -243,7 +243,7 @@ Maintenance:



-			#TODO check if it works when you have a second webring participant 
+			#check if it works when you have a second webring participant 
 			case "4":  
 				print("4) Synchronize new links from existing webring participants, into your unverified.csv file")
 				# iterate through each existing directories in www/participants/* to get each webring participant
@ -263,7 +263,7 @@ Maintenance:
 					# NOTE check if the webring participant is yourself, if it is, then skip it
 					if participant != myinstance: # prod: don't use your own instance
 					#if participant == myinstance: # preprod testing only on your own instance
-						#TODO overwrite the existing files in the participant's directory, with their version (download all the csv files from them again)
+						#overwrite the existing files in the participant's directory, with their version (download all the csv files from them again)
 						basewurl='http://'+participant+'/participants/'+participant+'/'
 						print(basewurl)
 						print('[+] Downloading the files of ',participant, ": ")
@ -779,7 +779,7 @@ Maintenance:
 								# TODO  if valid! remove word at index 
 								# TODO  if invalid! just pass to ask for another word
 			
-			# TODO CASE 9 : cleanup all duplicates in unverified + verified.csv, based on the url (check if each url appears more than once, and if they do, remove them + write to csv file)
+			# CASE 9 : cleanup all duplicates in unverified + verified.csv, based on the url (check if each url appears more than once, and if they do, remove them + write to csv file)
 			case "9":
 				print("[+] 9) Cleaning up all duplicates in your own unverified + verified.csv (based on the url)")
 									# ignore it if the index is "indextocheck" and if the index is already listed in rows2delete
@ -790,97 +790,68 @@ Maintenance:
 				for w in ['verified.csv','unverified.csv']:
 					#instancepath=rootpath+'www/participants/'+instance # fyi
 					csvfilepath=instancepath+'/'+w
+					
 					print(csvfilepath)
 					csvdf = pd.read_csv(csvfilepath)
-					rows2deletevdf= [] # it is an empty list at first
-					rows2deleteuvdf= [] # it is an empty list at first
-					# iterate through each row of the csv file
-					for i,j in csvdf.iterrows():
-						#print("[+] Unverified.csv ROW=",i, uvdf.at[i, 'Instance'], uvdf.at[i, 'Category'], uvdf.at[i, 'Name'], uvdf.at[i, 'URL'], uvdf.at[i, 'Description'])
-						#print("[+] Unverified.csv ROW=",i, uvdf.iloc[[i]])
-						#row=uvdf.iloc[[i]] #it displays the index
-						row=csvdf.loc[i,:].values.tolist()
-						print(row)
-
-						# for each link in the participant's verified/unverified csv files,
-						# check if the link is already listed in your own verified.csv or unverified.csv
-						filterterm=csvdf.at[i, 'URL']
-						filter_vdf= vdf[vdf.URL.str.contains(filterterm)]
-						filter_uvdf= uvdf[uvdf.URL.str.contains(filterterm)]
-						# for unverified.csv, and verified.csv	;
-						if len(filter_vdf.index) > 1:
-							# if link exists more than once in verified.csv, remove the duplicates in verified.csv 
-							for m,n in filter_vdf.iterrows():
-								if m != i and m not in rows2deletevdf:
-									rows2deletevdf.append(m) #mark the DUPLICATE ROWS for deletion, meaning the ones that are not i!!! if not already done
-										#TODO make sure it doesnt mark the previous i indexes for deletion (as it is a duplicate of the next row)
-						if len(filter_vdf.index) == 1:
-							# if link exists ONCE in verified.csv check that it doesnt in unverified.csv: 
-							if len(filter_uvdf.index) >= 1:
-								# if link exists ONCE in verified.csv AND in unverified.csv, cleanup the duplicates in unverified.csv - KO
-								for m,n in filter_uvdf.iterrows():
-									if m != i and m not in rows2deleteuvdf:
-										rows2deleteuvdf.append(m) #mark the DUPLICATE ROWS for deletion, meaning the ones that are not i!!! if not already done
-										#TODO make sure it doesnt mark the previous i indexes for deletion (as it is a duplicate of the next row)
-
-							#else:
-								# link only exists in verified.csv, and not in unverified.csv - OK
-						if len(filter_vdf.index) == 0:
-							# if link does not exist in verified.csv, check if it exists in unverified.csv:
-							if len(filter_uvdf.index) > 1:
-								# link exists more than once in unverified.csv, get rid of the duplicates in unverified.csv - KO
-								for m,n in filter_uvdf.iterrows():
-									if m != i and m not in rows2deleteuvdf:
-										rows2deleteuvdf.append(m) #mark the DUPLICATE ROWS for deletion, meaning the ones that are not i!!! if not already done
-										#TODO make sure it doesnt mark the previous i indexes for deletion (as it is a duplicate of the next row)
-							#else:
-								# link either exists once or doesnt exist in unverified.csv, therefore OK
-					#rows2deletevdf.sort()
-					#reverse it so that it starts removing the last elements first
-					#rows2deletevdf = rows2deletevdf[::-1]
-					print("ROWS TO DELETE IN VERIFIED.CSV:", rows2deletevdf)
-					if rows2deletevdf != []:
-						for p in rows2deletevdf:
-							row=vdf.loc[p,:].values.tolist()
-							print('[+] REMOVING ROW :',p,row)
-							vdf.drop(p, inplace= True)
-							vdf.to_csv(verifiedcsvfile, index=False)
-							rows2deletevdf= [] # it is an empty list at first
-
-
-					#rows2deleteuvdf.sort()
-					#reverse it so that it starts removing the last elements first
-					#rows2deleteuvdf = rows2deleteuvdf[::-1]
-					print("ROWS TO DELETE IN UNVERIFIED.CSV:", rows2deleteuvdf)
-					if rows2deleteuvdf != []:
-						for p in rows2deleteuvdf:
-							row=uvdf.loc[p,:].values.tolist()
-							print('[+] REMOVING ROW :',p,row)
-							uvdf.drop(p, inplace= True)
-							uvdf.to_csv(unverifiedcsvfile, index=False)
-							rows2deleteuvdf= [] # it is an empty list at first
-
-
-						
-
-
-
-						#if len(filter_uvdf.index) == 1 and len(filter_vdf.index) == 1:
-							# if link exists only ONCE in verified.csv or unverified.csv, then skip
-							# if link doesnt exist in either of your verified/unverified csv files, 
-							# then add it to your own unverified.csv file:
-						#	newrow=row
-						#	uvdf.loc[-1] = newrow  # adding a row
-						#	uvdf.index = uvdf.index + 1  # shifting index
-						#	uvdf = uvdf.sort_index()  # sorting by index
-						#	uvdf.to_csv(unverifiedcsvfile, index=False)
-						#	print("[+] ")
-						#else:
-						#	print('[-] Skipping row as it is already added in ',w,row,)
-
-
+					print("REMOVING DUPLICATES IN", csvfilepath)	
+					csvdf = csvdf.drop_duplicates(subset=['URL'])
+					csvdf.to_csv(csvfilepath, index=False)
+					print(csvdf[['URL']])
 			case "10":
 				print("[+] 10) perform sanity checks on all csv files (to mark them as sensitive / or remove the ones that are blacklisted)")
+				participantspath = rootpath+'www/participants/' 
+				for participant in os.listdir(participantspath):
+					print("Participant:",participant)
+					participantdir= participantspath+participant
+					a=0	
+					if a == 0:
+						if a== 0:
+							################ BEGIN SANITY CHECKS FOR EACH PARTICIPANTS ##############
+							# iterate through the participant's verified.csv and unverified.csv files
+							for w in ['verified.csv','unverified.csv']:
+								csvfilepath=participantdir+'/'+w
+								print(csvfilepath)
+								csvdf = pd.read_csv(csvfilepath)
+								#print(bldf[['blacklisted-words']])
+								bldf[['blacklisted-words']].iterrows()
+								rows2delete= [] # it is an empty list at first
+								for i,j in csvdf.iterrows():
+									#print("[+] Unverified.csv ROW=",i, uvdf.at[i, 'Instance'], uvdf.at[i, 'Category'], uvdf.at[i, 'Name'], uvdf.at[i, 'URL'], uvdf.at[i, 'Description'])
+									#print("[+] Unverified.csv ROW=",i, uvdf.iloc[[i]])
+									#row=uvdf.iloc[[i]] #it displays the index
+									row=csvdf.loc[i,:].values.tolist()
+									print(row)
+									#print(i,row)
+
+
+
+									################################ SANITY CHECKS ####################################	
+									### SANITY CHECK 1: Mark all the rows that have incorrect formatting for deletion###
+									#print("[+] ROW=",i,"ROW CONTENTS=",  IsUrlValid(uvdf.at[i, 'Instance']),  IsCategoryValid(uvdf.at[i, 'Category']),  IsNameValid(uvdf.at[i, 'Name']), IsUrlValid(uvdf.at[i, 'URL']), IsStatusValid(uvdf.at[i, 'Sensitive']),  IsDescriptionValid(uvdf.at[i, 'Description']),  IsStatusValid(uvdf.at[i, 'Status']),  IsScoreValid(uvdf.at[i, 'Score']))
+									if IsUrlValid(csvdf.at[i, 'Instance']) is False or IsCategoryValid(csvdf.at[i, 'Category']) is False or IsNameValid(csvdf.at[i, 'Name']) is False or IsUrlValid(csvdf.at[i, 'URL']) is False or IsStatusValid(csvdf.at[i, 'Sensitive']) is False or  IsDescriptionValid(csvdf.at[i, 'Description']) is False or  IsStatusValid(csvdf.at[i, 'Status']) is False or  IsScoreValid(csvdf.at[i, 'Score']) is False:
+										#mark the row for deletion as it has invalid inputs
+										if i not in rows2delete:
+											print("Marking row", i,"for deletion, as it has invalid inputs")
+											rows2delete.append(i) #mark the row for deletion if not already done
+	
+									### SANITY CHECK 2: Mark all rows that are not allowed (blacklist) for deletion ###
+									for k,l in bldf.iterrows():
+										#print("[+] Blacklisted word=",k,  bldf.at[k, 'blacklisted-words'])
+										blword=bldf.at[k, 'blacklisted-words']
+										if any(blword in str(x) for x in row) == True:
+											#print("found blacklisted word! marking row for deletion")
+											if i not in rows2delete:
+												print("Marking row", i,"for deletion, as it matches with a blacklisted word")
+												rows2delete.append(i) #mark the row for deletion if not already done
+	
+								for i in rows2delete:
+									row=csvdf.loc[i,:].values.tolist()
+									print('[+] REMOVING ROW :',i,row)
+									csvdf.drop(i, inplace= True)
+									csvdf.to_csv(csvfilepath, index=False)
+
+
+
 				# TODO find the list of all csv files (in www/participants/*/*.csv) (templates should remain empty by default)
 					# copy what was done in option 4, to :
 						# delete the ones that have invalid entries
--- a/www/header.php
+++ b/www/header.php
@ -36,7 +36,7 @@ if (($handle = fopen($csvfile, "r")) !== FALSE) {
 		if ($oldcatname != $data[1]){
 			echo "<td><tr></tr><tr></tr><tr></tr></td>"; // skip a row if new category
 			echo "<td><tr></tr><tr></tr><tr></tr></td>"; // skip a row if new category
-			echo  "<td><p><h4>" . $data[1] . '| </h4></p></td>'; // display the category as its the first row with this new category
+			echo  "<td><p><h4>" . $data[1] . ' </h4></p></td>'; // display the category as its the first row with this new category
 			$oldcatname=$data[1];
 		}else{
 			echo  "<td><p>" . ' </p></td>'; // category is already displayed so skip it (empty cell in category column)
--- a/www/participants/webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion/unverified.csv
+++ b/www/participants/webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion/unverified.csv
@ -1,18 +1,5 @@
 Instance,Category,Name,URL,Sensitive,Description,Status,Score
 uptime.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion,Infos and Links,Tor Taxi,http://tortaxi2dev6xjwbaydqzla77rrnth7yn2oqzjfmiuwn5h6vsk2a4syd.onion/,,List of links to go to popular darknet places,✔️,100.0
-uptime.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion,Infos and Links,Tor Taxi,http://tortaxi2dev6xjwbaydqzla77rrnth7yn2oqzjfmiuwn5h6vsk2a4syd.onion/,,List of links to go to popular darknet places,✔️,100.0
-uptime.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion,Infos and Links,Tor Taxi,http://tortaxi2dev6xjwbaydqzla77rrnth7yn2oqzjfmiuwn5h6vsk2a4syd.onion/,,List of links to go to popular darknet places,✔️,100.0
-uptime.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion,Infos and Links,Tor Taxi,http://tortaxi2dev6xjwbaydqzldruga77rrnth7yn2oqzjfmiuwn5h6vsk2a4syd.onion/,✔️,List of links to go to popular darknet places,❌,0.0
-uptime.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion,Infos and Links,Tor Taxi,http://tortaxi2dev6xjwbaydqzla77rrnth7yn2oqzjfmiuwn5h6vsk2a4syd.onion/,,List of links to go to popular darknet places,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Hackliberty,Hackliberty Gitea,http://vkp7367tcjpqdwwckigrdrvmwvispvbpg5rlsr2chjxvppfg7hipagyd.onion,,,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Hackliberty,Hackliberty Gitea,http://vkp7367tcjpqdwwckigrdrvmwvispvbpg5rlsr2chjxvppfg7hipagyd.onion,,,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Hackliberty,Hackliberty Gitea,http://vkp7367tcjpqdwwckigrdrvmwvispvbpg5rlsr2chjxvppfg7hipagyd.onion,,,✔️,100.0
 webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Hackliberty,Hackliberty Gitea,http://vkp7367tcjpqdwwckigrdrvmwvispvbpg5rlsr2chjxvppfg7hipagyd.onion,,,✔️,100.0
 webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Forums,Hackliberty Forum,http://yw7nc56v4nsudvwewhmhhwltxpncedfuc43qbubj4nmwhdhwtiu4o6yd.onion/,,,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Forums,Hackliberty Forum,http://yw7nc56v4nsudvwewhmhhwltxpncedfuc43qbubj4nmwhdhwtiu4o6yd.onion/,,,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Forums,Hackliberty Forum,http://yw7nc56v4nsudvwewhmhhwltxpncedfuc43qbubj4nmwhdhwtiu4o6yd.onion/,,,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Forums,Hackliberty Forum,http://yw7nc56v4nsudvwewhmhhwltxpncedfuc43qbubj4nmwhdhwtiu4o6yd.onion/,,,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Communities,Hackliberty main website,http://kj3wvs3wyfhm3uhhuqxlrhhcp6dneuau4mmvptlor27ghmrqx63fqnid.onion/,,,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Communities,Hackliberty main website,http://kj3wvs3wyfhm3uhhuqxlrhhcp6dneuau4mmvptlor27ghmrqx63fqnid.onion/,,,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Communities,Hackliberty main website,http://kj3wvs3wyfhm3uhhuqxlrhhcp6dneuau4mmvptlor27ghmrqx63fqnid.onion/,,,✔️,100.0
 webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Communities,Hackliberty main website,http://kj3wvs3wyfhm3uhhuqxlrhhcp6dneuau4mmvptlor27ghmrqx63fqnid.onion/,,,✔️,100.0