fixed the synchronization feature

root 2025-01-12 18:14:21 +01:00
parent dc5a91c561
commit 49e39f481c
3 changed files with 162 additions and 90 deletions


@@ -259,105 +259,169 @@ Maintenance:
for participant in os.listdir(participantsdir):
    participantdir=participantsdir+participant
    # NOTE: check if the webring participant is yourself; if it is, skip it
    if participant != myinstance: # prod: don't use your own instance
    #if participant == myinstance: # preprod: testing only on your own instance
        # overwrite the existing files in the participant's directory with their version (download all their csv files again)
        basewurl='http://'+participant+'/participants/'+participant+'/'
        print(basewurl)
        print('[+] Downloading the files of',participant,':')
        w_vcsv=basewurl+'verified.csv'
        w_uvcsv=basewurl+'unverified.csv'
        w_blcsv=basewurl+'blacklist.csv'
        w_scsv=basewurl+'sensitive.csv'
        w_webcsv=basewurl+'webring-participants.csv'
        # verify that the participant's csv files exist at basewurl before proceeding
        if CheckUrl(w_vcsv) is False or CheckUrl(w_uvcsv) is False or CheckUrl(w_blcsv) is False or CheckUrl(w_scsv) is False or CheckUrl(w_webcsv) is False:
            print("[-] Webring Participant isn't reachable, skipping")
            #return False # don't do anything if the webring participant isn't reachable
        else: # the webring participant is reachable, proceed
            print("[+] Webring Participant is reachable, updating their csv files:")
            for i in ['verified.csv','unverified.csv','blacklist.csv','sensitive.csv','webring-participants.csv']:
                # for each csv file: URL is basewurl+i, destination path is participantdir/i
                # download the external csv file and save it into the destination file:
                response = requests.get(basewurl+i, proxies=proxies)
                text = response.text
                csvfilepath=participantdir+'/'+i
                with open(csvfilepath, "w") as file:
                    file.write(text)
            # download the banner.png image:
            bannerurl=basewurl+'banner.png'
            bannerpath=participantdir+'/banner.png'
            r = requests.get(bannerurl, stream=True, proxies=proxies)
            with open(bannerpath, 'wb') as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
            # SANITY CHECK ON THE BANNER PNG IMAGE:
            if not IsBannerValid(bannerpath):
                # if invalid, replace it with the default template banner png file
                os.remove(bannerpath)
                bannertemplatepath=templatepath+'banner.png'
                shutil.copyfile(bannertemplatepath, bannerpath)
#print("[+] Webring Participant is valid, adding it if it's not already added.")
#print('[+] PARTICIPANT=',participant)
# check if the participant is already listed in webring-participants.csv or not, and add them if not already listed
# and display only the matching entries in unverified.csv in an array format (display it in CLI).
filter_wdf = wdf[wdf.URL.str.contains(participant)]
#print(filter_wdf[['Name','URL']])
# check if there are no results, dont proceed if there are none!
if filter_wdf.size == 0: #skip if webring participant is already listed, otherwise proceed
newrow=[name,participant,desc,trusted,status,score]
#print("[+] NEWROW=",newrow)
wdf.loc[-1] = newrow # adding a row
wdf.index = wdf.index + 1 # shifting index
wdf = wdf.sort_index() # sorting by index
#print("[+] New row added! now writing the csv file:",webringcsvfile)
wdf.to_csv(webringcsvfile, index=False)
else:
pass
#print('[+] Webring participant is already listed in your own webring-participants.csv file!')
            # iterate through the participant's verified.csv and unverified.csv files
            for w in ['verified.csv','unverified.csv']:
                csvfilepath=participantdir+'/'+w
                print(csvfilepath)
                csvdf = pd.read_csv(csvfilepath)
                rows2delete= [] # indexes of rows to drop once the whole file has been checked
                for i,j in csvdf.iterrows():
                    row=csvdf.loc[i,:].values.tolist()
                    print(row)
                    ################################ SANITY CHECKS ####################################
                    ### SANITY CHECK 1: mark rows that have invalid formatting for deletion ###
                    if IsUrlValid(csvdf.at[i, 'Instance']) is False or IsCategoryValid(csvdf.at[i, 'Category']) is False or IsNameValid(csvdf.at[i, 'Name']) is False or IsUrlValid(csvdf.at[i, 'URL']) is False or IsStatusValid(csvdf.at[i, 'Sensitive']) is False or IsDescriptionValid(csvdf.at[i, 'Description']) is False or IsStatusValid(csvdf.at[i, 'Status']) is False or IsScoreValid(csvdf.at[i, 'Score']) is False:
                        # mark the row for deletion as it has invalid inputs
                        if i not in rows2delete:
                            print("Marking row", i,"for deletion, as it has invalid inputs")
                            rows2delete.append(i) # mark the row for deletion if not already done
                    ### SANITY CHECK 2: mark rows that match a blacklisted word for deletion ###
                    for k,l in bldf.iterrows():
                        blword=bldf.at[k, 'blacklisted-words']
                        if any(blword in str(x) for x in row):
                            if i not in rows2delete:
                                print("Marking row", i,"for deletion, as it matches with a blacklisted word")
                                rows2delete.append(i) # mark the row for deletion if not already done
                        else:
                            # not a blacklisted link, therefore it is suitable to be added to your own csv files:
                            ################################ CHECKING FOR DUPLICATES! #########################
                            # for each link in the participant's verified/unverified csv files,
                            # check if the link is already listed in your own verified.csv or unverified.csv
                            filterterm=csvdf.at[i, 'URL']
                            filter_vdf= vdf[vdf.URL.str.contains(filterterm)]
                            filter_uvdf= uvdf[uvdf.URL.str.contains(filterterm)]
                            if len(filter_uvdf.index) == 0 and len(filter_vdf.index) == 0:
                                # the link doesn't exist in either of your verified/unverified csv files,
                                # so append it to your own unverified.csv file:
                                newrow=row
                                uvdf.loc[-1] = newrow # adding a row
                                uvdf.index = uvdf.index + 1 # shifting index
                                uvdf = uvdf.sort_index() # sorting by index
                                uvdf.to_csv(unverifiedcsvfile, index=False)
                                print("[+] New row added to your own unverified.csv file!")
                            else:
                                print('[-] Skipping row as it is already added in',w,row)
                    ### SANITY CHECK 3: mark all the rows that are supposed to be sensitive ###
                    for k,l in sedf.iterrows():
                        seword=sedf.at[k, 'sensitive-words']
                        if any(seword in str(x) for x in row):
                            if csvdf.at[i, 'Sensitive'] != '✔️':
                                print("Marking row", i,"as sensitive, as it matches with a sensitive word")
                                csvdf.at[i, 'Sensitive']='✔️'
                print('[-] Rows to delete: ',rows2delete)
                # only delete rows after having gone through all of the unverified.csv or verified.csv rows
                for i in rows2delete:
                    row=csvdf.loc[i,:].values.tolist()
                    print('[+] REMOVING ROW :',i,row)
                    csvdf.drop(i, inplace=True)
                csvdf.to_csv(csvfilepath, index=False)
                rows2delete= [] # reset the list for the next file
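The row insertion used twice above (for wdf and for uvdf) relies on a pandas idiom for prepending a row: write to temporary index -1, shift every index up by one, then sort. A minimal, self-contained sketch of just that idiom; the two-column frame and the values are illustrative stand-ins, not the script's real webring-participants.csv schema:

import pandas as pd

wdf = pd.DataFrame({'Name': ['alice'], 'URL': ['alice.onion']})  # toy stand-in
wdf.loc[-1] = ['bob', 'bob.onion']  # new row lands at temporary index -1
wdf.index = wdf.index + 1           # shift: -1 becomes 0, the old 0 becomes 1
wdf = wdf.sort_index()              # ascending order again; the new row is first
print(wdf)                          # bob is listed before alice

Unlike a plain append, this sequence puts each newly discovered entry at the top of the csv when the frame is written back out.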
@@ -682,18 +746,18 @@ def CheckUrl(url):
    }
    try:
        status = requests.get(url,proxies=proxies, timeout=5).status_code
        #print('[+]',url,status)
        if status != 502:
            #print(url,"✔️")
            return True
        else:
            #print(url,"❌")
            return False
    except requests.ConnectionError as e:
        #print(url,"❌")
        return False
    except requests.exceptions.ReadTimeout as e:
        #print(url,"❌")
        return False
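CheckUrl only reports an endpoint as unreachable on an HTTP 502, a connection error, or a read timeout; any other status counts as reachable. As a sketch of how the caller in the first hunk drives it, the chained `is False or` test can be read as an all() over the five csv endpoints; example.onion below is a placeholder, not a real participant:

basewurl = 'http://example.onion/participants/example.onion/'  # placeholder
csvnames = ['verified.csv','unverified.csv','blacklist.csv','sensitive.csv','webring-participants.csv']
# sync a participant only if every one of its csv endpoints answers:
if all(CheckUrl(basewurl + name) for name in csvnames):
    print('[+] Webring Participant is reachable, updating their csv files:')
else:
    print("[-] Webring Participant isn't reachable, skipping")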
#### PROTECTIONS AGAINST MALICIOUS CSV INPUTS ####
@@ -825,6 +889,10 @@ def IsScoreValid(score:str)->bool:
    pattern = re.compile("^[0-9.,]+$")
    score = str(score)
    score = score.strip()
    if score in ['','nan']:
        # Score can be empty when initially added
        return True
    if pattern.fullmatch(score) is None:
        # reject scores containing anything other than digits, dots, or commas
        return False
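With the new guard, a score that is still empty, or that pandas read back as the string 'nan', passes validation instead of being rejected by the regex. A few illustrative calls, limited to the branches visible in this hunk:

assert IsScoreValid('') is True      # entry was just added, no score assigned yet
assert IsScoreValid('nan') is True   # a pandas NaN stringified by str(score)
assert IsScoreValid('abc') is False  # letters fail the ^[0-9.,]+$ pattern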