fixed the synchronization feature

root 2025-01-12 18:14:21 +01:00
parent dc5a91c561
commit 49e39f481c
3 changed files with 162 additions and 90 deletions


@@ -259,105 +259,169 @@ Maintenance:
for participant in os.listdir(participantsdir):
    participantdir=participantsdir+participant
    # NOTE: check if the webring participant is yourself; if it is, skip it
    if participant != myinstance: # prod: don't use your own instance
    #if participant == myinstance: # preprod: testing only on your own instance
        # overwrite the existing files in the participant's directory with their version (download all their csv files again)
        basewurl='http://'+participant+'/participants/'+participant+'/'
        print(basewurl)
        print('[+] Downloading the files of',participant,':')
        w_vcsv=basewurl+'verified.csv'
        w_uvcsv=basewurl+'unverified.csv'
        w_blcsv=basewurl+'blacklist.csv'
        w_scsv=basewurl+'sensitive.csv'
        w_webcsv=basewurl+'webring-participants.csv'
        # verify that the participant's csv files exist at basewurl before proceeding
        if CheckUrl(w_vcsv) is False or CheckUrl(w_uvcsv) is False or CheckUrl(w_blcsv) is False or CheckUrl(w_scsv) is False or CheckUrl(w_webcsv) is False:
            print("[-] Webring Participant isn't reachable, skipping")
            #return False # don't do anything if the webring participant isn't reachable
        else: # the webring participant is reachable, proceed
            print("[+] Webring Participant is reachable, updating their csv files:")
            for i in ['verified.csv','unverified.csv','blacklist.csv','sensitive.csv','webring-participants.csv']:
                # for each csv file: URL is basewurl+i, destination path is participantdir/i
                # download the external csv file and save it into the destination file:
                response = requests.get(basewurl+i, proxies=proxies)
                text = response.text
                csvfilepath=participantdir+'/'+i
                with open(csvfilepath, "w") as file:
                    file.write(text)
            # download the banner.png image:
            bannerurl=basewurl+'banner.png'
            bannerpath=participantdir+'/banner.png'
            r = requests.get(bannerurl, stream=True, proxies=proxies)
            with open(bannerpath, 'wb') as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
            # SANITY CHECK ON THE BANNER PNG IMAGE:
            if not IsBannerValid(bannerpath):
                # if invalid, replace it with the default template banner png file
                os.remove(bannerpath)
                bannertemplatepath=templatepath+'banner.png'
                shutil.copyfile(bannertemplatepath, bannerpath)
#print("[+] Webring Participant is valid, adding it if it's not already added.")
#print('[+] PARTICIPANT=',participant)
# check if the participant is already listed in webring-participants.csv or not, and add them if not already listed
# and display only the matching entries in unverified.csv in an array format (display it in CLI).
filter_wdf = wdf[wdf.URL.str.contains(participant)]
#print(filter_wdf[['Name','URL']])
# check if there are no results, dont proceed if there are none!
if filter_wdf.size == 0: #skip if webring participant is already listed, otherwise proceed
newrow=[name,participant,desc,trusted,status,score]
#print("[+] NEWROW=",newrow)
wdf.loc[-1] = newrow # adding a row
wdf.index = wdf.index + 1 # shifting index
wdf = wdf.sort_index() # sorting by index
#print("[+] New row added! now writing the csv file:",webringcsvfile)
wdf.to_csv(webringcsvfile, index=False)
else:
pass
#print('[+] Webring participant is already listed in your own webring-participants.csv file!')
            # iterate through the participant's verified.csv and unverified.csv files
            for w in ['verified.csv','unverified.csv']:
                csvfilepath=participantdir+'/'+w
                print(csvfilepath)
                csvdf = pd.read_csv(csvfilepath)
                rows2delete= [] # indexes of rows to drop once the whole file has been checked
                for i,j in csvdf.iterrows():
                    row=csvdf.loc[i,:].values.tolist()
                    print(row)
                    ################################ SANITY CHECKS ####################################
                    ### SANITY CHECK 1: mark rows that have invalid formatting for deletion ###
                    if IsUrlValid(csvdf.at[i, 'Instance']) is False or IsCategoryValid(csvdf.at[i, 'Category']) is False or IsNameValid(csvdf.at[i, 'Name']) is False or IsUrlValid(csvdf.at[i, 'URL']) is False or IsStatusValid(csvdf.at[i, 'Sensitive']) is False or IsDescriptionValid(csvdf.at[i, 'Description']) is False or IsStatusValid(csvdf.at[i, 'Status']) is False or IsScoreValid(csvdf.at[i, 'Score']) is False:
                        # mark the row for deletion as it has invalid inputs
                        if i not in rows2delete:
                            print("Marking row", i,"for deletion, as it has invalid inputs")
                            rows2delete.append(i) # mark the row for deletion if not already done
                    ### SANITY CHECK 2: mark rows that match a blacklisted word for deletion ###
                    for k,l in bldf.iterrows():
                        blword=bldf.at[k, 'blacklisted-words']
                        if any(blword in str(x) for x in row):
                            if i not in rows2delete:
                                print("Marking row", i,"for deletion, as it matches with a blacklisted word")
                                rows2delete.append(i) # mark the row for deletion if not already done
                        else:
                            # not a blacklisted link, therefore it is suitable to be added to your own csv files:
                            ################################ CHECKING FOR DUPLICATES! #########################
                            # for each link in the participant's verified/unverified csv files,
                            # check if the link is already listed in your own verified.csv or unverified.csv
                            filterterm=csvdf.at[i, 'URL']
                            filter_vdf= vdf[vdf.URL.str.contains(filterterm)]
                            filter_uvdf= uvdf[uvdf.URL.str.contains(filterterm)]
                            if len(filter_uvdf.index) == 0 and len(filter_vdf.index) == 0:
                                # the link doesn't exist in either of your verified/unverified csv files,
                                # so append it to your own unverified.csv file:
                                newrow=row
                                uvdf.loc[-1] = newrow # adding a row
                                uvdf.index = uvdf.index + 1 # shifting index
                                uvdf = uvdf.sort_index() # sorting by index
                                uvdf.to_csv(unverifiedcsvfile, index=False)
                                print("[+] New row added to your own unverified.csv file!")
                            else:
                                print('[-] Skipping row as it is already added in',w,row)
                    ### SANITY CHECK 3: mark all the rows that are supposed to be sensitive ###
                    for k,l in sedf.iterrows():
                        seword=sedf.at[k, 'sensitive-words']
                        if any(seword in str(x) for x in row):
                            if csvdf.at[i, 'Sensitive'] != '✔️':
                                print("Marking row", i,"as sensitive, as it matches with a sensitive word")
                                csvdf.at[i, 'Sensitive']='✔️'
                print('[-] Rows to delete: ',rows2delete)
                # only delete rows after having gone through all of the unverified.csv or verified.csv rows
                for i in rows2delete:
                    row=csvdf.loc[i,:].values.tolist()
                    print('[+] REMOVING ROW :',i,row)
                    csvdf.drop(i, inplace=True)
                csvdf.to_csv(csvfilepath, index=False)
                rows2delete= [] # reset the list for the next file
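The row insertion used twice above (for wdf and for uvdf) relies on a pandas idiom for prepending a row: write to temporary index -1, shift every index up by one, then sort. A minimal, self-contained sketch of just that idiom; the two-column frame and the values are illustrative stand-ins, not the script's real webring-participants.csv schema:

import pandas as pd

wdf = pd.DataFrame({'Name': ['alice'], 'URL': ['alice.onion']})  # toy stand-in
wdf.loc[-1] = ['bob', 'bob.onion']  # new row lands at temporary index -1
wdf.index = wdf.index + 1           # shift: -1 becomes 0, the old 0 becomes 1
wdf = wdf.sort_index()              # ascending order again; the new row is first
print(wdf)                          # bob is listed before alice

Unlike a plain append, this sequence puts each newly discovered entry at the top of the csv when the frame is written back out.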
@@ -682,18 +746,18 @@ def CheckUrl(url):
    }
    try:
        status = requests.get(url,proxies=proxies, timeout=5).status_code
        #print('[+]',url,status)
        if status != 502:
            #print(url,"✔️")
            return True
        else:
            #print(url,"❌")
            return False
    except requests.ConnectionError as e:
        #print(url,"❌")
        return False
    except requests.exceptions.ReadTimeout as e:
        #print(url,"❌")
        return False
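CheckUrl only reports an endpoint as unreachable on an HTTP 502, a connection error, or a read timeout; any other status counts as reachable. As a sketch of how the caller in the first hunk drives it, the chained `is False or` test can be read as an all() over the five csv endpoints; example.onion below is a placeholder, not a real participant:

basewurl = 'http://example.onion/participants/example.onion/'  # placeholder
csvnames = ['verified.csv','unverified.csv','blacklist.csv','sensitive.csv','webring-participants.csv']
# sync a participant only if every one of its csv endpoints answers:
if all(CheckUrl(basewurl + name) for name in csvnames):
    print('[+] Webring Participant is reachable, updating their csv files:')
else:
    print("[-] Webring Participant isn't reachable, skipping")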
#### PROTECTIONS AGAINST MALICIOUS CSV INPUTS ####
@@ -825,6 +889,10 @@ def IsScoreValid(score:str)->bool:
    pattern = re.compile("^[0-9.,]+$")
    score = str(score)
    score = score.strip()
    if score in ['','nan']:
        # Score can be empty when initially added
        return True
    if pattern.fullmatch(score) is None:
        # reject scores containing anything other than digits, dots, or commas
        return False
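With the new guard, a score that is still empty, or that pandas read back as the string 'nan', passes validation instead of being rejected by the regex. A few illustrative calls, limited to the branches visible in this hunk:

assert IsScoreValid('') is True      # entry was just added, no score assigned yet
assert IsScoreValid('nan') is True   # a pandas NaN stringified by str(score)
assert IsScoreValid('abc') is False  # letters fail the ^[0-9.,]+$ pattern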