make sure that SimpleX chatroom links containing a ';' aren't accepted, and that option 4 removes duplicates before iterating

root 2025-05-04 23:02:02 +02:00
parent 075ea091d4
commit 22489e571e
2 changed files with 35 additions and 23 deletions


@@ -144,11 +144,11 @@ def main():
 src=templatepath+i
 shutil.copyfile(src, filepath)
 # now that they exist, get vdf and uvdf and the rest
-vdf = pd.read_csv(verifiedcsvfile)
+vdf = pd.read_csv(verifiedcsvfile, on_bad_lines='skip')
-uvdf = pd.read_csv(unverifiedcsvfile)
+uvdf = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip')
-bldf = pd.read_csv(blcsvfile)
+bldf = pd.read_csv(blcsvfile, on_bad_lines='skip')
-sedf = pd.read_csv(secsvfile)
+sedf = pd.read_csv(secsvfile, on_bad_lines='skip')
-webpdf = pd.read_csv(webpcsvfile)
+webpdf = pd.read_csv(webpcsvfile, on_bad_lines='skip')
 print_colors(f"[+] file exists, your Webring URL is {instance}")
 ##### CHECK IF ARGUMENTS ARE PASSED TO ENTER PROMPT-LESS MODE #####
@@ -257,8 +257,8 @@ Maintenance:
 case 2:
 print_colors("[+] Trust/Untrust/Blacklist a Website entry (move an entry from unverified to verified.csv)")
 while True:
-vdf = pd.read_csv(verifiedcsvfile)
+vdf = pd.read_csv(verifiedcsvfile, on_bad_lines='skip')
-uvdf = pd.read_csv(unverifiedcsvfile)
+uvdf = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip')
 # search for a word
 name=''
@@ -385,7 +385,7 @@ Maintenance:
 for w in ['verified.csv','unverified.csv']:
 csvfilepath=instancepath+'/'+w
 print_colors(f"{csvfilepath}")
-csvdf = pd.read_csv(csvfilepath)
+csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
 rows2delete= [] # it is an empty list at first
 for i,j in csvdf.iterrows():
 row=csvdf.loc[i,:].values.tolist()
@@ -535,7 +535,7 @@ Maintenance:
 status=''
 score=''
 webringcsvfile=instancepath+'/'+'webring-participants.csv'
-wdf = pd.read_csv(webringcsvfile)
+wdf = pd.read_csv(webringcsvfile, on_bad_lines='skip')
 for participant in os.listdir(participantsdir):
 participantdir=participantsdir+participant
@@ -610,7 +610,15 @@ Maintenance:
 for w in ['verified.csv','unverified.csv']:
 csvfilepath=participantdir+'/'+w
 print_colors(f"{csvfilepath}")
-csvdf = pd.read_csv(csvfilepath)
+csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
+print("[+] Removing the participant's duplicate entries... ")
+# REMOVE DUPLICATES !!! do not accept any duplicate from remote participants
+csvdf = csvdf.drop_duplicates(subset=['URL'], keep="first", inplace=False)
+csvdf.to_csv(csvfilepath, index=False)
+csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
 bldf[['blacklisted-words']].iterrows()
 rows2delete= [] # it is an empty list at first
 for i,j in csvdf.iterrows():
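
Editor's aside (not part of the commit): this is the "option 4" half of the fix; duplicates are dropped on the URL column before the row-by-row sanity checks run. drop_duplicates returns a new frame (hence the reassignment with inplace=False); the write-out and immediate re-read above mirror the surrounding code but are not strictly required, since the returned frame is already deduplicated. A minimal sketch of the pattern on hypothetical data:

import pandas as pd

# Hypothetical participant data with a duplicated URL.
csvdf = pd.DataFrame({
    'URL': ['http://a.onion', 'http://a.onion', 'http://b.onion'],
    'Name': ['a', 'a-duplicate', 'b'],
})

# Keep only the first occurrence of each URL so a remote participant
# cannot feed the same entry through the checks twice.
csvdf = csvdf.drop_duplicates(subset=['URL'], keep='first')

for i, row in csvdf.iterrows():
    print(i, row['URL'])  # prints rows 0 and 2; the duplicate is gone
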
@@ -640,6 +648,7 @@ Maintenance:
 #mark the row for deletion as it has invalid inputs
 if i not in rows2delete:
 print_colors(f"Marking row {i} for deletion, as it has invalid inputs")
+print(row)
 rows2delete.append(i) #mark the row for deletion if not already done
 ### SANITY CHECK 2: Mark all rows that are not allowed (blacklist) for deletion ###
@@ -667,6 +676,7 @@ Maintenance:
 uvdf.index = uvdf.index + 1 # shifting index
 uvdf = uvdf.sort_index() # sorting by index
 uvdf.to_csv(unverifiedcsvfile, index=False)
+print("[+] NEW ROW =",newrow)
 print_colors("[+] New row added to your own unverified.csv file!")
 else:
 pass
@@ -736,7 +746,7 @@ Maintenance:
 score=''
 newrow=[name,webring_participant_url,desc,trusted,status,score]
 webringcsvfile=instancepath+'/'+'webring-participants.csv'
-wdf = pd.read_csv(webringcsvfile)
+wdf = pd.read_csv(webringcsvfile, on_bad_lines='skip')
 wdf.loc[-1] = newrow # adding a row
 wdf.index = wdf.index + 1 # shifting index
 wdf = wdf.sort_index() # sorting by index
@@ -783,7 +793,7 @@ Maintenance:
 ########### PERFORM SANITY CHECKS ON the webring participant's verified.csv and unverified.csv ##################
 for w in ['verified.csv','unverified.csv']:
 csvfilepath=participantdir+'/'+w
-csvdf = pd.read_csv(csvfilepath)
+csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
 #print_colors(bldf[['blacklisted-words']])
 bldf[['blacklisted-words']].iterrows()
@@ -852,7 +862,7 @@ Maintenance:
 while True:
 print_colors("[+] Trust/UnTrust/Blacklist a webring participant (Potentially dangerous)")
 webringcsvfile=instancepath+'/'+'webring-participants.csv'
-wdf = pd.read_csv(webringcsvfile)
+wdf = pd.read_csv(webringcsvfile, on_bad_lines='skip')
 print_colors(f'{wdf[["URL","Trusted"]]}')
 try:
 index = int(input("What is the index of the webring participant that you want to edit? -1 to exit ").strip())
@@ -1120,7 +1130,7 @@ Maintenance:
 csvfilepath = os.path.join(instancepath, w)
 print_colors(f"Processing file: {csvfilepath}")
 try:
-csvdf = pd.read_csv(csvfilepath)
+csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
 print_colors(f"Removing duplicates in {csvfilepath}")
 #print_colors(f"{csvdf[['URL']]}")
 csvdf = csvdf.drop_duplicates(subset=['URL'], keep="first", inplace=False)
@@ -1146,7 +1156,7 @@ Maintenance:
 for w in ['verified.csv','unverified.csv']:
 csvfilepath=participantdir+'/'+w
 print_colors(f"{csvfilepath}")
-csvdf = pd.read_csv(csvfilepath)
+csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
 rows2delete= [] # it is an empty list at first
 for i,j in csvdf.iterrows():
 row=csvdf.loc[i,:].values.tolist()
@@ -1208,10 +1218,10 @@ Maintenance:
 case 11:
 #review the submitted websites:
 try:
-submission_df = pd.read_csv(submission_file_abs_path)
+submission_df = pd.read_csv(submission_file_abs_path, on_bad_lines='skip')
-verified_csv_df = pd.read_csv(verifiedcsvfile)
+verified_csv_df = pd.read_csv(verifiedcsvfile, on_bad_lines='skip')
-unverified_csv_df = pd.read_csv(unverifiedcsvfile)
+unverified_csv_df = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip')
-blacklist_df = pd.read_csv(blcsvfile)
+blacklist_df = pd.read_csv(blcsvfile, on_bad_lines='skip')
 blacklisted_words = [word for word in blacklist_df['blacklisted-words']]
 for i, row in submission_df.iterrows():
 link = row['link']
@@ -1290,10 +1300,10 @@ Maintenance:
 # review the crawled websites
 try:
 print(crawled_file_abs_path)
-crawled_df = pd.read_csv(crawled_file_abs_path)
+crawled_df = pd.read_csv(crawled_file_abs_path, on_bad_lines='skip')
-verified_csv_df = pd.read_csv(verifiedcsvfile)
+verified_csv_df = pd.read_csv(verifiedcsvfile, on_bad_lines='skip')
-unverified_csv_df = pd.read_csv(unverifiedcsvfile)
+unverified_csv_df = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip')
-blacklist_df = pd.read_csv(blcsvfile)
+blacklist_df = pd.read_csv(blcsvfile, on_bad_lines='skip')
 blacklisted_words = [word for word in blacklist_df['blacklisted-words']]
 for i, row in crawled_df.iterrows():
 link = row['URL']


@@ -150,6 +150,8 @@ def IsUrlValid(url:str)->bool:
 else:
 if not url.__contains__('.'):
 return False
+if url.__contains__(';'):
+return False #required otherwise lantern thinks there are extra columns
 if pattern.fullmatch(url) is None:
 return False
 return True
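
Editor's aside (not part of the commit): SimpleX chatroom invite links embed ';'-separated parameters, and per the in-code comment above a stored ';' makes lantern see extra CSV columns, so IsUrlValid() now rejects such URLs before the regex even runs. A rough illustration of the check on hypothetical inputs (the SimpleX-style link below is made up and truncated):

# Hypothetical illustration of the new check in IsUrlValid().
candidates = [
    'http://example.onion/',                        # accepted
    'https://simplex.chat/contact#/?v=2&smp=...;',  # made-up SimpleX-style link, rejected
]
for url in candidates:
    if ';' in url:  # equivalent to url.__contains__(';'), just more idiomatic
        print(f"rejected (would look like extra CSV columns): {url}")
    else:
        print(f"accepted: {url}")
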