make sure that SimpleX chatroom links containing a ';' aren't accepted, and that option 4 removes duplicates before iterating

root 2025-05-04 23:02:02 +02:00
parent 075ea091d4
commit 22489e571e
2 changed files with 35 additions and 23 deletions


@@ -144,11 +144,11 @@ def main():
 src=templatepath+i
 shutil.copyfile(src, filepath)
 # now that they exist, get vdf and uvdf and the rest
-vdf = pd.read_csv(verifiedcsvfile)
+vdf = pd.read_csv(verifiedcsvfile, on_bad_lines='skip')
-uvdf = pd.read_csv(unverifiedcsvfile)
+uvdf = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip')
-bldf = pd.read_csv(blcsvfile)
+bldf = pd.read_csv(blcsvfile, on_bad_lines='skip')
-sedf = pd.read_csv(secsvfile)
+sedf = pd.read_csv(secsvfile, on_bad_lines='skip')
-webpdf = pd.read_csv(webpcsvfile)
+webpdf = pd.read_csv(webpcsvfile, on_bad_lines='skip')
 print_colors(f"[+] file exists, your Webring URL is {instance}")
 ##### CHECK IF ARGUMENTS ARE PASSED TO ENTER PROMPT-LESS MODE #####
@@ -257,8 +257,8 @@ Maintenance:
 case 2:
 print_colors("[+] Trust/Untrust/Blacklist a Website entry (move an entry from unverified to verified.csv)")
 while True:
-vdf = pd.read_csv(verifiedcsvfile)
+vdf = pd.read_csv(verifiedcsvfile, on_bad_lines='skip')
-uvdf = pd.read_csv(unverifiedcsvfile)
+uvdf = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip')
 # search for a word
 name=''
@@ -385,7 +385,7 @@ Maintenance:
 for w in ['verified.csv','unverified.csv']:
 csvfilepath=instancepath+'/'+w
 print_colors(f"{csvfilepath}")
-csvdf = pd.read_csv(csvfilepath)
+csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
 rows2delete= [] # it is an empty list at first
 for i,j in csvdf.iterrows():
 row=csvdf.loc[i,:].values.tolist()
@@ -535,7 +535,7 @@ Maintenance:
 status=''
 score=''
 webringcsvfile=instancepath+'/'+'webring-participants.csv'
-wdf = pd.read_csv(webringcsvfile)
+wdf = pd.read_csv(webringcsvfile, on_bad_lines='skip')
 for participant in os.listdir(participantsdir):
 participantdir=participantsdir+participant
@@ -610,7 +610,15 @@ Maintenance:
 for w in ['verified.csv','unverified.csv']:
 csvfilepath=participantdir+'/'+w
 print_colors(f"{csvfilepath}")
-csvdf = pd.read_csv(csvfilepath)
+csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
+print("[+] Removing the participant's duplicate entries... ")
+# REMOVE DUPLICATES !!! do not accept any duplicate from remote participants
+csvdf = csvdf.drop_duplicates(subset=['URL'], keep="first", inplace=False)
+csvdf.to_csv(csvfilepath, index=False)
+csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
 bldf[['blacklisted-words']].iterrows()
 rows2delete= [] # it is an empty list at first
 for i,j in csvdf.iterrows():
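
Editor's aside (not part of the commit): this is the "option 4" half of the fix; duplicates are dropped on the URL column before the row-by-row sanity checks run. drop_duplicates returns a new frame (hence the reassignment with inplace=False); the write-out and immediate re-read above mirror the surrounding code but are not strictly required, since the returned frame is already deduplicated. A minimal sketch of the pattern on hypothetical data:

import pandas as pd

# Hypothetical participant data with a duplicated URL.
csvdf = pd.DataFrame({
    'URL': ['http://a.onion', 'http://a.onion', 'http://b.onion'],
    'Name': ['a', 'a-duplicate', 'b'],
})

# Keep only the first occurrence of each URL so a remote participant
# cannot feed the same entry through the checks twice.
csvdf = csvdf.drop_duplicates(subset=['URL'], keep='first')

for i, row in csvdf.iterrows():
    print(i, row['URL'])  # prints rows 0 and 2; the duplicate is gone
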
@@ -640,6 +648,7 @@ Maintenance:
 #mark the row for deletion as it has invalid inputs
 if i not in rows2delete:
 print_colors(f"Marking row {i} for deletion, as it has invalid inputs")
+print(row)
 rows2delete.append(i) #mark the row for deletion if not already done
 ### SANITY CHECK 2: Mark all rows that are not allowed (blacklist) for deletion ###
@@ -667,6 +676,7 @@ Maintenance:
 uvdf.index = uvdf.index + 1 # shifting index
 uvdf = uvdf.sort_index() # sorting by index
 uvdf.to_csv(unverifiedcsvfile, index=False)
+print("[+] NEW ROW =",newrow)
 print_colors("[+] New row added to your own unverified.csv file!")
 else:
 pass
@@ -736,7 +746,7 @@ Maintenance:
 score=''
 newrow=[name,webring_participant_url,desc,trusted,status,score]
 webringcsvfile=instancepath+'/'+'webring-participants.csv'
-wdf = pd.read_csv(webringcsvfile)
+wdf = pd.read_csv(webringcsvfile, on_bad_lines='skip')
 wdf.loc[-1] = newrow # adding a row
 wdf.index = wdf.index + 1 # shifting index
 wdf = wdf.sort_index() # sorting by index
@@ -783,7 +793,7 @@ Maintenance:
 ########### PERFORM SANITY CHECKS ON the webring participant's verified.csv and unverified.csv ##################
 for w in ['verified.csv','unverified.csv']:
 csvfilepath=participantdir+'/'+w
-csvdf = pd.read_csv(csvfilepath)
+csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
 #print_colors(bldf[['blacklisted-words']])
 bldf[['blacklisted-words']].iterrows()
@@ -852,7 +862,7 @@ Maintenance:
 while True:
 print_colors("[+] Trust/UnTrust/Blacklist a webring participant (Potentially dangerous)")
 webringcsvfile=instancepath+'/'+'webring-participants.csv'
-wdf = pd.read_csv(webringcsvfile)
+wdf = pd.read_csv(webringcsvfile, on_bad_lines='skip')
 print_colors(f'{wdf[["URL","Trusted"]]}')
 try:
 index = int(input("What is the index of the webring participant that you want to edit? -1 to exit ").strip())
@@ -1120,7 +1130,7 @@ Maintenance:
 csvfilepath = os.path.join(instancepath, w)
 print_colors(f"Processing file: {csvfilepath}")
 try:
-csvdf = pd.read_csv(csvfilepath)
+csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
 print_colors(f"Removing duplicates in {csvfilepath}")
 #print_colors(f"{csvdf[['URL']]}")
 csvdf = csvdf.drop_duplicates(subset=['URL'], keep="first", inplace=False)
@@ -1146,7 +1156,7 @@ Maintenance:
 for w in ['verified.csv','unverified.csv']:
 csvfilepath=participantdir+'/'+w
 print_colors(f"{csvfilepath}")
-csvdf = pd.read_csv(csvfilepath)
+csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
 rows2delete= [] # it is an empty list at first
 for i,j in csvdf.iterrows():
 row=csvdf.loc[i,:].values.tolist()
@@ -1208,10 +1218,10 @@ Maintenance:
 case 11:
 #review the submitted websites:
 try:
-submission_df = pd.read_csv(submission_file_abs_path)
+submission_df = pd.read_csv(submission_file_abs_path, on_bad_lines='skip')
-verified_csv_df = pd.read_csv(verifiedcsvfile)
+verified_csv_df = pd.read_csv(verifiedcsvfile, on_bad_lines='skip')
-unverified_csv_df = pd.read_csv(unverifiedcsvfile)
+unverified_csv_df = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip')
-blacklist_df = pd.read_csv(blcsvfile)
+blacklist_df = pd.read_csv(blcsvfile, on_bad_lines='skip')
 blacklisted_words = [word for word in blacklist_df['blacklisted-words']]
 for i, row in submission_df.iterrows():
 link = row['link']
@@ -1290,10 +1300,10 @@ Maintenance:
 # review the crawled websites
 try:
 print(crawled_file_abs_path)
-crawled_df = pd.read_csv(crawled_file_abs_path)
+crawled_df = pd.read_csv(crawled_file_abs_path, on_bad_lines='skip')
-verified_csv_df = pd.read_csv(verifiedcsvfile)
+verified_csv_df = pd.read_csv(verifiedcsvfile, on_bad_lines='skip')
-unverified_csv_df = pd.read_csv(unverifiedcsvfile)
+unverified_csv_df = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip')
-blacklist_df = pd.read_csv(blcsvfile)
+blacklist_df = pd.read_csv(blcsvfile, on_bad_lines='skip')
 blacklisted_words = [word for word in blacklist_df['blacklisted-words']]
 for i, row in crawled_df.iterrows():
 link = row['URL']


@@ -150,6 +150,8 @@ def IsUrlValid(url:str)->bool:
 else:
 if not url.__contains__('.'):
 return False
+if url.__contains__(';'):
+return False #required otherwise lantern thinks there are extra columns
 if pattern.fullmatch(url) is None:
 return False
 return True
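
Editor's aside (not part of the commit): SimpleX chatroom invite links embed ';'-separated parameters, and per the in-code comment above a stored ';' makes lantern see extra CSV columns, so IsUrlValid() now rejects such URLs before the regex even runs. A rough illustration of the check on hypothetical inputs (the SimpleX-style link below is made up and truncated):

# Hypothetical illustration of the new check in IsUrlValid().
candidates = [
    'http://example.onion/',                        # accepted
    'https://simplex.chat/contact#/?v=2&smp=...;',  # made-up SimpleX-style link, rejected
]
for url in candidates:
    if ';' in url:  # equivalent to url.__contains__(';'), just more idiomatic
        print(f"rejected (would look like extra CSV columns): {url}")
    else:
        print(f"accepted: {url}")
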