Finished refactoring options 9 and 10

2025-07-01 19:06:41 +00:00 · 2025-05-30 19:38:04 +00:00 · 2025-05-30 19:38:04 +00:00 · 2a827c0b8b
commit 2a827c0b8b
parent 19e582203b
2 changed files with 84 additions and 97 deletions
--- a/scripts/lantern.py
+++ b/scripts/lantern.py
@@ -561,9 +561,12 @@ Maintenance:
                        
                        participant_url = generate_local_participant_dir(participant.URL)

-                        print_colors('[+] Reading webrring participant\'s verified and unverified, and removing unverified and blacklisted rows')
-                        participant_verified_df = lantern.clean_csv(pd.read_csv(f'{participant_url}verified.csv'), local_blacklist)
-                        participant_unverified_df = lantern.clean_csv(pd.read_csv(f'{participant_url}unverified.csv'), local_blacklist)
+                        print_colors('[+] Reading webrring participant\'s verified and unverified')
+                        participant_verified_df, participant_unverified_df = get_participant_local_verified_and_unverified(participant_url)
+
+                        print_colors('[+] Removing unvalidated and blacklisted rows')
+                        participant_verified_df = lantern.clean_csv(participant_verified_df, local_blacklist)
+                        participant_unverified_df = lantern.clean_csv(participant_unverified_df, local_blacklist)

                        print_colors('[+] Marking sensitive rows')
                        participant_verified_df = lantern.mark_sensitive(participant_verified_df, local_sensitive)
@@ -585,7 +588,6 @@ Maintenance:

                except Exception as err:
                    print_colors("[-] Option 4 failed suddently, please try again", is_error=True)
-                    raise err

                break

@@ -1005,11 +1007,14 @@ Maintenance:
                print_colors("[+] 9) Cleaning up all duplicates in your own unverified + verified.csv (based on the url)")
                
                try:
-
+                    
+                    print_colors('[+] Reading local verified and unverified')
                    verified_df, unverified_df = get_local_verified_and_unverified()

+                    print_colors('[+] Removing cross dataframe replications')
                    verified_df, unverified_df = remove_cross_dataframe_replications(verified_df, unverified_df)

+                    print_colors('[+] Saving local verified and unverified')
                    save_local_verified_and_unverified(verified_df, unverified_df)

                except Exception as err:
@@ -1020,80 +1025,30 @@ Maintenance:
            case 10:
                print_colors("[+] 10) perform sanity checks on all csv files (to mark them as sensitive / or remove the ones that are blacklisted)")

+                try:
+                    print_colors('[+] Reading local blacklist and sensitive words')
+                    local_blacklist, local_sensitive = get_local_blacklist_and_sensitive()

+                    for participant in os.listdir(conf.PARTICIPANT_DIR):
+                        participant_local_dir = conf.PARTICIPANT_DIR + participant + '/'

-                print_colors('[+] Reading local blacklist and sensitive words')
-                local_blacklist, local_sensitive = get_local_blacklist_and_sensitive()
+                        print_colors('[+] Reading webrring participant\'s verified and unverified')
+                        participant_verified_df, participant_unverified_df = get_participant_local_verified_and_unverified(participant_local_dir)

+                        print_colors('[+] Removing unverified and blacklisted rows')
+                        participant_verified_df = lantern.clean_csv(participant_verified_df, local_blacklist)
+                        participant_unverified_df = lantern.clean_csv(participant_unverified_df, local_blacklist)
+                        
+                        print_colors('[+] Marking sensitive rows')
+                        participant_verified_df = lantern.mark_sensitive(participant_verified_df, local_sensitive)
+                        participant_unverified_df = lantern.mark_sensitive(participant_unverified_df, local_sensitive)

+                        print_colors('[+] Saving local participant verified and unverified')
+                        save_local_participant_verified_and_unverified(participant_verified_df, participant_unverified_df, participant_local_dir)
+                
+                except Exception as err:
+                    print_colors("[-] Option 10 failed suddently, please try again", is_error=True)

-                participantspath = rootpath+'www/participants/'
-                for participant in os.listdir(participantspath):
-                    print_colors(f"Participant: {participant}")
-                    #read=input("Continue?")
-                    participantdir= participantspath+participant
-                    ################ BEGIN SANITY CHECKS FOR EACH PARTICIPANTS ##############
-                    # iterate through the participant's verified.csv and unverified.csv files
-                    for w in ['verified.csv','unverified.csv']:
-                        csvfilepath=participantdir+'/'+w
-                        print_colors(f"{csvfilepath}")
-                        csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
-                        rows2delete= [] # it is an empty list at first
-                        for i,j in csvdf.iterrows():
-                            row=csvdf.loc[i,:].values.tolist()
-                            #print_colors(f"{row}")
-
-
-
-                            ################################ SANITY CHECKS ####################################
-							### SANITY CHECK 0: make sure that ✔️ and x are replaced with YES/NO, as it changed since v1.0.1 ###
-                            if  csvdf.at[i, 'Status'] == "✔️" or csvdf.at[i, 'Status'] == "YES" :
-                                csvdf.at[i, 'Status'] = "YES" 
-                                csvdf.to_csv(csvfilepath, index=False)
-                            else:
-                                csvdf.at[i, 'Status'] = "NO" 
-                                csvdf.to_csv(csvfilepath, index=False)
-
-                            if  csvdf.at[i, 'Sensitive'] == "✔️" or csvdf.at[i, 'Sensitive'] == "YES" :
-                                csvdf.at[i, 'Sensitive'] = "YES" 
-                                csvdf.to_csv(csvfilepath, index=False)
-                            else:
-                                csvdf.at[i, 'Sensitive'] = "NO" 
-                                csvdf.to_csv(csvfilepath, index=False)
-							
-                            ### SANITY CHECK 1: Mark all the rows that have incorrect formatting for deletion###
-                            if IsURLValid(csvdf.at[i, 'Instance']) is False or IsCategoryValid(csvdf.at[i, 'Category']) is False or IsNameValid(csvdf.at[i, 'Name']) is False or IsURLValid(csvdf.at[i, 'URL']) is False or IsStatusValid(csvdf.at[i, 'Sensitive']) is False or  IsDescriptionValid(csvdf.at[i, 'Description']) is False or  IsStatusValid(csvdf.at[i, 'Status']) is False or  IsScoreValid(csvdf.at[i, 'Score']) is False:
-                                if i not in rows2delete:
-                                    print_colors(f"Marking row {i} for deletion, as it has invalid inputs")
-                                    #print_colors(f"{row}")
-                                    print(IsURLValid(csvdf.at[i, 'Instance']), IsCategoryValid(csvdf.at[i, 'Category']), IsNameValid(csvdf.at[i, 'Name']), IsURLValid(csvdf.at[i, 'URL']), IsStatusValid(csvdf.at[i, 'Sensitive']), IsDescriptionValid(csvdf.at[i, 'Description']), IsStatusValid(csvdf.at[i, 'Status']),  IsScoreValid(csvdf.at[i, 'Score']))
-                                    rows2delete.append(i)
-                                    read=input("Continue?")
-
-                            ### SANITY CHECK 2: Mark all rows that are not allowed (blacklist) for deletion ###
-                            for k,l in bldf.iterrows():
-                                blword=bldf.at[k, 'blacklisted-words']
-                                if any(blword in str(x) for x in row) == True:
-                                    if i not in rows2delete:
-                                        print_colors(f"Marking row {i} for deletion, as it matches with the blacklisted word {blword}")
-                                        rows2delete.append(i)
-                                        #read=input("Continue?")
-                            ### SANITY CHECK 3: Mark all rows that match sensitive words to be sensitive = YES
-                            for k,l in sedf.iterrows():
-                                seword=sedf.at[k, 'sensitive-words']
-                                if any(seword in str(x) for x in row) == True:
-                                    print_colors(f"Marking row {i} as sensitive, as it matches with the sensitive word {seword}")
-                                    csvdf.at[i, 'Sensitive']="YES"
-                                    csvdf.to_csv(csvfilepath, index=False)
-                                    #read=input("Continue?")
-
-
-                        for i in rows2delete:
-                            row=csvdf.loc[i,:].values.tolist()
-                            print_colors(f'[+] REMOVING ROW : {i} {row}')
-                            csvdf.drop(i, inplace= True)
-                            csvdf.to_csv(csvfilepath, index=False)
-                            #read=input("Continue?")
                break
            
            case 11: