fix lantern: remove duplicates upon adding new link

This commit is contained in:
root 2025-05-09 15:56:04 +02:00
parent a418270b4a
commit a39cf7fdb0
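
Before a new link is appended, any rows sharing its URL are now removed from both verified.csv and unverified.csv, and unverified.csv is additionally deduplicated on its URL column when it is reloaded and when rows are pruned.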


@@ -229,8 +229,28 @@ Maintenance:
newrow = [instance, category, name, url, sensi, desc, 'YES', '100']
print_colors(f"[+] NEWROW= {newrow}")
# (the rest is automatic: status and score; instance is '' because it is your own instance)
# TODO: check that the entry doesn't already exist in verified.csv and in unverified.csv;
# if it doesn't exist, add it into unverified.csv
# delete existing entries in verified.csv
vdf_same_url_filter = vdf["URL"] == url  # boolean mask: rows with the same url
vdf_same_url_filter_count = vdf_same_url_filter.sum()  # total url matches
if vdf_same_url_filter_count > 0:
    print(f"Found {vdf_same_url_filter_count} row(s) with the same url in verified.csv")
    for index, row in vdf[vdf_same_url_filter].iterrows():
        print_colors(f"[+] ROW[{index}]= {list(row)}")
    vdf = vdf[~vdf_same_url_filter].reset_index(drop=True)  # keep only the rows that do not match the filter
    print(f"Deleted {vdf_same_url_filter_count} row(s) with the same url in verified.csv")
    if desc == '':  # an empty description means the entry goes into unverified.csv, so save the modified verified.csv now
        vdf.to_csv(verifiedcsvfile, index=False)
# delete existing entries in unverified.csv
uvdf_same_url_filter = uvdf["URL"] == url  # boolean mask: rows with the same url
uvdf_same_url_filter_count = uvdf_same_url_filter.sum()  # total url matches
if uvdf_same_url_filter_count > 0:
    print(f"Found {uvdf_same_url_filter_count} row(s) with the same url in unverified.csv")
    for index, row in uvdf[uvdf_same_url_filter].iterrows():
        print_colors(f"[+] ROW[{index}]= {list(row)}")
    uvdf = uvdf[~uvdf_same_url_filter].reset_index(drop=True)  # keep only the rows that do not match the filter
    print(f"Deleted {uvdf_same_url_filter_count} row(s) with the same url in unverified.csv")
    if desc != '':  # a non-empty description means the entry goes into verified.csv, so save the modified unverified.csv now
        uvdf.to_csv(unverifiedcsvfile, index=False)
if desc == '':  # an empty description means the new entry goes into unverified.csv
    print("Adding new row in unverified.csv since description is empty")
    uvdf.loc[-1] = newrow  # append the new row at a temporary index
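
For reference, a minimal, self-contained sketch of the same-URL removal pattern used above. The "URL" column name mirrors the diff; the sample data is hypothetical:

import pandas as pd

df = pd.DataFrame({
    "Name": ["a", "b", "c"],
    "URL": ["http://x.onion", "http://y.onion", "http://x.onion"],
})
url = "http://x.onion"

same_url = df["URL"] == url  # boolean mask of rows matching the new link's url
if same_url.sum() > 0:
    df = df[~same_url].reset_index(drop=True)  # drop the matches, renumber rows
# df now holds only the http://y.onion row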
@@ -674,7 +694,6 @@ Maintenance:
uvdf = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip')
# drop duplicate URLs in uvdf, keeping the first occurrence
uvdf = uvdf.drop_duplicates(subset=['URL'], keep='first')
uvdf.to_csv(unverifiedcsvfile, index=False)
filter_uvdf = uvdf[uvdf.URL.str.contains(filterterm, na=False)]
if len(filter_uvdf.index) == 0 and len(filter_vdf.index) == 0:
    newrow = row
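
A minimal sketch of why na=False matters in the str.contains filter above (sample data is hypothetical): without it, missing URLs produce NaN in the result, which cannot be used as a boolean mask:

import pandas as pd

df = pd.DataFrame({"URL": ["http://x.onion", None, "http://y.onion"]})

# na=False maps missing URLs to False, so the result is a valid boolean mask
mask = df.URL.str.contains("x.onion", na=False)
print(df[mask])  # only the http://x.onion row
# df[df.URL.str.contains("x.onion")] would raise on the NaN entry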
@@ -710,13 +729,6 @@ Maintenance:
csvdf.drop(i, inplace=True)
csvdf.to_csv(csvfilepath, index=False)
rows2delete = []  # start with an empty list
# drop duplicate URLs in uvdf, keeping the first occurrence in the sorted order
uvdf = uvdf.sort_index()  # sort by index
uvdf = uvdf.sort_values(by=["Category","Score"], ascending=[True,False])  # sort by category, then by descending score
uvdf = uvdf.drop_duplicates(subset=['URL'], keep='first')
uvdf.to_csv(unverifiedcsvfile, index=False)
break
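
Because drop_duplicates(keep='first') retains the first row in the current order, sorting by descending Score beforehand means the highest-scoring duplicate of each URL survives. A minimal sketch with hypothetical data:

import pandas as pd

uvdf = pd.DataFrame({
    "Category": ["Search", "Search"],
    "URL": ["http://x.onion", "http://x.onion"],
    "Score": [40, 90],
})

uvdf = uvdf.sort_values(by=["Category", "Score"], ascending=[True, False])
uvdf = uvdf.drop_duplicates(subset=["URL"], keep="first")
print(uvdf)  # keeps the Score=90 row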
@@ -1446,3 +1458,4 @@ Maintenance:
if __name__ == '__main__':
main()