diff --git a/scripts/lantern.py b/scripts/lantern.py index 43e5c8c..746a1da 100644 --- a/scripts/lantern.py +++ b/scripts/lantern.py @@ -674,6 +674,7 @@ Maintenance: uvdf = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip') # TODO DELETE ALL DUPLICATES OF UVDF ! uvdf = uvdf.drop_duplicates(subset=['URL'], keep="first", inplace=False) + uvdf.to_csv(unverifiedcsvfile, index=False) filter_uvdf= uvdf[uvdf.URL.str.contains(filterterm,na=False)] if len(filter_uvdf.index) == 0 and len(filter_vdf.index) == 0: newrow=row @@ -709,6 +710,13 @@ Maintenance: csvdf.drop(i, inplace= True) csvdf.to_csv(csvfilepath, index=False) rows2delete= [] # it is an empty list at first + # TODO DELETE ALL DUPLICATES OF UVDF ! + uvdf = uvdf.sort_index() # sorting by index + uvdf = uvdf.sort_values(by=["Category","Score"], ascending=[True,False]) # sorting categories + + uvdf = uvdf.drop_duplicates(subset=['URL'], keep="first", inplace=False) + uvdf.to_csv(unverifiedcsvfile, index=False) + break