diff --git a/scripts/lantern.py b/scripts/lantern.py index a4d973d..bf12ca9 100644 --- a/scripts/lantern.py +++ b/scripts/lantern.py @@ -635,6 +635,7 @@ Maintenance: print("[+] Removing the participant's duplicate entries... ") # REMOVE DUPLICATES !!! do not accept any duplicate from remote participants csvdf = csvdf.drop_duplicates(subset=['URL'], keep="first", inplace=False) + csvdf = csvdf.drop_duplicates(subset=['Name'], keep="first", inplace=False) csvdf.to_csv(csvfilepath, index=False) csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip') @@ -649,6 +650,7 @@ Maintenance: + ################################ SANITY CHECKS #################################### ### SANITY CHECK 0: make sure that ✔️ and x are replaced with YES/NO, as it changed since v1.0.1 ### if csvdf.at[i, 'Status'] == "✔️" or csvdf.at[i, 'Status'] == "YES" : @@ -689,13 +691,16 @@ Maintenance: filterterm=csvdf.at[i, 'URL'] #print('1)',filterterm) filter_vdf= vdf[vdf.URL.str.contains(filterterm,na=False)] + filter_vdf2= vdf[vdf.Name.str.contains(filterterm,na=False)] # do not accept the new link if the name already exists in verified.csv #print('2)',filter_vdf) #print('3)',uvdf[uvdf.URL.str.contains(filterterm,na=False)] ) uvdf = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip') # TODO DELETE ALL DUPLICATES OF UVDF ! 
uvdf = uvdf.drop_duplicates(subset=['URL'], keep="first", inplace=False) + uvdf = uvdf.drop_duplicates(subset=['Name'], keep="first", inplace=False) filter_uvdf= uvdf[uvdf.URL.str.contains(filterterm,na=False)] - if len(filter_uvdf.index) == 0 and len(filter_vdf.index) == 0: + filter_uvdf2= uvdf[uvdf.Name.str.contains(filterterm,na=False)] # do not accept the new link if the name already exists in unverified.csv + if len(filter_uvdf.index) == 0 and len(filter_vdf.index) == 0 and len(filter_uvdf2.index) == 0 and len(filter_vdf2.index) == 0 : newrow=row uvdf.loc[-1] = newrow # adding a row uvdf.index = uvdf.index + 1 # shifting index @@ -723,12 +728,66 @@ Maintenance: #print_colors(f'[-] Rows to delete: {rows2delete}', is_error=True) # only delete rows after you've gone through all the unverified.csv OR verified.csv rows' + # check for NAME duplicates and mark them for deletion: + # remove name duplicates that are in unverified.csv yet exist in verified.csv (as verified.csv takes the priority) + if w == 'unverified.csv': + try: + # check if the given row Name already exists in verified.csv + filterterm=csvdf.at[i, 'Name'] + filter_vdf= vdf[vdf.Name.str.contains(filterterm,na=False)] + print('[+] CHECKING FOR DUPLICATES: ',filterterm) + if len(filter_vdf.index) != 0: + # drop the unverified.csv row if its name already exists in verified.csv + print('[+] DUPLICATE FOUND, MARKING ROW FOR DELETION: ',row) + rows2delete.append(i) #mark the row for deletion if not already done + except: + pass + + for i in rows2delete: row=csvdf.loc[i,:].values.tolist() print_colors(f'[+] REMOVING ROW: {i}{row}') csvdf.drop(i, inplace= True) csvdf.to_csv(csvfilepath, index=False) rows2delete= [] # it is an empty list at first + + # fill missing description in our unverified.csv that other participants verified.csv have filled + if w == 'verified.csv': + uvdf = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip') + # merge participant's verified.csv on our unverified.csv on URL + merged_df = 
uvdf.merge(csvdf[['URL', 'Description']], + on='URL', + how='left', + suffixes=('', '_participant')) + # filter empty description that has participant's description + no_descr_filter = ((merged_df['Description'].isna()) | (merged_df['Description'].str.strip() == '')) & \ + (~merged_df['Description_participant'].isna()) & (merged_df['Description_participant'].str.strip() != '') + no_descr_filter_count = no_descr_filter.sum() + # update our empty description if the participant has any filled description + if no_descr_filter_count > 0: + merged_df.loc[no_descr_filter, 'Description'] = merged_df.loc[no_descr_filter, 'Description_participant'] + # keep only original columns + uvdf_updated = merged_df[uvdf.columns] + uvdf_updated.to_csv(unverifiedcsvfile, index=False) + print(f'[+] Updated {no_descr_filter_count} empty description(s) in your unverified.csv found on participant\'s {w}') + # remove all name duplicates from your own unverified.csv file: + for i,j in uvdf.iterrows(): + row=uvdf.loc[i,:].values.tolist() + # check if the given row Name already exists in verified.csv + filterterm=uvdf.at[i, 'Name'] + filter_vdf= vdf[vdf.Name.str.contains(filterterm,na=False)] + print('[+] CHECKING FOR DUPLICATES: ',filterterm) + if len(filter_vdf.index) != 0: + # drop the unverified.csv row if its name already exists in verified.csv + print('[+] DUPLICATE FOUND, MARKING ROW FOR DELETION: ',row) + rows2delete.append(i) #mark the row for deletion if not already done + for i in rows2delete: + row=uvdf.loc[i,:].values.tolist() + print_colors(f'[+] REMOVING ROW: {i}{row}') + uvdf.drop(i, inplace= True) + uvdf.to_csv(unverifiedcsvfile, index=False) + rows2delete= [] # it is an empty list at first + break