Mirror of http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/darknet-lantern.git
fix lantern: remove duplicates upon adding new link
parent a418270b4a
commit a39cf7fdb0
1 changed file with 23 additions and 10 deletions
@@ -229,8 +229,28 @@ Maintenance:
             newrow=[instance,category,name,url,sensi,desc,'YES','100']
             print_colors(f"[+] NEWROW= {newrow}")
             # (rest is automatic: status, score, instance is = '' because it is your own instance)
-            # TODO check if the entry doesnt already exist in verified.csv and in unverified.csv
-            # if it doesnt exist, add it into unverified.csv
+            # delete existing entries in verified.csv
+            vdf_same_url_filter = vdf["URL"] == url # check for same url
+            vdf_same_url_filter_count = vdf_same_url_filter.sum() # total url matches
+            if vdf_same_url_filter_count > 0:
+                print(f"Found {vdf_same_url_filter_count} row(s) with the same url in verified.csv")
+                for index, row in vdf[vdf_same_url_filter].iterrows():
+                    print_colors(f"[+] ROW[{index}]= {list(row)}")
+                vdf = vdf[~vdf_same_url_filter].reset_index(drop=True) # keep only entries that do not match filter
+                print(f"Deleted {vdf_same_url_filter_count} row(s) with the same url in verified.csv")
+                if desc == '': # if the description is empty = it means that it goes in unverified.csv, so save modified verified.csv file now
+                    vdf.to_csv(verifiedcsvfile, index=False)
+            # delete existing entries in unverified.csv
+            uvdf_same_url_filter = uvdf["URL"] == url # check for same url
+            uvdf_same_url_filter_count = uvdf_same_url_filter.sum() # total url matches
+            if uvdf_same_url_filter_count > 0:
+                print(f"Found {uvdf_same_url_filter_count} row(s) with the same url in unverified.csv")
+                for index, row in uvdf[uvdf_same_url_filter].iterrows():
+                    print_colors(f"[+] ROW[{index}]= {list(row)}")
+                uvdf = uvdf[~uvdf_same_url_filter].reset_index(drop=True) # keep only entries that do not match filter
+                print(f"Deleted {uvdf_same_url_filter_count} row(s) with the same url in unverified.csv")
+                if desc != '': # if the description isnt empty = it means that it goes in verified.csv, so save modified unverified.csv file now
+                    uvdf.to_csv(unverifiedcsvfile, index=False)
             if desc == '': # if the description is empty = it means that it goes in unverified.csv
                 print("Adding new row in unverified.csv since description is empty")
                 uvdf.loc[-1] = newrow # adding a row

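For readers less familiar with the pandas idiom used in the hunk above, the duplicate removal boils down to building a boolean mask over the URL column and keeping only the rows where the mask is False. Below is a minimal, self-contained sketch of that pattern; the column name URL mirrors the lantern CSVs, but the sample rows and the new_url value are illustrative assumptions, not data from the repository.

import pandas as pd

# Illustrative stand-in for verified.csv / unverified.csv (assumed sample data).
vdf = pd.DataFrame({
    "Name": ["site a", "site b", "stale copy of site a"],
    "URL":  ["http://a.onion", "http://b.onion", "http://a.onion"],
})

new_url = "http://a.onion"  # the link being (re)added

same_url_filter = vdf["URL"] == new_url   # boolean mask: True where the URL matches
match_count = same_url_filter.sum()       # number of duplicate rows found

if match_count > 0:
    print(f"Found {match_count} row(s) with the same url")
    vdf = vdf[~same_url_filter].reset_index(drop=True)  # drop the matches, renumber the index

print(vdf)  # only "site b" is left; both copies of the matching URL are gone

In the commit itself the freshly built newrow is appended afterwards (the uvdf.loc[-1] context line), so only the old copies of the link disappear.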
@@ -674,7 +694,6 @@ Maintenance:
         uvdf = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip')
         # TODO DELETE ALL DUPLICATES OF UVDF !
         uvdf = uvdf.drop_duplicates(subset=['URL'], keep="first", inplace=False)
-        uvdf.to_csv(unverifiedcsvfile, index=False)
         filter_uvdf= uvdf[uvdf.URL.str.contains(filterterm,na=False)]
         if len(filter_uvdf.index) == 0 and len(filter_vdf.index) == 0:
             newrow=row

@@ -710,13 +729,6 @@ Maintenance:
                 csvdf.drop(i, inplace= True)
                 csvdf.to_csv(csvfilepath, index=False)
             rows2delete= [] # it is an empty list at first
-            # TODO DELETE ALL DUPLICATES OF UVDF !
-            uvdf = uvdf.sort_index() # sorting by index
-            uvdf = uvdf.sort_values(by=["Category","Score"], ascending=[True,False]) # sorting categories
-
-            uvdf = uvdf.drop_duplicates(subset=['URL'], keep="first", inplace=False)
-            uvdf.to_csv(unverifiedcsvfile, index=False)
-
             break

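The hunk above removes the later sort-and-dedupe block, which appears to leave the duplicate handling in one place: the drop_duplicates call that already runs right after unverified.csv is read (see the context lines in the -674 hunk). As a reminder of how that call behaves, here is a small standalone sketch with illustrative data, not code from the repository: with subset=['URL'] and keep='first', only the first row per URL survives, and since inplace=False returns a new DataFrame the result has to be assigned back.

import pandas as pd

# Illustrative frame with one duplicated URL (assumed sample data).
uvdf = pd.DataFrame({
    "URL":   ["http://a.onion", "http://b.onion", "http://a.onion"],
    "Score": [100, 90, 50],
})

# Keep only the first occurrence of each URL; inplace=False means the call
# returns a new DataFrame instead of modifying uvdf, so reassignment is needed.
uvdf = uvdf.drop_duplicates(subset=["URL"], keep="first", inplace=False)

print(uvdf)  # rows left: http://a.onion (Score 100) and http://b.onion (Score 90)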
@@ -1446,3 +1458,4 @@ Maintenance:
 
 if __name__ == '__main__':
     main()
+