finish option 12: listing crawled links

root 2025-05-03 22:54:04 +02:00
parent 6de26c5fa5
commit 212e44c3a5
3 changed files with 127 additions and 3 deletions



@@ -130,6 +130,7 @@ def main():
secsvfile=instancepath+'/sensitive.csv'
webpcsvfile=instancepath+'/webring-participants.csv'
submission_file_abs_path = os.path.abspath('submissions/submission.csv')
crawled_file_abs_path = os.path.abspath('crawler/onion_crawler.csv')
if not os.path.exists(instancepath):
print_colors(f"{rootpath}",is_error=True, bold=True)
@@ -182,10 +183,11 @@ Maintenance:
9) Remove the duplicate URLs for your own instance
10) Perform sanity checks on all csv files for all instances (to mark them as sensitive / or remove the ones that are blacklisted)
11) Review submissions (Add to verified.csv /add to unverified.csv /delete /blacklist)
12) Review crawled websites (Add to verified.csv /add to unverified.csv /delete /blacklist)
0) Exit
""")
option = input("Select an option? (0-11): ").strip()
option = input("Select an option? (0-12): ").strip()
try:
option = int(option)
except ValueError:
@@ -1202,6 +1204,7 @@ Maintenance:
break
case 11:
#review the submitted websites:
try:
submission_df = pd.read_csv(submission_file_abs_path)
verified_csv_df = pd.read_csv(verifiedcsvfile)
@@ -1215,6 +1218,7 @@ Maintenance:
print_colors("\n1) Move entry to verified.csv \n2) Move entry from submission.csv to unverified.csv \n3) Delete from submission.csv file \n4) Add to blacklist.csv \n-1) exit")
if link in blacklisted_words:
print_colors("Black listed entry found", bold=True)
#TODO delete the entry as its already blacklisted
continue
else:
name = row['name']
@@ -1277,7 +1281,127 @@ Maintenance:
break
finally:
print_colors("End of file")
print_colors("No more submissions to review, exiting.")
case 12:
# review the crawled websites
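# iterate over every onion_crawler.csv entry and let the operator move it to verified.csv, unverified.csv or blacklist.csv, or drop it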
try:
print(crawled_file_abs_path)
crawled_df = pd.read_csv(crawled_file_abs_path)
verified_csv_df = pd.read_csv(verifiedcsvfile)
unverified_csv_df = pd.read_csv(unverifiedcsvfile)
blacklist_df = pd.read_csv(blcsvfile)
blacklisted_words = [word for word in blacklist_df['blacklisted-words']]
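# plain Python list of blacklisted words, used below for a membership check on each crawled URL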
for i, row in crawled_df.iterrows():
link = row['URL']
print('\n',row[['URL','Category','Name']])
print('\nLink to verify: ',link)
print_colors("\n1) Move entry to verified.csv \n2) Move entry from submission.csv to unverified.csv \n3) Delete from submission.csv file \n4) Add to blacklist.csv \n-1) exit")
if link in blacklisted_words:
print_colors("Black listed entry found", bold=True)
# entry is already blacklisted: drop it from the crawled file
crawled_df.drop(index=i,inplace=True)
crawled_df.to_csv(crawled_file_abs_path, index=False)
continue
else:
name = row['Name']
category = row['Category']
#desc = row['esc']
desc = ''
#sensi = "YES" if row['sensitive'] == 'y' else "NO"
sensi = ''
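# note: a non-numeric answer below raises ValueError, which the outer except block catches, aborting the review loop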
number = int(input("Enter an option: "))
if number == 1:
# Add to verified.csv
# ask the name if invalid
while(IsNameValid(name) is not True):
name = input("What is the name of the website? ")
# ask the category
while((IsCategoryValid(category) != True) or (category == 'Tor Hidden Service')):
category = input("What is the website Category? (ex: Indexes) ")
# ask the sensitivity
choice=input("Is the website sensitive ? (ex: related to drugs) (y/n) ")
if choice == "n":
sensi = 'NO'
else:
sensi = 'YES'
# description is left empty since the crawler does not collect one
newrow=[instance,category,name,link,sensi,desc,'YES','100']
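# prepend the new row: loc[-1] adds it with index -1, shifting and re-sorting the index moves it to the top before the final category/score sort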
verified_csv_df.loc[-1] = newrow # adding a row
verified_csv_df.index = verified_csv_df.index + 1 # shifting index
verified_csv_df = verified_csv_df.sort_index() # sorting by index
verified_csv_df = verified_csv_df.sort_values(by=["Category","Score"], ascending=[True,False]) # sorting categories
print_colors("[+] New row added! now writing the csv file")
verified_csv_df.to_csv(verifiedcsvfile, index=False)
crawled_df.drop(index=i,inplace=True)
crawled_df.to_csv(crawled_file_abs_path, index=False)
elif number == 2:
# Add to unverified.csv
# same prompts as option 1, but the entry is written to unverified.csv instead
# ask the name if invalid
while(IsNameValid(name) is not True):
name = input("What is the name of the website? ")
# ask the category
print('CATEGORY = ', category)
while((IsCategoryValid(category) != True) or (category == 'Tor Hidden Service')):
category = input("What is the website Category? (ex: Indexes) ")
choice=input("Is the website sensitive ? (ex: related to drugs) (y/n) ")
if choice == "n":
sensi = 'NO'
else:
sensi = 'YES'
# add the new row to unverified.csv
newrow=[instance,category,name,link,sensi,desc,'YES','100']
unverified_csv_df.loc[-1] = newrow # adding a row
unverified_csv_df.index = unverified_csv_df.index + 1 # shifting index
unverified_csv_df = unverified_csv_df.sort_index() # sorting by index
unverified_csv_df = unverified_csv_df.sort_values(by=["Category","Score"], ascending=[True,False]) # sorting categories
print_colors("[+] New row added! now writing the csv file")
unverified_csv_df.to_csv(unverifiedcsvfile, index=False)
crawled_df.drop(index=i,inplace=True)
crawled_df.to_csv(crawled_file_abs_path, index=False)
elif number == 3:
# Delete from onion_crawler.csv
crawled_df.drop(index=i,inplace=True)
crawled_df.to_csv(crawled_file_abs_path, index=False)
elif number == 4:
# Add to blacklist.csv
newrow=[link]
blacklist_df.loc[-1] = newrow # adding a row
blacklist_df.index = blacklist_df.index + 1 # shifting index
blacklist_df = blacklist_df.sort_index() # sorting by index
print_colors("[+] New row added! now writing the csv file")
blacklist_df.to_csv(blcsvfile, index=False)
crawled_df.drop(index=i,inplace=True)
crawled_df.to_csv(crawled_file_abs_path, index=False)
elif number == -1:
break
else:
print_colors("Invalid Number",is_error=True)
continue
except Exception as e:
print_colors(f'Try again {e}',is_error=True)
break
finally:
print_colors("No more crawled websites to review, exiting.")
break
case 0:

submissions/README.md Normal file → Executable file