finish option 12: listing crawled links

root 2025-05-03 22:54:04 +02:00
parent 6de26c5fa5
commit 212e44c3a5
3 changed files with 127 additions and 3 deletions

Binary file not shown.


@@ -130,6 +130,7 @@ def main():
     secsvfile=instancepath+'/sensitive.csv'
     webpcsvfile=instancepath+'/webring-participants.csv'
     submission_file_abs_path = os.path.abspath('submissions/submission.csv')
+    crawled_file_abs_path = os.path.abspath('crawler/onion_crawler.csv')
     if not os.path.exists(instancepath):
         print_colors(f"{rootpath}",is_error=True, bold=True)
@@ -181,11 +182,12 @@ Managing Wordlists:
 Maintenance:
 9) Remove the duplicate URLs for your own instance
 10) Perform sanity checks on all csv files for all instances (to mark them as sensitive / or remove the ones that are blacklisted)
-11) Review submissions (Add to verified.csv/ add to unverified.csv/ delete /blacklist)
+11) Review submissions (Add to verified.csv /add to unverified.csv /delete /blacklist)
+12) Review crawled websites (Add to verified.csv /add to unverified.csv /delete /blacklist)
 0) Exit
 """)
-option = input("Select an option? (0-11): ").strip()
+option = input("Select an option? (0-12): ").strip()
 try:
     option = int(option)
 except ValueError:
@@ -1202,6 +1204,7 @@ Maintenance:
             break
         case 11:
+            # review the submitted websites
             try:
                 submission_df = pd.read_csv(submission_file_abs_path)
                 verified_csv_df = pd.read_csv(verifiedcsvfile)
@@ -1215,6 +1218,7 @@ Maintenance:
                     print_colors("\n1) Move entry to verified.csv \n2) Move entry from submission.csv to unverified.csv \n3) Delete from submission.csv file \n4) Add to blacklist.csv \n-1) exit")
                     if link in blacklisted_words:
                         print_colors("Black listed entry found", bold=True)
+                        # TODO: delete the entry, since it is already blacklisted
                         continue
                     else:
                         name = row['name']
@@ -1277,7 +1281,127 @@ Maintenance:
                 break
             finally:
-                print_colors("End of file")
+                print_colors("No more submissions to review, exiting.")
+        case 12:
+            # review the crawled websites
+            try:
+                print(crawled_file_abs_path)
+                crawled_df = pd.read_csv(crawled_file_abs_path)
+                verified_csv_df = pd.read_csv(verifiedcsvfile)
+                unverified_csv_df = pd.read_csv(unverifiedcsvfile)
+                blacklist_df = pd.read_csv(blcsvfile)
+                blacklisted_words = [word for word in blacklist_df['blacklisted-words']]
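+                # walk the crawled entries one by one; each entry is either
+                # promoted, deferred, deleted, or blacklisted by the reviewer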
+                for i, row in crawled_df.iterrows():
+                    link = row['URL']
+                    print('\n', row[['URL','Category','Name']])
+                    print('\nLink to verify: ', link)
+                    print_colors("\n1) Move entry to verified.csv \n2) Move entry to unverified.csv \n3) Delete from onion_crawler.csv \n4) Add to blacklist.csv \n-1) exit")
+                    if link in blacklisted_words:
+                        print_colors("Black listed entry found", bold=True)
+                        # already blacklisted: drop the entry and move on
+                        crawled_df.drop(index=i, inplace=True)
+                        crawled_df.to_csv(crawled_file_abs_path, index=False)
+                        continue
+                    else:
+                        name = row['Name']
+                        category = row['Category']
+                        #desc = row['desc']
+                        desc = ''
+                        #sensi = "YES" if row['sensitive'] == 'y' else "NO"
+                        sensi = ''
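+                        # the description stays empty for crawled entries; the
+                        # sensitivity flag is asked for in options 1 and 2 below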
+                        number = int(input("Enter an option: "))
+                        if number == 1:
+                            # Add to verified.csv
+                            # ask for the name if invalid
+                            while IsNameValid(name) is not True:
+                                name = input("What is the name of the website? ")
+                            # ask for the category
+                            while (IsCategoryValid(category) != True) or (category == 'Tor Hidden Service'):
+                                category = input("What is the website Category? (ex: Indexes) ")
+                            # ask whether the website is sensitive
+                            choice = input("Is the website sensitive? (ex: related to drugs) (y/n) ")
+                            if choice == "n":
+                                sensi = 'NO'
+                            else:
+                                sensi = 'YES'
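+                            # assumed verified.csv column order: Instance, Category,
+                            # Name, URL, Sensitive, Description, Status, Score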
+                            newrow = [instance,category,name,link,sensi,desc,'YES','100']
+                            verified_csv_df.loc[-1] = newrow  # adding a row
+                            verified_csv_df.index = verified_csv_df.index + 1  # shifting index
+                            verified_csv_df = verified_csv_df.sort_index()  # sorting by index
+                            verified_csv_df = verified_csv_df.sort_values(by=["Category","Score"], ascending=[True,False])  # sorting categories
+                            print_colors("[+] New row added! now writing the csv file")
+                            verified_csv_df.to_csv(verifiedcsvfile, index=False)
+                            crawled_df.drop(index=i, inplace=True)
+                            crawled_df.to_csv(crawled_file_abs_path, index=False)
+                        elif number == 2:
+                            # Add to unverified.csv
+                            # considered sensitive by default, and the category may just be 'crawled'
+                            # ask for the name if invalid
+                            while IsNameValid(name) is not True:
+                                name = input("What is the name of the website? ")
+                            # ask for the category
+                            print('CATEGORY = ', category)
+                            while (IsCategoryValid(category) != True) or (category == 'Tor Hidden Service'):
+                                category = input("What is the website Category? (ex: Indexes) ")
+                            choice = input("Is the website sensitive? (ex: related to drugs) (y/n) ")
+                            if choice == "n":
+                                sensi = 'NO'
+                            else:
+                                sensi = 'YES'
+                            # add the new row
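+                            # (same assumed column order as in verified.csv above)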
+                            newrow = [instance,category,name,link,sensi,desc,'YES','100']
+                            unverified_csv_df.loc[-1] = newrow  # adding a row
+                            unverified_csv_df.index = unverified_csv_df.index + 1  # shifting index
+                            unverified_csv_df = unverified_csv_df.sort_index()  # sorting by index
+                            unverified_csv_df = unverified_csv_df.sort_values(by=["Category","Score"], ascending=[True,False])  # sorting categories
+                            print_colors("[+] New row added! now writing the csv file")
+                            unverified_csv_df.to_csv(unverifiedcsvfile, index=False)
+                            crawled_df.drop(index=i, inplace=True)
+                            crawled_df.to_csv(crawled_file_abs_path, index=False)
+                        elif number == 3:
+                            # Delete from onion_crawler.csv
+                            crawled_df.drop(index=i, inplace=True)
+                            crawled_df.to_csv(crawled_file_abs_path, index=False)
+                        elif number == 4:
+                            # Add to blacklist.csv
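+                            # blacklist.csv holds a single 'blacklisted-words'
+                            # column, so the new row is just the URL itself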
+                            newrow = [link]
+                            blacklist_df.loc[-1] = newrow  # adding a row
+                            blacklist_df.index = blacklist_df.index + 1  # shifting index
+                            blacklist_df = blacklist_df.sort_index()  # sorting by index
+                            print_colors("[+] New row added! now writing the csv file")
+                            blacklist_df.to_csv(blcsvfile, index=False)
+                            crawled_df.drop(index=i, inplace=True)
+                            crawled_df.to_csv(crawled_file_abs_path, index=False)
+                        elif number == -1:
+                            break
+                        else:
+                            print_colors("Invalid Number", is_error=True)
+                            continue
+            except Exception as e:
+                print_colors(f"Error, try again: {e}", is_error=True)
+                break
+            finally:
+                print_colors("No more crawled websites to review, exiting.")
             break
         case 0:

submissions/README.md Normal file → Executable file