diff --git a/SimpleX/__pycache__/regex_simplexlinks.cpython-311.pyc b/SimpleX/__pycache__/regex_simplexlinks.cpython-311.pyc
new file mode 100644
index 0000000..1fe6ae3
Binary files /dev/null and b/SimpleX/__pycache__/regex_simplexlinks.cpython-311.pyc differ
diff --git a/scripts/lantern.py b/scripts/lantern.py
index 19c9cfb..a31acc2 100644
--- a/scripts/lantern.py
+++ b/scripts/lantern.py
@@ -130,6 +130,7 @@ def main():
     secsvfile=instancepath+'/sensitive.csv'
     webpcsvfile=instancepath+'/webring-participants.csv'
     submission_file_abs_path = os.path.abspath('submissions/submission.csv')
+    crawled_file_abs_path = os.path.abspath('crawler/onion_crawler.csv')
 
     if not os.path.exists(instancepath):
         print_colors(f"{rootpath}",is_error=True, bold=True)
@@ -181,11 +182,12 @@ Managing Wordlists:
 Maintenance:
     9) Remove the duplicate URLs for your own instance
     10) Perform sanity checks on all csv files for all instances (to mark them as sensitive / or remove the ones that are blacklisted)
-    11) Review submissions (Add to verified.csv/ add to unverified.csv/ delete /blacklist)
+    11) Review submissions (Add to verified.csv / add to unverified.csv / delete / blacklist)
+    12) Review crawled websites (Add to verified.csv / add to unverified.csv / delete / blacklist)
 
     0) Exit
         """)
-        option = input("Select an option? (0-11): ").strip()
+        option = input("Select an option? (0-12): ").strip()
         try:
             option = int(option)
         except ValueError:
@@ -1202,6 +1204,7 @@ Maintenance:
                 break
 
             case 11:
+                # review the submitted websites
                 try:
                     submission_df = pd.read_csv(submission_file_abs_path)
                     verified_csv_df = pd.read_csv(verifiedcsvfile)
@@ -1215,6 +1218,7 @@ Maintenance:
                         print_colors("\n1) Move entry to verified.csv \n2) Move entry from submission.csv to unverified.csv \n3) Delete from submission.csv file \n4) Add to blacklist.csv \n-1) exit")
                         if link in blacklisted_words:
                             print_colors("Black listed entry found", bold=True)
+                            # TODO: delete the entry, as it is already blacklisted
                             continue
                         else:
                             name = row['name']
@@ -1277,7 +1281,127 @@ Maintenance:
                     break
 
                 finally:
-                    print_colors("End of file")
+                    print_colors("No more submissions to review, exiting.")
+
+
+            case 12:
+                # review the crawled websites
+                try:
+                    print(crawled_file_abs_path)
+                    crawled_df = pd.read_csv(crawled_file_abs_path)
+                    verified_csv_df = pd.read_csv(verifiedcsvfile)
+                    unverified_csv_df = pd.read_csv(unverifiedcsvfile)
+                    blacklist_df = pd.read_csv(blcsvfile)
+                    blacklisted_words = [word for word in blacklist_df['blacklisted-words']]
+                    for i, row in crawled_df.iterrows():
+                        link = row['URL']
+                        print('\n',row[['URL','Category','Name']])
+                        print('\nLink to verify: ',link)
+                        print_colors("\n1) Move entry to verified.csv \n2) Move entry from onion_crawler.csv to unverified.csv \n3) Delete from onion_crawler.csv file \n4) Add to blacklist.csv \n-1) exit")
+                        if link in blacklisted_words:
+                            print_colors("Black listed entry found", bold=True)
+                            # the entry is already blacklisted, drop it from the crawled file
+                            crawled_df.drop(index=i,inplace=True)
+                            crawled_df.to_csv(crawled_file_abs_path, index=False)
+                            continue
+                        else:
+                            name = row['Name']
+                            category = row['Category']
+                            #desc = row['esc']
+                            desc = ''
+                            #sensi = "YES" if row['sensitive'] == 'y' else "NO"
+                            sensi = ''
+                            number = int(input("Enter an option: "))
+                            if number == 1:
+                                # Add to verified.csv
+                                # ask the name if invalid
+                                while(IsNameValid(name) is not True):
+                                    name = input("What is the name of the website? ")
+
+                                # ask the category
+                                while((IsCategoryValid(category) != True) or (category == 'Tor Hidden Service')):
+                                    category = input("What is the website Category? (ex: Indexes) ")
+                                # ask the sensitivity
+                                choice=input("Is the website sensitive ? (ex: related to drugs) (y/n) ")
+                                if choice == "n":
+                                    sensi = 'NO'
+                                else:
+                                    sensi = 'YES'
+
+                                # ask if its sensitive or not
+                                # ask the user to write a description
+                                newrow=[instance,category,name,link,sensi,desc,'YES','100']
+                                verified_csv_df.loc[-1] = newrow # adding a row
+                                verified_csv_df.index = verified_csv_df.index + 1 # shifting index
+                                verified_csv_df = verified_csv_df.sort_index() # sorting by index
+                                verified_csv_df = verified_csv_df.sort_values(by=["Category","Score"], ascending=[True,False]) # sorting categories
+                                print_colors("[+] New row added! now writing the csv file")
+                                verified_csv_df.to_csv(verifiedcsvfile, index=False)
+                                crawled_df.drop(index=i,inplace=True)
+                                crawled_df.to_csv(crawled_file_abs_path, index=False)
+                            elif number == 2:
+                                # Add to unverified.csv
+                                # consider it as sensitive by default and category must just be 'crawled'
+                                # ask the name if invalid
+                                while(IsNameValid(name) is not True):
+                                    name = input("What is the name of the website? ")
+                                # ask the category
+                                print('CATEGORY = ', category)
+                                while((IsCategoryValid(category) != True) or (category == 'Tor Hidden Service')):
+                                    category = input("What is the website Category? (ex: Indexes) ")
+                                choice=input("Is the website sensitive ? (ex: related to drugs) (y/n) ")
+                                if choice == "n":
+                                    sensi = 'NO'
+                                else:
+                                    sensi = 'YES'
+                                # ask for the category, if empty then the category is 'crawled'
+                                # add new row
+                                newrow=[instance,category,name,link,sensi,desc,'YES','100']
+
+                                unverified_csv_df.loc[-1] = newrow # adding a row
+                                unverified_csv_df.index = unverified_csv_df.index + 1 # shifting index
+                                unverified_csv_df = unverified_csv_df.sort_index() # sorting by index
+                                unverified_csv_df = unverified_csv_df.sort_values(by=["Category","Score"], ascending=[True,False]) # sorting categories
+                                print_colors("[+] New row added! now writing the csv file")
+                                unverified_csv_df.to_csv(unverifiedcsvfile, index=False)
+                                crawled_df.drop(index=i,inplace=True)
+                                crawled_df.to_csv(crawled_file_abs_path, index=False)
+
+                            elif number == 3:
+                                # Delete from onion_crawler.csv
+                                crawled_df.drop(index=i,inplace=True)
+                                crawled_df.to_csv(crawled_file_abs_path, index=False)
+
+                            elif number == 4:
+                                # Add to blacklist.csv
+                                newrow=[link]
+
+                                blacklist_df.loc[-1] = newrow # adding a row
+                                blacklist_df.index = blacklist_df.index + 1 # shifting index
+                                blacklist_df = blacklist_df.sort_index() # sorting by index
+                                print_colors("[+] New row added! now writing the csv file")
+                                blacklist_df.to_csv(blcsvfile, index=False)
+                                crawled_df.drop(index=i,inplace=True)
+                                crawled_df.to_csv(crawled_file_abs_path, index=False)
+
+                            elif number == -1:
+                                break
+
+                            else:
+                                print_colors("Invalid Number",is_error=True)
+                                continue
+
+
+
+
+
+
+                except Exception as e:
+                    print_colors(f'Try again {e}',is_error=True)
+                    break
+
+                finally:
+                    print_colors("No more crawled websites to review, exiting.")
                 break
 
             case 0:
diff --git a/submissions/README.md b/submissions/README.md
old mode 100644
new mode 100755