finish option 12: listing crawled links

root 2025-05-03 22:54:04 +02:00
parent 6de26c5fa5
commit 212e44c3a5
3 changed files with 127 additions and 3 deletions



@@ -130,6 +130,7 @@ def main():
secsvfile=instancepath+'/sensitive.csv'
webpcsvfile=instancepath+'/webring-participants.csv'
submission_file_abs_path = os.path.abspath('submissions/submission.csv')
crawled_file_abs_path = os.path.abspath('crawler/onion_crawler.csv')
if not os.path.exists(instancepath):
print_colors(f"{rootpath}",is_error=True, bold=True)
@@ -182,10 +183,11 @@ Maintenance:
9) Remove the duplicate URLs for your own instance
10) Perform sanity checks on all csv files for all instances (to mark them as sensitive / or remove the ones that are blacklisted)
11) Review submissions (Add to verified.csv /add to unverified.csv /delete /blacklist)
12) Review crawled websites (Add to verified.csv /add to unverified.csv /delete /blacklist)
0) Exit
""")
option = input("Select an option? (0-11): ").strip()
option = input("Select an option? (0-12): ").strip()
try:
option = int(option)
except ValueError:
@@ -1202,6 +1204,7 @@ Maintenance:
break
case 11:
#review the submitted websites:
try:
submission_df = pd.read_csv(submission_file_abs_path)
verified_csv_df = pd.read_csv(verifiedcsvfile)
@@ -1215,6 +1218,7 @@ Maintenance:
print_colors("\n1) Move entry to verified.csv \n2) Move entry from submission.csv to unverified.csv \n3) Delete from submission.csv file \n4) Add to blacklist.csv \n-1) exit")
if link in blacklisted_words:
print_colors("Black listed entry found", bold=True)
#TODO delete the entry as its already blacklisted
continue
else:
name = row['name']
@@ -1277,7 +1281,127 @@ Maintenance:
break
finally:
print_colors("End of file")
print_colors("No more submissions to review, exiting.")
case 12:
# review the crawled websites
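# iterate over every onion_crawler.csv entry and let the operator move it to verified.csv, unverified.csv or blacklist.csv, or drop it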
try:
print(crawled_file_abs_path)
crawled_df = pd.read_csv(crawled_file_abs_path)
verified_csv_df = pd.read_csv(verifiedcsvfile)
unverified_csv_df = pd.read_csv(unverifiedcsvfile)
blacklist_df = pd.read_csv(blcsvfile)
blacklisted_words = [word for word in blacklist_df['blacklisted-words']]
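# plain Python list of blacklisted words, used below for a membership check on each crawled URL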
for i, row in crawled_df.iterrows():
link = row['URL']
print('\n',row[['URL','Category','Name']])
print('\nLink to verify: ',link)
print_colors("\n1) Move entry to verified.csv \n2) Move entry from submission.csv to unverified.csv \n3) Delete from submission.csv file \n4) Add to blacklist.csv \n-1) exit")
if link in blacklisted_words:
print_colors("Black listed entry found", bold=True)
# entry is already blacklisted: drop it from the crawled file
crawled_df.drop(index=i,inplace=True)
crawled_df.to_csv(crawled_file_abs_path, index=False)
continue
else:
name = row['Name']
category = row['Category']
#desc = row['esc']
desc = ''
#sensi = "YES" if row['sensitive'] == 'y' else "NO"
sensi = ''
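# note: a non-numeric answer below raises ValueError, which the outer except block catches, aborting the review loop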
number = int(input("Enter an option: "))
if number == 1:
# Add to verified.csv
# ask the name if invalid
while(IsNameValid(name) is not True):
name = input("What is the name of the website? ")
# ask the category
while((IsCategoryValid(category) != True) or (category == 'Tor Hidden Service')):
category = input("What is the website Category? (ex: Indexes) ")
# ask the sensitivity
choice=input("Is the website sensitive ? (ex: related to drugs) (y/n) ")
if choice == "n":
sensi = 'NO'
else:
sensi = 'YES'
# description is left empty since the crawler does not collect one
newrow=[instance,category,name,link,sensi,desc,'YES','100']
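# prepend the new row: loc[-1] adds it with index -1, shifting and re-sorting the index moves it to the top before the final category/score sort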
verified_csv_df.loc[-1] = newrow # adding a row
verified_csv_df.index = verified_csv_df.index + 1 # shifting index
verified_csv_df = verified_csv_df.sort_index() # sorting by index
verified_csv_df = verified_csv_df.sort_values(by=["Category","Score"], ascending=[True,False]) # sorting categories
print_colors("[+] New row added! now writing the csv file")
verified_csv_df.to_csv(verifiedcsvfile, index=False)
crawled_df.drop(index=i,inplace=True)
crawled_df.to_csv(crawled_file_abs_path, index=False)
elif number == 2:
# Add to unverified.csv
# same prompts as option 1, but the entry is written to unverified.csv instead
# ask the name if invalid
while(IsNameValid(name) is not True):
name = input("What is the name of the website? ")
# ask the category
print('CATEGORY = ', category)
while((IsCategoryValid(category) != True) or (category == 'Tor Hidden Service')):
category = input("What is the website Category? (ex: Indexes) ")
choice=input("Is the website sensitive ? (ex: related to drugs) (y/n) ")
if choice == "n":
sensi = 'NO'
else:
sensi = 'YES'
# add the new row to unverified.csv
newrow=[instance,category,name,link,sensi,desc,'YES','100']
unverified_csv_df.loc[-1] = newrow # adding a row
unverified_csv_df.index = unverified_csv_df.index + 1 # shifting index
unverified_csv_df = unverified_csv_df.sort_index() # sorting by index
unverified_csv_df = unverified_csv_df.sort_values(by=["Category","Score"], ascending=[True,False]) # sorting categories
print_colors("[+] New row added! now writing the csv file")
unverified_csv_df.to_csv(unverifiedcsvfile, index=False)
crawled_df.drop(index=i,inplace=True)
crawled_df.to_csv(crawled_file_abs_path, index=False)
elif number == 3:
# Delete from onion_crawler.csv
crawled_df.drop(index=i,inplace=True)
crawled_df.to_csv(crawled_file_abs_path, index=False)
elif number == 4:
# Add to blacklist.csv
newrow=[link]
blacklist_df.loc[-1] = newrow # adding a row
blacklist_df.index = blacklist_df.index + 1 # shifting index
blacklist_df = blacklist_df.sort_index() # sorting by index
print_colors("[+] New row added! now writing the csv file")
blacklist_df.to_csv(blcsvfile, index=False)
crawled_df.drop(index=i,inplace=True)
crawled_df.to_csv(crawled_file_abs_path, index=False)
elif number == -1:
break
else:
print_colors("Invalid Number",is_error=True)
continue
except Exception as e:
print_colors(f'Try again {e}',is_error=True)
break
finally:
print_colors("No more crawled websites to review, exiting.")
break
case 0:

submissions/README.md Normal file → Executable file