Mirror of http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/darknet-lantern.git, synced 2025-05-16 04:06:59 +00:00
finish option12 : listing crawled links
parent 6de26c5fa5
commit 212e44c3a5
3 changed files with 127 additions and 3 deletions
BIN
SimpleX/__pycache__/regex_simplexlinks.cpython-311.pyc
Normal file
Binary file not shown.
@@ -130,6 +130,7 @@ def main():
    secsvfile=instancepath+'/sensitive.csv'
    webpcsvfile=instancepath+'/webring-participants.csv'
    submission_file_abs_path = os.path.abspath('submissions/submission.csv')
    crawled_file_abs_path = os.path.abspath('crawler/onion_crawler.csv')

    if not os.path.exists(instancepath):
        print_colors(f"{rootpath}",is_error=True, bold=True)
@@ -182,10 +183,11 @@ Maintenance:
9) Remove the duplicate URLs for your own instance
10) Perform sanity checks on all csv files for all instances (to mark them as sensitive / or remove the ones that are blacklisted)
11) Review submissions (Add to verified.csv /add to unverified.csv /delete /blacklist)
12) Review crawled websites (Add to verified.csv /add to unverified.csv /delete /blacklist)

0) Exit
""")
option = input("Select an option? (0-11): ").strip()
option = input("Select an option? (0-12): ").strip()
try:
    option = int(option)
except ValueError:
@@ -1202,6 +1204,7 @@ Maintenance:
        break

case 11:
    #review the submitted websites:
    try:
        submission_df = pd.read_csv(submission_file_abs_path)
        verified_csv_df = pd.read_csv(verifiedcsvfile)
@@ -1215,6 +1218,7 @@ Maintenance:
print_colors("\n1) Move entry to verified.csv \n2) Move entry from submission.csv to unverified.csv \n3) Delete from submission.csv file \n4) Add to blacklist.csv \n-1) exit")
if link in blacklisted_words:
    print_colors("Black listed entry found", bold=True)
    #TODO delete the entry as its already blacklisted
    continue
else:
    name = row['name']
@@ -1277,7 +1281,127 @@ Maintenance:
                break

    finally:
        print_colors("End of file")
        print_colors("No more submissions to review, exiting.")


case 12:
    # review the crawled websites
    try:
        print(crawled_file_abs_path)
        crawled_df = pd.read_csv(crawled_file_abs_path)
        verified_csv_df = pd.read_csv(verifiedcsvfile)
        unverified_csv_df = pd.read_csv(unverifiedcsvfile)
        blacklist_df = pd.read_csv(blcsvfile)
        blacklisted_words = [word for word in blacklist_df['blacklisted-words']]
        for i, row in crawled_df.iterrows():
            link = row['URL']
            print('\n',row[['URL','Category','Name']])
            print('\nLink to verify: ',link)
            print_colors("\n1) Move entry to verified.csv \n2) Move entry to unverified.csv \n3) Delete from the crawled list \n4) Add to blacklist.csv \n-1) exit")
            if link in blacklisted_words:
                print_colors("Black listed entry found", bold=True)
                #TODO delete the entry as its already blacklisted
                crawled_df.drop(index=i,inplace=True)
                crawled_df.to_csv(crawled_file_abs_path, index=False)
                continue
            else:
                name = row['Name']
                category = row['Category']
                #desc = row['desc']
                desc = ''
                #sensi = "YES" if row['sensitive'] == 'y' else "NO"
                sensi = ''
                number = int(input("Enter an option: "))
                if number == 1:
                    # Add to verified.csv
                    # ask the name if invalid
                    while(IsNameValid(name) is not True):
                        name = input("What is the name of the website? ")

                    # ask the category
                    while((IsCategoryValid(category) != True) or (category == 'Tor Hidden Service')):
                        category = input("What is the website Category? (ex: Indexes) ")
                    # ask the sensitivity
                    choice=input("Is the website sensitive ? (ex: related to drugs) (y/n) ")
                    if choice == "n":
                        sensi = 'NO'
                    else:
                        sensi = 'YES'

                    # ask if its sensitive or not
                    # ask the user to write a description
                    newrow=[instance,category,name,link,sensi,desc,'YES','100']
                    verified_csv_df.loc[-1] = newrow # adding a row
                    verified_csv_df.index = verified_csv_df.index + 1 # shifting index
                    verified_csv_df = verified_csv_df.sort_index() # sorting by index
                    verified_csv_df = verified_csv_df.sort_values(by=["Category","Score"], ascending=[True,False]) # sorting categories
                    print_colors("[+] New row added! now writing the csv file")
                    verified_csv_df.to_csv(verifiedcsvfile, index=False)
                    crawled_df.drop(index=i,inplace=True)
                    crawled_df.to_csv(crawled_file_abs_path, index=False)
                elif number == 2:
                    # Add to unverified.csv
                    # consider it as sensitive by default and category must just be 'crawled'
                    # ask the name if invalid
                    while(IsNameValid(name) is not True):
                        name = input("What is the name of the website? ")
                    # ask the category
                    print('CATEGORY = ', category)
                    while((IsCategoryValid(category) != True) or (category == 'Tor Hidden Service')):
                        category = input("What is the website Category? (ex: Indexes) ")
                    choice=input("Is the website sensitive ? (ex: related to drugs) (y/n) ")
                    if choice == "n":
                        sensi = 'NO'
                    else:
                        sensi = 'YES'
                    # ask for the category, if empty then the category is 'crawled'
                    # add new row
                    newrow=[instance,category,name,link,sensi,desc,'YES','100']

                    unverified_csv_df.loc[-1] = newrow # adding a row
                    unverified_csv_df.index = unverified_csv_df.index + 1 # shifting index
                    unverified_csv_df = unverified_csv_df.sort_index() # sorting by index
                    unverified_csv_df = unverified_csv_df.sort_values(by=["Category","Score"], ascending=[True,False]) # sorting categories
                    print_colors("[+] New row added! now writing the csv file")
                    unverified_csv_df.to_csv(unverifiedcsvfile, index=False)
                    crawled_df.drop(index=i,inplace=True)
                    crawled_df.to_csv(crawled_file_abs_path, index=False)

                elif number == 3:
                    # Delete from crawled_onion.csv
                    crawled_df.drop(index=i,inplace=True)
                    crawled_df.to_csv(crawled_file_abs_path, index=False)

                elif number == 4:
                    # Add to blacklist.csv
                    newrow=[link]

                    blacklist_df.loc[-1] = newrow # adding a row
                    blacklist_df.index = blacklist_df.index + 1 # shifting index
                    blacklist_df = blacklist_df.sort_index() # sorting by index
                    print_colors("[+] New row added! now writing the csv file")
                    blacklist_df.to_csv(blcsvfile, index=False)
                    crawled_df.drop(index=i,inplace=True)
                    crawled_df.to_csv(crawled_file_abs_path, index=False)

                elif number == -1:
                    break

                else:
                    print_colors("Invalid Number",is_error=True)
                    continue

    except Exception as e:
        print_colors(f'Try again {e}',is_error=True)
        break

    finally:
        print_colors("No more crawled websites to review, exiting.")

    break
case 0:
0
submissions/README.md
Normal file → Executable file
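
Editor's note: the case 12 block repeats the same append-row-then-resort sequence for verified.csv, unverified.csv and blacklist.csv. The sketch below is not part of this commit; it only illustrates how that sequence could be factored into one helper. The helper name append_and_sort is hypothetical, pd.concat is used in place of the loc[-1]/index-shift/sort_index idiom from the diff, and the column order [instance, category, name, link, sensi, desc, 'YES', '100'] sorted by Category and Score is taken from the diff itself.

# Hypothetical helper, not in the commit: append one row to a lantern CSV
# and keep it sorted the same way the case-12 code does.
import pandas as pd

def append_and_sort(csv_path, newrow):
    """Append `newrow` to the CSV at csv_path, then sort by Category (asc) and Score (desc)."""
    df = pd.read_csv(csv_path)
    # pd.concat replaces the loc[-1] / index-shift / sort_index idiom used in the diff
    df = pd.concat([df, pd.DataFrame([newrow], columns=df.columns)], ignore_index=True)
    df = df.sort_values(by=["Category", "Score"], ascending=[True, False])
    df.to_csv(csv_path, index=False)

# Example call with the row layout used by the diff (names assumed from context):
# append_and_sort(verifiedcsvfile, [instance, category, name, link, sensi, desc, 'YES', '100'])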