From b07ac08547e3aad8d22c9a0f1090bb26fc4f5d1f Mon Sep 17 00:00:00 2001 From: doctor_dev Date: Fri, 30 May 2025 12:12:09 +0000 Subject: [PATCH 1/9] Refactored option 4 + added conf.py + added some TODO comments for review --- .gitignore | 2 +- requirements.txt | 18 +- scripts/conf.py | 22 +++ scripts/lantern.py | 283 +++++--------------------------- scripts/logic/lantern_logic.py | 96 +++++++++++ scripts/utils.py | 289 +++++++++++++++++++++++++++++---- 6 files changed, 429 insertions(+), 281 deletions(-) create mode 100644 scripts/conf.py create mode 100644 scripts/logic/lantern_logic.py diff --git a/.gitignore b/.gitignore index fc36635..8c43f1e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ .git www/participants/** crawler/** -scripts/__pycache__/** +__pycache__/ .env env/ submissions/submission.csv diff --git a/requirements.txt b/requirements.txt index a887ce5..95f9083 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,17 @@ beautifulsoup4==4.13.3 -certifi==2024.12.14 -charset-normalizer==3.4.1 +certifi==2025.4.26 +charset-normalizer==3.4.2 +dotenv==0.9.9 idna==3.10 -numpy==2.2.2 +numpy==2.2.6 pandas==2.2.3 +pillow==11.2.1 +PySocks==1.7.1 python-dateutil==2.9.0.post0 -python-socks==2.6.1 -pytz==2024.2 +python-dotenv==1.1.0 +pytz==2025.2 requests==2.32.3 six==1.17.0 -tzdata==2025.1 -urllib3==2.3.0 -python-dotenv==1.0.1 +tzdata==2025.2 +urllib3==2.4.0 websockets==15.0.1 diff --git a/scripts/conf.py b/scripts/conf.py new file mode 100644 index 0000000..ed6b5eb --- /dev/null +++ b/scripts/conf.py @@ -0,0 +1,22 @@ +ROOT_PATH = '/srv/darknet-lantern/' +STATIC_PATH = ROOT_PATH + 'www/' +TEMPLATE_PATH = ROOT_PATH + 'templates/' + +PARTICIPANT_DIR = STATIC_PATH + 'participants/' +OFFICIAL_PARTICIPANTS_FILE = STATIC_PATH + '.official_participants' +WEBRING_CSV_FILE = 'webring-participants.csv' + +LOCAL_DIR = '' # Assign on script startup + +PROXIES = { + 'http': 'socks5h://127.0.0.1:9050', + 'https': 'socks5h://127.0.0.1:9050' +} + +CSV_FILES = [ + 'verified.csv', + 'unverified.csv', + 'blacklist.csv', + 'sensitive.csv', + 'webring-participants.csv' +] \ No newline at end of file diff --git a/scripts/lantern.py b/scripts/lantern.py index 6408f03..8985bcd 100644 --- a/scripts/lantern.py +++ b/scripts/lantern.py @@ -1,6 +1,9 @@ +###TODO: importing * is bad practice should import just utils and use it like in lantern_logic.py from utils import * +import logic.lantern_logic as lantern from dotenv import load_dotenv + import os, pwd import pandas as pd import requests @@ -532,259 +535,59 @@ Maintenance: ##################################################### - - #check if it works when you have a second webring participant case 4: - print_colors("4) Synchronize new links from existing webring participants, into your unverified.csv file") - participantsdir=rootpath+'www/participants/' - name='' - desc='' - trusted='' - status='' - score='' - webringcsvfile=instancepath+'/'+'webring-participants.csv' - wdf = pd.read_csv(webringcsvfile, on_bad_lines='skip') - for participant in os.listdir(participantsdir): - participantdir=participantsdir+participant + print_colors("4) Synchronize new links from new or existing webring participants, into your local csv files") - # NOTE check if the webring participant is yourself, if it is, then skip it - if participant != myinstance: # prod: dont use your own intance - #if participant == myinstance: # preprod testing only on your own instance - #overwrite the existing files in the participant's directory, with their version (download all the csv files from them again) - basewurl='http://'+participant+'/participants/'+participant+'/' - print_colors(f"{basewurl}") - print_colors(f"[+] Downloading the files of: {participant} ") - w_vcsv=basewurl+'verified.csv' - w_uvcsv=basewurl+'unverified.csv' - w_blcsv=basewurl+'blacklist.csv' - w_scsv=basewurl+'sensitive.csv' - w_webcsv=basewurl+'webring-participants.csv' + print_colors('[+] Syncing official webrings to local webrings') - # verify that their verified.csv csv file exists at basewurl+'verified.csv' - if CheckUrl(w_vcsv) is False or CheckUrl(w_uvcsv) is False or CheckUrl(w_blcsv) is False or CheckUrl(w_scsv) is False or CheckUrl(w_webcsv) is False: - print_colors("[-] Webring Participant isn't reachable, skipping", is_error=True) - else: #if the webring participant is reachable, proceed - print_colors("[+] Webring Participant is reachable, updating their csv files:") - for i in ['verified.csv','unverified.csv','blacklist.csv','sensitive.csv','webring-participants.csv']: - # FOR EACH CSV FILE TO GET: - # URL: basewurl / FILE.CSV - # PATH: participantdir / FILE.CSV - # download the external csv file and save it into the "text" variable: - #response = urllib.request.urlopen(basewurl+i) - response = requests.get(basewurl+i, proxies=proxies) - #data = response.read() # a `bytes` object - #text = data.decode('utf-8') - text = response.text - # save the text variable into the destination file: - csvfilepath=participantdir+'/'+i - with open(csvfilepath, "w") as file: - file.write(text) - f = open(csvfilepath,"r") + webring_df = verify_official_participants_registered() - # download the banner.png image: + current_instance = get_current_instance() + + for participant in webring_df.itertuples(index=False, name='columns'): + # Check if the participant is my instance + if current_instance in participant: + continue - bannerurl=basewurl+'banner.png' - bannerpath=participantdir+'/banner.png' - r = requests.get(bannerurl, stream=True, proxies=proxies) - with open(bannerpath, 'wb') as f: - r.raw.decode_content = True - shutil.copyfileobj(r.raw, f) + if not is_participant_reachable(participant.URL): + print_colors("[-] Webring {participant.URL} isn't reachable, skipping", is_error=True) + continue + + print_colors('[+] Downloading participant\'s files to store locally') + lantern.download_participant_data(participant.URL) - # SANITY CHECK ON THE BANNER PNG IMAGE: - if IsBannerValid(bannerpath): - pass - else: - # if false, overwrite it with the template banner png file - os.remove(bannerpath) - # copy templates/banner.png to bannerpath - bannertemplatepath=templatepath+'banner.png' - shutil.copyfile(bannertemplatepath, bannerpath) + print_colors('[+] Reading local blacklist and sensitive words') + local_blacklist, local_sensitive = get_local_blacklist_and_sensitive() + print_colors('[+] Reading local verified and unverified') + local_verified_df, local_unverified_df = get_local_verified_and_unverified() + + participant_url = generate_local_participant_dir(participant.URL) - # check if the participant is already listed in webring-participants.csv or not, and add them if not already listed - # and display only the matching entries in unverified.csv in an array format (display it in CLI). - filter_wdf = wdf[wdf.URL.str.contains(participant,na=False)] - # check if there are no results, dont proceed if there are none! - if filter_wdf.size == 0: #skip if webring participant is already listed, otherwise proceed - newrow=[name,participant,desc,trusted,status,score] - wdf.loc[-1] = newrow # adding a row - wdf.index = wdf.index + 1 # shifting index - wdf = wdf.sort_index() # sorting by index - wdf.to_csv(webringcsvfile, index=False) - else: - pass + print_colors('[+] Reading webrring participant\'s verified and unverified, and removing unverified and blacklisted rows') + participant_verified_df = lantern.clean_csv(pd.read_csv(f'{participant_url}verified.csv'), local_blacklist) + participant_unverified_df = lantern.clean_csv(pd.read_csv(f'{participant_url}unverified.csv'), local_blacklist) - # iterate through the participant's verified.csv and unverified.csv files - for w in ['verified.csv','unverified.csv']: - csvfilepath=participantdir+'/'+w - print_colors(f"{csvfilepath}") - csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip') + print_colors('[+] Marking sensitive rows') + participant_verified_df = lantern.mark_sensitive(participant_verified_df, local_sensitive) + participant_unverified_df = lantern.mark_sensitive(participant_unverified_df, local_sensitive) + + if participant.Trusted == 'YES': + print_colors('[+] This participant is trusted, copying participant\'s verified to local verified') + local_verified_df = merge_verification_df(local_verified_df, participant_verified_df) + + else: + print_colors('[+] This participant is not trusted, copying participant\'s verified to local unverified') + local_unverified_df = merge_verification_df(local_unverified_df, participant_verified_df) + + print_colors('[+] Copying participant\'s unverified to local unverified') + local_unverified_df = merge_verification_df(local_unverified_df, participant_unverified_df) - print("[+] Removing the participant's duplicate entries... ") - # REMOVE DUPLICATES !!! do not accept any duplicate from remote participants - csvdf = csvdf.drop_duplicates(subset=['URL'], keep="first", inplace=False) - csvdf = csvdf.drop_duplicates(subset=['Name'], keep="first", inplace=False) - csvdf.to_csv(csvfilepath, index=False) - - csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip') - - bldf[['blacklisted-words']].iterrows() - rows2delete= [] # it is an empty list at first - for i,j in csvdf.iterrows(): - row=csvdf.loc[i,:].values.tolist() - # check the number of columns in said row, - # print('rowcolnum:',len(row),' colnum:',len(csvdf.columns)) - # print_colors(f"{row}") - - - - - ################################ SANITY CHECKS #################################### - ### SANITY CHECK 0: make sure that ✔️ and x are replaced with YES/NO, as it changed since v1.0.1 ### - if csvdf.at[i, 'Status'] == "✔️" or csvdf.at[i, 'Status'] == "YES" : - csvdf.at[i, 'Status'] = "YES" - csvdf.to_csv(csvfilepath, index=False) - else: - csvdf.at[i, 'Status'] = "NO" - csvdf.to_csv(csvfilepath, index=False) - - if csvdf.at[i, 'Sensitive'] == "✔️" or csvdf.at[i, 'Sensitive'] == "YES" : - csvdf.at[i, 'Sensitive'] = "YES" - csvdf.to_csv(csvfilepath, index=False) - else: - csvdf.at[i, 'Sensitive'] = "NO" - csvdf.to_csv(csvfilepath, index=False) - - ### SANITY CHECK 1: Mark all the rows that have incorrect formatting for deletion### - if IsUrlValid(csvdf.at[i, 'Instance']) is False or IsCategoryValid(csvdf.at[i, 'Category']) is False or IsNameValid(csvdf.at[i, 'Name']) is False or IsUrlValid(csvdf.at[i, 'URL']) is False or IsStatusValid(csvdf.at[i, 'Sensitive']) is False or IsDescriptionValid(csvdf.at[i, 'Description']) is False or IsStatusValid(csvdf.at[i, 'Status']) is False or IsScoreValid(csvdf.at[i, 'Score']) is False: - #mark the row for deletion as it has invalid inputs - if i not in rows2delete: - print_colors(f"Marking row {i} for deletion, as it has invalid inputs") - print(row) - rows2delete.append(i) #mark the row for deletion if not already done - - ### SANITY CHECK 2: Mark all rows that are not allowed (blacklist) for deletion ### - for k,l in bldf.iterrows(): - blword=bldf.at[k, 'blacklisted-words'] - if any(blword in str(x) for x in row) == True: - if i not in rows2delete: - print_colors(f"Marking row {i} for deletion, as it matches with a blacklisted word") - rows2delete.append(i) #mark the row for deletion if not already done - else: - if i not in rows2delete: - # not a blacklisted link, therefore it is suitable to be added to your own csv files: - ################################ CHECKING FOR DUPLICATES! ######################### - # for each link in the participant's verified/unverified csv files, - # check if the link is already listed in your own verified.csv or unverified.csv - filterterm=csvdf.at[i, 'URL'] - #print('1)',filterterm) - filter_vdf= vdf[vdf.URL.str.contains(filterterm,na=False)] - filter_vdf2= vdf[vdf.Name.str.contains(filterterm,na=False)] # do not accept the new link if the name already exists in verified.csv - #print('2)',filter_vdf) - #print('3)',uvdf[uvdf.URL.str.contains(filterterm,na=False)] ) - uvdf = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip') - # TODO DELETE ALL DUPLICATES OF UVDF ! - uvdf = uvdf.drop_duplicates(subset=['URL'], keep="first", inplace=False) - uvdf = uvdf.drop_duplicates(subset=['Name'], keep="first", inplace=False) - filter_uvdf= uvdf[uvdf.URL.str.contains(filterterm,na=False)] - filter_uvdf2= uvdf[uvdf.Name.str.contains(filterterm,na=False)] # do not accept the new link if the name already exists in unverified.csv - if len(filter_uvdf.index) == 0 and len(filter_vdf.index) == 0 and len(filter_uvdf2.index) == 0 and len(filter_vdf2.index) == 0 : - newrow=row - uvdf.loc[-1] = newrow # adding a row - uvdf.index = uvdf.index + 1 # shifting index - uvdf = uvdf.sort_index() # sorting by index - uvdf.to_csv(unverifiedcsvfile, index=False) - - print("[+] NEW ROW =",newrow) - print_colors("[+] New row added to your own unverified.csv file!") - else: - pass - #print_colors(f'[-] Skipping row as it is already added in {w} {row}',is_error=True) - - - - ###################### APPENDING TO YOUR OWN UNVERIFIED.CSV FILE################### - - - ### SANITY CHECK 3: Mark all the rows that are supposed to be sensitive ### - for k,l in sedf.iterrows(): - seword=sedf.at[k, 'sensitive-words'] - if any(seword in str(x) for x in row) == True: - if csvdf.at[i, 'Sensitive'] != 'NO': - print_colors(f"Marking row {i} as sensitive, as it matches with a sensitive word") - csvdf.at[i, 'Sensitive']='YES' - - #print_colors(f'[-] Rows to delete: {rows2delete}', is_error=True) - # only delete rows after you've gone through all the unverified.csv OR verified.csv rows' - # check for NAME duplicates and mark them for deletion: - # remove name duplicates that are in unverifie.csv yet exist in verified.csv (as verified.csv takes the priority) - if w == 'unverified.csv': - try: - # check if the given row Name already exists in verified.csv - filterterm=csvdf.at[i, 'Name'] - filter_vdf= vdf[vdf.Name.str.contains(filterterm,na=False)] - print('[+] CHECKING FOR DUPLIATES: ',filterterm) - if len(filter_vdf.index) != 0: - # drop the unverified.csv row if its name already exists in verified.csv - print('[+] DUPLICATE FOUND, MARKING ROW FOR DELETION: ',row) - rows2delete.append(i) #mark the row for deletion if not already done - except: - pass - - - for i in rows2delete: - row=csvdf.loc[i,:].values.tolist() - print_colors(f'[+] REMOVING ROW: {i}{row}') - csvdf.drop(i, inplace= True) - csvdf.to_csv(csvfilepath, index=False) - rows2delete= [] # it is an empty list at first - - # fill missing description in our unverified.csv that other participants verified.csv have filled - if w == 'verified.csv': - uvdf = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip') - # merge participant's verified.csv on our unverified.csv on URL - merged_df = uvdf.merge(csvdf[['URL', 'Description']], - on='URL', - how='left', - suffixes=('', '_participant')) - # filter empty description that has participant's description - no_descr_filter = ((merged_df['Description'].isna()) | (merged_df['Description'].str.strip() == '')) & \ - (~merged_df['Description_participant'].isna()) & (merged_df['Description_participant'].str.strip() != '') - no_descr_filter_count = no_descr_filter.sum() - # update our empty description if the participant has any filled description - if no_descr_filter_count > 0: - merged_df.loc[no_descr_filter, 'Description'] = merged_df.loc[no_descr_filter, 'Description_participant'] - # keep only original columns - uvdf_updated = merged_df[uvdf.columns] - uvdf_updated.to_csv(unverifiedcsvfile, index=False) - print(f'[+] Updated {no_descr_filter_count} empty description(s) in your unverified.csv found on partipant\'s {w}') - # remove all name duplicates from your own unverified.csv file: - for i,j in uvdf.iterrows(): - row=uvdf.loc[i,:].values.tolist() - # check if the given row Name already exists in verified.csv - filterterm=uvdf.at[i, 'Name'] - filter_vdf= vdf[vdf.Name.str.contains(filterterm,na=False)] - print('[+] CHECKING FOR DUPLIATES: ',filterterm) - if len(filter_vdf.index) != 0: - # drop the unverified.csv row if its name already exists in verified.csv - print('[+] DUPLICATE FOUND, MARKING ROW FOR DELETION: ',row) - rows2delete.append(i) #mark the row for deletion if not already done - for i in rows2delete: - row=uvdf.loc[i,:].values.tolist() - print_colors(f'[+] REMOVING ROW: {i}{row}') - uvdf.drop(i, inplace= True) - uvdf.to_csv(unverifiedcsvfile, index=False) - rows2delete= [] # it is an empty list at first + print_colors('[+] Saving local verified and unverified') + save_local_verified_and_unverified(local_verified_df, local_unverified_df) break - - - - - - - case 5: print_colors("[+] Add a new webring participant (and download their files into their directory (without trusting them yet!))") webring_participant_url = '' diff --git a/scripts/logic/lantern_logic.py b/scripts/logic/lantern_logic.py new file mode 100644 index 0000000..e0f732f --- /dev/null +++ b/scripts/logic/lantern_logic.py @@ -0,0 +1,96 @@ +import utils +import os +import conf +import requests + +def download_participant_data(participant): + """ + Downloads the participants csv files and banner + + Parameters: + participant (str): The url of the webring participant. + + Returns: + Boolean: True if all files downloaded, False if any of them failed + """ + + try: + utils.print_colors(f"[+] Downloading webring {participant} csv files and banner") + + local_participant_dir = utils.generate_local_participant_dir(participant) + + os.makedirs(local_participant_dir, exist_ok=True) + + for file_name in conf.CSV_FILES: + + csv_res = requests.get(f'{utils.generate_participant_url(participant)}{file_name}', proxies=conf.PROXIES, timeout=10) + + with open(f'{local_participant_dir}{file_name}', "w") as file: + file.write(csv_res.text) + + banner_res = requests.get(f'{utils.generate_participant_url(participant)}banner.png', stream=True, proxies=conf.PROXIES, timeout=10) + + banner_path = f'{local_participant_dir}banner.png' + + with open(banner_path, 'wb') as f: + f.write(banner_res.content) + + # SANITY CHECK ON THE BANNER PNG IMAGE: + if not utils.IsBannerValid(banner_path): + # if false, overwrite it with the template banner png file + os.remove(banner_path) + shutil.copyfile(f'{conf.TEMPLATE_PATH}banner.png', banner_path) + + utils.print_colors(f"[+] Downloaded webring {participant} csv files and banner") + + except Exception: + print_colors("[-] Downloading webring participant's files failed.", is_error=True) + +def clean_csv(df, blacklist): + """ + Cleans duplications and blacklisted rows + + Parameters: + df (dataframe): The dataframe we want to clean. + blacklist (list): The blacklisted words. + + Returns: + Dataframe: Cleaned dataframe. + """ + try: + if not df.empty: + df = utils.remove_duplications(df) + + df = df[~df.apply(lambda row: any(word in str(value) for word in blacklist for value in row), axis=1)] + + if not df.empty: + df = df[df.apply(utils.is_row_valid, axis=1)] + + except Exception: + print_colors("[-] cleaning dataframe failed", is_error=True) + + return df + +def mark_sensitive(df, sensitive_list): + """ + Marks rows as sensitive + + Parameters: + df (dataframe): The dataframe we want to mark. + sensitive (list): The sensitive words. + + Returns: + Dataframe: Marked dataframe. + """ + + try: + if not df.empty: + sensitive_rows = df.apply(lambda row: any(word in str(value) for word in sensitive_list for value in row), axis=1) + + df.loc[sensitive_rows, 'Sensitive'] = 'YES' + df.loc[~sensitive_rows, 'Sensitive'] = 'NO' + + except Exception: + print_colors("[-] MArking sensitive words failed.", is_error=True) + + return df \ No newline at end of file diff --git a/scripts/utils.py b/scripts/utils.py index 190daf5..513bd15 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -7,7 +7,8 @@ import json #from SimpleX.utils import IsUrlValid import urllib.parse from websockets.sync.client import connect - +import conf +import pandas as pd PURPLE = '\033[35;40m' BOLD_PURPLE = '\033[35;40;1m' @@ -16,8 +17,24 @@ BOLD_RED = '\033[31;40;1m' RESET = '\033[m' +def get_current_instance(): + """ + Checks if all URL files are actually reachable via Tor -#### Checking Functions to validate that links are legit #### + Returns: + str: the local instance onion url + """ + + #expanduser gives the current user directory + instance_file = os.path.expanduser("~") + '/.darknet_participant_url' + + with open(instance_file) as f: + return f.read().rstrip() + +#Set the local dir on script run +conf.LOCAL_DIR = conf.PARTICIPANT_DIR + get_current_instance() + '/' + +###################### Validations ###################### def CheckUrl(url): """ @@ -29,7 +46,7 @@ def CheckUrl(url): } try: status = requests.get(url,proxies=proxies, timeout=5).status_code - if status != 502: + if status == 200: return True else: return False @@ -38,6 +55,31 @@ def CheckUrl(url): except requests.exceptions.ReadTimeout as e: return False +###TODO: should replace checkUrl +# checks if all the webring participants are reachable +def is_participant_reachable(instance): + """ + Checks if all URL files are actually reachable via Tor + + Parameters: + instance (str): The participant onion address + + Returns: + Boolean: False if any file is unreachable, True if all are reachable + """ + + url = generate_participant_url(instance) + + # Checks all files on a webring participant , if all reached returns true + for file_name in conf.CSV_FILES: + try: + status = requests.get(f'{url}{file_name}',proxies=conf.PROXIES, timeout=10).status_code + if status != 200: + return False + except Exception: + return False + + return True #### PROTECTIONS AGAINST MALICIOUS CSV INPUTS #### def IsBannerValid(path: str) -> bool: @@ -59,7 +101,6 @@ def IsBannerValid(path: str) -> bool: return False return True - def IsOnionValid(url: str)-> bool: """ Checks if the domain(param) is a valid onion domain and return True else False. @@ -159,25 +200,6 @@ def IsUrlValid(url:str)->bool: return False return True - -#def IsUrlValid(url:str)->bool: -# """ -# Check if url is valid both dark net end clearnet. -# """ -# pattern = re.compile("^[A-Za-z0-9:/.-]+$") -# url = str(url) -# if len(url) < 4: -# return False -# if url.endswith('.onion'): -# return IsOnionValid(url) -# else: -# if not url.__contains__('.'): -# return False -# if pattern.fullmatch(url) is None: -# return False -# return True - - def IsStatusValid(status: str)-> bool: """ Checks if status contains only ['YES','NO']. Verbose only if False is returned @@ -191,7 +213,6 @@ def IsStatusValid(status: str)-> bool: return True - def IsScoreValid(score:str)->bool: """ Check the Score is only "^[0-9.,]+$" with 8 max chars. @@ -207,7 +228,6 @@ def IsScoreValid(score:str)->bool: return False return True - def IsDescriptionValid(desc:str)->bool: """ Check the categories are only [a-zA-Z0-9.' ] with 256 max chars. @@ -239,8 +259,6 @@ def IsCategoryValid(categories: list)-> bool: else: return True - - def IsSimpleXServerValid(url: str) -> bool: pattern = re.compile('[0-9A-Za-z-_]*') url = url.strip() @@ -274,8 +292,6 @@ def IsSimpleXServerValid(url: str) -> bool: # Any error will be a false return False - - def IsNameValid(name: str)->bool: """ Check the parameter name only contains [a-zA-Z0-9 ] and is 64 chars long. @@ -292,7 +308,6 @@ def IsNameValid(name: str)->bool: return False return True - def print_colors(s:str=' ', bold:bool=False, is_error:bool = False, default:bool=False): """ Helper function to print with colors @@ -308,8 +323,6 @@ def print_colors(s:str=' ', bold:bool=False, is_error:bool = False, default:bool else: print(f"{PURPLE}{s}{RESET}") - - def IsSimpleXOnionValid(url: str)-> bool: """ Checks if the domain(param) is a valid onion domain and return True else False. @@ -383,3 +396,215 @@ def send_server_checks(url:str) -> (): failed_response = response['resp'].get('testFailure') return (response, resp_type, failed_response) + +def is_row_valid(row): + """ + validates dataframe row to check if all field are valid + + Parameters: + row (dict): dataframe row + + Returns: + Boolean: True if row is valid, False if row isn't valid + """ + try: + return ( + IsUrlValid(row['Instance']) and + IsCategoryValid(row['Category']) and + IsNameValid(row['Name']) and + IsUrlValid(row['URL']) and + IsStatusValid(row['Sensitive']) and + IsDescriptionValid(row['Description']) and + IsStatusValid(row['Status']) and + IsScoreValid(row['Score']) + ) + + except Exception: + return False + +###################### General ###################### + +def merge_verification_df(receiving_df, merging_df): + """ + merges 2 dataframes of type verified or unverified (do not merge duplications by name or url) + + Parameters: + receiving_df (Dataframe): dataframe we want to receive the data + merging_df (Dataframe): dataframe we want to merge into the receiving dataframe + + Returns: + Dataframe: the combined dataframe will be returned + """ + try: + filtered_df = merging_df[~((merging_df['URL'].isin(receiving_df['URL'])) | merging_df['Name'].isin(receiving_df['Name']))] + + if filtered_df.empty: + return receiving_df + + elif receiving_df.empty: + return filtered_df + + else: + return pd.concat([receiving_df, filtered_df], ignore_index=True) + + except Exception: + return receiving_df + +def remove_duplications(df): + """ + remove url and name duplications from the dataframe + + Parameters: + df (Dataframe): the dataframe to remove duplications from + + Returns: + Dataframe: the dataframe after all duplications were removed + """ + try: + df = df.drop_duplicates(subset='Name') + df = df.drop_duplicates(subset='URL') + + except Exception: + pass + + return df + +###TODO: can later remove the inputs and have a "global" local verified and unverified or a class of the local(lantern host) participant +def save_local_verified_and_unverified(verified_df, unverified_df): + """ + saves the local verified and unverified + + Parameters: + verified_df (Dataframe): local verified rows dataframe + unverified_df (Dataframe): local unverified rows dataframe + + Returns: + Dataframe: the combined dataframe will be returned + """ + try: + current_instance = get_current_instance() + '/' + + verified_df.to_csv(f'{conf.PARTICIPANT_DIR}{current_instance}verified.csv', index=False) + + unverified_df.to_csv(f'{conf.PARTICIPANT_DIR}{current_instance}unverified.csv', index=False) + + return True + + except Exception: + print_colors('[-] Saving verified and unverified failed',is_error=True ) + return False + +###################### Getters/Generators ###################### +def generate_participant_url(participant): + """ + generates url of the webring participant + + Parameters: + participant(str): participant's onion address/instance + + Returns: + str: the url of the webring participant + """ + + return f'http://{participant}/participants/{participant}/' + +def generate_local_participant_dir(participant): + """ + generates local files path of the webring participant + + Parameters: + participant(str): participant's onion address/instance + + Returns: + str: the local path of the webring participant's files + """ + + return f'{conf.PARTICIPANT_DIR}{participant}/' + +def get_official_participants(): + """ + reads all the official webring participants + + Returns: + list: list of all the official webring participants + """ + + try: + current_instance = get_current_instance() + + with open(conf.OFFICIAL_PARTICIPANTS_FILE, 'r') as file: + return [line.strip() for line in file if current_instance not in line] + + except Exception: + print_colors('[-] Couldn\'t read official webring participants file',is_error=True ) + +def get_local_blacklist_and_sensitive(): + """ + reads the local blacklisted words and the local sensitive words + + Returns: + blacklist(list): list of all the words that are blacklisted + sensitive_list(list): list of all the words that are sensitive + """ + try: + current_instance = get_current_instance() + '/' + + blacklist_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}blacklist.csv') + blacklist = blacklist_df.iloc[:, 0].tolist() + + sensitive_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}sensitive.csv') + sensitive_list = sensitive_df.iloc[:, 0].tolist() + + return blacklist, sensitive_list + + except Exception: + print_colors('[-] Failed reading the blacklist and sensitive words file',is_error=True ) + return [], [] + +def get_local_verified_and_unverified(): + """ + reads the local verified csv and the local unverified csv + + Returns: + verified_df(Dataframe): verified.csv as dataframe + unverified_df(Dataframe): unverified.csv as dataframe + """ + + try: + current_instance = get_current_instance() + '/' + + verified_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}verified.csv') + + unverified_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}unverified.csv') + + return verified_df, unverified_df + + except Exception: + print_colors('[-] Failed reading the verified and unverified files',is_error=True ) + return pd.DataFrame(), pd.DataFrame() + +def get_local_webring_participants(): + """ + make sure the official participants are registered in the webring csv file + + Returns: + Dataframe: the verified local webring participants dataframe + """ + + try: + webring_df = pd.read_csv(conf.LOCAL_DIR + conf.WEBRING_CSV_FILE) + + # finds any missing official webrings in the local webring file + missing_participants = set(get_official_participants()) - set(webring_df['URL']) + + for participant in missing_participants: + new_row = [{'Name': '','URL': participant,'Description': '','Trusted': 'NO','Status': '','Score': ''}] + webring_df = pd.concat([webring_df, pd.DataFrame(new_row)], ignore_index=True) + + webring_df.to_csv(conf.LOCAL_DIR + conf.WEBRING_CSV_FILE, index=False) + + return webring_df + + except Exception: + print_colors(f'[-] failed reading webring participants file',is_error=True ) + return pd.DataFrame() From 7e6b75ec9d218fd51d0977a733e84f7003274c76 Mon Sep 17 00:00:00 2001 From: doctor_dev Date: Fri, 30 May 2025 15:30:01 +0000 Subject: [PATCH 2/9] Started refactoring 9 --- scripts/lantern.py | 2 ++ scripts/utils.py | 90 ++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 81 insertions(+), 11 deletions(-) diff --git a/scripts/lantern.py b/scripts/lantern.py index 8985bcd..befcd5f 100644 --- a/scripts/lantern.py +++ b/scripts/lantern.py @@ -1002,6 +1002,7 @@ Maintenance: case 9: print_colors("[+] 9) Cleaning up all duplicates in your own unverified + verified.csv (based on the url)") + for w in ['verified.csv', 'unverified.csv']: csvfilepath = os.path.join(instancepath, w) print_colors(f"Processing file: {csvfilepath}") @@ -1022,6 +1023,7 @@ Maintenance: case 10: print_colors("[+] 10) perform sanity checks on all csv files (to mark them as sensitive / or remove the ones that are blacklisted)") + participantspath = rootpath+'www/participants/' for participant in os.listdir(participantspath): print_colors(f"Participant: {participant}") diff --git a/scripts/utils.py b/scripts/utils.py index 513bd15..f31340e 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -469,6 +469,23 @@ def remove_duplications(df): return df +def remove_cross_dataframe_replications(main_df, sub_df): + try: + + main_df = remove_duplications(main_df) + sub_df = remove_duplications(sub_df) + + mask = sub_df['URL'].isin(main_fd['URL']) | df_a['Name'].isin(df_b['Name']) + + sub_df = sub_df[~mask] + + return sub_df + + except: + pass + + return main_df, sub_df + ###TODO: can later remove the inputs and have a "global" local verified and unverified or a class of the local(lantern host) participant def save_local_verified_and_unverified(verified_df, unverified_df): """ @@ -521,6 +538,39 @@ def generate_local_participant_dir(participant): return f'{conf.PARTICIPANT_DIR}{participant}/' +def get_participant_local_verified_and_unverified(participant): + """ + reads the local verified csv and the local unverified csv of a participant + + Parameters: + participant (str): participant's onion address/instance + + Returns: + verified_df(Dataframe): verified.csv as dataframe + unverified_df(Dataframe): unverified.csv as dataframe + """ + + try: + current_instance = get_current_instance() + '/' + try: + verified_df = pd.read_csv(f'{participant}verified.csv') + + except FileNotFoundError: + print_colors("[-] File not found: verified.csv", is_error=True) + + try: + unverified_df = pd.read_csv(f'{participant}unverified.csv') + + except FileNotFoundError: + print_colors("[-] Participant File not found: unverified.csv", is_error=True) + + return verified_df, unverified_df + + except Exception: + print_colors('[-] Failed reading the verified and unverified files',is_error=True) + + return pd.DataFrame(), pd.DataFrame() + def get_official_participants(): """ reads all the official webring participants @@ -548,22 +598,32 @@ def get_local_blacklist_and_sensitive(): """ try: current_instance = get_current_instance() + '/' + try: + blacklist_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}blacklist.csv') + blacklist = blacklist_df.iloc[:, 0].tolist() - blacklist_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}blacklist.csv') - blacklist = blacklist_df.iloc[:, 0].tolist() + except FileNotFoundError: + print_colors("[-] File not found: blacklist.csv", is_error=True) + + try: + sensitive_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}sensitive.csv') + sensitive_list = sensitive_df.iloc[:, 0].tolist() + + except FileNotFoundError: + print_colors("[-] File not found: sensitive.csv", is_error=True) - sensitive_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}sensitive.csv') - sensitive_list = sensitive_df.iloc[:, 0].tolist() return blacklist, sensitive_list + except Exception: - print_colors('[-] Failed reading the blacklist and sensitive words file',is_error=True ) - return [], [] + print_colors('[-] Failed reading the blacklist and sensitive words file',is_error=True) + + return [], [] def get_local_verified_and_unverified(): """ - reads the local verified csv and the local unverified csv + reads the local verified csv and the local unverified csv of the instance Returns: verified_df(Dataframe): verified.csv as dataframe @@ -572,16 +632,24 @@ def get_local_verified_and_unverified(): try: current_instance = get_current_instance() + '/' + try: + verified_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}verified.csv') + + except FileNotFoundError: + print_colors("[-] File not found: verified.csv", is_error=True) - verified_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}verified.csv') + try: + unverified_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}unverified.csv') - unverified_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}unverified.csv') + except FileNotFoundError: + print_colors("[-] File not found: unverified.csv", is_error=True) return verified_df, unverified_df except Exception: - print_colors('[-] Failed reading the verified and unverified files',is_error=True ) - return pd.DataFrame(), pd.DataFrame() + print_colors('[-] Failed reading the verified and unverified files',is_error=True) + + return pd.DataFrame(), pd.DataFrame() def get_local_webring_participants(): """ From 6a70e12646d04f8817488846bbf5d18c0d49685a Mon Sep 17 00:00:00 2001 From: doctor_dev Date: Fri, 30 May 2025 15:30:38 +0000 Subject: [PATCH 3/9] Started refactoring 9 - again --- scripts/lantern.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/lantern.py b/scripts/lantern.py index befcd5f..5e2fae6 100644 --- a/scripts/lantern.py +++ b/scripts/lantern.py @@ -1003,6 +1003,10 @@ Maintenance: case 9: print_colors("[+] 9) Cleaning up all duplicates in your own unverified + verified.csv (based on the url)") + verified_df, unverified_df = utils.get_local_verified_and_unverified() + + + for w in ['verified.csv', 'unverified.csv']: csvfilepath = os.path.join(instancepath, w) print_colors(f"Processing file: {csvfilepath}") @@ -1014,8 +1018,6 @@ Maintenance: #print_colors(f"{csvdf[['URL']]}") csvdf.to_csv(csvfilepath, index=False) print_colors(f"Cleaned data:\n{csvdf[['URL']]}") - except FileNotFoundError: - print_colors(f"File not found: {csvfilepath}") except Exception as e: print_colors(f"An error occurred while processing {csvfilepath}: {e}") break @@ -1023,7 +1025,7 @@ Maintenance: case 10: print_colors("[+] 10) perform sanity checks on all csv files (to mark them as sensitive / or remove the ones that are blacklisted)") - + participantspath = rootpath+'www/participants/' for participant in os.listdir(participantspath): print_colors(f"Participant: {participant}") From 4a17f880c65fd1e35d5e244b27ea7f8746de8ce8 Mon Sep 17 00:00:00 2001 From: doctor_dev Date: Fri, 30 May 2025 16:19:22 +0000 Subject: [PATCH 4/9] fixed commenting out the RecognizedURL --- scripts/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/utils.py b/scripts/utils.py index 26b12eb..904b75d 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -112,7 +112,7 @@ def IsURLValid(url: str) -> bool: """ Checks if given URL is valid (RecognizeURLType recognizes it) """ - return #RecognizeURLType(url) != 'invalid' + return RecognizeURLType(url) != 'invalid' def CheckUrl(url): From c4ebef10a48681607a9445d39fa4028108c59bf1 Mon Sep 17 00:00:00 2001 From: doctor_dev Date: Fri, 30 May 2025 17:59:12 +0000 Subject: [PATCH 5/9] finished option 9 ready for test --- .gitignore | 1 + scripts/conf.py | 2 + scripts/lantern.py | 110 ++++++++++++++++++++++----------------------- scripts/utils.py | 10 ++--- 4 files changed, 61 insertions(+), 62 deletions(-) diff --git a/.gitignore b/.gitignore index 8c43f1e..868c262 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ __pycache__/ env/ submissions/submission.csv venv/ +local_testing/* diff --git a/scripts/conf.py b/scripts/conf.py index 3c2728e..9b79107 100644 --- a/scripts/conf.py +++ b/scripts/conf.py @@ -1,3 +1,5 @@ +import re + ROOT_PATH = '/srv/darknet-lantern/' STATIC_PATH = ROOT_PATH + 'www/' TEMPLATE_PATH = ROOT_PATH + 'templates/' diff --git a/scripts/lantern.py b/scripts/lantern.py index 2bd1454..7a2109d 100644 --- a/scripts/lantern.py +++ b/scripts/lantern.py @@ -533,55 +533,60 @@ Maintenance: case 4: print_colors("4) Synchronize new links from new or existing webring participants, into your local csv files") - print_colors('[+] Syncing official webrings to local webrings') + try: - webring_df = verify_official_participants_registered() - - current_instance = get_current_instance() + print_colors('[+] Syncing official webrings to local webrings') - for participant in webring_df.itertuples(index=False, name='columns'): - # Check if the participant is my instance - if current_instance in participant: - continue + webring_df = verify_official_participants_registered() - if not is_participant_reachable(participant.URL): - print_colors("[-] Webring {participant.URL} isn't reachable, skipping", is_error=True) - continue + current_instance = get_current_instance() - print_colors('[+] Downloading participant\'s files to store locally') - lantern.download_participant_data(participant.URL) + for participant in webring_df.itertuples(index=False, name='columns'): + # Check if the participant is my instance + if current_instance in participant: + continue - print_colors('[+] Reading local blacklist and sensitive words') - local_blacklist, local_sensitive = get_local_blacklist_and_sensitive() - - print_colors('[+] Reading local verified and unverified') - local_verified_df, local_unverified_df = get_local_verified_and_unverified() - - participant_url = generate_local_participant_dir(participant.URL) - - print_colors('[+] Reading webrring participant\'s verified and unverified, and removing unverified and blacklisted rows') - participant_verified_df = lantern.clean_csv(pd.read_csv(f'{participant_url}verified.csv'), local_blacklist) - participant_unverified_df = lantern.clean_csv(pd.read_csv(f'{participant_url}unverified.csv'), local_blacklist) - - print_colors('[+] Marking sensitive rows') - participant_verified_df = lantern.mark_sensitive(participant_verified_df, local_sensitive) - participant_unverified_df = lantern.mark_sensitive(participant_unverified_df, local_sensitive) + if not is_participant_reachable(participant.URL): + print_colors("[-] Webring {participant.URL} isn't reachable, skipping", is_error=True) + continue - if participant.Trusted == 'YES': - print_colors('[+] This participant is trusted, copying participant\'s verified to local verified') - local_verified_df = merge_verification_df(local_verified_df, participant_verified_df) - - else: - print_colors('[+] This participant is not trusted, copying participant\'s verified to local unverified') - local_unverified_df = merge_verification_df(local_unverified_df, participant_verified_df) - - print_colors('[+] Copying participant\'s unverified to local unverified') - local_unverified_df = merge_verification_df(local_unverified_df, participant_unverified_df) + print_colors('[+] Downloading participant\'s files to store locally') + lantern.download_participant_data(participant.URL) - print_colors('[+] Saving local verified and unverified') - save_local_verified_and_unverified(local_verified_df, local_unverified_df) + print_colors('[+] Reading local blacklist and sensitive words') + local_blacklist, local_sensitive = get_local_blacklist_and_sensitive() - break + print_colors('[+] Reading local verified and unverified') + local_verified_df, local_unverified_df = get_local_verified_and_unverified() + + participant_url = generate_local_participant_dir(participant.URL) + + print_colors('[+] Reading webrring participant\'s verified and unverified, and removing unverified and blacklisted rows') + participant_verified_df = lantern.clean_csv(pd.read_csv(f'{participant_url}verified.csv'), local_blacklist) + participant_unverified_df = lantern.clean_csv(pd.read_csv(f'{participant_url}unverified.csv'), local_blacklist) + + print_colors('[+] Marking sensitive rows') + participant_verified_df = lantern.mark_sensitive(participant_verified_df, local_sensitive) + participant_unverified_df = lantern.mark_sensitive(participant_unverified_df, local_sensitive) + + if participant.Trusted == 'YES': + print_colors('[+] This participant is trusted, copying participant\'s verified to local verified') + local_verified_df = merge_verification_df(local_verified_df, participant_verified_df) + + else: + print_colors('[+] This participant is not trusted, copying participant\'s verified to local unverified') + local_unverified_df = merge_verification_df(local_unverified_df, participant_verified_df) + + print_colors('[+] Copying participant\'s unverified to local unverified') + local_unverified_df = merge_verification_df(local_unverified_df, participant_unverified_df) + + print_colors('[+] Saving local verified and unverified') + save_local_verified_and_unverified(local_verified_df, local_unverified_df) + + break + + except Exception: + print_colors("[-] Option 4 failed suddently, please try again", is_error=True) case 5: print_colors("[+] Add a new webring participant (and download their files into their directory (without trusting them yet!))") @@ -997,26 +1002,17 @@ Maintenance: case 9: print_colors("[+] 9) Cleaning up all duplicates in your own unverified + verified.csv (based on the url)") + + try: - verified_df, unverified_df = utils.get_local_verified_and_unverified() + verified_df, unverified_df = utils.get_local_verified_and_unverified() + verified_df, unverified_df = remove_cross_dataframe_replications(verified_df, unverified_df) + save_local_verified_and_unverified(verified_df, unverified_df) - for w in ['verified.csv', 'unverified.csv']: - csvfilepath = os.path.join(instancepath, w) - print_colors(f"Processing file: {csvfilepath}") - try: - csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip') - print_colors(f"Removing duplicates in {csvfilepath}") - #print_colors(f"{csvdf[['URL']]}") - csvdf = csvdf.drop_duplicates(subset=['URL'], keep="first", inplace=False) - #print_colors(f"{csvdf[['URL']]}") - csvdf.to_csv(csvfilepath, index=False) - print_colors(f"Cleaned data:\n{csvdf[['URL']]}") - except Exception as e: - print_colors(f"An error occurred while processing {csvfilepath}: {e}") - break - break + except Exception: + print_colors("[-] Option 9 failed suddently, please try again", is_error=True) case 10: print_colors("[+] 10) perform sanity checks on all csv files (to mark them as sensitive / or remove the ones that are blacklisted)") diff --git a/scripts/utils.py b/scripts/utils.py index e33482d..9c0580c 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -334,7 +334,7 @@ def remove_duplications(df): df = df.drop_duplicates(subset='URL') except Exception: - pass + print_colors('[-] Removing duplication failed',is_error=True) return df @@ -348,10 +348,8 @@ def remove_cross_dataframe_replications(main_df, sub_df): sub_df = sub_df[~mask] - return sub_df - except: - pass + print_colors('[-] Removing cross dataframe duplications failed',is_error=True) return main_df, sub_df @@ -374,10 +372,12 @@ def save_local_verified_and_unverified(verified_df, unverified_df): unverified_df.to_csv(f'{conf.PARTICIPANT_DIR}{current_instance}unverified.csv', index=False) + print_colors('[+] Verified and unverified saved successfully') + return True except Exception: - print_colors('[-] Saving verified and unverified failed',is_error=True ) + print_colors('[-] Saving verified and unverified failed',is_error=True) return False ###################### Getters/Generators ###################### From 1b67f7a2184640e3ba0844b32dacf98c2f2c8c67 Mon Sep 17 00:00:00 2001 From: doctor_dev Date: Fri, 30 May 2025 18:20:52 +0000 Subject: [PATCH 6/9] starting work on option 10 --- scripts/lantern.py | 20 +++++++++++++++----- scripts/logic/lantern_logic.py | 8 ++++---- scripts/utils.py | 26 ++++++++++++++------------ 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/scripts/lantern.py b/scripts/lantern.py index 7a2109d..1f6a556 100644 --- a/scripts/lantern.py +++ b/scripts/lantern.py @@ -583,10 +583,11 @@ Maintenance: print_colors('[+] Saving local verified and unverified') save_local_verified_and_unverified(local_verified_df, local_unverified_df) - break - - except Exception: + except Exception as err: print_colors("[-] Option 4 failed suddently, please try again", is_error=True) + raise err + + break case 5: print_colors("[+] Add a new webring participant (and download their files into their directory (without trusting them yet!))") @@ -1005,18 +1006,27 @@ Maintenance: try: - verified_df, unverified_df = utils.get_local_verified_and_unverified() + verified_df, unverified_df = get_local_verified_and_unverified() verified_df, unverified_df = remove_cross_dataframe_replications(verified_df, unverified_df) save_local_verified_and_unverified(verified_df, unverified_df) - except Exception: + except Exception as err: print_colors("[-] Option 9 failed suddently, please try again", is_error=True) + break + case 10: print_colors("[+] 10) perform sanity checks on all csv files (to mark them as sensitive / or remove the ones that are blacklisted)") + + + print_colors('[+] Reading local blacklist and sensitive words') + local_blacklist, local_sensitive = get_local_blacklist_and_sensitive() + + + participantspath = rootpath+'www/participants/' for participant in os.listdir(participantspath): print_colors(f"Participant: {participant}") diff --git a/scripts/logic/lantern_logic.py b/scripts/logic/lantern_logic.py index e0f732f..61590fb 100644 --- a/scripts/logic/lantern_logic.py +++ b/scripts/logic/lantern_logic.py @@ -43,7 +43,7 @@ def download_participant_data(participant): utils.print_colors(f"[+] Downloaded webring {participant} csv files and banner") - except Exception: + except Exception as err: print_colors("[-] Downloading webring participant's files failed.", is_error=True) def clean_csv(df, blacklist): @@ -66,7 +66,7 @@ def clean_csv(df, blacklist): if not df.empty: df = df[df.apply(utils.is_row_valid, axis=1)] - except Exception: + except Exception as err: print_colors("[-] cleaning dataframe failed", is_error=True) return df @@ -90,7 +90,7 @@ def mark_sensitive(df, sensitive_list): df.loc[sensitive_rows, 'Sensitive'] = 'YES' df.loc[~sensitive_rows, 'Sensitive'] = 'NO' - except Exception: + except Exception as err: print_colors("[-] MArking sensitive words failed.", is_error=True) - + return df \ No newline at end of file diff --git a/scripts/utils.py b/scripts/utils.py index 9c0580c..0411822 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -152,7 +152,7 @@ def is_participant_reachable(instance): status = requests.get(f'{url}{file_name}',proxies=conf.PROXIES, timeout=10).status_code if status != 200: return False - except Exception: + except Exception as err: return False return True @@ -288,7 +288,7 @@ def is_row_valid(row): IsScoreValid(row['Score']) ) - except Exception: + except Exception as err: return False ###################### General ###################### @@ -316,7 +316,7 @@ def merge_verification_df(receiving_df, merging_df): else: return pd.concat([receiving_df, filtered_df], ignore_index=True) - except Exception: + except Exception as err: return receiving_df def remove_duplications(df): @@ -333,7 +333,7 @@ def remove_duplications(df): df = df.drop_duplicates(subset='Name') df = df.drop_duplicates(subset='URL') - except Exception: + except Exception as err: print_colors('[-] Removing duplication failed',is_error=True) return df @@ -344,12 +344,14 @@ def remove_cross_dataframe_replications(main_df, sub_df): main_df = remove_duplications(main_df) sub_df = remove_duplications(sub_df) - mask = sub_df['URL'].isin(main_fd['URL']) | df_a['Name'].isin(df_b['Name']) + mask = sub_df['URL'].isin(main_df['URL']) | sub_df['Name'].isin(main_df['Name']) sub_df = sub_df[~mask] - except: + except Exception as err: print_colors('[-] Removing cross dataframe duplications failed',is_error=True) + raise err #REMOVE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + return main_df, sub_df @@ -376,7 +378,7 @@ def save_local_verified_and_unverified(verified_df, unverified_df): return True - except Exception: + except Exception as err: print_colors('[-] Saving verified and unverified failed',is_error=True) return False @@ -435,7 +437,7 @@ def get_participant_local_verified_and_unverified(participant): return verified_df, unverified_df - except Exception: + except Exception as err: print_colors('[-] Failed reading the verified and unverified files',is_error=True) return pd.DataFrame(), pd.DataFrame() @@ -454,7 +456,7 @@ def get_official_participants(): with open(conf.OFFICIAL_PARTICIPANTS_FILE, 'r') as file: return [line.strip() for line in file if current_instance not in line] - except Exception: + except Exception as err: print_colors('[-] Couldn\'t read official webring participants file',is_error=True ) def get_local_blacklist_and_sensitive(): @@ -485,7 +487,7 @@ def get_local_blacklist_and_sensitive(): return blacklist, sensitive_list - except Exception: + except Exception as err: print_colors('[-] Failed reading the blacklist and sensitive words file',is_error=True) return [], [] @@ -515,7 +517,7 @@ def get_local_verified_and_unverified(): return verified_df, unverified_df - except Exception: + except Exception as err: print_colors('[-] Failed reading the verified and unverified files',is_error=True) return pd.DataFrame(), pd.DataFrame() @@ -542,7 +544,7 @@ def get_local_webring_participants(): return webring_df - except Exception: + except Exception as err: print_colors(f'[-] failed reading webring participants file',is_error=True ) return pd.DataFrame() From 19e582203b515be271aa8cc169611987551139df Mon Sep 17 00:00:00 2001 From: doctor_dev Date: Fri, 30 May 2025 18:21:40 +0000 Subject: [PATCH 7/9] starting work on option 10 - again --- scripts/lantern.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lantern.py b/scripts/lantern.py index 1f6a556..db178b8 100644 --- a/scripts/lantern.py +++ b/scripts/lantern.py @@ -537,7 +537,7 @@ Maintenance: print_colors('[+] Syncing official webrings to local webrings') - webring_df = verify_official_participants_registered() + webring_df = get_local_webring_participants() current_instance = get_current_instance() From 2a827c0b8b7b336c6b3c93578470306f1ec7e490 Mon Sep 17 00:00:00 2001 From: doctor_dev Date: Fri, 30 May 2025 19:38:04 +0000 Subject: [PATCH 8/9] Finished refactoring options 9 and 10 --- scripts/lantern.py | 103 +++++++++++++-------------------------------- scripts/utils.py | 78 ++++++++++++++++++++++++---------- 2 files changed, 84 insertions(+), 97 deletions(-) diff --git a/scripts/lantern.py b/scripts/lantern.py index db178b8..701d834 100644 --- a/scripts/lantern.py +++ b/scripts/lantern.py @@ -561,9 +561,12 @@ Maintenance: participant_url = generate_local_participant_dir(participant.URL) - print_colors('[+] Reading webrring participant\'s verified and unverified, and removing unverified and blacklisted rows') - participant_verified_df = lantern.clean_csv(pd.read_csv(f'{participant_url}verified.csv'), local_blacklist) - participant_unverified_df = lantern.clean_csv(pd.read_csv(f'{participant_url}unverified.csv'), local_blacklist) + print_colors('[+] Reading webrring participant\'s verified and unverified') + participant_verified_df, participant_unverified_df = get_participant_local_verified_and_unverified(participant_url) + + print_colors('[+] Removing unvalidated and blacklisted rows') + participant_verified_df = lantern.clean_csv(participant_verified_df, local_blacklist) + participant_unverified_df = lantern.clean_csv(participant_unverified_df, local_blacklist) print_colors('[+] Marking sensitive rows') participant_verified_df = lantern.mark_sensitive(participant_verified_df, local_sensitive) @@ -585,7 +588,6 @@ Maintenance: except Exception as err: print_colors("[-] Option 4 failed suddently, please try again", is_error=True) - raise err break @@ -1005,11 +1007,14 @@ Maintenance: print_colors("[+] 9) Cleaning up all duplicates in your own unverified + verified.csv (based on the url)") try: - + + print_colors('[+] Reading local verified and unverified') verified_df, unverified_df = get_local_verified_and_unverified() + print_colors('[+] Removing cross dataframe replications') verified_df, unverified_df = remove_cross_dataframe_replications(verified_df, unverified_df) + print_colors('[+] Saving local verified and unverified') save_local_verified_and_unverified(verified_df, unverified_df) except Exception as err: @@ -1020,80 +1025,30 @@ Maintenance: case 10: print_colors("[+] 10) perform sanity checks on all csv files (to mark them as sensitive / or remove the ones that are blacklisted)") + try: + print_colors('[+] Reading local blacklist and sensitive words') + local_blacklist, local_sensitive = get_local_blacklist_and_sensitive() + for participant in os.listdir(conf.PARTICIPANT_DIR): + participant_local_dir = conf.PARTICIPANT_DIR + participant + '/' - print_colors('[+] Reading local blacklist and sensitive words') - local_blacklist, local_sensitive = get_local_blacklist_and_sensitive() + print_colors('[+] Reading webrring participant\'s verified and unverified') + participant_verified_df, participant_unverified_df = get_participant_local_verified_and_unverified(participant_local_dir) + print_colors('[+] Removing unverified and blacklisted rows') + participant_verified_df = lantern.clean_csv(participant_verified_df, local_blacklist) + participant_unverified_df = lantern.clean_csv(participant_unverified_df, local_blacklist) + + print_colors('[+] Marking sensitive rows') + participant_verified_df = lantern.mark_sensitive(participant_verified_df, local_sensitive) + participant_unverified_df = lantern.mark_sensitive(participant_unverified_df, local_sensitive) + print_colors('[+] Saving local participant verified and unverified') + save_local_participant_verified_and_unverified(participant_verified_df, participant_unverified_df, participant_local_dir) + + except Exception as err: + print_colors("[-] Option 10 failed suddently, please try again", is_error=True) - participantspath = rootpath+'www/participants/' - for participant in os.listdir(participantspath): - print_colors(f"Participant: {participant}") - #read=input("Continue?") - participantdir= participantspath+participant - ################ BEGIN SANITY CHECKS FOR EACH PARTICIPANTS ############## - # iterate through the participant's verified.csv and unverified.csv files - for w in ['verified.csv','unverified.csv']: - csvfilepath=participantdir+'/'+w - print_colors(f"{csvfilepath}") - csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip') - rows2delete= [] # it is an empty list at first - for i,j in csvdf.iterrows(): - row=csvdf.loc[i,:].values.tolist() - #print_colors(f"{row}") - - - - ################################ SANITY CHECKS #################################### - ### SANITY CHECK 0: make sure that ✔️ and x are replaced with YES/NO, as it changed since v1.0.1 ### - if csvdf.at[i, 'Status'] == "✔️" or csvdf.at[i, 'Status'] == "YES" : - csvdf.at[i, 'Status'] = "YES" - csvdf.to_csv(csvfilepath, index=False) - else: - csvdf.at[i, 'Status'] = "NO" - csvdf.to_csv(csvfilepath, index=False) - - if csvdf.at[i, 'Sensitive'] == "✔️" or csvdf.at[i, 'Sensitive'] == "YES" : - csvdf.at[i, 'Sensitive'] = "YES" - csvdf.to_csv(csvfilepath, index=False) - else: - csvdf.at[i, 'Sensitive'] = "NO" - csvdf.to_csv(csvfilepath, index=False) - - ### SANITY CHECK 1: Mark all the rows that have incorrect formatting for deletion### - if IsURLValid(csvdf.at[i, 'Instance']) is False or IsCategoryValid(csvdf.at[i, 'Category']) is False or IsNameValid(csvdf.at[i, 'Name']) is False or IsURLValid(csvdf.at[i, 'URL']) is False or IsStatusValid(csvdf.at[i, 'Sensitive']) is False or IsDescriptionValid(csvdf.at[i, 'Description']) is False or IsStatusValid(csvdf.at[i, 'Status']) is False or IsScoreValid(csvdf.at[i, 'Score']) is False: - if i not in rows2delete: - print_colors(f"Marking row {i} for deletion, as it has invalid inputs") - #print_colors(f"{row}") - print(IsURLValid(csvdf.at[i, 'Instance']), IsCategoryValid(csvdf.at[i, 'Category']), IsNameValid(csvdf.at[i, 'Name']), IsURLValid(csvdf.at[i, 'URL']), IsStatusValid(csvdf.at[i, 'Sensitive']), IsDescriptionValid(csvdf.at[i, 'Description']), IsStatusValid(csvdf.at[i, 'Status']), IsScoreValid(csvdf.at[i, 'Score'])) - rows2delete.append(i) - read=input("Continue?") - - ### SANITY CHECK 2: Mark all rows that are not allowed (blacklist) for deletion ### - for k,l in bldf.iterrows(): - blword=bldf.at[k, 'blacklisted-words'] - if any(blword in str(x) for x in row) == True: - if i not in rows2delete: - print_colors(f"Marking row {i} for deletion, as it matches with the blacklisted word {blword}") - rows2delete.append(i) - #read=input("Continue?") - ### SANITY CHECK 3: Mark all rows that match sensitive words to be sensitive = YES - for k,l in sedf.iterrows(): - seword=sedf.at[k, 'sensitive-words'] - if any(seword in str(x) for x in row) == True: - print_colors(f"Marking row {i} as sensitive, as it matches with the sensitive word {seword}") - csvdf.at[i, 'Sensitive']="YES" - csvdf.to_csv(csvfilepath, index=False) - #read=input("Continue?") - - - for i in rows2delete: - row=csvdf.loc[i,:].values.tolist() - print_colors(f'[+] REMOVING ROW : {i} {row}') - csvdf.drop(i, inplace= True) - csvdf.to_csv(csvfilepath, index=False) - #read=input("Continue?") break case 11: diff --git a/scripts/utils.py b/scripts/utils.py index 0411822..c15c57e 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -60,7 +60,7 @@ def IsXFTPServerValid(url: str) -> bool: Returns True if URL is a valid SimpleX XFTP Server URL False otherwise """ - return conf.RecognizeSimplexType(url) == 'xftp' + return RecognizeSimplexType(url) == 'xftp' # stub function def IsSMPServerValid(url: str) -> bool: @@ -68,7 +68,7 @@ def IsSMPServerValid(url: str) -> bool: Returns True if URL is a valid SimpleX SMP Server URL False otherwise """ - return conf.RecognizeSimplexType(url) == 'smp' + return RecognizeSimplexType(url) == 'smp' def IsClearnetLinkValid(url: str) -> bool: """ @@ -242,7 +242,7 @@ def IsNameValid(name: str) -> bool: Check the parameter name only contains [a-zA-Z0-9] and is 64 chars long. """ try: - return bool(VALID_NAME_PATTERN.fullmatch(name.strip())) + return bool(conf.VALID_NAME_PATTERN.fullmatch(name.strip())) except Exception: return False @@ -278,10 +278,10 @@ def is_row_valid(row): """ try: return ( - IsUrlValid(row['Instance']) and + IsURLValid(row['Instance']) and IsCategoryValid(row['Category']) and IsNameValid(row['Name']) and - IsUrlValid(row['URL']) and + IsURLValid(row['URL']) and IsStatusValid(row['Sensitive']) and IsDescriptionValid(row['Description']) and IsStatusValid(row['Status']) and @@ -339,6 +339,17 @@ def remove_duplications(df): return df def remove_cross_dataframe_replications(main_df, sub_df): + """ + remove replications from sub_df that exist in main_df + + Parameters: + main_df (Dataframe): the dataframe to keep replications + sub_df (Dataframe): the dataframe to remove replications + + Returns: + Dataframe: the main_df with removed duplications + Dataframe: the sub_df with removed duplications and removed replications + """ try: main_df = remove_duplications(main_df) @@ -350,7 +361,6 @@ def remove_cross_dataframe_replications(main_df, sub_df): except Exception as err: print_colors('[-] Removing cross dataframe duplications failed',is_error=True) - raise err #REMOVE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! return main_df, sub_df @@ -365,7 +375,7 @@ def save_local_verified_and_unverified(verified_df, unverified_df): unverified_df (Dataframe): local unverified rows dataframe Returns: - Dataframe: the combined dataframe will be returned + bool: True if successful, False if not """ try: current_instance = get_current_instance() + '/' @@ -382,6 +392,32 @@ def save_local_verified_and_unverified(verified_df, unverified_df): print_colors('[-] Saving verified and unverified failed',is_error=True) return False +def save_local_participant_verified_and_unverified(verified_df, unverified_df, participant): + """ + saves the local verified and unverified of a participant + + Parameters: + verified_df (Dataframe): local verified rows dataframe + unverified_df (Dataframe): local unverified rows dataframe + participant (str): participant's onion local path + + Returns: + bool: True if successful, False if not + """ + try: + + verified_df.to_csv(f'{participant}verified.csv', index=False) + + unverified_df.to_csv(f'{participant}unverified.csv', index=False) + + print_colors('[+] Verified and unverified saved successfully') + + return True + + except Exception as err: + print_colors('[-] Saving verified and unverified failed',is_error=True) + return False + ###################### Getters/Generators ###################### def generate_participant_url(participant): """ @@ -414,7 +450,7 @@ def get_participant_local_verified_and_unverified(participant): reads the local verified csv and the local unverified csv of a participant Parameters: - participant (str): participant's onion address/instance + participant (str): participant's local files path Returns: verified_df(Dataframe): verified.csv as dataframe @@ -422,25 +458,21 @@ def get_participant_local_verified_and_unverified(participant): """ try: - current_instance = get_current_instance() + '/' - try: - verified_df = pd.read_csv(f'{participant}verified.csv') - - except FileNotFoundError: - print_colors("[-] File not found: verified.csv", is_error=True) + verified_df = pd.read_csv(f'{participant}verified.csv') + + except FileNotFoundError: + print_colors("[-] File not found: verified.csv", is_error=True) + return pd.Dataframe(), pd.Dataframe() - try: - unverified_df = pd.read_csv(f'{participant}unverified.csv') + try: + unverified_df = pd.read_csv(f'{participant}unverified.csv') - except FileNotFoundError: - print_colors("[-] Participant File not found: unverified.csv", is_error=True) + except FileNotFoundError: + print_colors("[-] Participant File not found: unverified.csv", is_error=True) + return pd.Dataframe(), pd.Dataframe() - return verified_df, unverified_df + return verified_df, unverified_df - except Exception as err: - print_colors('[-] Failed reading the verified and unverified files',is_error=True) - - return pd.DataFrame(), pd.DataFrame() def get_official_participants(): """ From 8b0ba4833fcf456597da57deecf44e185e11664f Mon Sep 17 00:00:00 2001 From: doctor_dev Date: Fri, 30 May 2025 19:50:34 +0000 Subject: [PATCH 9/9] added dummy folder to git --- .gitignore | 1 + scripts/local_testing/.gitkeep | 0 2 files changed, 1 insertion(+) create mode 100644 scripts/local_testing/.gitkeep diff --git a/.gitignore b/.gitignore index 868c262..e1495ea 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ env/ submissions/submission.csv venv/ local_testing/* +!your_folder/.gitkeep diff --git a/scripts/local_testing/.gitkeep b/scripts/local_testing/.gitkeep new file mode 100644 index 0000000..e69de29