diff --git a/.gitignore b/.gitignore
index 8c43f1e..e1495ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,6 @@
 __pycache__/
 env/
 submissions/submission.csv
 venv/
+scripts/local_testing/*
+!scripts/local_testing/.gitkeep
diff --git a/scripts/lantern.py b/scripts/lantern.py
index 304afc9..701d834 100644
--- a/scripts/lantern.py
+++ b/scripts/lantern.py
@@ -533,53 +533,61 @@ Maintenance:
         case 4:
             print_colors("4) Synchronize new links from new or existing webring participants, into your local csv files")
-            print_colors('[+] Syncing official webrings to local webrings')
+            try:
-            webring_df = get_local_webring_participants()
-
-            current_instance = get_current_instance()
+                print_colors('[+] Syncing official webrings to local webrings')
-            for participant in webring_df.itertuples(index=False, name='columns'):
-                # Check if the participant is my instance
-                if current_instance in participant:
-                    continue
+                webring_df = get_local_webring_participants()
-                if not is_participant_reachable(participant.URL):
-                    print_colors("[-] Webring {participant.URL} isn't reachable, skipping", is_error=True)
-                    continue
+                current_instance = get_current_instance()
-                print_colors('[+] Downloading participant\'s files to store locally')
-                lantern.download_participant_data(participant.URL)
+                for participant in webring_df.itertuples(index=False, name='columns'):
+                    # Check if the participant is my instance
+                    if current_instance in participant:
+                        continue
-                print_colors('[+] Reading local blacklist and sensitive words')
-                local_blacklist, local_sensitive = get_local_blacklist_and_sensitive()
-
-                print_colors('[+] Reading local verified and unverified')
-                local_verified_df, local_unverified_df = get_local_verified_and_unverified()
-
-                participant_url = generate_local_participant_dir(participant.URL)
-
-                print_colors('[+] Reading webrring participant\'s verified and unverified, and removing unverified and blacklisted rows')
-                participant_verified_df = lantern.clean_csv(pd.read_csv(f'{participant_url}verified.csv'), local_blacklist)
-                participant_unverified_df = lantern.clean_csv(pd.read_csv(f'{participant_url}unverified.csv'), local_blacklist)
-
-                print_colors('[+] Marking sensitive rows')
-                participant_verified_df = lantern.mark_sensitive(participant_verified_df, local_sensitive)
-                participant_unverified_df = lantern.mark_sensitive(participant_unverified_df, local_sensitive)
+                    if not is_participant_reachable(participant.URL):
+                        print_colors(f"[-] Webring {participant.URL} isn't reachable, skipping", is_error=True)
+                        continue
-                if participant.Trusted == 'YES':
-                    print_colors('[+] This participant is trusted, copying participant\'s verified to local verified')
-                    local_verified_df = merge_verification_df(local_verified_df, participant_verified_df)
-
-                else:
-                    print_colors('[+] This participant is not trusted, copying participant\'s verified to local unverified')
-                    local_unverified_df = merge_verification_df(local_unverified_df, participant_verified_df)
-
-                print_colors('[+] Copying participant\'s unverified to local unverified')
-                local_unverified_df = merge_verification_df(local_unverified_df, participant_unverified_df)
+                    print_colors('[+] Downloading participant\'s files to store locally')
+                    lantern.download_participant_data(participant.URL)
-                print_colors('[+] Saving local verified and unverified')
-                save_local_verified_and_unverified(local_verified_df, local_unverified_df)
+                    print_colors('[+] Reading local blacklist and sensitive words')
+                    local_blacklist, local_sensitive = get_local_blacklist_and_sensitive()
+
+                    print_colors('[+] Reading local verified and unverified')
+                    local_verified_df, local_unverified_df = get_local_verified_and_unverified()
+
+                    participant_url = generate_local_participant_dir(participant.URL)
+
+                    print_colors('[+] Reading webring participant\'s verified and unverified')
+                    participant_verified_df, participant_unverified_df = get_participant_local_verified_and_unverified(participant_url)
+
+                    print_colors('[+] Removing invalid and blacklisted rows')
+                    participant_verified_df = lantern.clean_csv(participant_verified_df, local_blacklist)
+                    participant_unverified_df = lantern.clean_csv(participant_unverified_df, local_blacklist)
+
+                    print_colors('[+] Marking sensitive rows')
+                    participant_verified_df = lantern.mark_sensitive(participant_verified_df, local_sensitive)
+                    participant_unverified_df = lantern.mark_sensitive(participant_unverified_df, local_sensitive)
+
+                    if participant.Trusted == 'YES':
+                        print_colors('[+] This participant is trusted, copying participant\'s verified to local verified')
+                        local_verified_df = merge_verification_df(local_verified_df, participant_verified_df)
+
+                    else:
+                        print_colors('[+] This participant is not trusted, copying participant\'s verified to local unverified')
+                        local_unverified_df = merge_verification_df(local_unverified_df, participant_verified_df)
+
+                    print_colors('[+] Copying participant\'s unverified to local unverified')
+                    local_unverified_df = merge_verification_df(local_unverified_df, participant_unverified_df)
+
+                    print_colors('[+] Saving local verified and unverified')
+                    save_local_verified_and_unverified(local_verified_df, local_unverified_df)
+
+            except Exception as err:
+                print_colors("[-] Option 4 failed unexpectedly, please try again", is_error=True)
             break
@@ -997,93 +1005,50 @@ Maintenance:
         case 9:
             print_colors("[+] 9) Cleaning up all duplicates in your own unverified + verified.csv (based on the url)")
-            for w in ['verified.csv', 'unverified.csv']:
-                csvfilepath = os.path.join(instancepath, w)
-                print_colors(f"Processing file: {csvfilepath}")
-                try:
-                    csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
-                    print_colors(f"Removing duplicates in {csvfilepath}")
-                    #print_colors(f"{csvdf[['URL']]}")
-                    csvdf = csvdf.drop_duplicates(subset=['URL'], keep="first", inplace=False)
-                    #print_colors(f"{csvdf[['URL']]}")
-                    csvdf.to_csv(csvfilepath, index=False)
-                    print_colors(f"Cleaned data:\n{csvdf[['URL']]}")
-                except FileNotFoundError:
-                    print_colors(f"File not found: {csvfilepath}")
-                except Exception as e:
-                    print_colors(f"An error occurred while processing {csvfilepath}: {e}")
-            break
+
+            try:
+
+                print_colors('[+] Reading local verified and unverified')
+                verified_df, unverified_df = get_local_verified_and_unverified()
+
+                print_colors('[+] Removing cross dataframe replications')
+                verified_df, unverified_df = remove_cross_dataframe_replications(verified_df, unverified_df)
+
+                print_colors('[+] Saving local verified and unverified')
+                save_local_verified_and_unverified(verified_df, unverified_df)
+
+            except Exception as err:
+                print_colors("[-] Option 9 failed unexpectedly, please try again", is_error=True)
+            break
 
         case 10:
             print_colors("[+] 10) perform sanity checks on all csv files (to mark them as sensitive / or remove the ones that are blacklisted)")
-            participantspath = rootpath+'www/participants/'
-            for participant in os.listdir(participantspath):
-                print_colors(f"Participant: {participant}")
-                #read=input("Continue?")
-                participantdir= participantspath+participant
-                ################ BEGIN SANITY CHECKS FOR EACH PARTICIPANTS ##############
-                # iterate through the participant's verified.csv and unverified.csv files
-                for w in ['verified.csv','unverified.csv']:
-                    csvfilepath=participantdir+'/'+w
-                    print_colors(f"{csvfilepath}")
-                    csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
-                    rows2delete= [] # it is an empty list at first
-                    for i,j in csvdf.iterrows():
-                        row=csvdf.loc[i,:].values.tolist()
-                        #print_colors(f"{row}")
+            try:
+                print_colors('[+] Reading local blacklist and sensitive words')
+                local_blacklist, local_sensitive = get_local_blacklist_and_sensitive()
+                for participant in os.listdir(conf.PARTICIPANT_DIR):
+                    participant_local_dir = conf.PARTICIPANT_DIR + participant + '/'
-                        ################################ SANITY CHECKS ####################################
-                        ### SANITY CHECK 0: make sure that ✔️ and x are replaced with YES/NO, as it changed since v1.0.1 ###
-                        if csvdf.at[i, 'Status'] == "✔️" or csvdf.at[i, 'Status'] == "YES" :
-                            csvdf.at[i, 'Status'] = "YES"
-                            csvdf.to_csv(csvfilepath, index=False)
-                        else:
-                            csvdf.at[i, 'Status'] = "NO"
-                            csvdf.to_csv(csvfilepath, index=False)
+                    print_colors('[+] Reading webring participant\'s verified and unverified')
+                    participant_verified_df, participant_unverified_df = get_participant_local_verified_and_unverified(participant_local_dir)
-                        if csvdf.at[i, 'Sensitive'] == "✔️" or csvdf.at[i, 'Sensitive'] == "YES" :
-                            csvdf.at[i, 'Sensitive'] = "YES"
-                            csvdf.to_csv(csvfilepath, index=False)
-                        else:
-                            csvdf.at[i, 'Sensitive'] = "NO"
-                            csvdf.to_csv(csvfilepath, index=False)
-
-                        ### SANITY CHECK 1: Mark all the rows that have incorrect formatting for deletion###
-                        if IsURLValid(csvdf.at[i, 'Instance']) is False or IsCategoryValid(csvdf.at[i, 'Category']) is False or IsNameValid(csvdf.at[i, 'Name']) is False or IsURLValid(csvdf.at[i, 'URL']) is False or IsStatusValid(csvdf.at[i, 'Sensitive']) is False or IsDescriptionValid(csvdf.at[i, 'Description']) is False or IsStatusValid(csvdf.at[i, 'Status']) is False or IsScoreValid(csvdf.at[i, 'Score']) is False:
-                            if i not in rows2delete:
-                                print_colors(f"Marking row {i} for deletion, as it has invalid inputs")
-                                #print_colors(f"{row}")
-                                print(IsURLValid(csvdf.at[i, 'Instance']), IsCategoryValid(csvdf.at[i, 'Category']), IsNameValid(csvdf.at[i, 'Name']), IsURLValid(csvdf.at[i, 'URL']), IsStatusValid(csvdf.at[i, 'Sensitive']), IsDescriptionValid(csvdf.at[i, 'Description']), IsStatusValid(csvdf.at[i, 'Status']), IsScoreValid(csvdf.at[i, 'Score']))
-                                rows2delete.append(i)
-                                read=input("Continue?")
+                    print_colors('[+] Removing invalid and blacklisted rows')
+                    participant_verified_df = lantern.clean_csv(participant_verified_df, local_blacklist)
+                    participant_unverified_df = lantern.clean_csv(participant_unverified_df, local_blacklist)
+
+                    print_colors('[+] Marking sensitive rows')
+                    participant_verified_df = lantern.mark_sensitive(participant_verified_df, local_sensitive)
+                    participant_unverified_df = lantern.mark_sensitive(participant_unverified_df, local_sensitive)
-                        ### SANITY CHECK 2: Mark all rows that are not allowed (blacklist) for deletion ###
-                        for k,l in bldf.iterrows():
-                            blword=bldf.at[k, 'blacklisted-words']
-                            if any(blword in str(x) for x in row) == True:
-                                if i not in rows2delete:
-                                    print_colors(f"Marking row {i} for deletion, as it matches with the blacklisted word {blword}")
-                                    rows2delete.append(i)
-                                    #read=input("Continue?")
-                        ### SANITY CHECK 3: Mark all rows that match sensitive words to be sensitive = YES
-                        for k,l in sedf.iterrows():
-                            seword=sedf.at[k, 'sensitive-words']
-                            if any(seword in str(x) for x in row) == True:
-                                print_colors(f"Marking row {i} as sensitive, as it matches with the sensitive word {seword}")
-                                csvdf.at[i, 'Sensitive']="YES"
-                                csvdf.to_csv(csvfilepath, index=False)
-                                #read=input("Continue?")
+                    print_colors('[+] Saving local participant verified and unverified')
+                    save_local_participant_verified_and_unverified(participant_verified_df, participant_unverified_df, participant_local_dir)
+
+            except Exception as err:
+                print_colors("[-] Option 10 failed unexpectedly, please try again", is_error=True)
-
-                    for i in rows2delete:
-                        row=csvdf.loc[i,:].values.tolist()
-                        print_colors(f'[+] REMOVING ROW : {i} {row}')
-                        csvdf.drop(i, inplace= True)
-                        csvdf.to_csv(csvfilepath, index=False)
-                        #read=input("Continue?")
 
             break
 
         case 11:
diff --git a/scripts/local_testing/.gitkeep b/scripts/local_testing/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/logic/lantern_logic.py b/scripts/logic/lantern_logic.py
index e0f732f..61590fb 100644
--- a/scripts/logic/lantern_logic.py
+++ b/scripts/logic/lantern_logic.py
@@ -43,7 +43,7 @@ def download_participant_data(participant):
 
         utils.print_colors(f"[+] Downloaded webring {participant} csv files and banner")
 
-    except Exception:
+    except Exception as err:
         print_colors("[-] Downloading webring participant's files failed.", is_error=True)
 
 def clean_csv(df, blacklist):
@@ -66,7 +66,7 @@
         if not df.empty:
             df = df[df.apply(utils.is_row_valid, axis=1)]
 
-    except Exception:
+    except Exception as err:
         print_colors("[-] cleaning dataframe failed", is_error=True)
 
     return df
@@ -90,7 +90,7 @@
         df.loc[sensitive_rows, 'Sensitive'] = 'YES'
         df.loc[~sensitive_rows, 'Sensitive'] = 'NO'
 
-    except Exception:
+    except Exception as err:
         print_colors("[-] MArking sensitive words failed.", is_error=True)
-
+
     return df
\ No newline at end of file
diff --git a/scripts/utils.py b/scripts/utils.py
index 78530a1..c15c57e 100644
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -152,7 +152,7 @@ def is_participant_reachable(instance):
             status = requests.get(f'{url}{file_name}',proxies=conf.PROXIES, timeout=10).status_code
             if status != 200:
                 return False
-    except Exception:
+    except Exception as err:
         return False
 
     return True
@@ -278,17 +278,17 @@ def is_row_valid(row):
     """
     try:
         return (
-            IsUrlValid(row['Instance']) and
+            IsURLValid(row['Instance']) and
             IsCategoryValid(row['Category']) and
             IsNameValid(row['Name']) and
-            IsUrlValid(row['URL']) and
+            IsURLValid(row['URL']) and
             IsStatusValid(row['Sensitive']) and
             IsDescriptionValid(row['Description']) and
             IsStatusValid(row['Status']) and
             IsScoreValid(row['Score'])
         )
 
-    except Exception:
+    except Exception as err:
         return False
 
 ###################### General ######################
@@ -316,7 +316,7 @@ def merge_verification_df(receiving_df, merging_df):
         else:
             return pd.concat([receiving_df, filtered_df], ignore_index=True)
 
-    except Exception:
+    except Exception as err:
         return receiving_df
 
 def remove_duplications(df):
@@ -333,11 +333,38 @@
         df = df.drop_duplicates(subset='Name')
         df = df.drop_duplicates(subset='URL')
 
-    except Exception:
-        pass
+    except Exception as err:
+        print_colors('[-] Removing duplication failed',is_error=True)
 
     return df
 
+def remove_cross_dataframe_replications(main_df, sub_df):
+    """
+    remove replications from sub_df that exist in main_df
+
+    Parameters:
+    main_df (Dataframe): the dataframe whose rows are kept
+    sub_df (Dataframe): the dataframe to remove replicated rows from
+
+    Returns:
+    Dataframe: the main_df with removed duplications
+    Dataframe: the sub_df with removed duplications and removed replications
+    """
+    try:
+
+        main_df = remove_duplications(main_df)
+        sub_df = remove_duplications(sub_df)
+
+        mask = sub_df['URL'].isin(main_df['URL']) | sub_df['Name'].isin(main_df['Name'])
+
+        sub_df = sub_df[~mask]
+
+    except Exception as err:
+        print_colors('[-] Removing cross dataframe duplications failed',is_error=True)
+
+
+    return main_df, sub_df
+
 
 ###TODO: can later remove the inputs and have a "global" local verified and unverified or a class of the local(lantern host) participant
 def save_local_verified_and_unverified(verified_df, unverified_df):
     """
@@ -348,7 +375,7 @@
     unverified_df (Dataframe): local unverified rows dataframe
 
     Returns:
-    Dataframe: the combined dataframe will be returned
+    bool: True if successful, False if not
     """
     try:
         current_instance = get_current_instance() + '/'
@@ -357,10 +384,38 @@
 
         unverified_df.to_csv(f'{conf.PARTICIPANT_DIR}{current_instance}unverified.csv', index=False)
 
+        print_colors('[+] Verified and unverified saved successfully')
+
         return True
 
-    except Exception:
-        print_colors('[-] Saving verified and unverified failed',is_error=True )
+    except Exception as err:
+        print_colors('[-] Saving verified and unverified failed',is_error=True)
+        return False
+
+def save_local_participant_verified_and_unverified(verified_df, unverified_df, participant):
+    """
+    saves the local verified and unverified of a participant
+
+    Parameters:
+    verified_df (Dataframe): local verified rows dataframe
+    unverified_df (Dataframe): local unverified rows dataframe
+    participant (str): participant's onion local path
+
+    Returns:
+    bool: True if successful, False if not
+    """
+    try:
+
+        verified_df.to_csv(f'{participant}verified.csv', index=False)
+
+        unverified_df.to_csv(f'{participant}unverified.csv', index=False)
+
+        print_colors('[+] Verified and unverified saved successfully')
+
+        return True
+
+    except Exception as err:
+        print_colors('[-] Saving verified and unverified failed',is_error=True)
         return False
 
 ###################### Getters/Generators ######################
@@ -390,6 +445,35 @@
 
     return f'{conf.PARTICIPANT_DIR}{participant}/'
 
+def get_participant_local_verified_and_unverified(participant):
+    """
+    reads the local verified csv and the local unverified csv of a participant
+
+    Parameters:
+    participant (str): participant's local files path
+
+    Returns:
+    verified_df(Dataframe): verified.csv as dataframe
+    unverified_df(Dataframe): unverified.csv as dataframe
+    """
+
+    try:
+        verified_df = pd.read_csv(f'{participant}verified.csv')
+
+    except FileNotFoundError:
+        print_colors("[-] File not found: verified.csv", is_error=True)
+        return pd.DataFrame(), pd.DataFrame()
+
+    try:
+        unverified_df = pd.read_csv(f'{participant}unverified.csv')
+
+    except FileNotFoundError:
+        print_colors("[-] Participant file not found: unverified.csv", is_error=True)
+        return pd.DataFrame(), pd.DataFrame()
+
+    return verified_df, unverified_df
+
+
 def get_official_participants():
     """
     reads all the official webring participants
@@ -404,7 +488,7 @@
         with open(conf.OFFICIAL_PARTICIPANTS_FILE, 'r') as file:
             return [line.strip() for line in file if current_instance not in line]
 
-    except Exception:
+    except Exception as err:
         print_colors('[-] Couldn\'t read official webring participants file',is_error=True )
 
 def get_local_blacklist_and_sensitive():
@@ -417,22 +501,32 @@
     """
     try:
         current_instance = get_current_instance() + '/'
+        try:
+            blacklist_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}blacklist.csv')
+            blacklist = blacklist_df.iloc[:, 0].tolist()
 
-        blacklist_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}blacklist.csv')
-        blacklist = blacklist_df.iloc[:, 0].tolist()
+        except FileNotFoundError:
+            print_colors("[-] File not found: blacklist.csv", is_error=True)
+
+        try:
+            sensitive_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}sensitive.csv')
+            sensitive_list = sensitive_df.iloc[:, 0].tolist()
+
+        except FileNotFoundError:
+            print_colors("[-] File not found: sensitive.csv", is_error=True)
 
-        sensitive_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}sensitive.csv')
-        sensitive_list = sensitive_df.iloc[:, 0].tolist()
 
         return blacklist, sensitive_list
 
-    except Exception:
-        print_colors('[-] Failed reading the blacklist and sensitive words file',is_error=True )
-        return [], []
+
+    except Exception as err:
+        print_colors('[-] Failed reading the blacklist and sensitive words file',is_error=True)
+
+        return [], []
 
 def get_local_verified_and_unverified():
     """
-    reads the local verified csv and the local unverified csv
+    reads the local verified csv and the local unverified csv of the instance
 
     Returns:
     verified_df(Dataframe): verified.csv as dataframe
@@ -441,16 +535,24 @@
     try:
         current_instance = get_current_instance() + '/'
 
+        try:
+            verified_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}verified.csv')
+
+        except FileNotFoundError:
+            print_colors("[-] File not found: verified.csv", is_error=True)
 
-        verified_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}verified.csv')
+        try:
+            unverified_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}unverified.csv')
 
-        unverified_df = pd.read_csv(f'{conf.PARTICIPANT_DIR}{current_instance}unverified.csv')
+        except FileNotFoundError:
+            print_colors("[-] File not found: unverified.csv", is_error=True)
 
         return verified_df, unverified_df
 
-    except Exception:
-        print_colors('[-] Failed reading the verified and unverified files',is_error=True )
-        return pd.DataFrame(), pd.DataFrame()
+    except Exception as err:
+        print_colors('[-] Failed reading the verified and unverified files',is_error=True)
+
+        return pd.DataFrame(), pd.DataFrame()
 
 def get_local_webring_participants():
     """
@@ -474,7 +576,7 @@
 
         return webring_df
 
-    except Exception:
+    except Exception as err:
         print_colors(f'[-] failed reading webring participants file',is_error=True )
         return pd.DataFrame()