diff --git a/scripts/lantern.py b/scripts/lantern.py
index 81f8975..0bf45ac 100644
--- a/scripts/lantern.py
+++ b/scripts/lantern.py
@@ -144,11 +144,11 @@ def main():
             src=templatepath+i
             shutil.copyfile(src, filepath)
     # now that they exist, get vdf and uvdf and the rest
-    vdf = pd.read_csv(verifiedcsvfile)
-    uvdf = pd.read_csv(unverifiedcsvfile)
-    bldf = pd.read_csv(blcsvfile)
-    sedf = pd.read_csv(secsvfile)
-    webpdf = pd.read_csv(webpcsvfile)
+    vdf = pd.read_csv(verifiedcsvfile, on_bad_lines='skip')
+    uvdf = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip')
+    bldf = pd.read_csv(blcsvfile, on_bad_lines='skip')
+    sedf = pd.read_csv(secsvfile, on_bad_lines='skip')
+    webpdf = pd.read_csv(webpcsvfile, on_bad_lines='skip')
     print_colors(f"[+] file exists, your Webring URL is {instance}")
 
     ##### CHECK IF ARGUMENTS ARE PASSED TO ENTER PROMPT-LESS MODE #####
@@ -257,8 +257,8 @@ Maintenance:
             case 2:
                 print_colors("[+] Trust/Untrust/Blacklist a Website entry (move an entry from unverified to verified.csv)")
                 while True:
-                    vdf = pd.read_csv(verifiedcsvfile)
-                    uvdf = pd.read_csv(unverifiedcsvfile)
+                    vdf = pd.read_csv(verifiedcsvfile, on_bad_lines='skip')
+                    uvdf = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip')
 
                     # search for a word
                     name=''
@@ -385,7 +385,7 @@ Maintenance:
                 for w in ['verified.csv','unverified.csv']:
                     csvfilepath=instancepath+'/'+w
                     print_colors(f"{csvfilepath}")
-                    csvdf = pd.read_csv(csvfilepath)
+                    csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
                     rows2delete= [] # it is an empty list at first
                     for i,j in csvdf.iterrows():
                         row=csvdf.loc[i,:].values.tolist()
@@ -535,7 +535,7 @@ Maintenance:
                 status=''
                 score=''
                 webringcsvfile=instancepath+'/'+'webring-participants.csv'
-                wdf = pd.read_csv(webringcsvfile)
+                wdf = pd.read_csv(webringcsvfile, on_bad_lines='skip')
 
                 for participant in os.listdir(participantsdir):
                     participantdir=participantsdir+participant
@@ -610,7 +610,15 @@ Maintenance:
                     for w in ['verified.csv','unverified.csv']:
                         csvfilepath=participantdir+'/'+w
                         print_colors(f"{csvfilepath}")
-                        csvdf = pd.read_csv(csvfilepath)
+                        csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
+
+                        print("[+] Removing the participant's duplicate entries... ")
+                        # REMOVE DUPLICATES !!! do not accept any duplicate from remote participants
+                        csvdf = csvdf.drop_duplicates(subset=['URL'], keep="first", inplace=False)
+                        csvdf.to_csv(csvfilepath, index=False)
+
+                        csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
+
                         bldf[['blacklisted-words']].iterrows()
                         rows2delete= [] # it is an empty list at first
                         for i,j in csvdf.iterrows():
@@ -640,6 +648,7 @@ Maintenance:
                                 #mark the row for deletion as it has invalid inputs
                                 if i not in rows2delete:
                                     print_colors(f"Marking row {i} for deletion, as it has invalid inputs")
+                                    print(row)
                                     rows2delete.append(i) #mark the row for deletion if not already done
 
                         ### SANITY CHECK 2: Mark all rows that are not allowed (blacklist) for deletion ###
@@ -667,6 +676,7 @@ Maintenance:
                                     uvdf.index = uvdf.index + 1 # shifting index
                                     uvdf = uvdf.sort_index() # sorting by index
                                     uvdf.to_csv(unverifiedcsvfile, index=False)
+                                    print("[+] NEW ROW =",newrow)
                                     print_colors("[+] New row added to your own unverified.csv file!")
                                 else:
                                     pass
@@ -736,7 +746,7 @@ Maintenance:
                     score=''
                     newrow=[name,webring_participant_url,desc,trusted,status,score]
                     webringcsvfile=instancepath+'/'+'webring-participants.csv'
-                    wdf = pd.read_csv(webringcsvfile)
+                    wdf = pd.read_csv(webringcsvfile, on_bad_lines='skip')
                     wdf.loc[-1] = newrow # adding a row
                     wdf.index = wdf.index + 1 # shifting index
                     wdf = wdf.sort_index() # sorting by index
@@ -783,7 +793,7 @@ Maintenance:
                 ########### PERFORM SANITY CHECKS ON the webring participant's verified.csv and unverified.csv ##################
                 for w in ['verified.csv','unverified.csv']:
                     csvfilepath=participantdir+'/'+w
-                    csvdf = pd.read_csv(csvfilepath)
+                    csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
 
                     #print_colors(bldf[['blacklisted-words']])
                     bldf[['blacklisted-words']].iterrows()
@@ -852,7 +862,7 @@ Maintenance:
                 while True:
                     print_colors("[+] Trust/UnTrust/Blacklist a webring participant (Potentially dangerous)")
                     webringcsvfile=instancepath+'/'+'webring-participants.csv'
-                    wdf = pd.read_csv(webringcsvfile)
+                    wdf = pd.read_csv(webringcsvfile, on_bad_lines='skip')
                     print_colors(f'{wdf[["URL","Trusted"]]}')
                     try:
                         index = int(input("What is the index of the webring participant that you want to edit? -1 to exit ").strip())
@@ -1120,7 +1130,7 @@ Maintenance:
                     csvfilepath = os.path.join(instancepath, w)
                     print_colors(f"Processing file: {csvfilepath}")
                     try:
-                        csvdf = pd.read_csv(csvfilepath)
+                        csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
                         print_colors(f"Removing duplicates in {csvfilepath}")
                         #print_colors(f"{csvdf[['URL']]}")
                         csvdf = csvdf.drop_duplicates(subset=['URL'], keep="first", inplace=False)
@@ -1146,7 +1156,7 @@ Maintenance:
                 for w in ['verified.csv','unverified.csv']:
                     csvfilepath=participantdir+'/'+w
                     print_colors(f"{csvfilepath}")
-                    csvdf = pd.read_csv(csvfilepath)
+                    csvdf = pd.read_csv(csvfilepath, on_bad_lines='skip')
                     rows2delete= [] # it is an empty list at first
                     for i,j in csvdf.iterrows():
                         row=csvdf.loc[i,:].values.tolist()
@@ -1208,10 +1218,10 @@ Maintenance:
             case 11:
                 #review the submitted websites:
                 try:
-                    submission_df = pd.read_csv(submission_file_abs_path)
-                    verified_csv_df = pd.read_csv(verifiedcsvfile)
-                    unverified_csv_df = pd.read_csv(unverifiedcsvfile)
-                    blacklist_df = pd.read_csv(blcsvfile)
+                    submission_df = pd.read_csv(submission_file_abs_path, on_bad_lines='skip')
+                    verified_csv_df = pd.read_csv(verifiedcsvfile, on_bad_lines='skip')
+                    unverified_csv_df = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip')
+                    blacklist_df = pd.read_csv(blcsvfile, on_bad_lines='skip')
                     blacklisted_words = [word for word in blacklist_df['blacklisted-words']]
                     for i, row in submission_df.iterrows():
                         link = row['link']
@@ -1290,10 +1300,10 @@ Maintenance:
                 # review the crawled websites
                 try:
                     print(crawled_file_abs_path)
-                    crawled_df = pd.read_csv(crawled_file_abs_path)
-                    verified_csv_df = pd.read_csv(verifiedcsvfile)
-                    unverified_csv_df = pd.read_csv(unverifiedcsvfile)
-                    blacklist_df = pd.read_csv(blcsvfile)
+                    crawled_df = pd.read_csv(crawled_file_abs_path, on_bad_lines='skip')
+                    verified_csv_df = pd.read_csv(verifiedcsvfile, on_bad_lines='skip')
+                    unverified_csv_df = pd.read_csv(unverifiedcsvfile, on_bad_lines='skip')
+                    blacklist_df = pd.read_csv(blcsvfile, on_bad_lines='skip')
                     blacklisted_words = [word for word in blacklist_df['blacklisted-words']]
                     for i, row in crawled_df.iterrows():
                         link = row['URL']
diff --git a/scripts/utils.py b/scripts/utils.py
index 391a464..ad8a819 100644
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -150,6 +150,8 @@ def IsUrlValid(url:str)->bool:
     else:
         if not url.__contains__('.'):
             return False
+        if url.__contains__(';'):
+            return False #required otherwise lantern thinks there are extra columns
         if pattern.fullmatch(url) is None:
             return False
     return True