diff --git a/README.md b/README.md
index 7f31699..a3daa67 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,14 @@
 DONE:
 -py : option 6) Trust/Untrust/Blacklist a webring participant
 -php : make a search engine prompt that only accepts [a-zA-Z.://], it must refuse every other character
 -py : fix uptimecheck.py to match the new csv format
-
-DOING:
-php : if valid make it filter your own verified.csv and unverified.csv files
+-py : option 9) cleanup all duplicates in your own unverified.csv and verified.csv
+-py : option 10) perform sanity checks on all csv files (to mark them as sensitive or remove the ones that are blacklisted)
+
+TODO:
 -py : option 7) Add/Remove words in the sensitive list (assigned to anon)
 -py : option 8) Add/Remove words in the blacklist (assigned to anon)
-TODO:
--py : option 9) cleanup all duplicates in your own unverified.csv and verified.csv
--py : option 10) perform sanity checks on all csv files (to mark them as sensitive or remove the ones that are blacklisted)
 ```
diff --git a/scripts/darknet_exploration.py b/scripts/darknet_exploration.py
index 76a2c90..b2dfa74 100644
--- a/scripts/darknet_exploration.py
+++ b/scripts/darknet_exploration.py
@@ -243,7 +243,7 @@ Maintenance:
-            #TODO check if it works when you have a second webring participant
+            #check if it works when you have a second webring participant
         case "4":
             print("4) Synchronize new links from existing webring participants, into your unverified.csv file")
             # iterate through each existing directory in www/participants/* to get each webring participant
@@ -263,7 +263,7 @@ Maintenance:
                 # NOTE check if the webring participant is yourself, if it is, then skip it
                 if participant != myinstance: # prod: don't use your own instance
                 #if participant == myinstance: # preprod testing only on your own instance
-                    #TODO overwrite the existing files in the participant's directory, with their version (download all the csv files from them again)
+                    #overwrite the existing files in the participant's directory, with their version (download all the csv files from them again)
                     basewurl='http://'+participant+'/participants/'+participant+'/'
                     print(basewurl)
                     print('[+] Downloading the files of ',participant, ": ")
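Note: the "download all the csv files from them again" step amounts to fetching each participant's lists over Tor and overwriting the local copies. A minimal sketch of that step, assuming `requests` with a local Tor SOCKS proxy (the proxy address, the timeout, and the exact file list are assumptions, not taken from this repository):

```python
import requests

# hypothetical Tor daemon address; the socks5h:// scheme needs the requests[socks] extra
proxies = {'http': 'socks5h://127.0.0.1:9050'}

def download_participant_csvs(basewurl, participantdir):
    # overwrite the local copies with the participant's own versions
    for f in ['verified.csv', 'unverified.csv']:  # assumed file list
        r = requests.get(basewurl + f, proxies=proxies, timeout=60)
        r.raise_for_status()  # or skip this participant on HTTP errors
        with open(participantdir + '/' + f, 'w') as out:
            out.write(r.text)
```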
@@ -779,7 +779,7 @@ Maintenance:
                 # TODO if valid! remove word at index
                 # TODO if invalid! just pass to ask for another word
-        # TODO CASE 9 : cleanup all duplicates in unverified + verified.csv, based on the url (check if each url appears more than once, and if they do, remove them + write to csv file)
+        # CASE 9 : cleanup all duplicates in unverified + verified.csv, based on the url (check if each url appears more than once, and if they do, remove them + write to csv file)
         case "9":
             print("[+] 9) Cleaning up all duplicates in your own unverified + verified.csv (based on the url)")
             # ignore it if the index is "indextocheck" and if the index is already listed in rows2delete
@@ -790,97 +790,68 @@ Maintenance:
             for w in ['verified.csv','unverified.csv']:
                 #instancepath=rootpath+'www/participants/'+instance # fyi
                 csvfilepath=instancepath+'/'+w
+                print(csvfilepath)
                 csvdf = pd.read_csv(csvfilepath)
-                rows2deletevdf= [] # it is an empty list at first
-                rows2deleteuvdf= [] # it is an empty list at first
-                # iterate through each row of the csv file
-                for i,j in csvdf.iterrows():
-                    #print("[+] Unverified.csv ROW=",i, uvdf.at[i, 'Instance'], uvdf.at[i, 'Category'], uvdf.at[i, 'Name'], uvdf.at[i, 'URL'], uvdf.at[i, 'Description'])
-                    #print("[+] Unverified.csv ROW=",i, uvdf.iloc[[i]])
-                    #row=uvdf.iloc[[i]] #it displays the index
-                    row=csvdf.loc[i,:].values.tolist()
-                    print(row)
-
-                    # for each link in the participant's verified/unverified csv files,
-                    # check if the link is already listed in your own verified.csv or unverified.csv
-                    filterterm=csvdf.at[i, 'URL']
-                    filter_vdf= vdf[vdf.URL.str.contains(filterterm)]
-                    filter_uvdf= uvdf[uvdf.URL.str.contains(filterterm)]
-                    # for unverified.csv, and verified.csv ;
-                    if len(filter_vdf.index) > 1:
-                        # if link exists more than once in verified.csv, remove the duplicates in verified.csv
-                        for m,n in filter_vdf.iterrows():
-                            if m != i and m not in rows2deletevdf:
-                                rows2deletevdf.append(m) #mark the DUPLICATE ROWS for deletion, meaning the ones that are not i!!! if not already done
-                                #TODO make sure it doesnt mark the previous i indexes for deletion (as it is a duplicate of the next row)
-                    if len(filter_vdf.index) == 1:
-                        # if link exists ONCE in verified.csv check that it doesnt in unverified.csv:
-                        if len(filter_uvdf.index) >= 1:
-                            # if link exists ONCE in verified.csv AND in unverified.csv, cleanup the duplicates in unverified.csv - KO
-                            for m,n in filter_uvdf.iterrows():
-                                if m != i and m not in rows2deleteuvdf:
-                                    rows2deleteuvdf.append(m) #mark the DUPLICATE ROWS for deletion, meaning the ones that are not i!!! if not already done
-                                    #TODO make sure it doesnt mark the previous i indexes for deletion (as it is a duplicate of the next row)
-                    #else:
-                        # link only exists in verified.csv, and not in unverified.csv - OK
-                    if len(filter_vdf.index) == 0:
-                        # if link does not exist in verified.csv, check if it exists in unverified.csv:
-                        if len(filter_uvdf.index) > 1:
-                            # link exists more than once in unverified.csv, get rid of the duplicates in unverified.csv - KO
-                            for m,n in filter_uvdf.iterrows():
-                                if m != i and m not in rows2deleteuvdf:
-                                    rows2deleteuvdf.append(m) #mark the DUPLICATE ROWS for deletion, meaning the ones that are not i!!! if not already done
-                                    #TODO make sure it doesnt mark the previous i indexes for deletion (as it is a duplicate of the next row)
-                        #else:
-                            # link either exists once or doesnt exist in unverified.csv, therefore OK
-                #rows2deletevdf.sort()
-                #reverse it so that it starts removing the last elements first
-                #rows2deletevdf = rows2deletevdf[::-1]
-                print("ROWS TO DELETE IN VERIFIED.CSV:", rows2deletevdf)
-                if rows2deletevdf != []:
-                    for p in rows2deletevdf:
-                        row=vdf.loc[p,:].values.tolist()
-                        print('[+] REMOVING ROW :',p,row)
-                        vdf.drop(p, inplace= True)
-                    vdf.to_csv(verifiedcsvfile, index=False)
-                    rows2deletevdf= [] # it is an empty list at first
-
-                #rows2deleteuvdf.sort()
-                #reverse it so that it starts removing the last elements first
-                #rows2deleteuvdf = rows2deleteuvdf[::-1]
-                print("ROWS TO DELETE IN UNVERIFIED.CSV:", rows2deleteuvdf)
-                if rows2deleteuvdf != []:
-                    for p in rows2deleteuvdf:
-                        row=uvdf.loc[p,:].values.tolist()
-                        print('[+] REMOVING ROW :',p,row)
-                        uvdf.drop(p, inplace= True)
-                    uvdf.to_csv(unverifiedcsvfile, index=False)
-                    rows2deleteuvdf= [] # it is an empty list at first
-
-                #if len(filter_uvdf.index) == 1 and len(filter_vdf.index) == 1:
-                    # if link exists only ONCE in verified.csv or unverified.csv, then skip
-                    # if link doesnt exist in either of your verified/unverified csv files,
-                    # then add it to your own unverified.csv file:
-                    # newrow=row
-                    # uvdf.loc[-1] = newrow # adding a row
-                    # uvdf.index = uvdf.index + 1 # shifting index
-                    # uvdf = uvdf.sort_index() # sorting by index
-                    # uvdf.to_csv(unverifiedcsvfile, index=False)
-                    # print("[+] ")
-                #else:
-                    # print('[-] Skipping row as it is already added in ',w,row,)
+                print("REMOVING DUPLICATES IN", csvfilepath)
+                csvdf = csvdf.drop_duplicates(subset=['URL'])
+                csvdf.to_csv(csvfilepath, index=False)
+                print(csvdf[['URL']])
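Note: `drop_duplicates(subset=['URL'])` only removes duplicates within a single file, whereas the deleted block also pruned unverified entries whose URL was already present in verified.csv. If that cross-file cleanup is still wanted, a sketch in the same pandas style (the literal paths are placeholders; the script derives them from `instancepath`):

```python
import pandas as pd

vdf = pd.read_csv('verified.csv')     # placeholder path
uvdf = pd.read_csv('unverified.csv')  # placeholder path

# dedupe within each file, keeping the first occurrence of every URL
vdf = vdf.drop_duplicates(subset=['URL'])
uvdf = uvdf.drop_duplicates(subset=['URL'])

# drop unverified rows whose URL is already verified
uvdf = uvdf[~uvdf['URL'].isin(vdf['URL'])]

vdf.to_csv('verified.csv', index=False)
uvdf.to_csv('unverified.csv', index=False)
```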
         case "10":
             print("[+] 10) perform sanity checks on all csv files (to mark them as sensitive / or remove the ones that are blacklisted)")
+            participantspath = rootpath+'www/participants/'
+            for participant in os.listdir(participantspath):
+                print("Participant:",participant)
+                participantdir= participantspath+participant
+                ################ BEGIN SANITY CHECKS FOR EACH PARTICIPANT ##############
+                # iterate through the participant's verified.csv and unverified.csv files
+                for w in ['verified.csv','unverified.csv']:
+                    csvfilepath=participantdir+'/'+w
+                    print(csvfilepath)
+                    csvdf = pd.read_csv(csvfilepath)
+                    #print(bldf[['blacklisted-words']])
+                    rows2delete= [] # it is an empty list at first
+                    for i,j in csvdf.iterrows():
+                        row=csvdf.loc[i,:].values.tolist()
+                        print(row)
+
+                        ################################ SANITY CHECKS ####################################
+                        ### SANITY CHECK 1: Mark all the rows that have incorrect formatting for deletion ###
+                        if IsUrlValid(csvdf.at[i, 'Instance']) is False or IsCategoryValid(csvdf.at[i, 'Category']) is False or IsNameValid(csvdf.at[i, 'Name']) is False or IsUrlValid(csvdf.at[i, 'URL']) is False or IsStatusValid(csvdf.at[i, 'Sensitive']) is False or IsDescriptionValid(csvdf.at[i, 'Description']) is False or IsStatusValid(csvdf.at[i, 'Status']) is False or IsScoreValid(csvdf.at[i, 'Score']) is False:
+                            # mark the row for deletion as it has invalid inputs
+                            if i not in rows2delete:
+                                print("Marking row", i, "for deletion, as it has invalid inputs")
+                                rows2delete.append(i) # mark the row for deletion if not already done
+
+                        ### SANITY CHECK 2: Mark all rows that are not allowed (blacklist) for deletion ###
+                        for k,l in bldf.iterrows():
+                            #print("[+] Blacklisted word=",k, bldf.at[k, 'blacklisted-words'])
+                            blword=bldf.at[k, 'blacklisted-words']
+                            if any(blword in str(x) for x in row):
+                                if i not in rows2delete:
+                                    print("Marking row", i, "for deletion, as it matches a blacklisted word")
+                                    rows2delete.append(i) # mark the row for deletion if not already done
+
+                    for i in rows2delete:
+                        row=csvdf.loc[i,:].values.tolist()
+                        print('[+] REMOVING ROW :',i,row)
+                        csvdf.drop(i, inplace= True)
+                    csvdf.to_csv(csvfilepath, index=False)

             # TODO find the list of all csv files (in www/participants/*/*.csv) (templates should remain empty by default)
             # copy what was done in option 4, to :
             # delete the ones that have invalid entries
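Note: the `Is*Valid` helpers used by sanity check 1 are defined elsewhere in this script. Purely as an illustration of the kind of check involved, a URL validator of that shape might look like the following; the regex and signature are assumptions, not the repository's actual implementation:

```python
import re

# illustrative only: the repository ships its own IsUrlValid; this pattern is an assumption
def IsUrlValid(url) -> bool:
    if not isinstance(url, str):  # reject NaN / non-string cells coming out of pandas
        return False
    # http(s) scheme followed by a conservative character set (covers .onion hosts)
    return re.fullmatch(r'https?://[a-zA-Z0-9.\-_/]+', url) is not None
```

Sanity check 2 could also be vectorized instead of looping over `bldf` for every row; a sketch assuming the same `bldf` and `csvdf` layout, and at least one blacklisted word:

```python
import re

# build one regex out of all blacklisted words, then flag rows containing any match
pattern = '|'.join(re.escape(w) for w in bldf['blacklisted-words'].dropna())
mask = csvdf.astype(str).apply(lambda col: col.str.contains(pattern, regex=True)).any(axis=1)
csvdf = csvdf[~mask]
```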
diff --git a/www/header.php b/www/header.php
index ea0a328..625a785 100644
--- a/www/header.php
+++ b/www/header.php
@@ -36,7 +36,7 @@ if (($handle = fopen($csvfile, "r")) !== FALSE) {
     if ($oldcatname != $data[1]){
         echo ""; // skip a row if new category
         echo ""; // skip a row if new category
-        echo "" . $data[1] . '|'; // display the category as its the first row with this new category
+        echo "" . $data[1] . ''; // display the category as its the first row with this new category
         $oldcatname=$data[1];
     }else{
         echo "" . ''; // category is already displayed so skip it (empty cell in category column)
diff --git a/www/participants/webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion/unverified.csv b/www/participants/webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion/unverified.csv
index 3933f56..70f8d92 100644
--- a/www/participants/webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion/unverified.csv
+++ b/www/participants/webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion/unverified.csv
@@ -1,18 +1,5 @@
 Instance,Category,Name,URL,Sensitive,Description,Status,Score
 uptime.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion,Infos and Links,Tor Taxi,http://tortaxi2dev6xjwbaydqzla77rrnth7yn2oqzjfmiuwn5h6vsk2a4syd.onion/,,List of links to go to popular darknet places,✔️,100.0
-uptime.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion,Infos and Links,Tor Taxi,http://tortaxi2dev6xjwbaydqzla77rrnth7yn2oqzjfmiuwn5h6vsk2a4syd.onion/,,List of links to go to popular darknet places,✔️,100.0
-uptime.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion,Infos and Links,Tor Taxi,http://tortaxi2dev6xjwbaydqzla77rrnth7yn2oqzjfmiuwn5h6vsk2a4syd.onion/,,List of links to go to popular darknet places,✔️,100.0
-uptime.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion,Infos and Links,Tor Taxi,http://tortaxi2dev6xjwbaydqzldruga77rrnth7yn2oqzjfmiuwn5h6vsk2a4syd.onion/,✔️,List of links to go to popular darknet places,❌,0.0
-uptime.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion,Infos and Links,Tor Taxi,http://tortaxi2dev6xjwbaydqzla77rrnth7yn2oqzjfmiuwn5h6vsk2a4syd.onion/,,List of links to go to popular darknet places,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Hackliberty,Hackliberty Gitea,http://vkp7367tcjpqdwwckigrdrvmwvispvbpg5rlsr2chjxvppfg7hipagyd.onion,,,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Hackliberty,Hackliberty Gitea,http://vkp7367tcjpqdwwckigrdrvmwvispvbpg5rlsr2chjxvppfg7hipagyd.onion,,,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Hackliberty,Hackliberty Gitea,http://vkp7367tcjpqdwwckigrdrvmwvispvbpg5rlsr2chjxvppfg7hipagyd.onion,,,✔️,100.0
 webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Hackliberty,Hackliberty Gitea,http://vkp7367tcjpqdwwckigrdrvmwvispvbpg5rlsr2chjxvppfg7hipagyd.onion,,,✔️,100.0
 webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Forums,Hackliberty Forum,http://yw7nc56v4nsudvwewhmhhwltxpncedfuc43qbubj4nmwhdhwtiu4o6yd.onion/,,,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Forums,Hackliberty Forum,http://yw7nc56v4nsudvwewhmhhwltxpncedfuc43qbubj4nmwhdhwtiu4o6yd.onion/,,,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Forums,Hackliberty Forum,http://yw7nc56v4nsudvwewhmhhwltxpncedfuc43qbubj4nmwhdhwtiu4o6yd.onion/,,,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Forums,Hackliberty Forum,http://yw7nc56v4nsudvwewhmhhwltxpncedfuc43qbubj4nmwhdhwtiu4o6yd.onion/,,,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Communities,Hackliberty main website,http://kj3wvs3wyfhm3uhhuqxlrhhcp6dneuau4mmvptlor27ghmrqx63fqnid.onion/,,,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Communities,Hackliberty main website,http://kj3wvs3wyfhm3uhhuqxlrhhcp6dneuau4mmvptlor27ghmrqx63fqnid.onion/,,,✔️,100.0
-webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Communities,Hackliberty main website,http://kj3wvs3wyfhm3uhhuqxlrhhcp6dneuau4mmvptlor27ghmrqx63fqnid.onion/,,,✔️,100.0
 webring.nowhevi57f4lxxd6db43miewcsgtovakbh6v5f52ci7csc2yjzy5rnid.onion,Communities,Hackliberty main website,http://kj3wvs3wyfhm3uhhuqxlrhhcp6dneuau4mmvptlor27ghmrqx63fqnid.onion/,,,✔️,100.0