Mirror of http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/darknet-lantern.git (synced 2025-05-16 20:26:58 +00:00)
add blacklist
This commit is contained in: commit fe6b826027 (parent c041e5df19)
1 changed file with 15 additions and 0 deletions
@@ -70,10 +70,17 @@ def get_output_file():
     except FileNotFoundError:
         return pd.DataFrame(columns=["URL","Name"])
 
+def get_blacklist_file():
+    try:
+        return pd.read_csv(args.blacklist_file)
+    except FileNotFoundError:
+        return None
+
 # get list of .onion links from the verified.csv file
 verified_csv_file = pd.read_csv(args.verified_csv)
 crawler_file = get_crawler_file()
 output_file = get_output_file()
+blacklist_file = get_blacklist_file()
 vcsv_urls = []
 vcsv_hostnames = []
 crawled_urls = []
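For context, a minimal sketch of what the new helper does, assuming the blacklist is an ordinary CSV whose header is blacklisted-words (the column name is taken from the second hunk; the argparse wiring behind args.blacklist_file is not shown in this diff, so the path below is a stand-in):

import pandas as pd

def get_blacklist_file(path="blacklist.csv"):
    # Load the optional blacklist; return None when the file does not
    # exist, so the crawler keeps running with filtering disabled.
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        return None

# Hypothetical blacklist.csv contents:
#   blacklisted-words
#   casino
#   market
blacklist = get_blacklist_file()
if blacklist is None:
    print("no blacklist found, titles will not be filtered")
else:
    print(blacklist["blacklisted-words"].tolist())

Returning None rather than an empty DataFrame lets the caller treat "no blacklist provided" as a distinct, cheap-to-check case, which is exactly how the second hunk uses it.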
@@ -104,8 +111,16 @@ def add_urls(urls):
        if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
            output_file.loc[-1] = [url, ""]
        else:
+           forbidden = False
            html = BeautifulSoup(req.text, features="lxml")
            title = html.title.string if html.title is not None else ""
+           if blacklist_file is not None:
+               for word in blacklist_file.loc["blacklisted-words"]:
+                   if word in title:
+                       print_colors('[+] Forbidden word found. Rejecting.')
+                       forbidden = True
+           if forbidden:
+               continue
            output_file.loc[-1] = [url, title]
            output_file.index += 1
            output_file = output_file.sort_index()
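A rough standalone sketch of the title check this hunk introduces: if any blacklisted word appears in a page's title, the URL is marked forbidden and skipped instead of being written to the output file. The is_forbidden helper name is mine, not the repo's; note also that the committed code indexes the DataFrame with .loc["blacklisted-words"], whereas this sketch reads the column directly, which assumes the CSV header shown earlier. Matching is plain case-sensitive substring containment, as in the diff.

import pandas as pd

def is_forbidden(title, blacklist):
    # True when any blacklisted word occurs in the page title
    # (case-sensitive substring match, mirroring the committed loop).
    if blacklist is None:
        return False
    return any(word in title for word in blacklist["blacklisted-words"].dropna())

blacklist = pd.DataFrame({"blacklisted-words": ["casino", "market"]})
print(is_forbidden("onion market index", blacklist))   # True
print(is_forbidden("Community wiki", blacklist))       # False

Skipping via continue means a rejected URL leaves no row at all in the output, unlike the non-HTML branch above it, which still records the URL with an empty title.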