mirror of
http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/darknet-lantern.git
synced 2025-05-16 20:26:58 +00:00
add blacklist
This commit is contained in:
parent
c041e5df19
commit
fe6b826027
1 changed file with 15 additions and 0 deletions
|
@@ -70,10 +70,17 @@ def get_output_file():
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
return pd.DataFrame(columns=["URL","Name"])
|
return pd.DataFrame(columns=["URL","Name"])
|
||||||
|
|
||||||
|
def get_blacklist_file():
    """Load the blacklist CSV named by ``args.blacklist_file``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that path, or
        ``None`` when the file does not exist.

    Only ``FileNotFoundError`` is handled here; any other error raised
    while reading the file propagates to the caller.
    """
    try:
        blacklist = pd.read_csv(args.blacklist_file)
    except FileNotFoundError:
        # No blacklist file on disk: signal "nothing loaded" with None.
        blacklist = None
    return blacklist
|
||||||
|
|
||||||
# get list of .onion links from the verified.csv file
|
# get list of .onion links from the verified.csv file
|
||||||
verified_csv_file = pd.read_csv(args.verified_csv)
|
verified_csv_file = pd.read_csv(args.verified_csv)
|
||||||
crawler_file = get_crawler_file()
|
crawler_file = get_crawler_file()
|
||||||
output_file = get_output_file()
|
output_file = get_output_file()
|
||||||
|
blacklist_file = get_blacklist_file()
|
||||||
vcsv_urls = []
|
vcsv_urls = []
|
||||||
vcsv_hostnames = []
|
vcsv_hostnames = []
|
||||||
crawled_urls = []
|
crawled_urls = []
|
||||||
|
@@ -104,8 +111,16 @@ def add_urls(urls):
|
||||||
if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
|
if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
|
||||||
output_file.loc[-1] = [url, ""]
|
output_file.loc[-1] = [url, ""]
|
||||||
else:
|
else:
|
||||||
|
forbidden = False
|
||||||
html = BeautifulSoup(req.text, features="lxml")
|
html = BeautifulSoup(req.text, features="lxml")
|
||||||
title = html.title.string if html.title is not None else ""
|
title = html.title.string if html.title is not None else ""
|
||||||
|
if blacklist_file is not None:
|
||||||
|
for word in blacklist_file.loc["blacklisted-words"]:
|
||||||
|
if word in title:
|
||||||
|
print_colors('[+] Forbidden word found. Rejecting.')
|
||||||
|
forbidden = True
|
||||||
|
if forbidden:
|
||||||
|
continue
|
||||||
output_file.loc[-1] = [url, title]
|
output_file.loc[-1] = [url, title]
|
||||||
output_file.index += 1
|
output_file.index += 1
|
||||||
output_file = output_file.sort_index()
|
output_file = output_file.sort_index()
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue