diff --git a/scripts/crawler.py b/scripts/crawler.py
index 783d714..7a52d9f 100644
--- a/scripts/crawler.py
+++ b/scripts/crawler.py
@@ -70,10 +70,17 @@ def get_output_file():
     except FileNotFoundError:
         return pd.DataFrame(columns=["URL","Name"])
 
+def get_blacklist_file():
+    try:
+        return pd.read_csv(args.blacklist_file)
+    except FileNotFoundError:
+        return None
+
 # get list of .onion links from the verified.csv file
 verified_csv_file = pd.read_csv(args.verified_csv)
 crawler_file = get_crawler_file()
 output_file = get_output_file()
+blacklist_file = get_blacklist_file()
 vcsv_urls = []
 vcsv_hostnames = []
 crawled_urls = []
@@ -104,8 +111,16 @@ def add_urls(urls):
         if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
             output_file.loc[-1] = [url, ""]
         else:
+            forbidden = False
             html = BeautifulSoup(req.text, features="lxml")
             title = html.title.string if html.title is not None else ""
+            if blacklist_file is not None:
+                for word in blacklist_file["blacklisted-words"]:
+                    if word in title:
+                        print_colors('[+] Forbidden word found. Rejecting.')
+                        forbidden = True
+            if forbidden:
+                continue
             output_file.loc[-1] = [url, title]
         output_file.index += 1
         output_file = output_file.sort_index()
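
For reference, a minimal standalone sketch of the blacklist check introduced by this diff, assuming the CSV read via args.blacklist_file holds one forbidden word per row under a "blacklisted-words" column. The helper names and the "blacklist.csv" path below are illustrative, not part of the patch:

import pandas as pd

def load_blacklist(path):
    # Hypothetical helper: return the blacklisted words, or an empty list if the file is missing.
    try:
        return pd.read_csv(path)["blacklisted-words"].dropna().tolist()
    except FileNotFoundError:
        return []

def is_forbidden(title, blacklist):
    # A page title is rejected when it contains any blacklisted word (case-sensitive substring match).
    return any(word in title for word in blacklist)

# Example usage:
# is_forbidden("Some onion service title", load_blacklist("blacklist.csv"))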