Mirror of http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/darknet-lantern.git, synced 2025-05-16 20:26:58 +00:00
Merge pull request 'crawler.py' (#48) from cynthia/darknet-lantern:main into main
Reviewed-on: http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/darknet-lantern/pulls/48 thx, sent payment + works as intended
This commit is contained in:
commit 029f2a361c
3 changed files with 247 additions and 1 deletion

.gitignore (vendored, 3 lines changed)
@@ -1,5 +1,6 @@
 .git
 www/participants/**
+crawler/**
 scripts/__pycache__/**
 .env
-env/
+env/

@@ -1,3 +1,4 @@
+beautifulsoup4==4.13.3
 certifi==2024.12.14
 charset-normalizer==3.4.1
 idna==3.10

scripts/crawler.py (new file, 244 lines added)
@@ -0,0 +1,244 @@
from bs4 import BeautifulSoup
import pandas as pd
import requests
import urllib.parse
import argparse
import os
import pwd
import re
from utils import print_colors, IsUrlValid

from dotenv import load_dotenv
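
# The participant's onion hostname is read from ~/.darknet_participant_url and
# selects which participant's verified.csv and blacklist.csv the crawler uses.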
# Make default parameters for arguments
rootpath='/srv/darknet-lantern/'
urlpath=pwd.getpwuid(os.getuid()).pw_dir+"/.darknet_participant_url"
instance = ""
if os.path.isfile(urlpath):
    with open(urlpath) as f:
        instance = f.read().rstrip()

instancepath=rootpath+'www/participants/'+instance
verifiedcsvfile=instancepath+'/verified.csv'
blcsvfile=instancepath+'/blacklist.csv'
crawlerdir=rootpath+'/crawler'
if not os.path.exists(crawlerdir):
    os.makedirs(crawlerdir)

parser = argparse.ArgumentParser(
    prog='Lantern crawler',
    description='Crawls .onion sites for links to more .onion sites')

parser.add_argument('-l', '--limit',
    help='Page crawl limit per .onion link.', type=int, default=10)
parser.add_argument('-o', '--output',
    help='Output CSV file for found .onion links', type=str,
    default=os.path.join(crawlerdir, 'onion_crawler.csv'))
parser.add_argument('-c', '--crawler-file',
    help='Crawler CSV file to log .onion sites and the amount crawled', type=str,
    default=os.path.join(crawlerdir, 'crawler.csv'))
parser.add_argument('-b', '--blacklist-file',
    help='Blacklist CSV file used to filter out sites whose titles contain forbidden words',
    type=str, default=blcsvfile)
parser.add_argument('-V', '--verified-file',
    help='Input file to read for .onion links to crawl',
    type=str, default=verifiedcsvfile)
args = parser.parse_args()

script_abs_path = os.path.dirname(os.path.abspath(__file__))
env_path = os.path.join(script_abs_path, ".env")
default_env_path = os.path.join(script_abs_path, ".env.sample")

if os.path.exists(env_path):
    load_dotenv(dotenv_path=env_path)
else:
    load_dotenv(dotenv_path=default_env_path)
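
# Every request goes through the Tor proxy defined by TOR_HOST/TOR_PORT in the
# .env file (falling back to .env.sample), so .onion hosts are reachable.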

tor_host = os.getenv("TOR_HOST")
tor_port = os.getenv("TOR_PORT")
session = requests.session()
session.proxies = {
    'http': f'{tor_host}:{tor_port}',
    'https': f'{tor_host}:{tor_port}'
}
# Also set a user agent for the crawler
session.headers.update({'User-Agent': 'LanternCrawler'})
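
# Each helper below loads one of the crawler's CSV files, falling back to an
# empty DataFrame (or None for the optional blacklist) when the file does not exist yet.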

def get_crawler_file():
    try:
        # try to read the CSV file
        return pd.read_csv(args.crawler_file)
    except FileNotFoundError:
        # make a new empty crawler file
        return pd.DataFrame(columns=["URL","Counter"])

def get_output_file():
    try:
        return pd.read_csv(args.output)
    except FileNotFoundError:
        return pd.DataFrame(columns=["Hostname","URL","Name"])

def get_blacklist_file():
    try:
        return pd.read_csv(args.blacklist_file)
    except FileNotFoundError:
        return None
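
# Shared state for the crawl: the verified/crawler/output/blacklist tables plus
# the hostnames already known to this instance and the pages crawled this run.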

# get list of .onion links from the verified.csv file
verified_csv_file = pd.read_csv(args.verified_file)
crawler_file = get_crawler_file()
output_file = get_output_file()
blacklist_file = get_blacklist_file()
vcsv_urls = []
vcsv_hostnames = []
crawled_urls = []
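
# extract_urls_html() and extract_urls_txt() return a CrawlerResult: first-party
# links are fed back into the crawl queue, third-party links go to the output CSV.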

class CrawlerResult:
    def __init__(self, firstp_urls, thirdp_urls):
        self.firstp_urls = firstp_urls
        self.thirdp_urls = thirdp_urls
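
# Record newly discovered third-party .onion links in the output CSV, skipping
# hosts that are already listed, already in verified.csv, or not .onion at all.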

def add_urls(urls):
    global output_file
    for url in urls:
        parsed_url = urllib.parse.urlparse(url)
        if (output_file['Hostname'] == parsed_url.hostname).any() or parsed_url.hostname is None or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
            continue

        # Get information about the URL
        print_colors(f'[+] Querying {url} for information.')
        try:
            req = session.get(url)
        except requests.exceptions.ConnectionError:
            print_colors(f'[E] Dead 3rd party link: {url}. Adding anyway')
            output_file.loc[-1] = [parsed_url.hostname, url, ""]
            output_file.index += 1
            output_file = output_file.sort_index()
            continue

        if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
            output_file.loc[-1] = [parsed_url.hostname, url, ""]
        else:
            forbidden = False
            html = BeautifulSoup(req.text, features="lxml")
            # Guard against pages whose <title> tag is missing or empty
            title = html.title.string if html.title is not None and html.title.string is not None else ""
            if blacklist_file is not None:
                for row in blacklist_file.itertuples():
                    if row[1].lower() in title.lower():
                        print_colors('[+] Forbidden word found. Rejecting.')
                        forbidden = True
            if forbidden:
                continue
            output_file.loc[-1] = [parsed_url.hostname, url, repr(title)[1:-1]]
        output_file.index += 1
        output_file = output_file.sort_index()
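
# Parse an HTML page and split every <a href> into first-party links (same
# hostname, crawled further) and third-party links (handed to add_urls later).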

def extract_urls_html(url, text):
    html = BeautifulSoup(text, features="lxml")
    hostname_url = urllib.parse.urlparse(url).hostname
    result = CrawlerResult([], [])

    for link in html.find_all('a'):
        if not link.has_attr("href"):
            print_colors('[E] Could not find link in <a> element, Skipping link')
            continue

        joined_url = urllib.parse.urljoin(url, link["href"])
        jurl_parsed = urllib.parse.urlparse(joined_url)

        print_colors(f'[D] Joined URL: {joined_url}')
        # Check if the URL is a .onion link or not even a web link
        if not IsUrlValid(joined_url):
            continue

        print_colors(f'[+] Found url: {joined_url}')

        # Check if the URL is a first-party link
        if jurl_parsed.hostname == hostname_url:
            if joined_url not in result.firstp_urls:
                result.firstp_urls.append(joined_url)
        else:
            if joined_url not in result.thirdp_urls:
                result.thirdp_urls.append(joined_url)
    return result
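
# Plain-text responses are scanned with a regex for http(s) .onion URLs; only
# third-party links are collected here (see the TODO about first-party paths).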

def extract_urls_txt(url, text):
    hostname_url = urllib.parse.urlparse(url).hostname
    result = CrawlerResult([], [])
    url_regex = re.compile(r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.onion\b(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*)")

    # TODO: It won't find first party URL here, since the regex doesn't capture the path.
    for found_url in url_regex.findall(text):
        # Compare hostnames (not the whole ParseResult) so third-party detection works
        if hostname_url != urllib.parse.urlparse(found_url).hostname:
            if found_url not in result.thirdp_urls:
                result.thirdp_urls.append(found_url)
    return result
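
# crawl_url() walks first-party links from a verified seed until args.limit
# pages have been fetched, then stores the page counter in the crawler CSV so
# finished sites are skipped on later runs.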

def crawl_url(url):
    global crawler_file
    # List of urls to crawl.
    urls = [url]
    # Resume from the stored page counter if this URL was crawled before
    counter = 0
    existing_counter = crawler_file[crawler_file['URL'] == url]['Counter']
    if not existing_counter.empty:
        counter = int(existing_counter.iloc[0])

    while counter < args.limit and len(urls) != 0:
        cur_url = urls.pop()
        cur_url_scheme = urllib.parse.urlparse(cur_url).scheme

        if cur_url in crawled_urls:
            # Skip already-crawled urls
            print_colors(f'[D] {cur_url} has already been crawled. Skipping')
            continue
        elif cur_url_scheme != 'http':
            print_colors(f'[D] Unknown scheme found, Skipping: {cur_url}')
            continue

        print_colors(f'[+] Crawling {cur_url}')
        try:
            req = session.get(cur_url)
        except requests.exceptions.ConnectionError:
            print_colors(f'[E] Failed to connect to {cur_url}')
            continue

        crawled_urls.append(cur_url)
        result = CrawlerResult([], [])

        # Determine the type of response from the headers
        if "Content-Type" not in req.headers:
            print_colors('[+] No content type found, Not extracting links.')
        elif "text/plain" in req.headers["Content-Type"]:
            result = extract_urls_txt(cur_url, req.text)
        elif "text/html" in req.headers["Content-Type"]:
            result = extract_urls_html(cur_url, req.text)
        else:
            print_colors(f'[+] Unknown content type encountered: {req.headers["Content-Type"]}')

        # Queue unseen first-party links and record the third-party ones
        for found in result.firstp_urls:
            if found not in urls:
                urls.append(found)
        add_urls(result.thirdp_urls)
        counter += 1

    # Refresh counter in CSV file
    if not crawler_file[crawler_file['URL'] == url].empty:
        crawler_file.loc[crawler_file['URL'] == url, 'Counter'] = counter
    else:
        crawler_file.loc[-1] = [url, counter]
        crawler_file.index += 1
        crawler_file = crawler_file.sort_index()
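
# Build the seed list: every URL in verified.csv is normalised to an http://
# URL and its hostname is cached for the first-party / third-party checks.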

for row in verified_csv_file.itertuples():
    if urllib.parse.urlparse(row.URL).scheme:
        vcsv_urls.append(row.URL)
        vcsv_hostnames.append(urllib.parse.urlparse(row.URL).hostname)
    else:
        url = "http://" + row.URL
        vcsv_urls.append(url)
        vcsv_hostnames.append(urllib.parse.urlparse(url).hostname)
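
# Crawl each verified .onion seed that has not been crawled before, then write
# the crawler state and the discovered third-party links back to disk.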

for i, url in enumerate(vcsv_urls):
    if not crawler_file[crawler_file['URL'] == url].empty:
        print_colors(f'[+] {url} has already been crawled. Skipping')
        continue

    if vcsv_hostnames[i] is None or not vcsv_hostnames[i].endswith('.onion'):
        print_colors(f'[+] Skipping over non-onion link in verified.csv file: {url}')
        continue
    crawl_url(url)

crawler_file.to_csv(args.crawler_file, index=False)
output_file.to_csv(args.output, index=False)
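
For reference, a minimal sketch of how the crawler gets wired up, assuming .env.sample ships TOR_HOST/TOR_PORT pointing at a local Tor SOCKS proxy; the exact values below are an assumption, not confirmed by this diff (TOR_HOST has to carry the proxy scheme, since it is concatenated directly into session.proxies):

# .env (assumed values)
TOR_HOST=socks5h://127.0.0.1
TOR_PORT=9050

# Run from scripts/ so that `from utils import ...` resolves; cap the crawl at
# 5 pages per verified .onion seed and write results under /srv/darknet-lantern/crawler/
cd /srv/darknet-lantern/scripts && python3 crawler.py --limit 5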