from bs4 import BeautifulSoup
import pandas as pd
import requests
import urllib.parse
import argparse
import os
import pwd
import re

from utils import print_colors, IsUrlValid
from dotenv import load_dotenv

# Make default parameters for arguments
rootpath = '/srv/darknet-lantern/'
urlpath = pwd.getpwuid(os.getuid()).pw_dir + "/.darknet_participant_url"
instance = ""
if os.path.isfile(urlpath):
    with open(urlpath) as f:
        instance = f.read().rstrip()

instancepath = rootpath + 'www/participants/' + instance
verifiedcsvfile = instancepath + '/verified.csv'
blcsvfile = instancepath + '/blacklist.csv'
crawlerdir = rootpath + '/crawler'
if not os.path.exists(crawlerdir):
    os.makedirs(crawlerdir)

parser = argparse.ArgumentParser(
    prog='Lantern crawler',
    description='Crawls .onion sites for links to more .onion sites')

parser.add_argument('-l', '--limit',
    help='Page crawl limit per .onion link.', type=int, default=10)
parser.add_argument('-o', '--output',
    help='Output CSV file for found .onion links', type=str,
    default=os.path.join(crawlerdir, 'onion_crawler.csv'))
parser.add_argument('-c', '--crawler-file',
    help='Crawler CSV file to log .onion sites and the amount crawled', type=str,
    default=os.path.join(crawlerdir, 'crawler.csv'))
parser.add_argument('-b', '--blacklist-file',
    help='Blacklist CSV file to filter out sites with forbidden words in them',
    type=str, default=blcsvfile)
parser.add_argument('-V', '--verified-file',
    help='Input file to read for .onion links to crawl',
    type=str, default=verifiedcsvfile)
args = parser.parse_args()

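# Load the Tor connection settings (TOR_HOST / TOR_PORT) from a .env file
# next to this script, falling back to the bundled .env.sample.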
script_abs_path = os.path.dirname(os.path.abspath(__file__))
env_path = os.path.join(script_abs_path, ".env")
default_env_path = os.path.join(script_abs_path, ".env.sample")

if os.path.exists(env_path):
    load_dotenv(dotenv_path=env_path)
else:
    load_dotenv(dotenv_path=default_env_path)

tor_host = os.getenv("TOR_HOST")
tor_port = os.getenv("TOR_PORT")
session = requests.session()
# Only host and port are interpolated here, so the proxy scheme has to be
# part of TOR_HOST itself.
session.proxies = {
    'http': f'{tor_host}:{tor_port}',
    'https': f'{tor_host}:{tor_port}'
}
# Set a user agent for the crawler as well
session.headers.update({'User-Agent': 'LanternCrawler'})

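# Helpers that load the working CSV files, returning an empty DataFrame
# (or None for the blacklist) when the file does not exist yet.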
def get_crawler_file():
    try:
        # try to read the CSV file
        return pd.read_csv(args.crawler_file)
    except FileNotFoundError:
        # make a new empty crawler file
        return pd.DataFrame(columns=["URL", "Counter"])

def get_output_file():
    try:
        return pd.read_csv(args.output)
    except FileNotFoundError:
        return pd.DataFrame(columns=["Hostname", "URL", "Name"])

def get_blacklist_file():
    try:
        return pd.read_csv(args.blacklist_file)
    except FileNotFoundError:
        return None

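# The crawler file logs how many pages have been crawled per verified onion,
# the output file collects discovered third-party onions, and the blacklist
# file supplies forbidden words used to reject page titles.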
# get list of .onion links from the verified.csv file
verified_csv_file = pd.read_csv(args.verified_file)
crawler_file = get_crawler_file()
output_file = get_output_file()
blacklist_file = get_blacklist_file()
vcsv_urls = []
vcsv_hostnames = []
crawled_urls = []

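# CrawlerResult groups the links extracted from one page: first-party links
# (same hostname, queued for further crawling) and third-party links
# (handed to add_urls).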
class CrawlerResult:
    def __init__(self, firstp_urls, thirdp_urls):
        self.firstp_urls = firstp_urls
        self.thirdp_urls = thirdp_urls

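# add_urls records newly discovered third-party onions in the output
# DataFrame. Each candidate is fetched once to grab its page title; hosts
# already present in the output or in verified.csv are skipped, titles
# containing a blacklisted word are rejected, and dead links are still
# added with an empty name.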
def add_urls(urls):
    global output_file
    for url in urls:
        parsed_url = urllib.parse.urlparse(url)
        if (output_file['Hostname'] == parsed_url.hostname).any() or parsed_url.hostname is None or parsed_url.hostname in vcsv_hostnames or not parsed_url.hostname.endswith(".onion"):
            continue

        # Get information about the URL
        print_colors(f'[+] Querying {url} for information.')
        try:
            req = session.get(url)
        except requests.exceptions.ConnectionError:
            print_colors(f'[E] Dead 3rd party link: {url}. Adding anyway')
            output_file.loc[-1] = [parsed_url.hostname, url, ""]
            output_file.index += 1
            output_file = output_file.sort_index()
            continue

        if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
            output_file.loc[-1] = [parsed_url.hostname, url, ""]
        else:
            forbidden = False
            html = BeautifulSoup(req.text, features="lxml")
            # Guard against missing or empty <title> tags
            title = html.title.string if html.title is not None and html.title.string is not None else ""
            if blacklist_file is not None:
                for row in blacklist_file.itertuples():
                    if row[1].lower() in title.lower():
                        print_colors('[+] Forbidden word found. Rejecting.')
                        forbidden = True
            if forbidden:
                continue
            output_file.loc[-1] = [parsed_url.hostname, url, repr(title)[1:-1]]
        output_file.index += 1
        output_file = output_file.sort_index()

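# extract_urls_html parses an HTML page with BeautifulSoup and sorts every
# <a href> into first-party (same hostname) and third-party links.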
def extract_urls_html(url, text):
    html = BeautifulSoup(text, features="lxml")
    hostname_url = urllib.parse.urlparse(url).hostname
    result = CrawlerResult([], [])

    for link in html.find_all('a'):
        if not link.has_attr("href"):
            print_colors('[E] Could not find link in <a> element, Skipping link')
            continue

        joined_url = urllib.parse.urljoin(url, link["href"])
        jurl_parsed = urllib.parse.urlparse(joined_url)

        print_colors(f'[D] Joined URL: {joined_url}')
        # Check if the URL is a .onion link or not even a web link
        if not IsUrlValid(joined_url):
            continue

        print_colors(f'[+] Found url: {joined_url}')

        # Check if the URL is a first-party link
        if jurl_parsed.hostname == hostname_url:
            if joined_url not in result.firstp_urls:
                result.firstp_urls.append(joined_url)
        else:
            if joined_url not in result.thirdp_urls:
                result.thirdp_urls.append(joined_url)
    return result

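# extract_urls_txt pulls .onion URLs out of a plain-text response with a
# regex; matches are only ever treated as third-party links.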
def extract_urls_txt(url, text):
    hostname_url = urllib.parse.urlparse(url).hostname
    result = CrawlerResult([], [])
    url_regex = re.compile(r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.onion\b(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*)")

    # TODO: It won't find first-party URLs here, since the regex doesn't capture the path.
    for found_url in url_regex.findall(text):
        # Compare hostnames so first-party matches are not re-added as third-party links
        if hostname_url != urllib.parse.urlparse(found_url).hostname:
            if found_url not in result.thirdp_urls:
                result.thirdp_urls.append(found_url)
    return result

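# crawl_url crawls a single verified onion: it follows first-party links
# until the page limit is reached, passes third-party links to add_urls,
# and records the number of pages crawled in the crawler file.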
def crawl_url(url):
    global crawler_file
    # List of urls to crawl.
    urls = [url]
    counter = 0
    # Resume from the recorded page count if this URL is already in the crawler file
    existing_counter = crawler_file.loc[crawler_file['URL'] == url, 'Counter']
    if not existing_counter.empty:
        counter = int(existing_counter.iloc[0])

    while counter < args.limit and len(urls) != 0:
        cur_url = urls.pop()
        cur_url_scheme = urllib.parse.urlparse(cur_url).scheme

        if cur_url in crawled_urls:
            # Skip already-crawled urls
            print_colors(f'[D] {cur_url} has already been crawled. Skipping')
            continue
        elif cur_url_scheme != 'http':
            print_colors(f'[D] Unknown scheme found, Skipping: {cur_url}')
            continue

        print_colors(f'[+] Crawling {cur_url}')
        try:
            req = session.get(cur_url)
        except requests.exceptions.ConnectionError:
            print_colors(f'[E] Failed to connect to {cur_url}')
            continue

        crawled_urls.append(cur_url)
        result = CrawlerResult([], [])

        # Determine the type of response from the headers
        if "Content-Type" not in req.headers:
            print_colors('[+] No content type found, Not extracting links.')
        elif "text/plain" in req.headers["Content-Type"]:
            result = extract_urls_txt(cur_url, req.text)
        elif "text/html" in req.headers["Content-Type"]:
            result = extract_urls_html(cur_url, req.text)
        else:
            print_colors(f'[+] Unknown content type encountered: {req.headers["Content-Type"]}')

        # Queue unseen first-party links and record third-party links
        for found_url in result.firstp_urls:
            if found_url not in urls:
                urls.append(found_url)
        add_urls(result.thirdp_urls)
        counter += 1

    # Refresh counter in CSV file
    if not crawler_file[crawler_file['URL'] == url].empty:
        crawler_file.loc[crawler_file['URL'] == url, 'Counter'] = counter
    else:
        crawler_file.loc[-1] = [url, counter]
        crawler_file.index += 1
        crawler_file = crawler_file.sort_index()

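# Normalise the verified.csv entries: prepend http:// when a URL has no
# scheme and remember each hostname so add_urls can skip already-verified sites.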
for row in verified_csv_file.itertuples():
    if urllib.parse.urlparse(row.URL).scheme:
        vcsv_urls.append(row.URL)
        vcsv_hostnames.append(urllib.parse.urlparse(row.URL).hostname)
    else:
        url = "http://" + row.URL
        vcsv_urls.append(url)
        vcsv_hostnames.append(urllib.parse.urlparse(url).hostname)

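# Crawl every verified onion that is not already in the crawler file, then
# write the crawler state and the discovered onions back to disk.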
for i, url in enumerate(vcsv_urls):
    if not crawler_file[crawler_file['URL'] == url].empty:
        print_colors(f'[+] {url} has already been crawled. Skipping')
        continue

    if vcsv_hostnames[i] is None or not vcsv_hostnames[i].endswith('.onion'):
        print_colors(f'[+] Skipping over non-onion link in verified.csv file: {url}')
        continue
    crawl_url(url)

crawler_file.to_csv(args.crawler_file, index=False)
output_file.to_csv(args.output, index=False)