categorize simplex links and tor onion links

cynthia 2025-04-18 10:04:19 +01:00
parent bf40730292
commit 8ef1bb0910


@@ -27,8 +27,6 @@ instancepath=rootpath+'www/participants/'+instance
 verifiedcsvfile=instancepath+'/verified.csv'
 blcsvfile=instancepath+'/blacklist.csv'
 crawlerdir=rootpath+'/crawler'
-if not os.path.exists(crawlerdir):
-    os.makedirs(crawlerdir)
 parser = argparse.ArgumentParser(
     prog='Lantern crawler',
@@ -45,8 +43,13 @@ parser.add_argument('-c', '--crawler-file',
 parser.add_argument('-b', '--blacklist-file', help='Blacklist CSV files to filter out sites with forbidden words in them',
     type=str, default=blcsvfile)
 parser.add_argument('-V', '--verified-file', help='Input file to read for .onion links to crawl', type=str, default=verifiedcsvfile)
+parser.add_argument('-n', '--no-create-crawler-dir', help='Don\'t create the crawler directory automatically.', action='store_true')
 args = parser.parse_args()
+if not args.no_create_crawler_dir:
+    if not os.path.exists(crawlerdir):
+        os.makedirs(crawlerdir)
 script_abs_path = os.path.dirname(os.path.abspath(__file__))
 env_path = os.path.join(script_abs_path+"/.env")
 default_env_path = os.path.join(script_abs_path+"/.env.sample")
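Net effect of the two hunks above: the crawler directory is no longer created unconditionally at import time; creation now happens after argument parsing and can be suppressed with the new -n/--no-create-crawler-dir flag. A minimal standalone sketch of that opt-out pattern (the path and parser setup here are illustrative, not copied from the full script):

import argparse
import os

crawlerdir = '/tmp/lantern-crawler'  # illustrative path, not the repository's rootpath+'/crawler'

parser = argparse.ArgumentParser(prog='Lantern crawler')
parser.add_argument('-n', '--no-create-crawler-dir',
                    help='Don\'t create the crawler directory automatically.',
                    action='store_true')
args = parser.parse_args()

# Directory creation only runs when the flag was not given.
if not args.no_create_crawler_dir:
    if not os.path.exists(crawlerdir):
        os.makedirs(crawlerdir)

Invoked with -n (or --no-create-crawler-dir) the script leaves the filesystem untouched; without the flag the previous behaviour is preserved.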
@@ -78,7 +81,7 @@ def get_output_file():
     try:
         return pd.read_csv(args.output)
     except FileNotFoundError:
-        return pd.DataFrame(columns=["Hostname","URL","Name"])
+        return pd.DataFrame(columns=["Hostname","URL","Name", "Category"])

 def get_blacklist_file():
     try:
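With this hunk the fallback DataFrame gains a fourth "Category" column, so every row appended further down must now carry four values (Hostname, URL, Name, Category). A quick sketch of the new schema, assuming pandas is imported as in the script:

import pandas as pd

# New output schema used when args.output does not exist yet.
output_file = pd.DataFrame(columns=["Hostname", "URL", "Name", "Category"])
print(list(output_file.columns))   # ['Hostname', 'URL', 'Name', 'Category']

Note that the diff shows no migration step for CSVs written before this change; pd.read_csv will simply return such files with the original three columns.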
@@ -105,12 +108,12 @@ def add_urls(urls):
     for url in urls:
         parsed_url = urllib.parse.urlparse(url)
         if IsSimpleXChatroomValid(url) and not (output_file['URL'] == url).any():
-            output_file.loc[-1] = ["", url, "SimpleX Chatroom"]
+            output_file.loc[-1] = ["", url, "", "SimpleX Chatroom"]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
         elif IsSimpleXServerValid(url) and not (output_file['URL'] == url).any():
-            output_file.loc[-1] = ["", url, "SimpleX Server"]
+            output_file.loc[-1] = ["", url, "", "SimpleX Server"]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
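SimpleX links have no .onion hostname and no page title to scrape, so both hunks above write an empty Hostname and Name and record only the URL plus a category of "SimpleX Chatroom" or "SimpleX Server". The loc[-1] / index += 1 / sort_index() sequence is the script's way of prepending a row; a small self-contained sketch of that idiom (the helper name and example URL are made up for illustration):

import pandas as pd

output_file = pd.DataFrame(columns=["Hostname", "URL", "Name", "Category"])

def prepend_row(df, row):
    # Write the row at index -1, shift all indices up by one, then sort,
    # which leaves the new row at position 0.
    df.loc[-1] = row
    df.index += 1
    return df.sort_index()

url = "simplex-chatroom-invite-link"   # placeholder, not a real invite link
output_file = prepend_row(output_file, ["", url, "", "SimpleX Chatroom"])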
@@ -123,13 +126,13 @@ def add_urls(urls):
             req = session.get(url)
         except requests.exceptions.ConnectionError:
             print_colors(f'[E] Dead 3rd party link: {url}. Adding anyway')
-            output_file.loc[-1] = [parsed_url.hostname, url, ""]
+            output_file.loc[-1] = [parsed_url.hostname, url, "", "Tor Hidden Service"]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue

         if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
-            output_file.loc[-1] = [parsed_url.hostname, url, ""]
+            output_file.loc[-1] = [parsed_url.hostname, url, "", "Tor Hidden Service"]
         else:
             forbidden = False
             html = BeautifulSoup(req.text, features="lxml")
@@ -141,7 +144,7 @@ def add_urls(urls):
                 forbidden = True
             if forbidden:
                 continue
-            output_file.loc[-1] = [parsed_url.hostname, url, repr(title)[1:-1]]
+            output_file.loc[-1] = [parsed_url.hostname, url, repr(title)[1:-1], "Tor Hidden Service"]
             output_file.index += 1
             output_file = output_file.sort_index()
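Every .onion result, whether the link is dead, non-HTML, or successfully titled, is now tagged "Tor Hidden Service", which makes the output CSV filterable by link type afterwards. A hedged example of such a query (the file name is only an assumption; substitute whatever path --output points at):

import pandas as pd

# 'crawler.csv' is an assumed output path, not taken from this diff.
df = pd.read_csv("crawler.csv")

onions  = df[df["Category"] == "Tor Hidden Service"]
simplex = df[df["Category"].isin(["SimpleX Chatroom", "SimpleX Server"])]

print(f"{len(onions)} onion services, {len(simplex)} SimpleX links")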