Mirror of http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/darknet-lantern.git (synced 2025-05-16 20:26:58 +00:00)
categorize simplex links and tor onion links
This commit is contained in:
parent bf40730292
commit 8ef1bb0910
1 changed file with 11 additions and 8 deletions
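In short, the diff below adds a fourth "Category" column to the crawler's output (values "SimpleX Chatroom", "SimpleX Server", and "Tor Hidden Service") and a -n/--no-create-crawler-dir flag so the crawler directory is no longer created unconditionally. A hypothetical invocation of the new flag, assuming the changed file is the crawler script (its filename is not shown on this page; crawler.py is only a placeholder):

    python3 crawler.py --no-create-crawler-dir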
@@ -27,8 +27,6 @@ instancepath=rootpath+'www/participants/'+instance
 verifiedcsvfile=instancepath+'/verified.csv'
 blcsvfile=instancepath+'/blacklist.csv'
 crawlerdir=rootpath+'/crawler'
-if not os.path.exists(crawlerdir):
-    os.makedirs(crawlerdir)
 
 parser = argparse.ArgumentParser(
     prog='Lantern crawler',
@@ -45,8 +43,13 @@ parser.add_argument('-c', '--crawler-file',
 parser.add_argument('-b', '--blacklist-file', help='Blacklist CSV files to filter out sites with forbidden words in them',
     type=str, default=blcsvfile)
 parser.add_argument('-V', '--verified-file', help='Input file to read for .onion links to crawl', type=str, default=verifiedcsvfile)
+parser.add_argument('-n', '--no-create-crawler-dir', help='Don\'t create the crawler directory automatically.', action='store_true')
 args = parser.parse_args()
+
+if not args.no_create_crawler_dir:
+    if not os.path.exists(crawlerdir):
+        os.makedirs(crawlerdir)
 
 script_abs_path = os.path.dirname(os.path.abspath(__file__))
 env_path = os.path.join(script_abs_path+"/.env")
 default_env_path = os.path.join(script_abs_path+"/.env.sample")
@@ -78,7 +81,7 @@ def get_output_file():
     try:
         return pd.read_csv(args.output)
     except FileNotFoundError:
-        return pd.DataFrame(columns=["Hostname","URL","Name"])
+        return pd.DataFrame(columns=["Hostname","URL","Name", "Category"])
 
 def get_blacklist_file():
     try:
@@ -105,12 +108,12 @@ def add_urls(urls):
     for url in urls:
         parsed_url = urllib.parse.urlparse(url)
         if IsSimpleXChatroomValid(url) and not (output_file['URL'] == url).any():
-            output_file.loc[-1] = ["", url, "SimpleX Chatroom"]
+            output_file.loc[-1] = ["", url, "", "SimpleX Chatroom"]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
         elif IsSimpleXServerValid(url) and not (output_file['URL'] == url).any():
-            output_file.loc[-1] = ["", url, "SimpleX Server"]
+            output_file.loc[-1] = ["", url, "", "SimpleX Server"]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
@@ -123,13 +126,13 @@ def add_urls(urls):
             req = session.get(url)
         except requests.exceptions.ConnectionError:
             print_colors(f'[E] Dead 3rd party link: {url}. Adding anyway')
-            output_file.loc[-1] = [parsed_url.hostname, url, ""]
+            output_file.loc[-1] = [parsed_url.hostname, url, "", "Tor Hidden Service"]
             output_file.index += 1
             output_file = output_file.sort_index()
             continue
 
         if "Content-Type" not in req.headers or "text/html" not in req.headers["Content-Type"]:
-            output_file.loc[-1] = [parsed_url.hostname, url, ""]
+            output_file.loc[-1] = [parsed_url.hostname, url, "", "Tor Hidden Service"]
         else:
             forbidden = False
             html = BeautifulSoup(req.text, features="lxml")
@@ -141,7 +144,7 @@ def add_urls(urls):
                     forbidden = True
             if forbidden:
                 continue
-            output_file.loc[-1] = [parsed_url.hostname, url, repr(title)[1:-1]]
+            output_file.loc[-1] = [parsed_url.hostname, url, repr(title)[1:-1], "Tor Hidden Service"]
         output_file.index += 1
         output_file = output_file.sort_index()
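For illustration, here is a minimal standalone sketch of the prepend idiom used throughout the diff, now carrying the extra "Category" field. The prepend_row helper, the sample URLs, and the sample rows are hypothetical and not part of the repository:

    import pandas as pd

    # Output schema after this commit: a fourth "Category" column.
    output_file = pd.DataFrame(columns=["Hostname", "URL", "Name", "Category"])

    def prepend_row(df, hostname, url, name, category):
        # Same idiom as in the diff: write the row at index -1, shift every
        # index up by one, then sort so the new row sits at the top.
        df.loc[-1] = [hostname, url, name, category]
        df.index += 1
        return df.sort_index()

    # Hypothetical rows for the three categories assigned in the diff.
    output_file = prepend_row(output_file, "", "<simplex chatroom link>", "", "SimpleX Chatroom")
    output_file = prepend_row(output_file, "", "<simplex server link>", "", "SimpleX Server")
    output_file = prepend_row(output_file, "example.onion", "http://example.onion/", "Example", "Tor Hidden Service")
    print(output_file)

After each prepend the newest entry ends up at index 0, so the most recently crawled link appears first when the frame is written out.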