From 63d89b9b8b0e77c47db41e9c67e38542a1097f18 Mon Sep 17 00:00:00 2001 From: cynthia Date: Sat, 5 Apr 2025 14:41:26 +0000 Subject: [PATCH] add crawler dir for default crawler outputs --- .gitignore | 3 ++- scripts/crawler.py | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 73b66b1..f039455 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .git www/participants/** +crawler/** scripts/__pycache__/** .env -env/ \ No newline at end of file +env/ diff --git a/scripts/crawler.py b/scripts/crawler.py index 5d898c2..07858de 100644 --- a/scripts/crawler.py +++ b/scripts/crawler.py @@ -21,6 +21,9 @@ if os.path.isfile(urlpath): instancepath=rootpath+'www/participants/'+instance verifiedcsvfile=instancepath+'/verified.csv' blcsvfile=instancepath+'/blacklist.csv' +crawlerdir=instancepath+'/crawler' +if not os.path.exists(crawlerdir): + os.makedirs(crawlerdir) parser = argparse.ArgumentParser( prog='Lantern crawler', @@ -29,9 +32,11 @@ parser = argparse.ArgumentParser( parser.add_argument('-l', '--limit', help='Page crawl limit per .onion link.', type=int, default=10) parser.add_argument('-o', '--output', - help='Output CSV file for found .onion links', type=str, default='onion_crawler.csv') + help='Output CSV file for found .onion links', type=str, + default=os.path.join(crawlerdir, 'onion_crawler.csv')) parser.add_argument('-c', '--crawler-file', - help='Crawler CSV file to log .onion sites and the amount crawled', type=str, default='crawler.csv') + help='Crawler CSV file to log .onion sites and the amount crawled', type=str, + default=os.path.join(crawlerdir, 'crawler.csv')) parser.add_argument('-b', '--blacklist-file', help='Blacklist CSV files to filter out sites with forbidden words in them', type=str, default=blcsvfile) parser.add_argument('-V', '--verified-file', help='Input file to read for .onion links to crawl', type=str, default=verifiedcsvfile)