# blog-contributions/convert_old_blog.py
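"""Convert posts from the old Nihilism OpSec blog (static HTML) into
Hugo/PaperMod markdown: assets are copied over, the HTML body is turned
into markdown with html2text, XMR donation addresses are pulled out with
a regex, and a local Ollama model extracts the author and date."""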
import os
import re
import sys
import json
import shutil
import requests
import html2text
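# requests and html2text are third-party packages (pip install requests html2text)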

OLLAMA_HOST = 'http://ollama:11434'  # no trailing slash, endpoint paths are appended below
OLLAMA_PROMPT = """Analyze the blog post and answer the following questions:
* Who wrote it? (usually occurs at the beginning)
* When was it written? (2001-12-30 format)
The response format must be JSON. You cannot output anything other than JSON. If an answer can't be found, use a `null` value. Here's an example of how the output should look:
```
{"author": "nihilist", "date": "2024-03-01"}
```
Important: do not output anything else!
"""
NIHILIST_XMR = '8AUYjhQeG3D5aodJDtqG499N5jXXM71gYKD8LgSsFB9BUV1o7muLv3DXHoydRTK4SZaaUBq4EAUqpZHLrX2VZLH71Jrd9k8'
# make generation somewhat deterministic
OLLAMA_SEED = 17
OLLAMA_MODEL = 'gemma3:12b'

def is_webp(fpath: str) -> bool:
    # WebP files are RIFF containers: bytes 0-3 are b'RIFF', bytes 8-11 are b'WEBP'
    # (matching b'WEBPVP8L' only would miss lossy and extended-format WebP files)
    with open(fpath, 'rb') as f:
        header = f.read(12)
    return header[:4] == b'RIFF' and header[8:12] == b'WEBP'

def html_to_papermod(html_data: str) -> str:
    # convert the raw HTML post into markdown
    h = html2text.HTML2Text()
    # disable wrapping so html2text doesn't break lines at an arbitrary width
    h.body_width = 0
    return h.handle(html_data)

def extract_blog_info(html_string: str) -> tuple[str | None, str | None]:
    # the old layout places the author in a <ba> tag right before the <h1> title,
    # e.g. <p><img ...><ba>author</ba>...</p> ... <h1>Post Title</h1>
    match = re.search(r'<p><img.*?<ba>(.*?)</ba>.*?<h1>(.*?)</h1>', html_string, re.DOTALL)
    user = None
    title = None
    if match:
        user = match.group(1).strip()
        title = match.group(2).strip()
    return user, title

def extract_git_issue_number(html_string: str) -> int:
    # posts link to their gitea issue like <a href=".../issues/4">git issue</a>;
    # returns 0 when no issue link is found
    match = re.search(r'issues/(\d+)">git issue<', html_string)
    if match:
        return int(match.group(1))
    return 0

def extract_xmr_address(md_string: str) -> str | None:
    # Monero addresses are 95-character base58 strings (no 0, O, I or l)
    xmrs = re.findall(r":_.*([1-9A-HJ-NP-Za-km-z]{95})", md_string)
    print(xmrs)
    try:
        # found 2 xmr addresses: drop the blog owner's, keep the author's
        if len(xmrs) == 2:
            xmrs.remove(NIHILIST_XMR)
    except ValueError:
        # the owner's address wasn't among them, keep both candidates
        pass
    if len(xmrs) != 0:
        return xmrs[0]
    return None

def extract_blog_info_ai(md_string: str) -> dict:
    # ask the local Ollama model for the author and date;
    # returns the raw API response dict
    print(f"Extracting info from {len(md_string)} characters.")
    num_ctx = 2048
    r = requests.post(OLLAMA_HOST + '/api/generate', json={
        "model": OLLAMA_MODEL,
        "system": OLLAMA_PROMPT,
        "prompt": md_string[:1024],  # the first 1024 characters are enough, metadata sits at the top
        "format": "json",
        "options": {"num_ctx": num_ctx, "seed": OLLAMA_SEED},
        "stream": False
    })
    resp = r.json()
    if resp.get('prompt_eval_count', 0) == num_ctx:
        print('WARNING: LLM context saturated, may give wrong answer')
    return resp
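# With "stream": False, /api/generate returns a single JSON object whose
# 'response' field holds the model output string (parsed with json.loads in
# walk_all_blogs) and whose 'prompt_eval_count' reports the prompt tokens used.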

# remove the unnecessary header and footer, the git issue reference, and some other cruft
def do_markdown_fixes(md_string: str) -> str:
    # there are 3 paragraphs in the footer: Nihilism, My Links, About
    # (some posts customize these), so strip the last three '#### ' headings
    foot_str = "#### "
    for _ in range(3):
        found_at = md_string.rfind(foot_str)
        if found_at >= 0:
            md_string = md_string[:found_at]
        else:
            break
    # remove the git issue reference block from the post
    git_regex = r'!\[\]\(\.\.\/logos\/daturagit\.png\).*?\s*directly!'
    md_string = re.sub(git_regex, "", md_string, flags=re.DOTALL)
    # remove the header
    header_regex = r'\[The Nihilism Opsec Blog\].*?\s*\(\.\.\/index\.html\)'
    md_string = re.sub(header_regex, "", md_string, flags=re.DOTALL)
    # remove the stray backticks html2text sometimes leaves after code blocks,
    # keeping a newline so the surrounding lines aren't merged together
    md_string = md_string.replace('\n`\n', '\n')
    # to make mkdocs happy, rewrite html links to md; may break some links
    # if an external website uses index.html
    md_string = md_string.replace('/index.html', '/index.md')
    return md_string

def cut_until_title(md_string: str) -> str:
    # drop everything before the first markdown title; find() returns -1
    # when there is no title, in which case keep the string unchanged
    idx = md_string.find('# ')
    return md_string[idx:] if idx >= 0 else md_string
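
# The generated front matter looks like this (illustrative values):
# ---
# author: nihilist
# date: 2024-03-01
# gitea_url: "http://git.nowhere[...].onion/nihilist/blog-contributions/issues/4"
# xmr: 8AUYjh[...]
# ---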

def make_papermod_post(post_dir: str, md_string: str, blog_info: dict[str, str]) -> None:
    header = f"""---
author: {blog_info['author']}
date: {blog_info['date']}
gitea_url: "http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/blog-contributions/issues/{blog_info.get('git', 0)}"
xmr: {blog_info['xmr']}
---
"""
    with open(os.path.join(post_dir, 'index.md'), 'w', encoding='utf-8') as f:
        f.write(header)
        f.write(md_string)
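
# Mirror the old blog tree under ./docs/opsec: copy image/video assets as-is
# and convert each post's index.html into a PaperMod markdown post.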

def walk_all_blogs(old_opsec_path: str) -> None:
    for root, _, files in os.walk(old_opsec_path):
        # recreate the post's path relative to the opsec/ directory
        rel_name = os.path.relpath(root, old_opsec_path)
        post_dir = os.path.join('./docs/opsec', rel_name)
        os.makedirs(post_dir, exist_ok=True)
        for fi in files:
            # skip the top-level index site
            if os.path.normpath(root) == os.path.normpath(old_opsec_path) and fi == 'index.html':
                continue
            if fi.endswith(('.png', '.jpg', '.jpeg', '.mp4')):
                shutil.copy(os.path.join(root, fi), os.path.join(post_dir, fi))
                continue
            # only accept index.html at this point
            if not fi.endswith('.html'):
                continue
            with open(os.path.join(root, fi), 'r', encoding='utf-8') as f:
                fc = f.read()
            print(os.path.basename(root), extract_blog_info(fc))
            blog_info = {}
            blog_md_raw = html_to_papermod(fc)
            blog_info['git'] = extract_git_issue_number(fc)
            blog_info['xmr'] = extract_xmr_address(blog_md_raw)
            # remove unnecessary stuff we'll include anyway in hugo
            blog_md = do_markdown_fixes(blog_md_raw)
            ai_extract = extract_blog_info_ai(blog_md)
            print(ai_extract.get('response'), ai_extract.get('prompt_eval_count'))
            blog_md = cut_until_title(blog_md)
            try:
                blog_info.update(json.loads(ai_extract.get('response')))
                if blog_info['date'] in ('0000-00-00', None):
                    blog_info['date'] = '2001-01-30'
                # in case the model acts dumb and uses slashes in the date
                blog_info['date'] = blog_info['date'].replace('/', '-')
                make_papermod_post(post_dir, blog_md, blog_info)
            except Exception as e:
                # keep converting the remaining posts even if one fails
                print(e)

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('Usage: convert_old_blog.py /path/to/blog-contributions/opsec')
        sys.exit(1)
    old_opsec_path = sys.argv[1]
    walk_all_blogs(old_opsec_path)
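
# Example:
#   python3 convert_old_blog.py ~/blog-contributions/opsec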