add avif compression script

2025-07-01 22:16:41 +00:00 · 2025-05-09 01:14:22 +02:00 · 2025-05-09 01:14:22 +02:00 · b22c1fdbc7
commit b22c1fdbc7
parent e981184453
8 changed files with 63 additions and 0 deletions
--- a/scripts/blogfix.sh
+++ b/scripts/blogfix.sh
@ -0,0 +1,4 @@
+#!/bin/sh
+
+# url-encode '%' char: https://github.com/gohugoio/hugo/issues/4586
+# Rewrites every "#51%_attack" anchor in the monerop2pool post to
+# "#51%25_attack", editing the file in place. The path is relative, so run
+# this from the directory that contains ./content.
+sed -i 's/#51%_attack/#51%25_attack/g' ./content/posts/monerop2pool/index.md
--- a/scripts/convert_old_blog.py
+++ b/scripts/convert_old_blog.py
@ -0,0 +1,197 @@
+import os
+import re
+import sys
+import json
+import shutil
+import requests
+import html2text
+
+# Base URL of the Ollama server; extract_blog_info_ai() appends '/api/generate' to it.
+OLLAMA_HOST = 'http://ollama:11434/'
+# System prompt sent with every request: asks for the post's author and date as JSON.
+OLLAMA_PROMPT = """Analyze the blog post and give answer to following questions:
+*   Who wrote it? (usually occurs at the beginning)
+*   When was it written? (2001-12-30 format)
+The response format should be JSON. You can not output anything else other than JSON. If the response can't be found, add `null` value. Here's an example how output should look like:
+```
+{"author": "nihilist", "date": "2024-03-01"}
+```
+Important: do not output anything else!
+"""
+# Address that extract_xmr_address() drops when a post lists exactly two addresses.
+NIHILIST_XMR = '8AUYjhQeG3D5aodJDtqG499N5jXXM71gYKD8LgSsFB9BUV1o7muLv3DXHoydRTK4SZaaUBq4EAUqpZHLrX2VZLH71Jrd9k8'
+# make generation somewhat deterministic
+OLLAMA_SEED = 17
+# Model name sent with every request.
+OLLAMA_MODEL = 'gemma3:12b'
+
+def is_webp(fpath: str) -> bool:
+    """Return True if the file at *fpath* carries the ``WEBPVP8L`` signature.
+
+    Only bytes 8..15 of the file are compared, so any file whose bytes at
+    that offset differ (including files shorter than 16 bytes) yields False.
+    """
+    signature = b'WEBPVP8L'
+    with open(fpath, 'rb') as handle:
+        # the signature is expected right after the first 8 bytes
+        handle.seek(8)
+        header = handle.read(len(signature))
+    return header == signature
+
+def html_to_papermod(html_data: str) -> str:
+    """Convert an HTML document to Markdown using html2text.
+
+    ``body_width`` is set to 0 so html2text does not re-wrap lines.
+    """
+    converter = html2text.HTML2Text()
+    converter.body_width = 0
+    markdown = converter.handle(html_data)
+    return markdown
+
+def extract_blog_info(html_string: str) -> "tuple[str | None, str | None]":
+    """Extract the author and title from a post's HTML.
+
+    The author is the text of the first ``<ba>`` element that follows a
+    ``<p><img`` sequence, the title is the text of the first ``<h1>`` after it.
+
+    Returns:
+        ``(user, title)`` with surrounding whitespace stripped, or
+        ``(None, None)`` when the expected markup is not present.
+    """
+    match = re.search(r'<p><img.*?<ba>(.*?)</ba>.*?<h1>(.*?)</h1>', html_string, re.DOTALL)
+    if match is None:
+        return None, None
+
+    return match.group(1).strip(), match.group(2).strip()
+
+def extract_git_issue_number(html_string: str) -> int:
+    """Return the issue number from the page's "git issue" link, or 0 if absent.
+
+    The number is taken from the first ``issues/<digits>">git issue<`` match.
+    """
+    found = re.search(r'issues/(\d+)">git issue<', html_string)
+    return int(found.group(1)) if found else 0
+
+def extract_xmr_address(md_string: str) -> "str | None":
+    """Return the most likely author address found in the Markdown, if any.
+
+    A candidate is the last 95-character base58 run on a line, provided the
+    line contains ``:_`` somewhere before it. When exactly two candidates are
+    found and one of them is NIHILIST_XMR, that one is discarded so the other
+    (presumably the author's) is returned.
+
+    Returns:
+        The first remaining candidate, or None when nothing matched.
+    """
+    xmrs = re.findall(r":_.*([1-9A-HJ-NP-Za-km-z]{95})", md_string)
+    print(xmrs)
+
+    # found 2 xmr addresses, extract the authors probable one
+    if len(xmrs) == 2 and NIHILIST_XMR in xmrs:
+        xmrs.remove(NIHILIST_XMR)
+
+    if xmrs:
+        return xmrs[0]
+
+    return None
+
+def extract_blog_info_ai(md_string: str) -> dict:
+    """Ask the Ollama model for the author and date of a post.
+
+    Only the first 1024 characters of *md_string* are sent as the prompt,
+    together with OLLAMA_PROMPT as the system prompt and a fixed seed.
+
+    Returns:
+        The decoded JSON body of the ``/api/generate`` reply. The model's
+        answer is the JSON string stored under its ``'response'`` key.
+    """
+    print(f"Extracting info from {len(md_string)} bytes.")
+
+    num_ctx = 2048
+
+    # OLLAMA_HOST may end with '/'; strip it so the request path is
+    # '/api/generate' and not '//api/generate'.
+    url = OLLAMA_HOST.rstrip('/') + '/api/generate'
+
+    r = requests.post(url, json={
+      "model": OLLAMA_MODEL,
+      "system": OLLAMA_PROMPT,
+      "prompt": md_string[:1024],  # the first 1024 characters are enough
+      "format": "json",
+      "options": {"num_ctx": num_ctx, "seed": OLLAMA_SEED},
+      "stream": False
+    })
+
+    resp = r.json()
+    # a prompt that fills the whole context window was probably truncated
+    if resp.get('prompt_eval_count', 0) == num_ctx:
+        print('WARNING: LLM context saturated, may give wrong answer')
+
+    return resp
+
+# remove unnecessary header and footer, git issue reference, some other stuff
+def do_markdown_fixes(md_string: str) -> str:
+    """Strip site boilerplate from a converted post and fix up its links.
+
+    In order: cut off up to three trailing ``#### `` sections, remove the git
+    issue notice and the blog header link, drop stray single-backtick lines,
+    and point ``/index.html`` links at ``/index.md``.
+    """
+    # The footer normally has 3 paragraphs (Nihilism, My Links, About), though
+    # some posts customize it; each pass cuts from the last "#### " onwards.
+    cuts_left = 3
+    while cuts_left > 0:
+        cut_at = md_string.rfind("#### ")
+        if cut_at < 0:
+            break
+        md_string = md_string[:cut_at]
+        cuts_left -= 1
+
+    boilerplate_patterns = (
+        # git issue reference
+        r'!\[\]\(\.\.\/logos\/daturagit\.png\).*?\s*directly!',
+        # header link back to the blog index
+        r'\[The Nihilism Opsec Blog\].*?\s*\(\.\.\/index\.html\)',
+    )
+    for pattern in boilerplate_patterns:
+        md_string = re.sub(pattern, "", md_string, flags=re.DOTALL)
+
+    # html2text sometimes leaves a lone backtick line after code blocks
+    without_backticks = md_string.replace('\n`\n', '')
+
+    # To make mkdocs happy, links to html become links to md. This may break
+    # links to external websites that use index.html.
+    return without_backticks.replace('/index.html', '/index.md')
+
+def cut_until_title(md_string: str) -> str:
+    """Drop everything before the first ``'# '`` in *md_string*.
+
+    The search is a plain substring match, so it also hits the tail of a
+    deeper heading such as ``'## '``. When there is no ``'# '`` at all the
+    text is returned unchanged.
+    """
+    title_at = md_string.find('# ')
+    if title_at < 0:
+        # find() returned -1; slicing from -1 would keep only the last character
+        return md_string
+    return md_string[title_at:]
+
+def make_papermod_post(post_dir: str, md_string: str, blog_info: dict[str, str]) -> None:
+    """Write ``index.md`` in *post_dir*: a front matter header followed by *md_string*.
+
+    Args:
+        post_dir: existing directory that receives ``index.md``.
+        md_string: Markdown body of the post.
+        blog_info: must contain ``'author'``, ``'date'`` and ``'xmr'``;
+            ``'git'`` is optional and defaults to issue 0.
+
+    Raises:
+        KeyError: if a required key is missing from *blog_info*.
+    """
+    header = f"""---
+author: {blog_info['author']}
+date: {blog_info['date']}
+gitea_url: "http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/blog-contributions/issues/{blog_info.get('git', 0)}"
+xmr: {blog_info['xmr']}
+---
+"""
+    # the context manager closes the file even if a write fails
+    with open(os.path.join(post_dir, 'index.md'), 'w', encoding='utf-8') as f:
+        f.write(header)
+        f.write(md_string)
+
+def walk_all_blogs(old_opsec_path: str) -> None:
+    """Convert every post below *old_opsec_path* into ``./docs/opsec``.
+
+    The directory layout is mirrored. Images and videos are copied as they
+    are, each ``.html`` page is converted to Markdown, cleaned up, and written
+    as ``index.md`` with a front matter header. The top-level ``index.html``
+    is skipped. A post whose metadata cannot be extracted is reported and
+    skipped; the walk continues.
+    """
+    for root, _, files in os.walk(old_opsec_path):
+
+        # Path of this directory relative to the walked root. relpath() gives
+        # the same result whether or not the argument ends with a slash.
+        rel_name = os.path.relpath(root, old_opsec_path)
+        is_top_level = rel_name == os.curdir
+        if is_top_level:
+            rel_name = ''
+        post_dir = os.path.join('./docs/opsec', rel_name)
+        os.makedirs(post_dir, exist_ok=True)
+
+        for fi in files:
+
+            # skip the index site
+            if is_top_level and fi == 'index.html':
+                continue
+
+            if fi.endswith(('.png', '.jpg', '.jpeg', '.mp4')):
+                shutil.copy(os.path.join(root, fi), os.path.join(post_dir, fi))
+                continue
+
+            # anything that is neither an asset nor an html page is ignored
+            if not fi.endswith('.html'):
+                continue
+
+            html_path = os.path.join(root, fi)
+            with open(html_path, 'r', encoding='utf-8') as f:
+                fc = f.read()
+            print(os.path.basename(root), extract_blog_info(fc))
+
+            blog_info = {}
+            blog_md_raw = html_to_papermod(fc)
+            blog_info['git'] = extract_git_issue_number(fc)
+            blog_info['xmr'] = extract_xmr_address(blog_md_raw)
+
+            # remove unnecessary stuff we'll include anyway in hugo
+            blog_md = do_markdown_fixes(blog_md_raw)
+
+            ai_extract = extract_blog_info_ai(blog_md)
+            print(ai_extract.get('response'), ai_extract.get('prompt_eval_count'))
+
+            blog_md = cut_until_title(blog_md)
+
+            # Best effort: the model's answer may be missing, malformed or
+            # incomplete; such a post is reported and skipped.
+            try:
+                blog_info.update(json.loads(ai_extract.get('response')))
+                # a missing date is treated like an explicit null
+                if blog_info.get('date') in ('0000-00-00', None):
+                    blog_info['date'] = '2001-01-30'
+                # in case model acts dumb
+                blog_info['date'] = blog_info['date'].replace('/', '-')
+                make_papermod_post(post_dir, blog_md, blog_info)
+            except Exception as e:
+                print(f"Skipping {html_path}: {e!r}")
+
+if __name__ == '__main__':
+    # exactly one argument is expected: the path that gets walked
+    if len(sys.argv) != 2:
+        print('Usage: convert_old_blog.py /path/to/blog-contributions/opsec')
+        # sys.exit() instead of exit(): the latter is injected by the site
+        # module and is not available in every interpreter setup
+        sys.exit(1)
+    old_opsec_path = sys.argv[1]
+    walk_all_blogs(old_opsec_path)
--- a/scripts/exp/blogtranslate.py
+++ b/scripts/exp/blogtranslate.py
@ -0,0 +1,105 @@
+#!/usr/bin/python3
+
+# experimental script converting all blog posts into given language using ollama
+
+import requests
+import shutil
+import os
+
+# Target language; interpolated into OLLAMA_PROMPT and into the output directory name.
+LANG = 'spanish'
+# Base URL of the Ollama server that receives the generate requests.
+OLLAMA_HOST = 'http://ollama:11434/'
+# Model name sent with every request.
+OLLAMA_MODEL = 'qwen3:1.7b'
+OLLAMA_PROMPT = f'''Translate the blog post into {LANG}, keep the technical language as well as Linux commands in english. Keep the overall structure - source code tags, metadata key names (like "author", "title", "date", ...) but translate the values of metadata fields. Only output the translated blog.
+
+For example this fragment:
+```
+---
+author: XMRonly
+title: "How to Get an Email Account Anonymously (Emails as a Service)"
+date: 2024-10-16
+description: "This blog post explains how to sign up for an anonymous email account using Proton Mail and Tor, focusing on privacy and avoiding personal information input. It highlights the limitations of traditional email and the need for privacy-focused alternatives.."
+summary: "This blog post explains how to sign up for an anonymous email account using Proton Mail and Tor, focusing on privacy and avoiding personal information input. It highlights the limitations of traditional email and the need for privacy-focused alternatives.."
+tags: ['email', 'anonymous', 'privacy', 'protonmail', 'tor', 'opsec']
+ShowToc: true
+TocOpen: true
+editPost:
+    URL: "http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/blog-contributions/issues/26"
+    Text: "Suggest Changes"
+    appendFilePath: false
+---
+
+
+![](../../assets/img/user.png) XMRonly - 2024 / 10 / 16
+
+# How to Get an Email Account Anonymously (Emails as a Service)
+
+![](0.png)
+
+
+
+## **Introduction**
+
+Email is one of the most widely used forms of online communication, both for personal and professional interactions. With billions sent daily, you would expect email to be secure, accessible, and readable by only the intended recipient. Unfortunately, email is an old technology and this is not always the case. With metadata being visible, large email providers scanning emails, as well as potential government surveillance in some parts of the world, it is no surprise that email is hardly considered private. As such, you may want to send an email that is not tied to your real identity. In this article, we will explore how to sign up for email account anonymously. Specifically, we will explore a privacy-focused email provider, **Proton Mail** , and how to sign up using Tor without inputting any additional information whatsoever.```
+
+Should become:
+```
+---
+author: XMRonly
+title: "Cómo Obtener una Cuenta de Correo Electrónico Anónimamente (Servicios de Correo Electrónico)"
+date: 2024-10-16
+description: "Este artículo explica cómo registrarse para una cuenta de correo electrónico anónima utilizando Proton Mail y Tor, centrándose en la privacidad y evitando introducir información personal. Destaca las limitaciones del correo tradicional y la necesidad de alternativas centradas en la privacidad."
+summary: "Este artículo explica cómo registrarse para una cuenta de correo electrónico anónima utilizando Proton Mail y Tor, centrándose en la privacidad y evitando introducir información personal. Destaca las limitaciones del correo tradicional y la necesidad de alternativas centradas en la privacidad."
+tags: ['correo', 'anónimo', 'privacidad', 'protonmail', 'tor', 'opsec']
+ShowToc: true
+TocOpen: true
+editPost:
+    URL: "http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/blog-contributions/issues/26"
+    Text: "Sugerir Cambios"
+    appendFilePath: false
+---
+
+![](../../assets/img/user.png) XMRonly - 2024 / 10 / 16
+
+# Cómo Obtener una Cuenta de Correo Electrónico Anónimamente (Servicios de Correo Electrónico)
+
+![](0.png)
+
+## **Introducción**
+
+El correo electrónico es uno de los métodos más utilizados para la comunicación en línea, tanto en interacciones personales como profesionales. Con miles de millones enviados diariamente, se esperaría que el correo sea seguro, accesible y legible únicamente por el destinatario previsto. Desafortunadamente, el correo electrónico es una tecnología antigua y esto no siempre es el caso. Dado que la metadatos son visibles, los grandes proveedores de correo escanean correos electrónicos y existe la posibilidad de vigilancia gubernamental en algunas partes del mundo, no es sorprendente que el correo electrónico rara vez se considere privado. Como resultado, podrías querer enviar un correo electrónico que no esté vinculado a tu identidad real. En este artículo, exploraremos cómo registrarse para una cuenta de correo electrónica anónimamente. Específicamente, examinaremos un proveedor centrado en la privacidad, **Proton Mail**, y cómo registrarse usando Tor sin introducir ninguna información adicional.```'''
+
+# Translated copy of ./content, e.g. content.spanish
+new_content_dir = f'content.{LANG}'
+
+# Mirror ./content into new_content_dir: every index.md is translated by the
+# model, every other file is copied unchanged.
+for root, _, files in os.walk('./content'):
+    for fi in files:
+
+        infpath = os.path.join(root, fi)
+        outfpath = os.path.join(root.replace('./content', new_content_dir), fi)
+        os.makedirs(os.path.dirname(outfpath), exist_ok=True)
+
+        if fi != 'index.md':
+            shutil.copy(infpath, outfpath)
+            continue
+
+        with open(infpath, encoding='utf-8') as f:
+            fc = f.read()
+
+        # context size grows in steps of 8192, one step per 16384 characters of input
+        num_ctx = (int(len(fc)/(8192*2))+1)*8192
+
+        # OLLAMA_HOST may end with '/'; strip it so the request path is
+        # '/api/generate' and not '//api/generate'.
+        r = requests.post(OLLAMA_HOST.rstrip('/')+'/api/generate', json={
+          "model": OLLAMA_MODEL,
+          "system": OLLAMA_PROMPT,
+          "prompt": fc,
+          "options": {"num_ctx": num_ctx, "seed": 14},
+          "stream": False
+        })
+
+        resp = r.json()
+        print(root, num_ctx, resp.get('prompt_eval_count', 0))
+
+        data = resp.get('response')
+        if not data:
+            # nothing to write; do not leave an empty index.md behind
+            print(f'WARNING: no translation returned for {infpath}, skipping')
+            continue
+
+        # Keep only what follows the model's reasoning block. If the reply has
+        # no '</think>' tag, the whole reply is the translation.
+        _, think_tag, after_think = data.partition('</think>')
+        translated = after_think.lstrip() if think_tag else data
+
+        with open(outfpath, 'w', encoding='utf-8') as fo:
+            fo.write(translated)
--- a/scripts/imagecompressv2.py
+++ b/scripts/imagecompressv2.py
@ -0,0 +1,63 @@
+#!/usr/bin/python3
+
+# requires imagemagick, avifenc, svt-av1
+
+import os
+import subprocess
+import concurrent.futures
+
+# Size threshold in bytes: process_files_in_directory() skips files smaller
+# than this, so they are never handed to the AVIF encoder.
+MIN_FILE_SIZE = 30000
+
+def detect_compressible(fpath):
+    """Return True if *fpath* is a PNG, a JPEG, or a WebP reported as lossless.
+
+    The type is determined with the ``file`` command.
+
+    Raises:
+        subprocess.CalledProcessError: if ``file`` exits with an error.
+    """
+    mimetype = subprocess.check_output(["file", "--mime-type", "-b", fpath]).strip().decode('utf-8')
+
+    if mimetype in ('image/png', 'image/jpeg'):
+        return True
+
+    if mimetype == 'image/webp':
+        # -b leaves the file name out of the description, so a path that
+        # merely contains "lossless" cannot produce a false positive
+        description = subprocess.check_output(["file", "-b", fpath]).decode('utf-8')
+        return 'lossless' in description
+
+    return False
+
+def compress(fpath):
+    """Re-encode the image at *fpath* as AVIF, replacing its content in place.
+
+    The image is first converted to PNG with ``magick`` and then encoded with
+    ``avifenc``. Both intermediate files live in a private temporary directory
+    that is removed afterwards. The original file is only overwritten once
+    encoding has succeeded; its name is kept.
+
+    Raises:
+        subprocess.CalledProcessError: if ``magick`` or ``avifenc`` fails.
+    """
+    import shutil
+    import tempfile
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        rtpath = os.path.join(tmpdir, 'input.png')
+        outpath = os.path.join(tmpdir, 'output.avif')
+
+        # check=True turns a failing tool into an exception, so the caller
+        # does not report a failed conversion as a success
+        subprocess.run(["magick", fpath, rtpath], check=True)
+        subprocess.run(["avifenc", rtpath, "--yuv", "420", "--range", "l", "-q", "50", "-c", "svt", "-j", "1", "--speed", "0", "--ignore-exif", "-o", outpath], check=True)
+
+        # copy the result over the original only after a successful encode
+        shutil.copyfile(outpath, fpath)
+
+def process_files_in_directory(directory):
+    """Collect the paths below *directory* that should be compressed.
+
+    A file qualifies when it is at least MIN_FILE_SIZE bytes large and
+    detect_compressible() accepts it. Every visited path is printed.
+    """
+    candidates = []
+
+    for dirpath, _subdirs, filenames in os.walk(directory):
+        for name in filenames:
+            path = os.path.join(dirpath, name)
+            print(f"Checking file: {path}")
+            big_enough = os.path.getsize(path) >= MIN_FILE_SIZE
+            if big_enough and detect_compressible(path):
+                candidates.append(path)
+
+    return candidates
+
+def main(directory, max_workers):
+    """Compress every eligible image below *directory* using a thread pool.
+
+    One compress() job is submitted per file; the outcome of each job is
+    printed as soon as it finishes.
+    """
+    targets = process_files_in_directory(directory)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
+        # map each future back to the path it is working on
+        pending = {}
+        for path in targets:
+            pending[pool.submit(compress, path)] = path
+
+        for finished in concurrent.futures.as_completed(pending):
+            path = pending[finished]
+            try:
+                finished.result()
+            except Exception as exc:
+                print(f"Compression failed for {path}: {exc}")
+            else:
+                print(f"Successfully compressed: {path}")
+
+if __name__ == "__main__":
+    # '../docs/' is relative to the current working directory; up to 32
+    # compress() jobs run at the same time.
+    main('../docs/', max_workers=32)