add avif compression script

This commit is contained in:
oxeo0 2025-05-09 01:14:22 +02:00
parent e981184453
commit b22c1fdbc7
8 changed files with 63 additions and 0 deletions

4
scripts/blogfix.sh Normal file
View file

@ -0,0 +1,4 @@
#!/bin/sh
# The anchor '#51%_attack' contains a raw '%' char; rewrite it url-encoded
# as '%25'. Background: https://github.com/gohugoio/hugo/issues/4586
post=./content/posts/monerop2pool/index.md
sed -i -e 's/#51%_attack/#51%25_attack/g' "$post"

197
scripts/convert_old_blog.py Normal file
View file

@ -0,0 +1,197 @@
import json
import os
import re
import shutil
import sys
from typing import Optional

import html2text
import requests
# --- Ollama request settings ---
OLLAMA_HOST = "http://ollama:11434/"
OLLAMA_MODEL = "gemma3:12b"
# A fixed seed keeps generation somewhat deterministic between runs.
OLLAMA_SEED = 17
# System prompt: asks the model for the post's author and date, JSON only.
OLLAMA_PROMPT = """Analyze the blog post and give answer to following questions:
* Who wrote it? (usually occurs at the beginning)
* When was it written? (2001-12-30 format)
The response format should be JSON. You can not output anything else other than JSON. If the response can't be found, add `null` value. Here's an example how output should look like:
```
{"author": "nihilist", "date": "2024-03-01"}
```
Important: do not output anything else!
"""

# Address that extract_xmr_address() discards when a post lists two addresses.
NIHILIST_XMR = "8AUYjhQeG3D5aodJDtqG499N5jXXM71gYKD8LgSsFB9BUV1o7muLv3DXHoydRTK4SZaaUBq4EAUqpZHLrX2VZLH71Jrd9k8"
def is_webp(fpath: str) -> bool:
    """Check whether bytes 8-15 of the file equal ``b'WEBPVP8L'``.

    The first eight bytes are not inspected.

    NOTE(review): only this exact marker is accepted, so files carrying a
    different marker after ``WEBP`` are reported as False despite the
    function's name -- confirm that this is intended.
    """
    expected = b'WEBPVP8L'
    with open(fpath, 'rb') as stream:
        head = stream.read(16)
    return head[8:16] == expected
def html_to_papermod(html_data: str) -> str:
    """Convert an HTML document into Markdown text with html2text."""
    converter = html2text.HTML2Text()
    # A body width of 0 disables html2text's line wrapping.
    converter.body_width = 0
    markdown = converter.handle(html_data)
    return markdown
def extract_blog_info(html_string: str) -> tuple[Optional[str], Optional[str]]:
    """Pull the author name and the post title out of a post's raw HTML.

    The author is the text of the first ``<ba>`` element that follows a
    ``<p><img`` opening, and the title is the text of the first ``<h1>``
    after it.

    Returns:
        A ``(user, title)`` tuple with surrounding whitespace stripped, or
        ``(None, None)`` when the expected markup is not found.
    """
    match = re.search(
        r'<p><img.*?<ba>(.*?)</ba>.*?<h1>(.*?)</h1>',
        html_string,
        re.DOTALL,  # the byline and the title sit on different lines
    )
    if match is None:
        return None, None
    return match.group(1).strip(), match.group(2).strip()
def extract_git_issue_number(html_string: str) -> int:
    """Return the issue number from the post's "git issue" link, or 0.

    The number is the run of digits after ``issues/`` in the link whose
    text is exactly ``git issue``.
    """
    found = re.search(r'issues/(\d+)">git issue<', html_string)
    return int(found.group(1)) if found else 0
def extract_xmr_address(md_string: str) -> Optional[str]:
    """Find the author's probable XMR address in a post's Markdown.

    Candidates are 95-character base58 strings that appear on a line after
    a ``:_`` sequence. When exactly two candidates are found, the one equal
    to ``NIHILIST_XMR`` is dropped so that the other one is returned.

    Returns:
        The first remaining candidate, or ``None`` when there is none.
    """
    xmrs = re.findall(r":_.*([1-9A-HJ-NP-Za-km-z]{95})", md_string)
    print(xmrs)
    # found 2 xmr addresses, extract the authors probable one
    if len(xmrs) == 2:
        try:
            xmrs.remove(NIHILIST_XMR)
        except ValueError:
            # neither candidate is nihilist's address: keep both as they are
            pass
    return xmrs[0] if xmrs else None
def extract_blog_info_ai(md_string: str) -> dict:
    """Ask the Ollama model for the author and date of a Markdown post.

    Only the first 1024 characters of ``md_string`` are sent as the prompt;
    ``OLLAMA_PROMPT`` is sent as the system prompt and JSON output is
    requested.

    Returns:
        The decoded JSON body of the ``/api/generate`` response. Callers in
        this file read its ``response`` and ``prompt_eval_count`` keys.
    """
    print(f"Extracting info from {len(md_string)} bytes.")
    num_ctx = 2048
    # OLLAMA_HOST may end with '/', so normalise it to avoid '//api/generate'.
    url = OLLAMA_HOST.rstrip('/') + '/api/generate'
    r = requests.post(url, json={
        "model": OLLAMA_MODEL,
        "system": OLLAMA_PROMPT,
        "prompt": md_string[:1024],  # the first 1024 characters are enough
        "format": "json",
        "options": {"num_ctx": num_ctx, "seed": OLLAMA_SEED},
        "stream": False
    })
    resp = r.json()
    # An evaluated-prompt count equal to the context size is treated as
    # saturation: part of the input may not have been seen by the model.
    if resp.get('prompt_eval_count', 0) == num_ctx:
        print('WARNING: LLM context saturated, may give wrong answer')
    return resp
def do_markdown_fixes(md_string: str) -> str:
    """Strip the old site's header, footer and git-issue blurb from a post.

    Steps, in order:
      1. cut the text at the last ``"#### "`` up to three times (the footer
         has 3 paragraphs: Nihilism, My Links, About -- some posts
         customize that);
      2. delete the git issue reference and the blog header link;
      3. drop the stray backtick lines html2text sometimes leaves after
         code blocks;
      4. rewrite ``/index.html`` links to ``/index.md`` to make mkdocs happy
         (may break links to external sites that use index.html).
    """
    footer_marker = "#### "
    cuts_left = 3
    while cuts_left:
        cut_at = md_string.rfind(footer_marker)
        if cut_at < 0:
            break
        md_string = md_string[:cut_at]
        cuts_left -= 1

    removal_patterns = (
        # git issue reference
        r'!\[\]\(\.\.\/logos\/daturagit\.png\).*?\s*directly!',
        # header link back to the blog index
        r'\[The Nihilism Opsec Blog\].*?\s*\(\.\.\/index\.html\)',
    )
    for pattern in removal_patterns:
        md_string = re.sub(pattern, "", md_string, flags=re.DOTALL)

    literal_swaps = (
        ('\n`\n', ''),
        ('/index.html', '/index.md'),
    )
    for old, new in literal_swaps:
        md_string = md_string.replace(old, new)
    return md_string
def cut_until_title(md_string: str) -> str:
    """Drop everything that precedes the first ``'# '`` in the Markdown.

    When the text contains no ``'# '`` at all it is returned unchanged;
    slicing with the -1 that ``str.find`` reports would otherwise keep only
    the last character.
    """
    title_at = md_string.find('# ')
    if title_at < 0:
        return md_string
    return md_string[title_at:]
def make_papermod_post(post_dir: str, md_string: str, blog_info: dict) -> None:
    """Write ``index.md`` in ``post_dir``: front matter followed by the post.

    Args:
        post_dir: existing directory that receives ``index.md``.
        md_string: Markdown body written after the front matter.
        blog_info: must contain ``author``, ``date`` and ``xmr``; ``git`` is
            optional and defaults to issue number 0.

    Raises:
        KeyError: if a required key is missing from ``blog_info``.
    """
    author = blog_info['author']
    date = blog_info['date']
    issue = blog_info.get('git', 0)
    xmr = blog_info['xmr']
    header = f"""---
author: {author}
date: {date}
gitea_url: "http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/blog-contributions/issues/{issue}"
xmr: {xmr}
---
"""
    # 'with' closes the file even when one of the writes fails
    with open(os.path.join(post_dir, 'index.md'), 'w', encoding='utf-8') as f:
        f.write(header)
        f.write(md_string)
def walk_all_blogs(old_opsec_path: str) -> None:
    """Convert every HTML post under ``old_opsec_path`` into ``./docs/opsec``.

    The directory layout is mirrored below ``./docs/opsec``. Image and video
    assets are copied as they are; each ``.html`` file is converted to
    Markdown, cleaned up, enriched with metadata (git issue number, XMR
    address, and author/date from the LLM) and written as ``index.md``.
    The ``index.html`` in the top-level directory is skipped.

    A post whose metadata cannot be processed is reported on stdout and
    skipped; the walk continues with the next file.
    """
    for root, _, files in os.walk(old_opsec_path):
        # Locate this directory relative to the tree being converted. This
        # works with or without a trailing slash on the argument and does
        # not depend on the path containing the literal text 'opsec/'.
        rel_name = os.path.relpath(root, old_opsec_path)
        is_top_level = rel_name == os.curdir
        if is_top_level:
            rel_name = ''
        post_dir = os.path.join('./docs/opsec', rel_name)
        os.makedirs(post_dir, exist_ok=True)

        for fi in files:
            # skip the index site
            if is_top_level and fi == 'index.html':
                continue

            if fi.endswith(('.png', '.jpg', '.jpeg', '.mp4')):
                shutil.copy(os.path.join(root, fi), os.path.join(post_dir, fi))
                continue

            # only html files are converted from here on
            if not fi.endswith('.html'):
                continue

            with open(os.path.join(root, fi), 'r', encoding='utf-8') as f:
                fc = f.read()

            print(os.path.basename(root), extract_blog_info(fc))

            blog_info = {}
            blog_md_raw = html_to_papermod(fc)
            blog_info['git'] = extract_git_issue_number(fc)
            blog_info['xmr'] = extract_xmr_address(blog_md_raw)

            # remove unnecessary stuff we'll include anyway in hugo
            blog_md = do_markdown_fixes(blog_md_raw)

            # the LLM gets the text before the title cut is applied
            ai_extract = extract_blog_info_ai(blog_md)
            print(ai_extract.get('response'), ai_extract.get('prompt_eval_count'))

            blog_md = cut_until_title(blog_md)

            try:
                blog_info.update(json.loads(ai_extract.get('response')))
                if blog_info['date'] in ('0000-00-00', None):
                    blog_info['date'] = '2001-01-30'
                # in case model acts dumb
                blog_info['date'] = blog_info['date'].replace('/', '-')
                make_papermod_post(post_dir, blog_md, blog_info)
            except Exception as e:
                # best effort: report the problem and go on with the next post
                print(e)
if __name__ == '__main__':
    # Exactly one argument is expected: the root of the old blog tree.
    if len(sys.argv) != 2:
        print('Usage: convert_old_blog.py /path/to/blog-contributions/opsec')
        # sys.exit rather than the exit() helper that the site module
        # provides for interactive sessions
        sys.exit(1)

    old_opsec_path = sys.argv[1]
    walk_all_blogs(old_opsec_path)

View file

@ -0,0 +1,105 @@
#!/usr/bin/python3
# experimental script converting all blog posts into given language using ollama
import requests
import shutil
import os
# Target language: interpolated into the prompt below and into the name of
# the output directory (content.<LANG>).
LANG = 'spanish'
# Base address of the Ollama server; '/api/generate' is appended to it.
OLLAMA_HOST = 'http://ollama:11434/'
# Value sent in the "model" field of the generate request.
OLLAMA_MODEL = 'qwen3:1.7b'
# System prompt: translation instructions followed by one worked example
# (an English post fragment and its translated counterpart).
# NOTE(review): the worked example is in Spanish whatever LANG is set to --
# confirm that this is acceptable before switching to another language.
OLLAMA_PROMPT = f'''Translate the blog post into {LANG}, keep the technical language as well as Linux commands in english. Keep the overall structure - source code tags, metadata key names (like "author", "title", "date", ...) but translate the values of metadata fields. Only output the translated blog.
For example this fragment:
```
---
author: XMRonly
title: "How to Get an Email Account Anonymously (Emails as a Service)"
date: 2024-10-16
description: "This blog post explains how to sign up for an anonymous email account using Proton Mail and Tor, focusing on privacy and avoiding personal information input. It highlights the limitations of traditional email and the need for privacy-focused alternatives.."
summary: "This blog post explains how to sign up for an anonymous email account using Proton Mail and Tor, focusing on privacy and avoiding personal information input. It highlights the limitations of traditional email and the need for privacy-focused alternatives.."
tags: ['email', 'anonymous', 'privacy', 'protonmail', 'tor', 'opsec']
ShowToc: true
TocOpen: true
editPost:
URL: "http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/blog-contributions/issues/26"
Text: "Suggest Changes"
appendFilePath: false
---
![](../../assets/img/user.png) XMRonly - 2024 / 10 / 16
# How to Get an Email Account Anonymously (Emails as a Service)
![](0.png)
## **Introduction**
Email is one of the most widely used forms of online communication, both for personal and professional interactions. With billions sent daily, you would expect email to be secure, accessible, and readable by only the intended recipient. Unfortunately, email is an old technology and this is not always the case. With metadata being visible, large email providers scanning emails, as well as potential government surveillance in some parts of the world, it is no surprise that email is hardly considered private. As such, you may want to send an email that is not tied to your real identity. In this article, we will explore how to sign up for email account anonymously. Specifically, we will explore a privacy-focused email provider, **Proton Mail** , and how to sign up using Tor without inputting any additional information whatsoever.```
Should become:
```
---
author: XMRonly
title: "Cómo Obtener una Cuenta de Correo Electrónico Anónimamente (Servicios de Correo Electrónico)"
date: 2024-10-16
description: "Este artículo explica cómo registrarse para una cuenta de correo electrónico anónima utilizando Proton Mail y Tor, centrándose en la privacidad y evitando introducir información personal. Destaca las limitaciones del correo tradicional y la necesidad de alternativas centradas en la privacidad."
summary: "Este artículo explica cómo registrarse para una cuenta de correo electrónico anónima utilizando Proton Mail y Tor, centrándose en la privacidad y evitando introducir información personal. Destaca las limitaciones del correo tradicional y la necesidad de alternativas centradas en la privacidad."
tags: ['correo', 'anónimo', 'privacidad', 'protonmail', 'tor', 'opsec']
ShowToc: true
TocOpen: true
editPost:
URL: "http://git.nowherejezfoltodf4jiyl6r56jnzintap5vyjlia7fkirfsnfizflqd.onion/nihilist/blog-contributions/issues/26"
Text: "Sugerir Cambios"
appendFilePath: false
---
![](../../assets/img/user.png) XMRonly - 2024 / 10 / 16
# Cómo Obtener una Cuenta de Correo Electrónico Anónimamente (Servicios de Correo Electrónico)
![](0.png)
## **Introducción**
El correo electrónico es uno de los métodos más utilizados para la comunicación en línea, tanto en interacciones personales como profesionales. Con miles de millones enviados diariamente, se esperaría que el correo sea seguro, accesible y legible únicamente por el destinatario previsto. Desafortunadamente, el correo electrónico es una tecnología antigua y esto no siempre es el caso. Dado que la metadatos son visibles, los grandes proveedores de correo escanean correos electrónicos y existe la posibilidad de vigilancia gubernamental en algunas partes del mundo, no es sorprendente que el correo electrónico rara vez se considere privado. Como resultado, podrías querer enviar un correo electrónico que no esté vinculado a tu identidad real. En este artículo, exploraremos cómo registrarse para una cuenta de correo electrónica anónimamente. Específicamente, examinaremos un proveedor centrado en la privacidad, **Proton Mail**, y cómo registrarse usando Tor sin introducir ninguna información adicional.```'''
# Translated copies go to a sibling tree named after the target language.
new_content_dir = f'content.{LANG}'
# OLLAMA_HOST may end with '/', so normalise it to avoid '//api/generate'.
generate_url = OLLAMA_HOST.rstrip('/') + '/api/generate'
# Only the text that follows this marker in the model output is kept.
think_end = '</think>'

for root, _, files in os.walk('./content'):
    for fi in files:
        infpath = os.path.join(root, fi)
        # swap only the leading './content' for the output tree
        outfpath = os.path.join(root.replace('./content', new_content_dir, 1), fi)
        os.makedirs(os.path.dirname(outfpath), exist_ok=True)

        # everything that is not a post body (images, ...) is copied verbatim
        if fi != 'index.md':
            shutil.copy(infpath, outfpath)
            continue

        with open(infpath, encoding='utf-8') as f:
            fc = f.read()

        # num_ctx grows in steps of 8192: one step per 16384 input characters
        num_ctx = (int(len(fc)/(8192*2))+1)*8192
        r = requests.post(generate_url, json={
            "model": OLLAMA_MODEL,
            "system": OLLAMA_PROMPT,
            "prompt": fc,
            "options": {"num_ctx": num_ctx, "seed": 14},
            "stream": False
        })
        resp = r.json()
        print(root, num_ctx, resp.get('prompt_eval_count', 0))

        data = resp.get('response')
        if data is None:
            # e.g. an error payload: report it and leave no empty output file
            print(f'WARNING: no response for {infpath}, skipping: {resp}')
            continue

        # Keep what follows the marker; when the marker is absent the whole
        # response is the translation.
        _, marker, translated = data.partition(think_end)
        if not marker:
            translated = data

        with open(outfpath, 'w', encoding='utf-8') as fo:
            fo.write(translated.lstrip())

View file

@ -0,0 +1,63 @@
#!/usr/bin/python3
# requires imagemagick, avifenc, svt-av1
import os
import subprocess
import concurrent.futures
# Files smaller than this (as reported by os.path.getsize) are never
# considered for avif compression.
MIN_FILE_SIZE = 30_000
def detect_compressible(fpath):
    """Tell whether ``fpath`` is an image worth re-encoding.

    Uses the external ``file`` utility. PNG and JPEG files always qualify;
    a WebP file qualifies only when ``file`` describes it as lossless.

    Returns:
        True when the file should be compressed, False otherwise.

    Raises:
        subprocess.CalledProcessError: if ``file`` exits with an error.
    """
    mimetype = subprocess.check_output(
        ["file", "--mime-type", "-b", fpath]
    ).decode('utf-8').strip()

    if mimetype in ('image/png', 'image/jpeg'):
        return True

    if mimetype == 'image/webp':
        # -b leaves the file name out of the output, so a path that merely
        # contains the word "lossless" cannot produce a false positive.
        description = subprocess.check_output(["file", "-b", fpath]).decode('utf-8')
        return 'lossless' in description

    return False
def compress(fpath):
    """Re-encode the image at ``fpath`` as AVIF, overwriting it in place.

    The image is first converted to a temporary PNG with ``magick`` and then
    encoded with ``avifenc``, whose output is written back to ``fpath``.

    Raises:
        subprocess.CalledProcessError: if ``magick`` or ``avifenc`` exits
            with a non-zero status. When ``magick`` fails, ``avifenc`` is
            not run and the original file is left untouched.
    """
    import tempfile

    # A private temporary directory gives a collision-free path and is
    # removed again even when one of the tools fails.
    with tempfile.TemporaryDirectory() as workdir:
        rtpath = os.path.join(workdir, 'source.png')
        # check=True turns a tool failure into an exception, so the caller
        # can report it instead of counting the file as compressed.
        subprocess.run(["magick", fpath, rtpath], check=True)
        subprocess.run(["avifenc", rtpath, "--yuv", "420", "--range", "l", "-q", "50", "-c", "svt", "-j", "1", "--speed", "0", "--ignore-exif", "-o", fpath], check=True)
def process_files_in_directory(directory):
    """Collect the paths of all compressible files below ``directory``.

    Every file found is announced on stdout. A file is selected when its
    size is at least ``MIN_FILE_SIZE`` and ``detect_compressible`` accepts
    it; nothing is modified here.

    Returns:
        The selected paths as a list, in walk order.
    """
    selected = []
    for dirpath, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            candidate = os.path.join(dirpath, filename)
            print(f"Checking file: {candidate}")
            big_enough = os.path.getsize(candidate) >= MIN_FILE_SIZE
            # the type check only runs for files that pass the size check
            if big_enough and detect_compressible(candidate):
                selected.append(candidate)
    return selected
def main(directory, max_workers):
    """Compress every compressible file under ``directory`` concurrently.

    The files returned by ``process_files_in_directory`` are handed to
    ``compress`` on a thread pool of ``max_workers`` threads, and one line
    is printed per file as its job finishes, reporting success or failure.
    """
    targets = process_files_in_directory(directory)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # remember which path each future belongs to, for the report below
        jobs = {}
        for target in targets:
            jobs[pool.submit(compress, target)] = target

        for finished in concurrent.futures.as_completed(jobs):
            target = jobs[finished]
            try:
                finished.result()
            except Exception as exc:
                print(f"Compression failed for {target}: {exc}")
            else:
                print(f"Successfully compressed: {target}")
if __name__ == "__main__":
    # Script entry point: process ../docs/ with up to 32 worker threads.
    main(directory='../docs/', max_workers=32)