From 079bd25899be9edaffa484a6f3f1e02695b0fe06 Mon Sep 17 00:00:00 2001 From: Tomas Kracmar Date: Sun, 14 Sep 2025 16:56:08 +0200 Subject: [PATCH] First commit --- .gitignore | 21 ++ Makefile | 57 ++++ README.md | 56 ++++ build.py | 246 ++++++++++++++++++ config.yaml | 22 ++ db.py | 91 +++++++ emb.py | 10 + .../2025-09-19/sample-welsh-rainforests.md | 11 + inbox.txt | 4 + requirements.txt | 12 + scripts/build_from_db.py | 154 +++++++++++ scripts/db_cli.py | 18 ++ scripts/ingest_list.py | 130 +++++++++ scripts/schema.sql | 71 +++++ scripts/sync_entries_to_db.py | 38 +++ templates/item.html.j2 | 2 + templates/newsletter.html.j2 | 7 + templates/prompt.txt | 24 ++ templates/style_bank.md | 2 + 19 files changed, 976 insertions(+) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 README.md create mode 100644 build.py create mode 100644 config.yaml create mode 100644 db.py create mode 100644 emb.py create mode 100644 entries/2025-09-19/sample-welsh-rainforests.md create mode 100644 inbox.txt create mode 100644 requirements.txt create mode 100644 scripts/build_from_db.py create mode 100644 scripts/db_cli.py create mode 100644 scripts/ingest_list.py create mode 100644 scripts/schema.sql create mode 100644 scripts/sync_entries_to_db.py create mode 100644 templates/item.html.j2 create mode 100644 templates/newsletter.html.j2 create mode 100644 templates/prompt.txt create mode 100644 templates/style_bank.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b409e14 --- /dev/null +++ b/.gitignore @@ -0,0 +1,21 @@ +# --- Python / environment --- +.venv/ +__pycache__/ +*.py[cod] +*.egg-info/ +.DS_Store + +# --- Secrets --- +.env + +# --- Local database (optional) --- +# Comment out if you DO want to commit newsletter history! 
+data/*.db + +# --- Build outputs --- +dist/ +*.html + +# --- Logs / temp --- +*.log +.cache/ \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..dbfea05 --- /dev/null +++ b/Makefile @@ -0,0 +1,57 @@ +DATE ?= $(shell date +%F) +ENTRIES_DIR ?= entries/$(DATE) +OUT ?= dist/$(DATE).html +LIMIT ?= 10 + +PROVIDER ?= +API_BASE ?= +MODEL ?= +TEMP ?= + +LLM_FLAGS := +ifneq ($(strip $(PROVIDER)),) + LLM_FLAGS += --llm-provider $(PROVIDER) +endif +ifneq ($(strip $(API_BASE)),) + LLM_FLAGS += --llm-api-base $(API_BASE) +endif +ifneq ($(strip $(MODEL)),) + LLM_FLAGS += --llm-model $(MODEL) +endif +ifneq ($(strip $(TEMP)),) + LLM_FLAGS += --temperature $(TEMP) +endif + +.PHONY: init ingest stubs build draft build-db draft-db sync-db clean + +init: + python scripts/db_cli.py init + +ingest: + python scripts/ingest_list.py --list inbox.txt + +stubs: + python scripts/ingest_list.py --list inbox.txt --fetch --stubs --date $(DATE) --sleep 0.5 + +build: + python build.py $(ENTRIES_DIR) --out $(OUT) $(LLM_FLAGS) + +draft: + python build.py $(ENTRIES_DIR) --out $(OUT) --publish $(LLM_FLAGS) + +build-db: + python scripts/build_from_db.py --limit $(LIMIT) --out $(OUT) $(LLM_FLAGS) + +draft-db: + python scripts/build_from_db.py --limit $(LIMIT) --out $(OUT) --publish $(LLM_FLAGS) + +sync-db: + python scripts/sync_entries_to_db.py --dir $(ENTRIES_DIR) + +clean: + rm -rf dist/* + + +# Dry run: generate summaries and print HTML to stdout (no file write, no publish) +dry-run: + python build.py $(ENTRIES_DIR) --dry-run $(LLM_FLAGS) diff --git a/README.md b/README.md new file mode 100644 index 0000000..d532b8f --- /dev/null +++ b/README.md @@ -0,0 +1,56 @@ +# Offline Newsletter Builder (Archaeology/History) + +Local-first toolchain to turn your article links (or pasted texts) into a Friday Ghost draft with consistent summaries, plus a SQLite database for memory and related-article suggestions. + +## Highlights +- Paste URLs into `inbox.txt` → run importer → get `.md` stubs **and** a growing SQLite DB. +- One command to build the weekly newsletter (HTML) and optionally create a **Ghost draft**. +- Consistent tone via `templates/prompt.txt` (+ your `templates/style_bank.md` samples). +- "Memory" via `data/newsletter.db` (sources, summaries, embeddings, FTS), used to auto-suggest **related** items. + +## Quick start +```bash +python -m venv .venv && source .venv/bin/activate +pip install -r requirements.txt + +# Configure secrets +cp .env.example .env +# Edit .env to set your keys (or point to local OpenWebUI/Ollama) + +# Create DB schema +python scripts/db_cli.py init + +# Ingest URLs (from inbox.txt) +python scripts/ingest_list.py --list inbox.txt --fetch --stubs --date 2025-09-19 --sleep 0.5 + +# Build newsletter from stubs (HTML + optional Ghost draft) +python build.py entries/2025-09-19 --out dist/2025-09-19.html --publish + +# (Optional) Build directly from DB (Top-N sources) +python scripts/build_from_db.py --limit 10 --out dist/2025-09-19.html --publish +``` + +## Sample Run (Pretend Output) + +```bash +$ make init +Initialized schema in data/newsletter.db + +$ make stubs DATE=2025-09-19 +Stub: entries/2025-09-19/bbc-welsh-rainforests.md +Stub: entries/2025-09-19/nature-bone-discovery.md +Ingested: 2 URLs into data/newsletter.db + +$ make build DATE=2025-09-19 +[build] Using LLM provider=openwebui model=qwen2.5-7b-instruct +[build] Generating summary for: Velšské deštné pralesy ... 
+[build] -> Summary written back to DB (id=5, embedding stored) +[build] Generating summary for: Kosterní nález u Nisy ... +[build] -> Summary written back to DB (id=6, embedding stored) +Saved: dist/2025-09-19.html + +$ make draft DATE=2025-09-19 +Draft: https://your-ghost-site.ghost.io/p/objevy-tydne-2025-09-19-draft +``` + +You can open the generated HTML under `dist/2025-09-19.html` in your browser to review before publishing. diff --git a/build.py b/build.py new file mode 100644 index 0000000..6c94a35 --- /dev/null +++ b/build.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python3 +import os, sys, glob, time, json, html, argparse, pathlib, textwrap, re +from datetime import date +import yaml, requests, jwt +from jinja2 import Template +from dotenv import load_dotenv + +load_dotenv() + +ROOT = pathlib.Path(__file__).parent +TEMPLATES = ROOT / "templates" + +def read_file(p): return pathlib.Path(p).read_text(encoding="utf-8") + +def load_config(): + cfg = yaml.safe_load(read_file(ROOT / "config.yaml")) + cfg["date"] = date.today().isoformat() + return cfg + +def parse_front_matter(text): + m = re.match(r"^---\n(.*?)\n---\n(.*)$", text, flags=re.S|re.M) + if not m: + return {}, text.strip() + import yaml as _yaml + fm = _yaml.safe_load(m.group(1)) or {} + body = m.group(2).strip() + return fm, body + +# ---- LLM config / client ---- +from dataclasses import dataclass + +@dataclass +class LLMConfig: + provider: str + api_base: str + model: str + api_key: str | None + temperature: float + top_p: float + presence_penalty: float + frequency_penalty: float + timeout_seconds: int + max_retries: int + +def resolve_llm_config(cfg: dict, args) -> LLMConfig: + llm_cfg = cfg.get("llm", {}) if cfg else {} + + def pick(cli_val, env_key, cfg_key, default=None): + if cli_val is not None: + return cli_val + if env_key and os.getenv(env_key): + return os.getenv(env_key) + return llm_cfg.get(cfg_key, default) + + provider = pick(getattr(args, "llm_provider", None), "LLM_PROVIDER", "provider", "openwebui") + api_base = pick(getattr(args, "llm_api_base", None), "LLM_API_BASE", "api_base", + "http://localhost:3000" if provider=="openwebui" else + "http://localhost:11434" if provider=="ollama" else + "https://api.openai.com") + model = pick(getattr(args, "llm_model", None), "LLM_MODEL", "model", + "qwen2.5-7b-instruct" if provider=="openwebui" else + "llama3.1:8b-instruct" if provider=="ollama" else + "gpt-4o-mini") + api_key = os.getenv("LLM_API_KEY") or (os.getenv("OPENAI_API_KEY") if provider=="openai" else None) + + temperature = float(pick(getattr(args, "temperature", None), "LLM_TEMPERATURE", "temperature", 0.2)) + top_p = float(pick(getattr(args, "top_p", None), "LLM_TOP_P", "top_p", 1.0)) + presence_penalty = float(pick(getattr(args, "presence_penalty", None), "LLM_PRESENCE_PENALTY", "presence_penalty", 0.0)) + frequency_penalty = float(pick(getattr(args, "frequency_penalty", None), "LLM_FREQUENCY_PENALTY", "frequency_penalty", 0.0)) + timeout_seconds = int(pick(getattr(args, "timeout_seconds", None), "LLM_TIMEOUT_SECONDS", "timeout_seconds", 120)) + max_retries = int(pick(getattr(args, "max_retries", None), "LLM_MAX_RETRIES", "max_retries", 2)) + + return LLMConfig( + provider=provider, api_base=api_base, model=model, api_key=api_key, + temperature=temperature, top_p=top_p, + presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, + timeout_seconds=timeout_seconds, max_retries=max_retries + ) + +def chat_completion_llm(messages, llm: LLMConfig): + if llm.provider == "openwebui": + url = 
f"{llm.api_base.rstrip('/')}/api/chat/completions" + headers = {"Content-Type":"application/json"} + if llm.api_key: + headers["Authorization"] = f"Bearer {llm.api_key}" + elif llm.provider == "ollama": + url = f"{llm.api_base.rstrip('/')}/v1/chat/completions" + headers = {"Content-Type":"application/json"} + if llm.api_key: + headers["Authorization"] = f"Bearer {llm.api_key}" + else: + url = f"{llm.api_base.rstrip('/')}/v1/chat/completions" + headers = {"Content-Type":"application/json"} + if llm.api_key: + headers["Authorization"] = f"Bearer {llm.api_key}" + + payload = { + "model": llm.model, + "messages": messages, + "temperature": llm.temperature, + "top_p": llm.top_p, + "presence_penalty": llm.presence_penalty, + "frequency_penalty": llm.frequency_penalty, + "stream": False + } + + attempt = 0 + last_err = None + while attempt <= llm.max_retries: + try: + r = requests.post(url, headers=headers, json=payload, timeout=llm.timeout_seconds) + r.raise_for_status() + data = r.json() + return data["choices"][0]["message"]["content"] + except Exception as e: + last_err = e + attempt += 1 + if attempt > llm.max_retries: + break + time.sleep(min(2**attempt, 8)) + raise RuntimeError(f"LLM request failed after {llm.max_retries} retries: {last_err}") + +def call_llm_via_messages(prompt: str, llm: LLMConfig) -> str: + return chat_completion_llm([{"role":"user","content": prompt}], llm) + +# ---- Ghost ---- +def ghost_jwt(key: str) -> str: + key_id, secret = key.split(':') + iat = int(time.time()) + header = {"alg": "HS256", "kid": key_id, "typ": "JWT"} + payload = {"iat": iat, "exp": iat + 5 * 60, "aud": "/admin/"} + return jwt.encode(payload, bytes.fromhex(secret), algorithm='HS256', headers=header) + +def create_ghost_draft(ghost_url, ghost_key, html_content, title, tags): + token = ghost_jwt(ghost_key) + payload = { "posts": [{ + "title": title, "html": html_content, "status": "draft", + "tags": [{"name": t} for t in tags] + }]} + r = requests.post( + f"{ghost_url}/posts/", + headers={"Authorization": f"Ghost {token}", "Content-Type": "application/json"}, + data=json.dumps(payload), timeout=60 + ) + r.raise_for_status() + return r.json()["posts"][0]["url"] + +# ---- Memory/embeddings ---- +from db import connect as db_connect, topk_similar +from emb import embed_text + +def build_related_hint_auto(title, body, llm_cfg, cfg_db): + api_base = os.getenv("EMB_API_BASE", llm_cfg.api_base) + api_key = os.getenv("EMB_API_KEY", llm_cfg.api_key) + model = os.getenv("EMB_MODEL", cfg_db.get("embed_model", "text-embedding-3-small")) + qtext = (title + "\n\n" + body)[:5000] + try: + vec = embed_text(qtext, api_base, api_key, model) + except Exception: + return "—" + con = db_connect(cfg_db["path"]) + hits = topk_similar(con, model=model, query_vec=vec, + ref_table="summaries", + k=cfg_db.get("related_top_k",3), + min_sim=cfg_db.get("min_similarity",0.78)) + if not hits: + return "—" + lines = [] + for sid, t, s, nd in hits: + lines.append(f"- {nd or 'dříve'}: {t}") + return "O podobném tématu jsme psali:\n" + "\n".join(lines) + "\nZmiň jednou větou souvislost." + +def main(): + ap = argparse.ArgumentParser(description="Offline-first generator + Ghost draft") + ap.add_argument("entries_dir", help="entries/YYYY-MM-DD directory") + ap.add_argument("--out", help="Output HTML path, e.g. 
dist/2025-09-19.html") + ap.add_argument("--dry-run", action="store_true") + ap.add_argument("--publish", action="store_true") + # LLM overrides + ap.add_argument("--llm-provider") + ap.add_argument("--llm-api-base") + ap.add_argument("--llm-model") + ap.add_argument("--temperature", type=float) + ap.add_argument("--top-p", type=float) + ap.add_argument("--presence-penalty", type=float) + ap.add_argument("--frequency-penalty", type=float) + ap.add_argument("--timeout-seconds", type=int) + ap.add_argument("--max-retries", type=int) + args = ap.parse_args() + + cfg = load_config() + llm = resolve_llm_config(cfg, args) + + item_tpl = read_file(TEMPLATES / "item.html.j2") + news_tpl = read_file(TEMPLATES / "newsletter.html.j2") + prompt_template = read_file(TEMPLATES / "prompt.txt") + style_examples = read_file(TEMPLATES / "style_bank.md").strip() + prompt_template = prompt_template.replace("{style_examples}", style_examples) + + paths = sorted(glob.glob(os.path.join(args.entries_dir, "*.md"))) + blocks = [] + for p in paths: + fm_text = pathlib.Path(p).read_text(encoding="utf-8") + fm, body = parse_front_matter(fm_text) + if fm.get("status","todo") == "skip": + continue + title = fm.get("title") or pathlib.Path(p).stem.replace("-"," ").title() + source_name = fm.get("source_name","Zdroj neuveden") + related_hint = build_related_hint_auto(title, body, llm, cfg.get("db",{})) + prompt = (prompt_template + .replace("{title}", title) + .replace("{body}", body) + .replace("{source_name}", source_name) + .replace("{related_hint}", related_hint)) + summary = call_llm_via_messages(prompt, llm) + block_html = Template(item_tpl).render(title=title, summary=summary) + blocks.append(block_html) + + newsletter_title = Template(cfg["newsletter_title"]).render(date=cfg["date"]) + newsletter_subtitle = cfg.get("newsletter_subtitle","") + html_out = Template(news_tpl).render( + newsletter_title=newsletter_title, + newsletter_subtitle=newsletter_subtitle, + blocks=blocks + ) + + if args.out: + outp = pathlib.Path(args.out) + outp.parent.mkdir(parents=True, exist_ok=True) + outp.write_text(html_out, encoding="utf-8") + print(f"Saved: {outp}") + + if args.publish: + ghost_url = os.getenv("GHOST_ADMIN_API_URL") + ghost_key = os.getenv("GHOST_ADMIN_API_KEY") + if not (ghost_url and ghost_key): + print("Missing GHOST_ADMIN_API_URL or GHOST_ADMIN_API_KEY in .env", file=sys.stderr) + sys.exit(2) + url = create_ghost_draft(ghost_url, ghost_key, html_out, newsletter_title, cfg.get("default_tags",[])) + print("Draft:", url) + + if not (args.out or args.publish): + print(html_out) + +if __name__ == "__main__": + main() diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..ab02050 --- /dev/null +++ b/config.yaml @@ -0,0 +1,22 @@ +newsletter_title: "Objevy týdne – {{ date }}" +newsletter_subtitle: "Archeologie, historie a příbuzné vědy" +default_tags: ["Newsletter","Archeologie"] +model: "gpt-4o-mini" +temperature: 0.2 + +llm: + provider: "openwebui" # openwebui | ollama | openai + api_base: "http://localhost:3000" + model: "qwen2.5-7b-instruct" + temperature: 0.2 + top_p: 1.0 + presence_penalty: 0.0 + frequency_penalty: 0.0 + timeout_seconds: 120 + max_retries: 2 + +db: + path: "data/newsletter.db" + embed_model: "text-embedding-3-small" + related_top_k: 3 + min_similarity: 0.78 diff --git a/db.py b/db.py new file mode 100644 index 0000000..8a79a12 --- /dev/null +++ b/db.py @@ -0,0 +1,91 @@ +import sqlite3 +from pathlib import Path +import math +import struct + +def connect(db_path: str) -> 
sqlite3.Connection:
+    Path(db_path).parent.mkdir(parents=True, exist_ok=True)
+    con = sqlite3.connect(db_path)
+    con.execute("PRAGMA foreign_keys = ON;")
+    return con
+
+def init_db(con: sqlite3.Connection, schema_path: str = "scripts/schema.sql"):
+    schema = Path(schema_path).read_text(encoding="utf-8")
+    con.executescript(schema)
+    con.commit()
+
+def pack_vec(vec):
+    # Little-endian float32 blob; compact and portable across platforms.
+    return struct.pack("<%sf" % len(vec), *vec)
+
+def unpack_vec(blob):
+    fcount = len(blob) // 4
+    return list(struct.unpack("<%sf" % fcount, blob))
+
+def cosine(a, b):
+    na = math.sqrt(sum(x * x for x in a))
+    nb = math.sqrt(sum(x * x for x in b))
+    if na == 0 or nb == 0:
+        return 0.0
+    return sum(x * y for x, y in zip(a, b)) / (na * nb)
+
+def upsert_source(con, url=None, title=None, publisher=None, date_published=None, content=None, tags=None):
+    con.execute(
+        """INSERT INTO sources(url, title, publisher, date_published, content)
+           VALUES(?,?,?,?,?)
+           ON CONFLICT(url) DO UPDATE SET
+             title=COALESCE(excluded.title, title),
+             publisher=COALESCE(excluded.publisher, publisher),
+             date_published=COALESCE(excluded.date_published, date_published),
+             content=COALESCE(excluded.content, content)
+        """, (url, title, publisher, date_published, content)
+    )
+    sid = con.execute("SELECT id FROM sources WHERE url=?", (url,)).fetchone()[0]
+    if tags:
+        for t in tags:
+            con.execute("INSERT OR IGNORE INTO tags(name) VALUES(?)", (t,))
+            tid = con.execute("SELECT id FROM tags WHERE name=?", (t,)).fetchone()[0]
+            con.execute("INSERT OR IGNORE INTO source_tags(source_id, tag_id) VALUES(?,?)", (sid, tid))
+    con.commit()
+    return sid
+
+def insert_summary(con, source_id, title, summary, newsletter_date=None, tone_version=None):
+    cur = con.cursor()
+    cur.execute(
+        """INSERT INTO summaries(source_id, title, summary, newsletter_date, tone_version)
+           VALUES (?,?,?,?,?)""",
+        (source_id, title, summary, newsletter_date, tone_version)
+    )
+    con.commit()
+    return cur.lastrowid
+
+def upsert_embedding(con, ref_table, ref_id, model, vec):
+    dim = len(vec)
+    blob = pack_vec(vec)
+    con.execute(
+        """INSERT INTO embeddings(ref_table, ref_id, model, dim, vec)
+           VALUES (?,?,?,?,?)
+           ON CONFLICT(ref_table, ref_id, model) DO UPDATE SET vec=excluded.vec, dim=excluded.dim""",
+        (ref_table, ref_id, model, dim, blob)
+    )
+    con.commit()
+
+def topk_similar(con, model, query_vec, ref_table="summaries", k=3, min_sim=0.78):
+    rows = con.execute(
+        "SELECT ref_id, dim, vec FROM embeddings WHERE ref_table=? AND model=?;",
+        (ref_table, model)
+    ).fetchall()
+    scored = []
+    for ref_id, dim, blob in rows:
+        vec = unpack_vec(blob)
+        if len(vec) != len(query_vec):
+            continue
+        sim = cosine(query_vec, vec)
+        if sim >= min_sim:
+            scored.append((sim, ref_id))
+    scored.sort(reverse=True)
+    ref_ids = [rid for _, rid in scored[:k]]
+    if not ref_ids:
+        return []
+    if ref_table == "summaries":
+        q = "SELECT id, title, summary, newsletter_date FROM summaries WHERE id IN (%s)" % ",".join("?" * len(ref_ids))
+    else:
+        q = "SELECT id, title, url, date_published FROM sources WHERE id IN (%s)" % ",".join("?" * len(ref_ids))
+    rows = con.execute(q, ref_ids).fetchall()
+    # SQL IN() does not preserve order, so re-sort by descending similarity.
+    order = {rid: i for i, rid in enumerate(ref_ids)}
+    return sorted(rows, key=lambda r: order[r[0]])
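For a sense of how these helpers fit together, here is a minimal sketch of the related-items lookup that build.py performs with them. The DB path, endpoint, and query text are illustrative assumptions, not values fixed by this patch.

```python
# Sketch: find previously published summaries similar to a new article.
# Assumes the schema from scripts/schema.sql and a reachable embeddings endpoint.
from db import connect, topk_similar
from emb import embed_text

con = connect("data/newsletter.db")
query = "Velšské deštné pralesy"  # in build.py this is title + body, truncated
vec = embed_text(query, api_base="http://localhost:3000",  # assumed endpoint
                 api_key=None, model="text-embedding-3-small")
hits = topk_similar(con, model="text-embedding-3-small", query_vec=vec,
                    ref_table="summaries", k=3, min_sim=0.78)
for sid, title, summary, newsletter_date in hits:
    print(newsletter_date, title)
```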
diff --git a/emb.py b/emb.py
new file mode 100644
index 0000000..b3c65c0
--- /dev/null
+++ b/emb.py
@@ -0,0 +1,10 @@
+import os, requests
+
+def embed_text(text: str, api_base: str, api_key: str|None, model: str) -> list[float]:
+    url = f"{api_base.rstrip('/')}/v1/embeddings"
+    headers = {"Content-Type": "application/json"}
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+    r = requests.post(url, headers=headers, json={"model": model, "input": text}, timeout=60)
+    r.raise_for_status()
+    return r.json()["data"][0]["embedding"]
diff --git a/entries/2025-09-19/sample-welsh-rainforests.md b/entries/2025-09-19/sample-welsh-rainforests.md
new file mode 100644
index 0000000..115df8f
--- /dev/null
+++ b/entries/2025-09-19/sample-welsh-rainforests.md
@@ -0,0 +1,11 @@
+---
+title: "Velšské deštné pralesy"
+source_name: "BBC / Wikipedie"
+url: "https://www.bbc.com/news/science-environment-00000000"
+tags: ["folklor","krajina"]
+status: "todo"
+---
+
+Deštné pralesy má asi většina z nás spojené spíš s Amazonií nebo jihovýchodní Asií, ale ony existují nejen v tropickém klimatickém pásu, ale i v našem, mírném. Jako deštný se označuje les s více než 200 cm srážek ročně. A právě takový les se kdysi rozkládal od Walesu až po Skotsko.
+
+Dnes z něj zbývá necelých 10 % rozlohy, ale je o něj pečováno velmi pečlivě. Tento prales je také zdrojem celé řady místních mýtů. V tom asi nejznámějším, Mabinogi, se ve čtvrté větvi stane roztržka mezi Pryderim a Gwydionem...
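The stub above doubles as the front-matter contract: build.py reads `title`, `source_name`, and `status`, and skips anything marked `skip`. A minimal sketch of that parsing step, mirroring `parse_front_matter` in build.py (the path is the sample file above):

```python
# Sketch: parse an entry stub the same way build.py does.
import re, pathlib
import yaml

text = pathlib.Path("entries/2025-09-19/sample-welsh-rainforests.md").read_text(encoding="utf-8")
m = re.match(r"^---\n(.*?)\n---\n(.*)$", text, flags=re.S | re.M)
fm = (yaml.safe_load(m.group(1)) or {}) if m else {}
body = m.group(2).strip() if m else text.strip()
if fm.get("status", "todo") != "skip":  # entries marked "skip" are excluded
    print(fm["title"], "->", len(body), "characters of body")
```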
diff --git a/inbox.txt b/inbox.txt new file mode 100644 index 0000000..454b82f --- /dev/null +++ b/inbox.txt @@ -0,0 +1,4 @@ +# Paste URLs here (one per line) +https://www.bbc.com/news/science-environment-00000000 +https://www.nature.com/articles/xxxxxxxx +https://antiquity.ac.uk/article/yyyyyyyy diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..837d836 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,12 @@ +pyyaml +jinja2 +python-dotenv +requests +markdown +pyjwt +beautifulsoup4 +readability-lxml +trafilatura +tldextract +python-slugify +numpy diff --git a/scripts/build_from_db.py b/scripts/build_from_db.py new file mode 100644 index 0000000..5db020b --- /dev/null +++ b/scripts/build_from_db.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +import os, sys, time, json, argparse, pathlib +import yaml, requests, jwt +from jinja2 import Template +from dotenv import load_dotenv +from datetime import date + +from db import connect as db_connect, insert_summary, upsert_embedding +from emb import embed_text + +load_dotenv() +ROOT = pathlib.Path(__file__).parent +REPO = ROOT.parent +TEMPLATES = REPO / "templates" + +def read_file(p): return pathlib.Path(p).read_text(encoding="utf-8") + +def load_config(): + cfg = yaml.safe_load(read_file(REPO / "config.yaml")) + cfg["date"] = date.today().isoformat() + return cfg + +from dataclasses import dataclass +import requests + +@dataclass +class LLMConfig: + provider: str + api_base: str + model: str + api_key: str | None + temperature: float + top_p: float + presence_penalty: float + frequency_penalty: float + timeout_seconds: int + max_retries: int + +def resolve_llm_config(cfg: dict, args) -> LLMConfig: + llm_cfg = cfg.get("llm", {}) if cfg else {} + def pick(cli_val, env_key, cfg_key, default=None): + if cli_val is not None: + return cli_val + if env_key and os.getenv(env_key): + return os.getenv(env_key) + return llm_cfg.get(cfg_key, default) + provider = pick(getattr(args, "llm_provider", None), "LLM_PROVIDER", "provider", "openwebui") + api_base = pick(getattr(args, "llm_api_base", None), "LLM_API_BASE", "api_base", + "http://localhost:3000" if provider=="openwebui" else + "http://localhost:11434" if provider=="ollama" else + "https://api.openai.com") + model = pick(getattr(args, "llm_model", None), "LLM_MODEL", "model", + "qwen2.5-7b-instruct" if provider=="openwebui" else + "llama3.1:8b-instruct" if provider=="ollama" else + "gpt-4o-mini") + api_key = os.getenv("LLM_API_KEY") or (os.getenv("OPENAI_API_KEY") if provider=="openai" else None) + temperature = float(pick(getattr(args, "temperature", None), "LLM_TEMPERATURE", "temperature", 0.2)) + top_p = float(pick(getattr(args, "top_p", None), "LLM_TOP_P", "top_p", 1.0)) + presence_penalty = float(pick(getattr(args, "presence_penalty", None), "LLM_PRESENCE_PENALTY", "presence_penalty", 0.0)) + frequency_penalty = float(pick(getattr(args, "frequency_penalty", None), "LLM_FREQUENCY_PENALTY", "frequency_penalty", 0.0)) + timeout_seconds = int(pick(getattr(args, "timeout_seconds", None), "LLM_TIMEOUT_SECONDS", "timeout_seconds", 120)) + max_retries = int(pick(getattr(args, "max_retries", None), "LLM_MAX_RETRIES", "max_retries", 2)) + return LLMConfig(provider, api_base, model, api_key, temperature, top_p, presence_penalty, frequency_penalty, timeout_seconds, max_retries) + +def chat_completion_llm(messages, llm: LLMConfig): + if llm.provider == "openwebui": + url = f"{llm.api_base.rstrip('/')}/api/chat/completions" + elif llm.provider == "ollama": + url = 
f"{llm.api_base.rstrip('/')}/v1/chat/completions" + else: + url = f"{llm.api_base.rstrip('/')}/v1/chat/completions" + headers = {"Content-Type":"application/json"} + if llm.api_key: headers["Authorization"] = f"Bearer {llm.api_key}" + payload = {"model": llm.model, "messages": messages, "temperature": llm.temperature, "top_p": llm.top_p, + "presence_penalty": llm.presence_penalty, "frequency_penalty": llm.frequency_penalty, "stream": False} + r = requests.post(url, headers=headers, json=payload, timeout=llm.timeout_seconds) + r.raise_for_status() + return r.json()["choices"][0]["message"]["content"] + +def main(): + ap = argparse.ArgumentParser(description="Build directly from DB (Top-N sources)") + ap.add_argument("--db", default="data/newsletter.db") + ap.add_argument("--limit", type=int, default=10) + ap.add_argument("--out", required=True) + ap.add_argument("--publish", action="store_true") + # LLM overrides + ap.add_argument("--llm-provider"); ap.add_argument("--llm-api-base") + ap.add_argument("--llm-model"); ap.add_argument("--temperature", type=float) + ap.add_argument("--top-p", type=float); ap.add_argument("--presence-penalty", type=float) + ap.add_argument("--frequency-penalty", type=float); ap.add_argument("--timeout-seconds", type=int) + ap.add_argument("--max-retries", type=int) + args = ap.parse_args() + + cfg = load_config() + llm = resolve_llm_config(cfg, args) + con = db_connect(args.db) + + rows = con.execute("SELECT id, url, title, publisher FROM sources ORDER BY id DESC LIMIT ?", (args.limit,)).fetchall() + + prompt_template = (TEMPLATES / "prompt.txt").read_text(encoding="utf-8") + style_examples = (TEMPLATES / "style_bank.md").read_text(encoding="utf-8").strip() + prompt_template = prompt_template.replace("{style_examples}", style_examples) + + item_tpl = (TEMPLATES / "item.html.j2").read_text(encoding="utf-8") + news_tpl = (TEMPLATES / "newsletter.html.j2").read_text(encoding="utf-8") + + blocks = [] + for sid, url, title, publisher in rows: + body = (con.execute("SELECT content FROM sources WHERE id=?", (sid,)).fetchone()[0]) or "" + related_hint = "—" + prompt = (prompt_template + .replace("{title}", title or url) + .replace("{body}", body) + .replace("{source_name}", publisher or "Zdroj neuveden") + .replace("{related_hint}", related_hint)) + summary = chat_completion_llm([{"role":"user","content": prompt}], llm) + + sum_id = insert_summary(con, sid, title or url, summary, newsletter_date=cfg["date"], tone_version="v1") + try: + vec = embed_text(summary, os.getenv("EMB_API_BASE", cfg["llm"]["api_base"]), os.getenv("EMB_API_KEY", os.getenv("LLM_API_KEY")), os.getenv("EMB_MODEL", cfg["db"]["embed_model"])) + upsert_embedding(con, "summaries", sum_id, os.getenv("EMB_MODEL", cfg["db"]["embed_model"]), vec) + except Exception: + pass + + blocks.append(Template(item_tpl).render(title=(title or url), summary=summary)) + + newsletter_title = Template(cfg["newsletter_title"]).render(date=cfg["date"]) + newsletter_subtitle = cfg.get("newsletter_subtitle","") + html_out = Template(news_tpl).render(newsletter_title=newsletter_title, newsletter_subtitle=newsletter_subtitle, blocks=blocks) + + outp = pathlib.Path(args.out); outp.parent.mkdir(parents=True, exist_ok=True); outp.write_text(html_out, encoding="utf-8") + print(f"Saved: {outp}") + + if args.publish: + ghost_url = os.getenv("GHOST_ADMIN_API_URL") + ghost_key = os.getenv("GHOST_ADMIN_API_KEY") + if ghost_url and ghost_key: + def ghost_jwt(key: str) -> str: + key_id, secret = key.split(':') + iat = int(time.time()) + 
header = {"alg": "HS256", "kid": key_id, "typ": "JWT"} + payload = {"iat": iat, "exp": iat + 5 * 60, "aud": "/admin/"} + import jwt + return jwt.encode(payload, bytes.fromhex(secret), algorithm='HS256', headers=header) + token = ghost_jwt(ghost_key) + payload = {"posts":[{"title": newsletter_title, "html": html_out, "status": "draft", "tags": [{"name": t} for t in cfg.get("default_tags",[])]}]} + r = requests.post(f"{ghost_url}/posts/", headers={"Authorization": f"Ghost {token}", "Content-Type": "application/json"}, data=json.dumps(payload), timeout=60) + r.raise_for_status() + print("Draft:", r.json()["posts"][0]["url"]) + else: + print("Missing Ghost creds; skipped publish.") + +if __name__ == "__main__": + main() diff --git a/scripts/db_cli.py b/scripts/db_cli.py new file mode 100644 index 0000000..8d5f2d9 --- /dev/null +++ b/scripts/db_cli.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +import argparse +from db import connect, init_db + +def main(): + ap = argparse.ArgumentParser(description="DB CLI") + ap.add_argument("cmd", choices=["init"]) + ap.add_argument("--db", default="data/newsletter.db") + ap.add_argument("--schema", default="scripts/schema.sql") + args = ap.parse_args() + + con = connect(args.db) + if args.cmd == "init": + init_db(con, args.schema) + print(f"Initialized schema in {args.db}") + +if __name__ == "__main__": + main() diff --git a/scripts/ingest_list.py b/scripts/ingest_list.py new file mode 100644 index 0000000..12c06bd --- /dev/null +++ b/scripts/ingest_list.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +import argparse, sqlite3, re, sys, time +from pathlib import Path +from urllib.parse import urlparse, urlunparse +import requests, tldextract +from bs4 import BeautifulSoup +from readability import Document as ReadabilityDoc +import trafilatura +from slugify import slugify +from datetime import date + +def connect(db_path): + con = sqlite3.connect(db_path) + con.execute("PRAGMA foreign_keys=ON;") + return con + +def upsert_source(con, url, title=None, publisher=None, date_published=None, content=None, tags=None): + con.execute( + """INSERT INTO sources(url, title, publisher, date_published, content) + VALUES (?,?,?,?,?) 
+           ON CONFLICT(url) DO UPDATE SET
+             title=COALESCE(excluded.title, title),
+             publisher=COALESCE(excluded.publisher, publisher),
+             date_published=COALESCE(excluded.date_published, date_published),
+             content=COALESCE(excluded.content, content)
+        """, (url, title, publisher, date_published, content)
+    )
+    sid = con.execute("SELECT id FROM sources WHERE url=?", (url,)).fetchone()[0]
+    con.commit()
+    return sid
+
+def normalize_url(u: str) -> str:
+    p = urlparse(u.strip())
+    if not p.scheme:
+        p = p._replace(scheme="https")
+    # Strip tracking parameters before deduplication.
+    query = re.sub(r'(&|\?)?(utm_[^=&]+|fbclid|gclid)=[^&]*', '', p.query)
+    if query.startswith('&'):
+        query = query[1:]
+    return urlunparse((p.scheme, p.netloc, p.path, p.params, query, ""))
+
+def domain(url: str) -> str:
+    ext = tldextract.extract(url)
+    return ".".join(part for part in [ext.domain, ext.suffix] if part)
+
+def fetch_readable(url: str, timeout=20):
+    try:
+        r = requests.get(url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
+        r.raise_for_status()
+        html = r.text
+    except Exception:
+        return "", ""
+    # Prefer trafilatura; fall back to readability-lxml.
+    try:
+        txt = trafilatura.extract(html, include_comments=False, include_tables=False, favor_recall=True)
+        if txt:
+            soup = BeautifulSoup(html, "html.parser")
+            t = (soup.title.string.strip() if soup.title and soup.title.string else "")
+            return t, txt.strip()
+    except Exception:
+        pass
+    try:
+        doc = ReadabilityDoc(html)
+        content_html = doc.summary()
+        soup = BeautifulSoup(content_html, "html.parser")
+        txt = soup.get_text(separator="\n").strip()
+        return (doc.short_title() or "").strip(), txt
+    except Exception:
+        return "", ""
+
+def write_stub(entries_dir: Path, title: str, url: str, source_name: str|None, text: str|None):
+    entries_dir.mkdir(parents=True, exist_ok=True)
+    slug = slugify(title or domain(url) or "clanek")[:60] or "clanek"
+    path = entries_dir / f"{slug}.md"
+    body = text.strip() if text else ""
+    # Escape double quotes so the YAML front matter stays valid.
+    esc_title = (title or "").replace('"', '\\"')
+    esc_source = (source_name or domain(url) or "").replace('"', '\\"')
+    fm = f'''---
+title: "{esc_title}"
+source_name: "{esc_source}"
+url: "{url}"
+tags: []
+status: "todo"
+---
+
+{body}
+'''
+    path.write_text(fm, encoding="utf-8")
+    return path
+
+def read_lines(source_path: str|None):
+    if source_path:
+        return Path(source_path).read_text(encoding="utf-8", errors="ignore").splitlines()
+    return sys.stdin.read().splitlines()
+
+def main():
+    ap = argparse.ArgumentParser(description="Ingest list of URLs into SQLite and optional entry stubs")
+    ap.add_argument("--db", default="data/newsletter.db")
+    ap.add_argument("--list", help="Text file with URLs (one per line).
If omitted, read from STDIN.") + ap.add_argument("--fetch", action="store_true", help="Fetch & extract readable text") + ap.add_argument("--sleep", type=float, default=0.0) + ap.add_argument("--stubs", action="store_true") + ap.add_argument("--date", default=date.today().isoformat()) + args = ap.parse_args() + + con = connect(args.db) + lines = read_lines(args.list) + + urls = [] + for ln in lines: + ln = ln.strip() + if not ln or ln.startswith("#"): + continue + if re.match(r"^\w+://", ln) or re.match(r"^[\w\.-]+\.[a-z]{2,}(/|$)", ln): + urls.append(normalize_url(ln)) + seen = set(); urls = [u for u in urls if not (u in seen or seen.add(u))] + + stubs_dir = Path(f"entries/{args.date}") if args.stubs else None + kept = 0 + for url in urls: + pub = domain(url) + title, text = ("","") + if args.fetch: + title, text = fetch_readable(url) + sid = upsert_source(con, url=url, title=(title or None), publisher=pub, content=(text or None)) + kept += 1 + if stubs_dir: + stub = write_stub(stubs_dir, title or url, url, pub, text) + print(f"Stub: {stub}") + if args.sleep: time.sleep(args.sleep) + print(f"Ingested: {kept} URLs into {args.db}") + +if __name__ == "__main__": + main() diff --git a/scripts/schema.sql b/scripts/schema.sql new file mode 100644 index 0000000..19c05e5 --- /dev/null +++ b/scripts/schema.sql @@ -0,0 +1,71 @@ +PRAGMA foreign_keys = ON; + +CREATE TABLE IF NOT EXISTS sources ( + id INTEGER PRIMARY KEY, + url TEXT UNIQUE, + title TEXT, + publisher TEXT, + date_published TEXT, + content TEXT +); + +CREATE TABLE IF NOT EXISTS summaries ( + id INTEGER PRIMARY KEY, + source_id INTEGER REFERENCES sources(id) ON DELETE SET NULL, + title TEXT NOT NULL, + summary TEXT NOT NULL, + newsletter_date TEXT, + tone_version TEXT, + created_at TEXT DEFAULT (datetime('now')) +); + +CREATE TABLE IF NOT EXISTS tags ( + id INTEGER PRIMARY KEY, + name TEXT UNIQUE +); +CREATE TABLE IF NOT EXISTS source_tags ( + source_id INTEGER REFERENCES sources(id) ON DELETE CASCADE, + tag_id INTEGER REFERENCES tags(id) ON DELETE CASCADE, + PRIMARY KEY(source_id, tag_id) +); + +CREATE TABLE IF NOT EXISTS embeddings ( + id INTEGER PRIMARY KEY, + ref_table TEXT NOT NULL CHECK(ref_table IN ('sources','summaries')), + ref_id INTEGER NOT NULL, + model TEXT NOT NULL, + dim INTEGER NOT NULL, + vec BLOB NOT NULL, + UNIQUE(ref_table, ref_id, model) +); + +CREATE VIRTUAL TABLE IF NOT EXISTS sources_fts USING fts5( + title, content, content='sources', content_rowid='id' +); +CREATE VIRTUAL TABLE IF NOT EXISTS summaries_fts USING fts5( + title, summary, content='summaries', content_rowid='id' +); + +CREATE TRIGGER IF NOT EXISTS sources_ai AFTER INSERT ON sources BEGIN + INSERT INTO sources_fts(rowid, title, content) VALUES (new.id, new.title, new.content); +END; +CREATE TRIGGER IF NOT EXISTS sources_au AFTER UPDATE ON sources BEGIN + INSERT INTO sources_fts(sources_fts, rowid, title, content) + VALUES('delete', old.id, old.title, old.content); + INSERT INTO sources_fts(rowid, title, content) VALUES (new.id, new.title, new.content); +END; +CREATE TRIGGER IF NOT EXISTS sources_ad AFTER DELETE ON sources BEGIN + INSERT INTO sources_fts(sources_fts, rowid, title, content) VALUES('delete', old.id, old.title, old.content); +END; + +CREATE TRIGGER IF NOT EXISTS summaries_ai AFTER INSERT ON summaries BEGIN + INSERT INTO summaries_fts(rowid, title, summary) VALUES (new.id, new.title, new.summary); +END; +CREATE TRIGGER IF NOT EXISTS summaries_au AFTER UPDATE ON summaries BEGIN + INSERT INTO summaries_fts(summaries_fts, rowid, title, 
summary)
+  VALUES('delete', old.id, old.title, old.summary);
+  INSERT INTO summaries_fts(rowid, title, summary) VALUES (new.id, new.title, new.summary);
+END;
+CREATE TRIGGER IF NOT EXISTS summaries_ad AFTER DELETE ON summaries BEGIN
+  INSERT INTO summaries_fts(summaries_fts, rowid, title, summary) VALUES('delete', old.id, old.title, old.summary);
+END;
diff --git a/scripts/sync_entries_to_db.py b/scripts/sync_entries_to_db.py
new file mode 100644
index 0000000..499fb9b
--- /dev/null
+++ b/scripts/sync_entries_to_db.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+import argparse, re
+from pathlib import Path
+import yaml
+from db import connect, upsert_source
+
+def parse_front_matter(text: str):
+    m = re.match(r"^---\n(.*?)\n---\n(.*)$", text, flags=re.S|re.M)
+    if not m:
+        return {}, text.strip()
+    fm = yaml.safe_load(m.group(1)) or {}
+    body = m.group(2).strip()
+    return fm, body
+
+def main():
+    ap = argparse.ArgumentParser(description="Sync entries/*.md into SQLite sources")
+    ap.add_argument("--db", default="data/newsletter.db")
+    ap.add_argument("--dir", required=True, help="entries/YYYY-MM-DD directory")
+    args = ap.parse_args()
+
+    con = connect(args.db)
+    for p in Path(args.dir).glob("*.md"):
+        text = p.read_text(encoding="utf-8")
+        fm, body = parse_front_matter(text)
+        title = fm.get("title") or p.stem
+        url = fm.get("url")
+        publisher = fm.get("source_name")
+        upsert_source(con,
+            url=url,
+            title=title,
+            publisher=publisher,
+            date_published=None,
+            content=body
+        )
+        print(f"Synced: {p.name}")
+
+if __name__ == "__main__":
+    main()
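Because the FTS tables above use external content and are maintained entirely by triggers, a quick MATCH query is an easy way to confirm that ingest and sync kept them in step. A minimal sketch (the query term is illustrative):

```python
# Sketch: full-text search over ingested sources.
import sqlite3

con = sqlite3.connect("data/newsletter.db")
rows = con.execute(
    "SELECT rowid, title FROM sources_fts WHERE sources_fts MATCH ? LIMIT 5",
    ("prales*",),  # FTS5 prefix query, illustrative
).fetchall()
for rowid, title in rows:
    print(rowid, title)
```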

diff --git a/templates/item.html.j2 b/templates/item.html.j2
new file mode 100644
index 0000000..07138c2
--- /dev/null
+++ b/templates/item.html.j2
@@ -0,0 +1,2 @@
+<h3>{{ title }}</h3>
+<p>{{ summary }}</p>
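Taken together, a minimal sketch of how build.py renders one item block and then wraps all blocks in the newsletter shell (sample values, not real output):

```python
# Sketch: render item.html.j2 per article, then newsletter.html.j2 around the blocks.
from jinja2 import Template
import pathlib

item_tpl = pathlib.Path("templates/item.html.j2").read_text(encoding="utf-8")
news_tpl = pathlib.Path("templates/newsletter.html.j2").read_text(encoding="utf-8")

block = Template(item_tpl).render(title="Velšské deštné pralesy",
                                  summary="Tři věty shrnutí... Zdroj: BBC / Wikipedie")
page = Template(news_tpl).render(newsletter_title="Objevy týdne – 2025-09-19",
                                 newsletter_subtitle="Archeologie, historie a příbuzné vědy",
                                 blocks=[block])
print(page)
```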

diff --git a/templates/newsletter.html.j2 b/templates/newsletter.html.j2
new file mode 100644
index 0000000..9fe914f
--- /dev/null
+++ b/templates/newsletter.html.j2
@@ -0,0 +1,7 @@
+<h1>{{ newsletter_title }}</h1>
+<p>{{ newsletter_subtitle }}</p>
+
+{% for b in blocks %}
+{{ b }}
+{% if not loop.last %}<hr/>{% endif %}
+{% endfor %}
diff --git a/templates/prompt.txt b/templates/prompt.txt
new file mode 100644
index 0000000..22c568f
--- /dev/null
+++ b/templates/prompt.txt
@@ -0,0 +1,24 @@
+Jsi editor newsletteru o archeologii, historii a příbuzných vědách. Piš česky, srozumitelně pro laiky, věcně, bez žargonu; jednu větu můžeš udělat mírně poutavou, ale ne bulvární. Drž rozsah 3–4 věty.
+
+ŠABLONA OBSAHU:
+1) Co je novinka/objev.
+2) Proč je důležitý (dopad, změna pohledu, metoda).
+3) Kontext nebo zajímavost (kultura, datace, souvislosti).
+4) (volitelně) Jedna věta s odkazem na předchozí související text, pokud je k dispozici.
+Na konec přidej: „Zdroj: {source_name}“.
+
+=== UKÁZKY STYLU ===
+{style_examples}
+
+=== VSTUPNÍ POLOŽKA ===
+TITULEK: {title}
+TEXT:
+{body}
+
+=== NÁVAZNOSTI (VOLITELNÉ) ===
+{related_hint}
+
+=== POKYNY ===
+- Drž 3–4 věty.
+- Pokud jsou informace neúplné, buď opatrný a nic si nevymýšlej.
+- Zakonči přesně: „Zdroj: {source_name}“.
diff --git a/templates/style_bank.md b/templates/style_bank.md
new file mode 100644
index 0000000..66ef12e
--- /dev/null
+++ b/templates/style_bank.md
@@ -0,0 +1,2 @@
+- Velšské deštné pralesy: Velká Británie je domovem unikátních zbytků mírného deštného pralesa, který se kdysi rozkládal od Walesu až po Skotsko. Dnes přežívá jen asi 10 % původní plochy, ale probíhají programy jeho ochrany a obnovy. Pralesy se objevují i v mytologii, například ve čtvrté větvi velšského eposu Mabinogi, kde jsou spojeny s kouzelnými zvířaty. Kontrolovaná pastva, kterou naši předci udržovali les zdravý, se používá i dnes.
+- Podmořské stavby lovců‑sběračů: Vědci objevili pod hladinou Baltu kilometr dlouhou kamennou linii, pravděpodobně prehistorickou loveckou strukturu. Nález naznačuje promyšlenou organizaci lovu i přizpůsobení se krajině, kterou později zaplavilo moře. Podobné konstrukce známe z jiných částí světa, ale v Evropě jsou vzácné.
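For reference, a minimal sketch of how build.py combines the two template files above into the final prompt; the article values are samples, but the placeholder names and the chain of .replace calls match the code.

```python
# Sketch: fill prompt.txt placeholders the way build.py does before calling the LLM.
import pathlib

tpl = pathlib.Path("templates/prompt.txt").read_text(encoding="utf-8")
style = pathlib.Path("templates/style_bank.md").read_text(encoding="utf-8").strip()
prompt = (tpl.replace("{style_examples}", style)
             .replace("{title}", "Velšské deštné pralesy")   # sample article
             .replace("{body}", "Text článku...")            # extracted text
             .replace("{source_name}", "BBC / Wikipedie")
             .replace("{related_hint}", "—"))                # or the auto-built block
print(prompt)
```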