commit 079bd25899be9edaffa484a6f3f1e02695b0fe06 Author: Tomas Kracmar Date: Sun Sep 14 16:56:08 2025 +0200 First commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b409e14 --- /dev/null +++ b/.gitignore @@ -0,0 +1,21 @@ +# --- Python / environment --- +.venv/ +__pycache__/ +*.py[cod] +*.egg-info/ +.DS_Store + +# --- Secrets --- +.env + +# --- Local database (optional) --- +# Comment out if you DO want to commit newsletter history! +data/*.db + +# --- Build outputs --- +dist/ +*.html + +# --- Logs / temp --- +*.log +.cache/ \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..dbfea05 --- /dev/null +++ b/Makefile @@ -0,0 +1,57 @@ +DATE ?= $(shell date +%F) +ENTRIES_DIR ?= entries/$(DATE) +OUT ?= dist/$(DATE).html +LIMIT ?= 10 + +PROVIDER ?= +API_BASE ?= +MODEL ?= +TEMP ?= + +LLM_FLAGS := +ifneq ($(strip $(PROVIDER)),) + LLM_FLAGS += --llm-provider $(PROVIDER) +endif +ifneq ($(strip $(API_BASE)),) + LLM_FLAGS += --llm-api-base $(API_BASE) +endif +ifneq ($(strip $(MODEL)),) + LLM_FLAGS += --llm-model $(MODEL) +endif +ifneq ($(strip $(TEMP)),) + LLM_FLAGS += --temperature $(TEMP) +endif + +.PHONY: init ingest stubs build draft build-db draft-db sync-db clean + +init: + python scripts/db_cli.py init + +ingest: + python scripts/ingest_list.py --list inbox.txt + +stubs: + python scripts/ingest_list.py --list inbox.txt --fetch --stubs --date $(DATE) --sleep 0.5 + +build: + python build.py $(ENTRIES_DIR) --out $(OUT) $(LLM_FLAGS) + +draft: + python build.py $(ENTRIES_DIR) --out $(OUT) --publish $(LLM_FLAGS) + +build-db: + python scripts/build_from_db.py --limit $(LIMIT) --out $(OUT) $(LLM_FLAGS) + +draft-db: + python scripts/build_from_db.py --limit $(LIMIT) --out $(OUT) --publish $(LLM_FLAGS) + +sync-db: + python scripts/sync_entries_to_db.py --dir $(ENTRIES_DIR) + +clean: + rm -rf dist/* + + +# Dry run: generate summaries and print HTML to stdout (no file write, no publish) +dry-run: + python build.py $(ENTRIES_DIR) --dry-run $(LLM_FLAGS) diff --git a/README.md b/README.md new file mode 100644 index 0000000..d532b8f --- /dev/null +++ b/README.md @@ -0,0 +1,56 @@ +# Offline Newsletter Builder (Archaeology/History) + +Local-first toolchain to turn your article links (or pasted texts) into a Friday Ghost draft with consistent summaries, plus a SQLite database for memory and related-article suggestions. + +## Highlights +- Paste URLs into `inbox.txt` → run importer → get `.md` stubs **and** a growing SQLite DB. +- One command to build the weekly newsletter (HTML) and optionally create a **Ghost draft**. +- Consistent tone via `templates/prompt.txt` (+ your `templates/style_bank.md` samples). +- "Memory" via `data/newsletter.db` (sources, summaries, embeddings, FTS), used to auto-suggest **related** items. + +## Quick start +```bash +python -m venv .venv && source .venv/bin/activate +pip install -r requirements.txt + +# Configure secrets +cp .env.example .env +# Edit .env to set your keys (or point to local OpenWebUI/Ollama) + +# Create DB schema +python scripts/db_cli.py init + +# Ingest URLs (from inbox.txt) +python scripts/ingest_list.py --list inbox.txt --fetch --stubs --date 2025-09-19 --sleep 0.5 + +# Build newsletter from stubs (HTML + optional Ghost draft) +python build.py entries/2025-09-19 --out dist/2025-09-19.html --publish + +# (Optional) Build directly from DB (Top-N sources) +python scripts/build_from_db.py --limit 10 --out dist/2025-09-19.html --publish +``` + +## Sample Run (Pretend Output) + +```bash +$ make init +Initialized schema in data/newsletter.db + +$ make stubs DATE=2025-09-19 +Stub: entries/2025-09-19/bbc-welsh-rainforests.md +Stub: entries/2025-09-19/nature-bone-discovery.md +Ingested: 2 URLs into data/newsletter.db + +$ make build DATE=2025-09-19 +[build] Using LLM provider=openwebui model=qwen2.5-7b-instruct +[build] Generating summary for: Velšské deštné pralesy ... +[build] -> Summary written back to DB (id=5, embedding stored) +[build] Generating summary for: Kosterní nález u Nisy ... +[build] -> Summary written back to DB (id=6, embedding stored) +Saved: dist/2025-09-19.html + +$ make draft DATE=2025-09-19 +Draft: https://your-ghost-site.ghost.io/p/objevy-tydne-2025-09-19-draft +``` + +You can open the generated HTML under `dist/2025-09-19.html` in your browser to review before publishing. diff --git a/build.py b/build.py new file mode 100644 index 0000000..6c94a35 --- /dev/null +++ b/build.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python3 +import os, sys, glob, time, json, html, argparse, pathlib, textwrap, re +from datetime import date +import yaml, requests, jwt +from jinja2 import Template +from dotenv import load_dotenv + +load_dotenv() + +ROOT = pathlib.Path(__file__).parent +TEMPLATES = ROOT / "templates" + +def read_file(p): return pathlib.Path(p).read_text(encoding="utf-8") + +def load_config(): + cfg = yaml.safe_load(read_file(ROOT / "config.yaml")) + cfg["date"] = date.today().isoformat() + return cfg + +def parse_front_matter(text): + m = re.match(r"^---\n(.*?)\n---\n(.*)$", text, flags=re.S|re.M) + if not m: + return {}, text.strip() + import yaml as _yaml + fm = _yaml.safe_load(m.group(1)) or {} + body = m.group(2).strip() + return fm, body + +# ---- LLM config / client ---- +from dataclasses import dataclass + +@dataclass +class LLMConfig: + provider: str + api_base: str + model: str + api_key: str | None + temperature: float + top_p: float + presence_penalty: float + frequency_penalty: float + timeout_seconds: int + max_retries: int + +def resolve_llm_config(cfg: dict, args) -> LLMConfig: + llm_cfg = cfg.get("llm", {}) if cfg else {} + + def pick(cli_val, env_key, cfg_key, default=None): + if cli_val is not None: + return cli_val + if env_key and os.getenv(env_key): + return os.getenv(env_key) + return llm_cfg.get(cfg_key, default) + + provider = pick(getattr(args, "llm_provider", None), "LLM_PROVIDER", "provider", "openwebui") + api_base = pick(getattr(args, "llm_api_base", None), "LLM_API_BASE", "api_base", + "http://localhost:3000" if provider=="openwebui" else + "http://localhost:11434" if provider=="ollama" else + "https://api.openai.com") + model = pick(getattr(args, "llm_model", None), "LLM_MODEL", "model", + "qwen2.5-7b-instruct" if provider=="openwebui" else + "llama3.1:8b-instruct" if provider=="ollama" else + "gpt-4o-mini") + api_key = os.getenv("LLM_API_KEY") or (os.getenv("OPENAI_API_KEY") if provider=="openai" else None) + + temperature = float(pick(getattr(args, "temperature", None), "LLM_TEMPERATURE", "temperature", 0.2)) + top_p = float(pick(getattr(args, "top_p", None), "LLM_TOP_P", "top_p", 1.0)) + presence_penalty = float(pick(getattr(args, "presence_penalty", None), "LLM_PRESENCE_PENALTY", "presence_penalty", 0.0)) + frequency_penalty = float(pick(getattr(args, "frequency_penalty", None), "LLM_FREQUENCY_PENALTY", "frequency_penalty", 0.0)) + timeout_seconds = int(pick(getattr(args, "timeout_seconds", None), "LLM_TIMEOUT_SECONDS", "timeout_seconds", 120)) + max_retries = int(pick(getattr(args, "max_retries", None), "LLM_MAX_RETRIES", "max_retries", 2)) + + return LLMConfig( + provider=provider, api_base=api_base, model=model, api_key=api_key, + temperature=temperature, top_p=top_p, + presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, + timeout_seconds=timeout_seconds, max_retries=max_retries + ) + +def chat_completion_llm(messages, llm: LLMConfig): + if llm.provider == "openwebui": + url = f"{llm.api_base.rstrip('/')}/api/chat/completions" + headers = {"Content-Type":"application/json"} + if llm.api_key: + headers["Authorization"] = f"Bearer {llm.api_key}" + elif llm.provider == "ollama": + url = f"{llm.api_base.rstrip('/')}/v1/chat/completions" + headers = {"Content-Type":"application/json"} + if llm.api_key: + headers["Authorization"] = f"Bearer {llm.api_key}" + else: + url = f"{llm.api_base.rstrip('/')}/v1/chat/completions" + headers = {"Content-Type":"application/json"} + if llm.api_key: + headers["Authorization"] = f"Bearer {llm.api_key}" + + payload = { + "model": llm.model, + "messages": messages, + "temperature": llm.temperature, + "top_p": llm.top_p, + "presence_penalty": llm.presence_penalty, + "frequency_penalty": llm.frequency_penalty, + "stream": False + } + + attempt = 0 + last_err = None + while attempt <= llm.max_retries: + try: + r = requests.post(url, headers=headers, json=payload, timeout=llm.timeout_seconds) + r.raise_for_status() + data = r.json() + return data["choices"][0]["message"]["content"] + except Exception as e: + last_err = e + attempt += 1 + if attempt > llm.max_retries: + break + time.sleep(min(2**attempt, 8)) + raise RuntimeError(f"LLM request failed after {llm.max_retries} retries: {last_err}") + +def call_llm_via_messages(prompt: str, llm: LLMConfig) -> str: + return chat_completion_llm([{"role":"user","content": prompt}], llm) + +# ---- Ghost ---- +def ghost_jwt(key: str) -> str: + key_id, secret = key.split(':') + iat = int(time.time()) + header = {"alg": "HS256", "kid": key_id, "typ": "JWT"} + payload = {"iat": iat, "exp": iat + 5 * 60, "aud": "/admin/"} + return jwt.encode(payload, bytes.fromhex(secret), algorithm='HS256', headers=header) + +def create_ghost_draft(ghost_url, ghost_key, html_content, title, tags): + token = ghost_jwt(ghost_key) + payload = { "posts": [{ + "title": title, "html": html_content, "status": "draft", + "tags": [{"name": t} for t in tags] + }]} + r = requests.post( + f"{ghost_url}/posts/", + headers={"Authorization": f"Ghost {token}", "Content-Type": "application/json"}, + data=json.dumps(payload), timeout=60 + ) + r.raise_for_status() + return r.json()["posts"][0]["url"] + +# ---- Memory/embeddings ---- +from db import connect as db_connect, topk_similar +from emb import embed_text + +def build_related_hint_auto(title, body, llm_cfg, cfg_db): + api_base = os.getenv("EMB_API_BASE", llm_cfg.api_base) + api_key = os.getenv("EMB_API_KEY", llm_cfg.api_key) + model = os.getenv("EMB_MODEL", cfg_db.get("embed_model", "text-embedding-3-small")) + qtext = (title + "\n\n" + body)[:5000] + try: + vec = embed_text(qtext, api_base, api_key, model) + except Exception: + return "—" + con = db_connect(cfg_db["path"]) + hits = topk_similar(con, model=model, query_vec=vec, + ref_table="summaries", + k=cfg_db.get("related_top_k",3), + min_sim=cfg_db.get("min_similarity",0.78)) + if not hits: + return "—" + lines = [] + for sid, t, s, nd in hits: + lines.append(f"- {nd or 'dříve'}: {t}") + return "O podobném tématu jsme psali:\n" + "\n".join(lines) + "\nZmiň jednou větou souvislost." + +def main(): + ap = argparse.ArgumentParser(description="Offline-first generator + Ghost draft") + ap.add_argument("entries_dir", help="entries/YYYY-MM-DD directory") + ap.add_argument("--out", help="Output HTML path, e.g. dist/2025-09-19.html") + ap.add_argument("--dry-run", action="store_true") + ap.add_argument("--publish", action="store_true") + # LLM overrides + ap.add_argument("--llm-provider") + ap.add_argument("--llm-api-base") + ap.add_argument("--llm-model") + ap.add_argument("--temperature", type=float) + ap.add_argument("--top-p", type=float) + ap.add_argument("--presence-penalty", type=float) + ap.add_argument("--frequency-penalty", type=float) + ap.add_argument("--timeout-seconds", type=int) + ap.add_argument("--max-retries", type=int) + args = ap.parse_args() + + cfg = load_config() + llm = resolve_llm_config(cfg, args) + + item_tpl = read_file(TEMPLATES / "item.html.j2") + news_tpl = read_file(TEMPLATES / "newsletter.html.j2") + prompt_template = read_file(TEMPLATES / "prompt.txt") + style_examples = read_file(TEMPLATES / "style_bank.md").strip() + prompt_template = prompt_template.replace("{style_examples}", style_examples) + + paths = sorted(glob.glob(os.path.join(args.entries_dir, "*.md"))) + blocks = [] + for p in paths: + fm_text = pathlib.Path(p).read_text(encoding="utf-8") + fm, body = parse_front_matter(fm_text) + if fm.get("status","todo") == "skip": + continue + title = fm.get("title") or pathlib.Path(p).stem.replace("-"," ").title() + source_name = fm.get("source_name","Zdroj neuveden") + related_hint = build_related_hint_auto(title, body, llm, cfg.get("db",{})) + prompt = (prompt_template + .replace("{title}", title) + .replace("{body}", body) + .replace("{source_name}", source_name) + .replace("{related_hint}", related_hint)) + summary = call_llm_via_messages(prompt, llm) + block_html = Template(item_tpl).render(title=title, summary=summary) + blocks.append(block_html) + + newsletter_title = Template(cfg["newsletter_title"]).render(date=cfg["date"]) + newsletter_subtitle = cfg.get("newsletter_subtitle","") + html_out = Template(news_tpl).render( + newsletter_title=newsletter_title, + newsletter_subtitle=newsletter_subtitle, + blocks=blocks + ) + + if args.out: + outp = pathlib.Path(args.out) + outp.parent.mkdir(parents=True, exist_ok=True) + outp.write_text(html_out, encoding="utf-8") + print(f"Saved: {outp}") + + if args.publish: + ghost_url = os.getenv("GHOST_ADMIN_API_URL") + ghost_key = os.getenv("GHOST_ADMIN_API_KEY") + if not (ghost_url and ghost_key): + print("Missing GHOST_ADMIN_API_URL or GHOST_ADMIN_API_KEY in .env", file=sys.stderr) + sys.exit(2) + url = create_ghost_draft(ghost_url, ghost_key, html_out, newsletter_title, cfg.get("default_tags",[])) + print("Draft:", url) + + if not (args.out or args.publish): + print(html_out) + +if __name__ == "__main__": + main() diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..ab02050 --- /dev/null +++ b/config.yaml @@ -0,0 +1,22 @@ +newsletter_title: "Objevy týdne – {{ date }}" +newsletter_subtitle: "Archeologie, historie a příbuzné vědy" +default_tags: ["Newsletter","Archeologie"] +model: "gpt-4o-mini" +temperature: 0.2 + +llm: + provider: "openwebui" # openwebui | ollama | openai + api_base: "http://localhost:3000" + model: "qwen2.5-7b-instruct" + temperature: 0.2 + top_p: 1.0 + presence_penalty: 0.0 + frequency_penalty: 0.0 + timeout_seconds: 120 + max_retries: 2 + +db: + path: "data/newsletter.db" + embed_model: "text-embedding-3-small" + related_top_k: 3 + min_similarity: 0.78 diff --git a/db.py b/db.py new file mode 100644 index 0000000..8a79a12 --- /dev/null +++ b/db.py @@ -0,0 +1,91 @@ +import sqlite3 +from pathlib import Path +import math +import struct + +def connect(db_path: str) -> sqlite3.Connection: + Path(db_path).parent.mkdir(parents=True, exist_ok=True) + con = sqlite3.connect(db_path) + con.execute("PRAGMA foreign_keys = ON;") + return con + +def init_db(con: sqlite3.Connection, schema_path: str = "scripts/schema.sql"): + schema = Path(schema_path).read_text(encoding="utf-8") + con.executescript(schema) + con.commit() + +def pack_vec(vec): + return struct.pack("<%sf" % len(vec), *vec) + +def unpack_vec(blob): + fcount = len(blob)//4 + return list(struct.unpack("<%sf" % fcount, blob)) + +def cosine(a, b): + na = math.sqrt(sum(x*x for x in a)); nb = math.sqrt(sum(x*x for x in b)) + if na == 0 or nb == 0: return 0.0 + return sum(x*y for x,y in zip(a,b)) / (na*nb) + +def upsert_source(con, url=None, title=None, publisher=None, date_published=None, content=None, tags=None): + con.execute( + """INSERT INTO sources(url, title, publisher, date_published, content) + VALUES(?,?,?,?,?) + ON CONFLICT(url) DO UPDATE SET + title=COALESCE(excluded.title, title), + publisher=COALESCE(excluded.publisher, publisher), + date_published=COALESCE(excluded.date_published, date_published), + content=COALESCE(excluded.content, content) + """, (url, title, publisher, date_published, content) + ) + sid = con.execute("SELECT id FROM sources WHERE url=?", (url,)).fetchone()[0] + if tags: + for t in tags: + con.execute("INSERT OR IGNORE INTO tags(name) VALUES(?)", (t,)) + tid = con.execute("SELECT id FROM tags WHERE name=?", (t,)).fetchone()[0] + con.execute("INSERT OR IGNORE INTO source_tags(source_id, tag_id) VALUES(?,?)", (sid, tid)) + con.commit() + return sid + +def insert_summary(con, source_id, title, summary, newsletter_date=None, tone_version=None): + cur = con.cursor() + cur.execute( + """INSERT INTO summaries(source_id, title, summary, newsletter_date, tone_version) + VALUES (?,?,?,?,?)""", + (source_id, title, summary, newsletter_date, tone_version) + ) + con.commit() + return cur.lastrowid + +def upsert_embedding(con, ref_table, ref_id, model, vec): + dim = len(vec) + blob = pack_vec(vec) + con.execute( + """INSERT INTO embeddings(ref_table, ref_id, model, dim, vec) + VALUES (?,?,?,?,?) + ON CONFLICT(ref_table, ref_id, model) DO UPDATE SET vec=excluded.vec, dim=excluded.dim""", + (ref_table, ref_id, model, dim, blob) + ) + con.commit() + +def topk_similar(con, model, query_vec, ref_table="summaries", k=3, min_sim=0.78): + rows = con.execute( + "SELECT ref_id, dim, vec FROM embeddings WHERE ref_table=? AND model=?;", + (ref_table, model) + ).fetchall() + scored = [] + for ref_id, dim, blob in rows: + vec = unpack_vec(blob) + if len(vec) != len(query_vec): + continue + sim = cosine(query_vec, vec) + if sim >= min_sim: + scored.append((sim, ref_id)) + scored.sort(reverse=True) + ref_ids = [rid for _, rid in scored[:k]] + if not ref_ids: return [] + if ref_table == "summaries": + q = "SELECT id, title, summary, newsletter_date FROM summaries WHERE id IN (%s)" % ",".join("?"*len(ref_ids)) + return con.execute(q, ref_ids).fetchall() + else: + q = "SELECT id, title, url, date_published FROM sources WHERE id IN (%s)" % ",".join("?"*len(ref_ids)) + return con.execute(q, ref_ids).fetchall() diff --git a/emb.py b/emb.py new file mode 100644 index 0000000..b3c65c0 --- /dev/null +++ b/emb.py @@ -0,0 +1,10 @@ +import os, requests + +def embed_text(text: str, api_base: str, api_key: str|None, model: str) -> list[float]: + url = f"{api_base.rstrip('/')}/v1/embeddings" + headers = {"Content-Type":"application/json"} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + r = requests.post(url, headers=headers, json={"model": model, "input": text}, timeout=60) + r.raise_for_status() + return r.json()["data"][0]["embedding"] diff --git a/entries/2025-09-19/sample-welsh-rainforests.md b/entries/2025-09-19/sample-welsh-rainforests.md new file mode 100644 index 0000000..115df8f --- /dev/null +++ b/entries/2025-09-19/sample-welsh-rainforests.md @@ -0,0 +1,11 @@ +--- +title: "Velšské deštné pralesy" +source_name: "BBC / Wikipedie" +url: "https://www.bbc.com/news/science-environment-00000000" +tags: ["folklor","krajina"] +status: "todo" +--- + +Deštné pralesy má asi většina z nás spojené spíš s Amazonií nebo jihovýchodní Asií, ale ony existují nejen v tropickém klimatickém pásu, ale i v našem, mírném. Jako deštný se označuje les s více než 200 cm srážek ročně. A právě takový les se kdysi rozkládal od Walesu až po Skotsko. + +Dnes z něj zbývá necelých 10 % rozlohy, ale je o něj pečováno velmi pečlivě. Tento prales je také zdrojem celé řady místních mýtů. V tom asi nejznámějším, Mabinogi, se ve čtvrté větvi stane roztržka mezi Pryderim a Gwydionem... diff --git a/inbox.txt b/inbox.txt new file mode 100644 index 0000000..454b82f --- /dev/null +++ b/inbox.txt @@ -0,0 +1,4 @@ +# Paste URLs here (one per line) +https://www.bbc.com/news/science-environment-00000000 +https://www.nature.com/articles/xxxxxxxx +https://antiquity.ac.uk/article/yyyyyyyy diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..837d836 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,12 @@ +pyyaml +jinja2 +python-dotenv +requests +markdown +pyjwt +beautifulsoup4 +readability-lxml +trafilatura +tldextract +python-slugify +numpy diff --git a/scripts/build_from_db.py b/scripts/build_from_db.py new file mode 100644 index 0000000..5db020b --- /dev/null +++ b/scripts/build_from_db.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +import os, sys, time, json, argparse, pathlib +import yaml, requests, jwt +from jinja2 import Template +from dotenv import load_dotenv +from datetime import date + +from db import connect as db_connect, insert_summary, upsert_embedding +from emb import embed_text + +load_dotenv() +ROOT = pathlib.Path(__file__).parent +REPO = ROOT.parent +TEMPLATES = REPO / "templates" + +def read_file(p): return pathlib.Path(p).read_text(encoding="utf-8") + +def load_config(): + cfg = yaml.safe_load(read_file(REPO / "config.yaml")) + cfg["date"] = date.today().isoformat() + return cfg + +from dataclasses import dataclass +import requests + +@dataclass +class LLMConfig: + provider: str + api_base: str + model: str + api_key: str | None + temperature: float + top_p: float + presence_penalty: float + frequency_penalty: float + timeout_seconds: int + max_retries: int + +def resolve_llm_config(cfg: dict, args) -> LLMConfig: + llm_cfg = cfg.get("llm", {}) if cfg else {} + def pick(cli_val, env_key, cfg_key, default=None): + if cli_val is not None: + return cli_val + if env_key and os.getenv(env_key): + return os.getenv(env_key) + return llm_cfg.get(cfg_key, default) + provider = pick(getattr(args, "llm_provider", None), "LLM_PROVIDER", "provider", "openwebui") + api_base = pick(getattr(args, "llm_api_base", None), "LLM_API_BASE", "api_base", + "http://localhost:3000" if provider=="openwebui" else + "http://localhost:11434" if provider=="ollama" else + "https://api.openai.com") + model = pick(getattr(args, "llm_model", None), "LLM_MODEL", "model", + "qwen2.5-7b-instruct" if provider=="openwebui" else + "llama3.1:8b-instruct" if provider=="ollama" else + "gpt-4o-mini") + api_key = os.getenv("LLM_API_KEY") or (os.getenv("OPENAI_API_KEY") if provider=="openai" else None) + temperature = float(pick(getattr(args, "temperature", None), "LLM_TEMPERATURE", "temperature", 0.2)) + top_p = float(pick(getattr(args, "top_p", None), "LLM_TOP_P", "top_p", 1.0)) + presence_penalty = float(pick(getattr(args, "presence_penalty", None), "LLM_PRESENCE_PENALTY", "presence_penalty", 0.0)) + frequency_penalty = float(pick(getattr(args, "frequency_penalty", None), "LLM_FREQUENCY_PENALTY", "frequency_penalty", 0.0)) + timeout_seconds = int(pick(getattr(args, "timeout_seconds", None), "LLM_TIMEOUT_SECONDS", "timeout_seconds", 120)) + max_retries = int(pick(getattr(args, "max_retries", None), "LLM_MAX_RETRIES", "max_retries", 2)) + return LLMConfig(provider, api_base, model, api_key, temperature, top_p, presence_penalty, frequency_penalty, timeout_seconds, max_retries) + +def chat_completion_llm(messages, llm: LLMConfig): + if llm.provider == "openwebui": + url = f"{llm.api_base.rstrip('/')}/api/chat/completions" + elif llm.provider == "ollama": + url = f"{llm.api_base.rstrip('/')}/v1/chat/completions" + else: + url = f"{llm.api_base.rstrip('/')}/v1/chat/completions" + headers = {"Content-Type":"application/json"} + if llm.api_key: headers["Authorization"] = f"Bearer {llm.api_key}" + payload = {"model": llm.model, "messages": messages, "temperature": llm.temperature, "top_p": llm.top_p, + "presence_penalty": llm.presence_penalty, "frequency_penalty": llm.frequency_penalty, "stream": False} + r = requests.post(url, headers=headers, json=payload, timeout=llm.timeout_seconds) + r.raise_for_status() + return r.json()["choices"][0]["message"]["content"] + +def main(): + ap = argparse.ArgumentParser(description="Build directly from DB (Top-N sources)") + ap.add_argument("--db", default="data/newsletter.db") + ap.add_argument("--limit", type=int, default=10) + ap.add_argument("--out", required=True) + ap.add_argument("--publish", action="store_true") + # LLM overrides + ap.add_argument("--llm-provider"); ap.add_argument("--llm-api-base") + ap.add_argument("--llm-model"); ap.add_argument("--temperature", type=float) + ap.add_argument("--top-p", type=float); ap.add_argument("--presence-penalty", type=float) + ap.add_argument("--frequency-penalty", type=float); ap.add_argument("--timeout-seconds", type=int) + ap.add_argument("--max-retries", type=int) + args = ap.parse_args() + + cfg = load_config() + llm = resolve_llm_config(cfg, args) + con = db_connect(args.db) + + rows = con.execute("SELECT id, url, title, publisher FROM sources ORDER BY id DESC LIMIT ?", (args.limit,)).fetchall() + + prompt_template = (TEMPLATES / "prompt.txt").read_text(encoding="utf-8") + style_examples = (TEMPLATES / "style_bank.md").read_text(encoding="utf-8").strip() + prompt_template = prompt_template.replace("{style_examples}", style_examples) + + item_tpl = (TEMPLATES / "item.html.j2").read_text(encoding="utf-8") + news_tpl = (TEMPLATES / "newsletter.html.j2").read_text(encoding="utf-8") + + blocks = [] + for sid, url, title, publisher in rows: + body = (con.execute("SELECT content FROM sources WHERE id=?", (sid,)).fetchone()[0]) or "" + related_hint = "—" + prompt = (prompt_template + .replace("{title}", title or url) + .replace("{body}", body) + .replace("{source_name}", publisher or "Zdroj neuveden") + .replace("{related_hint}", related_hint)) + summary = chat_completion_llm([{"role":"user","content": prompt}], llm) + + sum_id = insert_summary(con, sid, title or url, summary, newsletter_date=cfg["date"], tone_version="v1") + try: + vec = embed_text(summary, os.getenv("EMB_API_BASE", cfg["llm"]["api_base"]), os.getenv("EMB_API_KEY", os.getenv("LLM_API_KEY")), os.getenv("EMB_MODEL", cfg["db"]["embed_model"])) + upsert_embedding(con, "summaries", sum_id, os.getenv("EMB_MODEL", cfg["db"]["embed_model"]), vec) + except Exception: + pass + + blocks.append(Template(item_tpl).render(title=(title or url), summary=summary)) + + newsletter_title = Template(cfg["newsletter_title"]).render(date=cfg["date"]) + newsletter_subtitle = cfg.get("newsletter_subtitle","") + html_out = Template(news_tpl).render(newsletter_title=newsletter_title, newsletter_subtitle=newsletter_subtitle, blocks=blocks) + + outp = pathlib.Path(args.out); outp.parent.mkdir(parents=True, exist_ok=True); outp.write_text(html_out, encoding="utf-8") + print(f"Saved: {outp}") + + if args.publish: + ghost_url = os.getenv("GHOST_ADMIN_API_URL") + ghost_key = os.getenv("GHOST_ADMIN_API_KEY") + if ghost_url and ghost_key: + def ghost_jwt(key: str) -> str: + key_id, secret = key.split(':') + iat = int(time.time()) + header = {"alg": "HS256", "kid": key_id, "typ": "JWT"} + payload = {"iat": iat, "exp": iat + 5 * 60, "aud": "/admin/"} + import jwt + return jwt.encode(payload, bytes.fromhex(secret), algorithm='HS256', headers=header) + token = ghost_jwt(ghost_key) + payload = {"posts":[{"title": newsletter_title, "html": html_out, "status": "draft", "tags": [{"name": t} for t in cfg.get("default_tags",[])]}]} + r = requests.post(f"{ghost_url}/posts/", headers={"Authorization": f"Ghost {token}", "Content-Type": "application/json"}, data=json.dumps(payload), timeout=60) + r.raise_for_status() + print("Draft:", r.json()["posts"][0]["url"]) + else: + print("Missing Ghost creds; skipped publish.") + +if __name__ == "__main__": + main() diff --git a/scripts/db_cli.py b/scripts/db_cli.py new file mode 100644 index 0000000..8d5f2d9 --- /dev/null +++ b/scripts/db_cli.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +import argparse +from db import connect, init_db + +def main(): + ap = argparse.ArgumentParser(description="DB CLI") + ap.add_argument("cmd", choices=["init"]) + ap.add_argument("--db", default="data/newsletter.db") + ap.add_argument("--schema", default="scripts/schema.sql") + args = ap.parse_args() + + con = connect(args.db) + if args.cmd == "init": + init_db(con, args.schema) + print(f"Initialized schema in {args.db}") + +if __name__ == "__main__": + main() diff --git a/scripts/ingest_list.py b/scripts/ingest_list.py new file mode 100644 index 0000000..12c06bd --- /dev/null +++ b/scripts/ingest_list.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +import argparse, sqlite3, re, sys, time +from pathlib import Path +from urllib.parse import urlparse, urlunparse +import requests, tldextract +from bs4 import BeautifulSoup +from readability import Document as ReadabilityDoc +import trafilatura +from slugify import slugify +from datetime import date + +def connect(db_path): + con = sqlite3.connect(db_path) + con.execute("PRAGMA foreign_keys=ON;") + return con + +def upsert_source(con, url, title=None, publisher=None, date_published=None, content=None, tags=None): + con.execute( + """INSERT INTO sources(url, title, publisher, date_published, content) + VALUES (?,?,?,?,?) + ON CONFLICT(url) DO UPDATE SET + title=COALESCE(excluded.title, title), + publisher=COALESCE(excluded.publisher, publisher), + date_published=COALESCE(excluded.date_published, date_published), + content=COALESCE(excluded.content, content) + """, (url, title, publisher, date_published, content) + ) + sid = con.execute("SELECT id FROM sources WHERE url=?", (url,)).fetchone()[0] + con.commit() + return sid + +def normalize_url(u: str) -> str: + p = urlparse(u.strip()) + if not p.scheme: + p = p._replace(scheme="https") + query = re.sub(r'(&|\?)?(utm_[^=&]+|fbclid|gclid)=[^&]*', '', p.query) + if query.startswith('&'): query = query[1:] + return urlunparse((p.scheme, p.netloc, p.path, p.params, query, "")) + +def domain(url: str) -> str: + ext = tldextract.extract(url) + return ".".join(part for part in [ext.domain, ext.suffix] if part) + +def fetch_readable(url: str, timeout=20): + try: + r = requests.get(url, timeout=timeout, headers={"User-Agent":"Mozilla/5.0"}) + r.raise_for_status() + html = r.text + except Exception: + return "", "" + try: + txt = trafilatura.extract(html, include_comments=False, include_tables=False, favor_recall=True) + if txt: + soup = BeautifulSoup(html, "html.parser") + t = (soup.title.string.strip() if soup.title and soup.title.string else "") + return t, txt.strip() + except Exception: + pass + try: + doc = ReadabilityDoc(html) + content_html = doc.summary() + soup = BeautifulSoup(content_html, "html.parser") + txt = soup.get_text(separator="\n").strip() + return (doc.short_title() or "").strip(), txt + except Exception: + return "", "" + +def write_stub(entries_dir: Path, title: str, url: str, source_name: str|None, text: str|None): + entries_dir.mkdir(parents=True, exist_ok=True) + from datetime import datetime + slug = slugify(title or domain(url) or "clanek")[:60] or "clanek" + path = entries_dir / f"{slug}.md" + body = text.strip() if text else "" + fm = f'''--- +title: "{(title or "").replace('"',"\"")}" +source_name: "{(source_name or domain(url) or "").replace('"',"\"")}" +url: "{url}" +tags: [] +status: "todo" +--- + +{body} +''' + path.write_text(fm, encoding="utf-8") + return path + +def read_lines(source_path: str|None): + if source_path: + return Path(source_path).read_text(encoding="utf-8", errors="ignore").splitlines() + return sys.stdin.read().splitlines() + +def main(): + ap = argparse.ArgumentParser(description="Ingest list of URLs into SQLite and optional entry stubs") + ap.add_argument("--db", default="data/newsletter.db") + ap.add_argument("--list", help="Text file with URLs (one per line). If omitted, read from STDIN.") + ap.add_argument("--fetch", action="store_true", help="Fetch & extract readable text") + ap.add_argument("--sleep", type=float, default=0.0) + ap.add_argument("--stubs", action="store_true") + ap.add_argument("--date", default=date.today().isoformat()) + args = ap.parse_args() + + con = connect(args.db) + lines = read_lines(args.list) + + urls = [] + for ln in lines: + ln = ln.strip() + if not ln or ln.startswith("#"): + continue + if re.match(r"^\w+://", ln) or re.match(r"^[\w\.-]+\.[a-z]{2,}(/|$)", ln): + urls.append(normalize_url(ln)) + seen = set(); urls = [u for u in urls if not (u in seen or seen.add(u))] + + stubs_dir = Path(f"entries/{args.date}") if args.stubs else None + kept = 0 + for url in urls: + pub = domain(url) + title, text = ("","") + if args.fetch: + title, text = fetch_readable(url) + sid = upsert_source(con, url=url, title=(title or None), publisher=pub, content=(text or None)) + kept += 1 + if stubs_dir: + stub = write_stub(stubs_dir, title or url, url, pub, text) + print(f"Stub: {stub}") + if args.sleep: time.sleep(args.sleep) + print(f"Ingested: {kept} URLs into {args.db}") + +if __name__ == "__main__": + main() diff --git a/scripts/schema.sql b/scripts/schema.sql new file mode 100644 index 0000000..19c05e5 --- /dev/null +++ b/scripts/schema.sql @@ -0,0 +1,71 @@ +PRAGMA foreign_keys = ON; + +CREATE TABLE IF NOT EXISTS sources ( + id INTEGER PRIMARY KEY, + url TEXT UNIQUE, + title TEXT, + publisher TEXT, + date_published TEXT, + content TEXT +); + +CREATE TABLE IF NOT EXISTS summaries ( + id INTEGER PRIMARY KEY, + source_id INTEGER REFERENCES sources(id) ON DELETE SET NULL, + title TEXT NOT NULL, + summary TEXT NOT NULL, + newsletter_date TEXT, + tone_version TEXT, + created_at TEXT DEFAULT (datetime('now')) +); + +CREATE TABLE IF NOT EXISTS tags ( + id INTEGER PRIMARY KEY, + name TEXT UNIQUE +); +CREATE TABLE IF NOT EXISTS source_tags ( + source_id INTEGER REFERENCES sources(id) ON DELETE CASCADE, + tag_id INTEGER REFERENCES tags(id) ON DELETE CASCADE, + PRIMARY KEY(source_id, tag_id) +); + +CREATE TABLE IF NOT EXISTS embeddings ( + id INTEGER PRIMARY KEY, + ref_table TEXT NOT NULL CHECK(ref_table IN ('sources','summaries')), + ref_id INTEGER NOT NULL, + model TEXT NOT NULL, + dim INTEGER NOT NULL, + vec BLOB NOT NULL, + UNIQUE(ref_table, ref_id, model) +); + +CREATE VIRTUAL TABLE IF NOT EXISTS sources_fts USING fts5( + title, content, content='sources', content_rowid='id' +); +CREATE VIRTUAL TABLE IF NOT EXISTS summaries_fts USING fts5( + title, summary, content='summaries', content_rowid='id' +); + +CREATE TRIGGER IF NOT EXISTS sources_ai AFTER INSERT ON sources BEGIN + INSERT INTO sources_fts(rowid, title, content) VALUES (new.id, new.title, new.content); +END; +CREATE TRIGGER IF NOT EXISTS sources_au AFTER UPDATE ON sources BEGIN + INSERT INTO sources_fts(sources_fts, rowid, title, content) + VALUES('delete', old.id, old.title, old.content); + INSERT INTO sources_fts(rowid, title, content) VALUES (new.id, new.title, new.content); +END; +CREATE TRIGGER IF NOT EXISTS sources_ad AFTER DELETE ON sources BEGIN + INSERT INTO sources_fts(sources_fts, rowid, title, content) VALUES('delete', old.id, old.title, old.content); +END; + +CREATE TRIGGER IF NOT EXISTS summaries_ai AFTER INSERT ON summaries BEGIN + INSERT INTO summaries_fts(rowid, title, summary) VALUES (new.id, new.title, new.summary); +END; +CREATE TRIGGER IF NOT EXISTS summaries_au AFTER UPDATE ON summaries BEGIN + INSERT INTO summaries_fts(summaries_fts, rowid, title, summary) + VALUES('delete', old.id, old.title, old.summary); + INSERT INTO summaries_fts(rowid, title, summary) VALUES (new.id, new.title, new.summary); +END; +CREATE TRIGGER IF NOT EXISTS summaries_ad AFTER DELETE ON summaries BEGIN + INSERT INTO summaries_fts(summaries_fts, rowid, title, summary) VALUES('delete', old.id, old.title, old.summary); +END; diff --git a/scripts/sync_entries_to_db.py b/scripts/sync_entries_to_db.py new file mode 100644 index 0000000..499fb9b --- /dev/null +++ b/scripts/sync_entries_to_db.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +import argparse, re +from pathlib import Path +import yaml +from db import connect, upsert_source + +def parse_front_matter(text: str): + m = re.match(r"^---\n(.*?)\n---\n(.*)$", text, flags=re.S|re.M) + if not m: + return {}, text.strip() + fm = yaml.safe_load(m.group(1)) or {} + body = m.group(2).strip() + return fm, body + +def main(): + ap = argparse.ArgumentParser(description="Sync entries/*.md into SQLite sources") + ap.add_argument("--db", default="data/newsletter.db") + ap.add_argument("--dir", required=True, help="entries/YYYY-MM-DD directory") + args = ap.parse_args() + + con = connect(args.db) + for p in Path(args.dir).glob("*.md"): + text = p.read_text(encoding="utf-8") + fm, body = parse_front_matter(text) + title = fm.get("title") or p.stem + url = fm.get("url") + publisher = fm.get("source_name") + upsert_source(con, + url=url, + title=title, + publisher=publisher, + date_published=None, + content=body + ) + print(f"Synced: {p.name}") + +if __name__ == "__main__": + main() diff --git a/templates/item.html.j2 b/templates/item.html.j2 new file mode 100644 index 0000000..07138c2 --- /dev/null +++ b/templates/item.html.j2 @@ -0,0 +1,2 @@ +

{{ title }}

+

{{ summary }}

diff --git a/templates/newsletter.html.j2 b/templates/newsletter.html.j2 new file mode 100644 index 0000000..9fe914f --- /dev/null +++ b/templates/newsletter.html.j2 @@ -0,0 +1,7 @@ +

{{ newsletter_title }}

+

{{ newsletter_subtitle }}

+
+{% for b in blocks %} +{{ b }} +{% if not loop.last %}
{% endif %} +{% endfor %} diff --git a/templates/prompt.txt b/templates/prompt.txt new file mode 100644 index 0000000..22c568f --- /dev/null +++ b/templates/prompt.txt @@ -0,0 +1,24 @@ +Jsi editor newsletteru o archeologii, historii a příbuzných vědách. Piš česky, srozumitelně pro laiky, věcně, bez žargonu; jednu větu můžeš udělat mírně poutavou, ale ne bulvární. Drž rozsah 3–4 věty. + +ŠABLONA OBSAHU: +1) Co je novinka/objev. +2) Proč je důležitý (dopad, změna pohledu, metoda). +3) Kontext nebo zajímavost (kultura, datace, souvislosti). +4) (volitelně) Jedna věta s odkazem na předchozí související text, pokud je k dispozici. +Na konec přidej: „Zdroj: {source_name}“. + +=== UKÁZKY STYLU === +{style_examples} + +=== VSTUPNÍ POLOŽKA === +TITULEK: {title} +TEXT: +{body} + +=== NÁVAZNOSTI (VOLITELNÉ) === +{related_hint} + +=== POKYNY === +- Drž 3–4 věty. +- Pokud jsou náležitosti neúplné, buď opatrný a nic si nedovymýšlej. +- Zakonči přesně: „Zdroj: {source_name}“. diff --git a/templates/style_bank.md b/templates/style_bank.md new file mode 100644 index 0000000..66ef12e --- /dev/null +++ b/templates/style_bank.md @@ -0,0 +1,2 @@ +- Velšské deštné pralesy: Velká Británie je domovem unikátních zbytků mírného deštného pralesa, který se kdysi rozkládal od Walesu až po Skotsko. Dnes přežívá jen asi 10 % původní plochy, ale probíhají programy jeho ochrany a obnovy. Pralesy se objevují i v mytologii, například ve čtvrté větvi velšského eposu Mabinogi, kde jsou spojeny s kouzelnými zvířaty. Kontrolovaná pastva, kterou je naši předci udržovali zdravé, se používá i dnes. +- Podmořské stavby lovců‑sběračů: Vědci objevili pod hladinou Baltu kilometr dlouhou kamennou linii, pravděpodobně prehistorickou loveckou strukturu. Nález naznačuje promyšlenou organizaci lovu i přizpůsobení se krajině, kterou později zaplavilo moře. Podobné konstrukce známe z jiných částí světa, ale v Evropě jsou vzácné.