First commit
.gitignore (vendored, Normal file, 21 lines)
@@ -0,0 +1,21 @@
# --- Python / environment ---
.venv/
__pycache__/
*.py[cod]
*.egg-info/
.DS_Store

# --- Secrets ---
.env

# --- Local database (optional) ---
# Comment out if you DO want to commit newsletter history!
data/*.db

# --- Build outputs ---
dist/
*.html

# --- Logs / temp ---
*.log
.cache/
Makefile (Normal file, 57 lines)
@@ -0,0 +1,57 @@
DATE ?= $(shell date +%F)
ENTRIES_DIR ?= entries/$(DATE)
OUT ?= dist/$(DATE).html
LIMIT ?= 10

PROVIDER ?=
API_BASE ?=
MODEL ?=
TEMP ?=

LLM_FLAGS :=
ifneq ($(strip $(PROVIDER)),)
LLM_FLAGS += --llm-provider $(PROVIDER)
endif
ifneq ($(strip $(API_BASE)),)
LLM_FLAGS += --llm-api-base $(API_BASE)
endif
ifneq ($(strip $(MODEL)),)
LLM_FLAGS += --llm-model $(MODEL)
endif
ifneq ($(strip $(TEMP)),)
LLM_FLAGS += --temperature $(TEMP)
endif

.PHONY: init ingest stubs build draft build-db draft-db sync-db clean dry-run

init:
	python scripts/db_cli.py init

ingest:
	python scripts/ingest_list.py --list inbox.txt

stubs:
	python scripts/ingest_list.py --list inbox.txt --fetch --stubs --date $(DATE) --sleep 0.5

build:
	python build.py $(ENTRIES_DIR) --out $(OUT) $(LLM_FLAGS)

draft:
	python build.py $(ENTRIES_DIR) --out $(OUT) --publish $(LLM_FLAGS)

build-db:
	python scripts/build_from_db.py --limit $(LIMIT) --out $(OUT) $(LLM_FLAGS)

draft-db:
	python scripts/build_from_db.py --limit $(LIMIT) --out $(OUT) --publish $(LLM_FLAGS)

sync-db:
	python scripts/sync_entries_to_db.py --dir $(ENTRIES_DIR)

clean:
	rm -rf dist/*

# Dry run: generate summaries and print HTML to stdout (no file write, no publish)
dry-run:
	python build.py $(ENTRIES_DIR) --dry-run $(LLM_FLAGS)
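Note: every variable above can be overridden per invocation, e.g. `make build DATE=2025-09-12 PROVIDER=ollama MODEL=llama3.1:8b-instruct TEMP=0.3`; non-empty values are expanded into the matching CLI flags through `LLM_FLAGS`.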
README.md (Normal file, 56 lines)
@@ -0,0 +1,56 @@
# Offline Newsletter Builder (Archaeology/History)

Local-first toolchain that turns your article links (or pasted texts) into a Friday Ghost draft with consistent summaries, plus a SQLite database for memory and related-article suggestions.

## Highlights

- Paste URLs into `inbox.txt` → run the importer → get `.md` stubs **and** a growing SQLite DB.
- One command builds the weekly newsletter (HTML) and optionally creates a **Ghost draft**.
- Consistent tone via `templates/prompt.txt` (+ your `templates/style_bank.md` samples).
- "Memory" via `data/newsletter.db` (sources, summaries, embeddings, FTS), used to auto-suggest **related** items.

## Quick start

```bash
python -m venv .venv && source .venv/bin/activate
pip install -r requirements.txt

# Configure secrets
cp .env.example .env
# Edit .env to set your keys (or point to a local OpenWebUI/Ollama instance)

# Create DB schema
python scripts/db_cli.py init

# Ingest URLs (from inbox.txt)
python scripts/ingest_list.py --list inbox.txt --fetch --stubs --date 2025-09-19 --sleep 0.5

# Build newsletter from stubs (HTML + optional Ghost draft)
python build.py entries/2025-09-19 --out dist/2025-09-19.html --publish

# (Optional) Build directly from DB (top-N sources)
python scripts/build_from_db.py --limit 10 --out dist/2025-09-19.html --publish
```

## Sample Run (Pretend Output)

```bash
$ make init
Initialized schema in data/newsletter.db

$ make stubs DATE=2025-09-19
Stub: entries/2025-09-19/bbc-welsh-rainforests.md
Stub: entries/2025-09-19/nature-bone-discovery.md
Ingested: 2 URLs into data/newsletter.db

$ make build DATE=2025-09-19
[build] Using LLM provider=openwebui model=qwen2.5-7b-instruct
[build] Generating summary for: Velšské deštné pralesy ...
[build] -> Summary written back to DB (id=5, embedding stored)
[build] Generating summary for: Kosterní nález u Nisy ...
[build] -> Summary written back to DB (id=6, embedding stored)
Saved: dist/2025-09-19.html

$ make draft DATE=2025-09-19
Draft: https://your-ghost-site.ghost.io/p/objevy-tydne-2025-09-19-draft
```

You can open the generated HTML under `dist/2025-09-19.html` in your browser to review before publishing.
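The referenced `.env.example` is not part of this commit. A quick way to check your environment against the variables the scripts actually read (a minimal sketch; the variable names come from `build.py`, `emb.py`, and the scripts, the grouping and comments are mine):

```python
# Hypothetical helper; variable names are taken from the sources in this commit.
import os

LLM_VARS   = ["LLM_PROVIDER", "LLM_API_BASE", "LLM_MODEL", "LLM_API_KEY",
              "OPENAI_API_KEY"]        # OPENAI_API_KEY is only a fallback for provider == "openai"
EMB_VARS   = ["EMB_API_BASE", "EMB_API_KEY", "EMB_MODEL"]   # optional, default to the LLM settings
GHOST_VARS = ["GHOST_ADMIN_API_URL", "GHOST_ADMIN_API_KEY"] # needed only for --publish

for name in LLM_VARS + EMB_VARS + GHOST_VARS:
    print(f"{name:20} {'set' if os.getenv(name) else '(unset)'}")
```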
build.py (Normal file, 246 lines)
@@ -0,0 +1,246 @@
#!/usr/bin/env python3
import os, sys, glob, time, json, argparse, pathlib, re
from datetime import date
import yaml, requests, jwt
from jinja2 import Template
from dotenv import load_dotenv

load_dotenv()

ROOT = pathlib.Path(__file__).parent
TEMPLATES = ROOT / "templates"

def read_file(p): return pathlib.Path(p).read_text(encoding="utf-8")

def load_config():
    cfg = yaml.safe_load(read_file(ROOT / "config.yaml"))
    cfg["date"] = date.today().isoformat()
    return cfg

def parse_front_matter(text):
    m = re.match(r"^---\n(.*?)\n---\n(.*)$", text, flags=re.S|re.M)
    if not m:
        return {}, text.strip()
    fm = yaml.safe_load(m.group(1)) or {}
    body = m.group(2).strip()
    return fm, body

# ---- LLM config / client ----
from dataclasses import dataclass

@dataclass
class LLMConfig:
    provider: str
    api_base: str
    model: str
    api_key: str | None
    temperature: float
    top_p: float
    presence_penalty: float
    frequency_penalty: float
    timeout_seconds: int
    max_retries: int

def resolve_llm_config(cfg: dict, args) -> LLMConfig:
    llm_cfg = cfg.get("llm", {}) if cfg else {}

    # Precedence: CLI flag > environment variable > config.yaml > built-in default.
    def pick(cli_val, env_key, cfg_key, default=None):
        if cli_val is not None:
            return cli_val
        if env_key and os.getenv(env_key):
            return os.getenv(env_key)
        return llm_cfg.get(cfg_key, default)

    provider = pick(getattr(args, "llm_provider", None), "LLM_PROVIDER", "provider", "openwebui")
    api_base = pick(getattr(args, "llm_api_base", None), "LLM_API_BASE", "api_base",
                    "http://localhost:3000" if provider=="openwebui" else
                    "http://localhost:11434" if provider=="ollama" else
                    "https://api.openai.com")
    model = pick(getattr(args, "llm_model", None), "LLM_MODEL", "model",
                 "qwen2.5-7b-instruct" if provider=="openwebui" else
                 "llama3.1:8b-instruct" if provider=="ollama" else
                 "gpt-4o-mini")
    api_key = os.getenv("LLM_API_KEY") or (os.getenv("OPENAI_API_KEY") if provider=="openai" else None)

    temperature = float(pick(getattr(args, "temperature", None), "LLM_TEMPERATURE", "temperature", 0.2))
    top_p = float(pick(getattr(args, "top_p", None), "LLM_TOP_P", "top_p", 1.0))
    presence_penalty = float(pick(getattr(args, "presence_penalty", None), "LLM_PRESENCE_PENALTY", "presence_penalty", 0.0))
    frequency_penalty = float(pick(getattr(args, "frequency_penalty", None), "LLM_FREQUENCY_PENALTY", "frequency_penalty", 0.0))
    timeout_seconds = int(pick(getattr(args, "timeout_seconds", None), "LLM_TIMEOUT_SECONDS", "timeout_seconds", 120))
    max_retries = int(pick(getattr(args, "max_retries", None), "LLM_MAX_RETRIES", "max_retries", 2))

    return LLMConfig(
        provider=provider, api_base=api_base, model=model, api_key=api_key,
        temperature=temperature, top_p=top_p,
        presence_penalty=presence_penalty, frequency_penalty=frequency_penalty,
        timeout_seconds=timeout_seconds, max_retries=max_retries
    )

def chat_completion_llm(messages, llm: LLMConfig):
    # OpenWebUI exposes /api/chat/completions; Ollama and OpenAI use the /v1 path.
    if llm.provider == "openwebui":
        url = f"{llm.api_base.rstrip('/')}/api/chat/completions"
    else:
        url = f"{llm.api_base.rstrip('/')}/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    if llm.api_key:
        headers["Authorization"] = f"Bearer {llm.api_key}"

    payload = {
        "model": llm.model,
        "messages": messages,
        "temperature": llm.temperature,
        "top_p": llm.top_p,
        "presence_penalty": llm.presence_penalty,
        "frequency_penalty": llm.frequency_penalty,
        "stream": False
    }

    # Simple retry loop with exponential backoff, capped at 8 seconds.
    attempt = 0
    last_err = None
    while attempt <= llm.max_retries:
        try:
            r = requests.post(url, headers=headers, json=payload, timeout=llm.timeout_seconds)
            r.raise_for_status()
            data = r.json()
            return data["choices"][0]["message"]["content"]
        except Exception as e:
            last_err = e
            attempt += 1
            if attempt > llm.max_retries:
                break
            time.sleep(min(2**attempt, 8))
    raise RuntimeError(f"LLM request failed after {llm.max_retries} retries: {last_err}")

def call_llm_via_messages(prompt: str, llm: LLMConfig) -> str:
    return chat_completion_llm([{"role":"user","content": prompt}], llm)

# ---- Ghost ----
def ghost_jwt(key: str) -> str:
    key_id, secret = key.split(':')
    iat = int(time.time())
    header = {"alg": "HS256", "kid": key_id, "typ": "JWT"}
    payload = {"iat": iat, "exp": iat + 5 * 60, "aud": "/admin/"}
    return jwt.encode(payload, bytes.fromhex(secret), algorithm='HS256', headers=header)

def create_ghost_draft(ghost_url, ghost_key, html_content, title, tags):
    token = ghost_jwt(ghost_key)
    payload = { "posts": [{
        "title": title, "html": html_content, "status": "draft",
        "tags": [{"name": t} for t in tags]
    }]}
    r = requests.post(
        f"{ghost_url}/posts/",
        headers={"Authorization": f"Ghost {token}", "Content-Type": "application/json"},
        data=json.dumps(payload), timeout=60
    )
    r.raise_for_status()
    return r.json()["posts"][0]["url"]

# ---- Memory/embeddings ----
from db import connect as db_connect, topk_similar
from emb import embed_text

def build_related_hint_auto(title, body, llm_cfg, cfg_db):
    # The embedding endpoint defaults to the LLM endpoint unless EMB_* overrides are set.
    api_base = os.getenv("EMB_API_BASE", llm_cfg.api_base)
    api_key = os.getenv("EMB_API_KEY", llm_cfg.api_key)
    model = os.getenv("EMB_MODEL", cfg_db.get("embed_model", "text-embedding-3-small"))
    qtext = (title + "\n\n" + body)[:5000]
    try:
        vec = embed_text(qtext, api_base, api_key, model)
    except Exception:
        return "—"
    con = db_connect(cfg_db["path"])
    hits = topk_similar(con, model=model, query_vec=vec,
                        ref_table="summaries",
                        k=cfg_db.get("related_top_k",3),
                        min_sim=cfg_db.get("min_similarity",0.78))
    if not hits:
        return "—"
    lines = []
    for sid, t, s, nd in hits:
        lines.append(f"- {nd or 'dříve'}: {t}")
    return "O podobném tématu jsme psali:\n" + "\n".join(lines) + "\nZmiň jednou větou souvislost."

def main():
    ap = argparse.ArgumentParser(description="Offline-first generator + Ghost draft")
    ap.add_argument("entries_dir", help="entries/YYYY-MM-DD directory")
    ap.add_argument("--out", help="Output HTML path, e.g. dist/2025-09-19.html")
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--publish", action="store_true")
    # LLM overrides
    ap.add_argument("--llm-provider")
    ap.add_argument("--llm-api-base")
    ap.add_argument("--llm-model")
    ap.add_argument("--temperature", type=float)
    ap.add_argument("--top-p", type=float)
    ap.add_argument("--presence-penalty", type=float)
    ap.add_argument("--frequency-penalty", type=float)
    ap.add_argument("--timeout-seconds", type=int)
    ap.add_argument("--max-retries", type=int)
    args = ap.parse_args()

    cfg = load_config()
    llm = resolve_llm_config(cfg, args)

    item_tpl = read_file(TEMPLATES / "item.html.j2")
    news_tpl = read_file(TEMPLATES / "newsletter.html.j2")
    prompt_template = read_file(TEMPLATES / "prompt.txt")
    style_examples = read_file(TEMPLATES / "style_bank.md").strip()
    prompt_template = prompt_template.replace("{style_examples}", style_examples)

    paths = sorted(glob.glob(os.path.join(args.entries_dir, "*.md")))
    blocks = []
    for p in paths:
        fm_text = pathlib.Path(p).read_text(encoding="utf-8")
        fm, body = parse_front_matter(fm_text)
        if fm.get("status","todo") == "skip":
            continue
        title = fm.get("title") or pathlib.Path(p).stem.replace("-"," ").title()
        source_name = fm.get("source_name","Zdroj neuveden")
        related_hint = build_related_hint_auto(title, body, llm, cfg.get("db",{}))
        prompt = (prompt_template
                  .replace("{title}", title)
                  .replace("{body}", body)
                  .replace("{source_name}", source_name)
                  .replace("{related_hint}", related_hint))
        summary = call_llm_via_messages(prompt, llm)
        block_html = Template(item_tpl).render(title=title, summary=summary)
        blocks.append(block_html)

    newsletter_title = Template(cfg["newsletter_title"]).render(date=cfg["date"])
    newsletter_subtitle = cfg.get("newsletter_subtitle","")
    html_out = Template(news_tpl).render(
        newsletter_title=newsletter_title,
        newsletter_subtitle=newsletter_subtitle,
        blocks=blocks
    )

    if args.out:
        outp = pathlib.Path(args.out)
        outp.parent.mkdir(parents=True, exist_ok=True)
        outp.write_text(html_out, encoding="utf-8")
        print(f"Saved: {outp}")

    if args.publish:
        ghost_url = os.getenv("GHOST_ADMIN_API_URL")
        ghost_key = os.getenv("GHOST_ADMIN_API_KEY")
        if not (ghost_url and ghost_key):
            print("Missing GHOST_ADMIN_API_URL or GHOST_ADMIN_API_KEY in .env", file=sys.stderr)
            sys.exit(2)
        url = create_ghost_draft(ghost_url, ghost_key, html_out, newsletter_title, cfg.get("default_tags",[]))
        print("Draft:", url)

    # --dry-run (and the no-output default) just prints the rendered HTML.
    if args.dry_run or not (args.out or args.publish):
        print(html_out)

if __name__ == "__main__":
    main()
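A small demo of the precedence that `pick()` implements (CLI flag > env var > `config.yaml` > built-in default); a sketch, assuming it is run from the repo root so `build.py` and its `db`/`emb` imports resolve:

```python
import argparse, os
from build import resolve_llm_config

os.environ["LLM_PROVIDER"] = "ollama"                 # env layer
ns = argparse.Namespace(llm_provider=None, llm_api_base=None, llm_model=None,
                        temperature=0.7, top_p=None, presence_penalty=None,
                        frequency_penalty=None, timeout_seconds=None, max_retries=None)
llm = resolve_llm_config({"llm": {"model": "qwen2.5-7b-instruct"}}, ns)
print(llm.provider)     # "ollama"                 (from the env var)
print(llm.model)        # "qwen2.5-7b-instruct"    (from the config dict)
print(llm.api_base)     # "http://localhost:11434" (provider-specific default)
print(llm.temperature)  # 0.7                      (CLI value wins)
```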
config.yaml (Normal file, 22 lines)
@@ -0,0 +1,22 @@
newsletter_title: "Objevy týdne – {{ date }}"
newsletter_subtitle: "Archeologie, historie a příbuzné vědy"
default_tags: ["Newsletter","Archeologie"]
# Legacy top-level keys; build.py only reads the llm: block below.
model: "gpt-4o-mini"
temperature: 0.2

llm:
  provider: "openwebui"   # openwebui | ollama | openai
  api_base: "http://localhost:3000"
  model: "qwen2.5-7b-instruct"
  temperature: 0.2
  top_p: 1.0
  presence_penalty: 0.0
  frequency_penalty: 0.0
  timeout_seconds: 120
  max_retries: 2

db:
  path: "data/newsletter.db"
  embed_model: "text-embedding-3-small"
  related_top_k: 3
  min_similarity: 0.78
db.py (Normal file, 91 lines)
@@ -0,0 +1,91 @@
import sqlite3
from pathlib import Path
import math
import struct

def connect(db_path: str) -> sqlite3.Connection:
    Path(db_path).parent.mkdir(parents=True, exist_ok=True)
    con = sqlite3.connect(db_path)
    con.execute("PRAGMA foreign_keys = ON;")
    return con

def init_db(con: sqlite3.Connection, schema_path: str = "scripts/schema.sql"):
    schema = Path(schema_path).read_text(encoding="utf-8")
    con.executescript(schema)
    con.commit()

def pack_vec(vec):
    # Serialize a float vector as little-endian float32, 4 bytes per dimension.
    return struct.pack("<%sf" % len(vec), *vec)

def unpack_vec(blob):
    fcount = len(blob)//4
    return list(struct.unpack("<%sf" % fcount, blob))

def cosine(a, b):
    na = math.sqrt(sum(x*x for x in a)); nb = math.sqrt(sum(x*x for x in b))
    if na == 0 or nb == 0: return 0.0
    return sum(x*y for x,y in zip(a,b)) / (na*nb)

def upsert_source(con, url=None, title=None, publisher=None, date_published=None, content=None, tags=None):
    cur = con.execute(
        """INSERT INTO sources(url, title, publisher, date_published, content)
           VALUES(?,?,?,?,?)
           ON CONFLICT(url) DO UPDATE SET
             title=COALESCE(excluded.title, title),
             publisher=COALESCE(excluded.publisher, publisher),
             date_published=COALESCE(excluded.date_published, date_published),
             content=COALESCE(excluded.content, content)
        """, (url, title, publisher, date_published, content)
    )
    # A NULL url never conflicts and cannot be looked up, so fall back to the new rowid.
    row = con.execute("SELECT id FROM sources WHERE url=?", (url,)).fetchone()
    sid = row[0] if row else cur.lastrowid
    if tags:
        for t in tags:
            con.execute("INSERT OR IGNORE INTO tags(name) VALUES(?)", (t,))
            tid = con.execute("SELECT id FROM tags WHERE name=?", (t,)).fetchone()[0]
            con.execute("INSERT OR IGNORE INTO source_tags(source_id, tag_id) VALUES(?,?)", (sid, tid))
    con.commit()
    return sid

def insert_summary(con, source_id, title, summary, newsletter_date=None, tone_version=None):
    cur = con.cursor()
    cur.execute(
        """INSERT INTO summaries(source_id, title, summary, newsletter_date, tone_version)
           VALUES (?,?,?,?,?)""",
        (source_id, title, summary, newsletter_date, tone_version)
    )
    con.commit()
    return cur.lastrowid

def upsert_embedding(con, ref_table, ref_id, model, vec):
    dim = len(vec)
    blob = pack_vec(vec)
    con.execute(
        """INSERT INTO embeddings(ref_table, ref_id, model, dim, vec)
           VALUES (?,?,?,?,?)
           ON CONFLICT(ref_table, ref_id, model) DO UPDATE SET vec=excluded.vec, dim=excluded.dim""",
        (ref_table, ref_id, model, dim, blob)
    )
    con.commit()

def topk_similar(con, model, query_vec, ref_table="summaries", k=3, min_sim=0.78):
    # Brute-force scan: SQLite stores the vectors, Python does the cosine math.
    rows = con.execute(
        "SELECT ref_id, dim, vec FROM embeddings WHERE ref_table=? AND model=?;",
        (ref_table, model)
    ).fetchall()
    scored = []
    for ref_id, dim, blob in rows:
        vec = unpack_vec(blob)
        if len(vec) != len(query_vec):
            continue
        sim = cosine(query_vec, vec)
        if sim >= min_sim:
            scored.append((sim, ref_id))
    scored.sort(reverse=True)
    ref_ids = [rid for _, rid in scored[:k]]
    if not ref_ids: return []
    if ref_table == "summaries":
        q = "SELECT id, title, summary, newsletter_date FROM summaries WHERE id IN (%s)" % ",".join("?"*len(ref_ids))
        return con.execute(q, ref_ids).fetchall()
    else:
        q = "SELECT id, title, url, date_published FROM sources WHERE id IN (%s)" % ",".join("?"*len(ref_ids))
        return con.execute(q, ref_ids).fetchall()
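A round trip through the vector helpers above (run from the repo root):

```python
from db import pack_vec, unpack_vec, cosine

v = [0.1, 0.2, 0.3]
blob = pack_vec(v)
assert len(blob) == 4 * len(v)       # float32: 4 bytes per dimension
w = unpack_vec(blob)                 # float32 round-trip, small precision loss
assert abs(cosine(v, w) - 1.0) < 1e-6
print(cosine(v, [0.3, 0.2, 0.1]))    # ~0.714: same magnitude, different direction
```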
emb.py (Normal file, 10 lines)
@@ -0,0 +1,10 @@
import requests

def embed_text(text: str, api_base: str, api_key: str|None, model: str) -> list[float]:
    # OpenAI-compatible embeddings endpoint; also works with local servers
    # that expose /v1/embeddings.
    url = f"{api_base.rstrip('/')}/v1/embeddings"
    headers = {"Content-Type":"application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    r = requests.post(url, headers=headers, json={"model": model, "input": text}, timeout=60)
    r.raise_for_status()
    return r.json()["data"][0]["embedding"]
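Example call shape, assuming an OpenAI-compatible endpoint and a key in the environment; the printed dimension depends on the model (1536 for `text-embedding-3-small`):

```python
import os
from emb import embed_text

vec = embed_text("mezolitická kamenná zeď na dně Baltu",
                 api_base="https://api.openai.com",
                 api_key=os.getenv("OPENAI_API_KEY"),
                 model="text-embedding-3-small")
print(len(vec))
```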
entries/2025-09-19/sample-welsh-rainforests.md (Normal file, 11 lines)
@@ -0,0 +1,11 @@
---
title: "Velšské deštné pralesy"
source_name: "BBC / Wikipedie"
url: "https://www.bbc.com/news/science-environment-00000000"
tags: ["folklor","krajina"]
status: "todo"
---

Deštné pralesy má asi většina z nás spojené spíš s Amazonií nebo jihovýchodní Asií, ale ony existují nejen v tropickém klimatickém pásu, ale i v našem, mírném. Jako deštný se označuje les s více než 200 cm srážek ročně. A právě takový les se kdysi rozkládal od Walesu až po Skotsko.

Dnes z něj zbývá necelých 10 % rozlohy, ale je o něj pečováno velmi pečlivě. Tento prales je také zdrojem celé řady místních mýtů. V tom asi nejznámějším, Mabinogi, se ve čtvrté větvi stane roztržka mezi Pryderim a Gwydionem...
inbox.txt (Normal file, 4 lines)
@@ -0,0 +1,4 @@
# Paste URLs here (one per line)
https://www.bbc.com/news/science-environment-00000000
https://www.nature.com/articles/xxxxxxxx
https://antiquity.ac.uk/article/yyyyyyyy
requirements.txt (Normal file, 12 lines)
@@ -0,0 +1,12 @@
pyyaml
jinja2
python-dotenv
requests
markdown
pyjwt
beautifulsoup4
readability-lxml
trafilatura
tldextract
python-slugify
numpy
scripts/build_from_db.py (Normal file, 154 lines)
@@ -0,0 +1,154 @@
#!/usr/bin/env python3
import os, sys, time, json, argparse, pathlib
import yaml, requests, jwt
from jinja2 import Template
from dotenv import load_dotenv
from datetime import date

# Make the repo-root modules (db.py, emb.py) importable when run as a script.
sys.path.insert(0, str(pathlib.Path(__file__).parent.parent))
from db import connect as db_connect, insert_summary, upsert_embedding
from emb import embed_text

load_dotenv()
ROOT = pathlib.Path(__file__).parent
REPO = ROOT.parent
TEMPLATES = REPO / "templates"

def read_file(p): return pathlib.Path(p).read_text(encoding="utf-8")

def load_config():
    cfg = yaml.safe_load(read_file(REPO / "config.yaml"))
    cfg["date"] = date.today().isoformat()
    return cfg

# LLMConfig / resolve_llm_config are duplicated from build.py so this script
# stays standalone.
from dataclasses import dataclass

@dataclass
class LLMConfig:
    provider: str
    api_base: str
    model: str
    api_key: str | None
    temperature: float
    top_p: float
    presence_penalty: float
    frequency_penalty: float
    timeout_seconds: int
    max_retries: int

def resolve_llm_config(cfg: dict, args) -> LLMConfig:
    llm_cfg = cfg.get("llm", {}) if cfg else {}
    def pick(cli_val, env_key, cfg_key, default=None):
        if cli_val is not None:
            return cli_val
        if env_key and os.getenv(env_key):
            return os.getenv(env_key)
        return llm_cfg.get(cfg_key, default)
    provider = pick(getattr(args, "llm_provider", None), "LLM_PROVIDER", "provider", "openwebui")
    api_base = pick(getattr(args, "llm_api_base", None), "LLM_API_BASE", "api_base",
                    "http://localhost:3000" if provider=="openwebui" else
                    "http://localhost:11434" if provider=="ollama" else
                    "https://api.openai.com")
    model = pick(getattr(args, "llm_model", None), "LLM_MODEL", "model",
                 "qwen2.5-7b-instruct" if provider=="openwebui" else
                 "llama3.1:8b-instruct" if provider=="ollama" else
                 "gpt-4o-mini")
    api_key = os.getenv("LLM_API_KEY") or (os.getenv("OPENAI_API_KEY") if provider=="openai" else None)
    temperature = float(pick(getattr(args, "temperature", None), "LLM_TEMPERATURE", "temperature", 0.2))
    top_p = float(pick(getattr(args, "top_p", None), "LLM_TOP_P", "top_p", 1.0))
    presence_penalty = float(pick(getattr(args, "presence_penalty", None), "LLM_PRESENCE_PENALTY", "presence_penalty", 0.0))
    frequency_penalty = float(pick(getattr(args, "frequency_penalty", None), "LLM_FREQUENCY_PENALTY", "frequency_penalty", 0.0))
    timeout_seconds = int(pick(getattr(args, "timeout_seconds", None), "LLM_TIMEOUT_SECONDS", "timeout_seconds", 120))
    max_retries = int(pick(getattr(args, "max_retries", None), "LLM_MAX_RETRIES", "max_retries", 2))
    return LLMConfig(provider, api_base, model, api_key, temperature, top_p, presence_penalty, frequency_penalty, timeout_seconds, max_retries)

def chat_completion_llm(messages, llm: LLMConfig):
    if llm.provider == "openwebui":
        url = f"{llm.api_base.rstrip('/')}/api/chat/completions"
    else:
        url = f"{llm.api_base.rstrip('/')}/v1/chat/completions"
    headers = {"Content-Type":"application/json"}
    if llm.api_key: headers["Authorization"] = f"Bearer {llm.api_key}"
    payload = {"model": llm.model, "messages": messages, "temperature": llm.temperature, "top_p": llm.top_p,
               "presence_penalty": llm.presence_penalty, "frequency_penalty": llm.frequency_penalty, "stream": False}
    r = requests.post(url, headers=headers, json=payload, timeout=llm.timeout_seconds)
    r.raise_for_status()
    return r.json()["choices"][0]["message"]["content"]

def main():
    ap = argparse.ArgumentParser(description="Build directly from DB (Top-N sources)")
    ap.add_argument("--db", default="data/newsletter.db")
    ap.add_argument("--limit", type=int, default=10)
    ap.add_argument("--out", required=True)
    ap.add_argument("--publish", action="store_true")
    # LLM overrides
    ap.add_argument("--llm-provider"); ap.add_argument("--llm-api-base")
    ap.add_argument("--llm-model"); ap.add_argument("--temperature", type=float)
    ap.add_argument("--top-p", type=float); ap.add_argument("--presence-penalty", type=float)
    ap.add_argument("--frequency-penalty", type=float); ap.add_argument("--timeout-seconds", type=int)
    ap.add_argument("--max-retries", type=int)
    args = ap.parse_args()

    cfg = load_config()
    llm = resolve_llm_config(cfg, args)
    con = db_connect(args.db)

    rows = con.execute("SELECT id, url, title, publisher FROM sources ORDER BY id DESC LIMIT ?", (args.limit,)).fetchall()

    prompt_template = (TEMPLATES / "prompt.txt").read_text(encoding="utf-8")
    style_examples = (TEMPLATES / "style_bank.md").read_text(encoding="utf-8").strip()
    prompt_template = prompt_template.replace("{style_examples}", style_examples)

    item_tpl = (TEMPLATES / "item.html.j2").read_text(encoding="utf-8")
    news_tpl = (TEMPLATES / "newsletter.html.j2").read_text(encoding="utf-8")

    blocks = []
    for sid, url, title, publisher in rows:
        body = (con.execute("SELECT content FROM sources WHERE id=?", (sid,)).fetchone()[0]) or ""
        related_hint = "—"
        prompt = (prompt_template
                  .replace("{title}", title or url)
                  .replace("{body}", body)
                  .replace("{source_name}", publisher or "Zdroj neuveden")
                  .replace("{related_hint}", related_hint))
        summary = chat_completion_llm([{"role":"user","content": prompt}], llm)

        sum_id = insert_summary(con, sid, title or url, summary, newsletter_date=cfg["date"], tone_version="v1")
        # Embedding storage is best-effort; a missing embeddings endpoint must not block the build.
        try:
            vec = embed_text(summary, os.getenv("EMB_API_BASE", cfg["llm"]["api_base"]), os.getenv("EMB_API_KEY", os.getenv("LLM_API_KEY")), os.getenv("EMB_MODEL", cfg["db"]["embed_model"]))
            upsert_embedding(con, "summaries", sum_id, os.getenv("EMB_MODEL", cfg["db"]["embed_model"]), vec)
        except Exception:
            pass

        blocks.append(Template(item_tpl).render(title=(title or url), summary=summary))

    newsletter_title = Template(cfg["newsletter_title"]).render(date=cfg["date"])
    newsletter_subtitle = cfg.get("newsletter_subtitle","")
    html_out = Template(news_tpl).render(newsletter_title=newsletter_title, newsletter_subtitle=newsletter_subtitle, blocks=blocks)

    outp = pathlib.Path(args.out); outp.parent.mkdir(parents=True, exist_ok=True); outp.write_text(html_out, encoding="utf-8")
    print(f"Saved: {outp}")

    if args.publish:
        ghost_url = os.getenv("GHOST_ADMIN_API_URL")
        ghost_key = os.getenv("GHOST_ADMIN_API_KEY")
        if ghost_url and ghost_key:
            def ghost_jwt(key: str) -> str:
                key_id, secret = key.split(':')
                iat = int(time.time())
                header = {"alg": "HS256", "kid": key_id, "typ": "JWT"}
                payload = {"iat": iat, "exp": iat + 5 * 60, "aud": "/admin/"}
                return jwt.encode(payload, bytes.fromhex(secret), algorithm='HS256', headers=header)
            token = ghost_jwt(ghost_key)
            payload = {"posts":[{"title": newsletter_title, "html": html_out, "status": "draft", "tags": [{"name": t} for t in cfg.get("default_tags",[])]}]}
            r = requests.post(f"{ghost_url}/posts/", headers={"Authorization": f"Ghost {token}", "Content-Type": "application/json"}, data=json.dumps(payload), timeout=60)
            r.raise_for_status()
            print("Draft:", r.json()["posts"][0]["url"])
        else:
            print("Missing Ghost creds; skipped publish.")

if __name__ == "__main__":
    main()
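To spot-check what `build_from_db.py` wrote back, a minimal sketch assuming the default DB path:

```python
import sqlite3

con = sqlite3.connect("data/newsletter.db")
for sid, title, nd in con.execute(
        "SELECT source_id, title, newsletter_date "
        "FROM summaries ORDER BY id DESC LIMIT 5"):
    print(nd, title, f"(source {sid})")
```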
scripts/db_cli.py (Normal file, 18 lines)
@@ -0,0 +1,18 @@
#!/usr/bin/env python3
import argparse, sys, pathlib

# Make the repo-root db.py importable when run as a script.
sys.path.insert(0, str(pathlib.Path(__file__).parent.parent))
from db import connect, init_db

def main():
    ap = argparse.ArgumentParser(description="DB CLI")
    ap.add_argument("cmd", choices=["init"])
    ap.add_argument("--db", default="data/newsletter.db")
    ap.add_argument("--schema", default="scripts/schema.sql")
    args = ap.parse_args()

    con = connect(args.db)
    if args.cmd == "init":
        init_db(con, args.schema)
        print(f"Initialized schema in {args.db}")

if __name__ == "__main__":
    main()
scripts/ingest_list.py (Normal file, 130 lines)
@@ -0,0 +1,130 @@
#!/usr/bin/env python3
import argparse, sqlite3, re, sys, time
from pathlib import Path
from urllib.parse import urlparse, urlunparse
import requests, tldextract
from bs4 import BeautifulSoup
from readability import Document as ReadabilityDoc
import trafilatura
from slugify import slugify
from datetime import date

def connect(db_path):
    con = sqlite3.connect(db_path)
    con.execute("PRAGMA foreign_keys=ON;")
    return con

def upsert_source(con, url, title=None, publisher=None, date_published=None, content=None, tags=None):
    con.execute(
        """INSERT INTO sources(url, title, publisher, date_published, content)
           VALUES (?,?,?,?,?)
           ON CONFLICT(url) DO UPDATE SET
             title=COALESCE(excluded.title, title),
             publisher=COALESCE(excluded.publisher, publisher),
             date_published=COALESCE(excluded.date_published, date_published),
             content=COALESCE(excluded.content, content)
        """, (url, title, publisher, date_published, content)
    )
    sid = con.execute("SELECT id FROM sources WHERE url=?", (url,)).fetchone()[0]
    con.commit()
    return sid

def normalize_url(u: str) -> str:
    # Default to https for schemeless input, then strip tracking params
    # (utm_*, fbclid, gclid) and drop the fragment.
    u = u.strip()
    if not re.match(r"^\w+://", u):
        u = "https://" + u
    p = urlparse(u)
    query = re.sub(r'(&|\?)?(utm_[^=&]+|fbclid|gclid)=[^&]*', '', p.query)
    if query.startswith('&'): query = query[1:]
    return urlunparse((p.scheme, p.netloc, p.path, p.params, query, ""))

def domain(url: str) -> str:
    ext = tldextract.extract(url)
    return ".".join(part for part in [ext.domain, ext.suffix] if part)

def fetch_readable(url: str, timeout=20):
    # Try trafilatura first, fall back to readability-lxml; return ("", "") on failure.
    try:
        r = requests.get(url, timeout=timeout, headers={"User-Agent":"Mozilla/5.0"})
        r.raise_for_status()
        html = r.text
    except Exception:
        return "", ""
    try:
        txt = trafilatura.extract(html, include_comments=False, include_tables=False, favor_recall=True)
        if txt:
            soup = BeautifulSoup(html, "html.parser")
            t = (soup.title.string.strip() if soup.title and soup.title.string else "")
            return t, txt.strip()
    except Exception:
        pass
    try:
        doc = ReadabilityDoc(html)
        content_html = doc.summary()
        soup = BeautifulSoup(content_html, "html.parser")
        txt = soup.get_text(separator="\n").strip()
        return (doc.short_title() or "").strip(), txt
    except Exception:
        return "", ""

def write_stub(entries_dir: Path, title: str, url: str, source_name: str|None, text: str|None):
    entries_dir.mkdir(parents=True, exist_ok=True)
    slug = slugify(title or domain(url) or "clanek")[:60] or "clanek"
    path = entries_dir / f"{slug}.md"
    body = text.strip() if text else ""
    # Escape double quotes so the generated YAML front matter stays valid.
    safe_title = (title or "").replace('"', '\\"')
    safe_source = (source_name or domain(url) or "").replace('"', '\\"')
    fm = f'''---
title: "{safe_title}"
source_name: "{safe_source}"
url: "{url}"
tags: []
status: "todo"
---

{body}
'''
    path.write_text(fm, encoding="utf-8")
    return path

def read_lines(source_path: str|None):
    if source_path:
        return Path(source_path).read_text(encoding="utf-8", errors="ignore").splitlines()
    return sys.stdin.read().splitlines()

def main():
    ap = argparse.ArgumentParser(description="Ingest list of URLs into SQLite and optional entry stubs")
    ap.add_argument("--db", default="data/newsletter.db")
    ap.add_argument("--list", help="Text file with URLs (one per line). If omitted, read from STDIN.")
    ap.add_argument("--fetch", action="store_true", help="Fetch & extract readable text")
    ap.add_argument("--sleep", type=float, default=0.0)
    ap.add_argument("--stubs", action="store_true")
    ap.add_argument("--date", default=date.today().isoformat())
    args = ap.parse_args()

    con = connect(args.db)
    lines = read_lines(args.list)

    urls = []
    for ln in lines:
        ln = ln.strip()
        if not ln or ln.startswith("#"):
            continue
        # Accept full URLs or bare domains like example.com/path.
        if re.match(r"^\w+://", ln) or re.match(r"^[\w\.-]+\.[a-z]{2,}(/|$)", ln):
            urls.append(normalize_url(ln))
    # De-duplicate while preserving order.
    seen = set(); urls = [u for u in urls if not (u in seen or seen.add(u))]

    stubs_dir = Path(f"entries/{args.date}") if args.stubs else None
    kept = 0
    for url in urls:
        pub = domain(url)
        title, text = ("","")
        if args.fetch:
            title, text = fetch_readable(url)
        sid = upsert_source(con, url=url, title=(title or None), publisher=pub, content=(text or None))
        kept += 1
        if stubs_dir:
            stub = write_stub(stubs_dir, title or url, url, pub, text)
            print(f"Stub: {stub}")
        if args.sleep: time.sleep(args.sleep)
    print(f"Ingested: {kept} URLs into {args.db}")

if __name__ == "__main__":
    main()
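A spot-check of the URL handling above (run from the repo root, where `scripts` is importable as an implicit namespace package):

```python
from scripts.ingest_list import normalize_url, domain

# Tracking params are stripped, the scheme is defaulted, the fragment dropped.
print(normalize_url("www.bbc.com/news/science?utm_source=x&id=7#top"))
# -> https://www.bbc.com/news/science?id=7
print(domain("https://antiquity.ac.uk/article/yyyyyyyy"))
# -> antiquity.ac.uk
```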
scripts/schema.sql (Normal file, 71 lines)
@@ -0,0 +1,71 @@
PRAGMA foreign_keys = ON;

CREATE TABLE IF NOT EXISTS sources (
  id INTEGER PRIMARY KEY,
  url TEXT UNIQUE,
  title TEXT,
  publisher TEXT,
  date_published TEXT,
  content TEXT
);

CREATE TABLE IF NOT EXISTS summaries (
  id INTEGER PRIMARY KEY,
  source_id INTEGER REFERENCES sources(id) ON DELETE SET NULL,
  title TEXT NOT NULL,
  summary TEXT NOT NULL,
  newsletter_date TEXT,
  tone_version TEXT,
  created_at TEXT DEFAULT (datetime('now'))
);

CREATE TABLE IF NOT EXISTS tags (
  id INTEGER PRIMARY KEY,
  name TEXT UNIQUE
);
CREATE TABLE IF NOT EXISTS source_tags (
  source_id INTEGER REFERENCES sources(id) ON DELETE CASCADE,
  tag_id INTEGER REFERENCES tags(id) ON DELETE CASCADE,
  PRIMARY KEY(source_id, tag_id)
);

CREATE TABLE IF NOT EXISTS embeddings (
  id INTEGER PRIMARY KEY,
  ref_table TEXT NOT NULL CHECK(ref_table IN ('sources','summaries')),
  ref_id INTEGER NOT NULL,
  model TEXT NOT NULL,
  dim INTEGER NOT NULL,
  vec BLOB NOT NULL,
  UNIQUE(ref_table, ref_id, model)
);

CREATE VIRTUAL TABLE IF NOT EXISTS sources_fts USING fts5(
  title, content, content='sources', content_rowid='id'
);
CREATE VIRTUAL TABLE IF NOT EXISTS summaries_fts USING fts5(
  title, summary, content='summaries', content_rowid='id'
);

CREATE TRIGGER IF NOT EXISTS sources_ai AFTER INSERT ON sources BEGIN
  INSERT INTO sources_fts(rowid, title, content) VALUES (new.id, new.title, new.content);
END;
CREATE TRIGGER IF NOT EXISTS sources_au AFTER UPDATE ON sources BEGIN
  INSERT INTO sources_fts(sources_fts, rowid, title, content)
  VALUES('delete', old.id, old.title, old.content);
  INSERT INTO sources_fts(rowid, title, content) VALUES (new.id, new.title, new.content);
END;
CREATE TRIGGER IF NOT EXISTS sources_ad AFTER DELETE ON sources BEGIN
  INSERT INTO sources_fts(sources_fts, rowid, title, content) VALUES('delete', old.id, old.title, old.content);
END;

CREATE TRIGGER IF NOT EXISTS summaries_ai AFTER INSERT ON summaries BEGIN
  INSERT INTO summaries_fts(rowid, title, summary) VALUES (new.id, new.title, new.summary);
END;
CREATE TRIGGER IF NOT EXISTS summaries_au AFTER UPDATE ON summaries BEGIN
  INSERT INTO summaries_fts(summaries_fts, rowid, title, summary)
  VALUES('delete', old.id, old.title, old.summary);
  INSERT INTO summaries_fts(rowid, title, summary) VALUES (new.id, new.title, new.summary);
END;
CREATE TRIGGER IF NOT EXISTS summaries_ad AFTER DELETE ON summaries BEGIN
  INSERT INTO summaries_fts(summaries_fts, rowid, title, summary) VALUES('delete', old.id, old.title, old.summary);
END;
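The FTS5 tables are external-content, so a match can be read straight off the virtual table; a minimal sketch, assuming some sources have been ingested:

```python
import sqlite3

con = sqlite3.connect("data/newsletter.db")
rows = con.execute(
    "SELECT rowid, title FROM sources_fts WHERE sources_fts MATCH ? "
    "ORDER BY rank LIMIT 5",
    ("prales*",),           # prefix query against title and content
).fetchall()
for rowid, title in rows:
    print(rowid, title)
```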
scripts/sync_entries_to_db.py (Normal file, 38 lines)
@@ -0,0 +1,38 @@
#!/usr/bin/env python3
import argparse, re, sys
from pathlib import Path
import yaml

# Make the repo-root db.py importable when run as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))
from db import connect, upsert_source

def parse_front_matter(text: str):
    m = re.match(r"^---\n(.*?)\n---\n(.*)$", text, flags=re.S|re.M)
    if not m:
        return {}, text.strip()
    fm = yaml.safe_load(m.group(1)) or {}
    body = m.group(2).strip()
    return fm, body

def main():
    ap = argparse.ArgumentParser(description="Sync entries/*.md into SQLite sources")
    ap.add_argument("--db", default="data/newsletter.db")
    ap.add_argument("--dir", required=True, help="entries/YYYY-MM-DD directory")
    args = ap.parse_args()

    con = connect(args.db)
    for p in Path(args.dir).glob("*.md"):
        text = p.read_text(encoding="utf-8")
        fm, body = parse_front_matter(text)
        title = fm.get("title") or p.stem
        url = fm.get("url")
        publisher = fm.get("source_name")
        upsert_source(con,
            url=url,
            title=title,
            publisher=publisher,
            date_published=None,
            content=body
        )
        print(f"Synced: {p.name}")

if __name__ == "__main__":
    main()
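`parse_front_matter` splits a stub into its YAML metadata and markdown body; a quick round trip (run from the repo root):

```python
from scripts.sync_entries_to_db import parse_front_matter

fm, body = parse_front_matter('---\ntitle: "Test"\nstatus: "todo"\n---\n\nBody text.')
print(fm["title"], fm["status"])  # Test todo
print(body)                       # Body text.
```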
templates/item.html.j2 (Normal file, 2 lines)
@@ -0,0 +1,2 @@
<h3><strong>{{ title }}</strong></h3>
<p>{{ summary }}</p>
templates/newsletter.html.j2 (Normal file, 7 lines)
@@ -0,0 +1,7 @@
<h2>{{ newsletter_title }}</h2>
<p><em>{{ newsletter_subtitle }}</em></p>
<hr>
{% for b in blocks %}
{{ b }}
{% if not loop.last %}<hr>{% endif %}
{% endfor %}
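The two templates compose the way `build.py` wires them: each item is rendered first, then the rendered blocks are passed into the newsletter shell (the summary text here is a placeholder):

```python
from pathlib import Path
from jinja2 import Template

item = Template(Path("templates/item.html.j2").read_text(encoding="utf-8"))
news = Template(Path("templates/newsletter.html.j2").read_text(encoding="utf-8"))
blocks = [item.render(title="Velšské deštné pralesy", summary="(summary text)")]
print(news.render(newsletter_title="Objevy týdne – 2025-09-19",
                  newsletter_subtitle="Archeologie, historie a příbuzné vědy",
                  blocks=blocks))
```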
templates/prompt.txt (Normal file, 24 lines)
@@ -0,0 +1,24 @@
Jsi editor newsletteru o archeologii, historii a příbuzných vědách. Piš česky, srozumitelně pro laiky, věcně, bez žargonu; jednu větu můžeš udělat mírně poutavou, ale ne bulvární. Drž rozsah 3–4 věty.

ŠABLONA OBSAHU:
1) Co je novinka/objev.
2) Proč je důležitý (dopad, změna pohledu, metoda).
3) Kontext nebo zajímavost (kultura, datace, souvislosti).
4) (volitelně) Jedna věta s odkazem na předchozí související text, pokud je k dispozici.
Na konec přidej: „Zdroj: {source_name}“.

=== UKÁZKY STYLU ===
{style_examples}

=== VSTUPNÍ POLOŽKA ===
TITULEK: {title}
TEXT:
{body}

=== NÁVAZNOSTI (VOLITELNÉ) ===
{related_hint}

=== POKYNY ===
- Drž 3–4 věty.
- Pokud jsou náležitosti neúplné, buď opatrný a nic si nedovymýšlej.
- Zakonči přesně: „Zdroj: {source_name}“.
templates/style_bank.md (Normal file, 2 lines)
@@ -0,0 +1,2 @@
- Velšské deštné pralesy: Velká Británie je domovem unikátních zbytků mírného deštného pralesa, který se kdysi rozkládal od Walesu až po Skotsko. Dnes přežívá jen asi 10 % původní plochy, ale probíhají programy jeho ochrany a obnovy. Pralesy se objevují i v mytologii, například ve čtvrté větvi velšského eposu Mabinogi, kde jsou spojeny s kouzelnými zvířaty. Kontrolovaná pastva, kterou je naši předci udržovali zdravé, se používá i dnes.
- Podmořské stavby lovců‑sběračů: Vědci objevili pod hladinou Baltu kilometr dlouhou kamennou linii, pravděpodobně prehistorickou loveckou strukturu. Nález naznačuje promyšlenou organizaci lovu i přizpůsobení se krajině, kterou později zaplavilo moře. Podobné konstrukce známe z jiných částí světa, ale v Evropě jsou vzácné.