First commit

2025-09-14 16:56:08 +02:00
commit 079bd25899
19 changed files with 976 additions and 0 deletions

21
.gitignore vendored Normal file

@@ -0,0 +1,21 @@
# --- Python / environment ---
.venv/
__pycache__/
*.py[cod]
*.egg-info/
.DS_Store
# --- Secrets ---
.env
# --- Local database (optional) ---
# Comment out if you DO want to commit newsletter history!
data/*.db
# --- Build outputs ---
dist/
*.html
# --- Logs / temp ---
*.log
.cache/

57
Makefile Normal file

@@ -0,0 +1,57 @@
DATE ?= $(shell date +%F)
ENTRIES_DIR ?= entries/$(DATE)
OUT ?= dist/$(DATE).html
LIMIT ?= 10
PROVIDER ?=
API_BASE ?=
MODEL ?=
TEMP ?=
LLM_FLAGS :=
ifneq ($(strip $(PROVIDER)),)
LLM_FLAGS += --llm-provider $(PROVIDER)
endif
ifneq ($(strip $(API_BASE)),)
LLM_FLAGS += --llm-api-base $(API_BASE)
endif
ifneq ($(strip $(MODEL)),)
LLM_FLAGS += --llm-model $(MODEL)
endif
ifneq ($(strip $(TEMP)),)
LLM_FLAGS += --temperature $(TEMP)
endif
.PHONY: init ingest stubs build draft build-db draft-db sync-db clean dry-run

init:
	python scripts/db_cli.py init

ingest:
	python scripts/ingest_list.py --list inbox.txt

stubs:
	python scripts/ingest_list.py --list inbox.txt --fetch --stubs --date $(DATE) --sleep 0.5

build:
	python build.py $(ENTRIES_DIR) --out $(OUT) $(LLM_FLAGS)

draft:
	python build.py $(ENTRIES_DIR) --out $(OUT) --publish $(LLM_FLAGS)

build-db:
	python scripts/build_from_db.py --limit $(LIMIT) --out $(OUT) $(LLM_FLAGS)

draft-db:
	python scripts/build_from_db.py --limit $(LIMIT) --out $(OUT) --publish $(LLM_FLAGS)

sync-db:
	python scripts/sync_entries_to_db.py --dir $(ENTRIES_DIR)

clean:
	rm -rf dist/*

# Dry run: generate summaries and print HTML to stdout (no file write, no publish)
dry-run:
	python build.py $(ENTRIES_DIR) --dry-run $(LLM_FLAGS)

56
README.md Normal file

@@ -0,0 +1,56 @@
# Offline Newsletter Builder (Archaeology/History)
A local-first toolchain that turns your article links (or pasted texts) into a Friday Ghost draft with consistent summaries, backed by a SQLite database that provides memory and related-article suggestions.
## Highlights
- Paste URLs into `inbox.txt` → run importer → get `.md` stubs **and** a growing SQLite DB.
- One command to build the weekly newsletter (HTML) and optionally create a **Ghost draft**.
- Consistent tone via `templates/prompt.txt` (+ your `templates/style_bank.md` samples).
- "Memory" via `data/newsletter.db` (sources, summaries, embeddings, FTS), used to auto-suggest **related** items.
## Quick start
```bash
python -m venv .venv && source .venv/bin/activate
pip install -r requirements.txt
# Configure secrets
cp .env.example .env
# Edit .env to set your keys (or point to local OpenWebUI/Ollama)
# Create DB schema
python scripts/db_cli.py init
# Ingest URLs (from inbox.txt)
python scripts/ingest_list.py --list inbox.txt --fetch --stubs --date 2025-09-19 --sleep 0.5
# Build newsletter from stubs (HTML + optional Ghost draft)
python build.py entries/2025-09-19 --out dist/2025-09-19.html --publish
# (Optional) Build directly from DB (Top-N sources)
python scripts/build_from_db.py --limit 10 --out dist/2025-09-19.html --publish
```
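## Example `.env`

A minimal `.env` sketch: the variable names below are the ones `build.py` and the scripts actually read; all values are placeholders.

```bash
# LLM endpoint (OpenWebUI / Ollama / any OpenAI-compatible API)
LLM_PROVIDER=openwebui
LLM_API_BASE=http://localhost:3000
LLM_MODEL=qwen2.5-7b-instruct
LLM_API_KEY=changeme

# Embeddings (fall back to the LLM endpoint/key when unset)
EMB_API_BASE=http://localhost:3000
EMB_MODEL=text-embedding-3-small

# Ghost Admin API (only needed for --publish)
GHOST_ADMIN_API_URL=https://your-site.ghost.io/ghost/api/admin
GHOST_ADMIN_API_KEY=<key-id>:<hex-secret>
```

The Makefile passes the same overrides through as variables, e.g.:

```bash
make build DATE=2025-09-19 PROVIDER=ollama MODEL=llama3.1:8b-instruct
```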
## Sample Run (Pretend Output)
```bash
$ make init
Initialized schema in data/newsletter.db
$ make stubs DATE=2025-09-19
Stub: entries/2025-09-19/bbc-welsh-rainforests.md
Stub: entries/2025-09-19/nature-bone-discovery.md
Ingested: 2 URLs into data/newsletter.db
$ make build DATE=2025-09-19
[build] Using LLM provider=openwebui model=qwen2.5-7b-instruct
[build] Generating summary for: Velšské deštné pralesy ...
[build] -> Summary written back to DB (id=5, embedding stored)
[build] Generating summary for: Kosterní nález u Nisy ...
[build] -> Summary written back to DB (id=6, embedding stored)
Saved: dist/2025-09-19.html
$ make draft DATE=2025-09-19
Draft: https://your-ghost-site.ghost.io/p/objevy-tydne-2025-09-19-draft
```
You can open the generated HTML under `dist/2025-09-19.html` in your browser to review before publishing.
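## How the related-items lookup works

`build.py` embeds each new item and scans previously stored vectors (float32 BLOBs in the `embeddings` table) with brute-force cosine similarity. A minimal sketch using the repo's own `db.py` / `emb.py` helpers; the query text, endpoint, and model below are placeholders:

```python
from db import connect, topk_similar
from emb import embed_text

con = connect("data/newsletter.db")
# Embed an ad-hoc query (assumes an OpenAI-compatible embeddings endpoint at this base URL).
vec = embed_text("stone structures under the Baltic", "http://localhost:3000", None, "text-embedding-3-small")
# Top-3 stored summaries with cosine similarity >= 0.78 (the config default).
for sid, title, summary, newsletter_date in topk_similar(con, model="text-embedding-3-small", query_vec=vec, k=3):
    print(newsletter_date, title)
```

Vectors are only comparable when produced by the same embedding model, which is why `topk_similar` filters on `model`.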

246
build.py Normal file

@@ -0,0 +1,246 @@
#!/usr/bin/env python3
import os, sys, glob, time, json, argparse, pathlib, re
from datetime import date

import yaml, requests, jwt
from jinja2 import Template
from dotenv import load_dotenv

load_dotenv()
ROOT = pathlib.Path(__file__).parent
TEMPLATES = ROOT / "templates"

def read_file(p):
    return pathlib.Path(p).read_text(encoding="utf-8")

def load_config():
    cfg = yaml.safe_load(read_file(ROOT / "config.yaml"))
    cfg["date"] = date.today().isoformat()
    return cfg

def parse_front_matter(text):
    m = re.match(r"^---\n(.*?)\n---\n(.*)$", text, flags=re.S | re.M)
    if not m:
        return {}, text.strip()
    fm = yaml.safe_load(m.group(1)) or {}
    body = m.group(2).strip()
    return fm, body
# ---- LLM config / client ----
from dataclasses import dataclass

@dataclass
class LLMConfig:
    provider: str
    api_base: str
    model: str
    api_key: str | None
    temperature: float
    top_p: float
    presence_penalty: float
    frequency_penalty: float
    timeout_seconds: int
    max_retries: int

def resolve_llm_config(cfg: dict, args) -> LLMConfig:
    llm_cfg = cfg.get("llm", {}) if cfg else {}

    def pick(cli_val, env_key, cfg_key, default=None):
        if cli_val is not None:
            return cli_val
        if env_key and os.getenv(env_key):
            return os.getenv(env_key)
        return llm_cfg.get(cfg_key, default)

    provider = pick(getattr(args, "llm_provider", None), "LLM_PROVIDER", "provider", "openwebui")
    api_base = pick(getattr(args, "llm_api_base", None), "LLM_API_BASE", "api_base",
                    "http://localhost:3000" if provider == "openwebui" else
                    "http://localhost:11434" if provider == "ollama" else
                    "https://api.openai.com")
    model = pick(getattr(args, "llm_model", None), "LLM_MODEL", "model",
                 "qwen2.5-7b-instruct" if provider == "openwebui" else
                 "llama3.1:8b-instruct" if provider == "ollama" else
                 "gpt-4o-mini")
    api_key = os.getenv("LLM_API_KEY") or (os.getenv("OPENAI_API_KEY") if provider == "openai" else None)
    temperature = float(pick(getattr(args, "temperature", None), "LLM_TEMPERATURE", "temperature", 0.2))
    top_p = float(pick(getattr(args, "top_p", None), "LLM_TOP_P", "top_p", 1.0))
    presence_penalty = float(pick(getattr(args, "presence_penalty", None), "LLM_PRESENCE_PENALTY", "presence_penalty", 0.0))
    frequency_penalty = float(pick(getattr(args, "frequency_penalty", None), "LLM_FREQUENCY_PENALTY", "frequency_penalty", 0.0))
    timeout_seconds = int(pick(getattr(args, "timeout_seconds", None), "LLM_TIMEOUT_SECONDS", "timeout_seconds", 120))
    max_retries = int(pick(getattr(args, "max_retries", None), "LLM_MAX_RETRIES", "max_retries", 2))
    return LLMConfig(
        provider=provider, api_base=api_base, model=model, api_key=api_key,
        temperature=temperature, top_p=top_p,
        presence_penalty=presence_penalty, frequency_penalty=frequency_penalty,
        timeout_seconds=timeout_seconds, max_retries=max_retries,
    )
def chat_completion_llm(messages, llm: LLMConfig):
    # OpenWebUI exposes /api/chat/completions; Ollama and OpenAI use the OpenAI-style /v1 path.
    if llm.provider == "openwebui":
        url = f"{llm.api_base.rstrip('/')}/api/chat/completions"
    else:
        url = f"{llm.api_base.rstrip('/')}/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    if llm.api_key:
        headers["Authorization"] = f"Bearer {llm.api_key}"
    payload = {
        "model": llm.model,
        "messages": messages,
        "temperature": llm.temperature,
        "top_p": llm.top_p,
        "presence_penalty": llm.presence_penalty,
        "frequency_penalty": llm.frequency_penalty,
        "stream": False,
    }
    # Retry with exponential backoff, capped at 8 seconds.
    attempt = 0
    last_err = None
    while attempt <= llm.max_retries:
        try:
            r = requests.post(url, headers=headers, json=payload, timeout=llm.timeout_seconds)
            r.raise_for_status()
            data = r.json()
            return data["choices"][0]["message"]["content"]
        except Exception as e:
            last_err = e
            attempt += 1
            if attempt > llm.max_retries:
                break
            time.sleep(min(2 ** attempt, 8))
    raise RuntimeError(f"LLM request failed after {llm.max_retries} retries: {last_err}")

def call_llm_via_messages(prompt: str, llm: LLMConfig) -> str:
    return chat_completion_llm([{"role": "user", "content": prompt}], llm)
# ---- Ghost ----
def ghost_jwt(key: str) -> str:
    # Ghost Admin API keys have the form "<key id>:<hex secret>"; tokens are valid for 5 minutes.
    key_id, secret = key.split(":")
    iat = int(time.time())
    header = {"alg": "HS256", "kid": key_id, "typ": "JWT"}
    payload = {"iat": iat, "exp": iat + 5 * 60, "aud": "/admin/"}
    return jwt.encode(payload, bytes.fromhex(secret), algorithm="HS256", headers=header)

def create_ghost_draft(ghost_url, ghost_key, html_content, title, tags):
    token = ghost_jwt(ghost_key)
    payload = {"posts": [{
        "title": title, "html": html_content, "status": "draft",
        "tags": [{"name": t} for t in tags],
    }]}
    r = requests.post(
        f"{ghost_url}/posts/",
        headers={"Authorization": f"Ghost {token}", "Content-Type": "application/json"},
        data=json.dumps(payload), timeout=60,
    )
    r.raise_for_status()
    return r.json()["posts"][0]["url"]
# ---- Memory / embeddings ----
from db import connect as db_connect, topk_similar
from emb import embed_text

def build_related_hint_auto(title, body, llm_cfg, cfg_db):
    api_base = os.getenv("EMB_API_BASE", llm_cfg.api_base)
    api_key = os.getenv("EMB_API_KEY", llm_cfg.api_key)
    model = os.getenv("EMB_MODEL", cfg_db.get("embed_model", "text-embedding-3-small"))
    qtext = (title + "\n\n" + body)[:5000]
    try:
        vec = embed_text(qtext, api_base, api_key, model)
    except Exception:
        return ""  # embeddings are optional; fail quietly and skip the hint
    con = db_connect(cfg_db["path"])
    hits = topk_similar(con, model=model, query_vec=vec,
                        ref_table="summaries",
                        k=cfg_db.get("related_top_k", 3),
                        min_sim=cfg_db.get("min_similarity", 0.78))
    if not hits:
        return ""
    lines = []
    for sid, t, s, nd in hits:
        lines.append(f"- {nd or 'dříve'}: {t}")  # 'dříve' = "earlier"
    # Czech hint for the prompt: "We've covered a similar topic before: ...
    # Mention the connection in one sentence."
    return "O podobném tématu jsme psali:\n" + "\n".join(lines) + "\nZmiň jednou větou souvislost."
def main():
    ap = argparse.ArgumentParser(description="Offline-first generator + Ghost draft")
    ap.add_argument("entries_dir", help="entries/YYYY-MM-DD directory")
    ap.add_argument("--out", help="Output HTML path, e.g. dist/2025-09-19.html")
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--publish", action="store_true")
    # LLM overrides
    ap.add_argument("--llm-provider")
    ap.add_argument("--llm-api-base")
    ap.add_argument("--llm-model")
    ap.add_argument("--temperature", type=float)
    ap.add_argument("--top-p", type=float)
    ap.add_argument("--presence-penalty", type=float)
    ap.add_argument("--frequency-penalty", type=float)
    ap.add_argument("--timeout-seconds", type=int)
    ap.add_argument("--max-retries", type=int)
    args = ap.parse_args()

    cfg = load_config()
    llm = resolve_llm_config(cfg, args)
    item_tpl = read_file(TEMPLATES / "item.html.j2")
    news_tpl = read_file(TEMPLATES / "newsletter.html.j2")
    prompt_template = read_file(TEMPLATES / "prompt.txt")
    style_examples = read_file(TEMPLATES / "style_bank.md").strip()
    prompt_template = prompt_template.replace("{style_examples}", style_examples)

    paths = sorted(glob.glob(os.path.join(args.entries_dir, "*.md")))
    blocks = []
    for p in paths:
        fm_text = pathlib.Path(p).read_text(encoding="utf-8")
        fm, body = parse_front_matter(fm_text)
        if fm.get("status", "todo") == "skip":
            continue
        title = fm.get("title") or pathlib.Path(p).stem.replace("-", " ").title()
        source_name = fm.get("source_name", "Zdroj neuveden")  # "source not given"
        related_hint = build_related_hint_auto(title, body, llm, cfg.get("db", {}))
        prompt = (prompt_template
                  .replace("{title}", title)
                  .replace("{body}", body)
                  .replace("{source_name}", source_name)
                  .replace("{related_hint}", related_hint))
        summary = call_llm_via_messages(prompt, llm)
        block_html = Template(item_tpl).render(title=title, summary=summary)
        blocks.append(block_html)

    newsletter_title = Template(cfg["newsletter_title"]).render(date=cfg["date"])
    newsletter_subtitle = cfg.get("newsletter_subtitle", "")
    html_out = Template(news_tpl).render(
        newsletter_title=newsletter_title,
        newsletter_subtitle=newsletter_subtitle,
        blocks=blocks,
    )

    if args.dry_run:
        # Dry run: print the HTML and skip file writes and publishing.
        print(html_out)
        return
    if args.out:
        outp = pathlib.Path(args.out)
        outp.parent.mkdir(parents=True, exist_ok=True)
        outp.write_text(html_out, encoding="utf-8")
        print(f"Saved: {outp}")
    if args.publish:
        ghost_url = os.getenv("GHOST_ADMIN_API_URL")
        ghost_key = os.getenv("GHOST_ADMIN_API_KEY")
        if not (ghost_url and ghost_key):
            print("Missing GHOST_ADMIN_API_URL or GHOST_ADMIN_API_KEY in .env", file=sys.stderr)
            sys.exit(2)
        url = create_ghost_draft(ghost_url, ghost_key, html_out, newsletter_title, cfg.get("default_tags", []))
        print("Draft:", url)
    if not (args.out or args.publish):
        print(html_out)

if __name__ == "__main__":
    main()

22
config.yaml Normal file

@@ -0,0 +1,22 @@
newsletter_title: "Objevy týdne {{ date }}"
newsletter_subtitle: "Archeologie, historie a příbuzné vědy"
default_tags: ["Newsletter", "Archeologie"]

llm:
  provider: "openwebui"   # openwebui | ollama | openai
  api_base: "http://localhost:3000"
  model: "qwen2.5-7b-instruct"
  temperature: 0.2
  top_p: 1.0
  presence_penalty: 0.0
  frequency_penalty: 0.0
  timeout_seconds: 120
  max_retries: 2

db:
  path: "data/newsletter.db"
  embed_model: "text-embedding-3-small"
  related_top_k: 3
  min_similarity: 0.78

91
db.py Normal file

@@ -0,0 +1,91 @@
import math
import sqlite3
import struct
from pathlib import Path

def connect(db_path: str) -> sqlite3.Connection:
    Path(db_path).parent.mkdir(parents=True, exist_ok=True)
    con = sqlite3.connect(db_path)
    con.execute("PRAGMA foreign_keys = ON;")
    return con

def init_db(con: sqlite3.Connection, schema_path: str = "scripts/schema.sql"):
    schema = Path(schema_path).read_text(encoding="utf-8")
    con.executescript(schema)
    con.commit()

def pack_vec(vec):
    # Store embeddings as little-endian float32 blobs.
    return struct.pack("<%sf" % len(vec), *vec)

def unpack_vec(blob):
    fcount = len(blob) // 4
    return list(struct.unpack("<%sf" % fcount, blob))

def cosine(a, b):
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(x * x for x in b))
    if na == 0 or nb == 0:
        return 0.0
    return sum(x * y for x, y in zip(a, b)) / (na * nb)

def upsert_source(con, url=None, title=None, publisher=None, date_published=None, content=None, tags=None):
    con.execute(
        """INSERT INTO sources(url, title, publisher, date_published, content)
           VALUES (?,?,?,?,?)
           ON CONFLICT(url) DO UPDATE SET
             title=COALESCE(excluded.title, title),
             publisher=COALESCE(excluded.publisher, publisher),
             date_published=COALESCE(excluded.date_published, date_published),
             content=COALESCE(excluded.content, content)
        """, (url, title, publisher, date_published, content)
    )
    sid = con.execute("SELECT id FROM sources WHERE url=?", (url,)).fetchone()[0]
    if tags:
        for t in tags:
            con.execute("INSERT OR IGNORE INTO tags(name) VALUES(?)", (t,))
            tid = con.execute("SELECT id FROM tags WHERE name=?", (t,)).fetchone()[0]
            con.execute("INSERT OR IGNORE INTO source_tags(source_id, tag_id) VALUES(?,?)", (sid, tid))
    con.commit()
    return sid

def insert_summary(con, source_id, title, summary, newsletter_date=None, tone_version=None):
    cur = con.cursor()
    cur.execute(
        """INSERT INTO summaries(source_id, title, summary, newsletter_date, tone_version)
           VALUES (?,?,?,?,?)""",
        (source_id, title, summary, newsletter_date, tone_version)
    )
    con.commit()
    return cur.lastrowid

def upsert_embedding(con, ref_table, ref_id, model, vec):
    dim = len(vec)
    blob = pack_vec(vec)
    con.execute(
        """INSERT INTO embeddings(ref_table, ref_id, model, dim, vec)
           VALUES (?,?,?,?,?)
           ON CONFLICT(ref_table, ref_id, model) DO UPDATE SET vec=excluded.vec, dim=excluded.dim""",
        (ref_table, ref_id, model, dim, blob)
    )
    con.commit()

def topk_similar(con, model, query_vec, ref_table="summaries", k=3, min_sim=0.78):
    # Brute-force scan; fine for a personal newsletter DB with a few thousand rows.
    rows = con.execute(
        "SELECT ref_id, dim, vec FROM embeddings WHERE ref_table=? AND model=?;",
        (ref_table, model)
    ).fetchall()
    scored = []
    for ref_id, dim, blob in rows:
        vec = unpack_vec(blob)
        if len(vec) != len(query_vec):
            continue  # dimension mismatch (e.g. the embedding model changed); skip
        sim = cosine(query_vec, vec)
        if sim >= min_sim:
            scored.append((sim, ref_id))
    scored.sort(reverse=True)
    ref_ids = [rid for _, rid in scored[:k]]
    if not ref_ids:
        return []
    placeholders = ",".join("?" * len(ref_ids))
    if ref_table == "summaries":
        q = f"SELECT id, title, summary, newsletter_date FROM summaries WHERE id IN ({placeholders})"
    else:
        q = f"SELECT id, title, url, date_published FROM sources WHERE id IN ({placeholders})"
    return con.execute(q, ref_ids).fetchall()

10
emb.py Normal file

@@ -0,0 +1,10 @@
import requests

def embed_text(text: str, api_base: str, api_key: str | None, model: str) -> list[float]:
    # POST to an OpenAI-compatible /v1/embeddings endpoint and return the vector.
    url = f"{api_base.rstrip('/')}/v1/embeddings"
    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    r = requests.post(url, headers=headers, json={"model": model, "input": text}, timeout=60)
    r.raise_for_status()
    return r.json()["data"][0]["embedding"]

11
entries/2025-09-19/bbc-welsh-rainforests.md Normal file

@@ -0,0 +1,11 @@
---
title: "Velšské deštné pralesy"
source_name: "BBC / Wikipedie"
url: "https://www.bbc.com/news/science-environment-00000000"
tags: ["folklor","krajina"]
status: "todo"
---
Most of us probably associate rainforests with the Amazon or Southeast Asia, but they exist not only in the tropical climate zone but also in our own temperate one. A rainforest is defined as a forest that receives more than 200 cm of precipitation a year, and exactly such a forest once stretched from Wales up to Scotland.
Today less than 10 % of its area remains, but what is left is carefully tended. The forest is also the source of a whole range of local myths. In probably the best known of them, the Fourth Branch of the Mabinogi, a quarrel breaks out between Pryderi and Gwydion...

4
inbox.txt Normal file

@@ -0,0 +1,4 @@
# Paste URLs here (one per line)
https://www.bbc.com/news/science-environment-00000000
https://www.nature.com/articles/xxxxxxxx
https://antiquity.ac.uk/article/yyyyyyyy

12
requirements.txt Normal file

@@ -0,0 +1,12 @@
pyyaml
jinja2
python-dotenv
requests
markdown
pyjwt
beautifulsoup4
readability-lxml
trafilatura
tldextract
python-slugify
numpy

154
scripts/build_from_db.py Normal file

@@ -0,0 +1,154 @@
#!/usr/bin/env python3
import os, sys, time, json, argparse, pathlib
# Allow running as `python scripts/build_from_db.py` by making repo-root imports (db, emb) resolvable.
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parent.parent))

import yaml, requests, jwt
from jinja2 import Template
from dotenv import load_dotenv
from datetime import date
from db import connect as db_connect, insert_summary, upsert_embedding
from emb import embed_text

load_dotenv()
ROOT = pathlib.Path(__file__).parent
REPO = ROOT.parent
TEMPLATES = REPO / "templates"

def read_file(p):
    return pathlib.Path(p).read_text(encoding="utf-8")

def load_config():
    cfg = yaml.safe_load(read_file(REPO / "config.yaml"))
    cfg["date"] = date.today().isoformat()
    return cfg
from dataclasses import dataclass

@dataclass
class LLMConfig:
    provider: str
    api_base: str
    model: str
    api_key: str | None
    temperature: float
    top_p: float
    presence_penalty: float
    frequency_penalty: float
    timeout_seconds: int
    max_retries: int

def resolve_llm_config(cfg: dict, args) -> LLMConfig:
    # Same precedence as build.py: CLI flag > env var > config.yaml > default.
    llm_cfg = cfg.get("llm", {}) if cfg else {}

    def pick(cli_val, env_key, cfg_key, default=None):
        if cli_val is not None:
            return cli_val
        if env_key and os.getenv(env_key):
            return os.getenv(env_key)
        return llm_cfg.get(cfg_key, default)

    provider = pick(getattr(args, "llm_provider", None), "LLM_PROVIDER", "provider", "openwebui")
    api_base = pick(getattr(args, "llm_api_base", None), "LLM_API_BASE", "api_base",
                    "http://localhost:3000" if provider == "openwebui" else
                    "http://localhost:11434" if provider == "ollama" else
                    "https://api.openai.com")
    model = pick(getattr(args, "llm_model", None), "LLM_MODEL", "model",
                 "qwen2.5-7b-instruct" if provider == "openwebui" else
                 "llama3.1:8b-instruct" if provider == "ollama" else
                 "gpt-4o-mini")
    api_key = os.getenv("LLM_API_KEY") or (os.getenv("OPENAI_API_KEY") if provider == "openai" else None)
    temperature = float(pick(getattr(args, "temperature", None), "LLM_TEMPERATURE", "temperature", 0.2))
    top_p = float(pick(getattr(args, "top_p", None), "LLM_TOP_P", "top_p", 1.0))
    presence_penalty = float(pick(getattr(args, "presence_penalty", None), "LLM_PRESENCE_PENALTY", "presence_penalty", 0.0))
    frequency_penalty = float(pick(getattr(args, "frequency_penalty", None), "LLM_FREQUENCY_PENALTY", "frequency_penalty", 0.0))
    timeout_seconds = int(pick(getattr(args, "timeout_seconds", None), "LLM_TIMEOUT_SECONDS", "timeout_seconds", 120))
    max_retries = int(pick(getattr(args, "max_retries", None), "LLM_MAX_RETRIES", "max_retries", 2))
    return LLMConfig(provider, api_base, model, api_key, temperature, top_p,
                     presence_penalty, frequency_penalty, timeout_seconds, max_retries)
def chat_completion_llm(messages, llm: LLMConfig):
    if llm.provider == "openwebui":
        url = f"{llm.api_base.rstrip('/')}/api/chat/completions"
    else:
        url = f"{llm.api_base.rstrip('/')}/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    if llm.api_key:
        headers["Authorization"] = f"Bearer {llm.api_key}"
    payload = {"model": llm.model, "messages": messages, "temperature": llm.temperature, "top_p": llm.top_p,
               "presence_penalty": llm.presence_penalty, "frequency_penalty": llm.frequency_penalty, "stream": False}
    r = requests.post(url, headers=headers, json=payload, timeout=llm.timeout_seconds)
    r.raise_for_status()
    return r.json()["choices"][0]["message"]["content"]
def main():
    ap = argparse.ArgumentParser(description="Build directly from DB (Top-N sources)")
    ap.add_argument("--db", default="data/newsletter.db")
    ap.add_argument("--limit", type=int, default=10)
    ap.add_argument("--out", required=True)
    ap.add_argument("--publish", action="store_true")
    # LLM overrides
    ap.add_argument("--llm-provider"); ap.add_argument("--llm-api-base")
    ap.add_argument("--llm-model"); ap.add_argument("--temperature", type=float)
    ap.add_argument("--top-p", type=float); ap.add_argument("--presence-penalty", type=float)
    ap.add_argument("--frequency-penalty", type=float); ap.add_argument("--timeout-seconds", type=int)
    ap.add_argument("--max-retries", type=int)
    args = ap.parse_args()

    cfg = load_config()
    llm = resolve_llm_config(cfg, args)
    con = db_connect(args.db)
    rows = con.execute("SELECT id, url, title, publisher FROM sources ORDER BY id DESC LIMIT ?", (args.limit,)).fetchall()

    prompt_template = (TEMPLATES / "prompt.txt").read_text(encoding="utf-8")
    style_examples = (TEMPLATES / "style_bank.md").read_text(encoding="utf-8").strip()
    prompt_template = prompt_template.replace("{style_examples}", style_examples)
    item_tpl = (TEMPLATES / "item.html.j2").read_text(encoding="utf-8")
    news_tpl = (TEMPLATES / "newsletter.html.j2").read_text(encoding="utf-8")

    blocks = []
    for sid, url, title, publisher in rows:
        body = (con.execute("SELECT content FROM sources WHERE id=?", (sid,)).fetchone()[0]) or ""
        related_hint = ""
        prompt = (prompt_template
                  .replace("{title}", title or url)
                  .replace("{body}", body)
                  .replace("{source_name}", publisher or "Zdroj neuveden")
                  .replace("{related_hint}", related_hint))
        summary = chat_completion_llm([{"role": "user", "content": prompt}], llm)
        sum_id = insert_summary(con, sid, title or url, summary, newsletter_date=cfg["date"], tone_version="v1")
        try:
            # Embed the summary and store it so future builds can surface related items.
            emb_model = os.getenv("EMB_MODEL", cfg["db"]["embed_model"])
            vec = embed_text(summary,
                             os.getenv("EMB_API_BASE", cfg["llm"]["api_base"]),
                             os.getenv("EMB_API_KEY", os.getenv("LLM_API_KEY")),
                             emb_model)
            upsert_embedding(con, "summaries", sum_id, emb_model, vec)
        except Exception:
            pass  # embeddings are optional
        blocks.append(Template(item_tpl).render(title=(title or url), summary=summary))

    newsletter_title = Template(cfg["newsletter_title"]).render(date=cfg["date"])
    newsletter_subtitle = cfg.get("newsletter_subtitle", "")
    html_out = Template(news_tpl).render(newsletter_title=newsletter_title,
                                         newsletter_subtitle=newsletter_subtitle, blocks=blocks)
    outp = pathlib.Path(args.out)
    outp.parent.mkdir(parents=True, exist_ok=True)
    outp.write_text(html_out, encoding="utf-8")
    print(f"Saved: {outp}")

    if args.publish:
        ghost_url = os.getenv("GHOST_ADMIN_API_URL")
        ghost_key = os.getenv("GHOST_ADMIN_API_KEY")
        if ghost_url and ghost_key:
            def ghost_jwt(key: str) -> str:
                key_id, secret = key.split(":")
                iat = int(time.time())
                header = {"alg": "HS256", "kid": key_id, "typ": "JWT"}
                payload = {"iat": iat, "exp": iat + 5 * 60, "aud": "/admin/"}
                return jwt.encode(payload, bytes.fromhex(secret), algorithm="HS256", headers=header)
            token = ghost_jwt(ghost_key)
            payload = {"posts": [{"title": newsletter_title, "html": html_out, "status": "draft",
                                  "tags": [{"name": t} for t in cfg.get("default_tags", [])]}]}
            r = requests.post(f"{ghost_url}/posts/",
                              headers={"Authorization": f"Ghost {token}", "Content-Type": "application/json"},
                              data=json.dumps(payload), timeout=60)
            r.raise_for_status()
            print("Draft:", r.json()["posts"][0]["url"])
        else:
            print("Missing Ghost creds; skipped publish.")

if __name__ == "__main__":
    main()

18
scripts/db_cli.py Normal file

@@ -0,0 +1,18 @@
#!/usr/bin/env python3
import argparse, sys, pathlib
# Allow running as `python scripts/db_cli.py` by making the repo-root `db` module importable.
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parent.parent))
from db import connect, init_db

def main():
    ap = argparse.ArgumentParser(description="DB CLI")
    ap.add_argument("cmd", choices=["init"])
    ap.add_argument("--db", default="data/newsletter.db")
    ap.add_argument("--schema", default="scripts/schema.sql")
    args = ap.parse_args()
    con = connect(args.db)
    if args.cmd == "init":
        init_db(con, args.schema)
        print(f"Initialized schema in {args.db}")

if __name__ == "__main__":
    main()

130
scripts/ingest_list.py Normal file

@@ -0,0 +1,130 @@
#!/usr/bin/env python3
import argparse, sqlite3, re, sys, time
from pathlib import Path
from urllib.parse import urlparse, urlunparse
from datetime import date

import requests, tldextract
from bs4 import BeautifulSoup
from readability import Document as ReadabilityDoc
import trafilatura
from slugify import slugify

def connect(db_path):
    con = sqlite3.connect(db_path)
    con.execute("PRAGMA foreign_keys=ON;")
    return con

def upsert_source(con, url, title=None, publisher=None, date_published=None, content=None, tags=None):
    con.execute(
        """INSERT INTO sources(url, title, publisher, date_published, content)
           VALUES (?,?,?,?,?)
           ON CONFLICT(url) DO UPDATE SET
             title=COALESCE(excluded.title, title),
             publisher=COALESCE(excluded.publisher, publisher),
             date_published=COALESCE(excluded.date_published, date_published),
             content=COALESCE(excluded.content, content)
        """, (url, title, publisher, date_published, content)
    )
    sid = con.execute("SELECT id FROM sources WHERE url=?", (url,)).fetchone()[0]
    con.commit()
    return sid

def normalize_url(u: str) -> str:
    # Default to https and strip tracking parameters (utm_*, fbclid, gclid) and fragments.
    p = urlparse(u.strip())
    if not p.scheme:
        p = p._replace(scheme="https")
    query = re.sub(r'(&|\?)?(utm_[^=&]+|fbclid|gclid)=[^&]*', '', p.query)
    if query.startswith('&'):
        query = query[1:]
    return urlunparse((p.scheme, p.netloc, p.path, p.params, query, ""))

def domain(url: str) -> str:
    ext = tldextract.extract(url)
    return ".".join(part for part in [ext.domain, ext.suffix] if part)

def fetch_readable(url: str, timeout=20):
    # Try trafilatura first, fall back to readability-lxml; return (title, text).
    try:
        r = requests.get(url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
        r.raise_for_status()
        html = r.text
    except Exception:
        return "", ""
    try:
        txt = trafilatura.extract(html, include_comments=False, include_tables=False, favor_recall=True)
        if txt:
            soup = BeautifulSoup(html, "html.parser")
            t = (soup.title.string.strip() if soup.title and soup.title.string else "")
            return t, txt.strip()
    except Exception:
        pass
    try:
        doc = ReadabilityDoc(html)
        content_html = doc.summary()
        soup = BeautifulSoup(content_html, "html.parser")
        txt = soup.get_text(separator="\n").strip()
        return (doc.short_title() or "").strip(), txt
    except Exception:
        return "", ""

def write_stub(entries_dir: Path, title: str, url: str, source_name: str | None, text: str | None):
    entries_dir.mkdir(parents=True, exist_ok=True)
    slug = slugify(title or domain(url) or "clanek")[:60] or "clanek"
    path = entries_dir / f"{slug}.md"
    body = text.strip() if text else ""
    # Escape double quotes so the YAML front matter stays valid.
    safe_title = (title or "").replace('"', '\\"')
    safe_source = (source_name or domain(url) or "").replace('"', '\\"')
    fm = f'''---
title: "{safe_title}"
source_name: "{safe_source}"
url: "{url}"
tags: []
status: "todo"
---

{body}
'''
    path.write_text(fm, encoding="utf-8")
    return path

def read_lines(source_path: str | None):
    if source_path:
        return Path(source_path).read_text(encoding="utf-8", errors="ignore").splitlines()
    return sys.stdin.read().splitlines()

def main():
    ap = argparse.ArgumentParser(description="Ingest list of URLs into SQLite and optional entry stubs")
    ap.add_argument("--db", default="data/newsletter.db")
    ap.add_argument("--list", help="Text file with URLs (one per line). If omitted, read from STDIN.")
    ap.add_argument("--fetch", action="store_true", help="Fetch & extract readable text")
    ap.add_argument("--sleep", type=float, default=0.0)
    ap.add_argument("--stubs", action="store_true")
    ap.add_argument("--date", default=date.today().isoformat())
    args = ap.parse_args()

    con = connect(args.db)
    lines = read_lines(args.list)
    urls = []
    for ln in lines:
        ln = ln.strip()
        if not ln or ln.startswith("#"):
            continue
        if re.match(r"^\w+://", ln) or re.match(r"^[\w\.-]+\.[a-z]{2,}(/|$)", ln):
            urls.append(normalize_url(ln))
    # De-duplicate while preserving order.
    seen = set()
    urls = [u for u in urls if not (u in seen or seen.add(u))]

    stubs_dir = Path(f"entries/{args.date}") if args.stubs else None
    kept = 0
    for url in urls:
        pub = domain(url)
        title, text = "", ""
        if args.fetch:
            title, text = fetch_readable(url)
        sid = upsert_source(con, url=url, title=(title or None), publisher=pub, content=(text or None))
        kept += 1
        if stubs_dir:
            stub = write_stub(stubs_dir, title or url, url, pub, text)
            print(f"Stub: {stub}")
        if args.sleep:
            time.sleep(args.sleep)
    print(f"Ingested: {kept} URLs into {args.db}")

if __name__ == "__main__":
    main()

71
scripts/schema.sql Normal file

@@ -0,0 +1,71 @@
PRAGMA foreign_keys = ON;

CREATE TABLE IF NOT EXISTS sources (
  id INTEGER PRIMARY KEY,
  url TEXT UNIQUE,
  title TEXT,
  publisher TEXT,
  date_published TEXT,
  content TEXT
);

CREATE TABLE IF NOT EXISTS summaries (
  id INTEGER PRIMARY KEY,
  source_id INTEGER REFERENCES sources(id) ON DELETE SET NULL,
  title TEXT NOT NULL,
  summary TEXT NOT NULL,
  newsletter_date TEXT,
  tone_version TEXT,
  created_at TEXT DEFAULT (datetime('now'))
);

CREATE TABLE IF NOT EXISTS tags (
  id INTEGER PRIMARY KEY,
  name TEXT UNIQUE
);

CREATE TABLE IF NOT EXISTS source_tags (
  source_id INTEGER REFERENCES sources(id) ON DELETE CASCADE,
  tag_id INTEGER REFERENCES tags(id) ON DELETE CASCADE,
  PRIMARY KEY(source_id, tag_id)
);

CREATE TABLE IF NOT EXISTS embeddings (
  id INTEGER PRIMARY KEY,
  ref_table TEXT NOT NULL CHECK(ref_table IN ('sources','summaries')),
  ref_id INTEGER NOT NULL,
  model TEXT NOT NULL,
  dim INTEGER NOT NULL,
  vec BLOB NOT NULL,
  UNIQUE(ref_table, ref_id, model)
);

-- External-content FTS5 tables do not update themselves; the triggers below
-- mirror every insert/update/delete (the 'delete' command removes the old row).
CREATE VIRTUAL TABLE IF NOT EXISTS sources_fts USING fts5(
  title, content, content='sources', content_rowid='id'
);
CREATE VIRTUAL TABLE IF NOT EXISTS summaries_fts USING fts5(
  title, summary, content='summaries', content_rowid='id'
);

CREATE TRIGGER IF NOT EXISTS sources_ai AFTER INSERT ON sources BEGIN
  INSERT INTO sources_fts(rowid, title, content) VALUES (new.id, new.title, new.content);
END;
CREATE TRIGGER IF NOT EXISTS sources_au AFTER UPDATE ON sources BEGIN
  INSERT INTO sources_fts(sources_fts, rowid, title, content)
  VALUES('delete', old.id, old.title, old.content);
  INSERT INTO sources_fts(rowid, title, content) VALUES (new.id, new.title, new.content);
END;
CREATE TRIGGER IF NOT EXISTS sources_ad AFTER DELETE ON sources BEGIN
  INSERT INTO sources_fts(sources_fts, rowid, title, content) VALUES('delete', old.id, old.title, old.content);
END;

CREATE TRIGGER IF NOT EXISTS summaries_ai AFTER INSERT ON summaries BEGIN
  INSERT INTO summaries_fts(rowid, title, summary) VALUES (new.id, new.title, new.summary);
END;
CREATE TRIGGER IF NOT EXISTS summaries_au AFTER UPDATE ON summaries BEGIN
  INSERT INTO summaries_fts(summaries_fts, rowid, title, summary)
  VALUES('delete', old.id, old.title, old.summary);
  INSERT INTO summaries_fts(rowid, title, summary) VALUES (new.id, new.title, new.summary);
END;
CREATE TRIGGER IF NOT EXISTS summaries_ad AFTER DELETE ON summaries BEGIN
  INSERT INTO summaries_fts(summaries_fts, rowid, title, summary) VALUES('delete', old.id, old.title, old.summary);
END;

38
scripts/sync_entries_to_db.py Normal file

@@ -0,0 +1,38 @@
#!/usr/bin/env python3
import argparse, re, sys, pathlib
from pathlib import Path
import yaml
# Allow running as `python scripts/sync_entries_to_db.py` by making the repo-root `db` module importable.
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parent.parent))
from db import connect, upsert_source

def parse_front_matter(text: str):
    m = re.match(r"^---\n(.*?)\n---\n(.*)$", text, flags=re.S | re.M)
    if not m:
        return {}, text.strip()
    fm = yaml.safe_load(m.group(1)) or {}
    body = m.group(2).strip()
    return fm, body

def main():
    ap = argparse.ArgumentParser(description="Sync entries/*.md into SQLite sources")
    ap.add_argument("--db", default="data/newsletter.db")
    ap.add_argument("--dir", required=True, help="entries/YYYY-MM-DD directory")
    args = ap.parse_args()
    con = connect(args.db)
    for p in Path(args.dir).glob("*.md"):
        text = p.read_text(encoding="utf-8")
        fm, body = parse_front_matter(text)
        title = fm.get("title") or p.stem
        url = fm.get("url")
        publisher = fm.get("source_name")
        if not url:
            # The sources table upserts on url; without one there is nothing stable to match.
            print(f"Skipped (no url): {p.name}")
            continue
        upsert_source(con,
                      url=url,
                      title=title,
                      publisher=publisher,
                      date_published=None,
                      content=body)
        print(f"Synced: {p.name}")

if __name__ == "__main__":
    main()

2
templates/item.html.j2 Normal file

@@ -0,0 +1,2 @@
<h3><strong>{{ title }}</strong></h3>
<p>{{ summary }}</p>

7
templates/newsletter.html.j2 Normal file

@@ -0,0 +1,7 @@
<h2>{{ newsletter_title }}</h2>
<p><em>{{ newsletter_subtitle }}</em></p>
<hr>
{% for b in blocks %}
{{ b }}
{% if not loop.last %}<hr>{% endif %}
{% endfor %}

24
templates/prompt.txt Normal file

@@ -0,0 +1,24 @@
You are the editor of a newsletter about archaeology, history, and related sciences. Write in Czech, clearly for lay readers, factually and without jargon; one sentence may be mildly catchy, but never tabloid. Keep to 3–4 sentences.
CONTENT TEMPLATE:
1) What the news/discovery is.
2) Why it matters (impact, changed perspective, method).
3) Context or a point of interest (culture, dating, connections).
4) (optional) One sentence referring to a previous related piece, if one is available.
End with: „Zdroj: {source_name}“.
=== STYLE SAMPLES ===
{style_examples}
=== INPUT ITEM ===
TITLE: {title}
TEXT:
{body}
=== RELATED ITEMS (OPTIONAL) ===
{related_hint}
=== INSTRUCTIONS ===
- Keep to 3–4 sentences.
- If the details are incomplete, err on the side of caution and do not invent anything.
- End exactly with: „Zdroj: {source_name}“.

2
templates/style_bank.md Normal file

@@ -0,0 +1,2 @@
- Welsh rainforests: Great Britain is home to unique remnants of temperate rainforest that once stretched from Wales up to Scotland. Only about 10 % of the original area survives today, but protection and restoration programmes are under way. The forests also appear in mythology, for instance in the Fourth Branch of the Welsh epic Mabinogi, where they are linked to magical animals. Controlled grazing, which our ancestors used to keep them healthy, is still practised today.
- Hunter-gatherers' underwater structures: Scientists have discovered a kilometre-long stone alignment beneath the Baltic Sea, probably a prehistoric hunting structure. The find points to sophisticated organisation of hunting and adaptation to a landscape later flooded by the sea. Similar constructions are known from other parts of the world, but they are rare in Europe.