First commit

commit 079bd25899 (2025-09-14 16:56:08 +02:00)
19 changed files with 976 additions and 0 deletions

scripts/build_from_db.py (new file, 154 lines)

@@ -0,0 +1,154 @@
#!/usr/bin/env python3
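# Build the newsletter HTML directly from the SQLite DB: take the N most recent sources,
# summarize each with an LLM via an OpenAI-compatible chat endpoint, store the summaries
# (plus optional embeddings), render the Jinja2 templates, and optionally publish a Ghost draft.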
import os, sys, time, json, argparse, pathlib
import yaml, requests, jwt
from jinja2 import Template
from dotenv import load_dotenv
from datetime import date
from db import connect as db_connect, insert_summary, upsert_embedding
from emb import embed_text
load_dotenv()
ROOT = pathlib.Path(__file__).parent
REPO = ROOT.parent
TEMPLATES = REPO / "templates"
def read_file(p): return pathlib.Path(p).read_text(encoding="utf-8")
def load_config():
cfg = yaml.safe_load(read_file(REPO / "config.yaml"))
cfg["date"] = date.today().isoformat()
return cfg
from dataclasses import dataclass
@dataclass
class LLMConfig:
provider: str
api_base: str
model: str
api_key: str | None
temperature: float
top_p: float
presence_penalty: float
frequency_penalty: float
timeout_seconds: int
max_retries: int
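# Resolve LLM settings with precedence: CLI flag > environment variable > config.yaml > built-in default.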
def resolve_llm_config(cfg: dict, args) -> LLMConfig:
llm_cfg = cfg.get("llm", {}) if cfg else {}
def pick(cli_val, env_key, cfg_key, default=None):
if cli_val is not None:
return cli_val
if env_key and os.getenv(env_key):
return os.getenv(env_key)
return llm_cfg.get(cfg_key, default)
provider = pick(getattr(args, "llm_provider", None), "LLM_PROVIDER", "provider", "openwebui")
api_base = pick(getattr(args, "llm_api_base", None), "LLM_API_BASE", "api_base",
"http://localhost:3000" if provider=="openwebui" else
"http://localhost:11434" if provider=="ollama" else
"https://api.openai.com")
model = pick(getattr(args, "llm_model", None), "LLM_MODEL", "model",
"qwen2.5-7b-instruct" if provider=="openwebui" else
"llama3.1:8b-instruct" if provider=="ollama" else
"gpt-4o-mini")
api_key = os.getenv("LLM_API_KEY") or (os.getenv("OPENAI_API_KEY") if provider=="openai" else None)
temperature = float(pick(getattr(args, "temperature", None), "LLM_TEMPERATURE", "temperature", 0.2))
top_p = float(pick(getattr(args, "top_p", None), "LLM_TOP_P", "top_p", 1.0))
presence_penalty = float(pick(getattr(args, "presence_penalty", None), "LLM_PRESENCE_PENALTY", "presence_penalty", 0.0))
frequency_penalty = float(pick(getattr(args, "frequency_penalty", None), "LLM_FREQUENCY_PENALTY", "frequency_penalty", 0.0))
timeout_seconds = int(pick(getattr(args, "timeout_seconds", None), "LLM_TIMEOUT_SECONDS", "timeout_seconds", 120))
max_retries = int(pick(getattr(args, "max_retries", None), "LLM_MAX_RETRIES", "max_retries", 2))
return LLMConfig(provider, api_base, model, api_key, temperature, top_p, presence_penalty, frequency_penalty, timeout_seconds, max_retries)
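# All three providers (Open WebUI, Ollama, OpenAI) expose an OpenAI-compatible
# chat-completions endpoint; only the URL path differs.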
def chat_completion_llm(messages, llm: LLMConfig):
if llm.provider == "openwebui":
url = f"{llm.api_base.rstrip('/')}/api/chat/completions"
elif llm.provider == "ollama":
url = f"{llm.api_base.rstrip('/')}/v1/chat/completions"
else:
url = f"{llm.api_base.rstrip('/')}/v1/chat/completions"
headers = {"Content-Type":"application/json"}
if llm.api_key: headers["Authorization"] = f"Bearer {llm.api_key}"
payload = {"model": llm.model, "messages": messages, "temperature": llm.temperature, "top_p": llm.top_p,
"presence_penalty": llm.presence_penalty, "frequency_penalty": llm.frequency_penalty, "stream": False}
r = requests.post(url, headers=headers, json=payload, timeout=llm.timeout_seconds)
r.raise_for_status()
return r.json()["choices"][0]["message"]["content"]
def main():
ap = argparse.ArgumentParser(description="Build directly from DB (Top-N sources)")
ap.add_argument("--db", default="data/newsletter.db")
ap.add_argument("--limit", type=int, default=10)
ap.add_argument("--out", required=True)
ap.add_argument("--publish", action="store_true")
# LLM overrides
ap.add_argument("--llm-provider"); ap.add_argument("--llm-api-base")
ap.add_argument("--llm-model"); ap.add_argument("--temperature", type=float)
ap.add_argument("--top-p", type=float); ap.add_argument("--presence-penalty", type=float)
ap.add_argument("--frequency-penalty", type=float); ap.add_argument("--timeout-seconds", type=int)
ap.add_argument("--max-retries", type=int)
args = ap.parse_args()
cfg = load_config()
llm = resolve_llm_config(cfg, args)
con = db_connect(args.db)
rows = con.execute("SELECT id, url, title, publisher FROM sources ORDER BY id DESC LIMIT ?", (args.limit,)).fetchall()
prompt_template = (TEMPLATES / "prompt.txt").read_text(encoding="utf-8")
style_examples = (TEMPLATES / "style_bank.md").read_text(encoding="utf-8").strip()
prompt_template = prompt_template.replace("{style_examples}", style_examples)
item_tpl = (TEMPLATES / "item.html.j2").read_text(encoding="utf-8")
news_tpl = (TEMPLATES / "newsletter.html.j2").read_text(encoding="utf-8")
blocks = []
for sid, url, title, publisher in rows:
body = (con.execute("SELECT content FROM sources WHERE id=?", (sid,)).fetchone()[0]) or ""
related_hint = ""
prompt = (prompt_template
.replace("{title}", title or url)
.replace("{body}", body)
.replace("{source_name}", publisher or "Zdroj neuveden")
.replace("{related_hint}", related_hint))
summary = chat_completion_llm([{"role":"user","content": prompt}], llm)
sum_id = insert_summary(con, sid, title or url, summary, newsletter_date=cfg["date"], tone_version="v1")
try:
vec = embed_text(summary, os.getenv("EMB_API_BASE", cfg["llm"]["api_base"]), os.getenv("EMB_API_KEY", os.getenv("LLM_API_KEY")), os.getenv("EMB_MODEL", cfg["db"]["embed_model"]))
upsert_embedding(con, "summaries", sum_id, os.getenv("EMB_MODEL", cfg["db"]["embed_model"]), vec)
except Exception:
pass
blocks.append(Template(item_tpl).render(title=(title or url), summary=summary))
newsletter_title = Template(cfg["newsletter_title"]).render(date=cfg["date"])
newsletter_subtitle = cfg.get("newsletter_subtitle","")
html_out = Template(news_tpl).render(newsletter_title=newsletter_title, newsletter_subtitle=newsletter_subtitle, blocks=blocks)
outp = pathlib.Path(args.out); outp.parent.mkdir(parents=True, exist_ok=True); outp.write_text(html_out, encoding="utf-8")
print(f"Saved: {outp}")
if args.publish:
ghost_url = os.getenv("GHOST_ADMIN_API_URL")
ghost_key = os.getenv("GHOST_ADMIN_API_KEY")
if ghost_url and ghost_key:
def ghost_jwt(key: str) -> str:
key_id, secret = key.split(':')
iat = int(time.time())
header = {"alg": "HS256", "kid": key_id, "typ": "JWT"}
payload = {"iat": iat, "exp": iat + 5 * 60, "aud": "/admin/"}
import jwt
return jwt.encode(payload, bytes.fromhex(secret), algorithm='HS256', headers=header)
token = ghost_jwt(ghost_key)
payload = {"posts":[{"title": newsletter_title, "html": html_out, "status": "draft", "tags": [{"name": t} for t in cfg.get("default_tags",[])]}]}
r = requests.post(f"{ghost_url}/posts/", headers={"Authorization": f"Ghost {token}", "Content-Type": "application/json"}, data=json.dumps(payload), timeout=60)
r.raise_for_status()
print("Draft:", r.json()["posts"][0]["url"])
else:
print("Missing Ghost creds; skipped publish.")
if __name__ == "__main__":
main()

scripts/db_cli.py (new file, 18 lines)

@@ -0,0 +1,18 @@
#!/usr/bin/env python3
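# Minimal CLI: `python scripts/db_cli.py init` creates the SQLite schema from scripts/schema.sql.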
import argparse
from db import connect, init_db
def main():
ap = argparse.ArgumentParser(description="DB CLI")
ap.add_argument("cmd", choices=["init"])
ap.add_argument("--db", default="data/newsletter.db")
ap.add_argument("--schema", default="scripts/schema.sql")
args = ap.parse_args()
con = connect(args.db)
if args.cmd == "init":
init_db(con, args.schema)
print(f"Initialized schema in {args.db}")
if __name__ == "__main__":
main()

scripts/ingest_list.py (new file, 130 lines)

@@ -0,0 +1,130 @@
#!/usr/bin/env python3
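# Ingest a plain-text list of URLs (from a file or STDIN) into the sources table; optionally
# fetch and extract readable article text and write Markdown stubs under entries/<date>/.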
import argparse, sqlite3, re, sys, time
from pathlib import Path
from urllib.parse import urlparse, urlunparse
import requests, tldextract
from bs4 import BeautifulSoup
from readability import Document as ReadabilityDoc
import trafilatura
from slugify import slugify
from datetime import date
def connect(db_path):
con = sqlite3.connect(db_path)
con.execute("PRAGMA foreign_keys=ON;")
return con
def upsert_source(con, url, title=None, publisher=None, date_published=None, content=None, tags=None):
con.execute(
"""INSERT INTO sources(url, title, publisher, date_published, content)
VALUES (?,?,?,?,?)
ON CONFLICT(url) DO UPDATE SET
title=COALESCE(excluded.title, title),
publisher=COALESCE(excluded.publisher, publisher),
date_published=COALESCE(excluded.date_published, date_published),
content=COALESCE(excluded.content, content)
""", (url, title, publisher, date_published, content)
)
sid = con.execute("SELECT id FROM sources WHERE url=?", (url,)).fetchone()[0]
con.commit()
return sid
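# Normalize a URL: default to https, strip common tracking parameters (utm_*, fbclid, gclid) and the fragment.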
def normalize_url(u: str) -> str:
p = urlparse(u.strip())
if not p.scheme:
p = p._replace(scheme="https")
query = re.sub(r'(&|\?)?(utm_[^=&]+|fbclid|gclid)=[^&]*', '', p.query)
if query.startswith('&'): query = query[1:]
return urlunparse((p.scheme, p.netloc, p.path, p.params, query, ""))
def domain(url: str) -> str:
ext = tldextract.extract(url)
return ".".join(part for part in [ext.domain, ext.suffix] if part)
def fetch_readable(url: str, timeout=20):
try:
r = requests.get(url, timeout=timeout, headers={"User-Agent":"Mozilla/5.0"})
r.raise_for_status()
html = r.text
except Exception:
return "", ""
try:
txt = trafilatura.extract(html, include_comments=False, include_tables=False, favor_recall=True)
if txt:
soup = BeautifulSoup(html, "html.parser")
t = (soup.title.string.strip() if soup.title and soup.title.string else "")
return t, txt.strip()
except Exception:
pass
try:
doc = ReadabilityDoc(html)
content_html = doc.summary()
soup = BeautifulSoup(content_html, "html.parser")
txt = soup.get_text(separator="\n").strip()
return (doc.short_title() or "").strip(), txt
except Exception:
return "", ""
def write_stub(entries_dir: Path, title: str, url: str, source_name: str|None, text: str|None):
entries_dir.mkdir(parents=True, exist_ok=True)
from datetime import datetime
slug = slugify(title or domain(url) or "clanek")[:60] or "clanek"
path = entries_dir / f"{slug}.md"
body = text.strip() if text else ""
fm = f'''---
title: "{(title or "").replace('"',"\"")}"
source_name: "{(source_name or domain(url) or "").replace('"',"\"")}"
url: "{url}"
tags: []
status: "todo"
---
{body}
'''
path.write_text(fm, encoding="utf-8")
return path
def read_lines(source_path: str|None):
if source_path:
return Path(source_path).read_text(encoding="utf-8", errors="ignore").splitlines()
return sys.stdin.read().splitlines()
def main():
ap = argparse.ArgumentParser(description="Ingest list of URLs into SQLite and optional entry stubs")
ap.add_argument("--db", default="data/newsletter.db")
ap.add_argument("--list", help="Text file with URLs (one per line). If omitted, read from STDIN.")
ap.add_argument("--fetch", action="store_true", help="Fetch & extract readable text")
ap.add_argument("--sleep", type=float, default=0.0)
ap.add_argument("--stubs", action="store_true")
ap.add_argument("--date", default=date.today().isoformat())
args = ap.parse_args()
con = connect(args.db)
lines = read_lines(args.list)
urls = []
for ln in lines:
ln = ln.strip()
if not ln or ln.startswith("#"):
continue
if re.match(r"^\w+://", ln) or re.match(r"^[\w\.-]+\.[a-z]{2,}(/|$)", ln):
urls.append(normalize_url(ln))
seen = set(); urls = [u for u in urls if not (u in seen or seen.add(u))]
stubs_dir = Path(f"entries/{args.date}") if args.stubs else None
kept = 0
for url in urls:
pub = domain(url)
title, text = ("","")
if args.fetch:
title, text = fetch_readable(url)
sid = upsert_source(con, url=url, title=(title or None), publisher=pub, content=(text or None))
kept += 1
if stubs_dir:
stub = write_stub(stubs_dir, title or url, url, pub, text)
print(f"Stub: {stub}")
if args.sleep: time.sleep(args.sleep)
print(f"Ingested: {kept} URLs into {args.db}")
if __name__ == "__main__":
main()

scripts/schema.sql (new file, 71 lines)

@@ -0,0 +1,71 @@
PRAGMA foreign_keys = ON;
CREATE TABLE IF NOT EXISTS sources (
id INTEGER PRIMARY KEY,
url TEXT UNIQUE,
title TEXT,
publisher TEXT,
date_published TEXT,
content TEXT
);
CREATE TABLE IF NOT EXISTS summaries (
id INTEGER PRIMARY KEY,
source_id INTEGER REFERENCES sources(id) ON DELETE SET NULL,
title TEXT NOT NULL,
summary TEXT NOT NULL,
newsletter_date TEXT,
tone_version TEXT,
created_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS tags (
id INTEGER PRIMARY KEY,
name TEXT UNIQUE
);
CREATE TABLE IF NOT EXISTS source_tags (
source_id INTEGER REFERENCES sources(id) ON DELETE CASCADE,
tag_id INTEGER REFERENCES tags(id) ON DELETE CASCADE,
PRIMARY KEY(source_id, tag_id)
);
CREATE TABLE IF NOT EXISTS embeddings (
id INTEGER PRIMARY KEY,
ref_table TEXT NOT NULL CHECK(ref_table IN ('sources','summaries')),
ref_id INTEGER NOT NULL,
model TEXT NOT NULL,
dim INTEGER NOT NULL,
vec BLOB NOT NULL,
UNIQUE(ref_table, ref_id, model)
);
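-- Full-text search: FTS5 external-content tables mirror sources/summaries,
-- and the triggers below keep them in sync on INSERT, UPDATE, and DELETE.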
CREATE VIRTUAL TABLE IF NOT EXISTS sources_fts USING fts5(
title, content, content='sources', content_rowid='id'
);
CREATE VIRTUAL TABLE IF NOT EXISTS summaries_fts USING fts5(
title, summary, content='summaries', content_rowid='id'
);
CREATE TRIGGER IF NOT EXISTS sources_ai AFTER INSERT ON sources BEGIN
INSERT INTO sources_fts(rowid, title, content) VALUES (new.id, new.title, new.content);
END;
CREATE TRIGGER IF NOT EXISTS sources_au AFTER UPDATE ON sources BEGIN
INSERT INTO sources_fts(sources_fts, rowid, title, content)
VALUES('delete', old.id, old.title, old.content);
INSERT INTO sources_fts(rowid, title, content) VALUES (new.id, new.title, new.content);
END;
CREATE TRIGGER IF NOT EXISTS sources_ad AFTER DELETE ON sources BEGIN
INSERT INTO sources_fts(sources_fts, rowid, title, content) VALUES('delete', old.id, old.title, old.content);
END;
CREATE TRIGGER IF NOT EXISTS summaries_ai AFTER INSERT ON summaries BEGIN
INSERT INTO summaries_fts(rowid, title, summary) VALUES (new.id, new.title, new.summary);
END;
CREATE TRIGGER IF NOT EXISTS summaries_au AFTER UPDATE ON summaries BEGIN
INSERT INTO summaries_fts(summaries_fts, rowid, title, summary)
VALUES('delete', old.id, old.title, old.summary);
INSERT INTO summaries_fts(rowid, title, summary) VALUES (new.id, new.title, new.summary);
END;
CREATE TRIGGER IF NOT EXISTS summaries_ad AFTER DELETE ON summaries BEGIN
INSERT INTO summaries_fts(summaries_fts, rowid, title, summary) VALUES('delete', old.id, old.title, old.summary);
END;


@@ -0,0 +1,38 @@
#!/usr/bin/env python3
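# Sync entries/<date>/*.md into SQLite: parse the YAML front matter, use the Markdown
# body as content, and upsert the result into the sources table via db.upsert_source.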
import argparse, re
from pathlib import Path
import yaml
from db import connect, upsert_source
def parse_front_matter(text: str):
m = re.match(r"^---\n(.*?)\n---\n(.*)$", text, flags=re.S|re.M)
if not m:
return {}, text.strip()
fm = yaml.safe_load(m.group(1)) or {}
body = m.group(2).strip()
return fm, body
def main():
ap = argparse.ArgumentParser(description="Sync entries/*.md into SQLite sources")
ap.add_argument("--db", default="data/newsletter.db")
ap.add_argument("--dir", required=True, help="entries/YYYY-MM-DD directory")
args = ap.parse_args()
con = connect(args.db)
for p in Path(args.dir).glob("*.md"):
text = p.read_text(encoding="utf-8")
fm, body = parse_front_matter(text)
title = fm.get("title") or p.stem
url = fm.get("url")
publisher = fm.get("source_name")
upsert_source(con,
url=url,
title=title,
publisher=publisher,
date_published=None,
content=body
)
print(f"Synced: {p.name}")
if __name__ == "__main__":
main()