First commit
154
scripts/build_from_db.py
Normal file
@@ -0,0 +1,154 @@
#!/usr/bin/env python3
import os, time, json, argparse, pathlib
from dataclasses import dataclass
from datetime import date

import yaml, requests, jwt
from jinja2 import Template
from dotenv import load_dotenv

from db import connect as db_connect, insert_summary, upsert_embedding
from emb import embed_text

load_dotenv()
ROOT = pathlib.Path(__file__).parent
REPO = ROOT.parent
TEMPLATES = REPO / "templates"


def read_file(p): return pathlib.Path(p).read_text(encoding="utf-8")


def load_config():
    cfg = yaml.safe_load(read_file(REPO / "config.yaml"))
    cfg["date"] = date.today().isoformat()
    return cfg
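
# config.yaml is expected to provide at least the keys read elsewhere in this
# commit (a sketch; the values shown are illustrative, not shipped defaults):
#   newsletter_title: "Newsletter {{ date }}"
#   newsletter_subtitle: ""
#   default_tags: []
#   llm:
#     provider: openwebui
#     api_base: http://localhost:3000
#     model: qwen2.5-7b-instruct
#   db:
#     embed_model: some-embedding-model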


@dataclass
class LLMConfig:
    provider: str
    api_base: str
    model: str
    api_key: str | None
    temperature: float
    top_p: float
    presence_penalty: float
    frequency_penalty: float
    timeout_seconds: int
    max_retries: int


def resolve_llm_config(cfg: dict, args) -> LLMConfig:
    llm_cfg = cfg.get("llm", {}) if cfg else {}

    def pick(cli_val, env_key, cfg_key, default=None):
        # Precedence: CLI flag > environment variable > config.yaml > built-in default.
        if cli_val is not None:
            return cli_val
        if env_key and os.getenv(env_key):
            return os.getenv(env_key)
        return llm_cfg.get(cfg_key, default)

    provider = pick(getattr(args, "llm_provider", None), "LLM_PROVIDER", "provider", "openwebui")
    api_base = pick(getattr(args, "llm_api_base", None), "LLM_API_BASE", "api_base",
                    "http://localhost:3000" if provider == "openwebui" else
                    "http://localhost:11434" if provider == "ollama" else
                    "https://api.openai.com")
    model = pick(getattr(args, "llm_model", None), "LLM_MODEL", "model",
                 "qwen2.5-7b-instruct" if provider == "openwebui" else
                 "llama3.1:8b-instruct" if provider == "ollama" else
                 "gpt-4o-mini")
    api_key = os.getenv("LLM_API_KEY") or (os.getenv("OPENAI_API_KEY") if provider == "openai" else None)
    temperature = float(pick(getattr(args, "temperature", None), "LLM_TEMPERATURE", "temperature", 0.2))
    top_p = float(pick(getattr(args, "top_p", None), "LLM_TOP_P", "top_p", 1.0))
    presence_penalty = float(pick(getattr(args, "presence_penalty", None), "LLM_PRESENCE_PENALTY", "presence_penalty", 0.0))
    frequency_penalty = float(pick(getattr(args, "frequency_penalty", None), "LLM_FREQUENCY_PENALTY", "frequency_penalty", 0.0))
    timeout_seconds = int(pick(getattr(args, "timeout_seconds", None), "LLM_TIMEOUT_SECONDS", "timeout_seconds", 120))
    max_retries = int(pick(getattr(args, "max_retries", None), "LLM_MAX_RETRIES", "max_retries", 2))
    return LLMConfig(provider, api_base, model, api_key, temperature, top_p,
                     presence_penalty, frequency_penalty, timeout_seconds, max_retries)
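
# For example (hypothetical environment): with LLM_PROVIDER=ollama exported and
# no CLI flags given, resolve_llm_config() falls back to api_base
# "http://localhost:11434" and model "llama3.1:8b-instruct", per the defaults above.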


def chat_completion_llm(messages, llm: LLMConfig):
    # All three providers expose an OpenAI-compatible chat completions route;
    # Open WebUI serves it under /api, Ollama and OpenAI under /v1.
    if llm.provider == "openwebui":
        url = f"{llm.api_base.rstrip('/')}/api/chat/completions"
    else:
        url = f"{llm.api_base.rstrip('/')}/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    if llm.api_key:
        headers["Authorization"] = f"Bearer {llm.api_key}"
    payload = {"model": llm.model, "messages": messages,
               "temperature": llm.temperature, "top_p": llm.top_p,
               "presence_penalty": llm.presence_penalty,
               "frequency_penalty": llm.frequency_penalty,
               "stream": False}
    r = requests.post(url, headers=headers, json=payload, timeout=llm.timeout_seconds)
    r.raise_for_status()
    return r.json()["choices"][0]["message"]["content"]
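
# llm.max_retries is resolved above but not consulted by chat_completion_llm
# itself. A minimal retry sketch (an editorial assumption, not behavior this
# commit implements):
def chat_completion_with_retry(messages, llm: LLMConfig):
    # Retry transient HTTP failures up to llm.max_retries times with exponential backoff.
    for attempt in range(llm.max_retries + 1):
        try:
            return chat_completion_llm(messages, llm)
        except requests.RequestException:
            if attempt == llm.max_retries:
                raise
            time.sleep(2 ** attempt)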


def main():
    ap = argparse.ArgumentParser(description="Build directly from the DB (top-N sources)")
    ap.add_argument("--db", default="data/newsletter.db")
    ap.add_argument("--limit", type=int, default=10)
    ap.add_argument("--out", required=True)
    ap.add_argument("--publish", action="store_true")
    # LLM overrides
    ap.add_argument("--llm-provider")
    ap.add_argument("--llm-api-base")
    ap.add_argument("--llm-model")
    ap.add_argument("--temperature", type=float)
    ap.add_argument("--top-p", type=float)
    ap.add_argument("--presence-penalty", type=float)
    ap.add_argument("--frequency-penalty", type=float)
    ap.add_argument("--timeout-seconds", type=int)
    ap.add_argument("--max-retries", type=int)
    args = ap.parse_args()

    cfg = load_config()
    llm = resolve_llm_config(cfg, args)
    con = db_connect(args.db)

    rows = con.execute("SELECT id, url, title, publisher FROM sources ORDER BY id DESC LIMIT ?",
                       (args.limit,)).fetchall()

    prompt_template = (TEMPLATES / "prompt.txt").read_text(encoding="utf-8")
    style_examples = (TEMPLATES / "style_bank.md").read_text(encoding="utf-8").strip()
    prompt_template = prompt_template.replace("{style_examples}", style_examples)

    item_tpl = (TEMPLATES / "item.html.j2").read_text(encoding="utf-8")
    news_tpl = (TEMPLATES / "newsletter.html.j2").read_text(encoding="utf-8")

    blocks = []
    for sid, url, title, publisher in rows:
        body = con.execute("SELECT content FROM sources WHERE id=?", (sid,)).fetchone()[0] or ""
        related_hint = "—"
        prompt = (prompt_template
                  .replace("{title}", title or url)
                  .replace("{body}", body)
                  .replace("{source_name}", publisher or "Zdroj neuveden")  # Czech: "source not given"
                  .replace("{related_hint}", related_hint))
        summary = chat_completion_llm([{"role": "user", "content": prompt}], llm)

        sum_id = insert_summary(con, sid, title or url, summary, newsletter_date=cfg["date"], tone_version="v1")
        try:
            vec = embed_text(summary,
                             os.getenv("EMB_API_BASE", cfg["llm"]["api_base"]),
                             os.getenv("EMB_API_KEY", os.getenv("LLM_API_KEY")),
                             os.getenv("EMB_MODEL", cfg["db"]["embed_model"]))
            upsert_embedding(con, "summaries", sum_id, os.getenv("EMB_MODEL", cfg["db"]["embed_model"]), vec)
        except Exception:
            pass  # embeddings are best-effort; a summary without a vector is still usable

        blocks.append(Template(item_tpl).render(title=(title or url), summary=summary))

    newsletter_title = Template(cfg["newsletter_title"]).render(date=cfg["date"])
    newsletter_subtitle = cfg.get("newsletter_subtitle", "")
    html_out = Template(news_tpl).render(newsletter_title=newsletter_title,
                                         newsletter_subtitle=newsletter_subtitle,
                                         blocks=blocks)

    outp = pathlib.Path(args.out)
    outp.parent.mkdir(parents=True, exist_ok=True)
    outp.write_text(html_out, encoding="utf-8")
    print(f"Saved: {outp}")

    if args.publish:
        ghost_url = os.getenv("GHOST_ADMIN_API_URL")
        ghost_key = os.getenv("GHOST_ADMIN_API_KEY")
        if ghost_url and ghost_key:
            def ghost_jwt(key: str) -> str:
                # Ghost Admin API keys look like "<id>:<hex-secret>"; the token
                # must be short-lived and audience-scoped to /admin/.
                key_id, secret = key.split(':')
                iat = int(time.time())
                header = {"alg": "HS256", "kid": key_id, "typ": "JWT"}
                payload = {"iat": iat, "exp": iat + 5 * 60, "aud": "/admin/"}
                return jwt.encode(payload, bytes.fromhex(secret), algorithm='HS256', headers=header)

            token = ghost_jwt(ghost_key)
            payload = {"posts": [{"title": newsletter_title, "html": html_out, "status": "draft",
                                  "tags": [{"name": t} for t in cfg.get("default_tags", [])]}]}
            r = requests.post(f"{ghost_url}/posts/",
                              headers={"Authorization": f"Ghost {token}", "Content-Type": "application/json"},
                              data=json.dumps(payload), timeout=60)
            r.raise_for_status()
            print("Draft:", r.json()["posts"][0]["url"])
        else:
            print("Missing Ghost credentials; skipping publish.")


if __name__ == "__main__":
    main()
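# Example invocation (output path and model flag are illustrative):
#   python scripts/build_from_db.py --limit 10 --out out/newsletter.html \
#       --llm-provider ollama --llm-model llama3.1:8b-instruct --publish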
18
scripts/db_cli.py
Normal file
@@ -0,0 +1,18 @@
#!/usr/bin/env python3
import argparse

from db import connect, init_db


def main():
    ap = argparse.ArgumentParser(description="DB CLI")
    ap.add_argument("cmd", choices=["init"])
    ap.add_argument("--db", default="data/newsletter.db")
    ap.add_argument("--schema", default="scripts/schema.sql")
    args = ap.parse_args()

    con = connect(args.db)
    if args.cmd == "init":
        init_db(con, args.schema)
        print(f"Initialized schema in {args.db}")


if __name__ == "__main__":
    main()
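# Example (flags per the argparse defaults above):
#   python scripts/db_cli.py init --db data/newsletter.db --schema scripts/schema.sql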
130
scripts/ingest_list.py
Normal file
@@ -0,0 +1,130 @@
#!/usr/bin/env python3
import argparse, sqlite3, re, sys, time
from pathlib import Path
from urllib.parse import urlparse, urlunparse
from datetime import date

import requests, tldextract
from bs4 import BeautifulSoup
from readability import Document as ReadabilityDoc
import trafilatura
from slugify import slugify


def connect(db_path):
    con = sqlite3.connect(db_path)
    con.execute("PRAGMA foreign_keys=ON;")
    return con


def upsert_source(con, url, title=None, publisher=None, date_published=None, content=None, tags=None):
    # COALESCE keeps the stored value whenever the incoming field is NULL,
    # so re-ingesting a URL never erases previously captured data.
    con.execute(
        """INSERT INTO sources(url, title, publisher, date_published, content)
        VALUES (?,?,?,?,?)
        ON CONFLICT(url) DO UPDATE SET
          title=COALESCE(excluded.title, title),
          publisher=COALESCE(excluded.publisher, publisher),
          date_published=COALESCE(excluded.date_published, date_published),
          content=COALESCE(excluded.content, content)
        """, (url, title, publisher, date_published, content)
    )
    sid = con.execute("SELECT id FROM sources WHERE url=?", (url,)).fetchone()[0]
    con.commit()
    return sid
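
# For example (illustrative): calling upsert_source twice for the same URL,
# first with content=None and later with fetched text, leaves a single row
# whose content is the fetched text and whose earlier fields are preserved.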


def normalize_url(u: str) -> str:
    u = u.strip()
    if not urlparse(u).scheme:
        # Re-parse with an explicit scheme so the host lands in netloc, not path.
        u = "https://" + u
    p = urlparse(u)
    # Strip common tracking parameters (utm_*, fbclid, gclid) and drop the fragment.
    query = re.sub(r'(&|\?)?(utm_[^=&]+|fbclid|gclid)=[^&]*', '', p.query)
    if query.startswith('&'):
        query = query[1:]
    return urlunparse((p.scheme, p.netloc, p.path, p.params, query, ""))


def domain(url: str) -> str:
    ext = tldextract.extract(url)
    return ".".join(part for part in [ext.domain, ext.suffix] if part)


def fetch_readable(url: str, timeout=20):
    try:
        r = requests.get(url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
        r.raise_for_status()
        html = r.text
    except Exception:
        return "", ""
    # First pass: trafilatura, tuned for recall; fall back to readability-lxml.
    try:
        txt = trafilatura.extract(html, include_comments=False, include_tables=False, favor_recall=True)
        if txt:
            soup = BeautifulSoup(html, "html.parser")
            t = (soup.title.string.strip() if soup.title and soup.title.string else "")
            return t, txt.strip()
    except Exception:
        pass
    try:
        doc = ReadabilityDoc(html)
        content_html = doc.summary()
        soup = BeautifulSoup(content_html, "html.parser")
        txt = soup.get_text(separator="\n").strip()
        return (doc.short_title() or "").strip(), txt
    except Exception:
        return "", ""


def write_stub(entries_dir: Path, title: str, url: str, source_name: str | None, text: str | None):
    entries_dir.mkdir(parents=True, exist_ok=True)
    slug = slugify(title or domain(url) or "clanek")[:60] or "clanek"  # "clanek" = Czech for "article"
    path = entries_dir / f"{slug}.md"
    body = text.strip() if text else ""
    # Escape double quotes so the values stay valid inside quoted YAML scalars.
    safe_title = (title or "").replace('"', '\\"')
    safe_source = (source_name or domain(url) or "").replace('"', '\\"')
    fm = f'''---
title: "{safe_title}"
source_name: "{safe_source}"
url: "{url}"
tags: []
status: "todo"
---

{body}
'''
    path.write_text(fm, encoding="utf-8")
    return path


def read_lines(source_path: str | None):
    if source_path:
        return Path(source_path).read_text(encoding="utf-8", errors="ignore").splitlines()
    return sys.stdin.read().splitlines()


def main():
    ap = argparse.ArgumentParser(description="Ingest a list of URLs into SQLite, optionally writing entry stubs")
    ap.add_argument("--db", default="data/newsletter.db")
    ap.add_argument("--list", help="Text file with URLs (one per line). If omitted, read from STDIN.")
    ap.add_argument("--fetch", action="store_true", help="Fetch & extract readable text")
    ap.add_argument("--sleep", type=float, default=0.0)
    ap.add_argument("--stubs", action="store_true")
    ap.add_argument("--date", default=date.today().isoformat())
    args = ap.parse_args()

    con = connect(args.db)
    lines = read_lines(args.list)

    urls = []
    for ln in lines:
        ln = ln.strip()
        if not ln or ln.startswith("#"):
            continue
        if re.match(r"^\w+://", ln) or re.match(r"^[\w\.-]+\.[a-z]{2,}(/|$)", ln):
            urls.append(normalize_url(ln))
    # Order-preserving dedupe.
    seen = set()
    urls = [u for u in urls if not (u in seen or seen.add(u))]

    stubs_dir = Path(f"entries/{args.date}") if args.stubs else None
    kept = 0
    for url in urls:
        pub = domain(url)
        title, text = ("", "")
        if args.fetch:
            title, text = fetch_readable(url)
        sid = upsert_source(con, url=url, title=(title or None), publisher=pub, content=(text or None))
        kept += 1
        if stubs_dir:
            stub = write_stub(stubs_dir, title or url, url, pub, text)
            print(f"Stub: {stub}")
        if args.sleep:
            time.sleep(args.sleep)
    print(f"Ingested: {kept} URLs into {args.db}")


if __name__ == "__main__":
    main()
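# Example (file name illustrative; flags per the argparse above):
#   python scripts/ingest_list.py --list urls.txt --fetch --stubs --sleep 1.0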
71
scripts/schema.sql
Normal file
@@ -0,0 +1,71 @@
PRAGMA foreign_keys = ON;

CREATE TABLE IF NOT EXISTS sources (
  id INTEGER PRIMARY KEY,
  url TEXT UNIQUE,
  title TEXT,
  publisher TEXT,
  date_published TEXT,
  content TEXT
);

CREATE TABLE IF NOT EXISTS summaries (
  id INTEGER PRIMARY KEY,
  source_id INTEGER REFERENCES sources(id) ON DELETE SET NULL,
  title TEXT NOT NULL,
  summary TEXT NOT NULL,
  newsletter_date TEXT,
  tone_version TEXT,
  created_at TEXT DEFAULT (datetime('now'))
);

CREATE TABLE IF NOT EXISTS tags (
  id INTEGER PRIMARY KEY,
  name TEXT UNIQUE
);
CREATE TABLE IF NOT EXISTS source_tags (
  source_id INTEGER REFERENCES sources(id) ON DELETE CASCADE,
  tag_id INTEGER REFERENCES tags(id) ON DELETE CASCADE,
  PRIMARY KEY(source_id, tag_id)
);

CREATE TABLE IF NOT EXISTS embeddings (
  id INTEGER PRIMARY KEY,
  ref_table TEXT NOT NULL CHECK(ref_table IN ('sources','summaries')),
  ref_id INTEGER NOT NULL,
  model TEXT NOT NULL,
  dim INTEGER NOT NULL,
  vec BLOB NOT NULL,
  UNIQUE(ref_table, ref_id, model)
);

CREATE VIRTUAL TABLE IF NOT EXISTS sources_fts USING fts5(
  title, content, content='sources', content_rowid='id'
);
CREATE VIRTUAL TABLE IF NOT EXISTS summaries_fts USING fts5(
  title, summary, content='summaries', content_rowid='id'
);
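
-- Illustrative lookup against the external-content index (not part of the schema itself):
--   SELECT s.url, s.title
--   FROM sources_fts JOIN sources s ON s.id = sources_fts.rowid
--   WHERE sources_fts MATCH 'sqlite' ORDER BY rank;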

-- Keep the external-content FTS indexes in sync with their base tables.
CREATE TRIGGER IF NOT EXISTS sources_ai AFTER INSERT ON sources BEGIN
  INSERT INTO sources_fts(rowid, title, content) VALUES (new.id, new.title, new.content);
END;
CREATE TRIGGER IF NOT EXISTS sources_au AFTER UPDATE ON sources BEGIN
  INSERT INTO sources_fts(sources_fts, rowid, title, content)
  VALUES('delete', old.id, old.title, old.content);
  INSERT INTO sources_fts(rowid, title, content) VALUES (new.id, new.title, new.content);
END;
CREATE TRIGGER IF NOT EXISTS sources_ad AFTER DELETE ON sources BEGIN
  INSERT INTO sources_fts(sources_fts, rowid, title, content) VALUES('delete', old.id, old.title, old.content);
END;

CREATE TRIGGER IF NOT EXISTS summaries_ai AFTER INSERT ON summaries BEGIN
  INSERT INTO summaries_fts(rowid, title, summary) VALUES (new.id, new.title, new.summary);
END;
CREATE TRIGGER IF NOT EXISTS summaries_au AFTER UPDATE ON summaries BEGIN
  INSERT INTO summaries_fts(summaries_fts, rowid, title, summary)
  VALUES('delete', old.id, old.title, old.summary);
  INSERT INTO summaries_fts(rowid, title, summary) VALUES (new.id, new.title, new.summary);
END;
CREATE TRIGGER IF NOT EXISTS summaries_ad AFTER DELETE ON summaries BEGIN
  INSERT INTO summaries_fts(summaries_fts, rowid, title, summary) VALUES('delete', old.id, old.title, old.summary);
END;
38
scripts/sync_entries_to_db.py
Normal file
@@ -0,0 +1,38 @@
#!/usr/bin/env python3
import argparse, re
from pathlib import Path

import yaml

from db import connect, upsert_source


def parse_front_matter(text: str):
    m = re.match(r"^---\n(.*?)\n---\n(.*)$", text, flags=re.S | re.M)
    if not m:
        return {}, text.strip()
    fm = yaml.safe_load(m.group(1)) or {}
    body = m.group(2).strip()
    return fm, body
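
# For example, a stub of the shape ingest_list.py writes:
#   ---
#   title: "Example"
#   url: "https://example.com"
#   ---
#   Body text.
# parses to ({"title": "Example", "url": "https://example.com"}, "Body text.")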


def main():
    ap = argparse.ArgumentParser(description="Sync entries/*.md into SQLite sources")
    ap.add_argument("--db", default="data/newsletter.db")
    ap.add_argument("--dir", required=True, help="entries/YYYY-MM-DD directory")
    args = ap.parse_args()

    con = connect(args.db)
    for p in Path(args.dir).glob("*.md"):
        text = p.read_text(encoding="utf-8")
        fm, body = parse_front_matter(text)
        title = fm.get("title") or p.stem
        url = fm.get("url")
        publisher = fm.get("source_name")
        upsert_source(con,
                      url=url,
                      title=title,
                      publisher=publisher,
                      date_published=None,
                      content=body)
        print(f"Synced: {p.name}")


if __name__ == "__main__":
    main()