# PodX web front-end (Flask): unified search, URL ingestion, and transcript playback.
from flask import Flask, request, redirect, send_file, abort, Response, make_response
|
|
import os, json, time, requests, re
|
|
from pathlib import Path
|
|
from redis import Redis
|
|
from rq import Queue
|
|
|
|
# --- Service configuration (all overridable via environment variables) ---
MEILI_URL = os.getenv("MEILI_URL", "http://meili:7700")     # Meilisearch endpoint
MEILI_KEY = os.getenv("MEILI_KEY", "")  # from .env
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")  # RQ job broker
LIBRARY_ROOT = Path(os.getenv("LIBRARY_ROOT", "/library")).resolve()
TRANSCRIPT_ROOT = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts")).resolve()

app = Flask(__name__)

# Worker progress log (JSON lines appended by the background worker).
# Fixed: derived from TRANSCRIPT_ROOT instead of re-reading the env var,
# so the two paths can never silently disagree.
FEED_LOG = TRANSCRIPT_ROOT / "_feed.log"

# Background job queue shared with the worker process.
q = Queue(connection=Redis.from_url(REDIS_URL))
|
|
|
|
PAGE = """
|
|
<!doctype html><html><head><meta charset="utf-8">
|
|
<title>PodX - unified search</title>
|
|
<style>
|
|
body{font-family:system-ui, sans-serif;max-width:880px;margin:2rem auto;padding:0 1rem}
|
|
form{display:flex;gap:.5rem;margin-bottom:1rem}
|
|
input[type=url]{flex:1;padding:.7rem}
|
|
button{padding:.7rem 1rem}
|
|
.card{border:1px solid #ddd;padding:1rem;border-radius:8px;margin:.5rem 0}
|
|
small{color:#666}
|
|
input[type=search]{width:100%;padding:.6rem;margin:.5rem 0 1rem}
|
|
mark{background: #fff2a8}
|
|
.badge{display:inline-block;font-size:.75rem;border:1px solid #999;padding:.1rem .4rem;border-radius:999px;margin-right:.4rem}
|
|
</style></head><body>
|
|
<h1>PodX</h1>
|
|
<form action="/enqueue" method="post">
|
|
<input type="url" name="url" placeholder="Paste podcast/video/article URL…" required>
|
|
<button type="submit">Fetch</button>
|
|
</form>
|
|
<details><summary>Batch</summary>
|
|
<form action="/enqueue_batch" method="post">
|
|
<textarea name="urls" rows="4" style="width:100%" placeholder="One URL per line"></textarea>
|
|
<button type="submit">Queue All</button>
|
|
</form>
|
|
</details>
|
|
|
|
<h2>Unified search (podcasts + PDFs + EPUB + Kiwix + Web)</h2>
|
|
<form id="sform">
|
|
<input type="search" name="q" placeholder='e.g., "vector database" OR retrieval augmented generation' autofocus />
|
|
</form>
|
|
<div id="results"></div>
|
|
|
|
<script>
|
|
const form = document.getElementById('sform');
|
|
async function doSearch(){
|
|
const q = new URLSearchParams(new FormData(form)).toString();
|
|
const r = await fetch('/search?'+q);
|
|
document.getElementById('results').innerHTML = await r.text();
|
|
}
|
|
form.addEventListener('input', doSearch);
|
|
doSearch();
|
|
</script>
|
|
|
|
<h2>Recent jobs</h2>
|
|
<div id="feed"></div>
|
|
<script>
|
|
(async function poll(){
|
|
try{
|
|
const r = await fetch('/recent');
|
|
document.getElementById('feed').innerHTML = await r.text();
|
|
}catch(e){}
|
|
setTimeout(poll, 4000);
|
|
})();
|
|
</script>
|
|
|
|
<div style="margin-top:1rem;padding:1rem;border:1px solid #ddd;border-radius:8px;">
|
|
<h3 style="margin-top:0;">Activity</h3>
|
|
<div id="status-summary" style="font-family:system-ui, sans-serif; font-size:14px; margin-bottom:0.5rem;">Loading…</div>
|
|
<pre id="status-feed" style="max-height:300px; overflow:auto; background:#f8f9fa; padding:0.5rem; border-radius:6px; border:1px solid #eee;"></pre>
|
|
</div>
|
|
<script>
|
|
(async function(){
|
|
const feed = document.getElementById('status-feed');
|
|
const sum = document.getElementById('status-summary');
|
|
async function tick(){
|
|
try{
|
|
const r = await fetch('/api/status');
|
|
const j = await r.json();
|
|
if(!j.ok) throw new Error('not ok');
|
|
const ev = j.events || [];
|
|
const last = j.summary || {};
|
|
sum.textContent = last.last_status ? `${last.last_status} — ${last.last_title||''}` : 'Idle';
|
|
feed.textContent = ev.map(e => {
|
|
const s = e.status || '';
|
|
const u = e.url || e.path || e.title || '';
|
|
const up = e.uploader ? ` [${e.uploader}]` : '';
|
|
return `${s.padEnd(14)} ${u}${up}`;
|
|
}).join('\\n');
|
|
}catch(e){
|
|
sum.textContent = 'Status unavailable';
|
|
}
|
|
}
|
|
tick();
|
|
setInterval(tick, 2000);
|
|
})();
|
|
</script>
|
|
</body></html>
|
|
"""
|
|
|
|
def read_feed_tail(max_lines: int = 200):
    """Return up to *max_lines* parsed JSON events from the end of FEED_LOG.

    Only the final ~64KB of the log is read; lines that fail to parse as
    JSON are silently skipped.  Returns [] when the log is missing or
    unreadable (best-effort status feed, never raises).
    """
    if not FEED_LOG.exists():
        return []
    try:
        with open(FEED_LOG, "rb") as fh:
            try:
                fh.seek(-65536, 2)  # only the tail of the log matters
            except OSError:
                fh.seek(0)  # file shorter than 64KB: read it all
            raw = fh.read().decode("utf-8", errors="ignore")
    except Exception:
        return []
    candidates = [s.strip() for s in raw.splitlines() if s.strip()]
    parsed = []
    for candidate in candidates[-max_lines:]:
        try:
            parsed.append(json.loads(candidate))
        except Exception:
            continue
    return parsed
|
|
|
|
@app.get("/api/status")
def api_status():
    """JSON status endpoint: worker-feed tail plus a one-line summary."""
    events = read_feed_tail(200)
    latest = events[-1] if events else {}
    return {
        "ok": True,
        "summary": {
            "last_status": latest.get("status"),
            "last_title": latest.get("title") or latest.get("path") or latest.get("url"),
            "last_time": int(time.time()),
            "count": len(events),
        },
        "events": events,
    }
|
|
|
|
def meili_search(qstr, limit=30):
    """Query the Meilisearch 'library' index for *qstr*.

    Returns the list of hits, or [] for blank queries and on any
    network/HTTP/parse failure (search is best-effort).
    """
    if not qstr.strip():
        return []
    endpoint = f"{MEILI_URL}/indexes/library/search"
    headers = {
        "Authorization": f"Bearer {MEILI_KEY}",
        "Content-Type": "application/json",
    }
    try:
        resp = requests.post(
            endpoint,
            headers=headers,
            data=json.dumps({"q": qstr, "limit": limit}),
            timeout=5,
        )
        if resp.status_code != 200:
            return []
        return resp.json().get("hits", [])
    except Exception:
        return []
|
|
|
|
@app.get("/health")
def health():
    """Liveness probe for container orchestration."""
    return "ok"
|
|
|
|
@app.get("/")
def index():
    """Serve the static single-page UI."""
    return PAGE
|
|
|
|
@app.post("/enqueue")
def enqueue():
    """Queue one submitted URL for the background worker, then bounce home."""
    submitted = request.form["url"].strip()
    q.enqueue("worker.handle_url", submitted)
    return redirect("/")
|
|
|
|
@app.post("/enqueue_batch")
def enqueue_batch():
    """Queue every non-blank line of the submitted textarea as its own job."""
    for raw_line in request.form["urls"].splitlines():
        candidate = raw_line.strip()
        if candidate:
            q.enqueue("worker.handle_url", candidate)
    return redirect("/")
|
|
|
|
@app.get("/recent")
def recent():
    """Render the last 40 worker-feed entries (newest first) as HTML cards.

    Fixes vs. the original: reads FEED_LOG instead of a duplicated
    hard-coded "/transcripts/_feed.log" path, narrows the bare `except:`
    to JSON decode errors, and HTML-escapes feed-supplied fields before
    interpolating them into markup (the feed text is not trusted input).
    """
    from html import escape  # function-scope: keeps the module namespace unchanged

    try:
        with open(FEED_LOG, "r", encoding="utf-8") as f:
            tail = f.readlines()[-40:]
    except FileNotFoundError:
        tail = []
    cards = []
    for line in reversed(tail):  # newest entries first
        try:
            item = json.loads(line)
        except ValueError:  # json.JSONDecodeError subclasses ValueError
            continue
        title = escape(str(item.get("title", "")))
        uploader = escape(str(item.get("uploader", "")))
        date = escape(str(item.get("date", "")))
        status = escape(str(item.get("status", "")))
        path = escape(str(item.get("path", "")))
        cards.append(
            f"<div class='card'><b>{title}</b><br>"
            f"<small>{uploader} — {date} — {status}</small><br>"
            f"<small>{path}</small></div>"
        )
    return "\n".join(cards)
|
|
|
|
def _safe_under(base: Path, rel_path: str) -> Path:
|
|
"""
|
|
Resolve rel_path safely under base. If an absolute path is provided and it is
|
|
already under base, allow it. Otherwise join to base. Reject any path that
|
|
escapes base.
|
|
"""
|
|
try:
|
|
p = Path(rel_path)
|
|
if p.is_absolute():
|
|
candidate = p.resolve()
|
|
else:
|
|
candidate = (base / rel_path).resolve()
|
|
except Exception:
|
|
raise FileNotFoundError("Invalid path")
|
|
|
|
base_str = str(base.resolve())
|
|
cand_str = str(candidate)
|
|
# allow exact base or any child path
|
|
if cand_str == base_str or cand_str.startswith(base_str + os.sep):
|
|
return candidate
|
|
raise FileNotFoundError("Path escapes base")
|
|
|
|
def _vtt_header():
|
|
return "WEBVTT\n\n"
|
|
|
|
def _srt_to_vtt_text(srt_text: str) -> str:
|
|
# Minimal conversion: SRT -> VTT timestamp format + header
|
|
# Replace commas in timecodes with dots
|
|
body = re.sub(r"(?m)(\d{2}:\d{2}:\d{2}),(\d{3})", r"\1.\2", srt_text)
|
|
# Ensure a WEBVTT header
|
|
if not body.lstrip().upper().startswith("WEBVTT"):
|
|
body = _vtt_header() + body
|
|
return body
|
|
|
|
def _json_to_vtt_text(json_text: str) -> str:
|
|
# Expect Whisper-style segments [{'start':..,'end':..,'text':..}, ...]
|
|
try:
|
|
data = json.loads(json_text)
|
|
except Exception:
|
|
return _vtt_header()
|
|
segments = data.get("segments") or data # support raw list
|
|
out = [_vtt_header()]
|
|
idx = 1
|
|
for seg in segments or []:
|
|
try:
|
|
start = float(seg.get("start", 0))
|
|
end = float(seg.get("end", start + 0.5))
|
|
text = str(seg.get("text", "")).strip()
|
|
except Exception:
|
|
continue
|
|
def fmt(t):
|
|
h = int(t // 3600); m = int((t % 3600) // 60); s = t - h*3600 - m*60
|
|
return f"{h:02d}:{m:02d}:{s:06.3f}".replace(",", ".")
|
|
out.append(f"{idx}")
|
|
out.append(f"{fmt(start)} --> {fmt(end)}")
|
|
out.append(text or "…")
|
|
out.append("") # blank line
|
|
idx += 1
|
|
return "\n".join(out).rstrip() + "\n"
|
|
|
|
def _parse_vtt_to_cues(vtt_text: str):
    """Very small VTT parser -> list of dicts {start,end,text} (seconds, seconds, str)."""
    def to_seconds(ts: str) -> float:
        # 00:00:00.000 or 00:00.000 (allow both)
        # Unparseable timestamps collapse to 0.0 rather than raising.
        parts = ts.replace(",", ".").split(":")
        try:
            if len(parts) == 3:
                h, m, s = int(parts[0]), int(parts[1]), float(parts[2])
            else:
                h, m, s = 0, int(parts[0]), float(parts[1])
            return h*3600 + m*60 + s
        except Exception:
            return 0.0
    cues = []
    lines = [ln.rstrip("\n\r") for ln in vtt_text.splitlines()]
    # Cursor-based scan: `i` always points at the next unconsumed line.
    i = 0
    while i < len(lines):
        ln = lines[i].strip()
        i += 1
        # Skip blank lines and the WEBVTT header line.
        if not ln or ln.upper().startswith("WEBVTT"):
            continue
        # Optional numeric counter line
        if ln.isdigit() and i < len(lines):
            ln = lines[i].strip(); i += 1
        # Cue timing line: "<start> --> <end> [settings...]".  Settings after
        # the end timestamp are dropped by the split(" ")[0] below.
        if "-->" in ln:
            try:
                l, r = ln.split("-->", 1)
                start = to_seconds(l.strip())
                end = to_seconds(r.strip().split(" ")[0])
            except Exception:
                start = end = 0.0
            # Collect the cue payload: every line up to the next blank line.
            texts = []
            while i < len(lines) and lines[i].strip() != "":
                texts.append(lines[i])
                i += 1
            # skip blank separator
            while i < len(lines) and lines[i].strip() == "":
                i += 1
            # Multi-line payloads are joined into a single space-separated string;
            # cues with no text are dropped.
            cue_text = " ".join([t.strip() for t in texts]).strip()
            if cue_text:
                cues.append({"start": start, "end": end, "text": cue_text})
    return cues
|
|
|
|
def _load_transcript_variants(basename: str):
    """
    Return tuple (kind, content_text, path_used) where kind in {'vtt','srt','json','txt',None}
    - Tries exact filename matches first.
    - If not found, falls back to the first file whose name starts with the basename (prefix match).

    Bug fix: the containment check previously used a bare str.startswith(root),
    which also accepted sibling directories such as "/transcripts-evil"; the
    check now requires the path separator (or an exact root match).
    """
    root = TRANSCRIPT_ROOT

    def try_read(path: Path, k: str):
        # Read a candidate file only if it resolves strictly inside root.
        try:
            rp = path.resolve()
            rp_str = str(rp)
            root_str = str(root)
            if rp_str != root_str and not rp_str.startswith(root_str + os.sep):
                return None
            if rp.exists():
                with open(rp, "r", encoding="utf-8", errors="ignore") as f:
                    return (k, f.read(), rp_str)
        except Exception:
            return None
        return None

    # Preference order for both passes: timed formats first, plain text last.
    exts = [("vtt", "vtt"), ("srt", "srt"), ("json", "json"), ("txt", "txt")]

    # 1) exact matches
    for ext, k in exts:
        got = try_read(root / f"{basename}.{ext}", k)
        if got:
            return got

    # 2) prefix/fuzzy matches (e.g., "<base>*.vtt", "<base>*.txt", etc.)
    for ext, k in exts:
        try:
            for gp in root.glob(f"{basename}*.{ext}"):
                got = try_read(gp, k)
                if got:
                    return got
        except Exception:
            continue

    return (None, "", "")
|
|
|
|
@app.get("/search")
def search():
    """Render Meilisearch hits for ?q= as HTML result cards."""
    hits = meili_search(request.args.get("q", ""))
    cards = []
    for hit in hits:
        title = hit.get("title", "")
        src = hit.get("source", "")
        typ = hit.get("type", "")
        snippet = hit.get("_formatted", {}).get("text", hit.get("text", "")[:300])
        segs = hit.get("segments", [])
        ts = int(segs[0]["start"]) if segs else 0
        quoted_src = requests.utils.quote(src)
        # Podcasts get a seek offset and a transcript link; other types don't.
        if typ == 'podcast':
            open_link = f"/play?file={quoted_src}&t={ts}"
            transcript_link = f" | <a href=\"/subtitle?file={quoted_src}\">Transcript</a>"
        else:
            open_link = f"/play?file={quoted_src}"
            transcript_link = ""
        badge = f"<span class='badge'>{typ}</span>"
        cards.append(
            f"<div class='card'><b>{badge}{title}</b><br><small>{src}</small>"
            f"<p>{snippet}</p>"
            f"<a href='{open_link}'>Open</a>"
            f"{transcript_link}"
            f"</div>"
        )
    return "\n".join(cards) or "<small>No results yet.</small>"
|
|
|
|
@app.get("/open")
def open_local():
    """Debug helper: echo the requested file and start offset as preformatted text.

    Fixes vs. the original: a non-numeric ?t= no longer raises an unhandled
    ValueError (HTTP 500), and the reflected file name is HTML-escaped to
    prevent reflected XSS via the query string.
    """
    from html import escape  # function-scope: keeps the module namespace unchanged

    file = request.args.get("file", "")
    try:
        t = int(request.args.get("t", "0"))
    except ValueError:
        t = 0
    return f"<pre>{escape(file)}\nStart at: {t} sec</pre>"
|
|
|
|
@app.get('/media')
def media():
    """Stream a file from LIBRARY_ROOT, or 404 for missing/escaping paths."""
    requested = request.args.get('file', '')
    try:
        resolved = _safe_under(LIBRARY_ROOT, requested)
    except Exception:
        return abort(404)
    if not resolved.exists():
        return abort(404)
    # conditional=True lets Flask honour Range / If-Modified-Since headers;
    # the mimetype is guessed from the file name.
    return send_file(str(resolved), conditional=True)
|
|
|
|
@app.get('/play')
def play():
    """Minimal HTML5 player page for a library file, seeking to ?t= seconds.

    Fixes vs. the original: a non-numeric ?t= no longer raises an unhandled
    ValueError (HTTP 500), and the file name echoed into the heading is
    HTML-escaped (the media/track URLs were already percent-encoded).
    """
    from html import escape  # function-scope: keeps the module namespace unchanged

    rel = request.args.get('file', '')
    try:
        t = int(request.args.get('t', '0') or 0)
    except ValueError:
        t = 0
    src = f"/media?file={requests.utils.quote(rel)}"
    track = f"/subtitle?file={requests.utils.quote(rel)}&format=vtt"
    return (
        "<!doctype html><meta charset='utf-8'>"
        "<title>Play</title>"
        "<style>body{font-family:system-ui;margin:1rem}</style>"
        f"<h3>{escape(rel)}</h3>"
        f"<video id='v' controls style='max-width:100%;width:100%'>"
        f" <source src='{src}'>"
        f" <track kind='subtitles' src='{track}' srclang='en' label='Transcript' default>"
        " Your browser cannot play this media."
        "</video>"
        "<script>const v=document.getElementById('v');"
        f"v.addEventListener('loadedmetadata',()=>{{try{{v.currentTime={t};}}catch(e){{}}}});"
        "</script>"
    )
|
|
|
|
@app.get("/subtitle")
def subtitle():
    """Serve the transcript for ?file=... in one of three shapes.

    - ?format=vtt -> raw WebVTT with Content-Type text/vtt (used by the
      <track> element on the /play page); 404 when no timed transcript exists.
    - ?raw=1      -> the normalized VTT dumped into an HTML <pre> (debugging).
    - default     -> readable HTML: clickable timestamps for timed kinds
      (vtt/srt/json), paragraphized prose for plain-.txt transcripts.

    NOTE(review): transcript text and the basename are interpolated into
    HTML unescaped — assumes transcripts are trusted local files; confirm
    before exposing this service publicly.
    """
    file = request.args.get("file", "")
    fmt = request.args.get("format", "").lower()  # when 'vtt', serve as text/vtt for player
    # Transcripts are looked up by the media file's basename, extension dropped.
    base = os.path.splitext(os.path.basename(file))[0]

    kind, content, used_path = _load_transcript_variants(base)

    # Build a VTT if requested/needed and we can
    if fmt == "vtt":
        vtt = ""
        if kind == "vtt":
            # Some stored .vtt files may lack the mandatory header; prepend it.
            vtt = content if content.lstrip().upper().startswith("WEBVTT") else _vtt_header() + content
        elif kind == "srt":
            vtt = _srt_to_vtt_text(content)
        elif kind == "json":
            vtt = _json_to_vtt_text(content)
        else:
            # No structured timing available
            return abort(404)
        resp = make_response(vtt)
        resp.headers["Content-Type"] = "text/vtt; charset=utf-8"
        return resp

    # Otherwise, render a simple HTML preview
    if kind in ("vtt", "srt", "json"):
        # Normalize to VTT first
        if kind == "vtt":
            vtt_text = content if content.lstrip().upper().startswith("WEBVTT") else _vtt_header() + content
        elif kind == "srt":
            vtt_text = _srt_to_vtt_text(content)
        else:
            vtt_text = _json_to_vtt_text(content)

        # If ?raw=1 is present, show raw VTT for debugging
        if request.args.get("raw") == "1":
            return (
                "<!doctype html><meta charset='utf-8'>"
                "<title>Transcript</title>"
                "<style>body{font-family:system-ui;margin:1rem}</style>"
                f"<h3>Transcript (raw VTT): {base}</h3>"
                f"<pre style='white-space:pre-wrap'>{vtt_text}</pre>"
            )

        # Otherwise render a readable transcript with clickable timestamps
        cues = _parse_vtt_to_cues(vtt_text)
        # Build HTML list
        items = []
        for c in cues:
            mm = int(c["start"] // 60)
            ss = int(c["start"] % 60)
            hh = int(c["start"] // 3600)
            # hh:mm:ss only for cues an hour or more in; mm:ss otherwise.
            ts_label = f"{hh:02d}:{mm%60:02d}:{ss:02d}" if hh else f"{mm:02d}:{ss:02d}"
            items.append(
                "<div class='cue'>"
                f"<button class='ts' data-t='{int(c['start'])}'>{ts_label}</button>"
                f"<span class='text'>{c['text']}</span>"
                "</div>"
            )
        # The inline script re-reads ?file= and opens /play at the clicked
        # cue's start time in a new tab.
        html = (
            "<!doctype html><meta charset='utf-8'>"
            "<title>Transcript</title>"
            "<style>"
            ":root{--fg:#111;--muted:#666;--bg:#fff;--ring:#e9ecef;}"
            "body{font-family:system-ui;margin:1rem;line-height:1.5;color:var(--fg);background:var(--bg)}"
            ".wrap{max-width:900px;margin:0 auto}"
            ".meta{color:var(--muted);margin:.25rem 0 1rem}"
            ".cue{display:flex;gap:.75rem;align-items:flex-start;padding:.35rem .25rem;border-bottom:1px solid #f0f0f0}"
            ".cue .text{white-space:pre-wrap}"
            ".ts{font:inherit;border:1px solid #ccc;background:#fafafa;border-radius:6px;padding:.15rem .45rem;cursor:pointer}"
            ".ts:hover{background:#f2f2f2}"
            "</style>"
            f"<div class='wrap'><h3>Transcript: {base}</h3>"
            "<div class='meta'>Click a timestamp to open the player at that point.</div>"
            f"<div id='list'>{''.join(items) or '<small>No cues found.</small>'}</div>"
            "<script>\n"
            "const file=new URLSearchParams(location.search).get('file')||'';\n"
            "document.querySelectorAll('.ts').forEach(b=>{b.addEventListener('click',()=>{\n"
            " const t=b.dataset.t||'0';\n"
            " const url='/play?file='+encodeURIComponent(file)+'&t='+t;\n"
            " window.open(url,'_blank');\n"
            "});});\n"
            "</script></div>"
        )
        return html
    elif kind == "txt":
        # Normalize and lightly beautify plain text transcripts
        safe = content.strip()

        # Remove common timestamp patterns like [00:12:34], (00:12), 00:12:34 -
        safe = re.sub(r"\[(\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\]\s*", "", safe)
        safe = re.sub(r"\((\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\)\s*", "", safe)
        safe = re.sub(r"(?m)^\s*(\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\s*[-–—]?\s*", "", safe)

        # Collapse multiple blank lines
        safe = re.sub(r"\n{3,}", "\n\n", safe)

        # Paragraphization: split on blank lines, collapse inner newlines to spaces
        # (capped at 2000 paragraphs to bound response size).
        paras = [p.strip() for p in re.split(r"\n{2,}", safe) if p.strip()]
        clean_paras = [re.sub(r'[\n\r]+', ' ', p) for p in paras[:2000]]
        items = "".join(f"<p>{p}</p>" for p in clean_paras)

        # Fall back to a <pre> dump if paragraph extraction produced nothing.
        fallback = f"<pre style='white-space:pre-wrap'>{safe}</pre>"
        body = items if items else fallback
        return (
            "<!doctype html><meta charset='utf-8'>"
            "<title>Transcript</title>"
            "<style>"
            "body{font-family:system-ui;margin:1rem;line-height:1.7;color:#111}"
            ".wrap{max-width:900px;margin:0 auto}"
            "p{margin:.5rem 0}"
            ".wrap p{text-wrap:pretty}"
            "</style>"
            f"<div class='wrap'><h3>Transcript (plain text): {base}</h3>"
            f"{body}</div>"
        )
    else:
        return "<small>No transcript found.</small>"
|