Podcast sync
app/worker.py (309 changed lines)
@@ -1,6 +1,7 @@
import os, subprocess, shutil, json, re, orjson, requests
from pathlib import Path
import math
import difflib
from faster_whisper import WhisperModel

MEILI_URL = os.getenv("MEILI_URL", "http://meili:7700")
@@ -12,6 +13,11 @@ MODEL_NAME = os.getenv("WHISPER_MODEL","large-v3")
COMPUTE = os.getenv("WHISPER_PRECISION","int8")
WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "auto").strip()

# RSS resolver config
RSS_INDEX_PATH = Path(os.getenv("RSS_INDEX_PATH", "/transcripts/rss_index.json"))
RSS_DURATION_TOLERANCE = int(os.getenv("RSS_DURATION_TOLERANCE", "150"))  # seconds
DEFAULT_TRANSCRIPT_LANG = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"

OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/")
OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "")
OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library")
@@ -39,6 +45,160 @@ def log(feed):
def sanitize(name):
    return re.sub(r'[\\/:"*?<>|]+', ' ', name).strip()

# ---------- RSS transcript resolver ----------

def _normalize_title(t: str) -> str:
    t = (t or "").lower()
    t = re.sub(r"\s+", " ", t)
    # strip punctuation; keep letters, digits, spaces, hyphens, underscores
    t = re.sub(r"[^a-z0-9 _-]+", "", t)
    return t.strip()

def _stem_without_date(stem: str) -> str:
    # drop a leading "YYYYMMDD - " prefix from filenames created by the yt-dlp template
    m = re.match(r"^\d{8}\s*-\s*(.*)$", stem)
    return m.group(1) if m else stem

def _extract_date_from_stem(stem: str) -> str | None:
    m = re.search(r"\b(\d{8})\b", stem)
    return m.group(1) if m else None

def _best_title_match(title: str, candidates: list[str]) -> tuple[str, float]:
    """Return (best_title, score 0..1) using difflib.SequenceMatcher."""
    if not candidates:
        return "", 0.0
    norm_title = _normalize_title(title)
    best = ("", 0.0)
    for c in candidates:
        score = difflib.SequenceMatcher(None, norm_title, _normalize_title(c)).ratio()
        if score > best[1]:
            best = (c, score)
    return best
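
# Illustrative behaviour (not part of this commit): normalization strips case
# and punctuation before comparison, so near-identical titles score 1.0:
#   _best_title_match("My Great Episode", ["my great episode!", "Other Show"])
#   -> ("my great episode!", 1.0)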

def _load_rss_index() -> list[dict]:
    try:
        if RSS_INDEX_PATH.exists():
            data = json.loads(RSS_INDEX_PATH.read_text(encoding="utf-8"))
            # supports {"episodes": [...]} or a flat list
            if isinstance(data, dict) and "episodes" in data:
                return data["episodes"] or []
            if isinstance(data, list):
                return data
    except Exception as e:
        print(f"[resolver] failed to load RSS index: {e}", flush=True)
    return []
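
# Sketch of an index entry this loader and the matcher below can consume; the
# field names are the ones this module reads, the values are invented:
#   {"title": "My Great Episode", "date": "20240131", "duration_sec": 3605,
#    "guid": "tag:example.com,2024:ep42", "feed_url": "https://example.com/rss",
#    "language": "en",
#    "transcripts": [{"url": "https://example.com/ep42.vtt", "type": "vtt"}]}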

def match_media_to_rss(media_path: Path) -> dict | None:
    """Try to match a local media file to an RSS episode entry."""
    episodes = _load_rss_index()
    if not episodes:
        return None

    stem = media_path.stem
    title_no_date = _stem_without_date(stem)
    file_date = _extract_date_from_stem(stem)
    # media duration, used for the tolerance check below
    media_dur = media_duration_seconds(media_path)

    # Candidates: filter by date if present, else consider all episodes
    if file_date:
        pool = [e for e in episodes if (str(e.get("date", "")) == file_date or str(e.get("pubdate", "")) == file_date)]
        if not pool:
            pool = episodes
    else:
        pool = episodes

    # Pick the best by title similarity, with a small bonus for duration proximity
    best_ep, best_score = None, -1.0
    for ep in pool:
        ep_title = ep.get("title") or ep.get("itunes_title") or ""
        sim = _best_title_match(title_no_date, [ep_title])[1]
        dur = float(ep.get("duration_sec") or ep.get("duration") or 0.0)
        dur_ok = True
        if media_dur and dur:
            dur_ok = abs(media_dur - dur) <= RSS_DURATION_TOLERANCE
        score = sim + (0.1 if dur_ok else 0.0)
        if score > best_score:
            best_score, best_ep = score, ep

    if best_ep and best_score >= 0.5:
        print(f"[resolver] matched '{stem}' -> '{best_ep.get('title','')}' score={best_score:.2f}", flush=True)
        return best_ep
    return None
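
# Worked example of the acceptance test above: title similarity 0.45 plus the
# 0.1 duration bonus gives 0.55 >= 0.5 and is accepted, while 0.45 alone is
# rejected; a strong title match needs no duration agreement at all.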

def _choose_transcript_url(ep: dict) -> tuple[str, str] | tuple[None, None]:
    """Return (url, kind), preferring txt, then vtt, then srt. 'kind' is one of {'txt','vtt','srt'}."""
    # unified structure from rss_ingest.py: ep["transcripts"] = [{"url": ..., "type": ...}, ...]
    items = ep.get("transcripts") or []
    # some ingesters store separate keys instead
    if not items:
        for key, kind in [("transcript_txt", "txt"), ("transcript_vtt", "vtt"), ("transcript_srt", "srt")]:
            if ep.get(key):
                items.append({"url": ep[key], "type": kind})
    # preference order
    for kind in ["txt", "vtt", "srt"]:
        for it in items:
            t = (it.get("type") or "").lower()
            u = it.get("url") or ""
            if u and (kind in t or (kind == "txt" and t in ["text", "plain", "text/plain"]) or (kind in u.lower())):
                return u, kind
    return (None, None)
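
# For instance, an episode offering types "application/srt" and "text/vtt"
# resolves to the VTT: "txt" matches neither entry (assuming the URLs contain
# no "txt" either), and "vtt" is tried before "srt".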

def fetch_rss_transcript(ep: dict, dest_dir: Path) -> Path | None:
    """Download a transcript to dest_dir and return the local Path; any VTT->SRT conversion happens downstream."""
    url, kind = _choose_transcript_url(ep)
    if not url:
        return None
    dest_dir.mkdir(parents=True, exist_ok=True)
    # filename derived from the episode title
    safe = sanitize(ep.get("title") or ep.get("guid") or "episode")
    path = dest_dir / f"{safe}.{kind}"
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        mode = "wb" if kind in ("vtt", "srt") else "w"
        if mode == "wb":
            path.write_bytes(r.content)
        else:
            path.write_text(r.text, encoding="utf-8")
        print(f"[resolver] downloaded transcript ({kind}) from {url}", flush=True)
        return path
    except Exception as e:
        print(f"[resolver] failed to fetch transcript: {e}", flush=True)
        return None

def use_rss_transcript(media_path: Path, ep: dict) -> Path | None:
    """Create the standard transcript artifacts from an RSS transcript (txt/vtt/srt)."""
    # Use a local file already saved by rss_ingest if it exists; otherwise download.
    sidecar = None
    local_hint = ep.get("transcript_local")
    if local_hint:
        p = Path(local_hint)
        if p.exists():
            sidecar = p
    if sidecar is None:
        sidecar = fetch_rss_transcript(ep, TMP)

    if not sidecar or not sidecar.exists():
        return None

    # Convert to plain text
    plain = transcript_text_from_file(sidecar)
    lang = (ep.get("language") or ep.get("lang") or DEFAULT_TRANSCRIPT_LANG).split("-")[0]
    base = write_plain_transcript(media_path, plain, language=lang)
    # Place an SRT next to the video for Plex
    ensure_sidecar_next_to_media(sidecar, media_path, lang=lang)
    # Write a provenance sidecar
    (base.with_suffix(".prov.json")).write_bytes(orjson.dumps({
        "source": "rss",
        "feed": ep.get("feed_url"),
        "guid": ep.get("guid"),
        "episode_title": ep.get("title"),
        "transcript_kind": sidecar.suffix.lower().lstrip("."),
        "transcript_url": _choose_transcript_url(ep)[0] or "",
    }))
    return base
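
# Net effect of use_rss_transcript, assuming write_plain_transcript keeps its
# usual naming: a .txt (and the .json the indexer expects) under /transcripts,
# a .prov.json recording provenance, and a .<lang>.srt next to the media when
# the feed supplied VTT/SRT rather than plain text.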

def find_sidecar_transcript(media_path: Path) -> Path | None:
    """Return a .txt/.srt/.vtt transcript file sitting next to media, if any.
    Tries common variants including language-suffixed SRT/VTT.
@@ -57,6 +217,110 @@ def find_sidecar_transcript(media_path: Path) -> Path | None:
    return candidates[0] if candidates else None


# ---------- Transcript repository reuse helpers ----------

def find_repo_transcript_for_media(media_path: Path) -> Path | None:
    """Search the transcript repository (/transcripts) for an existing transcript
    that likely belongs to this media file (match by YYYYMMDD in the filename and/or
    fuzzy title similarity). Returns the path to a matching .json if found."""
    try:
        stem = media_path.stem
        title_no_date = _stem_without_date(stem)
        file_date = _extract_date_from_stem(stem)
        best_json, best_score = None, 0.0
        for j in TRN.glob("*.json"):
            try:
                data = json.loads(j.read_text(encoding="utf-8"))
            except Exception:
                continue
            # fall back to the JSON filename when the record has no "file" entry
            other_file = Path(data.get("file", ""))
            other_stem = other_file.stem if data.get("file") else j.stem
            other_date = _extract_date_from_stem(other_stem)
            # If both have dates and they differ, skip
            if file_date and other_date and file_date != other_date:
                continue
            # Compare titles (without dates)
            sim = difflib.SequenceMatcher(
                None,
                _normalize_title(title_no_date),
                _normalize_title(_stem_without_date(other_stem)),
            ).ratio()
            # Nudge the score when dates match
            if file_date and other_date and file_date == other_date:
                sim += 0.1
            if sim > best_score:
                best_score, best_json = sim, j
        # Require a reasonable similarity
        return best_json if best_json and best_score >= 0.60 else None
    except Exception:
        return None
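
# E.g. "20240131 - My Great Episode" matched against a repo record for
# "20240131 - My Great Episode (remaster)" scores high on title and gets the
# +0.1 date nudge, clearing the 0.60 bar; identical titles under different
# dates are skipped outright.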


def reuse_repo_transcript(media_path: Path, repo_json: Path) -> Path | None:
    """Copy/retarget an existing transcript JSON/TXT (and make SRT/VTT if possible)
    from the repository so that it belongs to the provided media_path. Returns
    the new base path in /transcripts, or None."""
    try:
        # load the source transcript
        data = json.loads(repo_json.read_text(encoding="utf-8"))
        src_base = TRN / Path(repo_json).stem
        src_txt = src_base.with_suffix(".txt")
        src_srt = src_base.with_suffix(".srt")
        src_vtt = src_base.with_suffix(".vtt")

        # write the retargeted artifacts
        new_title = media_path.stem
        new_base = TRN / new_title
        new_base.parent.mkdir(parents=True, exist_ok=True)

        # point the record at the new media file
        data["file"] = str(media_path)
        (new_base.with_suffix(".json")).write_bytes(orjson.dumps(data))

        # copy or synthesize TXT
        if src_txt.exists():
            shutil.copy2(src_txt, new_base.with_suffix(".txt"))
        else:
            # fallback: concatenate segment texts
            txt = " ".join(s.get("text", "") for s in data.get("segments", []))
            (new_base.with_suffix(".txt")).write_text(txt, encoding="utf-8")

        # copy SRT/VTT if present; otherwise synthesize them from segments
        if src_srt.exists():
            shutil.copy2(src_srt, new_base.with_suffix(".srt"))
        else:
            # synthesize SRT (comma as the decimal separator)
            def fmt_ts(t):
                h = int(t // 3600); m = int((t % 3600) // 60); s = t - (h * 3600 + m * 60)
                return f"{h:02}:{m:02}:{s:06.3f}".replace('.', ',')
            with open(new_base.with_suffix(".srt"), "w", encoding="utf-8") as srt:
                for i, s in enumerate(data.get("segments", []), 1):
                    srt.write(f"{i}\n{fmt_ts(s.get('start',0.0))} --> {fmt_ts(s.get('end',0.0))}\n{s.get('text','').strip()}\n\n")
        if src_vtt.exists():
            shutil.copy2(src_vtt, new_base.with_suffix(".vtt"))
        else:
            # synthesize VTT from segments (dot as the decimal separator)
            def fmt_ts_vtt(t):
                h = int(t // 3600); m = int((t % 3600) // 60); s = t - (h * 3600 + m * 60)
                return f"{h:02}:{m:02}:{s:06.3f}"
            with open(new_base.with_suffix(".vtt"), "w", encoding="utf-8") as vtt:
                vtt.write("WEBVTT\n\n")
                for s in data.get("segments", []):
                    vtt.write(f"{fmt_ts_vtt(s.get('start',0.0))} --> {fmt_ts_vtt(s.get('end',0.0))}\n{s.get('text','').strip()}\n\n")

        # ensure a sidecar next to the media
        try:
            lang = (data.get("language") or DEFAULT_TRANSCRIPT_LANG).split("-")[0]
            ensure_sidecar_next_to_media(new_base.with_suffix(".srt"), media_path, lang=lang)
        except Exception:
            pass

        return new_base
    except Exception as e:
        print(f"[resolver] failed to reuse repo transcript: {e}", flush=True)
        return None
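
# e.g. fmt_ts(3661.5) -> "01:01:01,500" (SRT) and fmt_ts_vtt(3661.5) ->
# "01:01:01.500" (WebVTT); the two formats differ only in the decimal separator.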


def transcript_text_from_file(path: Path) -> str:
    """Extract plain text from .txt/.srt/.vtt by stripping timestamps and counters."""
    try:
@@ -84,8 +348,10 @@ def transcript_text_from_file(path: Path) -> str:


def ensure_sidecar_next_to_media(sidecar: Path, media_path: Path, lang: str = "en") -> None:
    """Ensure a `.lang.srt` sits next to the media for Plex. Convert VTT→SRT if needed. If the sidecar is .txt, do nothing."""
    try:
        if sidecar.suffix.lower() == ".txt":
            return
        if sidecar.suffix.lower() == ".srt":
            dst = media_path.with_suffix(f".{lang}.srt")
            shutil.copy2(sidecar, dst)
@@ -407,6 +673,19 @@ def handle_local_file(path_str: str):
            "uploader": p.parent.name, "date": "", "path": str(p), "progress": 0}
    log(info)

    # 0) Try the RSS resolver first: if an episode with a transcript exists, use it (skip Whisper)
    try:
        ep = match_media_to_rss(p)
    except Exception:
        ep = None
    if ep:
        base = use_rss_transcript(p, ep)
        if base:
            index_meili(base.with_suffix(".json"))
            publish_to_openwebui([base.with_suffix(".txt")])
            log({**info, **{"status": "done", "note": "used_rss_transcript"}})
            return

    # 1) Prefer an existing transcript sidecar if present
    sidecar = find_sidecar_transcript(p)
    if sidecar:
@@ -419,6 +698,16 @@ def handle_local_file(path_str: str):
        log({**info, **{"status": "done", "note": "used_existing_transcript"}})
        return

    # 1.5) Reuse a transcript that already exists in the repository for a matching episode
    repo_json = find_repo_transcript_for_media(p)
    if repo_json:
        base = reuse_repo_transcript(p, repo_json)
        if base:
            index_meili(base.with_suffix(".json"))
            publish_to_openwebui([base.with_suffix(".txt")])
            log({**info, **{"status": "done", "note": "reused_repo_transcript"}})
            return

    # 2) Otherwise, run transcription
    base = transcribe(p)
    index_meili(base.with_suffix(".json"))
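
    # In short: RSS transcript -> sidecar next to the file -> repository reuse
    # -> Whisper; transcription only runs when no existing text can be found.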
@@ -464,7 +753,23 @@ def handle_url(url: str):
         "date": (re.findall(r"\b(\d{8})\b", dest.stem)[0] if re.findall(r"\b(\d{8})\b", dest.stem) else ""),
         "path": str(dest)})
    log({**info, **{"status":"transcribing", "progress": 0}})
    # Try the RSS transcript resolver first
    ep = None
    try:
        ep = match_media_to_rss(dest)
    except Exception:
        ep = None
    if ep:
        base = use_rss_transcript(dest, ep)
    else:
        base = None
    # 1.5) If we didn't get an RSS transcript and a matching one already exists in the repo, reuse it
    if not base:
        repo_json = find_repo_transcript_for_media(dest)
        if repo_json:
            base = reuse_repo_transcript(dest, repo_json)
    if not base:
        base = transcribe(dest)
    index_meili(base.with_suffix(".json"))
    publish_to_openwebui([base.with_suffix(".txt")])
    log({**info, **{"status":"done"}})