Adding second worker
@@ -20,6 +20,11 @@ MEDIA_EXTS = {".mp3", ".m4a", ".flac", ".wav", ".ogg", ".opus", ".mp4", ".m4v",
 # Fuzzy title match threshold for media ↔ transcript pairing
 TITLE_MATCH_THRESHOLD = float(os.getenv("RSS_TITLE_MATCH_THRESHOLD", "0.60"))
 
+# Download podcast audio (enclosures) to a local library
+PODCASTS_ROOT = Path(os.getenv("PODCASTS_ROOT", str(LIB / "Podcasts")))
+PODCASTS_PER_SHOW = os.getenv("PODCASTS_PER_SHOW", "true").lower() in {"1","true","yes","y"}
+DOWNLOAD_AUDIO = os.getenv("RSS_DOWNLOAD_AUDIO", "true").lower() in {"1","true","yes","y"}
+
 # Namespace map (extend as needed)
 NS = {
     "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
@@ -31,6 +36,7 @@ NS = {
 
 TRN.mkdir(parents=True, exist_ok=True)
 OUT_INDEX.parent.mkdir(parents=True, exist_ok=True)
+PODCASTS_ROOT.mkdir(parents=True, exist_ok=True)
 
 
 def _text(el):
@@ -235,6 +241,7 @@ def parse_feed(feed_url: str):
         duration_sec = _parse_duration(dur) or None
         enclosure = _find_ns(it, "enclosure")
         audio_url = enclosure.get("url") if enclosure is not None else ""
+        audio_type = enclosure.get("type") if enclosure is not None else ""
 
         if not audio_url:
             for mc in _findall_ns(it, "media:content"):
@@ -253,6 +260,7 @@ def parse_feed(feed_url: str):
             "date": date,
             "duration_sec": duration_sec,
             "audio_url": audio_url,
+            "audio_type": audio_type,
             "language": DEFAULT_LANG,
             "transcripts": transcripts,
         }
@@ -276,6 +284,40 @@ def parse_feed(feed_url: str):
             if created:
                 t["sidecars"] = created
 
+        # Optionally download podcast audio locally
+        local_audio_path = None
+        if DOWNLOAD_AUDIO and audio_url:
+            show_dir = PODCASTS_ROOT / _slug(show_title or "Podcast") if PODCASTS_PER_SHOW else PODCASTS_ROOT
+            base_name = f"{(date or '00000000')} - {_slug(title or guid or 'episode')}"
+            ext = _guess_audio_ext(audio_type, audio_url)
+            target = (show_dir / base_name).with_suffix(ext)
+            # Avoid re-download if already exists
+            if not target.exists():
+                saved = _download_stream(audio_url, target)
+                if saved is None:
+                    # Try a non-streaming fallback
+                    saved = _download(audio_url, target)
+            else:
+                saved = target
+            if saved and saved.exists():
+                local_audio_path = saved
+                # If we previously downloaded transcript sidecars, try to place them next to this audio
+                for t in item_rec.get("transcripts", []) or []:
+                    lp = t.get("local_path")
+                    if lp:
+                        try:
+                            lp = Path(lp)
+                            if lp.exists() and lp.suffix.lower() in {'.srt','.vtt','.txt'}:
+                                sc = _sidecar_path_for(local_audio_path, t.get('language') or DEFAULT_LANG, lp.suffix.lower())
+                                if not sc.exists():
+                                    sc.parent.mkdir(parents=True, exist_ok=True)
+                                    shutil.copy2(lp, sc)
+                                t.setdefault("sidecars", []).append(str(sc))
+                        except Exception:
+                            pass
+        if local_audio_path:
+            item_rec["local_audio"] = str(local_audio_path)
+
         items.append(item_rec)
 
     return {"feed_url": feed_url, "show": show_title, "episodes": items}
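Side note: the download block above calls a _guess_audio_ext(audio_type, audio_url) helper that lives elsewhere in this file and is not shown in the hunk. A minimal sketch of what such a helper presumably does (map the enclosure MIME type to a file suffix, falling back to the URL's extension); only the signature is taken from the call site, the body is an assumption:

    from pathlib import Path

    def _guess_audio_ext(audio_type: str, audio_url: str) -> str:
        # Hypothetical sketch: common podcast enclosure MIME types -> suffixes.
        mime_map = {
            "audio/mpeg": ".mp3", "audio/mp4": ".m4a", "audio/x-m4a": ".m4a",
            "audio/ogg": ".ogg", "audio/opus": ".opus", "audio/flac": ".flac", "audio/wav": ".wav",
        }
        ext = mime_map.get((audio_type or "").split(";")[0].strip().lower())
        if ext:
            return ext
        # Otherwise use whatever extension the enclosure URL carries, defaulting to .mp3.
        suffix = Path(audio_url.split("?")[0]).suffix.lower()
        return suffix or ".mp3"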
app/worker.py
@@ -2,6 +2,7 @@ import os, subprocess, shutil, json, re, orjson, requests
 from pathlib import Path
 import math
 import difflib
+import time
 from faster_whisper import WhisperModel
 
 from xml.sax.saxutils import escape as xml_escape
@@ -11,10 +12,21 @@ MEILI_KEY = os.getenv("MEILI_KEY", "")
 LIB = Path(os.getenv("LIBRARY_ROOT", "/library"))
 TRN = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
 TMP = Path(os.getenv("TMP_ROOT", "/tmpdl"))
 
 MODEL_NAME = os.getenv("WHISPER_MODEL","large-v3")
 COMPUTE = os.getenv("WHISPER_PRECISION","int8")
 WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "auto").strip()
+
+# Whisper device/config controls
+WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "auto").strip()
+WHISPER_DEVICE_INDEX = int(os.getenv("WHISPER_DEVICE_INDEX", "0"))
+WHISPER_CPU_THREADS = int(os.getenv("WHISPER_CPU_THREADS", "4"))
+
+# Whisper logging & resume controls
+WHISPER_LOG_SEGMENTS = os.getenv("WHISPER_LOG_SEGMENTS", "1") not in ("0", "false", "False")
+WHISPER_RESUME = os.getenv("WHISPER_RESUME", "1") not in ("0", "false", "False")
+PARTIAL_SAVE_EVERY_SEGS = int(os.getenv("WHISPER_PARTIAL_SAVE_EVERY_SEGS", "20"))
+
 # RSS resolver config
 RSS_INDEX_PATH = Path(os.getenv("RSS_INDEX_PATH", "/transcripts/rss_index.json"))
 RSS_DURATION_TOLERANCE = int(os.getenv("RSS_DURATION_TOLERANCE", "150")) # seconds
@@ -34,9 +46,55 @@ _model = None
 def get_model():
     global _model
     if _model is None:
-        _model = WhisperModel(MODEL_NAME, compute_type=COMPUTE)
+        print(f"[whisper] loading model='{MODEL_NAME}' device='{WHISPER_DEVICE}' idx={WHISPER_DEVICE_INDEX} compute='{COMPUTE}' threads={WHISPER_CPU_THREADS}", flush=True)
+        _model = WhisperModel(
+            MODEL_NAME,
+            device=WHISPER_DEVICE,
+            device_index=WHISPER_DEVICE_INDEX,
+            compute_type=COMPUTE,
+            cpu_threads=WHISPER_CPU_THREADS,
+        )
     return _model
 
+# --- Helper: Reset model with new device and device_index ---
+def reset_model(device: str, device_index: int | None = None):
+    """Reset the global _model to a new WhisperModel with the given device and device_index."""
+    global _model
+    idx = device_index if device_index is not None else WHISPER_DEVICE_INDEX
+    print(f"[whisper] resetting model='{MODEL_NAME}' device='{device}' idx={idx} compute='{COMPUTE}' threads={WHISPER_CPU_THREADS}", flush=True)
+    _model = WhisperModel(
+        MODEL_NAME,
+        device=device,
+        device_index=idx,
+        compute_type=COMPUTE,
+        cpu_threads=WHISPER_CPU_THREADS,
+    )
+
+# --- Helper: Run transcribe with fallback to CPU on GPU/oom errors ---
+def run_transcribe_with_fallback(wav_path: Path, lang):
+    """
+    Try to transcribe with current model; on GPU/CUDA/HIP/ROCm/OOM errors, reset to CPU and retry once.
+    Returns (segments, info) or raises exception.
+    """
+    model = get_model()
+    try:
+        return model.transcribe(str(wav_path), vad_filter=True, language=lang)
+    except Exception as e:
+        msg = str(e)
+        gpu_errs = [
+            "CUDA", "cublas", "out of memory", "HIP", "ROCm", "device-side assert", "CUDNN", "cudaError", "cuda runtime", "cudaMalloc"
+        ]
+        if any(err.lower() in msg.lower() for err in gpu_errs):
+            print(f"[whisper] GPU error detected: '{msg}'. Retrying on CPU...", flush=True)
+            reset_model("cpu", 0)
+            try:
+                model = get_model()
+                return model.transcribe(str(wav_path), vad_filter=True, language=lang)
+            except Exception as e2:
+                print(f"[whisper] CPU fallback also failed: {e2}", flush=True)
+                raise
+        raise
+
 def log(feed):
     try:
         with open(TRN / "_feed.log", "a", encoding="utf-8") as f:
@@ -216,6 +274,11 @@ def use_rss_transcript(media_path: Path, ep: dict) -> Path | None:
         txt_path = base.with_suffix(".txt")
         transcript_text = txt_path.read_text(encoding="utf-8") if txt_path.exists() else None
         write_episode_nfo(media_path, meta, transcript_text)
+        # Save local artwork for Plex/Kodi
+        try:
+            save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
+        except Exception:
+            pass
     except Exception as e:
         print(f"[post] NFO write failed: {e}", flush=True)
     return base
@@ -351,6 +414,11 @@ def reuse_repo_transcript(media_path: Path, repo_json: Path) -> Path | None:
         txtp = new_base.with_suffix(".txt")
         ttxt = txtp.read_text(encoding="utf-8") if txtp.exists() else None
         write_episode_nfo(media_path, meta, ttxt)
+        # Save local artwork for Plex/Kodi
+        try:
+            save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
+        except Exception:
+            pass
     except Exception as e:
         print(f"[post] NFO write failed: {e}", flush=True)
 
@@ -403,6 +471,161 @@ def ensure_sidecar_next_to_media(sidecar: Path, media_path: Path, lang: str = "e
         print(f"[post] sidecar copy/convert failed: {e}", flush=True)
 
 
+# --- small helpers for progress/ETA formatting ---
+def _fmt_eta(sec: float) -> str:
+    try:
+        sec = max(0, int(sec))
+        h, rem = divmod(sec, 3600)
+        m, s = divmod(rem, 60)
+        if h:
+            return f"{h}h {m}m {s}s"
+        if m:
+            return f"{m}m {s}s"
+        return f"{s}s"
+    except Exception:
+        return ""
+
+
+def save_episode_artwork(image_url: str | None, media_path: Path, show_title: str | None = None):
+    """Download episode artwork from image_url and save next to the media as '<basename>.jpg'.
+    Also drop a folder-level 'poster.jpg' for the show directory if not present.
+    Best-effort; failures are logged but non-fatal.
+    """
+    if not image_url:
+        return
+    try:
+        resp = requests.get(image_url, timeout=30, stream=True)
+        resp.raise_for_status()
+        # Determine content-type and write a temporary file
+        ctype = (resp.headers.get("Content-Type") or "").lower()
+        tmp_file = media_path.with_suffix(".art.tmp")
+        with open(tmp_file, "wb") as out:
+            for chunk in resp.iter_content(chunk_size=8192):
+                if chunk:
+                    out.write(chunk)
+
+        # Always provide a .jpg next to the media for Plex
+        episode_jpg = media_path.with_suffix(".jpg")
+        if "image/jpeg" in ctype:
+            # Already JPEG
+            shutil.move(str(tmp_file), str(episode_jpg))
+        else:
+            # Try converting to JPEG with ffmpeg; if it fails, keep bytes as-is
+            try:
+                subprocess.run(
+                    ["ffmpeg", "-nostdin", "-y", "-i", str(tmp_file), str(episode_jpg)],
+                    stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True
+                )
+                try:
+                    tmp_file.unlink()
+                except Exception:
+                    pass
+            except Exception:
+                shutil.move(str(tmp_file), str(episode_jpg))
+
+        # Also drop a folder poster once per show (helps Plex folder views)
+        try:
+            show_poster = media_path.parent / "poster.jpg"
+            if not show_poster.exists():
+                shutil.copy2(episode_jpg, show_poster)
+        except Exception:
+            pass
+
+    except Exception as e:
+        print(f"[post] artwork download failed: {e}", flush=True)
+
+
+def find_companion_files(src: Path) -> dict:
+    """Return likely yt-dlp companion files for a downloaded media file."""
+    out = {}
+    # info.json can be either "<name>.<ext>.info.json" or "<name>.info.json"
+    cands_info = [
+        src.parent / f"{src.name}.info.json",
+        src.parent / f"{src.stem}.info.json",
+    ]
+    out["info"] = next((p for p in cands_info if p.exists()), None)
+
+    # thumbnails may be "<name>.<ext>.jpg" or "<name>.jpg" (we convert to jpg)
+    cand_thumbs = [
+        src.parent / f"{src.name}.jpg",
+        src.parent / f"{src.stem}.jpg",
+        src.parent / f"{src.stem}.jpeg",
+        src.parent / f"{src.stem}.png",
+        src.parent / f"{src.stem}.webp",
+    ]
+    out["thumb"] = next((p for p in cand_thumbs if p.exists()), None)
+
+    # subtitles (keep multiple)
+    subs = []
+    for s in src.parent.glob(f"{src.stem}*.srt"):
+        subs.append(s)
+    for s in src.parent.glob(f"{src.stem}*.vtt"):
+        subs.append(s)
+    out["subs"] = subs
+    return out
+
+def load_info_json(path: Path) -> dict | None:
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except Exception:
+        return None
+
+def _iso_from_yyyymmdd(s: str | None) -> str | None:
+    if not s or not re.match(r"^\d{8}$", s):
+        return None
+    return f"{s[0:4]}-{s[4:6]}-{s[6:8]}"
+
+def build_meta_from_sources(media_path: Path, uploader: str, fallback_meta: dict, ep: dict | None = None) -> dict:
+    """
+    Merge metadata from (priority): RSS episode `ep` -> yt-dlp info.json (if present) -> fallback.
+    Returns a dict compatible with write_episode_nfo().
+    """
+    # Start with fallback
+    meta = dict(fallback_meta)
+
+    # Augment from info.json if present
+    info = None
+    for cand in [
+        media_path.parent / f"{media_path.name}.info.json",
+        media_path.parent / f"{media_path.stem}.info.json",
+    ]:
+        if cand.exists():
+            info = load_info_json(cand)
+            break
+    if info:
+        meta.setdefault("title", info.get("title"))
+        meta.setdefault("episode_title", info.get("title"))
+        meta.setdefault("description", info.get("description") or info.get("fulltitle"))
+        # upload_date is YYYYMMDD
+        iso = _iso_from_yyyymmdd(info.get("upload_date"))
+        if iso:
+            meta["pubdate_iso"] = iso
+        # Prefer video duration if present
+        if not meta.get("duration_sec") and info.get("duration"):
+            meta["duration_sec"] = info.get("duration")
+        # thumbnail URL
+        if not meta.get("image"):
+            meta["image"] = info.get("thumbnail")
+        # show/uploader
+        if not meta.get("show"):
+            meta["show"] = info.get("uploader") or uploader
+
+    # Finally, layer RSS data on top if available (most authoritative for podcasts)
+    if ep:
+        meta.update({
+            "title": ep.get("title") or meta.get("title"),
+            "episode_title": ep.get("title") or meta.get("episode_title"),
+            "show": ep.get("podcast_title") or ep.get("feed_title") or ep.get("show") or meta.get("show") or uploader,
+            "description": ep.get("description") or ep.get("content") or meta.get("description", ""),
+            "pubdate": ep.get("pubdate") or meta.get("pubdate", ""),
+            "pubdate_iso": ep.get("date_iso") or meta.get("pubdate_iso", meta.get("pubdate")),
+            "duration_sec": ep.get("duration_sec") or ep.get("duration") or meta.get("duration_sec"),
+            "image": ep.get("image") or ep.get("image_url") or meta.get("image", ""),
+            "guid": ep.get("guid") or meta.get("guid", ""),
+        })
+
+    return meta
+
 # ---------- Kodi/Plex NFO writer ----------
 from datetime import datetime
@@ -505,8 +728,12 @@ def yt_dlp(url, outdir):
     base_cmd = [
         "yt-dlp", "-o", outtmpl,
         "-f", "bv*+ba/best",
-        "-x", "--audio-format", "m4a",
+        "--write-info-json",
         "--write-thumbnail",
+        "--convert-thumbnails", "jpg",
+        "--write-subs", "--write-auto-subs",
+        "--sub-langs", os.getenv("YTDLP_SUBS_LANGS", "en.*,en"),
+        "--convert-subs", "srt",
         "--no-playlist", "--no-warnings", "--restrict-filenames",
     ]
 
@@ -527,10 +754,13 @@ def yt_dlp(url, outdir):
         ]
         subprocess.check_call(retry_cmd)
 
-    media = (list(outdir.rglob("*.[mM][pP]4")) +
-             list(outdir.rglob("*.mkv")) +
-             list(outdir.rglob("*.m4a")) +
-             list(outdir.rglob("*.mp3")))
+    media = (
+        list(outdir.rglob("*.[mM][pP]4")) +
+        list(outdir.rglob("*.mkv")) +
+        list(outdir.rglob("*.webm")) +
+        list(outdir.rglob("*.m4a")) +
+        list(outdir.rglob("*.mp3"))
+    )
     return sorted(media, key=lambda p: p.stat().st_mtime)[-1:]
 
 def extract_audio(src: Path, outdir: Path) -> Path:
@@ -550,6 +780,27 @@ def extract_audio(src: Path, outdir: Path) -> Path:
         raise RuntimeError(f"ffmpeg extract failed: {e.output.decode(errors='ignore')}")
     return wav_path
 
+# --- WAV trimming helper ---
+def trim_wav(src_wav: Path, start_sec: float, outdir: Path) -> Path:
+    """Return a trimmed 16k mono WAV starting at start_sec from src_wav."""
+    outdir.mkdir(parents=True, exist_ok=True)
+    if not start_sec or start_sec <= 0.0:
+        return src_wav
+    dst = outdir / (src_wav.stem + f".from_{int(start_sec)}s.wav")
+    try:
+        subprocess.check_output([
+            "ffmpeg", "-nostdin", "-y",
+            "-ss", str(max(0.0, float(start_sec))),
+            "-i", str(src_wav),
+            "-vn", "-ac", "1", "-ar", "16000",
+            "-f", "wav", str(dst),
+        ], stderr=subprocess.STDOUT)
+        return dst
+    except subprocess.CalledProcessError as e:
+        # If trimming fails, fall back to full file
+        print(f"[whisper] trim failed, using full WAV: {e.output.decode(errors='ignore')}", flush=True)
+        return src_wav
+
 def media_duration_seconds(path: Path) -> float:
     """Return duration in seconds using ffprobe; fallback to 0.0 on error."""
     try:
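Side note: the effect of trim_wav() is easiest to see with concrete values; with a non-positive offset it returns the source WAV untouched, otherwise it re-encodes from the offset into a sibling file named after the integer offset. A small check of that naming, using only the logic shown above (paths are illustrative):

    from pathlib import Path

    src = Path("/tmpdl/episode.wav")
    # With a positive offset the helper targets "<stem>.from_<int(offset)>s.wav" in outdir.
    expected = Path("/tmpdl") / (src.stem + f".from_{int(1234.5)}s.wav")
    assert expected.name == "episode.from_1234s.wav"
    # With start_sec <= 0 (or falsy) trim_wav() returns src unchanged, so no ffmpeg call is made.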
@@ -561,32 +812,91 @@ def media_duration_seconds(path: Path) -> float:
     except Exception:
         return 0.0
 
+# --- Partial transcript helpers ---
+def _partial_paths(title: str) -> tuple[Path, Path]:
+    base = TRN / title
+    return base.with_suffix(".partial.json"), base.with_suffix(".partial.txt")
+
+def _save_partial(title: str, language: str, segs: list[dict]):
+    pjson, ptxt = _partial_paths(title)
+    try:
+        # Save JSON
+        pjson.write_bytes(orjson.dumps({"file": str((TRN / title).with_suffix('.wav')), "language": language, "segments": segs}))
+    except Exception as e:
+        print(f"[whisper] partial json save failed: {e}", flush=True)
+    try:
+        # Save TXT snapshot
+        ptxt.write_text(" ".join(s.get("text","") for s in segs), encoding="utf-8")
+    except Exception as e:
+        print(f"[whisper] partial txt save failed: {e}", flush=True)
+
 def transcribe(media_path: Path):
     model = get_model()
+    print(f"[whisper] start transcribe: {media_path}", flush=True)
     # 1) Robustly extract audio to 16k mono WAV (fixes pyAV/webm edge cases)
     wav = extract_audio(media_path, TMP)
 
-    # 2) Language
-    lang = None if WHISPER_LANGUAGE.lower() == "auto" else WHISPER_LANGUAGE
-
-    # 3) Transcribe
-    segments, info = model.transcribe(str(wav), vad_filter=True, language=lang)
-
     title = media_path.stem
     base = TRN / title
 
-    # Determine duration for progress; use extracted WAV (accurate for transcription input)
+    # Resume support: if a partial checkpoint exists, load it and trim input
+    resume_segments = []
+    resume_offset = 0.0
+    language_hint = None
+    if WHISPER_RESUME:
+        pjson, ptxt = _partial_paths(title)
+        if pjson.exists():
+            try:
+                pdata = json.loads(pjson.read_text(encoding="utf-8"))
+                resume_segments = pdata.get("segments", []) or []
+                if resume_segments:
+                    resume_offset = float(resume_segments[-1].get("end", 0.0))
+                language_hint = pdata.get("language")
+                print(f"[whisper] resuming from ~{resume_offset:.2f}s with {len(resume_segments)} segments", flush=True)
+            except Exception as e:
+                print(f"[whisper] failed to load partial: {e}", flush=True)
+
+    # If resuming, trim WAV from last end time
+    wav_for_run = trim_wav(wav, resume_offset, TMP)
+
+    # 2) Language selection
+    lang = None if WHISPER_LANGUAGE.lower() == "auto" else WHISPER_LANGUAGE
+    if language_hint and WHISPER_LANGUAGE.lower() == "auto":
+        # carry hint forward if available
+        lang = language_hint
+
+    # 3) Transcribe
+    segments, info = run_transcribe_with_fallback(wav_for_run, lang)
+
+    # Determine duration for progress; use full WAV duration for consistent % regardless of resume
     dur = media_duration_seconds(wav) or 0.0
+    # Start wall clock timer for speed/ETA
+    start_wall = time.time()
+    if WHISPER_RESUME and resume_offset and dur and resume_offset >= dur:
+        print(f"[whisper] resume offset {resume_offset:.2f}s >= duration {dur:.2f}s; resetting resume.", flush=True)
+        resume_offset = 0.0
     last_pct = -1
 
-    segs, text_parts = [], []
+    segs = list(resume_segments)  # start with what we already have
+    text_parts = [s.get("text","") for s in resume_segments]
+
+    # Walk new segments; shift their timestamps by resume_offset if trimmed
+    seg_count_since_save = 0
+    seg_index = len(resume_segments)
    for s in segments:
-        seg = {"start": s.start, "end": s.end, "text": s.text}
+        seg_index += 1
+        start = (s.start or 0.0) + resume_offset
+        end = (s.end or 0.0) + resume_offset
+        seg = {"start": start, "end": end, "text": s.text}
         segs.append(seg)
         text_parts.append(s.text)
+
+        if WHISPER_LOG_SEGMENTS:
+            print(f"[whisper] {start:8.2f}–{end:8.2f} {s.text.strip()}", flush=True)
+
         # progress logging every +5%
-        if dur > 0 and s.end is not None:
-            pct = int(min(100, max(0, (s.end / dur) * 100)))
+        if dur > 0 and end is not None:
+            pct = int(min(100, max(0, (end / dur) * 100)))
             if pct >= last_pct + 5:
                 log({
                     "status": "transcribing",
@@ -595,19 +905,50 @@ def transcribe(media_path: Path):
                     "progress": pct
                 })
                 last_pct = pct
+
+                # compute realtime speed and ETA for console logs
+                try:
+                    elapsed = max(0.001, time.time() - start_wall)
+                    processed = max(0.0, float(end))
+                    speed = (processed / elapsed) if elapsed > 0 else 0.0 # seconds processed per second
+                    # represent as X real-time factor
+                    rtf = speed # 1.0 == real-time
+                    eta = ((dur - processed) / speed) if (speed > 0 and dur > 0) else 0
+                    print(f"[whisper] progress {pct:3d}% seg={seg_index:5d} rtf={rtf:0.2f}x eta={_fmt_eta(eta)}", flush=True)
+                    # also mirror to feed log with speed/eta
+                    try:
+                        log({
+                            "status": "transcribing",
+                            "path": str(media_path),
+                            "title": title,
+                            "progress": pct,
+                            "speed_rtf": round(rtf, 2),
+                            "eta_sec": int(max(0, eta))
+                        })
+                    except Exception:
+                        pass
+                except Exception:
+                    pass
+
+        # periodic partial save
+        seg_count_since_save += 1
+        if WHISPER_RESUME and seg_count_since_save >= PARTIAL_SAVE_EVERY_SEGS:
+            _save_partial(title, info.language or (WHISPER_LANGUAGE if WHISPER_LANGUAGE.lower() != "auto" else "en"), segs)
+            seg_count_since_save = 0
+
     # ensure we mark 100% on completion
     if last_pct < 100:
         log({"status": "transcribing", "path": str(media_path), "title": title, "progress": 100})
 
     txt = " ".join(text_parts).strip()
 
-    # Write transcript artifacts
-    open(base.with_suffix(".json"), "wb").write(orjson.dumps({
+    # Write final transcript artifacts
+    (base.with_suffix(".json")).write_bytes(orjson.dumps({
         "file": str(media_path),
         "language": info.language,
         "segments": segs
    }))
-    open(base.with_suffix(".txt"), "w", encoding="utf-8").write(txt)
+    (base.with_suffix(".txt")).write_text(txt, encoding="utf-8")
 
 def fmt_ts(t):
     h=int(t//3600); m=int((t%3600)//60); s=t-(h*3600+m*60)
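Side note: the speed/ETA logic above is plain real-time-factor arithmetic, so a quick worked check of the same formulas makes the numbers easy to sanity-check:

    # 600 s of audio processed in 120 s of wall time -> rtf 5.0x;
    # with 3600 s of total audio, 3000 s remain -> eta 600 s, i.e. "10m 0s".
    processed, elapsed, dur = 600.0, 120.0, 3600.0
    speed = processed / elapsed            # 5.0x real-time
    eta = (dur - processed) / speed        # 600.0 seconds
    assert (speed, eta) == (5.0, 600.0)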
@@ -630,9 +971,10 @@ def transcribe(media_path: Path):
            shutil.copy2(srt_src, srt_dst)
        except Exception as e:
            print(f"[post] could not copy srt -> {srt_dst}: {e}", flush=True)
-    # Write Kodi/Plex-compatible NFO using basic metadata
+
+    # Write Kodi/Plex-compatible NFO using enhanced metadata (same as before)
     try:
-        meta = {
+        fallback = {
             "title": title,
             "episode_title": title,
             "show": media_path.parent.name,
@@ -642,18 +984,42 @@ def transcribe(media_path: Path):
             "image": "",
             "guid": "",
         }
+        meta = build_meta_from_sources(media_path, media_path.parent.name, fallback, ep=None)
         ttxt = (TRN / title).with_suffix(".txt").read_text(encoding="utf-8")
         write_episode_nfo(media_path, meta, ttxt)
+        try:
+            save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
+        except Exception:
+            pass
     except Exception as e:
         print(f"[post] NFO write failed: {e}", flush=True)
 
-    # Optional: cleanup temporary WAV
+    # Cleanup temp WAVs
     try:
+        if wav_for_run != wav and wav_for_run.exists():
+            wav_for_run.unlink()
         if wav.exists():
             wav.unlink()
     except Exception:
         pass
 
+    # Remove partial checkpoints on success
+    if WHISPER_RESUME:
+        try:
+            pjson, ptxt = _partial_paths(title)
+            if pjson.exists(): pjson.unlink()
+            if ptxt.exists(): ptxt.unlink()
+        except Exception:
+            pass
+
+    # Final average speed over whole transcription
+    try:
+        total_elapsed = max(0.001, time.time() - start_wall)
+        avg_rtf = (dur / total_elapsed) if total_elapsed > 0 else 0.0
+        print(f"[whisper] avg speed ~{avg_rtf:0.2f}x (audio_seconds / wall_seconds)", flush=True)
+    except Exception:
+        pass
+    print(f"[whisper] finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s", flush=True)
     return base
 
 def index_meili(json_path: Path):
@@ -829,7 +1195,8 @@ def handle_local_file(path_str: str):
         index_meili(base.with_suffix(".json"))
         publish_to_openwebui([base.with_suffix(".txt")])
         try:
-            meta = {
+            # Use info.json (if present) to enrich metadata
+            fallback = {
                 "title": title,
                 "episode_title": title,
                 "show": p.parent.name,
@@ -839,8 +1206,14 @@ def handle_local_file(path_str: str):
                 "image": "",
                 "guid": "",
             }
+            meta = build_meta_from_sources(p, p.parent.name, fallback, ep=None)
             ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
             write_episode_nfo(p, meta, ttxt)
+            # Try to fetch and save artwork locally
+            try:
+                save_episode_artwork(meta.get("image"), p, meta.get("show"))
+            except Exception:
+                pass
         except Exception as e:
             print(f"[post] NFO write failed: {e}", flush=True)
         log({**info, **{"status": "done", "note": "used_existing_transcript"}})
@@ -855,7 +1228,8 @@ def handle_local_file(path_str: str):
         publish_to_openwebui([base.with_suffix(".txt")])
         try:
             data = json.loads((base.with_suffix(".json")).read_text(encoding="utf-8"))
-            meta = {
+            # Start with repo metadata, then enrich from yt-dlp info.json if any
+            meta_repo = {
                 "title": data.get("title") or title,
                 "episode_title": data.get("title") or title,
                 "show": data.get("show") or p.parent.name,
@@ -865,8 +1239,13 @@ def handle_local_file(path_str: str):
                 "image": data.get("image"),
                 "guid": data.get("guid") or data.get("id"),
             }
+            meta = build_meta_from_sources(p, p.parent.name, meta_repo, ep=None)
             ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
             write_episode_nfo(p, meta, ttxt)
+            try:
+                save_episode_artwork(meta.get("image"), p, meta.get("show"))
+            except Exception:
+                pass
         except Exception as e:
             print(f"[post] NFO write failed: {e}", flush=True)
         log({**info, **{"status": "done", "note": "reused_repo_transcript"}})
@@ -876,6 +1255,26 @@ def handle_local_file(path_str: str):
         base = transcribe(p)
         index_meili(base.with_suffix(".json"))
         publish_to_openwebui([base.with_suffix(".txt")])
+        try:
+            fallback = {
+                "title": title,
+                "episode_title": title,
+                "show": p.parent.name,
+                "description": "",
+                "pubdate": _extract_date_from_stem(title),
+                "duration_sec": media_duration_seconds(p),
+                "image": "",
+                "guid": "",
+            }
+            meta = build_meta_from_sources(p, p.parent.name, fallback, ep=None)
+            ttxt = (TRN / title).with_suffix(".txt").read_text(encoding="utf-8")
+            write_episode_nfo(p, meta, ttxt)
+            try:
+                save_episode_artwork(meta.get("image"), p, meta.get("show"))
+            except Exception:
+                pass
+        except Exception as e:
+            print(f"[post] NFO write failed: {e}", flush=True)
         log({**info, **{"status": "done"}})
     except Exception as e:
         log({"url": path_str, "status": "error", "error": str(e)})
@@ -913,6 +1312,45 @@ def handle_url(url: str):
         dest_dir.mkdir(parents=True, exist_ok=True)
         dest = dest_dir / sanitize(f.name)
         shutil.move(str(f), dest)
+        # Move companion files produced by yt-dlp (info.json, thumbnail, subtitles)
+        try:
+            companions = find_companion_files(f)
+            # info.json -> prefer "<dest.name>.info.json", fallback to "<dest.stem>.info.json"
+            if companions.get("info") and companions["info"].exists():
+                dest_info = dest.parent / f"{dest.name}.info.json"
+                try:
+                    shutil.move(str(companions["info"]), dest_info)
+                except Exception:
+                    # fallback naming without extension
+                    dest_info2 = dest.parent / f"{dest.stem}.info.json"
+                    try:
+                        shutil.move(str(companions['info']), dest_info2)
+                    except Exception:
+                        pass
+            # thumbnail -> "<dest>.jpg"
+            if companions.get("thumb") and companions["thumb"].exists():
+                try:
+                    shutil.move(str(companions["thumb"]), str(dest.with_suffix(".jpg")))
+                except Exception:
+                    pass
+            # subtitles -> preserve language suffix: "<dest.stem><suffix>"
+            for s in companions.get("subs", []):
+                if not s.exists():
+                    continue
+                suffix_tail = ""
+                s_name = s.name
+                f_stem = f.stem
+                if s_name.startswith(f_stem):
+                    suffix_tail = s_name[len(f_stem):] # includes leading dot if present
+                else:
+                    suffix_tail = s.suffix
+                dest_sub = dest.parent / f"{dest.stem}{suffix_tail}"
+                try:
+                    shutil.move(str(s), str(dest_sub))
+                except Exception:
+                    pass
+        except Exception:
+            pass
         info.update({"title": dest.stem, "uploader": uploader,
                      "date": (re.findall(r"\b(\d{8})\b", dest.stem)[0] if re.findall(r"\b(\d{8})\b", dest.stem) else ""),
                      "path": str(dest)})
@@ -937,34 +1375,28 @@ def handle_url(url: str):
         index_meili(base.with_suffix(".json"))
         publish_to_openwebui([base.with_suffix(".txt")])
         try:
-            if 'ep' in locals() and ep:
-                meta = {
-                    "title": ep.get("title"),
-                    "episode_title": ep.get("title"),
-                    "show": ep.get("podcast_title") or ep.get("feed_title") or ep.get("show") or uploader,
-                    "description": ep.get("description") or ep.get("content"),
-                    "pubdate": ep.get("pubdate"),
-                    "pubdate_iso": ep.get("date_iso"),
-                    "duration_sec": ep.get("duration_sec") or ep.get("duration") or media_duration_seconds(dest),
-                    "image": ep.get("image") or ep.get("image_url"),
-                    "guid": ep.get("guid"),
-                }
-            else:
-                meta = {
-                    "title": dest.stem,
-                    "episode_title": dest.stem,
-                    "show": uploader,
-                    "description": "",
-                    "pubdate": _extract_date_from_stem(dest.stem),
-                    "duration_sec": media_duration_seconds(dest),
-                    "image": "",
-                    "guid": "",
-                }
+            # Build metadata from RSS (if matched), yt-dlp info.json, and sensible fallbacks
+            fallback = {
+                "title": dest.stem,
+                "episode_title": dest.stem,
+                "show": uploader,
+                "description": "",
+                "pubdate": _extract_date_from_stem(dest.stem),
+                "duration_sec": media_duration_seconds(dest),
+                "image": "",
+                "guid": "",
+            }
+            meta = build_meta_from_sources(dest, uploader, fallback, ep if 'ep' in locals() else None)
             ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
             write_episode_nfo(dest, meta, ttxt)
+            # Save local artwork for Plex/Kodi from meta image url
+            try:
+                save_episode_artwork(meta.get("image"), dest, meta.get("show"))
+            except Exception:
+                pass
         except Exception as e:
             print(f"[post] NFO write failed: {e}", flush=True)
         log({**info, **{"status":"done"}})
     except Exception as e:
         log({"url": url, "status":"error", "error": str(e)})
         raise
@@ -40,6 +40,42 @@ services:
       TMP_ROOT: /tmpdl
       WHISPER_MODEL: large-v3
       WHISPER_PRECISION: int8
+      WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
+      WHISPER_RESUME: ${WHISPER_RESUME:-1}
+      WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
+      PYTHONPATH: /app
+      JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
+      JOB_TTL: ${JOB_TTL:-86400}
+      RESULT_TTL: ${RESULT_TTL:-86400}
+      FAILURE_TTL: ${FAILURE_TTL:-86400}
+    volumes:
+      - ${LIBRARY_HOST_DIR:-./library}:/library
+      - ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts
+      - ${TMP_HOST_DIR:-./tmp}:/tmpdl
+      - ${MODELS_HOST_DIR:-./models}:/root/.cache/huggingface
+    depends_on: [meili, redis]
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD-SHELL", "exit 0"]
+    extra_hosts:
+      - host.docker.internal:host-gateway
+
+  podx-worker-transcribe:
+    build: ./app
+    container_name: podx-worker-transcribe
+    command: ["rq", "worker", "-u", "redis://redis:6379/0", "transcribe"]
+    env_file: [.env]
+    environment:
+      MEILI_URL: http://meili:7700
+      REDIS_URL: redis://redis:6379/0
+      LIBRARY_ROOT: /library
+      TRANSCRIPT_ROOT: /transcripts
+      TMP_ROOT: /tmpdl
+      WHISPER_MODEL: large-v3
+      WHISPER_PRECISION: int8
+      WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
+      WHISPER_RESUME: ${WHISPER_RESUME:-1}
+      WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
       PYTHONPATH: /app
       JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
       JOB_TTL: ${JOB_TTL:-86400}
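Side note: with the second service consuming only the "transcribe" queue, jobs have to be enqueued onto that queue by name for it to pick them up. A minimal sketch with RQ; the dotted job path and sample arguments are assumptions (the actual enqueue call lives outside this diff), only the queue name and Redis URL come from the compose file above:

    from redis import Redis
    from rq import Queue

    # Same Redis instance and queue name the new worker listens on.
    q = Queue("transcribe", connection=Redis.from_url("redis://redis:6379/0"))
    # Hypothetical job path/argument for illustration only.
    q.enqueue("worker.handle_local_file", "/library/Podcasts/Show/episode.mp3", job_timeout=14400)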
@@ -90,7 +126,7 @@ services:
      # - COOKIE_FILE=/config/cookies.txt
      # Optional: yt-dlp options (JSON). Example enables Android client fallback
      # - YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}}}
-      - YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}},"extract_flat":"in_playlist","concurrent_fragment_downloads":1}
+      - YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}},"extract_flat":"in_playlist","concurrent_fragment_downloads":1,"writesubtitles":true,"writeautomaticsub":true,"subtitleslangs":["en.*"],"convertsubs":"srt","writeinfojson":true,"writethumbnail":true,"convertthumbnails":"jpg"}
     volumes:
       - ${LIBRARY_HOST_DIR:-./library}:/downloads
       # Optional cookies file on host → /config/cookies.txt inside container