diff --git a/app/rss_ingest.py b/app/rss_ingest.py
index fc1d46b..c4043dc 100644
--- a/app/rss_ingest.py
+++ b/app/rss_ingest.py
@@ -20,6 +20,11 @@ MEDIA_EXTS = {".mp3", ".m4a", ".flac", ".wav", ".ogg", ".opus", ".mp4", ".m4v",
 # Fuzzy title match threshold for media ↔ transcript pairing
 TITLE_MATCH_THRESHOLD = float(os.getenv("RSS_TITLE_MATCH_THRESHOLD", "0.60"))
 
+# Download podcast audio (enclosures) to a local library
+PODCASTS_ROOT = Path(os.getenv("PODCASTS_ROOT", str(LIB / "Podcasts")))
+PODCASTS_PER_SHOW = os.getenv("PODCASTS_PER_SHOW", "true").lower() in {"1","true","yes","y"}
+DOWNLOAD_AUDIO = os.getenv("RSS_DOWNLOAD_AUDIO", "true").lower() in {"1","true","yes","y"}
+
 # Namespace map (extend as needed)
 NS = {
     "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
@@ -31,6 +36,7 @@ NS = {
 
 TRN.mkdir(parents=True, exist_ok=True)
 OUT_INDEX.parent.mkdir(parents=True, exist_ok=True)
+PODCASTS_ROOT.mkdir(parents=True, exist_ok=True)
 
 
 def _text(el):
@@ -235,6 +241,7 @@ def parse_feed(feed_url: str):
         duration_sec = _parse_duration(dur) or None
         enclosure = _find_ns(it, "enclosure")
         audio_url = enclosure.get("url") if enclosure is not None else ""
+        audio_type = enclosure.get("type") if enclosure is not None else ""
 
         if not audio_url:
             for mc in _findall_ns(it, "media:content"):
@@ -253,6 +260,7 @@ def parse_feed(feed_url: str):
             "date": date,
             "duration_sec": duration_sec,
             "audio_url": audio_url,
+            "audio_type": audio_type,
             "language": DEFAULT_LANG,
             "transcripts": transcripts,
         }
@@ -276,6 +284,40 @@ def parse_feed(feed_url: str):
                 if created:
                     t["sidecars"] = created
 
+        # Optionally download podcast audio locally
+        local_audio_path = None
+        if DOWNLOAD_AUDIO and audio_url:
+            show_dir = PODCASTS_ROOT / _slug(show_title or "Podcast") if PODCASTS_PER_SHOW else PODCASTS_ROOT
+            base_name = f"{(date or '00000000')} - {_slug(title or guid or 'episode')}"
+            ext = _guess_audio_ext(audio_type, audio_url)
+            target = (show_dir / base_name).with_suffix(ext)
+            # Avoid re-download if already exists
+            if not target.exists():
+                saved = _download_stream(audio_url, target)
+                if saved is None:
+                    # Try a non-streaming fallback
+                    saved = _download(audio_url, target)
+            else:
+                saved = target
+            if saved and saved.exists():
+                local_audio_path = saved
+                # If we previously downloaded transcript sidecars, try to place them next to this audio
+                for t in item_rec.get("transcripts", []) or []:
+                    lp = t.get("local_path")
+                    if lp:
+                        try:
+                            lp = Path(lp)
+                            if lp.exists() and lp.suffix.lower() in {'.srt','.vtt','.txt'}:
+                                sc = _sidecar_path_for(local_audio_path, t.get('language') or DEFAULT_LANG, lp.suffix.lower())
+                                if not sc.exists():
+                                    sc.parent.mkdir(parents=True, exist_ok=True)
+                                    shutil.copy2(lp, sc)
+                                    t.setdefault("sidecars", []).append(str(sc))
+                        except Exception:
+                            pass
+        if local_audio_path:
+            item_rec["local_audio"] = str(local_audio_path)
+
         items.append(item_rec)
 
     return {"feed_url": feed_url, "show": show_title, "episodes": items}
diff --git a/app/worker.py b/app/worker.py
index b68ccee..6e051ac 100644
--- a/app/worker.py
+++ b/app/worker.py
@@ -2,6 +2,7 @@ import os, subprocess, shutil, json, re, orjson, requests
 from pathlib import Path
 import math
 import difflib
+import time
 from faster_whisper import WhisperModel
 from xml.sax.saxutils import escape as xml_escape
 
@@ -11,10 +12,21 @@ MEILI_KEY = os.getenv("MEILI_KEY", "")
 LIB = Path(os.getenv("LIBRARY_ROOT", "/library"))
 TRN = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
 TMP = Path(os.getenv("TMP_ROOT", "/tmpdl"))
+
os.getenv("WHISPER_MODEL","large-v3") COMPUTE = os.getenv("WHISPER_PRECISION","int8") WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "auto").strip() +# Whisper device/config controls +WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "auto").strip() +WHISPER_DEVICE_INDEX = int(os.getenv("WHISPER_DEVICE_INDEX", "0")) +WHISPER_CPU_THREADS = int(os.getenv("WHISPER_CPU_THREADS", "4")) + +# Whisper logging & resume controls +WHISPER_LOG_SEGMENTS = os.getenv("WHISPER_LOG_SEGMENTS", "1") not in ("0", "false", "False") +WHISPER_RESUME = os.getenv("WHISPER_RESUME", "1") not in ("0", "false", "False") +PARTIAL_SAVE_EVERY_SEGS = int(os.getenv("WHISPER_PARTIAL_SAVE_EVERY_SEGS", "20")) + # RSS resolver config RSS_INDEX_PATH = Path(os.getenv("RSS_INDEX_PATH", "/transcripts/rss_index.json")) RSS_DURATION_TOLERANCE = int(os.getenv("RSS_DURATION_TOLERANCE", "150")) # seconds @@ -34,9 +46,55 @@ _model = None def get_model(): global _model if _model is None: - _model = WhisperModel(MODEL_NAME, compute_type=COMPUTE) + print(f"[whisper] loading model='{MODEL_NAME}' device='{WHISPER_DEVICE}' idx={WHISPER_DEVICE_INDEX} compute='{COMPUTE}' threads={WHISPER_CPU_THREADS}", flush=True) + _model = WhisperModel( + MODEL_NAME, + device=WHISPER_DEVICE, + device_index=WHISPER_DEVICE_INDEX, + compute_type=COMPUTE, + cpu_threads=WHISPER_CPU_THREADS, + ) return _model +# --- Helper: Reset model with new device and device_index --- +def reset_model(device: str, device_index: int | None = None): + """Reset the global _model to a new WhisperModel with the given device and device_index.""" + global _model + idx = device_index if device_index is not None else WHISPER_DEVICE_INDEX + print(f"[whisper] resetting model='{MODEL_NAME}' device='{device}' idx={idx} compute='{COMPUTE}' threads={WHISPER_CPU_THREADS}", flush=True) + _model = WhisperModel( + MODEL_NAME, + device=device, + device_index=idx, + compute_type=COMPUTE, + cpu_threads=WHISPER_CPU_THREADS, + ) + +# --- Helper: Run transcribe with fallback to CPU on GPU/oom errors --- +def run_transcribe_with_fallback(wav_path: Path, lang): + """ + Try to transcribe with current model; on GPU/CUDA/HIP/ROCm/OOM errors, reset to CPU and retry once. + Returns (segments, info) or raises exception. + """ + model = get_model() + try: + return model.transcribe(str(wav_path), vad_filter=True, language=lang) + except Exception as e: + msg = str(e) + gpu_errs = [ + "CUDA", "cublas", "out of memory", "HIP", "ROCm", "device-side assert", "CUDNN", "cudaError", "cuda runtime", "cudaMalloc" + ] + if any(err.lower() in msg.lower() for err in gpu_errs): + print(f"[whisper] GPU error detected: '{msg}'. 
+            reset_model("cpu", 0)
+            try:
+                model = get_model()
+                return model.transcribe(str(wav_path), vad_filter=True, language=lang)
+            except Exception as e2:
+                print(f"[whisper] CPU fallback also failed: {e2}", flush=True)
+                raise
+        raise
+
 def log(feed):
     try:
         with open(TRN / "_feed.log", "a", encoding="utf-8") as f:
@@ -216,6 +274,11 @@ def use_rss_transcript(media_path: Path, ep: dict) -> Path | None:
         txt_path = base.with_suffix(".txt")
         transcript_text = txt_path.read_text(encoding="utf-8") if txt_path.exists() else None
         write_episode_nfo(media_path, meta, transcript_text)
+        # Save local artwork for Plex/Kodi
+        try:
+            save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
+        except Exception:
+            pass
     except Exception as e:
         print(f"[post] NFO write failed: {e}", flush=True)
     return base
@@ -351,6 +414,11 @@ def reuse_repo_transcript(media_path: Path, repo_json: Path) -> Path | None:
         txtp = new_base.with_suffix(".txt")
         ttxt = txtp.read_text(encoding="utf-8") if txtp.exists() else None
         write_episode_nfo(media_path, meta, ttxt)
+        # Save local artwork for Plex/Kodi
+        try:
+            save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
+        except Exception:
+            pass
     except Exception as e:
         print(f"[post] NFO write failed: {e}", flush=True)
 
@@ -403,6 +471,161 @@ def ensure_sidecar_next_to_media(sidecar: Path, media_path: Path, lang: str = "e
         print(f"[post] sidecar copy/convert failed: {e}", flush=True)
 
+
+# --- small helpers for progress/ETA formatting ---
+def _fmt_eta(sec: float) -> str:
+    try:
+        sec = max(0, int(sec))
+        h, rem = divmod(sec, 3600)
+        m, s = divmod(rem, 60)
+        if h:
+            return f"{h}h {m}m {s}s"
+        if m:
+            return f"{m}m {s}s"
+        return f"{s}s"
+    except Exception:
+        return ""
+
+
+def save_episode_artwork(image_url: str | None, media_path: Path, show_title: str | None = None):
+    """Download episode artwork from image_url and save it next to the media as '<stem>.jpg'.
+    Also drop a folder-level 'poster.jpg' in the show directory if not present.
+    Best-effort; failures are logged but non-fatal.
+ """ + if not image_url: + return + try: + resp = requests.get(image_url, timeout=30, stream=True) + resp.raise_for_status() + # Determine content-type and write a temporary file + ctype = (resp.headers.get("Content-Type") or "").lower() + tmp_file = media_path.with_suffix(".art.tmp") + with open(tmp_file, "wb") as out: + for chunk in resp.iter_content(chunk_size=8192): + if chunk: + out.write(chunk) + + # Always provide a .jpg next to the media for Plex + episode_jpg = media_path.with_suffix(".jpg") + if "image/jpeg" in ctype: + # Already JPEG + shutil.move(str(tmp_file), str(episode_jpg)) + else: + # Try converting to JPEG with ffmpeg; if it fails, keep bytes as-is + try: + subprocess.run( + ["ffmpeg", "-nostdin", "-y", "-i", str(tmp_file), str(episode_jpg)], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True + ) + try: + tmp_file.unlink() + except Exception: + pass + except Exception: + shutil.move(str(tmp_file), str(episode_jpg)) + + # Also drop a folder poster once per show (helps Plex folder views) + try: + show_poster = media_path.parent / "poster.jpg" + if not show_poster.exists(): + shutil.copy2(episode_jpg, show_poster) + except Exception: + pass + + except Exception as e: + print(f"[post] artwork download failed: {e}", flush=True) + + + +def find_companion_files(src: Path) -> dict: + """Return likely yt-dlp companion files for a downloaded media file.""" + out = {} + # info.json can be either "..info.json" or ".info.json" + cands_info = [ + src.parent / f"{src.name}.info.json", + src.parent / f"{src.stem}.info.json", + ] + out["info"] = next((p for p in cands_info if p.exists()), None) + + # thumbnails may be "..jpg" or ".jpg" (we convert to jpg) + cand_thumbs = [ + src.parent / f"{src.name}.jpg", + src.parent / f"{src.stem}.jpg", + src.parent / f"{src.stem}.jpeg", + src.parent / f"{src.stem}.png", + src.parent / f"{src.stem}.webp", + ] + out["thumb"] = next((p for p in cand_thumbs if p.exists()), None) + + # subtitles (keep multiple) + subs = [] + for s in src.parent.glob(f"{src.stem}*.srt"): + subs.append(s) + for s in src.parent.glob(f"{src.stem}*.vtt"): + subs.append(s) + out["subs"] = subs + return out + +def load_info_json(path: Path) -> dict | None: + try: + return json.loads(path.read_text(encoding="utf-8")) + except Exception: + return None + +def _iso_from_yyyymmdd(s: str | None) -> str | None: + if not s or not re.match(r"^\d{8}$", s): + return None + return f"{s[0:4]}-{s[4:6]}-{s[6:8]}" + +def build_meta_from_sources(media_path: Path, uploader: str, fallback_meta: dict, ep: dict | None = None) -> dict: + """ + Merge metadata from (priority): RSS episode `ep` -> yt-dlp info.json (if present) -> fallback. + Returns a dict compatible with write_episode_nfo(). 
+ """ + # Start with fallback + meta = dict(fallback_meta) + + # Augment from info.json if present + info = None + for cand in [ + media_path.parent / f"{media_path.name}.info.json", + media_path.parent / f"{media_path.stem}.info.json", + ]: + if cand.exists(): + info = load_info_json(cand) + break + if info: + meta.setdefault("title", info.get("title")) + meta.setdefault("episode_title", info.get("title")) + meta.setdefault("description", info.get("description") or info.get("fulltitle")) + # upload_date is YYYYMMDD + iso = _iso_from_yyyymmdd(info.get("upload_date")) + if iso: + meta["pubdate_iso"] = iso + # Prefer video duration if present + if not meta.get("duration_sec") and info.get("duration"): + meta["duration_sec"] = info.get("duration") + # thumbnail URL + if not meta.get("image"): + meta["image"] = info.get("thumbnail") + # show/uploader + if not meta.get("show"): + meta["show"] = info.get("uploader") or uploader + + # Finally, layer RSS data on top if available (most authoritative for podcasts) + if ep: + meta.update({ + "title": ep.get("title") or meta.get("title"), + "episode_title": ep.get("title") or meta.get("episode_title"), + "show": ep.get("podcast_title") or ep.get("feed_title") or ep.get("show") or meta.get("show") or uploader, + "description": ep.get("description") or ep.get("content") or meta.get("description", ""), + "pubdate": ep.get("pubdate") or meta.get("pubdate", ""), + "pubdate_iso": ep.get("date_iso") or meta.get("pubdate_iso", meta.get("pubdate")), + "duration_sec": ep.get("duration_sec") or ep.get("duration") or meta.get("duration_sec"), + "image": ep.get("image") or ep.get("image_url") or meta.get("image", ""), + "guid": ep.get("guid") or meta.get("guid", ""), + }) + + return meta # ---------- Kodi/Plex NFO writer ---------- from datetime import datetime @@ -505,8 +728,12 @@ def yt_dlp(url, outdir): base_cmd = [ "yt-dlp", "-o", outtmpl, "-f", "bv*+ba/best", - "-x", "--audio-format", "m4a", + "--write-info-json", "--write-thumbnail", + "--convert-thumbnails", "jpg", + "--write-subs", "--write-auto-subs", + "--sub-langs", os.getenv("YTDLP_SUBS_LANGS", "en.*,en"), + "--convert-subs", "srt", "--no-playlist", "--no-warnings", "--restrict-filenames", ] @@ -527,10 +754,13 @@ def yt_dlp(url, outdir): ] subprocess.check_call(retry_cmd) - media = (list(outdir.rglob("*.[mM][pP]4")) + - list(outdir.rglob("*.mkv")) + - list(outdir.rglob("*.m4a")) + - list(outdir.rglob("*.mp3"))) + media = ( + list(outdir.rglob("*.[mM][pP]4")) + + list(outdir.rglob("*.mkv")) + + list(outdir.rglob("*.webm")) + + list(outdir.rglob("*.m4a")) + + list(outdir.rglob("*.mp3")) + ) return sorted(media, key=lambda p: p.stat().st_mtime)[-1:] def extract_audio(src: Path, outdir: Path) -> Path: @@ -550,6 +780,27 @@ def extract_audio(src: Path, outdir: Path) -> Path: raise RuntimeError(f"ffmpeg extract failed: {e.output.decode(errors='ignore')}") return wav_path +# --- WAV trimming helper --- +def trim_wav(src_wav: Path, start_sec: float, outdir: Path) -> Path: + """Return a trimmed 16k mono WAV starting at start_sec from src_wav.""" + outdir.mkdir(parents=True, exist_ok=True) + if not start_sec or start_sec <= 0.0: + return src_wav + dst = outdir / (src_wav.stem + f".from_{int(start_sec)}s.wav") + try: + subprocess.check_output([ + "ffmpeg", "-nostdin", "-y", + "-ss", str(max(0.0, float(start_sec))), + "-i", str(src_wav), + "-vn", "-ac", "1", "-ar", "16000", + "-f", "wav", str(dst), + ], stderr=subprocess.STDOUT) + return dst + except subprocess.CalledProcessError as e: + # If trimming fails, fall 
+        print(f"[whisper] trim failed, using full WAV: {e.output.decode(errors='ignore')}", flush=True)
+        return src_wav
+
 def media_duration_seconds(path: Path) -> float:
     """Return duration in seconds using ffprobe; fallback to 0.0 on error."""
     try:
@@ -561,32 +812,91 @@
     except Exception:
         return 0.0
 
+# --- Partial transcript helpers ---
+def _partial_paths(title: str) -> tuple[Path, Path]:
+    base = TRN / title
+    return base.with_suffix(".partial.json"), base.with_suffix(".partial.txt")
+
+def _save_partial(title: str, language: str, segs: list[dict]):
+    pjson, ptxt = _partial_paths(title)
+    try:
+        # Save JSON
+        pjson.write_bytes(orjson.dumps({"file": str((TRN / title).with_suffix('.wav')), "language": language, "segments": segs}))
+    except Exception as e:
+        print(f"[whisper] partial json save failed: {e}", flush=True)
+    try:
+        # Save TXT snapshot
+        ptxt.write_text(" ".join(s.get("text","") for s in segs), encoding="utf-8")
+    except Exception as e:
+        print(f"[whisper] partial txt save failed: {e}", flush=True)
+
 def transcribe(media_path: Path):
     model = get_model()
+    print(f"[whisper] start transcribe: {media_path}", flush=True)
 
     # 1) Robustly extract audio to 16k mono WAV (fixes pyAV/webm edge cases)
     wav = extract_audio(media_path, TMP)
 
-    # 2) Language
-    lang = None if WHISPER_LANGUAGE.lower() == "auto" else WHISPER_LANGUAGE
-
-    # 3) Transcribe
-    segments, info = model.transcribe(str(wav), vad_filter=True, language=lang)
-
     title = media_path.stem
     base = TRN / title
 
-    # Determine duration for progress; use extracted WAV (accurate for transcription input)
+    # Resume support: if a partial checkpoint exists, load it and trim input
+    resume_segments = []
+    resume_offset = 0.0
+    language_hint = None
+    if WHISPER_RESUME:
+        pjson, ptxt = _partial_paths(title)
+        if pjson.exists():
+            try:
+                pdata = json.loads(pjson.read_text(encoding="utf-8"))
+                resume_segments = pdata.get("segments", []) or []
+                if resume_segments:
+                    resume_offset = float(resume_segments[-1].get("end", 0.0))
+                language_hint = pdata.get("language")
+                print(f"[whisper] resuming from ~{resume_offset:.2f}s with {len(resume_segments)} segments", flush=True)
+            except Exception as e:
+                print(f"[whisper] failed to load partial: {e}", flush=True)
+
+    # If resuming, trim WAV from last end time
+    wav_for_run = trim_wav(wav, resume_offset, TMP)
+
+    # 2) Language selection
+    lang = None if WHISPER_LANGUAGE.lower() == "auto" else WHISPER_LANGUAGE
+    if language_hint and WHISPER_LANGUAGE.lower() == "auto":
+        # carry hint forward if available
+        lang = language_hint
+
+    # 3) Transcribe
+    segments, info = run_transcribe_with_fallback(wav_for_run, lang)
+
+    # Determine duration for progress; use full WAV duration for consistent % regardless of resume
     dur = media_duration_seconds(wav) or 0.0
+    # Start wall-clock timer for speed/ETA
+    start_wall = time.time()
+    if WHISPER_RESUME and resume_offset and dur and resume_offset >= dur:
+        print(f"[whisper] resume offset {resume_offset:.2f}s >= duration {dur:.2f}s; resetting resume.", flush=True)
+        resume_offset = 0.0
     last_pct = -1
 
-    segs, text_parts = [], []
+    segs = list(resume_segments)  # start with what we already have
+    text_parts = [s.get("text","") for s in resume_segments]
+
+    # Walk new segments; shift their timestamps by resume_offset if trimmed
+    seg_count_since_save = 0
+    seg_index = len(resume_segments)
     for s in segments:
-        seg = {"start": s.start, "end": s.end, "text": s.text}
+        seg_index += 1
+        start = (s.start or 0.0) + resume_offset
+        end = (s.end or 0.0) + resume_offset
+        seg = {"start": start, "end": end, "text": s.text}
         segs.append(seg)
         text_parts.append(s.text)
+
+        if WHISPER_LOG_SEGMENTS:
+            print(f"[whisper] {start:8.2f}–{end:8.2f} {s.text.strip()}", flush=True)
+
         # progress logging every +5%
-        if dur > 0 and s.end is not None:
-            pct = int(min(100, max(0, (s.end / dur) * 100)))
+        if dur > 0 and end is not None:
+            pct = int(min(100, max(0, (end / dur) * 100)))
             if pct >= last_pct + 5:
                 log({
                     "status": "transcribing",
@@ -595,19 +905,50 @@
                     "progress": pct
                 })
                 last_pct = pct
+
+                # compute real-time speed and ETA for console logs
+                try:
+                    elapsed = max(0.001, time.time() - start_wall)
+                    processed = max(0.0, float(end))
+                    speed = (processed / elapsed) if elapsed > 0 else 0.0  # audio seconds processed per wall second
+                    # expressed as a real-time factor
+                    rtf = speed  # 1.0 == real-time
+                    eta = ((dur - processed) / speed) if (speed > 0 and dur > 0) else 0
+                    print(f"[whisper] progress {pct:3d}% seg={seg_index:5d} rtf={rtf:0.2f}x eta={_fmt_eta(eta)}", flush=True)
+                    # also mirror to feed log with speed/eta
+                    try:
+                        log({
+                            "status": "transcribing",
+                            "path": str(media_path),
+                            "title": title,
+                            "progress": pct,
+                            "speed_rtf": round(rtf, 2),
+                            "eta_sec": int(max(0, eta))
+                        })
+                    except Exception:
+                        pass
+                except Exception:
+                    pass
+
+        # periodic partial save
+        seg_count_since_save += 1
+        if WHISPER_RESUME and seg_count_since_save >= PARTIAL_SAVE_EVERY_SEGS:
+            _save_partial(title, info.language or (WHISPER_LANGUAGE if WHISPER_LANGUAGE.lower() != "auto" else "en"), segs)
+            seg_count_since_save = 0
+
     # ensure we mark 100% on completion
     if last_pct < 100:
         log({"status": "transcribing", "path": str(media_path), "title": title, "progress": 100})
 
     txt = " ".join(text_parts).strip()
-    # Write transcript artifacts
-    open(base.with_suffix(".json"), "wb").write(orjson.dumps({
+    # Write final transcript artifacts
+    (base.with_suffix(".json")).write_bytes(orjson.dumps({
         "file": str(media_path),
         "language": info.language,
         "segments": segs
     }))
-    open(base.with_suffix(".txt"), "w", encoding="utf-8").write(txt)
+    (base.with_suffix(".txt")).write_text(txt, encoding="utf-8")
 
 def fmt_ts(t):
     h=int(t//3600); m=int((t%3600)//60); s=t-(h*3600+m*60)
@@ -630,9 +971,10 @@
             shutil.copy2(srt_src, srt_dst)
     except Exception as e:
         print(f"[post] could not copy srt -> {srt_dst}: {e}", flush=True)
-    # Write Kodi/Plex-compatible NFO using basic metadata
+
+    # Write Kodi/Plex-compatible NFO using enhanced metadata (same layout as before)
     try:
-        meta = {
+        fallback = {
             "title": title,
             "episode_title": title,
             "show": media_path.parent.name,
@@ -642,18 +984,42 @@
             "image": "",
             "guid": "",
         }
+        meta = build_meta_from_sources(media_path, media_path.parent.name, fallback, ep=None)
         ttxt = (TRN / title).with_suffix(".txt").read_text(encoding="utf-8")
         write_episode_nfo(media_path, meta, ttxt)
+        try:
+            save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
+        except Exception:
+            pass
     except Exception as e:
         print(f"[post] NFO write failed: {e}", flush=True)
 
-    # Optional: cleanup temporary WAV
+    # Cleanup temp WAVs
     try:
+        if wav_for_run != wav and wav_for_run.exists():
+            wav_for_run.unlink()
         if wav.exists():
             wav.unlink()
     except Exception:
         pass
 
+    # Remove partial checkpoints on success
+    if WHISPER_RESUME:
+        try:
+            pjson, ptxt = _partial_paths(title)
+            if pjson.exists(): pjson.unlink()
+            if ptxt.exists(): ptxt.unlink()
+        except Exception:
+            pass
+
+    # Final average speed over the whole transcription
+    try:
+        total_elapsed = max(0.001, time.time() - start_wall)
+        avg_rtf = (dur / total_elapsed) if total_elapsed > 0 else 0.0
+        print(f"[whisper] avg speed ~{avg_rtf:0.2f}x (audio_seconds / wall_seconds)", flush=True)
+    except Exception:
+        pass
+
+    print(f"[whisper] finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s", flush=True)
     return base
 
 def index_meili(json_path: Path):
@@ -829,7 +1195,8 @@ def handle_local_file(path_str: str):
             index_meili(base.with_suffix(".json"))
             publish_to_openwebui([base.with_suffix(".txt")])
             try:
-                meta = {
+                # Use info.json (if present) to enrich metadata
+                fallback = {
                     "title": title,
                     "episode_title": title,
                     "show": p.parent.name,
@@ -839,8 +1206,14 @@
                     "image": "",
                     "guid": "",
                 }
+                meta = build_meta_from_sources(p, p.parent.name, fallback, ep=None)
                 ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
                 write_episode_nfo(p, meta, ttxt)
+                # Try to fetch and save artwork locally
+                try:
+                    save_episode_artwork(meta.get("image"), p, meta.get("show"))
+                except Exception:
+                    pass
             except Exception as e:
                 print(f"[post] NFO write failed: {e}", flush=True)
             log({**info, **{"status": "done", "note": "used_existing_transcript"}})
@@ -855,7 +1228,8 @@ def handle_local_file(path_str: str):
             publish_to_openwebui([base.with_suffix(".txt")])
             try:
                 data = json.loads((base.with_suffix(".json")).read_text(encoding="utf-8"))
-                meta = {
+                # Start with repo metadata, then enrich from yt-dlp info.json if any
+                meta_repo = {
                     "title": data.get("title") or title,
                     "episode_title": data.get("title") or title,
                     "show": data.get("show") or p.parent.name,
@@ -865,8 +1239,13 @@
                     "image": data.get("image"),
                     "guid": data.get("guid") or data.get("id"),
                 }
+                meta = build_meta_from_sources(p, p.parent.name, meta_repo, ep=None)
                 ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
                 write_episode_nfo(p, meta, ttxt)
+                try:
+                    save_episode_artwork(meta.get("image"), p, meta.get("show"))
+                except Exception:
+                    pass
             except Exception as e:
                 print(f"[post] NFO write failed: {e}", flush=True)
             log({**info, **{"status": "done", "note": "reused_repo_transcript"}})
@@ -876,6 +1255,26 @@
         base = transcribe(p)
         index_meili(base.with_suffix(".json"))
         publish_to_openwebui([base.with_suffix(".txt")])
+        try:
+            fallback = {
+                "title": title,
+                "episode_title": title,
+                "show": p.parent.name,
+                "description": "",
+                "pubdate": _extract_date_from_stem(title),
+                "duration_sec": media_duration_seconds(p),
+                "image": "",
+                "guid": "",
+            }
+            meta = build_meta_from_sources(p, p.parent.name, fallback, ep=None)
+            ttxt = (TRN / title).with_suffix(".txt").read_text(encoding="utf-8")
+            write_episode_nfo(p, meta, ttxt)
+            try:
+                save_episode_artwork(meta.get("image"), p, meta.get("show"))
+            except Exception:
+                pass
+        except Exception as e:
+            print(f"[post] NFO write failed: {e}", flush=True)
         log({**info, **{"status": "done"}})
     except Exception as e:
         log({"url": path_str, "status": "error", "error": str(e)})
@@ -913,6 +1312,45 @@ def handle_url(url: str):
         dest_dir.mkdir(parents=True, exist_ok=True)
         dest = dest_dir / sanitize(f.name)
         shutil.move(str(f), dest)
+        # Move companion files produced by yt-dlp (info.json, thumbnail, subtitles)
+        try:
+            companions = find_companion_files(f)
+            # info.json -> prefer "<name>.info.json", fall back to "<stem>.info.json"
+            if companions.get("info") and companions["info"].exists():
+                dest_info = dest.parent / f"{dest.name}.info.json"
+                try:
shutil.move(str(companions["info"]), dest_info) + except Exception: + # fallback naming without extension + dest_info2 = dest.parent / f"{dest.stem}.info.json" + try: + shutil.move(str(companions['info']), dest_info2) + except Exception: + pass + # thumbnail -> ".jpg" + if companions.get("thumb") and companions["thumb"].exists(): + try: + shutil.move(str(companions["thumb"]), str(dest.with_suffix(".jpg"))) + except Exception: + pass + # subtitles -> preserve language suffix: "" + for s in companions.get("subs", []): + if not s.exists(): + continue + suffix_tail = "" + s_name = s.name + f_stem = f.stem + if s_name.startswith(f_stem): + suffix_tail = s_name[len(f_stem):] # includes leading dot if present + else: + suffix_tail = s.suffix + dest_sub = dest.parent / f"{dest.stem}{suffix_tail}" + try: + shutil.move(str(s), str(dest_sub)) + except Exception: + pass + except Exception: + pass info.update({"title": dest.stem, "uploader": uploader, "date": (re.findall(r"\b(\d{8})\b", dest.stem)[0] if re.findall(r"\b(\d{8})\b", dest.stem) else ""), "path": str(dest)}) @@ -937,34 +1375,28 @@ def handle_url(url: str): index_meili(base.with_suffix(".json")) publish_to_openwebui([base.with_suffix(".txt")]) try: - if 'ep' in locals() and ep: - meta = { - "title": ep.get("title"), - "episode_title": ep.get("title"), - "show": ep.get("podcast_title") or ep.get("feed_title") or ep.get("show") or uploader, - "description": ep.get("description") or ep.get("content"), - "pubdate": ep.get("pubdate"), - "pubdate_iso": ep.get("date_iso"), - "duration_sec": ep.get("duration_sec") or ep.get("duration") or media_duration_seconds(dest), - "image": ep.get("image") or ep.get("image_url"), - "guid": ep.get("guid"), - } - else: - meta = { - "title": dest.stem, - "episode_title": dest.stem, - "show": uploader, - "description": "", - "pubdate": _extract_date_from_stem(dest.stem), - "duration_sec": media_duration_seconds(dest), - "image": "", - "guid": "", - } + # Build metadata from RSS (if matched), yt-dlp info.json, and sensible fallbacks + fallback = { + "title": dest.stem, + "episode_title": dest.stem, + "show": uploader, + "description": "", + "pubdate": _extract_date_from_stem(dest.stem), + "duration_sec": media_duration_seconds(dest), + "image": "", + "guid": "", + } + meta = build_meta_from_sources(dest, uploader, fallback, ep if 'ep' in locals() else None) ttxt = base.with_suffix(".txt").read_text(encoding="utf-8") write_episode_nfo(dest, meta, ttxt) + # Save local artwork for Plex/Kodi from meta image url + try: + save_episode_artwork(meta.get("image"), dest, meta.get("show")) + except Exception: + pass except Exception as e: print(f"[post] NFO write failed: {e}", flush=True) log({**info, **{"status":"done"}}) except Exception as e: log({"url": url, "status":"error", "error": str(e)}) - raise + raise \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index d017fee..6ba60ae 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -40,6 +40,42 @@ services: TMP_ROOT: /tmpdl WHISPER_MODEL: large-v3 WHISPER_PRECISION: int8 + WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1} + WHISPER_RESUME: ${WHISPER_RESUME:-1} + WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20} + PYTHONPATH: /app + JOB_TIMEOUT: ${JOB_TIMEOUT:-14400} + JOB_TTL: ${JOB_TTL:-86400} + RESULT_TTL: ${RESULT_TTL:-86400} + FAILURE_TTL: ${FAILURE_TTL:-86400} + volumes: + - ${LIBRARY_HOST_DIR:-./library}:/library + - ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts + - ${TMP_HOST_DIR:-./tmp}:/tmpdl + - 
+      - ${MODELS_HOST_DIR:-./models}:/root/.cache/huggingface
+    depends_on: [meili, redis]
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD-SHELL", "exit 0"]
+    extra_hosts:
+      - host.docker.internal:host-gateway
+
+  podx-worker-transcribe:
+    build: ./app
+    container_name: podx-worker-transcribe
+    command: ["rq", "worker", "-u", "redis://redis:6379/0", "transcribe"]
+    env_file: [.env]
+    environment:
+      MEILI_URL: http://meili:7700
+      REDIS_URL: redis://redis:6379/0
+      LIBRARY_ROOT: /library
+      TRANSCRIPT_ROOT: /transcripts
+      TMP_ROOT: /tmpdl
+      WHISPER_MODEL: large-v3
+      WHISPER_PRECISION: int8
+      WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
+      WHISPER_RESUME: ${WHISPER_RESUME:-1}
+      WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
       PYTHONPATH: /app
       JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
       JOB_TTL: ${JOB_TTL:-86400}
@@ -90,7 +126,7 @@ services:
       # - COOKIE_FILE=/config/cookies.txt
       # Optional: yt-dlp options (JSON). Example enables Android client fallback
       # - YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}}}
-      - YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}},"extract_flat":"in_playlist","concurrent_fragment_downloads":1}
+      - YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}},"extract_flat":"in_playlist","concurrent_fragment_downloads":1,"writesubtitles":true,"writeautomaticsub":true,"subtitleslangs":["en.*"],"convertsubs":"srt","writeinfojson":true,"writethumbnail":true,"convertthumbnails":"jpg"}
     volumes:
       - ${LIBRARY_HOST_DIR:-./library}:/downloads
       # Optional cookies file on host → /config/cookies.txt inside container