import os, subprocess, shutil, json, re, orjson, requests
from pathlib import Path
import math
import difflib
import time
from faster_whisper import WhisperModel
from xml.sax.saxutils import escape as xml_escape

MEILI_URL = os.getenv("MEILI_URL", "http://meili:7700")
MEILI_KEY = os.getenv("MEILI_KEY", "")
LIB = Path(os.getenv("LIBRARY_ROOT", "/library"))
TRN = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
TMP = Path(os.getenv("TMP_ROOT", "/tmpdl"))
MODEL_NAME = os.getenv("WHISPER_MODEL", "large-v3")
COMPUTE = os.getenv("WHISPER_PRECISION", "int8")
WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "auto").strip()

# Whisper device/config controls
WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "auto").strip()
WHISPER_DEVICE_INDEX = int(os.getenv("WHISPER_DEVICE_INDEX", "0"))
WHISPER_CPU_THREADS = int(os.getenv("WHISPER_CPU_THREADS", "4"))

# Whisper logging & resume controls
WHISPER_LOG_SEGMENTS = os.getenv("WHISPER_LOG_SEGMENTS", "1") not in ("0", "false", "False")
WHISPER_RESUME = os.getenv("WHISPER_RESUME", "1") not in ("0", "false", "False")
PARTIAL_SAVE_EVERY_SEGS = int(os.getenv("WHISPER_PARTIAL_SAVE_EVERY_SEGS", "20"))

# RSS resolver config
RSS_INDEX_PATH = Path(os.getenv("RSS_INDEX_PATH", "/transcripts/rss_index.json"))
RSS_DURATION_TOLERANCE = int(os.getenv("RSS_DURATION_TOLERANCE", "150"))  # seconds
DEFAULT_TRANSCRIPT_LANG = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"

OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/")
OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "")
OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library")

TRN.mkdir(parents=True, exist_ok=True)
LIB.mkdir(parents=True, exist_ok=True)
TMP.mkdir(parents=True, exist_ok=True)

# Lazy Whisper model loader so the worker can start even if model download/setup is slow
_model = None

def get_model():
    global _model
    if _model is None:
        print(f"[whisper] loading model='{MODEL_NAME}' device='{WHISPER_DEVICE}' idx={WHISPER_DEVICE_INDEX} compute='{COMPUTE}' threads={WHISPER_CPU_THREADS}", flush=True)
        _model = WhisperModel(
            MODEL_NAME,
            device=WHISPER_DEVICE,
            device_index=WHISPER_DEVICE_INDEX,
            compute_type=COMPUTE,
            cpu_threads=WHISPER_CPU_THREADS,
        )
    return _model

# --- Helper: Reset model with new device and device_index ---
def reset_model(device: str, device_index: int | None = None):
    """Reset the global _model to a new WhisperModel with the given device and device_index."""
    global _model
    idx = device_index if device_index is not None else WHISPER_DEVICE_INDEX
    print(f"[whisper] resetting model='{MODEL_NAME}' device='{device}' idx={idx} compute='{COMPUTE}' threads={WHISPER_CPU_THREADS}", flush=True)
    _model = WhisperModel(
        MODEL_NAME,
        device=device,
        device_index=idx,
        compute_type=COMPUTE,
        cpu_threads=WHISPER_CPU_THREADS,
    )

# --- Helper: Run transcribe with fallback to CPU on GPU/oom errors ---
def run_transcribe_with_fallback(wav_path: Path, lang):
    """
    Try to transcribe with the current model; on GPU/CUDA/HIP/ROCm/OOM errors,
    reset to CPU and retry once. Returns (segments, info) or raises the exception.
    """
    model = get_model()
    try:
        return model.transcribe(str(wav_path), vad_filter=True, language=lang)
    except Exception as e:
        msg = str(e)
        gpu_errs = [
            "CUDA", "cublas", "out of memory", "HIP", "ROCm", "device-side assert",
            "CUDNN", "cudaError", "cuda runtime", "cudaMalloc"
        ]
        if any(err.lower() in msg.lower() for err in gpu_errs):
            print(f"[whisper] GPU error detected: '{msg}'. Retrying on CPU...", flush=True)
            reset_model("cpu", 0)
            try:
                model = get_model()
                return model.transcribe(str(wav_path), vad_filter=True, language=lang)
            except Exception as e2:
                print(f"[whisper] CPU fallback also failed: {e2}", flush=True)
                raise
        raise

def log(feed):
    try:
        with open(TRN / "_feed.log", "a", encoding="utf-8") as f:
            f.write(orjson.dumps(feed).decode() + "\n")
    except Exception:
        pass

def sanitize(name):
    return re.sub(r'[\\/:"*?<>|]+', ' ', name).strip()

# ---------- RSS transcript resolver ----------

def _normalize_title(t: str) -> str:
    t = (t or "").lower()
    t = re.sub(r"\s+", " ", t)
    # remove punctuation-ish
    t = re.sub(r"[^a-z0-9 _-]+", "", t)
    return t.strip()

def _stem_without_date(stem: str) -> str:
    # drop leading "YYYYMMDD - " from filenames created by the yt-dlp template
    m = re.match(r"^\d{8}\s*-\s*(.*)$", stem)
    return m.group(1) if m else stem

def _extract_date_from_stem(stem: str) -> str | None:
    m = re.search(r"\b(\d{8})\b", stem)
    return m.group(1) if m else None

def _best_title_match(title: str, candidates: list[str]) -> tuple[str, float]:
    """Return (best_title, score 0..1) using difflib SequenceMatcher."""
    if not candidates:
        return "", 0.0
    norm_title = _normalize_title(title)
    best = ("", 0.0)
    for c in candidates:
        score = difflib.SequenceMatcher(None, norm_title, _normalize_title(c)).ratio()
        if score > best[1]:
            best = (c, score)
    return best

def _load_rss_index() -> list[dict]:
    try:
        if RSS_INDEX_PATH.exists():
            data = json.loads(RSS_INDEX_PATH.read_text(encoding="utf-8"))
            # supports {"episodes":[...]} or a flat list
            if isinstance(data, dict) and "episodes" in data:
                return data["episodes"] or []
            if isinstance(data, list):
                return data
    except Exception as e:
        print(f"[resolver] failed to load RSS index: {e}", flush=True)
    return []

def match_media_to_rss(media_path: Path) -> dict | None:
    """Try to match a local media file to an RSS episode entry."""
    episodes = _load_rss_index()
    if not episodes:
        return None
    stem = media_path.stem
    title_no_date = _stem_without_date(stem)
    file_date = _extract_date_from_stem(stem)
    # duration tolerance
    media_dur = media_duration_seconds(media_path)
    # Candidates: filter by date if present, else all
    if file_date:
        pool = [e for e in episodes if (str(e.get("date", "")) == file_date or str(e.get("pubdate", "")) == file_date)]
        if not pool:
            pool = episodes
    else:
        pool = episodes
    # Pick best by (title similarity, duration proximity)
    best_ep, best_score = None, -1.0
    for ep in pool:
        ep_title = ep.get("title") or ep.get("itunes_title") or ""
        sim = _best_title_match(title_no_date, [ep_title])[1]
        dur = float(ep.get("duration_sec") or ep.get("duration") or 0.0)
        dur_ok = True
        if media_dur and dur:
            dur_ok = abs(media_dur - dur) <= RSS_DURATION_TOLERANCE
        score = sim + (0.1 if dur_ok else 0.0)
        if score > best_score:
            best_score, best_ep = score, ep
    if best_ep and best_score >= 0.5:
        print(f"[resolver] matched '{stem}' -> '{best_ep.get('title','')}' score={best_score:.2f}", flush=True)
        return best_ep
    return None
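
# Illustrative sketch of the episode entries the resolver expects in rss_index.json.
# The index is produced by rss_ingest.py (not part of this file); the field names below
# are inferred from the keys read in this module, so treat this as an assumption rather
# than an authoritative schema.
_EXAMPLE_RSS_EPISODE = {
    "title": "Episode 42 - Example",
    "date": "20230221",            # or "pubdate" (RFC 2822 string)
    "duration_sec": 3605,          # or "duration"
    "guid": "urn:example:ep42",
    "feed_url": "https://example.com/feed.xml",
    "language": "en",
    "image": "https://example.com/ep42.jpg",
    "transcripts": [{"url": "https://example.com/ep42.vtt", "type": "vtt"}],
    # optional: "transcript_txt"/"transcript_vtt"/"transcript_srt", "transcript_local"
}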
def _choose_transcript_url(ep: dict) -> tuple[str, str] | tuple[None, None]:
    """Return (url, kind) preferring txt, vtt, then srt. 'kind' in {'txt','vtt','srt'}."""
    # unified structure from rss_ingest.py: ep["transcripts"] = [{"url":..., "type": ...}, ...]
    items = ep.get("transcripts") or []
    # some ingesters store separate keys
    if not items:
        for key, kind in [("transcript_txt", "txt"), ("transcript_vtt", "vtt"), ("transcript_srt", "srt")]:
            if ep.get(key):
                items.append({"url": ep[key], "type": kind})
    # preference order
    for kind in ["txt", "vtt", "srt"]:
        for it in items:
            t = (it.get("type") or "").lower()
            u = it.get("url") or ""
            if u and (kind in t or (kind == "txt" and t in ["text", "plain", "text/plain"]) or (kind in u.lower())):
                return u, kind
    return (None, None)

def fetch_rss_transcript(ep: dict, dest_dir: Path) -> Path | None:
    """Download transcript to dest_dir and return the local Path; convert VTT->SRT if needed."""
    url, kind = _choose_transcript_url(ep)
    if not url:
        return None
    dest_dir.mkdir(parents=True, exist_ok=True)
    # filename from episode title
    safe = sanitize(ep.get("title") or ep.get("guid") or "episode")
    path = dest_dir / f"{safe}.{kind if kind != 'txt' else 'txt'}"
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        mode = "wb" if kind in ("vtt", "srt") else "w"
        if mode == "wb":
            path.write_bytes(r.content)
        else:
            path.write_text(r.text, encoding="utf-8")
        print(f"[resolver] downloaded transcript ({kind}) from {url}", flush=True)
        return path
    except Exception as e:
        print(f"[resolver] failed to fetch transcript: {e}", flush=True)
        return None

def use_rss_transcript(media_path: Path, ep: dict) -> Path | None:
    """Create standard transcript artifacts from an RSS transcript (txt/vtt/srt)."""
    # Prefer direct download; else if rss_ingest already saved a local file path, try that.
    sidecar = None
    local_hint = ep.get("transcript_local")
    if local_hint:
        p = Path(local_hint)
        if p.exists():
            sidecar = p
    if sidecar is None:
        sidecar = fetch_rss_transcript(ep, TMP)
    if not sidecar or not sidecar.exists():
        return None
    # Convert to plain text
    plain = transcript_text_from_file(sidecar)
    lang = (ep.get("language") or ep.get("lang") or DEFAULT_TRANSCRIPT_LANG).split("-")[0]
    base = write_plain_transcript(media_path, plain, language=lang)
    # Place an SRT next to video for Plex
    ensure_sidecar_next_to_media(sidecar, media_path, lang=lang)
    # Write provenance sidecar
    (base.with_suffix(".prov.json")).write_bytes(orjson.dumps({
        "source": "rss",
        "feed": ep.get("feed_url"),
        "guid": ep.get("guid"),
        "episode_title": ep.get("title"),
        "transcript_kind": sidecar.suffix.lower().lstrip("."),
        "transcript_url": _choose_transcript_url(ep)[0] or "",
    }))
    # Write Kodi/Plex-compatible NFO
    try:
        # Gather metadata for NFO from RSS entry
        meta = {
            "title": ep.get("title"),
            "episode_title": ep.get("title"),
            "show": ep.get("podcast_title") or ep.get("feed_title") or ep.get("show"),
            "description": ep.get("description") or ep.get("content"),
            "pubdate": ep.get("pubdate"),
            "pubdate_iso": ep.get("date_iso"),
            "duration_sec": ep.get("duration_sec") or ep.get("duration"),
            "image": ep.get("image") or ep.get("image_url"),
            "guid": ep.get("guid"),
        }
        txt_path = base.with_suffix(".txt")
        transcript_text = txt_path.read_text(encoding="utf-8") if txt_path.exists() else None
        write_episode_nfo(media_path, meta, transcript_text)
        # Save local artwork for Plex/Kodi
        try:
            save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
        except Exception:
            pass
    except Exception as e:
        print(f"[post] NFO write failed: {e}", flush=True)
    return base

def find_sidecar_transcript(media_path: Path) -> Path | None:
    """Return a .txt/.srt/.vtt transcript file sitting next to the media, if any.
    Tries common variants including language-suffixed SRT/VTT.
""" candidates: list[Path] = [] # exact same stem in same folder for ext in [".txt", ".srt", ".vtt"]: p = media_path.parent / (media_path.stem + ext) if p.exists(): candidates.append(p) # language-suffixed near the media file (e.g., .en.srt) for ext in [".srt", ".vtt"]: p = media_path.with_suffix(f".en{ext}") if p.exists() and p not in candidates: candidates.append(p) return candidates[0] if candidates else None # ---------- Transcript repository reuse helpers ---------- def find_repo_transcript_for_media(media_path: Path) -> Path | None: """Search the transcript repository (/transcripts) for an existing transcript that likely belongs to this media file (match by YYYYMMDD in filename and/or fuzzy title similarity). Returns a path to a matching .json if found.""" try: stem = media_path.stem title_no_date = _stem_without_date(stem) file_date = _extract_date_from_stem(stem) best_json, best_score = None, 0.0 for j in TRN.glob("*.json"): try: data = json.loads(j.read_text(encoding="utf-8")) except Exception: continue other_file = Path(data.get("file", "")) other_stem = other_file.stem if other_file else j.stem other_date = _extract_date_from_stem(other_stem) # If both have dates and they differ a lot, skip if file_date and other_date and file_date != other_date: continue # Compare titles (without dates) sim = difflib.SequenceMatcher( None, _normalize_title(title_no_date), _normalize_title(_stem_without_date(other_stem)), ).ratio() # Nudge score when dates match if file_date and other_date and file_date == other_date: sim += 0.1 if sim > best_score: best_score, best_json = sim, j # Require a reasonable similarity return best_json if best_json and best_score >= 0.60 else None except Exception: return None def reuse_repo_transcript(media_path: Path, repo_json: Path) -> Path | None: """Copy/retarget an existing transcript JSON/TXT (and make SRT/VTT if possible) from the repository so that it belongs to the provided media_path. 
    Returns the new base path in /transcripts or None."""
    try:
        # load the source transcript
        data = json.loads(repo_json.read_text(encoding="utf-8"))
        src_base = TRN / Path(repo_json).stem
        src_txt = src_base.with_suffix(".txt")
        src_srt = src_base.with_suffix(".srt")
        src_vtt = src_base.with_suffix(".vtt")
        # write the retargeted artifacts
        new_title = media_path.stem
        new_base = TRN / new_title
        new_base.parent.mkdir(parents=True, exist_ok=True)
        # update file path
        data["file"] = str(media_path)
        (new_base.with_suffix(".json")).write_bytes(orjson.dumps(data))
        # copy or synthesize TXT
        if src_txt.exists():
            shutil.copy2(src_txt, new_base.with_suffix(".txt"))
        else:
            # fallback: concatenate segments
            txt = " ".join(s.get("text", "") for s in data.get("segments", []))
            (new_base.with_suffix(".txt")).write_text(txt, encoding="utf-8")
        # copy SRT/VTT if present; otherwise synthesize from segments
        if src_srt.exists():
            shutil.copy2(src_srt, new_base.with_suffix(".srt"))
        else:
            # synthesize SRT
            def fmt_ts(t):
                h = int(t // 3600); m = int((t % 3600) // 60); s = t - (h * 3600 + m * 60)
                return f"{h:02}:{m:02}:{s:06.3f}".replace('.', ',')
            with open(new_base.with_suffix(".srt"), "w", encoding="utf-8") as srt:
                for i, s in enumerate(data.get("segments", []), 1):
                    srt.write(f"{i}\n{fmt_ts(s.get('start',0.0))} --> {fmt_ts(s.get('end',0.0))}\n{s.get('text','').strip()}\n\n")
        if src_vtt.exists():
            shutil.copy2(src_vtt, new_base.with_suffix(".vtt"))
        else:
            # synthesize VTT from segments
            def fmt_ts_vtt(t):
                h = int(t // 3600); m = int((t % 3600) // 60); s = t - (h * 3600 + m * 60)
                return f"{h:02}:{m:02}:{s:06.3f}"
            with open(new_base.with_suffix(".vtt"), "w", encoding="utf-8") as vtt:
                vtt.write("WEBVTT\n\n")
                for s in data.get("segments", []):
                    vtt.write(f"{fmt_ts_vtt(s.get('start',0.0))} --> {fmt_ts_vtt(s.get('end',0.0))} \n{s.get('text','').strip()}\n\n")
        # ensure sidecar next to media
        try:
            lang = (data.get("language") or DEFAULT_TRANSCRIPT_LANG).split("-")[0]
            ensure_sidecar_next_to_media(new_base.with_suffix(".srt"), media_path, lang=lang)
        except Exception:
            pass
        # Write Kodi/Plex-compatible NFO
        try:
            meta = {
                "title": data.get("title") or media_path.stem,
                "episode_title": data.get("title") or media_path.stem,
                "show": data.get("show") or media_path.parent.name,
                "description": data.get("description") or "",
                "pubdate": data.get("pubdate") or data.get("date"),
                "duration_sec": media_duration_seconds(media_path),
                "image": data.get("image"),
                "guid": data.get("guid") or data.get("id"),
            }
            txtp = new_base.with_suffix(".txt")
            ttxt = txtp.read_text(encoding="utf-8") if txtp.exists() else None
            write_episode_nfo(media_path, meta, ttxt)
            # Save local artwork for Plex/Kodi
            try:
                save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
            except Exception:
                pass
        except Exception as e:
            print(f"[post] NFO write failed: {e}", flush=True)
        return new_base
    except Exception as e:
        print(f"[resolver] failed to reuse repo transcript: {e}", flush=True)
        return None

def transcript_text_from_file(path: Path) -> str:
    """Extract plain text from .txt/.srt/.vtt by stripping timestamps and counters."""
    try:
        raw = path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        raw = path.read_text(errors="ignore")
    if path.suffix.lower() == ".txt":
        return raw.strip()
    # For SRT/VTT, drop timestamp lines, cue numbers and headers
    lines: list[str] = []
    for line in raw.splitlines():
        ls = line.strip()
        if not ls:
            continue
        if "-->" in ls:  # timestamp line
            continue
        if ls.upper().startswith("WEBVTT"):
            continue
        if re.match(r"^\d+$", ls):  # cue index
            continue
        lines.append(ls)
    return " ".join(lines)
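
# Illustrative example (not from a real file): given an SRT cue such as
#   1
#   00:00:01,000 --> 00:00:03,500
#   Hello and welcome back.
# transcript_text_from_file() returns just "Hello and welcome back.", because cue
# indices, timestamp lines and the WEBVTT header are all dropped.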
def ensure_sidecar_next_to_media(sidecar: Path, media_path: Path, lang: str = "en") -> None:
    """Ensure a language-suffixed `.{lang}.srt` sits next to the media for Plex. Convert VTT→SRT if needed.
    If the sidecar is .txt, do nothing."""
    try:
        if sidecar.suffix.lower() == ".txt":
            return
        if sidecar.suffix.lower() == ".srt":
            dst = media_path.with_suffix(f".{lang}.srt")
            shutil.copy2(sidecar, dst)
        elif sidecar.suffix.lower() == ".vtt":
            tmp_srt = sidecar.with_suffix(".srt")
            subprocess.run(["ffmpeg", "-nostdin", "-y", "-i", str(sidecar), str(tmp_srt)], check=True)
            dst = media_path.with_suffix(f".{lang}.srt")
            shutil.move(str(tmp_srt), dst)
    except Exception as e:
        print(f"[post] sidecar copy/convert failed: {e}", flush=True)

# --- small helpers for progress/ETA formatting ---
def _fmt_eta(sec: float) -> str:
    try:
        sec = max(0, int(sec))
        h, rem = divmod(sec, 3600)
        m, s = divmod(rem, 60)
        if h:
            return f"{h}h {m}m {s}s"
        if m:
            return f"{m}m {s}s"
        return f"{s}s"
    except Exception:
        return ""

def save_episode_artwork(image_url: str | None, media_path: Path, show_title: str | None = None):
    """Download episode artwork from image_url and save it next to the media as '<stem>.jpg'.
    Also drop a folder-level 'poster.jpg' for the show directory if not present.
    Best-effort; failures are logged but non-fatal.
    """
    if not image_url:
        return
    try:
        resp = requests.get(image_url, timeout=30, stream=True)
        resp.raise_for_status()
        # Determine content-type and write a temporary file
        ctype = (resp.headers.get("Content-Type") or "").lower()
        tmp_file = media_path.with_suffix(".art.tmp")
        with open(tmp_file, "wb") as out:
            for chunk in resp.iter_content(chunk_size=8192):
                if chunk:
                    out.write(chunk)
        # Always provide a .jpg next to the media for Plex
        episode_jpg = media_path.with_suffix(".jpg")
        if "image/jpeg" in ctype:
            # Already JPEG
            shutil.move(str(tmp_file), str(episode_jpg))
        else:
            # Try converting to JPEG with ffmpeg; if it fails, keep bytes as-is
            try:
                subprocess.run(
                    ["ffmpeg", "-nostdin", "-y", "-i", str(tmp_file), str(episode_jpg)],
                    stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True
                )
                try:
                    tmp_file.unlink()
                except Exception:
                    pass
            except Exception:
                shutil.move(str(tmp_file), str(episode_jpg))
        # Also drop a folder poster once per show (helps Plex folder views)
        try:
            show_poster = media_path.parent / "poster.jpg"
            if not show_poster.exists():
                shutil.copy2(episode_jpg, show_poster)
        except Exception:
            pass
    except Exception as e:
        print(f"[post] artwork download failed: {e}", flush=True)

def find_companion_files(src: Path) -> dict:
    """Return likely yt-dlp companion files for a downloaded media file."""
    out = {}
    # info.json can be either "<name>.<ext>.info.json" or "<name>.info.json"
    cands_info = [
        src.parent / f"{src.name}.info.json",
        src.parent / f"{src.stem}.info.json",
    ]
    out["info"] = next((p for p in cands_info if p.exists()), None)
    # thumbnails may be "<name>.<ext>.jpg" or "<name>.jpg" (we convert to jpg)
    cand_thumbs = [
        src.parent / f"{src.name}.jpg",
        src.parent / f"{src.stem}.jpg",
        src.parent / f"{src.stem}.jpeg",
        src.parent / f"{src.stem}.png",
        src.parent / f"{src.stem}.webp",
    ]
    out["thumb"] = next((p for p in cand_thumbs if p.exists()), None)
    # subtitles (keep multiple)
    subs = []
    for s in src.parent.glob(f"{src.stem}*.srt"):
        subs.append(s)
    for s in src.parent.glob(f"{src.stem}*.vtt"):
        subs.append(s)
    out["subs"] = subs
    return out

def load_info_json(path: Path) -> dict | None:
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        return None

def _iso_from_yyyymmdd(s: str | None) -> str | None:
    if not s or not re.match(r"^\d{8}$", s):
        return None
    return f"{s[0:4]}-{s[4:6]}-{s[6:8]}"
f"{s[0:4]}-{s[4:6]}-{s[6:8]}" def build_meta_from_sources(media_path: Path, uploader: str, fallback_meta: dict, ep: dict | None = None) -> dict: """ Merge metadata from (priority): RSS episode `ep` -> yt-dlp info.json (if present) -> fallback. Returns a dict compatible with write_episode_nfo(). """ # Start with fallback meta = dict(fallback_meta) # Augment from info.json if present info = None for cand in [ media_path.parent / f"{media_path.name}.info.json", media_path.parent / f"{media_path.stem}.info.json", ]: if cand.exists(): info = load_info_json(cand) break if info: meta.setdefault("title", info.get("title")) meta.setdefault("episode_title", info.get("title")) meta.setdefault("description", info.get("description") or info.get("fulltitle")) # upload_date is YYYYMMDD iso = _iso_from_yyyymmdd(info.get("upload_date")) if iso: meta["pubdate_iso"] = iso # Prefer video duration if present if not meta.get("duration_sec") and info.get("duration"): meta["duration_sec"] = info.get("duration") # thumbnail URL if not meta.get("image"): meta["image"] = info.get("thumbnail") # show/uploader if not meta.get("show"): meta["show"] = info.get("uploader") or uploader # Finally, layer RSS data on top if available (most authoritative for podcasts) if ep: meta.update({ "title": ep.get("title") or meta.get("title"), "episode_title": ep.get("title") or meta.get("episode_title"), "show": ep.get("podcast_title") or ep.get("feed_title") or ep.get("show") or meta.get("show") or uploader, "description": ep.get("description") or ep.get("content") or meta.get("description", ""), "pubdate": ep.get("pubdate") or meta.get("pubdate", ""), "pubdate_iso": ep.get("date_iso") or meta.get("pubdate_iso", meta.get("pubdate")), "duration_sec": ep.get("duration_sec") or ep.get("duration") or meta.get("duration_sec"), "image": ep.get("image") or ep.get("image_url") or meta.get("image", ""), "guid": ep.get("guid") or meta.get("guid", ""), }) return meta # ---------- Kodi/Plex NFO writer ---------- from datetime import datetime def _first_nonempty(*vals): for v in vals: if v is None: continue if isinstance(v, str) and v.strip(): return v.strip() if v: return v return None def _coerce_aired(pubdate: str | None) -> str: """Convert RSS-style pubdate to YYYY-MM-DD if possible.""" if not pubdate: return "" # already ISO-like m = re.match(r"^(\d{4})[-/](\d{2})[-/](\d{2})", pubdate) if m: return f"{m.group(1)}-{m.group(2)}-{m.group(3)}" # RFC 2822 example: Tue, 21 Feb 2023 06:00:00 +0000 try: dt = datetime.strptime(pubdate[:31], "%a, %d %b %Y %H:%M:%S %z") return dt.strftime("%Y-%m-%d") except Exception: # try without tz try: dt = datetime.strptime(pubdate[:25], "%a, %d %b %Y %H:%M:%S") return dt.strftime("%Y-%m-%d") except Exception: return "" def write_episode_nfo(media_path: Path, meta: dict, transcript_text: str | None = None) -> Path: """Write a minimal Kodi/Plex-compatible NFO next to the media file. `meta` may include: title, show, plot, pubdate, duration_sec, thumb, guid. 
""" try: title = _first_nonempty(meta.get("episode_title"), meta.get("title"), media_path.stem) or media_path.stem show = _first_nonempty(meta.get("show"), meta.get("podcast_title"), meta.get("feed_title"), media_path.parent.name) or media_path.parent.name plot = _first_nonempty(meta.get("description"), meta.get("content"), meta.get("summary"), "") or "" # Optionally append transcript preview to plot if transcript_text: preview = transcript_text.strip() if preview: preview = (preview[:1800] + "…") if len(preview) > 1800 else preview plot = (plot + "\n\n" if plot else "") + preview aired = _coerce_aired(_first_nonempty(meta.get("pubdate_iso"), meta.get("pubdate"))) guid = _first_nonempty(meta.get("guid"), meta.get("id"), "") or "" thumb = _first_nonempty(meta.get("image"), meta.get("image_url"), meta.get("thumbnail"), "") or "" dur_s = meta.get("duration_sec") or meta.get("duration") or 0 try: dur_min = int(round(float(dur_s) / 60.0)) if dur_s else 0 except Exception: dur_min = 0 # Build XML xml = [""] xml.append(f" {xml_escape(title)}") xml.append(f" {xml_escape(show)}") if plot: xml.append(f" {xml_escape(plot)}") if aired: xml.append(f" {xml_escape(aired)}") if guid: xml.append(f" {xml_escape(guid)}") if dur_min: xml.append(f" {dur_min}") if thumb: xml.append(f" {xml_escape(thumb)}") xml.append("\n") nfo_path = media_path.with_suffix(".nfo") nfo_path.write_text("\n".join(xml), encoding="utf-8") return nfo_path except Exception: return media_path.with_suffix(".nfo") def write_plain_transcript(media_path: Path, text: str, language: str = "en") -> Path: """Write minimal transcript artifacts (.txt + .json) from plain text (no timestamps).""" title = media_path.stem base = TRN / title base.parent.mkdir(parents=True, exist_ok=True) (base.with_suffix(".txt")).write_text(text, encoding="utf-8") (base.with_suffix(".json")).write_bytes(orjson.dumps({ "file": str(media_path), "language": language, "segments": [{"start": 0.0, "end": 0.0, "text": text}] })) return base def yt_dlp(url, outdir): # 1) Normalize YouTube Music URLs to standard YouTube yurl = url if 'music.youtube.com' in yurl: yurl = yurl.replace('music.youtube.com', 'www.youtube.com') outtmpl = str(outdir / "%(uploader)s/%(upload_date)s - %(title)s.%(ext)s") base_cmd = [ "yt-dlp", "-o", outtmpl, "-f", "bv*+ba/best", "--write-info-json", "--write-thumbnail", "--convert-thumbnails", "jpg", "--write-subs", "--write-auto-subs", "--sub-langs", os.getenv("YTDLP_SUBS_LANGS", "en.*,en"), "--convert-subs", "srt", "--no-playlist", "--no-warnings", "--restrict-filenames", ] # 3) Optional cookies (set YTDLP_COOKIES=/path/to/cookies.txt in .env and mount it) cookies_path = os.getenv("YTDLP_COOKIES", "").strip() if cookies_path: base_cmd += ["--cookies", cookies_path] # Primary attempt try: subprocess.check_call(base_cmd + [yurl]) except subprocess.CalledProcessError: # 2) Retry with Android client + mobile UA retry_cmd = base_cmd + [ "--extractor-args", "youtube:player_client=android", "--user-agent", "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Mobile Safari/537.36", yurl, ] subprocess.check_call(retry_cmd) media = ( list(outdir.rglob("*.[mM][pP]4")) + list(outdir.rglob("*.mkv")) + list(outdir.rglob("*.webm")) + list(outdir.rglob("*.m4a")) + list(outdir.rglob("*.mp3")) ) return sorted(media, key=lambda p: p.stat().st_mtime)[-1:] def extract_audio(src: Path, outdir: Path) -> Path: """Extract mono 16kHz WAV for robust transcription (handles odd containers/codecs).""" outdir.mkdir(parents=True, 
    wav_path = outdir / (src.stem + ".wav")
    # Force audio-only, mono, 16kHz WAV
    cmd = [
        "ffmpeg", "-nostdin", "-y",
        "-i", str(src),
        "-vn", "-ac", "1", "-ar", "16000",
        "-f", "wav", str(wav_path),
    ]
    try:
        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"ffmpeg extract failed: {e.output.decode(errors='ignore')}")
    return wav_path

# --- WAV trimming helper ---
def trim_wav(src_wav: Path, start_sec: float, outdir: Path) -> Path:
    """Return a trimmed 16k mono WAV starting at start_sec from src_wav."""
    outdir.mkdir(parents=True, exist_ok=True)
    if not start_sec or start_sec <= 0.0:
        return src_wav
    dst = outdir / (src_wav.stem + f".from_{int(start_sec)}s.wav")
    try:
        subprocess.check_output([
            "ffmpeg", "-nostdin", "-y",
            "-ss", str(max(0.0, float(start_sec))),
            "-i", str(src_wav),
            "-vn", "-ac", "1", "-ar", "16000",
            "-f", "wav", str(dst),
        ], stderr=subprocess.STDOUT)
        return dst
    except subprocess.CalledProcessError as e:
        # If trimming fails, fall back to the full file
        print(f"[whisper] trim failed, using full WAV: {e.output.decode(errors='ignore')}", flush=True)
        return src_wav

def media_duration_seconds(path: Path) -> float:
    """Return duration in seconds using ffprobe; fallback to 0.0 on error."""
    try:
        out = subprocess.check_output([
            "ffprobe", "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=nokey=1:noprint_wrappers=1",
            str(path)
        ], stderr=subprocess.STDOUT, text=True).strip()
        return float(out) if out else 0.0
    except Exception:
        return 0.0

# --- Partial transcript helpers ---
def _partial_paths(title: str) -> tuple[Path, Path]:
    base = TRN / title
    return base.with_suffix(".partial.json"), base.with_suffix(".partial.txt")

def _save_partial(title: str, language: str, segs: list[dict]):
    pjson, ptxt = _partial_paths(title)
    try:
        # Save JSON
        pjson.write_bytes(orjson.dumps({"file": str((TRN / title).with_suffix('.wav')), "language": language, "segments": segs}))
    except Exception as e:
        print(f"[whisper] partial json save failed: {e}", flush=True)
    try:
        # Save TXT snapshot
        ptxt.write_text(" ".join(s.get("text", "") for s in segs), encoding="utf-8")
    except Exception as e:
        print(f"[whisper] partial txt save failed: {e}", flush=True)

def transcribe(media_path: Path):
    model = get_model()
    print(f"[whisper] start transcribe: {media_path}", flush=True)
    # 1) Robustly extract audio to 16k mono WAV (fixes pyAV/webm edge cases)
    wav = extract_audio(media_path, TMP)
    title = media_path.stem
    base = TRN / title
    # Resume support: if a partial checkpoint exists, load it and trim input
    resume_segments = []
    resume_offset = 0.0
    language_hint = None
    if WHISPER_RESUME:
        pjson, ptxt = _partial_paths(title)
        if pjson.exists():
            try:
                pdata = json.loads(pjson.read_text(encoding="utf-8"))
                resume_segments = pdata.get("segments", []) or []
                if resume_segments:
                    resume_offset = float(resume_segments[-1].get("end", 0.0))
                language_hint = pdata.get("language")
                print(f"[whisper] resuming from ~{resume_offset:.2f}s with {len(resume_segments)} segments", flush=True)
            except Exception as e:
                print(f"[whisper] failed to load partial: {e}", flush=True)
    # If resuming, trim WAV from last end time
    wav_for_run = trim_wav(wav, resume_offset, TMP)
    # 2) Language selection
    lang = None if WHISPER_LANGUAGE.lower() == "auto" else WHISPER_LANGUAGE
    if language_hint and WHISPER_LANGUAGE.lower() == "auto":
        # carry hint forward if available
        lang = language_hint
    # 3) Transcribe
    segments, info = run_transcribe_with_fallback(wav_for_run, lang)
    # Determine duration for progress; use full WAV duration for consistent % regardless of resume
    dur = media_duration_seconds(wav) or 0.0
    # Start wall clock timer for speed/ETA
    start_wall = time.time()
    if WHISPER_RESUME and resume_offset and dur and resume_offset >= dur:
        print(f"[whisper] resume offset {resume_offset:.2f}s >= duration {dur:.2f}s; resetting resume.", flush=True)
        resume_offset = 0.0
    last_pct = -1
    segs = list(resume_segments)  # start with what we already have
    text_parts = [s.get("text", "") for s in resume_segments]
    # Walk new segments; shift their timestamps by resume_offset if trimmed
    seg_count_since_save = 0
    seg_index = len(resume_segments)
    for s in segments:
        seg_index += 1
        start = (s.start or 0.0) + resume_offset
        end = (s.end or 0.0) + resume_offset
        seg = {"start": start, "end": end, "text": s.text}
        segs.append(seg)
        text_parts.append(s.text)
        if WHISPER_LOG_SEGMENTS:
            print(f"[whisper] {start:8.2f}–{end:8.2f} {s.text.strip()}", flush=True)
        # progress logging every +5%
        if dur > 0 and end is not None:
            pct = int(min(100, max(0, (end / dur) * 100)))
            if pct >= last_pct + 5:
                log({
                    "status": "transcribing",
                    "path": str(media_path),
                    "title": title,
                    "progress": pct
                })
                last_pct = pct
                # compute realtime speed and ETA for console logs
                try:
                    elapsed = max(0.001, time.time() - start_wall)
                    processed = max(0.0, float(end))
                    speed = (processed / elapsed) if elapsed > 0 else 0.0  # seconds of audio processed per wall second
                    # represent as X real-time factor
                    rtf = speed  # 1.0 == real-time
                    eta = ((dur - processed) / speed) if (speed > 0 and dur > 0) else 0
                    print(f"[whisper] progress {pct:3d}% seg={seg_index:5d} rtf={rtf:0.2f}x eta={_fmt_eta(eta)}", flush=True)
                    # also mirror to feed log with speed/eta
                    try:
                        log({
                            "status": "transcribing",
                            "path": str(media_path),
                            "title": title,
                            "progress": pct,
                            "speed_rtf": round(rtf, 2),
                            "eta_sec": int(max(0, eta))
                        })
                    except Exception:
                        pass
                except Exception:
                    pass
        # periodic partial save
        seg_count_since_save += 1
        if WHISPER_RESUME and seg_count_since_save >= PARTIAL_SAVE_EVERY_SEGS:
            _save_partial(title, info.language or (WHISPER_LANGUAGE if WHISPER_LANGUAGE.lower() != "auto" else "en"), segs)
            seg_count_since_save = 0
    # ensure we mark 100% on completion
    if last_pct < 100:
        log({"status": "transcribing", "path": str(media_path), "title": title, "progress": 100})
    txt = " ".join(text_parts).strip()
    # Write final transcript artifacts
    (base.with_suffix(".json")).write_bytes(orjson.dumps({
        "file": str(media_path),
        "language": info.language,
        "segments": segs
    }))
    (base.with_suffix(".txt")).write_text(txt, encoding="utf-8")

    def fmt_ts(t):
        h = int(t // 3600); m = int((t % 3600) // 60); s = t - (h * 3600 + m * 60)
        return f"{h:02}:{m:02}:{s:06.3f}".replace('.', ',')

    with open(base.with_suffix(".srt"), "w", encoding="utf-8") as srt:
        for i, s in enumerate(segs, 1):
            srt.write(f"{i}\n{fmt_ts(s['start'])} --> {fmt_ts(s['end'])}\n{s['text'].strip()}\n\n")
    with open(base.with_suffix(".vtt"), "w", encoding="utf-8") as vtt:
        vtt.write("WEBVTT\n\n")
        for s in segs:
            vtt.write(f"{fmt_ts(s['start']).replace(',', '.')} --> {fmt_ts(s['end']).replace(',', '.')} \n{s['text'].strip()}\n\n")
    # 4) Copy SRT next to media for Plex (language-suffixed)
    try:
        lang_code = (info.language or (WHISPER_LANGUAGE if WHISPER_LANGUAGE.lower() != 'auto' else 'en')).lower()
        srt_src = base.with_suffix(".srt")
        srt_dst = media_path.with_suffix(f".{lang_code}.srt")
        shutil.copy2(srt_src, srt_dst)
    except Exception as e:
        print(f"[post] could not copy srt -> {srt_dst}: {e}", flush=True)
    # Write Kodi/Plex-compatible NFO using enhanced metadata (same as before)
    try:
        fallback = {
            "title": title,
title, "show": media_path.parent.name, "description": "", "pubdate": _extract_date_from_stem(title), "duration_sec": media_duration_seconds(media_path), "image": "", "guid": "", } meta = build_meta_from_sources(media_path, media_path.parent.name, fallback, ep=None) ttxt = (TRN / title).with_suffix(".txt").read_text(encoding="utf-8") write_episode_nfo(media_path, meta, ttxt) try: save_episode_artwork(meta.get("image"), media_path, meta.get("show")) except Exception: pass except Exception as e: print(f"[post] NFO write failed: {e}", flush=True) # Cleanup temp WAVs try: if wav_for_run != wav and wav_for_run.exists(): wav_for_run.unlink() if wav.exists(): wav.unlink() except Exception: pass # Remove partial checkpoints on success if WHISPER_RESUME: try: pjson, ptxt = _partial_paths(title) if pjson.exists(): pjson.unlink() if ptxt.exists(): ptxt.unlink() except Exception: pass # Final average speed over whole transcription try: total_elapsed = max(0.001, time.time() - start_wall) avg_rtf = (dur / total_elapsed) if total_elapsed > 0 else 0.0 print(f"[whisper] avg speed ~{avg_rtf:0.2f}x (audio_seconds / wall_seconds)", flush=True) except Exception: pass print(f"[whisper] finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s", flush=True) return base # --- Meilisearch helpers --- def _safe_doc_id(s: str) -> str: """ Meilisearch document IDs must be [A-Za-z0-9_-]. Convert the title to a safe slug. If the result is empty, fall back to a short SHA1 hash. """ import hashlib slug = re.sub(r"\s+", "_", (s or "").strip()) slug = re.sub(r"[^A-Za-z0-9_-]", "", slug) if not slug: slug = hashlib.sha1((s or "").encode("utf-8", errors="ignore")).hexdigest()[:16] return slug def ensure_meili_index(): """Create index 'library' with primaryKey 'id' if it does not already exist.""" try: r = requests.get(f"{MEILI_URL}/indexes/library", headers={"Authorization": f"Bearer {MEILI_KEY}"}, timeout=10) if r.status_code == 200: return # Attempt to create it cr = requests.post( f"{MEILI_URL}/indexes", headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type": "application/json"}, data=orjson.dumps({"uid": "library", "primaryKey": "id"}), timeout=10, ) # Ignore errors if another process created it first try: cr.raise_for_status() except Exception: pass except Exception: # Non-fatal; indexing will fail later if the index truly doesn't exist pass def index_meili(json_path: Path): # Make sure the index exists and is configured with a primary key ensure_meili_index() doc = json.loads(open(json_path, "r", encoding="utf-8").read()) file_field = doc.get("file", "") title = Path(file_field).stem if file_field else json_path.stem # Build a Meili-safe document ID doc_id = _safe_doc_id(title) # Extract a YYYYMMDD date if present m = re.search(r"\b(\d{8})\b", title) date = m.group(1) if m else "" payload = { "id": doc_id, "type": "podcast", "title": title, "date": date, "source": str(Path(LIB, Path(file_field or title).name)), "text": " ".join(s.get("text", "") for s in doc.get("segments", [])), "segments": doc.get("segments", []), "meta": {"language": doc.get("language", "")}, } for attempt in range(5): try: r = requests.post( f"{MEILI_URL}/indexes/library/documents", headers={ "Authorization": f"Bearer {MEILI_KEY}", "Content-Type": "application/json", }, data=orjson.dumps(payload), timeout=15, ) r.raise_for_status() break except Exception: if attempt == 4: raise time.sleep(2 * (attempt + 1)) import tldextract, trafilatura, requests as _requests def slugify(text): text = re.sub(r'[^A-Za-z0-9\-._ ]+', 
def save_web_snapshot(url: str):
    r = _requests.get(url, timeout=30, headers={"User-Agent": "Mozilla/5.0"})
    r.raise_for_status()
    html = r.text
    downloaded = trafilatura.load_html(html, url=url)
    text = trafilatura.extract(downloaded, include_comments=False, include_images=False, with_metadata=True) or ""
    meta = trafilatura.metadata.extract_metadata(downloaded) or None
    # Prefer trafilatura's metadata title; fall back to the HTML <title> tag, then the URL
    title_match = re.search(r'<title[^>]*>(.*?)</title>', html, re.I | re.S)
    title = (meta.title if meta and getattr(meta, 'title', None) else None) or (title_match.group(1).strip() if title_match else url)
    date = (meta.date if meta and getattr(meta, 'date', None) else "")
    parts = tldextract.extract(url)
    domain = ".".join([p for p in [parts.domain, parts.suffix] if p])
    slug = slugify(title)
    outdir = LIB / "web" / domain
    outdir.mkdir(parents=True, exist_ok=True)
    base = outdir / slug
    open(base.with_suffix(".html"), "w", encoding="utf-8", errors="ignore").write(html)
    open(base.with_suffix(".txt"), "w", encoding="utf-8", errors="ignore").write(text)
    return base, title, domain, date, text

def index_web(base: Path, title: str, domain: str, date: str, text: str, url: str):
    payload = {
        "id": f"web:{domain}:{base.stem}",
        "type": "web",
        "title": title,
        "date": re.sub(r'[^0-9]', '', date)[:8] if date else "",
        "source": f"file://{str(base.with_suffix('.html'))}",
        "text": text,
        "segments": [],
        "meta": {"url": url, "domain": domain}
    }
    r = requests.post(
        f"{MEILI_URL}/indexes/library/documents",
        headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type": "application/json"},
        data=orjson.dumps(payload),
    )
    r.raise_for_status()

def is_media_url(url: str):
    lowered = url.lower()
    media_hosts = ["youtube.com", "youtu.be", "rumble.com", "vimeo.com", "soundcloud.com", "spotify.com", "podbean.com", "buzzsprout.com"]
    return any(h in lowered for h in media_hosts)

def owui_headers():
    return {"Authorization": f"Bearer {OWUI_KEY}"} if OWUI_KEY else {}

def owui_get_or_create_kb():
    if not OWUI_URL or not OWUI_KEY:
        return None
    try:
        r = requests.get(f"{OWUI_URL}/api/v1/knowledge/list", headers=owui_headers(), timeout=15)
        r.raise_for_status()
        for kb in r.json().get("data", []):
            if kb.get("name") == OWUI_KB:
                return kb["id"]
    except Exception:
        pass
    r = requests.post(
        f"{OWUI_URL}/api/v1/knowledge/create",
        headers={**owui_headers(), "Content-Type": "application/json"},
        data=orjson.dumps({"name": OWUI_KB, "description": "All local content indexed by podx"}),
        timeout=15,
    )
    r.raise_for_status()
    return r.json()["data"]["id"]

def owui_upload_and_attach(path: Path, kb_id: str):
    with open(path, "rb") as f:
        r = requests.post(f"{OWUI_URL}/api/v1/files/", headers=owui_headers(), files={"file": (path.name, f)}, timeout=60 * 10)
    r.raise_for_status()
    file_id = r.json()["data"]["id"]
    r = requests.post(
        f"{OWUI_URL}/api/v1/knowledge/{kb_id}/file/add",
        headers={**owui_headers(), "Content-Type": "application/json"},
        data=orjson.dumps({"file_id": file_id}),
        timeout=60,
    )
    r.raise_for_status()
    return True

def publish_to_openwebui(paths):
    if not OWUI_URL or not OWUI_KEY:
        return
    try:
        kb_id = owui_get_or_create_kb()
        for p in paths:
            p = Path(p)
            if not p.exists():
                continue
            try:
                owui_upload_and_attach(p, kb_id)
            except Exception as e:
                log({"url": str(p), "status": "owui_error", "error": str(e)})
    except Exception as e:
        log({"status": "owui_error", "error": str(e)})

def handle_local_file(path_str: str):
    """Transcribe & index a local media file that already exists in /library.
    If a sidecar .txt/.srt/.vtt exists, use it instead of running Whisper.
    Safe to call repeatedly; it skips if transcript JSON already exists.
    """
    try:
        p = Path(path_str)
        if not p.exists():
            log({"url": path_str, "status": "error", "error": "file_not_found"})
            return
        title = p.stem
        base_json = TRN / f"{title}.json"
        if base_json.exists():
            log({"url": path_str, "status": "skip", "reason": "already_transcribed"})
            return
        info = {"url": path_str, "status": "transcribing", "title": title, "uploader": p.parent.name, "date": "", "path": str(p), "progress": 0}
        log(info)
        # 0) Try RSS resolver first: if an episode with a transcript exists, use it (skip Whisper)
        try:
            ep = match_media_to_rss(p)
        except Exception as _e:
            ep = None
        if ep:
            base = use_rss_transcript(p, ep)
            if base:
                index_meili(base.with_suffix(".json"))
                publish_to_openwebui([base.with_suffix(".txt")])
                log({**info, **{"status": "done", "note": "used_rss_transcript"}})
                return
        # 1) Prefer an existing transcript sidecar if present
        sidecar = find_sidecar_transcript(p)
        if sidecar:
            plain = transcript_text_from_file(sidecar)
            lang = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
            base = write_plain_transcript(p, plain, language=lang)
            ensure_sidecar_next_to_media(sidecar, p, lang=lang)
            index_meili(base.with_suffix(".json"))
            publish_to_openwebui([base.with_suffix(".txt")])
            try:
                # Use info.json (if present) to enrich metadata
                fallback = {
                    "title": title,
                    "episode_title": title,
                    "show": p.parent.name,
                    "description": "",
                    "pubdate": _extract_date_from_stem(title),
                    "duration_sec": media_duration_seconds(p),
                    "image": "",
                    "guid": "",
                }
                meta = build_meta_from_sources(p, p.parent.name, fallback, ep=None)
                ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
                write_episode_nfo(p, meta, ttxt)
                # Try to fetch and save artwork locally
                try:
                    save_episode_artwork(meta.get("image"), p, meta.get("show"))
                except Exception:
                    pass
            except Exception as e:
                print(f"[post] NFO write failed: {e}", flush=True)
            log({**info, **{"status": "done", "note": "used_existing_transcript"}})
            return
        # 1.5) Reuse a transcript that exists in the repository for a matching episode
        repo_json = find_repo_transcript_for_media(p)
        if repo_json:
            base = reuse_repo_transcript(p, repo_json)
            if base:
                index_meili(base.with_suffix(".json"))
                publish_to_openwebui([base.with_suffix(".txt")])
                try:
                    data = json.loads((base.with_suffix(".json")).read_text(encoding="utf-8"))
                    # Start with repo metadata, then enrich from yt-dlp info.json if any
                    meta_repo = {
                        "title": data.get("title") or title,
                        "episode_title": data.get("title") or title,
                        "show": data.get("show") or p.parent.name,
                        "description": data.get("description") or "",
                        "pubdate": data.get("pubdate") or _extract_date_from_stem(title),
                        "duration_sec": media_duration_seconds(p),
                        "image": data.get("image"),
                        "guid": data.get("guid") or data.get("id"),
                    }
                    meta = build_meta_from_sources(p, p.parent.name, meta_repo, ep=None)
                    ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
                    write_episode_nfo(p, meta, ttxt)
                    try:
                        save_episode_artwork(meta.get("image"), p, meta.get("show"))
                    except Exception:
                        pass
                except Exception as e:
                    print(f"[post] NFO write failed: {e}", flush=True)
                log({**info, **{"status": "done", "note": "reused_repo_transcript"}})
                return
        # 2) Otherwise, run transcription
        base = transcribe(p)
        index_meili(base.with_suffix(".json"))
        publish_to_openwebui([base.with_suffix(".txt")])
        try:
            fallback = {
                "title": title,
                "episode_title": title,
                "show": p.parent.name,
                "description": "",
                "pubdate": _extract_date_from_stem(title),
                "duration_sec": media_duration_seconds(p),
                "image": "",
                "guid": "",
            }
            meta = build_meta_from_sources(p, p.parent.name, fallback, ep=None)
            ttxt = (TRN / title).with_suffix(".txt").read_text(encoding="utf-8")
            write_episode_nfo(p, meta, ttxt)
            try:
                save_episode_artwork(meta.get("image"), p, meta.get("show"))
            except Exception:
                pass
        except Exception as e:
            print(f"[post] NFO write failed: {e}", flush=True)
        log({**info, **{"status": "done"}})
    except Exception as e:
        log({"url": path_str, "status": "error", "error": str(e)})
        raise

# --- Refresh sidecar metadata and subtitles for an already-downloaded media file ---
def refresh_media(path_str: str):
    """
    Refresh sidecar metadata (info.json, thumbnail) and subtitles for an already-downloaded media file.
    Requires a companion .info.json next to the media (to supply the original URL). No media re-download.
    """
    try:
        p = Path(path_str)
        if not p.exists() or not p.is_file():
            log({"url": path_str, "status": "error", "error": "file_not_found"})
            return
        # Locate existing info.json to get the original URL
        info_json = None
        for cand in [p.parent / f"{p.name}.info.json", p.parent / f"{p.stem}.info.json"]:
            if cand.exists():
                info_json = cand
                break
        if not info_json:
            log({"path": str(p), "status": "refresh-skip", "reason": "no_info_json"})
            print(f"[refresh] skip: no info.json next to {p}", flush=True)
            return
        info = load_info_json(info_json) or {}
        url = info.get("webpage_url") or info.get("original_url") or info.get("url")
        if not url:
            log({"path": str(p), "status": "refresh-skip", "reason": "no_url_in_info"})
            print(f"[refresh] skip: no URL in {info_json}", flush=True)
            return
        # Prepare yt-dlp command to refresh sidecars only, writing files exactly next to the media
        outtmpl = str(p.with_suffix(".%(ext)s"))
        sub_langs = os.getenv("YTDLP_SUBS_LANGS", "en.*,en")
        cmd = [
            "yt-dlp",
            "--skip-download",
            "--write-info-json",
            "--write-thumbnail",
            "--convert-thumbnails", "jpg",
            "--write-subs",
            "--write-auto-subs",
            "--sub-langs", sub_langs,
            "--convert-subs", "srt",
            "-o", outtmpl,
            url,
        ]
        print(f"[refresh] refreshing sidecars for {p} via yt-dlp", flush=True)
        try:
            subprocess.check_call(cmd)
        except subprocess.CalledProcessError as e:
            print(f"[refresh] yt-dlp failed: {e}", flush=True)
            raise
        # Ensure language-suffixed SRT exists (Plex-friendly) if any subs were fetched
        try:
            # Pick any .srt just fetched that matches the base name
            for s in p.parent.glob(f"{p.stem}*.srt"):
                # If it's already lang-suffixed, keep; also copy to .en.srt when only a plain .srt exists
                if s.name == f"{p.stem}.srt":
                    shutil.copy2(s, p.with_suffix(".en.srt"))
        except Exception:
            pass
        # Rebuild NFO using fresh info.json (and RSS if available)
        try:
            # Try RSS match to enrich metadata (non-fatal if not present)
            ep = None
            try:
                ep = match_media_to_rss(p)
            except Exception:
                ep = None
            fallback = {
                "title": p.stem,
                "episode_title": p.stem,
                "show": p.parent.name,
                "description": "",
                "pubdate": _extract_date_from_stem(p.stem),
                "duration_sec": media_duration_seconds(p),
                "image": "",
                "guid": "",
            }
            meta = build_meta_from_sources(p, p.parent.name, fallback, ep)
            # Save local artwork too
            try:
                save_episode_artwork(meta.get("image"), p, meta.get("show"))
            except Exception:
                pass
            # If a transcript already exists, include it in the NFO plot preview
            ttxt_path = (TRN / p.stem).with_suffix(".txt")
            ttxt = ttxt_path.read_text(encoding="utf-8") if ttxt_path.exists() else None
            write_episode_nfo(p, meta, ttxt)
        except Exception as e:
            print(f"[refresh] NFO/artwork update failed: {e}", flush=True)
        log({"path": str(p), "status": "refresh-done"})
        print(f"[refresh] done for {p}", flush=True)
    except Exception as e:
        log({"path": path_str, "status": "error", "error": str(e)})
        raise
def handle_web(url: str):
    info = {"url": url, "status": "web-downloading", "title": "", "uploader": "", "date": "", "path": ""}
    log(info)
    base, title, domain, date, text = save_web_snapshot(url)
    info.update({"title": title, "uploader": domain, "date": date, "path": str(base.with_suffix('.html'))})
    log({**info, **{"status": "web-indexing"}})
    index_web(base, title, domain, date, text, url)
    push = [p for p in [base.with_suffix('.txt'), base.with_suffix('.html')] if p.exists()]
    publish_to_openwebui(push)
    log({**info, **{"status": "done"}})

def handle_url(url: str):
    try:
        # If a local file path (or file:// URL) is provided, process it directly
        if url.startswith("file://"):
            return handle_local_file(url[7:])
        if url.startswith("/") and Path(url).exists():
            return handle_local_file(url)
        if not is_media_url(url):
            handle_web(url)
            return
        info = {"url": url, "status": "queued", "title": "", "uploader": "", "date": "", "path": ""}
        log({**info, **{"status": "downloading"}})
        files = yt_dlp(url, TMP)
        for f in files:
            parts = f.relative_to(TMP).parts
            uploader = sanitize(parts[0]) if len(parts) > 1 else "Unknown"
            dest_dir = LIB / uploader
            dest_dir.mkdir(parents=True, exist_ok=True)
            dest = dest_dir / sanitize(f.name)
            shutil.move(str(f), dest)
            # Move companion files produced by yt-dlp (info.json, thumbnail, subtitles)
            try:
                companions = find_companion_files(f)
                # info.json -> prefer "<name>.info.json", fallback to "<stem>.info.json"
                if companions.get("info") and companions["info"].exists():
                    dest_info = dest.parent / f"{dest.name}.info.json"
                    try:
                        shutil.move(str(companions["info"]), dest_info)
                    except Exception:
                        # fallback naming without extension
                        dest_info2 = dest.parent / f"{dest.stem}.info.json"
                        try:
                            shutil.move(str(companions['info']), dest_info2)
                        except Exception:
                            pass
                # thumbnail -> "<stem>.jpg"
                if companions.get("thumb") and companions["thumb"].exists():
                    try:
                        shutil.move(str(companions["thumb"]), str(dest.with_suffix(".jpg")))
                    except Exception:
                        pass
                # subtitles -> preserve language suffix: "<stem><suffix_tail>"
                for s in companions.get("subs", []):
                    if not s.exists():
                        continue
                    suffix_tail = ""
                    s_name = s.name
                    f_stem = f.stem
                    if s_name.startswith(f_stem):
                        suffix_tail = s_name[len(f_stem):]  # includes leading dot if present
                    else:
                        suffix_tail = s.suffix
                    dest_sub = dest.parent / f"{dest.stem}{suffix_tail}"
                    try:
                        shutil.move(str(s), str(dest_sub))
                    except Exception:
                        pass
            except Exception:
                pass
            info.update({
                "title": dest.stem,
                "uploader": uploader,
                "date": (re.findall(r"\b(\d{8})\b", dest.stem)[0] if re.findall(r"\b(\d{8})\b", dest.stem) else ""),
                "path": str(dest),
            })
            log({**info, **{"status": "transcribing", "progress": 0}})
            # Try RSS transcript resolver first
            ep = None
            try:
                ep = match_media_to_rss(dest)
            except Exception:
                ep = None
            if ep:
                base = use_rss_transcript(dest, ep)
            else:
                base = None
            # 1.5) If we didn't get an RSS transcript and a matching one already exists in the repo, reuse it
            if not base:
                repo_json = find_repo_transcript_for_media(dest)
                if repo_json:
                    base = reuse_repo_transcript(dest, repo_json)
            if not base:
                base = transcribe(dest)
            index_meili(base.with_suffix(".json"))
            publish_to_openwebui([base.with_suffix(".txt")])
            try:
                # Build metadata from RSS (if matched), yt-dlp info.json, and sensible fallbacks
                fallback = {
                    "title": dest.stem,
                    "episode_title": dest.stem,
                    "show": uploader,
                    "description": "",
                    "pubdate": _extract_date_from_stem(dest.stem),
                    "duration_sec": media_duration_seconds(dest),
                    "image": "",
                    "guid": "",
                }
                meta = build_meta_from_sources(dest, uploader, fallback, ep if 'ep' in locals() else None)
                ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
base.with_suffix(".txt").read_text(encoding="utf-8") write_episode_nfo(dest, meta, ttxt) # Save local artwork for Plex/Kodi from meta image url try: save_episode_artwork(meta.get("image"), dest, meta.get("show")) except Exception: pass except Exception as e: print(f"[post] NFO write failed: {e}", flush=True) log({**info, **{"status":"done"}}) except Exception as e: log({"url": url, "status":"error", "error": str(e)}) raise