1561 lines
		
	
	
		
			63 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			1561 lines
		
	
	
		
			63 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import os, subprocess, shutil, json, re, orjson, requests
 | ||
| from pathlib import Path
 | ||
| import math
 | ||
| import difflib
 | ||
| import time
 | ||
| from faster_whisper import WhisperModel
 | ||
| 
 | ||
| from xml.sax.saxutils import escape as xml_escape
 | ||
| 
 | ||
| MEILI_URL = os.getenv("MEILI_URL", "http://meili:7700")
 | ||
| MEILI_KEY = os.getenv("MEILI_KEY", "")
 | ||
| LIB = Path(os.getenv("LIBRARY_ROOT", "/library"))
 | ||
| TRN = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
 | ||
| TMP = Path(os.getenv("TMP_ROOT", "/tmpdl"))
 | ||
| 
 | ||
| MODEL_NAME = os.getenv("WHISPER_MODEL","large-v3")
 | ||
| COMPUTE = os.getenv("WHISPER_PRECISION","int8")
 | ||
| WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "auto").strip()
 | ||
| 
 | ||
| # Whisper device/config controls
 | ||
| WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "auto").strip()
 | ||
| WHISPER_DEVICE_INDEX = int(os.getenv("WHISPER_DEVICE_INDEX", "0"))
 | ||
| WHISPER_CPU_THREADS = int(os.getenv("WHISPER_CPU_THREADS", "4"))
 | ||
| 
 | ||
| # Whisper logging & resume controls
 | ||
| WHISPER_LOG_SEGMENTS = os.getenv("WHISPER_LOG_SEGMENTS", "1") not in ("0", "false", "False")
 | ||
| WHISPER_RESUME = os.getenv("WHISPER_RESUME", "1") not in ("0", "false", "False")
 | ||
| PARTIAL_SAVE_EVERY_SEGS = int(os.getenv("WHISPER_PARTIAL_SAVE_EVERY_SEGS", "20"))
 | ||
| 
 | ||
| # RSS resolver config
 | ||
| RSS_INDEX_PATH = Path(os.getenv("RSS_INDEX_PATH", "/transcripts/rss_index.json"))
 | ||
| RSS_DURATION_TOLERANCE = int(os.getenv("RSS_DURATION_TOLERANCE", "150"))  # seconds
 | ||
| DEFAULT_TRANSCRIPT_LANG = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
 | ||
| 
 | ||
| OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/")
 | ||
| OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "")
 | ||
| OWUI_KB  = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library")
 | ||
| 
 | ||
# Make sure the working directories exist before anything touches them;
# directories that are already present are left alone.
for _root in (TRN, LIB, TMP):
    _root.mkdir(parents=True, exist_ok=True)
del _root

# Lazy Whisper model loader so the worker can start even if model download/setup is slow
_model = None
 | ||
| 
 | ||
def get_model():
    """Return the shared WhisperModel, constructing it on the first call.

    The instance is cached in the module-level ``_model`` and built from the
    module's WHISPER_* / MODEL_NAME / COMPUTE settings.
    """
    global _model
    if _model is not None:
        return _model

    print(f"[whisper] loading model='{MODEL_NAME}' device='{WHISPER_DEVICE}' idx={WHISPER_DEVICE_INDEX} compute='{COMPUTE}' threads={WHISPER_CPU_THREADS}", flush=True)
    settings = {
        "device": WHISPER_DEVICE,
        "device_index": WHISPER_DEVICE_INDEX,
        "compute_type": COMPUTE,
        "cpu_threads": WHISPER_CPU_THREADS,
    }
    _model = WhisperModel(MODEL_NAME, **settings)
    return _model
 | ||
| 
 | ||
| # --- Helper: Reset model with new device and device_index ---
 | ||
def reset_model(device: str, device_index: int | None = None):
    """Replace the shared ``_model`` with a WhisperModel bound to ``device``.

    ``device_index`` falls back to WHISPER_DEVICE_INDEX when it is None; the
    model name, compute type and CPU thread count come from module settings.
    """
    global _model
    if device_index is None:
        idx = WHISPER_DEVICE_INDEX
    else:
        idx = device_index

    print(f"[whisper] resetting model='{MODEL_NAME}' device='{device}' idx={idx} compute='{COMPUTE}' threads={WHISPER_CPU_THREADS}", flush=True)
    settings = {
        "device": device,
        "device_index": idx,
        "compute_type": COMPUTE,
        "cpu_threads": WHISPER_CPU_THREADS,
    }
    _model = WhisperModel(MODEL_NAME, **settings)
 | ||
| 
 | ||
| # --- Helper: Run transcribe with fallback to CPU on GPU/oom errors ---
 | ||
def run_transcribe_with_fallback(wav_path: Path, lang):
    """Transcribe ``wav_path``, retrying once on CPU after a GPU-looking failure.

    Returns whatever ``model.transcribe`` returns (documented here as
    ``(segments, info)``).  If the first attempt raises and the message
    mentions CUDA/HIP/ROCm/out-of-memory, the shared model is rebuilt on CPU
    and the call is repeated once.  Any other error, or a failure of the CPU
    retry, is re-raised.

    Model loading is part of the guarded attempt, so a GPU failure while the
    model is being constructed also triggers the CPU retry (previously only
    errors raised by ``transcribe`` itself did).
    """
    # NOTE(review): if transcribe() hands back a lazily evaluated segment
    # iterator, errors raised while the caller iterates it are not covered by
    # this fallback - confirm against the faster-whisper documentation.
    gpu_markers = (
        "CUDA", "cublas", "out of memory", "HIP", "ROCm", "device-side assert",
        "CUDNN", "cudaError", "cuda runtime", "cudaMalloc",
    )

    def _attempt():
        return get_model().transcribe(str(wav_path), vad_filter=True, language=lang)

    try:
        return _attempt()
    except Exception as e:
        msg = str(e)
        lowered = msg.lower()
        if not any(marker.lower() in lowered for marker in gpu_markers):
            raise
        print(f"[whisper] GPU error detected: '{msg}'. Retrying on CPU...", flush=True)
        reset_model("cpu", 0)
        try:
            return _attempt()
        except Exception as e2:
            print(f"[whisper] CPU fallback also failed: {e2}", flush=True)
            raise
 | ||
| 
 | ||
def log(feed):
    """Append ``feed`` as one JSON line to ``_feed.log`` under TRN.

    Best-effort: any failure (I/O or serialization) is silently ignored so
    that logging can never interrupt the caller.
    """
    target = TRN / "_feed.log"
    try:
        with target.open("a", encoding="utf-8") as handle:
            handle.write(orjson.dumps(feed).decode() + "\n")
    except Exception:
        return
 | ||
| 
 | ||
def sanitize(name):
    """Replace each run of ``\\ / : " * ? < > |`` with one space and trim the ends."""
    forbidden_run = r'[\\/:"*?<>|]+'
    spaced = re.sub(forbidden_run, ' ', name)
    return spaced.strip()
 | ||
| 
 | ||
| # ---------- RSS transcript resolver ----------
 | ||
| 
 | ||
| def _normalize_title(t: str) -> str:
 | ||
|     t = (t or "").lower()
 | ||
|     t = re.sub(r"\s+", " ", t)
 | ||
|     # remove punctuation-ish
 | ||
|     t = re.sub(r"[^a-z0-9 _-]+", "", t)
 | ||
|     return t.strip()
 | ||
| 
 | ||
| def _stem_without_date(stem: str) -> str:
 | ||
|     # drop leading YYYYMMDD - from filenames created by yt-dlp template
 | ||
|     m = re.match(r"^\d{8}\s*-\s*(.*)$", stem)
 | ||
|     return m.group(1) if m else stem
 | ||
| 
 | ||
| def _extract_date_from_stem(stem: str) -> str | None:
 | ||
|     m = re.search(r"\b(\d{8})\b", stem)
 | ||
|     return m.group(1) if m else None
 | ||
| 
 | ||
| def _best_title_match(title: str, candidates: list[str]) -> tuple[str, float]:
 | ||
|     """Return (best_title, score 0..1) using difflib SequenceMatcher."""
 | ||
|     if not candidates:
 | ||
|         return "", 0.0
 | ||
|     norm_title = _normalize_title(title)
 | ||
|     best = ("", 0.0)
 | ||
|     for c in candidates:
 | ||
|         score = difflib.SequenceMatcher(None, norm_title, _normalize_title(c)).ratio()
 | ||
|         if score > best[1]:
 | ||
|             best = (c, score)
 | ||
|     return best
 | ||
| 
 | ||
def _load_rss_index() -> list[dict]:
    """Load the episode list stored at RSS_INDEX_PATH.

    Accepts either ``{"episodes": [...]}`` or a flat list.  A missing file,
    unreadable/invalid JSON (which is logged) or any other shape yields ``[]``.
    """
    try:
        if not RSS_INDEX_PATH.exists():
            return []
        payload = json.loads(RSS_INDEX_PATH.read_text(encoding="utf-8"))
    except Exception as e:
        print(f"[resolver] failed to load RSS index: {e}", flush=True)
        return []

    # supports {"episodes":[...]} or a flat list
    if isinstance(payload, dict) and "episodes" in payload:
        return payload["episodes"] or []
    if isinstance(payload, list):
        return payload
    return []
 | ||
| 
 | ||
def _duration_to_seconds(value) -> float:
    """Convert an episode duration to seconds; return 0.0 when it is unusable.

    Accepts numbers, numeric strings and colon-separated clock strings
    ("MM:SS" / "HH:MM:SS").
    """
    if value is None or isinstance(value, bool):
        return 0.0
    if isinstance(value, (int, float)):
        return float(value)
    text = str(value).strip()
    if not text:
        return 0.0
    try:
        if ":" in text:
            total = 0.0
            for part in text.split(":"):
                total = total * 60 + float(part)
            return total
        return float(text)
    except ValueError:
        return 0.0


def match_media_to_rss(media_path: Path) -> dict | None:
    """Try to match a local media file to an RSS episode entry.

    Episodes come from the RSS index.  When the file name carries a YYYYMMDD
    date, only episodes whose ``date``/``pubdate`` equals it are considered
    (falling back to all episodes if none do).  Each candidate is scored by
    title similarity plus a 0.1 bonus when its duration is within
    RSS_DURATION_TOLERANCE seconds of the media (or when either duration is
    unknown).  The best candidate is returned if its score is at least 0.5,
    otherwise None.

    Index entries that are not dicts are ignored, and a duration that cannot
    be parsed is treated as unknown instead of aborting the whole match.
    """
    episodes = [e for e in _load_rss_index() if isinstance(e, dict)]
    if not episodes:
        return None

    stem = media_path.stem
    title_no_date = _stem_without_date(stem)
    file_date = _extract_date_from_stem(stem)
    media_dur = media_duration_seconds(media_path)

    # Candidates: filter by date if present, else all
    pool = episodes
    if file_date:
        same_day = [
            e for e in episodes
            if str(e.get("date", "")) == file_date or str(e.get("pubdate", "")) == file_date
        ]
        if same_day:
            pool = same_day

    # Pick best by (title similarity, duration proximity)
    best_ep, best_score = None, -1.0
    for ep in pool:
        ep_title = ep.get("title") or ep.get("itunes_title") or ""
        sim = _best_title_match(title_no_date, [ep_title])[1]
        dur = _duration_to_seconds(ep.get("duration_sec") or ep.get("duration"))
        dur_ok = True
        if media_dur and dur:
            dur_ok = abs(media_dur - dur) <= RSS_DURATION_TOLERANCE
        score = sim + (0.1 if dur_ok else 0.0)
        if score > best_score:
            best_score, best_ep = score, ep

    if best_ep and best_score >= 0.5:
        print(f"[resolver] matched '{stem}' -> '{best_ep.get('title','')}' score={best_score:.2f}", flush=True)
        return best_ep
    return None
 | ||
| 
 | ||
| def _choose_transcript_url(ep: dict) -> tuple[str, str] | tuple[None, None]:
 | ||
|     """Return (url, kind) preferring txt, vtt, then srt. 'kind' in {'txt','vtt','srt'}."""
 | ||
|     # unified structure from rss_ingest.py: ep["transcripts"] = [{"url":..., "type": ...}, ...]
 | ||
|     items = ep.get("transcripts") or []
 | ||
|     # some ingesters store separate keys
 | ||
|     if not items:
 | ||
|         for key, kind in [("transcript_txt","txt"), ("transcript_vtt","vtt"), ("transcript_srt","srt")]:
 | ||
|             if ep.get(key):
 | ||
|                 items.append({"url": ep[key], "type": kind})
 | ||
|     # preference order
 | ||
|     for kind in ["txt", "vtt", "srt"]:
 | ||
|         for it in items:
 | ||
|             t = (it.get("type") or "").lower()
 | ||
|             u = it.get("url") or ""
 | ||
|             if u and (kind in t or (kind == "txt" and t in ["text","plain","text/plain"]) or (kind in u.lower())):
 | ||
|                 return u, kind
 | ||
|     return (None, None)
 | ||
| 
 | ||
def fetch_rss_transcript(ep: dict, dest_dir: Path) -> Path | None:
    """Download the episode's transcript into ``dest_dir`` and return its path.

    The file is named after the sanitized episode title (or guid), falling
    back to "episode" when that leaves nothing, with the chosen kind
    (txt/vtt/srt) as extension.  VTT/SRT payloads are stored byte-for-byte;
    no format conversion happens here.  Plain-text payloads are written as
    UTF-8.  Returns None when the episode has no transcript URL or the
    download fails (the failure is logged).
    """
    url, kind = _choose_transcript_url(ep)
    if not url:
        return None
    dest_dir.mkdir(parents=True, exist_ok=True)
    # filename from episode title; never let it collapse to a bare extension
    safe = sanitize(str(ep.get("title") or ep.get("guid") or "episode")) or "episode"
    path = dest_dir / f"{safe}.{kind}"
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        if kind in ("vtt", "srt"):
            path.write_bytes(r.content)
        else:
            text = None
            declared = (r.headers.get("Content-Type") or "").lower()
            if "charset" not in declared:
                # Without a declared charset, requests assumes ISO-8859-1 for
                # text/* bodies, which garbles UTF-8 transcripts; try UTF-8 first.
                try:
                    text = r.content.decode("utf-8")
                except UnicodeDecodeError:
                    text = None
            if text is None:
                text = r.text
            path.write_text(text, encoding="utf-8")
        print(f"[resolver] downloaded transcript ({kind}) from {url}", flush=True)
        return path
    except Exception as e:
        print(f"[resolver] failed to fetch transcript: {e}", flush=True)
        return None
 | ||
| 
 | ||
def use_rss_transcript(media_path: Path, ep: dict) -> Path | None:
    """Build the standard transcript artifacts for ``media_path`` from an RSS transcript.

    Uses the file named by ``ep["transcript_local"]`` when it exists,
    otherwise downloads the transcript into TMP.  Then writes the plain-text
    transcript, places a subtitle sidecar next to the media, records a
    ``.prov.json`` provenance file and, best-effort, an NFO plus artwork.
    Returns the transcript base path, or None when no transcript file is
    available.
    """
    # A locally saved transcript named by the entry wins; otherwise download.
    sidecar = None
    hinted = ep.get("transcript_local")
    if hinted:
        hinted_path = Path(hinted)
        if hinted_path.exists():
            sidecar = hinted_path
    if sidecar is None:
        sidecar = fetch_rss_transcript(ep, TMP)

    if sidecar is None or not sidecar.exists():
        return None

    # Convert to plain text
    plain = transcript_text_from_file(sidecar)
    declared_lang = ep.get("language") or ep.get("lang") or DEFAULT_TRANSCRIPT_LANG
    lang = declared_lang.split("-")[0]
    base = write_plain_transcript(media_path, plain, language=lang)

    # Place an SRT next to video for Plex
    ensure_sidecar_next_to_media(sidecar, media_path, lang=lang)

    # Write provenance sidecar
    provenance = {
        "source": "rss",
        "feed": ep.get("feed_url"),
        "guid": ep.get("guid"),
        "episode_title": ep.get("title"),
        "transcript_kind": sidecar.suffix.lower().lstrip("."),
        "transcript_url": _choose_transcript_url(ep)[0] or "",
    }
    base.with_suffix(".prov.json").write_bytes(orjson.dumps(provenance))

    # Write Kodi/Plex-compatible NFO; failures are logged and never fatal.
    try:
        # Gather metadata for NFO from RSS entry
        episode_title = ep.get("title")
        meta = {
            "title": episode_title,
            "episode_title": episode_title,
            "show": ep.get("podcast_title") or ep.get("feed_title") or ep.get("show"),
            "description": ep.get("description") or ep.get("content"),
            "pubdate": ep.get("pubdate"),
            "pubdate_iso": ep.get("date_iso"),
            "duration_sec": ep.get("duration_sec") or ep.get("duration"),
            "image": ep.get("image") or ep.get("image_url"),
            "guid": ep.get("guid"),
        }
        txt_path = base.with_suffix(".txt")
        if txt_path.exists():
            transcript_text = txt_path.read_text(encoding="utf-8")
        else:
            transcript_text = None
        write_episode_nfo(media_path, meta, transcript_text)
        # Save local artwork for Plex/Kodi
        try:
            save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
        except Exception:
            pass
    except Exception as e:
        print(f"[post] NFO write failed: {e}", flush=True)
    return base
 | ||
| 
 | ||
| def find_sidecar_transcript(media_path: Path) -> Path | None:
 | ||
|     """Return a .txt/.srt/.vtt transcript file sitting next to media, if any.
 | ||
|     Tries common variants including language-suffixed SRT/VTT.
 | ||
|     """
 | ||
|     candidates: list[Path] = []
 | ||
|     # exact same stem in same folder
 | ||
|     for ext in [".txt", ".srt", ".vtt"]:
 | ||
|         p = media_path.parent / (media_path.stem + ext)
 | ||
|         if p.exists():
 | ||
|             candidates.append(p)
 | ||
|     # language-suffixed near the media file (e.g., .en.srt)
 | ||
|     for ext in [".srt", ".vtt"]:
 | ||
|         p = media_path.with_suffix(f".en{ext}")
 | ||
|         if p.exists() and p not in candidates:
 | ||
|             candidates.append(p)
 | ||
|     return candidates[0] if candidates else None
 | ||
| 
 | ||
| 
 | ||
| # ---------- Transcript repository reuse helpers ----------
 | ||
| 
 | ||
def find_repo_transcript_for_media(media_path: Path) -> Path | None:
    """Search the transcript repository (TRN) for an existing transcript
    that likely belongs to this media file.

    Each ``*.json`` in TRN is compared by fuzzy title similarity (dates
    stripped); candidates whose file name date differs from the media's are
    skipped and a matching date adds 0.1.  The title of a candidate comes
    from its recorded ``"file"`` path, or from the JSON file name when that
    field is missing.  Returns the best ``.json`` path with a score of at
    least 0.60, else None.

    Files that are not transcript records - ``*.prov.json`` provenance
    files, the RSS index, or JSON whose top level is not an object - are
    skipped individually so one of them cannot abort the whole search.
    """
    try:
        stem = media_path.stem
        wanted_title = _normalize_title(_stem_without_date(stem))
        file_date = _extract_date_from_stem(stem)
        best_json, best_score = None, 0.0
        for j in TRN.glob("*.json"):
            if j.name.endswith(".prov.json") or j == RSS_INDEX_PATH:
                continue
            try:
                data = json.loads(j.read_text(encoding="utf-8"))
            except Exception:
                continue
            if not isinstance(data, dict):
                continue
            # Path("") is truthy, so the recorded value itself has to be
            # checked for the file-name fallback to ever apply.
            recorded = data.get("file")
            if isinstance(recorded, str) and recorded:
                other_stem = Path(recorded).stem
            else:
                other_stem = j.stem
            other_date = _extract_date_from_stem(other_stem)
            dates_known = bool(file_date and other_date)
            # If both have dates and they differ, skip
            if dates_known and file_date != other_date:
                continue
            # Compare titles (without dates)
            sim = difflib.SequenceMatcher(
                None,
                wanted_title,
                _normalize_title(_stem_without_date(other_stem)),
            ).ratio()
            # Nudge score when dates match
            if dates_known:
                sim += 0.1
            if sim > best_score:
                best_score, best_json = sim, j
        # Require a reasonable similarity
        return best_json if best_json and best_score >= 0.60 else None
    except Exception:
        return None
 | ||
| 
 | ||
| 
 | ||
def _cue_timestamp(seconds, decimal_mark: str) -> str:
    """Format ``seconds`` as ``HH:MM:SS<mark>mmm`` for subtitle cues.

    Works in whole milliseconds so rounding can never produce a "60.000"
    seconds field; missing or negative values are clamped to zero.
    """
    total_ms = max(0, int(round(float(seconds or 0.0) * 1000)))
    hours, rest = divmod(total_ms, 3_600_000)
    minutes, rest = divmod(rest, 60_000)
    secs, millis = divmod(rest, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02}{decimal_mark}{millis:03}"


def _copy_unless_same_file(src: Path, dst: Path) -> None:
    """Copy ``src`` to ``dst`` (with metadata) unless both name the same file."""
    if src.resolve() != dst.resolve():
        shutil.copy2(src, dst)


def reuse_repo_transcript(media_path: Path, repo_json: Path) -> Path | None:
    """Retarget an existing repository transcript to ``media_path``.

    Rewrites the transcript JSON under the media's stem in TRN with its
    ``"file"`` field pointing at ``media_path``; copies the matching
    TXT/SRT/VTT when present and otherwise synthesizes them from the JSON
    ``"segments"``; then, best-effort, places an SRT sidecar next to the
    media and writes an NFO plus artwork.

    Returns the new base path in TRN, or None when anything in the main path
    fails (the error is logged).  Source and target may share a stem; the
    copies are then skipped instead of failing on a same-file copy.
    """
    try:
        # load the source transcript
        data = json.loads(repo_json.read_text(encoding="utf-8"))
        src_base = TRN / Path(repo_json).stem

        # write the retargeted artifacts
        new_base = TRN / media_path.stem
        new_base.parent.mkdir(parents=True, exist_ok=True)

        # update file path
        data["file"] = str(media_path)
        new_base.with_suffix(".json").write_bytes(orjson.dumps(data))
        segments = data.get("segments", [])

        # copy or synthesize TXT
        src_txt, new_txt = src_base.with_suffix(".txt"), new_base.with_suffix(".txt")
        if src_txt.exists():
            _copy_unless_same_file(src_txt, new_txt)
        else:
            # fallback: concatenate segments
            new_txt.write_text(" ".join(s.get("text", "") for s in segments), encoding="utf-8")

        # copy SRT if present; otherwise synthesize it from segments
        src_srt, new_srt = src_base.with_suffix(".srt"), new_base.with_suffix(".srt")
        if src_srt.exists():
            _copy_unless_same_file(src_srt, new_srt)
        else:
            with open(new_srt, "w", encoding="utf-8") as srt:
                for i, s in enumerate(segments, 1):
                    start = _cue_timestamp(s.get("start", 0.0), ",")
                    end = _cue_timestamp(s.get("end", 0.0), ",")
                    srt.write(f"{i}\n{start} --> {end}\n{s.get('text','').strip()}\n\n")

        # copy VTT if present; otherwise synthesize it from segments
        src_vtt, new_vtt = src_base.with_suffix(".vtt"), new_base.with_suffix(".vtt")
        if src_vtt.exists():
            _copy_unless_same_file(src_vtt, new_vtt)
        else:
            with open(new_vtt, "w", encoding="utf-8") as vtt:
                vtt.write("WEBVTT\n\n")
                for s in segments:
                    start = _cue_timestamp(s.get("start", 0.0), ".")
                    end = _cue_timestamp(s.get("end", 0.0), ".")
                    vtt.write(f"{start} --> {end} \n{s.get('text','').strip()}\n\n")

        # ensure sidecar next to media (best-effort)
        try:
            lang = (data.get("language") or DEFAULT_TRANSCRIPT_LANG).split("-")[0]
            ensure_sidecar_next_to_media(new_srt, media_path, lang=lang)
        except Exception:
            pass

        # Write Kodi/Plex-compatible NFO (best-effort, logged on failure)
        try:
            meta = {
                "title": data.get("title") or media_path.stem,
                "episode_title": data.get("title") or media_path.stem,
                "show": data.get("show") or media_path.parent.name,
                "description": data.get("description") or "",
                "pubdate": data.get("pubdate") or data.get("date"),
                "duration_sec": media_duration_seconds(media_path),
                "image": data.get("image"),
                "guid": data.get("guid") or data.get("id"),
            }
            ttxt = new_txt.read_text(encoding="utf-8") if new_txt.exists() else None
            write_episode_nfo(media_path, meta, ttxt)
            # Save local artwork for Plex/Kodi
            try:
                save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
            except Exception:
                pass
        except Exception as e:
            print(f"[post] NFO write failed: {e}", flush=True)

        return new_base
    except Exception as e:
        print(f"[resolver] failed to reuse repo transcript: {e}", flush=True)
        return None
 | ||
| 
 | ||
| 
 | ||
def transcript_text_from_file(path: Path) -> str:
    """Extract plain text from a .txt/.srt/.vtt file.

    ``.txt`` content is returned stripped.  For subtitle files, blank lines,
    timestamp lines (containing ``-->``), the ``WEBVTT`` header and purely
    numeric cue counters are dropped, inline cue markup such as ``<i>``,
    ``<v Speaker>`` or ``<00:00:01.000>`` is removed, and the remaining lines
    are joined with single spaces.

    The file is decoded as UTF-8 with undecodable bytes ignored; a leading
    byte-order mark is discarded so it cannot hide the ``WEBVTT`` header or
    the first cue counter from the filters above.
    """
    raw = path.read_text(encoding="utf-8-sig", errors="ignore")

    if path.suffix.lower() == ".txt":
        return raw.strip()

    # Formatting/voice tags and inline timestamps that may appear inside cue text.
    cue_markup = re.compile(
        r"</?(?:i|b|u|c|v|lang|ruby|rt|font)(?:[ .][^>]*)?>"
        r"|<\d{1,2}:\d{2}(?::\d{2})?[.,]\d{3}>",
        re.IGNORECASE,
    )

    # For SRT/VTT, drop timestamp lines, cue numbers and headers
    lines: list[str] = []
    for line in raw.splitlines():
        ls = line.strip()
        if not ls:
            continue
        if "-->" in ls:  # timestamp line
            continue
        if ls.upper().startswith("WEBVTT"):
            continue
        if re.match(r"^\d+$", ls):  # cue index
            continue
        ls = cue_markup.sub("", ls).strip()
        if ls:
            lines.append(ls)
    return " ".join(lines)
 | ||
| 
 | ||
| 
 | ||
def ensure_sidecar_next_to_media(sidecar: Path, media_path: Path, lang: str = "en") -> None:
    """Ensure a ``.<lang>.srt`` subtitle sits next to the media for Plex.

    An ``.srt`` sidecar is copied; a ``.vtt`` sidecar is converted with
    ffmpeg.  Any other kind (including ``.txt``) is ignored.  Failures are
    logged and never raised.

    The conversion writes to a temporary file beside the destination and then
    renames it into place, so an existing ``.srt`` stored next to the source
    sidecar is no longer overwritten and moved away.  If the sidecar already
    is the destination file, nothing is done.
    """
    try:
        kind = sidecar.suffix.lower()
        if kind not in (".srt", ".vtt"):
            return
        dst = media_path.with_suffix(f".{lang}.srt")
        if kind == ".srt":
            if sidecar.resolve() != dst.resolve():
                shutil.copy2(sidecar, dst)
            return

        # The temp name does not end in .srt, so name the output format explicitly.
        tmp_srt = dst.with_name(dst.name + ".tmp")
        try:
            subprocess.run(
                ["ffmpeg", "-nostdin", "-y", "-i", str(sidecar), "-f", "srt", str(tmp_srt)],
                check=True,
            )
            os.replace(tmp_srt, dst)
        finally:
            # Only still present when the conversion or the rename failed.
            tmp_srt.unlink(missing_ok=True)
    except Exception as e:
        print(f"[post] sidecar copy/convert failed: {e}", flush=True)
 | ||
| 
 | ||
| 
 | ||
| # --- small helpers for progress/ETA formatting ---
 | ||
| def _fmt_eta(sec: float) -> str:
 | ||
|     try:
 | ||
|         sec = max(0, int(sec))
 | ||
|         h, rem = divmod(sec, 3600)
 | ||
|         m, s = divmod(rem, 60)
 | ||
|         if h:
 | ||
|             return f"{h}h {m}m {s}s"
 | ||
|         if m:
 | ||
|             return f"{m}m {s}s"
 | ||
|         return f"{s}s"
 | ||
|     except Exception:
 | ||
|         return ""
 | ||
| 
 | ||
| 
 | ||
def save_episode_artwork(image_url: str | None, media_path: Path, show_title: str | None = None):
    """Download episode artwork and save it next to the media as ``<basename>.jpg``.

    Also drops a folder-level ``poster.jpg`` in the media's directory if none
    exists yet.  Non-JPEG downloads are converted with ffmpeg; if that fails
    the downloaded bytes are stored under the ``.jpg`` name unchanged.
    Best-effort: failures are logged but never raised.

    ``show_title`` is accepted for interface compatibility and is not used.

    The HTTP response is closed once the body has been read, and the
    temporary download file is removed on every exit path.
    """
    if not image_url:
        return
    tmp_file = None
    try:
        tmp_file = media_path.with_suffix(".art.tmp")
        with requests.get(image_url, timeout=30, stream=True) as resp:
            resp.raise_for_status()
            ctype = (resp.headers.get("Content-Type") or "").lower()
            with open(tmp_file, "wb") as out:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:
                        out.write(chunk)

        # Always provide a .jpg next to the media for Plex
        episode_jpg = media_path.with_suffix(".jpg")
        if "image/jpeg" in ctype:
            # Already JPEG
            shutil.move(str(tmp_file), str(episode_jpg))
        else:
            # Try converting to JPEG with ffmpeg; if it fails, keep bytes as-is
            try:
                subprocess.run(
                    ["ffmpeg", "-nostdin", "-y", "-i", str(tmp_file), str(episode_jpg)],
                    stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True
                )
            except Exception:
                shutil.move(str(tmp_file), str(episode_jpg))

        # Also drop a folder poster once per show (helps Plex folder views)
        try:
            show_poster = media_path.parent / "poster.jpg"
            if not show_poster.exists():
                shutil.copy2(episode_jpg, show_poster)
        except Exception:
            pass

    except Exception as e:
        print(f"[post] artwork download failed: {e}", flush=True)
    finally:
        # Gone already if it was moved into place; otherwise clean it up.
        if tmp_file is not None:
            try:
                tmp_file.unlink(missing_ok=True)
            except OSError:
                pass
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
def find_companion_files(src: Path) -> dict:
    """Return likely yt-dlp companion files for a downloaded media file.

    The result has three keys:
      * ``"info"``  - first existing ``<name>.<ext>.info.json`` or
        ``<name>.info.json``, else None
      * ``"thumb"`` - first existing thumbnail among ``<name>.<ext>.jpg`` and
        ``<name>.jpg/.jpeg/.png/.webp``, else None
      * ``"subs"``  - every ``<name>*.srt`` followed by every ``<name>*.vtt``
        in the same folder

    The stem is escaped before it is used in a glob pattern, so file names
    containing ``[``, ``]``, ``*`` or ``?`` (e.g. a bracketed video id)
    still match their subtitles.
    """
    import glob  # local import: only needed here for escape()

    folder = src.parent
    out = {}

    # info.json can be either "<name>.<ext>.info.json" or "<name>.info.json"
    cands_info = [
        folder / f"{src.name}.info.json",
        folder / f"{src.stem}.info.json",
    ]
    out["info"] = next((p for p in cands_info if p.exists()), None)

    # thumbnails may be "<name>.<ext>.jpg" or "<name>.jpg" (we convert to jpg)
    cand_thumbs = [folder / f"{src.name}.jpg"]
    cand_thumbs += [folder / f"{src.stem}{ext}" for ext in (".jpg", ".jpeg", ".png", ".webp")]
    out["thumb"] = next((p for p in cand_thumbs if p.exists()), None)

    # subtitles (keep multiple)
    literal_stem = glob.escape(src.stem)
    out["subs"] = [
        *folder.glob(f"{literal_stem}*.srt"),
        *folder.glob(f"{literal_stem}*.vtt"),
    ]
    return out
 | ||
| 
 | ||
| def load_info_json(path: Path) -> dict | None:
 | ||
|     try:
 | ||
|         return json.loads(path.read_text(encoding="utf-8"))
 | ||
|     except Exception:
 | ||
|         return None
 | ||
| 
 | ||
| def _iso_from_yyyymmdd(s: str | None) -> str | None:
 | ||
|     if not s or not re.match(r"^\d{8}$", s):
 | ||
|         return None
 | ||
|     return f"{s[0:4]}-{s[4:6]}-{s[6:8]}"
 | ||
| 
 | ||
| def build_meta_from_sources(media_path: Path, uploader: str, fallback_meta: dict, ep: dict | None = None) -> dict:
 | ||
|     """
 | ||
|     Merge metadata from (priority): RSS episode `ep` -> yt-dlp info.json (if present) -> fallback.
 | ||
|     Returns a dict compatible with write_episode_nfo().
 | ||
|     """
 | ||
|     # Start with fallback
 | ||
|     meta = dict(fallback_meta)
 | ||
| 
 | ||
|     # Augment from info.json if present
 | ||
|     info = None
 | ||
|     for cand in [
 | ||
|         media_path.parent / f"{media_path.name}.info.json",
 | ||
|         media_path.parent / f"{media_path.stem}.info.json",
 | ||
|     ]:
 | ||
|         if cand.exists():
 | ||
|             info = load_info_json(cand)
 | ||
|             break
 | ||
|     if info:
 | ||
|         meta.setdefault("title", info.get("title"))
 | ||
|         meta.setdefault("episode_title", info.get("title"))
 | ||
|         meta.setdefault("description", info.get("description") or info.get("fulltitle"))
 | ||
|         # upload_date is YYYYMMDD
 | ||
|         iso = _iso_from_yyyymmdd(info.get("upload_date"))
 | ||
|         if iso:
 | ||
|             meta["pubdate_iso"] = iso
 | ||
|         # Prefer video duration if present
 | ||
|         if not meta.get("duration_sec") and info.get("duration"):
 | ||
|             meta["duration_sec"] = info.get("duration")
 | ||
|         # thumbnail URL
 | ||
|         if not meta.get("image"):
 | ||
|             meta["image"] = info.get("thumbnail")
 | ||
|         # show/uploader
 | ||
|         if not meta.get("show"):
 | ||
|             meta["show"] = info.get("uploader") or uploader
 | ||
| 
 | ||
|     # Finally, layer RSS data on top if available (most authoritative for podcasts)
 | ||
|     if ep:
 | ||
|         meta.update({
 | ||
|             "title": ep.get("title") or meta.get("title"),
 | ||
|             "episode_title": ep.get("title") or meta.get("episode_title"),
 | ||
|             "show": ep.get("podcast_title") or ep.get("feed_title") or ep.get("show") or meta.get("show") or uploader,
 | ||
|             "description": ep.get("description") or ep.get("content") or meta.get("description", ""),
 | ||
|             "pubdate": ep.get("pubdate") or meta.get("pubdate", ""),
 | ||
|             "pubdate_iso": ep.get("date_iso") or meta.get("pubdate_iso", meta.get("pubdate")),
 | ||
|             "duration_sec": ep.get("duration_sec") or ep.get("duration") or meta.get("duration_sec"),
 | ||
|             "image": ep.get("image") or ep.get("image_url") or meta.get("image", ""),
 | ||
|             "guid": ep.get("guid") or meta.get("guid", ""),
 | ||
|         })
 | ||
| 
 | ||
|     return meta
 | ||
| 
 | ||
| # ---------- Kodi/Plex NFO writer ----------
 | ||
| from datetime import datetime
 | ||
| 
 | ||
| def _first_nonempty(*vals):
 | ||
|     for v in vals:
 | ||
|         if v is None:
 | ||
|             continue
 | ||
|         if isinstance(v, str) and v.strip():
 | ||
|             return v.strip()
 | ||
|         if v:
 | ||
|             return v
 | ||
|     return None
 | ||
| 
 | ||
| def _coerce_aired(pubdate: str | None) -> str:
 | ||
|     """Convert RSS-style pubdate to YYYY-MM-DD if possible."""
 | ||
|     if not pubdate:
 | ||
|         return ""
 | ||
|     # already ISO-like
 | ||
|     m = re.match(r"^(\d{4})[-/](\d{2})[-/](\d{2})", pubdate)
 | ||
|     if m:
 | ||
|         return f"{m.group(1)}-{m.group(2)}-{m.group(3)}"
 | ||
|     # RFC 2822 example: Tue, 21 Feb 2023 06:00:00 +0000
 | ||
|     try:
 | ||
|         dt = datetime.strptime(pubdate[:31], "%a, %d %b %Y %H:%M:%S %z")
 | ||
|         return dt.strftime("%Y-%m-%d")
 | ||
|     except Exception:
 | ||
|         # try without tz
 | ||
|         try:
 | ||
|             dt = datetime.strptime(pubdate[:25], "%a, %d %b %Y %H:%M:%S")
 | ||
|             return dt.strftime("%Y-%m-%d")
 | ||
|         except Exception:
 | ||
|             return ""
 | ||
| 
 | ||
def write_episode_nfo(media_path: Path, meta: dict, transcript_text: str | None = None) -> Path:
    """Write a minimal Kodi/Plex-style ``<episodedetails>`` NFO next to the media file.

    Keys read from ``meta`` (first non-empty wins within each group):
      * title:    ``episode_title``, ``title`` (fallback: media file stem)
      * show:     ``show``, ``podcast_title``, ``feed_title`` (fallback: parent dir name)
      * plot:     ``description``, ``content``, ``summary``
      * aired:    ``pubdate_iso``, ``pubdate``
      * uniqueid: ``guid``, ``id``
      * thumb:    ``image``, ``image_url``, ``thumbnail``
      * runtime:  ``duration_sec``, ``duration`` (seconds, written as whole minutes)

    If ``transcript_text`` is given, up to 1800 characters of it are appended to
    the plot as a preview.

    This is best-effort: any failure is logged to stdout and not raised. The
    returned path is where the NFO lives (or would live, if writing failed).
    """
    nfo_path = media_path.with_suffix(".nfo")
    try:
        title = _first_nonempty(meta.get("episode_title"), meta.get("title"), media_path.stem) or media_path.stem
        show  = _first_nonempty(meta.get("show"), meta.get("podcast_title"), meta.get("feed_title"), media_path.parent.name) or media_path.parent.name
        plot  = _first_nonempty(meta.get("description"), meta.get("content"), meta.get("summary"), "") or ""
        # Metadata values are not guaranteed to be strings; xml_escape needs str.
        plot = str(plot)
        # Optionally append transcript preview to plot
        if transcript_text:
            preview = transcript_text.strip()
            if preview:
                preview = (preview[:1800] + "…") if len(preview) > 1800 else preview
                plot = (plot + "\n\n" if plot else "") + preview
        aired = _coerce_aired(_first_nonempty(meta.get("pubdate_iso"), meta.get("pubdate")))
        guid  = _first_nonempty(meta.get("guid"), meta.get("id"), "") or ""
        thumb = _first_nonempty(meta.get("image"), meta.get("image_url"), meta.get("thumbnail"), "") or ""
        dur_s = meta.get("duration_sec") or meta.get("duration") or 0
        try:
            dur_min = int(round(float(dur_s) / 60.0)) if dur_s else 0
        except (TypeError, ValueError):
            dur_min = 0

        # Build XML
        xml = ["<episodedetails>"]
        xml.append(f"  <title>{xml_escape(str(title))}</title>")
        xml.append(f"  <showtitle>{xml_escape(str(show))}</showtitle>")
        if plot:
            xml.append(f"  <plot>{xml_escape(plot)}</plot>")
        if aired:
            xml.append(f"  <aired>{xml_escape(aired)}</aired>")
        if guid:
            xml.append(f"  <uniqueid type=\"guid\" default=\"true\">{xml_escape(str(guid))}</uniqueid>")
        if dur_min:
            xml.append(f"  <runtime>{dur_min}</runtime>")
        if thumb:
            xml.append(f"  <thumb>{xml_escape(str(thumb))}</thumb>")
        xml.append("</episodedetails>\n")
        nfo_path.write_text("\n".join(xml), encoding="utf-8")
    except Exception as e:
        # Still non-fatal, but no longer silent: a missing NFO is now traceable.
        print(f"[post] could not write NFO {nfo_path}: {e}", flush=True)
    return nfo_path
 | ||
| 
 | ||
def write_plain_transcript(media_path: Path, text: str, language: str = "en") -> Path:
    """Persist an untimed transcript as ``.txt`` and ``.json`` under ``TRN``.

    The JSON document holds the media path, the language, and a single segment
    spanning 0.0–0.0 that carries the whole text. Returns the extension-less
    base path (``TRN / <media stem>``) that both files were derived from.
    """
    stem_path = TRN / media_path.stem
    stem_path.parent.mkdir(parents=True, exist_ok=True)

    # NOTE(review): with_suffix() replaces anything after the last dot in the
    # stem, so a stem like "Ep. 5" ends up as "Ep.txt" — confirm this is intended.
    txt_file = stem_path.with_suffix(".txt")
    txt_file.write_text(text, encoding="utf-8")

    document = {
        "file": str(media_path),
        "language": language,
        "segments": [{"start": 0.0, "end": 0.0, "text": text}]
    }
    json_file = stem_path.with_suffix(".json")
    json_file.write_bytes(orjson.dumps(document))
    return stem_path
 | ||
| 
 | ||
def yt_dlp(url, outdir):
    """Download *url* into *outdir* with yt-dlp and return the newest media file.

    YouTube Music URLs are rewritten to regular YouTube first. Alongside the
    media, yt-dlp is asked for the info JSON, a JPG thumbnail and SRT subtitles.
    If the first attempt exits non-zero, one retry is made with the Android
    player client and a mobile user agent; a failing retry raises
    ``subprocess.CalledProcessError``.

    Returns a list with at most one ``Path``: the most recently modified
    mp4/mkv/webm/m4a/mp3 found anywhere under *outdir* (empty if none).
    """
    # 1) Normalize YouTube Music URLs to standard YouTube
    target = url.replace('music.youtube.com', 'www.youtube.com') if 'music.youtube.com' in url else url

    template = str(outdir / "%(uploader)s/%(upload_date)s - %(title)s.%(ext)s")

    command = ["yt-dlp", "-o", template]
    command += ["-f", "bv*+ba/best"]
    command += ["--write-info-json", "--write-thumbnail"]
    command += ["--convert-thumbnails", "jpg"]
    command += ["--write-subs", "--write-auto-subs"]
    command += ["--sub-langs", os.getenv("YTDLP_SUBS_LANGS", "en.*,en")]
    command += ["--convert-subs", "srt"]
    command += ["--no-playlist", "--no-warnings", "--restrict-filenames"]

    # 3) Optional cookies (set YTDLP_COOKIES=/path/to/cookies.txt in .env and mount it)
    cookie_file = os.getenv("YTDLP_COOKIES", "").strip()
    if cookie_file:
        command.extend(["--cookies", cookie_file])

    # Primary attempt
    try:
        subprocess.check_call([*command, target])
    except subprocess.CalledProcessError:
        # 2) Retry with Android client + mobile UA
        mobile_ua = "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Mobile Safari/537.36"
        subprocess.check_call([
            *command,
            "--extractor-args", "youtube:player_client=android",
            "--user-agent", mobile_ua,
            target,
        ])

    # Gather candidates in a fixed pattern order, then keep only the newest by mtime.
    found = []
    for pattern in ("*.[mM][pP]4", "*.mkv", "*.webm", "*.m4a", "*.mp3"):
        found.extend(outdir.rglob(pattern))
    newest_last = sorted(found, key=lambda f: f.stat().st_mtime)
    return newest_last[-1:]
 | ||
| 
 | ||
def extract_audio(src: Path, outdir: Path) -> Path:
    """Decode *src* with ffmpeg into a mono, 16 kHz WAV inside *outdir*.

    The output is named ``<src stem>.wav`` and overwrites any existing file.
    Raises ``RuntimeError`` carrying ffmpeg's combined output when ffmpeg exits
    with a non-zero status.
    """
    outdir.mkdir(parents=True, exist_ok=True)
    target = outdir / (src.stem + ".wav")

    # -vn drops video; -ac 1 / -ar 16000 force mono at 16 kHz.
    ffmpeg_args = ["ffmpeg", "-nostdin", "-y", "-i", str(src)]
    ffmpeg_args += ["-vn", "-ac", "1", "-ar", "16000"]
    ffmpeg_args += ["-f", "wav", str(target)]

    try:
        subprocess.check_output(ffmpeg_args, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"ffmpeg extract failed: {e.output.decode(errors='ignore')}")
    return target
 | ||
| 
 | ||
| # --- WAV trimming helper ---
 | ||
def trim_wav(src_wav: Path, start_sec: float, outdir: Path) -> Path:
    """Cut the first *start_sec* seconds off *src_wav*, writing a new WAV to *outdir*.

    *outdir* is always created. When *start_sec* is falsy or not positive the
    source path is returned untouched. If ffmpeg fails, the failure is printed
    and the source path is returned, so callers get the full file instead.
    """
    outdir.mkdir(parents=True, exist_ok=True)
    if not start_sec or start_sec <= 0.0:
        return src_wav

    trimmed = outdir / (src_wav.stem + f".from_{int(start_sec)}s.wav")
    seek_to = str(max(0.0, float(start_sec)))
    ffmpeg_args = [
        "ffmpeg", "-nostdin", "-y",
        "-ss", seek_to,
        "-i", str(src_wav),
        "-vn", "-ac", "1", "-ar", "16000",
        "-f", "wav", str(trimmed),
    ]
    try:
        subprocess.check_output(ffmpeg_args, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # If trimming fails, fall back to full file
        print(f"[whisper] trim failed, using full WAV: {e.output.decode(errors='ignore')}", flush=True)
        return src_wav
    return trimmed
 | ||
| 
 | ||
def media_duration_seconds(path: Path) -> float:
    """Ask ffprobe for the container duration of *path*, in seconds.

    Any failure (ffprobe missing, non-zero exit, unparseable or empty output)
    yields ``0.0`` instead of an exception.
    """
    try:
        probe_cmd = ["ffprobe", "-v", "error"]
        probe_cmd += ["-show_entries", "format=duration"]
        probe_cmd += ["-of", "default=nokey=1:noprint_wrappers=1", str(path)]
        raw = subprocess.check_output(probe_cmd, stderr=subprocess.STDOUT, text=True)
        raw = raw.strip()
        if not raw:
            return 0.0
        return float(raw)
    except Exception:
        return 0.0
 | ||
| 
 | ||
| # --- Partial transcript helpers ---
 | ||
def _partial_paths(title: str) -> tuple[Path, Path]:
    """Return the ``(.partial.json, .partial.txt)`` checkpoint paths for *title* under ``TRN``."""
    stem_path = TRN / title
    checkpoint_json = stem_path.with_suffix(".partial.json")
    checkpoint_txt = stem_path.with_suffix(".partial.txt")
    return checkpoint_json, checkpoint_txt
 | ||
| 
 | ||
def _save_partial(title: str, language: str, segs: list[dict]):
    """Write the in-progress transcript checkpoint for *title*.

    Two files are produced independently: a JSON snapshot of *segs* (with the
    language and a ``.wav`` path derived from the title) and a plain-text join
    of the segment texts. A failure in either write is printed and does not
    stop the other or propagate.
    """
    json_target, txt_target = _partial_paths(title)

    # Save JSON
    try:
        snapshot = {
            "file": str((TRN / title).with_suffix('.wav')),
            "language": language,
            "segments": segs,
        }
        json_target.write_bytes(orjson.dumps(snapshot))
    except Exception as e:
        print(f"[whisper] partial json save failed: {e}", flush=True)

    # Save TXT snapshot
    try:
        joined = " ".join(seg.get("text","") for seg in segs)
        txt_target.write_text(joined, encoding="utf-8")
    except Exception as e:
        print(f"[whisper] partial txt save failed: {e}", flush=True)
 | ||
| 
 | ||
def transcribe(media_path: Path):
    """Transcribe *media_path* and write transcript artifacts under ``TRN``.

    Steps:
      1. Extract a mono 16 kHz WAV into ``TMP``.
      2. If resume is enabled and a ``.partial.json`` checkpoint exists, keep its
         segments and transcribe only the audio after the last segment's end.
         The checkpoint is discarded (full re-run) when it is unusable: it has
         no positive end time, it reaches past the audio duration, or the WAV
         could not be trimmed.
      3. Stream segments from ``run_transcribe_with_fallback``, logging progress
         in 5% steps and saving a checkpoint every ``PARTIAL_SAVE_EVERY_SEGS``.
      4. Write ``.json``, ``.txt``, ``.srt`` and ``.vtt`` next to each other under
         ``TRN``, copy the SRT beside the media as ``<name>.<lang>.srt``, and
         write an NFO plus artwork (all best-effort).
      5. Remove the checkpoint on success; temp WAVs are removed on success
         and on failure.

    Returns the extension-less base path ``TRN / <media stem>``.

    NOTE(review): artifact names come from ``Path.with_suffix``, which replaces
    everything after the last dot of the stem — confirm that stems containing
    dots are not expected here.
    """
    # Return value is unused in this function; the call is kept so any failure
    # surfaces before audio extraction (presumably it loads/caches the model —
    # confirm against get_model).
    get_model()
    print(f"[whisper] start transcribe: {media_path}", flush=True)
    # 1) Robustly extract audio to 16k mono WAV (fixes pyAV/webm edge cases)
    wav = extract_audio(media_path, TMP)
    wav_for_run = wav

    try:
        title = media_path.stem
        base = TRN / title
        # The plain-transcript writer creates this directory; do the same here.
        base.parent.mkdir(parents=True, exist_ok=True)

        # Resume support: if a partial checkpoint exists, load it and trim input
        resume_segments = []
        resume_offset = 0.0
        language_hint = None
        if WHISPER_RESUME:
            pjson, _ = _partial_paths(title)
            if pjson.exists():
                try:
                    pdata = json.loads(pjson.read_text(encoding="utf-8"))
                    resume_segments = pdata.get("segments", []) or []
                    if resume_segments:
                        resume_offset = float(resume_segments[-1].get("end", 0.0))
                    language_hint = pdata.get("language")
                    print(f"[whisper] resuming from ~{resume_offset:.2f}s with {len(resume_segments)} segments", flush=True)
                except Exception as e:
                    print(f"[whisper] failed to load partial: {e}", flush=True)
                    # A half-read checkpoint must not leak segments into a full run.
                    resume_segments, resume_offset, language_hint = [], 0.0, None
        if resume_offset <= 0.0:
            # Without a positive offset the whole file is transcribed again, so
            # keeping old segments would duplicate them in the output.
            resume_segments, resume_offset = [], 0.0

        # Use full WAV duration for consistent % regardless of resume. Computed
        # BEFORE trimming so the sanity check below can actually affect the run.
        dur = media_duration_seconds(wav) or 0.0
        if resume_offset and dur and resume_offset >= dur:
            print(f"[whisper] resume offset {resume_offset:.2f}s >= duration {dur:.2f}s; resetting resume.", flush=True)
            resume_segments, resume_offset = [], 0.0

        # If resuming, trim WAV from last end time
        wav_for_run = trim_wav(wav, resume_offset, TMP)
        if resume_offset > 0.0 and wav_for_run == wav:
            # trim_wav fell back to the full file; shifting timestamps by the
            # offset would now be wrong, so restart cleanly instead.
            print("[whisper] could not trim for resume; restarting from the beginning.", flush=True)
            resume_segments, resume_offset = [], 0.0

        # 2) Language selection
        auto_language = WHISPER_LANGUAGE.lower() == "auto"
        lang = None if auto_language else WHISPER_LANGUAGE
        if language_hint and auto_language:
            # carry hint forward if available
            lang = language_hint
        # Used whenever the transcriber reports no language.
        fallback_lang = "en" if auto_language else WHISPER_LANGUAGE

        # 3) Transcribe
        segments, info = run_transcribe_with_fallback(wav_for_run, lang)

        # Start wall clock timer for speed/ETA
        start_wall = time.time()
        last_pct = -1

        segs = list(resume_segments)  # start with what we already have
        text_parts = [s.get("text","") for s in resume_segments]

        # Walk new segments; shift their timestamps by resume_offset if trimmed
        seg_count_since_save = 0
        seg_index = len(resume_segments)
        for s in segments:
            seg_index += 1
            start = (s.start or 0.0) + resume_offset
            end   = (s.end or 0.0) + resume_offset
            seg = {"start": start, "end": end, "text": s.text}
            segs.append(seg)
            text_parts.append(s.text)

            if WHISPER_LOG_SEGMENTS:
                print(f"[whisper] {start:8.2f}–{end:8.2f}  {s.text.strip()}", flush=True)

            # progress logging every +5%
            if dur > 0:
                pct = int(min(100, max(0, (end / dur) * 100)))
                if pct >= last_pct + 5:
                    log({
                        "status": "transcribing",
                        "path": str(media_path),
                        "title": title,
                        "progress": pct
                    })
                    last_pct = pct

                    # compute realtime speed and ETA for console logs
                    try:
                        elapsed = max(0.001, time.time() - start_wall)
                        # Only audio handled in THIS run counts towards speed; the
                        # resumed prefix was processed in an earlier run.
                        processed = max(0.0, float(end) - resume_offset)
                        rtf = processed / elapsed  # 1.0 == real-time
                        eta = ((dur - float(end)) / rtf) if rtf > 0 else 0
                        print(f"[whisper] progress {pct:3d}%  seg={seg_index:5d}  rtf={rtf:0.2f}x  eta={_fmt_eta(eta)}", flush=True)
                        # also mirror to feed log with speed/eta
                        try:
                            log({
                                "status": "transcribing",
                                "path": str(media_path),
                                "title": title,
                                "progress": pct,
                                "speed_rtf": round(rtf, 2),
                                "eta_sec": int(max(0, eta))
                            })
                        except Exception:
                            pass
                    except Exception:
                        pass

            # periodic partial save
            seg_count_since_save += 1
            if WHISPER_RESUME and seg_count_since_save >= PARTIAL_SAVE_EVERY_SEGS:
                _save_partial(title, info.language or fallback_lang, segs)
                seg_count_since_save = 0

        # ensure we mark 100% on completion
        if last_pct < 100:
            log({"status": "transcribing", "path": str(media_path), "title": title, "progress": 100})

        txt = " ".join(text_parts).strip()

        # Write final transcript artifacts
        (base.with_suffix(".json")).write_bytes(orjson.dumps({
            "file": str(media_path),
            "language": info.language,
            "segments": segs
        }))
        (base.with_suffix(".txt")).write_text(txt, encoding="utf-8")

        def fmt_ts(t):
            # Whole-millisecond arithmetic: rounding can never produce a
            # "60,000" seconds field (e.g. for t = 59.9996).
            total_ms = max(0, int(round(float(t) * 1000)))
            h, rem = divmod(total_ms, 3_600_000)
            m, rem = divmod(rem, 60_000)
            sec, ms = divmod(rem, 1000)
            return f"{h:02}:{m:02}:{sec:02},{ms:03}"

        with open(base.with_suffix(".srt"), "w", encoding="utf-8") as srt:
            for i,s in enumerate(segs,1):
                srt.write(f"{i}\n{fmt_ts(s['start'])} --> {fmt_ts(s['end'])}\n{s['text'].strip()}\n\n")

        with open(base.with_suffix(".vtt"), "w", encoding="utf-8") as vtt:
            vtt.write("WEBVTT\n\n")
            for s in segs:
                vtt.write(f"{fmt_ts(s['start']).replace(',', '.')} --> {fmt_ts(s['end']).replace(',', '.')} \n{s['text'].strip()}\n\n")

        # 4) Copy SRT next to media for Plex (language-suffixed)
        srt_dst = None  # bound up front so the error message below can always format
        try:
            lang_code = (info.language or fallback_lang).lower()
            srt_src = base.with_suffix(".srt")
            srt_dst = media_path.with_suffix(f".{lang_code}.srt")
            shutil.copy2(srt_src, srt_dst)
        except Exception as e:
            print(f"[post] could not copy srt -> {srt_dst}: {e}", flush=True)

        # Write Kodi/Plex-compatible NFO using enhanced metadata (same as before)
        try:
            fallback = {
                "title": title,
                "episode_title": title,
                "show": media_path.parent.name,
                "description": "",
                "pubdate": _extract_date_from_stem(title),
                "duration_sec": media_duration_seconds(media_path),
                "image": "",
                "guid": "",
            }
            meta = build_meta_from_sources(media_path, media_path.parent.name, fallback, ep=None)
            # txt is exactly what was just written to the .txt artifact.
            write_episode_nfo(media_path, meta, txt)
            try:
                save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
            except Exception:
                pass
        except Exception as e:
            print(f"[post] NFO write failed: {e}", flush=True)

        # Remove partial checkpoints on success
        if WHISPER_RESUME:
            try:
                pjson, ptxt = _partial_paths(title)
                if pjson.exists(): pjson.unlink()
                if ptxt.exists(): ptxt.unlink()
            except Exception:
                pass

        # Final average speed over the audio handled in this run
        try:
            total_elapsed = max(0.001, time.time() - start_wall)
            avg_rtf = max(0.0, dur - resume_offset) / total_elapsed
            print(f"[whisper] avg speed ~{avg_rtf:0.2f}x (audio_seconds / wall_seconds)", flush=True)
        except Exception:
            pass
        print(f"[whisper] finished: {media_path}  lang={info.language}  segments={len(segs)}  dur={dur:.2f}s", flush=True)
        return base
    finally:
        # Cleanup temp WAVs on success AND failure; the WAV is re-extracted on
        # every call, so nothing is lost by removing it after an error.
        for tmp_wav in (wav_for_run, wav):
            try:
                if tmp_wav.exists():
                    tmp_wav.unlink()
            except Exception:
                pass
 | ||
| 
 | ||
| 
 | ||
| # --- Meilisearch helpers ---
 | ||
| def _safe_doc_id(s: str) -> str:
 | ||
|     """
 | ||
|     Meilisearch document IDs must be [A-Za-z0-9_-]. Convert the title to a safe slug.
 | ||
|     If the result is empty, fall back to a short SHA1 hash.
 | ||
|     """
 | ||
|     import hashlib
 | ||
|     slug = re.sub(r"\s+", "_", (s or "").strip())
 | ||
|     slug = re.sub(r"[^A-Za-z0-9_-]", "", slug)
 | ||
|     if not slug:
 | ||
|         slug = hashlib.sha1((s or "").encode("utf-8", errors="ignore")).hexdigest()[:16]
 | ||
|     return slug
 | ||
| 
 | ||
| 
 | ||
def ensure_meili_index():
    """Make sure the Meilisearch index 'library' (primaryKey 'id') exists.

    Probes the index first and only posts a create request when the probe does
    not answer 200. Every error — network, HTTP, or a failed create — is
    ignored on purpose; a truly missing index shows up later when indexing.
    """
    try:
        auth = {"Authorization": f"Bearer {MEILI_KEY}"}
        probe = requests.get(f"{MEILI_URL}/indexes/library", headers=auth, timeout=10)
        if probe.status_code != 200:
            # Attempt to create it
            creation = requests.post(
                f"{MEILI_URL}/indexes",
                headers={**auth, "Content-Type": "application/json"},
                data=orjson.dumps({"uid": "library", "primaryKey": "id"}),
                timeout=10,
            )
            # Ignore errors if another process created it first
            try:
                creation.raise_for_status()
            except Exception:
                pass
    except Exception:
        # Non-fatal; indexing will fail later if the index truly doesn't exist
        pass
 | ||
| 
 | ||
| 
 | ||
def index_meili(json_path: Path):
    """Index one transcript JSON file into the Meilisearch 'library' index.

    The JSON is expected to carry ``file``, ``language`` and ``segments`` keys
    (missing keys fall back to empty values). The title is the stem of ``file``
    (or of *json_path* when ``file`` is empty); the document id is that title
    passed through ``_safe_doc_id``; ``date`` is the first standalone 8-digit
    run in the title, if any.

    The POST is attempted up to 5 times with a growing pause between attempts;
    the last failure is re-raised.
    """
    # Make sure the index exists and is configured with a primary key
    ensure_meili_index()

    # Context manager so the handle is closed deterministically.
    with open(json_path, "r", encoding="utf-8") as fh:
        doc = json.load(fh)
    file_field = doc.get("file", "")
    title = Path(file_field).stem if file_field else json_path.stem

    # Build a Meili-safe document ID
    doc_id = _safe_doc_id(title)

    # Extract a YYYYMMDD date if present
    m = re.search(r"\b(\d{8})\b", title)
    date = m.group(1) if m else ""

    segments = doc.get("segments", [])
    payload = {
        "id": doc_id,
        "type": "podcast",
        "title": title,
        "date": date,
        "source": str(Path(LIB, Path(file_field or title).name)),
        "text": " ".join(s.get("text", "") for s in segments),
        "segments": segments,
        "meta": {"language": doc.get("language", "")},
    }

    for attempt in range(5):
        try:
            r = requests.post(
                f"{MEILI_URL}/indexes/library/documents",
                headers={
                    "Authorization": f"Bearer {MEILI_KEY}",
                    "Content-Type": "application/json",
                },
                data=orjson.dumps(payload),
                timeout=15,
            )
            r.raise_for_status()
            break
        except Exception:
            if attempt == 4:
                raise
            # Linear backoff: 2s, 4s, 6s, 8s
            time.sleep(2 * (attempt + 1))
 | ||
| 
 | ||
| import tldextract, trafilatura, requests as _requests
 | ||
| 
 | ||
def slugify(text):
    """Reduce *text* to a file-name-friendly slug of at most 120 characters.

    Only ASCII letters, digits, ``-``, ``.``, ``_`` and spaces are kept; the
    result is trimmed and spaces become underscores. An empty result yields
    ``'page'``.
    """
    kept = re.sub(r'[^A-Za-z0-9\-._ ]+', '', text)
    underscored = kept.strip().replace(' ', '_')
    clipped = underscored[:120]
    return clipped if clipped else 'page'
 | ||
| 
 | ||
def save_web_snapshot(url: str):
    """Download *url* and store the raw HTML plus extracted text under ``LIB/web/<domain>/``.

    Returns ``(base, title, domain, date, text)`` where ``base`` is the
    extension-less path of the saved ``.html``/``.txt`` pair, ``title`` comes
    from the extracted metadata (falling back to the ``<title>`` tag, then the
    URL), and ``date`` is the metadata date or ``""``.

    Raises whatever the HTTP request raises, including for non-2xx responses.
    """
    r = _requests.get(url, timeout=30, headers={"User-Agent":"Mozilla/5.0"})
    r.raise_for_status()
    html = r.text
    downloaded = trafilatura.load_html(html, url=url)
    text = trafilatura.extract(downloaded, include_comments=False, include_images=False, with_metadata=True) or ""
    meta = trafilatura.metadata.extract_metadata(downloaded) or None

    title = getattr(meta, 'title', None) if meta else None
    if not title:
        # Search once; an empty <title> falls through to the URL as well.
        tag = re.search(r'<title[^>]*>(.*?)</title>', html, re.I|re.S)
        title = (tag.group(1).strip() if tag else "") or url
    date = (meta.date if meta and getattr(meta, 'date', None) else "")

    parts = tldextract.extract(url)
    domain = ".".join([p for p in [parts.domain, parts.suffix] if p])

    # slugify() keeps dots, but with_suffix() treats everything after the last
    # dot as an extension and would cut "Python_3.12_is_out" down to "Python_3".
    # Neutralising dots keeps the full slug, including for callers that apply
    # with_suffix() to the returned base.
    slug = slugify(title).replace(".", "_")
    outdir = LIB / "web" / domain
    outdir.mkdir(parents=True, exist_ok=True)
    base = outdir / slug

    # write_text closes the file handles deterministically.
    base.with_suffix(".html").write_text(html, encoding="utf-8", errors="ignore")
    base.with_suffix(".txt").write_text(text, encoding="utf-8", errors="ignore")
    return base, title, domain, date, text
 | ||
| 
 | ||
def index_web(base: Path, title: str, domain: str, date: str, text: str, url: str):
    """Index a saved web snapshot into the Meilisearch 'library' index.

    *base* is the extension-less snapshot path; its ``.html`` sibling becomes
    the document ``source``. ``date`` is reduced to its first 8 digits.
    Raises on HTTP errors and on timeout.
    """
    # Meilisearch ids may only contain [A-Za-z0-9_-]; ':' and '.' (present in the
    # domain) are not allowed. Dots are mapped to '-' first so "example.com"
    # stays readable, then the shared helper strips anything else.
    doc_id = _safe_doc_id(f"web-{domain}-{base.stem}".replace(".", "-"))
    payload = {
        "id": doc_id,
        "type": "web",
        "title": title,
        "date": re.sub(r'[^0-9]', '', date)[:8] if date else "",
        "source": f"file://{str(base.with_suffix('.html'))}",
        "text": text,
        "segments": [],
        "meta": {"url": url, "domain": domain}
    }
    r = requests.post(f"{MEILI_URL}/indexes/library/documents",
                      headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type":"application/json"},
                      data=orjson.dumps(payload),
                      timeout=15)  # same bound as the transcript indexer; avoids hanging forever
    r.raise_for_status()
 | ||
| 
 | ||
def is_media_url(url: str):
    """Return True when *url* points at one of the known media hosting sites.

    The decision is made on the URL's host: it must equal a known host or be a
    subdomain of one. A known host that merely appears in the path, the query
    string, or as part of a different domain does not count. URLs without a
    scheme (``youtube.com/watch?...``) are accepted. If no host can be parsed
    at all, the check falls back to a plain substring match.
    """
    from urllib.parse import urlsplit

    media_hosts = ["youtube.com","youtu.be","rumble.com","vimeo.com","soundcloud.com","spotify.com","podbean.com","buzzsprout.com"]
    lowered = url.lower()

    try:
        host = urlsplit(lowered).hostname
        if not host:
            # Scheme-less input: a "//" prefix makes urlsplit treat the leading
            # part as the network location.
            host = urlsplit("//" + lowered).hostname
    except ValueError:
        host = None

    if not host:
        return any(h in lowered for h in media_hosts)
    return any(host == h or host.endswith("." + h) for h in media_hosts)
 | ||
| 
 | ||
def owui_headers():
    """Return the bearer-auth header dict for Open WebUI, or ``{}`` when no key is configured."""
    if not OWUI_KEY:
        return {}
    return {"Authorization": f"Bearer {OWUI_KEY}"}
 | ||
| 
 | ||
def owui_get_or_create_kb():
    """Return the id of the Open WebUI knowledge base named ``OWUI_KB``, creating it if needed.

    Returns ``None`` when the Open WebUI URL or API key is not configured.
    Any error while listing existing knowledge bases is ignored and leads
    straight to the create request; errors from the create request propagate.
    """
    if not (OWUI_URL and OWUI_KEY):
        return None

    try:
        listing = requests.get(f"{OWUI_URL}/api/v1/knowledge/list", headers=owui_headers(), timeout=15)
        listing.raise_for_status()
        existing = next(
            (kb for kb in listing.json().get("data", []) if kb.get("name") == OWUI_KB),
            None,
        )
        if existing is not None:
            return existing["id"]
    except Exception:
        pass

    creation = requests.post(
        f"{OWUI_URL}/api/v1/knowledge/create",
        headers={**owui_headers(), "Content-Type": "application/json"},
        data=orjson.dumps({"name": OWUI_KB, "description": "All local content indexed by podx"}),
        timeout=15,
    )
    creation.raise_for_status()
    return creation.json()["data"]["id"]
 | ||
| 
 | ||
def owui_upload_and_attach(path: Path, kb_id: str):
    """Upload *path* to Open WebUI and attach the resulting file to knowledge base *kb_id*.

    Returns ``True`` on success; HTTP errors from either request propagate.
    """
    # Step 1: upload the file (generous timeout for large transcripts).
    with open(path, "rb") as handle:
        upload = requests.post(
            f"{OWUI_URL}/api/v1/files/",
            headers=owui_headers(),
            files={"file": (path.name, handle)},
            timeout=60*10,
        )
    upload.raise_for_status()
    uploaded_id = upload.json()["data"]["id"]

    # Step 2: link the uploaded file to the knowledge base.
    attach = requests.post(
        f"{OWUI_URL}/api/v1/knowledge/{kb_id}/file/add",
        headers={**owui_headers(), "Content-Type": "application/json"},
        data=orjson.dumps({"file_id": uploaded_id}),
        timeout=60,
    )
    attach.raise_for_status()
    return True
 | ||
| 
 | ||
def publish_to_openwebui(paths):
    """Upload every existing file in *paths* to the configured Open WebUI knowledge base.

    Does nothing when Open WebUI is not configured. Never raises: a failure for
    one file is logged and the remaining files are still tried; a failure to
    obtain the knowledge base is logged once.
    """
    if not (OWUI_URL and OWUI_KEY):
        return

    try:
        kb_id = owui_get_or_create_kb()
        for candidate in paths:
            file_path = Path(candidate)
            if file_path.exists():
                try:
                    owui_upload_and_attach(file_path, kb_id)
                except Exception as e:
                    log({"url": str(file_path), "status": "owui_error", "error": str(e)})
    except Exception as e:
        log({"status": "owui_error", "error": str(e)})
 | ||
| 
 | ||
def _nfo_and_artwork_best_effort(p: Path, base: Path, title: str, use_repo_metadata: bool = False) -> None:
    """Write the NFO and artwork for media *p* from the transcript at *base*; never raises.

    Seed metadata is either a generic fallback derived from the file itself or,
    with ``use_repo_metadata``, the fields stored in ``<base>.json``. The seed
    is handed to ``build_meta_from_sources`` before the NFO is written.
    """
    try:
        if use_repo_metadata:
            data = json.loads((base.with_suffix(".json")).read_text(encoding="utf-8"))
            # Start with repo metadata, then enrich from yt-dlp info.json if any
            seed = {
                "title": data.get("title") or title,
                "episode_title": data.get("title") or title,
                "show": data.get("show") or p.parent.name,
                "description": data.get("description") or "",
                "pubdate": data.get("pubdate") or _extract_date_from_stem(title),
                "duration_sec": media_duration_seconds(p),
                "image": data.get("image"),
                "guid": data.get("guid") or data.get("id"),
            }
        else:
            seed = {
                "title": title,
                "episode_title": title,
                "show": p.parent.name,
                "description": "",
                "pubdate": _extract_date_from_stem(title),
                "duration_sec": media_duration_seconds(p),
                "image": "",
                "guid": "",
            }
        meta = build_meta_from_sources(p, p.parent.name, seed, ep=None)
        ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
        write_episode_nfo(p, meta, ttxt)
        # Try to fetch and save artwork locally; failure here is deliberately ignored.
        try:
            save_episode_artwork(meta.get("image"), p, meta.get("show"))
        except Exception:
            pass
    except Exception as e:
        print(f"[post] NFO write failed: {e}", flush=True)


def handle_local_file(path_str: str):
    """Transcribe & index a local media file that already exists in /library.

    Transcript sources are tried in this order, stopping at the first that works:
      0. a transcript resolved through the RSS matcher,
      1. a sidecar transcript found next to the media,
      2. a matching transcript already in the transcript repository,
      3. a fresh Whisper transcription.
    Whichever source wins, the transcript is indexed in Meilisearch and its
    ``.txt`` is published to Open WebUI; all but the RSS path also write an
    NFO and artwork (best-effort).

    Safe to call repeatedly; it skips if transcript JSON already exists.
    Errors are logged and re-raised.
    """
    try:
        p = Path(path_str)
        if not p.exists():
            log({"url": path_str, "status": "error", "error": "file_not_found"})
            return

        title = p.stem
        # NOTE(review): this check uses "<title>.json" while the writers derive
        # their names via with_suffix(); the two differ when the stem contains
        # a dot — confirm which naming is intended.
        base_json = TRN / f"{title}.json"
        if base_json.exists():
            log({"url": path_str, "status": "skip", "reason": "already_transcribed"})
            return

        info = {"url": path_str, "status": "transcribing", "title": title,
                "uploader": p.parent.name, "date": "", "path": str(p), "progress": 0}
        log(info)

        # 0) Try RSS resolver first: if episode with transcript exists, use it (skip Whisper)
        try:
            ep = match_media_to_rss(p)
        except Exception:
            ep = None
        if ep:
            base = use_rss_transcript(p, ep)
            if base:
                index_meili(base.with_suffix(".json"))
                publish_to_openwebui([base.with_suffix(".txt")])
                log({**info, **{"status": "done", "note": "used_rss_transcript"}})
                return

        # 1) Prefer an existing transcript sidecar if present
        sidecar = find_sidecar_transcript(p)
        if sidecar:
            plain = transcript_text_from_file(sidecar)
            lang = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
            base = write_plain_transcript(p, plain, language=lang)
            ensure_sidecar_next_to_media(sidecar, p, lang=lang)
            index_meili(base.with_suffix(".json"))
            publish_to_openwebui([base.with_suffix(".txt")])
            _nfo_and_artwork_best_effort(p, base, title)
            log({**info, **{"status": "done", "note": "used_existing_transcript"}})
            return

        # 1.5) Reuse a transcript that exists in the repository for a matching episode
        repo_json = find_repo_transcript_for_media(p)
        if repo_json:
            base = reuse_repo_transcript(p, repo_json)
            if base:
                index_meili(base.with_suffix(".json"))
                publish_to_openwebui([base.with_suffix(".txt")])
                _nfo_and_artwork_best_effort(p, base, title, use_repo_metadata=True)
                log({**info, **{"status": "done", "note": "reused_repo_transcript"}})
                return

        # 2) Otherwise, run transcription
        base = transcribe(p)
        index_meili(base.with_suffix(".json"))
        publish_to_openwebui([base.with_suffix(".txt")])
        # transcribe() returns TRN / title, so base is where its .txt was written.
        _nfo_and_artwork_best_effort(p, base, title)
        log({**info, **{"status": "done"}})
    except Exception as e:
        log({"url": path_str, "status": "error", "error": str(e)})
        raise
 | ||
| 
 | ||
| 
 | ||
| # --- Refresh sidecar metadata and subtitles for an already-downloaded media file ---
 | ||
def refresh_media(path_str: str):
    """
    Refresh sidecar metadata (info.json, thumbnail) and subtitles for an already-downloaded media file.

    Requires a companion .info.json next to the media (to supply the original URL). The media itself
    is not re-downloaded: yt-dlp is invoked with ``--skip-download`` and an output template that
    places the refreshed sidecars next to the media file. Afterwards the NFO and local artwork are
    rebuilt on a best-effort basis (failures there are printed, not raised).

    Args:
        path_str: Filesystem path of the media file to refresh.

    Raises:
        Exception: Any failure outside the best-effort sections (for example
            ``subprocess.CalledProcessError`` when yt-dlp exits non-zero) is logged
            with status "error" and re-raised.
    """
    try:
        p = Path(path_str)
        # is_file() is already False for a missing path, so one check covers both cases.
        if not p.is_file():
            log({"url": path_str, "status": "error", "error": "file_not_found"})
            return

        # Locate existing info.json to get the original URL.
        # Both "<name>.<ext>.info.json" and "<name>.info.json" layouts are accepted.
        candidates = (
            p.parent / f"{p.name}.info.json",
            p.parent / f"{p.stem}.info.json",
        )
        info_json = next((cand for cand in candidates if cand.exists()), None)

        if not info_json:
            log({"path": str(p), "status": "refresh-skip", "reason": "no_info_json"})
            print(f"[refresh] skip: no info.json next to {p}", flush=True)
            return

        info = load_info_json(info_json) or {}
        url = info.get("webpage_url") or info.get("original_url") or info.get("url")
        if not url:
            log({"path": str(p), "status": "refresh-skip", "reason": "no_url_in_info"})
            print(f"[refresh] skip: no URL in {info_json}", flush=True)
            return

        # Prepare yt-dlp command to refresh sidecars only, writing files exactly next to the media.
        # A literal '%' anywhere in the path must be doubled, otherwise yt-dlp would try to
        # interpret it as the start of an output-template field.
        escaped_base = str(p.parent / p.stem).replace("%", "%%")
        outtmpl = f"{escaped_base}.%(ext)s"
        sub_langs = os.getenv("YTDLP_SUBS_LANGS", "en.*,en")

        cmd = [
            "yt-dlp",
            "--skip-download",
            "--write-info-json",
            "--write-thumbnail",
            "--convert-thumbnails", "jpg",
            "--write-subs", "--write-auto-subs",
            "--sub-langs", sub_langs,
            "--convert-subs", "srt",
            "-o", outtmpl,
            url,
        ]

        print(f"[refresh] refreshing sidecars for {p} via yt-dlp", flush=True)
        try:
            subprocess.check_call(cmd)
        except subprocess.CalledProcessError as e:
            print(f"[refresh] yt-dlp failed: {e}", flush=True)
            raise

        # Ensure language-suffixed SRT exists (Plex-friendly) if any subs were fetched.
        # The paths are built directly instead of globbing on the stem: stems often contain
        # glob metacharacters such as "[id]", which would stop the pattern from matching.
        try:
            plain_srt = p.with_name(f"{p.stem}.srt")
            lang_srt = p.with_name(f"{p.stem}.en.srt")
            # Copy only when the plain .srt is the sole subtitle; never clobber a real .en.srt.
            if plain_srt.is_file() and not lang_srt.exists():
                shutil.copy2(plain_srt, lang_srt)
        except Exception as e:
            print(f"[refresh] subtitle copy failed: {e}", flush=True)

        # Rebuild NFO using fresh info.json (and RSS if available)
        try:
            # Try RSS match to enrich metadata (non-fatal if not present)
            ep = None
            try:
                ep = match_media_to_rss(p)
            except Exception:
                ep = None

            fallback = {
                "title": p.stem,
                "episode_title": p.stem,
                "show": p.parent.name,
                "description": "",
                "pubdate": _extract_date_from_stem(p.stem),
                "duration_sec": media_duration_seconds(p),
                "image": "",
                "guid": "",
            }
            meta = build_meta_from_sources(p, p.parent.name, fallback, ep)

            # Save local artwork too; deliberately best-effort, a failure must not block the NFO.
            try:
                save_episode_artwork(meta.get("image"), p, meta.get("show"))
            except Exception:
                pass

            # If a transcript already exists, include it in the NFO plot preview
            ttxt_path = (TRN / p.stem).with_suffix(".txt")
            ttxt = ttxt_path.read_text(encoding="utf-8") if ttxt_path.exists() else None
            write_episode_nfo(p, meta, ttxt)
        except Exception as e:
            print(f"[refresh] NFO/artwork update failed: {e}", flush=True)

        log({"path": str(p), "status": "refresh-done"})
        print(f"[refresh] done for {p}", flush=True)

    except Exception as e:
        log({"path": path_str, "status": "error", "error": str(e)})
        raise
 | ||
| 
 | ||
def handle_web(url: str):
    """
    Snapshot a web page, index it, and hand the saved files to publish_to_openwebui.

    Progress is reported through log() with the statuses "web-downloading",
    "web-indexing" and finally "done". Exceptions from the helpers propagate to the caller.
    """
    info = {
        "url": url,
        "status":"web-downloading",
        "title":"",
        "uploader":"",
        "date":"",
        "path":"",
    }
    log(info)

    base, title, domain, date, text = save_web_snapshot(url)
    html_path = base.with_suffix('.html')
    info.update({"title": title, "uploader": domain, "date": date, "path": str(html_path)})

    log({**info, "status": "web-indexing"})
    index_web(base, title, domain, date, text, url)

    # Only publish the snapshot files that actually made it to disk (.txt first, then .html).
    push = []
    for candidate in (base.with_suffix('.txt'), html_path):
        if candidate.exists():
            push.append(candidate)
    publish_to_openwebui(push)

    log({**info, "status": "done"})
 | ||
| 
 | ||
def _relocate_companions(src: Path, dest: Path) -> None:
    """
    Move the yt-dlp sidecar files reported by find_companion_files(src) next to *dest*.

    Best-effort: every failure is printed as a warning and never raised, so a missing or
    unmovable sidecar cannot abort processing of the media file itself.
    """

    def _try_move(origin: Path, target: Path) -> bool:
        # Returns True when the move succeeded; failures are reported but swallowed.
        try:
            shutil.move(str(origin), str(target))
            return True
        except Exception as e:
            print(f"[post] could not move {origin} -> {target}: {e}", flush=True)
            return False

    try:
        companions = find_companion_files(src)

        # info.json -> prefer "<dest.name>.info.json", fallback to "<dest.stem>.info.json"
        info_src = companions.get("info")
        if info_src and info_src.exists():
            if not _try_move(info_src, dest.parent / f"{dest.name}.info.json"):
                _try_move(info_src, dest.parent / f"{dest.stem}.info.json")

        # thumbnail -> "<dest>.jpg"
        thumb_src = companions.get("thumb")
        if thumb_src and thumb_src.exists():
            _try_move(thumb_src, dest.with_suffix(".jpg"))

        # subtitles -> preserve language suffix: "<dest.stem><suffix>"
        src_stem = src.stem
        for sub in companions.get("subs", []):
            if not sub.exists():
                continue
            if sub.name.startswith(src_stem):
                # Everything after the media stem, e.g. ".en.srt" (includes the leading dot).
                suffix_tail = sub.name[len(src_stem):]
            else:
                suffix_tail = sub.suffix
            _try_move(sub, dest.parent / f"{dest.stem}{suffix_tail}")
    except Exception as e:
        print(f"[post] companion file handling failed for {dest}: {e}", flush=True)


def handle_url(url: str):
    """
    Process one queued item: a local file, a web page, or a media URL.

    * ``file://...`` URLs and existing absolute paths are delegated to handle_local_file.
    * URLs for which is_media_url() is false are delegated to handle_web.
    * Anything else is downloaded with yt_dlp() into TMP; each resulting file is moved into
      ``LIB/<uploader>/``, its sidecars are relocated, a transcript is obtained (RSS transcript,
      then a reusable repo transcript, then transcribe()), indexed, published, and an NFO plus
      artwork are written on a best-effort basis.

    Args:
        url: The URL or local path to process.

    Returns:
        Whatever handle_local_file returns for local inputs; otherwise None.

    Raises:
        Exception: Any unhandled failure is logged with status "error" and re-raised.
    """
    try:
        # If a local file path (or file:// URL) is provided, process it directly
        if url.startswith("file://"):
            return handle_local_file(url[7:])
        if url.startswith("/") and Path(url).exists():
            return handle_local_file(url)

        if not is_media_url(url):
            handle_web(url)
            return

        info = {"url": url, "status":"queued", "title":"", "uploader":"", "date":"", "path":""}
        log({**info, "status": "downloading"})
        files = yt_dlp(url, TMP)
        for f in files:
            # The first path component under TMP is used as the uploader directory name.
            parts = f.relative_to(TMP).parts
            uploader = sanitize(parts[0]) if len(parts) > 1 else "Unknown"
            dest_dir = LIB / uploader
            dest_dir.mkdir(parents=True, exist_ok=True)
            dest = dest_dir / sanitize(f.name)
            shutil.move(str(f), dest)

            # Move companion files produced by yt-dlp (info.json, thumbnail, subtitles)
            _relocate_companions(f, dest)

            # An 8-digit run in the stem is reported as the date field (empty when absent).
            date_match = re.search(r"\b(\d{8})\b", dest.stem)
            info.update({
                "title": dest.stem,
                "uploader": uploader,
                "date": date_match.group(1) if date_match else "",
                "path": str(dest),
            })
            log({**info, "status": "transcribing", "progress": 0})

            # Try RSS transcript resolver first
            ep = None
            try:
                ep = match_media_to_rss(dest)
            except Exception:
                ep = None
            base = use_rss_transcript(dest, ep) if ep else None

            # 1.5) If we didn't get an RSS transcript and there is a matching one already in the repo, reuse it
            if not base:
                repo_json = find_repo_transcript_for_media(dest)
                if repo_json:
                    base = reuse_repo_transcript(dest, repo_json)
            if not base:
                base = transcribe(dest)

            index_meili(base.with_suffix(".json"))
            publish_to_openwebui([base.with_suffix(".txt")])

            try:
                # Build metadata from RSS (if matched), yt-dlp info.json, and sensible fallbacks
                fallback = {
                    "title": dest.stem,
                    "episode_title": dest.stem,
                    "show": uploader,
                    "description": "",
                    "pubdate": _extract_date_from_stem(dest.stem),
                    "duration_sec": media_duration_seconds(dest),
                    "image": "",
                    "guid": "",
                }
                meta = build_meta_from_sources(dest, uploader, fallback, ep)
                ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
                write_episode_nfo(dest, meta, ttxt)
                # Save local artwork for Plex/Kodi from meta image url (best-effort by design)
                try:
                    save_episode_artwork(meta.get("image"), dest, meta.get("show"))
                except Exception:
                    pass
            except Exception as e:
                print(f"[post] NFO write failed: {e}", flush=True)

            log({**info, "status": "done"})
    except Exception as e:
        log({"url": url, "status":"error", "error": str(e)})
        raise