1779 lines
72 KiB
Python
1779 lines
72 KiB
Python
import os, subprocess, shutil, json, re, orjson, requests, unicodedata
|
||
from rq import Queue
|
||
from redis import Redis
|
||
from pathlib import Path
|
||
import math
|
||
import difflib
|
||
import time
|
||
from faster_whisper import WhisperModel
|
||
|
||
from xml.sax.saxutils import escape as xml_escape
|
||
|
||
MEILI_URL = os.getenv("MEILI_URL", "http://meili:7700")
|
||
MEILI_KEY = os.getenv("MEILI_KEY", "")
|
||
LIB = Path(os.getenv("LIBRARY_ROOT", "/library"))
|
||
TRN = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
|
||
TMP = Path(os.getenv("TMP_ROOT", "/tmpdl"))
|
||
|
||
|
||
# --- Runtime pause switch for CPU-heavy work (no rebuild needed) ---
|
||
PAUSE_TRANSCRIBE_FILE = Path(os.getenv("PAUSE_TRANSCRIBE_FILE", str(TRN / ".pause_transcribe")))
|
||
|
||
# Redis-backed pause flag (podx-tools compatible)
|
||
PAUSE_TRANSCRIBE_REDIS_KEY = os.getenv("PAUSE_TRANSCRIBE_REDIS_KEY", "podx:transcribe:paused").strip()
|
||
|
||
def _pause_flag_redis() -> bool:
|
||
"""Return True if a truthy pause flag is set in Redis under PAUSE_TRANSCRIBE_REDIS_KEY."""
|
||
try:
|
||
from redis import Redis as _R
|
||
val = _R.from_url(REDIS_URL).get(PAUSE_TRANSCRIBE_REDIS_KEY)
|
||
if not val:
|
||
return False
|
||
v = val.decode("utf-8", "ignore").strip().lower()
|
||
return v not in ("", "0", "false", "no", "(nil)")
|
||
except Exception:
|
||
return False
|
||
|
||
def transcribe_paused() -> bool:
|
||
"""Return True if new transcription work should be paused (file flag or Redis flag)."""
|
||
try:
|
||
if PAUSE_TRANSCRIBE_FILE.exists():
|
||
return True
|
||
except Exception:
|
||
pass
|
||
# Fall back to Redis-based switch used by podx-tools
|
||
return _pause_flag_redis()
|
||
|
||
def wait_if_paused(label: str = "transcribe", poll_sec: int = 10):
|
||
"""
|
||
If the pause file exists, block this worker in a low-CPU sleep loop until it is removed.
|
||
This lets you 'pause' heavy work without killing workers or rebuilding.
|
||
"""
|
||
try:
|
||
if transcribe_paused():
|
||
print(f"[pause] {label}: pause flag present at {PAUSE_TRANSCRIBE_FILE}; waiting…", flush=True)
|
||
while transcribe_paused():
|
||
time.sleep(max(1, int(poll_sec)))
|
||
except Exception:
|
||
# If anything goes wrong reading the flag, don't block the pipeline.
|
||
pass
|
||
|
||
# --- Exception to abort transcription when pause is requested ---
|
||
class PauseInterrupt(Exception):
|
||
"""Raised to cooperatively abort a running transcription when pause is requested."""
|
||
pass
|
||
|
||
MODEL_NAME = os.getenv("WHISPER_MODEL","large-v3")
|
||
COMPUTE = os.getenv("WHISPER_PRECISION","int8")
|
||
WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "auto").strip()
|
||
|
||
# Whisper device/config controls
|
||
WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "auto").strip()
|
||
WHISPER_DEVICE_INDEX = int(os.getenv("WHISPER_DEVICE_INDEX", "0"))
|
||
WHISPER_CPU_THREADS = int(os.getenv("WHISPER_CPU_THREADS", "4"))
|
||
|
||
# --- Host load guards / thread limits ---
|
||
# Limit ffmpeg threads (helps keep CPU in check when multiple workers run)
|
||
FFMPEG_THREADS = int(os.getenv("FFMPEG_THREADS", "1"))
|
||
|
||
# Tame BLAS/threadpools that libraries may spin up implicitly
|
||
import os as _os_threads
|
||
_os_threads.environ.setdefault("OMP_NUM_THREADS", str(WHISPER_CPU_THREADS))
|
||
_os_threads.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
|
||
_os_threads.environ.setdefault("MKL_NUM_THREADS", "1")
|
||
_os_threads.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
|
||
|
||
# Whisper logging & resume controls
|
||
WHISPER_LOG_SEGMENTS = os.getenv("WHISPER_LOG_SEGMENTS", "1") not in ("0", "false", "False")
|
||
WHISPER_RESUME = os.getenv("WHISPER_RESUME", "1") not in ("0", "false", "False")
|
||
PARTIAL_SAVE_EVERY_SEGS = int(os.getenv("WHISPER_PARTIAL_SAVE_EVERY_SEGS", "20"))
|
||
|
||
# RSS resolver config
|
||
RSS_INDEX_PATH = Path(os.getenv("RSS_INDEX_PATH", "/transcripts/rss_index.json"))
|
||
RSS_DURATION_TOLERANCE = int(os.getenv("RSS_DURATION_TOLERANCE", "150")) # seconds
|
||
DEFAULT_TRANSCRIPT_LANG = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
|
||
|
||
OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/")
|
||
OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "")
|
||
OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library")
|
||
|
||
# Redis-backed job queue settings and offload toggle
|
||
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0").strip()
|
||
OFFLOAD_TRANSCRIBE = os.getenv("OFFLOAD_TRANSCRIBE", "1").lower() not in ("0", "false", "no")
|
||
|
||
# Worker role selection
|
||
WORKER_MODE = os.getenv("WORKER_MODE", "all").strip().lower() # 'all' or 'transcribe'
|
||
JOB_QUEUES = [q.strip() for q in os.getenv("JOB_QUEUES", "default").split(",") if q.strip()]
|
||
|
||
def _mode_allows(task: str) -> bool:
|
||
"""Gate tasks by worker role. In 'transcribe' mode only allow transcription of local files
|
||
(including indexing and OWUI publish). "task" is one of: 'download','web','local','transcribe'."""
|
||
if WORKER_MODE == "transcribe":
|
||
return task in {"local", "transcribe"}
|
||
return True
|
||
|
||
TRN.mkdir(parents=True, exist_ok=True)
|
||
LIB.mkdir(parents=True, exist_ok=True)
|
||
TMP.mkdir(parents=True, exist_ok=True)
|
||
|
||
# Lazy Whisper model loader so the worker can start even if model download/setup is slow
|
||
_model = None
|
||
|
||
def get_model():
|
||
global _model
|
||
if _model is None:
|
||
print(f"[whisper] loading model='{MODEL_NAME}' device='{WHISPER_DEVICE}' idx={WHISPER_DEVICE_INDEX} compute='{COMPUTE}' threads={WHISPER_CPU_THREADS}", flush=True)
|
||
_model = WhisperModel(
|
||
MODEL_NAME,
|
||
device=WHISPER_DEVICE,
|
||
device_index=WHISPER_DEVICE_INDEX,
|
||
compute_type=COMPUTE,
|
||
cpu_threads=WHISPER_CPU_THREADS,
|
||
)
|
||
return _model
|
||
|
||
# --- Helper: Reset model with new device and device_index ---
|
||
def reset_model(device: str, device_index: int | None = None):
|
||
"""Reset the global _model to a new WhisperModel with the given device and device_index."""
|
||
global _model
|
||
idx = device_index if device_index is not None else WHISPER_DEVICE_INDEX
|
||
print(f"[whisper] resetting model='{MODEL_NAME}' device='{device}' idx={idx} compute='{COMPUTE}' threads={WHISPER_CPU_THREADS}", flush=True)
|
||
_model = WhisperModel(
|
||
MODEL_NAME,
|
||
device=device,
|
||
device_index=idx,
|
||
compute_type=COMPUTE,
|
||
cpu_threads=WHISPER_CPU_THREADS,
|
||
)
|
||
|
||
# --- Helper: Run transcribe with fallback to CPU on GPU/oom errors ---
|
||
def run_transcribe_with_fallback(wav_path: Path, lang):
|
||
"""
|
||
Try to transcribe with current model; on GPU/CUDA/HIP/ROCm/OOM errors, reset to CPU and retry once.
|
||
Returns (segments, info) or raises exception.
|
||
"""
|
||
model = get_model()
|
||
try:
|
||
return model.transcribe(str(wav_path), vad_filter=True, language=lang)
|
||
except Exception as e:
|
||
msg = str(e)
|
||
gpu_errs = [
|
||
"CUDA", "cublas", "out of memory", "HIP", "ROCm", "device-side assert", "CUDNN", "cudaError", "cuda runtime", "cudaMalloc"
|
||
]
|
||
if any(err.lower() in msg.lower() for err in gpu_errs):
|
||
print(f"[whisper] GPU error detected: '{msg}'. Retrying on CPU...", flush=True)
|
||
reset_model("cpu", 0)
|
||
try:
|
||
model = get_model()
|
||
return model.transcribe(str(wav_path), vad_filter=True, language=lang)
|
||
except Exception as e2:
|
||
print(f"[whisper] CPU fallback also failed: {e2}", flush=True)
|
||
raise
|
||
raise
|
||
|
||
def log(feed):
|
||
try:
|
||
with open(TRN / "_feed.log", "a", encoding="utf-8") as f:
|
||
f.write(orjson.dumps(feed).decode()+"\n")
|
||
except Exception:
|
||
pass
|
||
|
||
def sanitize(name):
|
||
return re.sub(r'[\\/:"*?<>|]+', ' ', name).strip()
|
||
|
||
# ---------- RSS transcript resolver ----------
|
||
|
||
def _normalize_title(t: str) -> str:
|
||
t = (t or "").lower()
|
||
t = re.sub(r"\s+", " ", t)
|
||
# remove punctuation-ish
|
||
t = re.sub(r"[^a-z0-9 _-]+", "", t)
|
||
return t.strip()
|
||
|
||
def _stem_without_date(stem: str) -> str:
|
||
# drop leading YYYYMMDD - from filenames created by yt-dlp template
|
||
m = re.match(r"^\d{8}\s*-\s*(.*)$", stem)
|
||
return m.group(1) if m else stem
|
||
|
||
def _extract_date_from_stem(stem: str) -> str | None:
|
||
m = re.search(r"\b(\d{8})\b", stem)
|
||
return m.group(1) if m else None
|
||
|
||
def _best_title_match(title: str, candidates: list[str]) -> tuple[str, float]:
|
||
"""Return (best_title, score 0..1) using difflib SequenceMatcher."""
|
||
if not candidates:
|
||
return "", 0.0
|
||
norm_title = _normalize_title(title)
|
||
best = ("", 0.0)
|
||
for c in candidates:
|
||
score = difflib.SequenceMatcher(None, norm_title, _normalize_title(c)).ratio()
|
||
if score > best[1]:
|
||
best = (c, score)
|
||
return best
|
||
|
||
def _load_rss_index() -> list[dict]:
|
||
try:
|
||
if RSS_INDEX_PATH.exists():
|
||
data = json.loads(RSS_INDEX_PATH.read_text(encoding="utf-8"))
|
||
# supports {"episodes":[...]} or a flat list
|
||
if isinstance(data, dict) and "episodes" in data:
|
||
return data["episodes"] or []
|
||
if isinstance(data, list):
|
||
return data
|
||
except Exception as e:
|
||
print(f"[resolver] failed to load RSS index: {e}", flush=True)
|
||
return []
|
||
|
||
def match_media_to_rss(media_path: Path) -> dict | None:
|
||
"""Try to match a local media file to an RSS episode entry."""
|
||
episodes = _load_rss_index()
|
||
if not episodes:
|
||
return None
|
||
|
||
stem = media_path.stem
|
||
title_no_date = _stem_without_date(stem)
|
||
file_date = _extract_date_from_stem(stem)
|
||
# duration tolerance
|
||
media_dur = media_duration_seconds(media_path)
|
||
|
||
# Candidates: filter by date if present, else all
|
||
if file_date:
|
||
pool = [e for e in episodes if (str(e.get("date", "")) == file_date or str(e.get("pubdate", "")) == file_date)]
|
||
if not pool:
|
||
pool = episodes
|
||
else:
|
||
pool = episodes
|
||
|
||
# Pick best by (title similarity, duration proximity)
|
||
best_ep, best_score = None, -1.0
|
||
for ep in pool:
|
||
ep_title = ep.get("title") or ep.get("itunes_title") or ""
|
||
sim = _best_title_match(title_no_date, [ep_title])[1]
|
||
dur = float(ep.get("duration_sec") or ep.get("duration") or 0.0)
|
||
dur_ok = True
|
||
if media_dur and dur:
|
||
dur_ok = abs(media_dur - dur) <= RSS_DURATION_TOLERANCE
|
||
score = sim + (0.1 if dur_ok else 0.0)
|
||
if score > best_score:
|
||
best_score, best_ep = score, ep
|
||
|
||
if best_ep and best_score >= 0.5:
|
||
print(f"[resolver] matched '{stem}' -> '{best_ep.get('title','')}' score={best_score:.2f}", flush=True)
|
||
return best_ep
|
||
return None
|
||
|
||
def _choose_transcript_url(ep: dict) -> tuple[str, str] | tuple[None, None]:
|
||
"""Return (url, kind) preferring txt, vtt, then srt. 'kind' in {'txt','vtt','srt'}."""
|
||
# unified structure from rss_ingest.py: ep["transcripts"] = [{"url":..., "type": ...}, ...]
|
||
items = ep.get("transcripts") or []
|
||
# some ingesters store separate keys
|
||
if not items:
|
||
for key, kind in [("transcript_txt","txt"), ("transcript_vtt","vtt"), ("transcript_srt","srt")]:
|
||
if ep.get(key):
|
||
items.append({"url": ep[key], "type": kind})
|
||
# preference order
|
||
for kind in ["txt", "vtt", "srt"]:
|
||
for it in items:
|
||
t = (it.get("type") or "").lower()
|
||
u = it.get("url") or ""
|
||
if u and (kind in t or (kind == "txt" and t in ["text","plain","text/plain"]) or (kind in u.lower())):
|
||
return u, kind
|
||
return (None, None)
|
||
|
||
def fetch_rss_transcript(ep: dict, dest_dir: Path) -> Path | None:
|
||
"""Download transcript to dest_dir and return local Path; convert VTT->SRT if needed."""
|
||
url, kind = _choose_transcript_url(ep)
|
||
if not url:
|
||
return None
|
||
dest_dir.mkdir(parents=True, exist_ok=True)
|
||
# filename from episode title
|
||
safe = sanitize(ep.get("title") or ep.get("guid") or "episode")
|
||
path = dest_dir / f"{safe}.{kind if kind!='txt' else 'txt'}"
|
||
try:
|
||
r = requests.get(url, timeout=30)
|
||
r.raise_for_status()
|
||
mode = "wb" if kind in ("vtt","srt") else "w"
|
||
if mode == "wb":
|
||
path.write_bytes(r.content)
|
||
else:
|
||
path.write_text(r.text, encoding="utf-8")
|
||
print(f"[resolver] downloaded transcript ({kind}) from {url}", flush=True)
|
||
return path
|
||
except Exception as e:
|
||
print(f"[resolver] failed to fetch transcript: {e}", flush=True)
|
||
return None
|
||
|
||
def use_rss_transcript(media_path: Path, ep: dict) -> Path | None:
|
||
"""Create standard transcript artifacts from an RSS transcript (txt/vtt/srt)."""
|
||
# Prefer direct download; else if rss_ingest already saved a local file path, try that.
|
||
sidecar = None
|
||
local_hint = ep.get("transcript_local")
|
||
if local_hint:
|
||
p = Path(local_hint)
|
||
if p.exists():
|
||
sidecar = p
|
||
if sidecar is None:
|
||
sidecar = fetch_rss_transcript(ep, TMP)
|
||
|
||
if not sidecar or not sidecar.exists():
|
||
return None
|
||
|
||
# Convert to plain text
|
||
plain = transcript_text_from_file(sidecar)
|
||
lang = (ep.get("language") or ep.get("lang") or DEFAULT_TRANSCRIPT_LANG).split("-")[0]
|
||
base = write_plain_transcript(media_path, plain, language=lang)
|
||
# Place an SRT next to video for Plex
|
||
ensure_sidecar_next_to_media(sidecar, media_path, lang=lang)
|
||
# Write provenance sidecar
|
||
(base.with_suffix(".prov.json")).write_bytes(orjson.dumps({
|
||
"source": "rss",
|
||
"feed": ep.get("feed_url"),
|
||
"guid": ep.get("guid"),
|
||
"episode_title": ep.get("title"),
|
||
"transcript_kind": sidecar.suffix.lower().lstrip("."),
|
||
"transcript_url": _choose_transcript_url(ep)[0] or "",
|
||
}))
|
||
# Write Kodi/Plex-compatible NFO
|
||
try:
|
||
# Gather metadata for NFO from RSS entry
|
||
meta = {
|
||
"title": ep.get("title"),
|
||
"episode_title": ep.get("title"),
|
||
"show": ep.get("podcast_title") or ep.get("feed_title") or ep.get("show"),
|
||
"description": ep.get("description") or ep.get("content"),
|
||
"pubdate": ep.get("pubdate"),
|
||
"pubdate_iso": ep.get("date_iso"),
|
||
"duration_sec": ep.get("duration_sec") or ep.get("duration"),
|
||
"image": ep.get("image") or ep.get("image_url"),
|
||
"guid": ep.get("guid"),
|
||
}
|
||
txt_path = base.with_suffix(".txt")
|
||
transcript_text = txt_path.read_text(encoding="utf-8") if txt_path.exists() else None
|
||
write_episode_nfo(media_path, meta, transcript_text)
|
||
# Save local artwork for Plex/Kodi
|
||
try:
|
||
save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
|
||
except Exception:
|
||
pass
|
||
except Exception as e:
|
||
print(f"[post] NFO write failed: {e}", flush=True)
|
||
return base
|
||
|
||
def find_sidecar_transcript(media_path: Path) -> Path | None:
|
||
"""Return a .txt/.srt/.vtt transcript file sitting next to media, if any.
|
||
Tries common variants including language-suffixed SRT/VTT.
|
||
"""
|
||
candidates: list[Path] = []
|
||
# exact same stem in same folder
|
||
for ext in [".txt", ".srt", ".vtt"]:
|
||
p = media_path.parent / (media_path.stem + ext)
|
||
if p.exists():
|
||
candidates.append(p)
|
||
# language-suffixed near the media file (e.g., .en.srt)
|
||
for ext in [".srt", ".vtt"]:
|
||
p = media_path.with_suffix(f".en{ext}")
|
||
if p.exists() and p not in candidates:
|
||
candidates.append(p)
|
||
return candidates[0] if candidates else None
|
||
|
||
|
||
# ---------- Transcript repository reuse helpers ----------
|
||
|
||
def find_repo_transcript_for_media(media_path: Path) -> Path | None:
|
||
"""Search the transcript repository (/transcripts) for an existing transcript
|
||
that likely belongs to this media file (match by YYYYMMDD in filename and/or
|
||
fuzzy title similarity). Returns a path to a matching .json if found."""
|
||
try:
|
||
stem = media_path.stem
|
||
title_no_date = _stem_without_date(stem)
|
||
file_date = _extract_date_from_stem(stem)
|
||
best_json, best_score = None, 0.0
|
||
for j in TRN.glob("*.json"):
|
||
try:
|
||
data = json.loads(j.read_text(encoding="utf-8"))
|
||
except Exception:
|
||
continue
|
||
other_file = Path(data.get("file", ""))
|
||
other_stem = other_file.stem if other_file else j.stem
|
||
other_date = _extract_date_from_stem(other_stem)
|
||
# If both have dates and they differ a lot, skip
|
||
if file_date and other_date and file_date != other_date:
|
||
continue
|
||
# Compare titles (without dates)
|
||
sim = difflib.SequenceMatcher(
|
||
None,
|
||
_normalize_title(title_no_date),
|
||
_normalize_title(_stem_without_date(other_stem)),
|
||
).ratio()
|
||
# Nudge score when dates match
|
||
if file_date and other_date and file_date == other_date:
|
||
sim += 0.1
|
||
if sim > best_score:
|
||
best_score, best_json = sim, j
|
||
# Require a reasonable similarity
|
||
return best_json if best_json and best_score >= 0.60 else None
|
||
except Exception:
|
||
return None
|
||
|
||
|
||
def reuse_repo_transcript(media_path: Path, repo_json: Path) -> Path | None:
|
||
"""Copy/retarget an existing transcript JSON/TXT (and make SRT/VTT if possible)
|
||
from the repository so that it belongs to the provided media_path. Returns
|
||
the new base path in /transcripts or None."""
|
||
try:
|
||
# load the source transcript
|
||
data = json.loads(repo_json.read_text(encoding="utf-8"))
|
||
src_base = TRN / Path(repo_json).stem
|
||
src_txt = src_base.with_suffix(".txt")
|
||
src_srt = src_base.with_suffix(".srt")
|
||
src_vtt = src_base.with_suffix(".vtt")
|
||
|
||
# write the retargeted artifacts
|
||
new_title = media_path.stem
|
||
new_base = TRN / new_title
|
||
new_base.parent.mkdir(parents=True, exist_ok=True)
|
||
|
||
# update file path
|
||
data["file"] = str(media_path)
|
||
(new_base.with_suffix(".json")).write_bytes(orjson.dumps(data))
|
||
|
||
# copy or synthesize TXT
|
||
if src_txt.exists():
|
||
shutil.copy2(src_txt, new_base.with_suffix(".txt"))
|
||
else:
|
||
# fallback: concatenate segments
|
||
txt = " ".join(s.get("text", "") for s in data.get("segments", []))
|
||
(new_base.with_suffix(".txt")).write_text(txt, encoding="utf-8")
|
||
|
||
# copy SRT/VTT if present; otherwise synthesize SRT from segments
|
||
if src_srt.exists():
|
||
shutil.copy2(src_srt, new_base.with_suffix(".srt"))
|
||
else:
|
||
# synthesize SRT
|
||
def fmt_ts(t):
|
||
h=int(t//3600); m=int((t%3600)//60); s=t-(h*3600+m*60)
|
||
return f"{h:02}:{m:02}:{s:06.3f}".replace('.',',')
|
||
with open(new_base.with_suffix(".srt"), "w", encoding="utf-8") as srt:
|
||
for i, s in enumerate(data.get("segments", []), 1):
|
||
srt.write(f"{i}\n{fmt_ts(s.get('start',0.0))} --> {fmt_ts(s.get('end',0.0))}\n{s.get('text','').strip()}\n\n")
|
||
if src_vtt.exists():
|
||
shutil.copy2(src_vtt, new_base.with_suffix(".vtt"))
|
||
else:
|
||
# synthesize VTT from segments
|
||
def fmt_ts_vtt(t):
|
||
h=int(t//3600); m=int((t%3600)//60); s=t-(h*3600+m*60)
|
||
return f"{h:02}:{m:02}:{s:06.3f}"
|
||
with open(new_base.with_suffix(".vtt"), "w", encoding="utf-8") as vtt:
|
||
vtt.write("WEBVTT\n\n")
|
||
for s in data.get("segments", []):
|
||
vtt.write(f"{fmt_ts_vtt(s.get('start',0.0))} --> {fmt_ts_vtt(s.get('end',0.0))} \n{s.get('text','').strip()}\n\n")
|
||
|
||
# ensure sidecar next to media
|
||
try:
|
||
lang = (data.get("language") or DEFAULT_TRANSCRIPT_LANG).split("-")[0]
|
||
ensure_sidecar_next_to_media(new_base.with_suffix(".srt"), media_path, lang=lang)
|
||
except Exception:
|
||
pass
|
||
|
||
# Write Kodi/Plex-compatible NFO
|
||
try:
|
||
meta = {
|
||
"title": data.get("title") or media_path.stem,
|
||
"episode_title": data.get("title") or media_path.stem,
|
||
"show": data.get("show") or media_path.parent.name,
|
||
"description": data.get("description") or "",
|
||
"pubdate": data.get("pubdate") or data.get("date"),
|
||
"duration_sec": media_duration_seconds(media_path),
|
||
"image": data.get("image"),
|
||
"guid": data.get("guid") or data.get("id"),
|
||
}
|
||
txtp = new_base.with_suffix(".txt")
|
||
ttxt = txtp.read_text(encoding="utf-8") if txtp.exists() else None
|
||
write_episode_nfo(media_path, meta, ttxt)
|
||
# Save local artwork for Plex/Kodi
|
||
try:
|
||
save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
|
||
except Exception:
|
||
pass
|
||
except Exception as e:
|
||
print(f"[post] NFO write failed: {e}", flush=True)
|
||
|
||
return new_base
|
||
except Exception as e:
|
||
print(f"[resolver] failed to reuse repo transcript: {e}", flush=True)
|
||
return None
|
||
|
||
|
||
def transcript_text_from_file(path: Path) -> str:
|
||
"""Extract plain text from .txt/.srt/.vtt by stripping timestamps and counters."""
|
||
try:
|
||
raw = path.read_text(encoding="utf-8", errors="ignore")
|
||
except Exception:
|
||
raw = path.read_text(errors="ignore")
|
||
|
||
if path.suffix.lower() == ".txt":
|
||
return raw.strip()
|
||
|
||
# For SRT/VTT, drop timestamp lines, cue numbers and headers
|
||
lines: list[str] = []
|
||
for line in raw.splitlines():
|
||
ls = line.strip()
|
||
if not ls:
|
||
continue
|
||
if "-->" in ls: # timestamp line
|
||
continue
|
||
if ls.upper().startswith("WEBVTT"):
|
||
continue
|
||
if re.match(r"^\d+$", ls): # cue index
|
||
continue
|
||
lines.append(ls)
|
||
return " ".join(lines)
|
||
|
||
|
||
def ensure_sidecar_next_to_media(sidecar: Path, media_path: Path, lang: str = "en") -> None:
|
||
"""Ensure an `.lang.srt` sits next to the media for Plex. Convert VTT→SRT if needed. If the sidecar is .txt, do nothing."""
|
||
try:
|
||
if sidecar.suffix.lower() == ".txt":
|
||
return
|
||
if sidecar.suffix.lower() == ".srt":
|
||
dst = media_path.with_suffix(f".{lang}.srt")
|
||
shutil.copy2(sidecar, dst)
|
||
elif sidecar.suffix.lower() == ".vtt":
|
||
tmp_srt = sidecar.with_suffix(".srt")
|
||
subprocess.run(["ffmpeg", "-nostdin", "-y", "-threads", str(FFMPEG_THREADS), "-i", str(sidecar), str(tmp_srt)], check=True)
|
||
dst = media_path.with_suffix(f".{lang}.srt")
|
||
shutil.move(str(tmp_srt), dst)
|
||
except Exception as e:
|
||
print(f"[post] sidecar copy/convert failed: {e}", flush=True)
|
||
|
||
|
||
# --- small helpers for progress/ETA formatting ---
|
||
def _fmt_eta(sec: float) -> str:
|
||
try:
|
||
sec = max(0, int(sec))
|
||
h, rem = divmod(sec, 3600)
|
||
m, s = divmod(rem, 60)
|
||
if h:
|
||
return f"{h}h {m}m {s}s"
|
||
if m:
|
||
return f"{m}m {s}s"
|
||
return f"{s}s"
|
||
except Exception:
|
||
return ""
|
||
|
||
|
||
def save_episode_artwork(image_url: str | None, media_path: Path, show_title: str | None = None):
|
||
"""Download episode artwork from image_url and save next to the media as '<basename>.jpg'.
|
||
Also drop a folder-level 'poster.jpg' for the show directory if not present.
|
||
Best-effort; failures are logged but non-fatal.
|
||
"""
|
||
if not image_url:
|
||
return
|
||
try:
|
||
resp = requests.get(image_url, timeout=30, stream=True)
|
||
resp.raise_for_status()
|
||
# Determine content-type and write a temporary file
|
||
ctype = (resp.headers.get("Content-Type") or "").lower()
|
||
tmp_file = media_path.with_suffix(".art.tmp")
|
||
with open(tmp_file, "wb") as out:
|
||
for chunk in resp.iter_content(chunk_size=8192):
|
||
if chunk:
|
||
out.write(chunk)
|
||
|
||
# Always provide a .jpg next to the media for Plex
|
||
episode_jpg = media_path.with_suffix(".jpg")
|
||
if "image/jpeg" in ctype:
|
||
# Already JPEG
|
||
shutil.move(str(tmp_file), str(episode_jpg))
|
||
else:
|
||
# Try converting to JPEG with ffmpeg; if it fails, keep bytes as-is
|
||
try:
|
||
subprocess.run(
|
||
["ffmpeg", "-nostdin", "-y", "-i", str(tmp_file), str(episode_jpg)],
|
||
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True
|
||
)
|
||
try:
|
||
tmp_file.unlink()
|
||
except Exception:
|
||
pass
|
||
except Exception:
|
||
shutil.move(str(tmp_file), str(episode_jpg))
|
||
|
||
# Also drop a folder poster once per show (helps Plex folder views)
|
||
try:
|
||
show_poster = media_path.parent / "poster.jpg"
|
||
if not show_poster.exists():
|
||
shutil.copy2(episode_jpg, show_poster)
|
||
except Exception:
|
||
pass
|
||
|
||
except Exception as e:
|
||
print(f"[post] artwork download failed: {e}", flush=True)
|
||
|
||
|
||
|
||
def find_companion_files(src: Path) -> dict:
|
||
"""Return likely yt-dlp companion files for a downloaded media file."""
|
||
out = {}
|
||
# info.json can be either "<name>.<ext>.info.json" or "<name>.info.json"
|
||
cands_info = [
|
||
src.parent / f"{src.name}.info.json",
|
||
src.parent / f"{src.stem}.info.json",
|
||
]
|
||
out["info"] = next((p for p in cands_info if p.exists()), None)
|
||
|
||
# thumbnails may be "<name>.<ext>.jpg" or "<name>.jpg" (we convert to jpg)
|
||
cand_thumbs = [
|
||
src.parent / f"{src.name}.jpg",
|
||
src.parent / f"{src.stem}.jpg",
|
||
src.parent / f"{src.stem}.jpeg",
|
||
src.parent / f"{src.stem}.png",
|
||
src.parent / f"{src.stem}.webp",
|
||
]
|
||
out["thumb"] = next((p for p in cand_thumbs if p.exists()), None)
|
||
|
||
# subtitles (keep multiple)
|
||
subs = []
|
||
for s in src.parent.glob(f"{src.stem}*.srt"):
|
||
subs.append(s)
|
||
for s in src.parent.glob(f"{src.stem}*.vtt"):
|
||
subs.append(s)
|
||
out["subs"] = subs
|
||
return out
|
||
|
||
def load_info_json(path: Path) -> dict | None:
|
||
try:
|
||
return json.loads(path.read_text(encoding="utf-8"))
|
||
except Exception:
|
||
return None
|
||
|
||
def _iso_from_yyyymmdd(s: str | None) -> str | None:
|
||
if not s or not re.match(r"^\d{8}$", s):
|
||
return None
|
||
return f"{s[0:4]}-{s[4:6]}-{s[6:8]}"
|
||
|
||
def build_meta_from_sources(media_path: Path, uploader: str, fallback_meta: dict, ep: dict | None = None) -> dict:
|
||
"""
|
||
Merge metadata from (priority): RSS episode `ep` -> yt-dlp info.json (if present) -> fallback.
|
||
Returns a dict compatible with write_episode_nfo().
|
||
"""
|
||
# Start with fallback
|
||
meta = dict(fallback_meta)
|
||
|
||
# Augment from info.json if present
|
||
info = None
|
||
for cand in [
|
||
media_path.parent / f"{media_path.name}.info.json",
|
||
media_path.parent / f"{media_path.stem}.info.json",
|
||
]:
|
||
if cand.exists():
|
||
info = load_info_json(cand)
|
||
break
|
||
if info:
|
||
meta.setdefault("title", info.get("title"))
|
||
meta.setdefault("episode_title", info.get("title"))
|
||
meta.setdefault("description", info.get("description") or info.get("fulltitle"))
|
||
# upload_date is YYYYMMDD
|
||
iso = _iso_from_yyyymmdd(info.get("upload_date"))
|
||
if iso:
|
||
meta["pubdate_iso"] = iso
|
||
# Prefer video duration if present
|
||
if not meta.get("duration_sec") and info.get("duration"):
|
||
meta["duration_sec"] = info.get("duration")
|
||
# thumbnail URL
|
||
if not meta.get("image"):
|
||
meta["image"] = info.get("thumbnail")
|
||
# show/uploader
|
||
if not meta.get("show"):
|
||
meta["show"] = info.get("uploader") or uploader
|
||
|
||
# Finally, layer RSS data on top if available (most authoritative for podcasts)
|
||
if ep:
|
||
meta.update({
|
||
"title": ep.get("title") or meta.get("title"),
|
||
"episode_title": ep.get("title") or meta.get("episode_title"),
|
||
"show": ep.get("podcast_title") or ep.get("feed_title") or ep.get("show") or meta.get("show") or uploader,
|
||
"description": ep.get("description") or ep.get("content") or meta.get("description", ""),
|
||
"pubdate": ep.get("pubdate") or meta.get("pubdate", ""),
|
||
"pubdate_iso": ep.get("date_iso") or meta.get("pubdate_iso", meta.get("pubdate")),
|
||
"duration_sec": ep.get("duration_sec") or ep.get("duration") or meta.get("duration_sec"),
|
||
"image": ep.get("image") or ep.get("image_url") or meta.get("image", ""),
|
||
"guid": ep.get("guid") or meta.get("guid", ""),
|
||
})
|
||
|
||
return meta
|
||
|
||
# ---------- Kodi/Plex NFO writer ----------
|
||
from datetime import datetime
|
||
|
||
def _first_nonempty(*vals):
|
||
for v in vals:
|
||
if v is None:
|
||
continue
|
||
if isinstance(v, str) and v.strip():
|
||
return v.strip()
|
||
if v:
|
||
return v
|
||
return None
|
||
|
||
def _coerce_aired(pubdate: str | None) -> str:
|
||
"""Convert RSS-style pubdate to YYYY-MM-DD if possible."""
|
||
if not pubdate:
|
||
return ""
|
||
# already ISO-like
|
||
m = re.match(r"^(\d{4})[-/](\d{2})[-/](\d{2})", pubdate)
|
||
if m:
|
||
return f"{m.group(1)}-{m.group(2)}-{m.group(3)}"
|
||
# RFC 2822 example: Tue, 21 Feb 2023 06:00:00 +0000
|
||
try:
|
||
dt = datetime.strptime(pubdate[:31], "%a, %d %b %Y %H:%M:%S %z")
|
||
return dt.strftime("%Y-%m-%d")
|
||
except Exception:
|
||
# try without tz
|
||
try:
|
||
dt = datetime.strptime(pubdate[:25], "%a, %d %b %Y %H:%M:%S")
|
||
return dt.strftime("%Y-%m-%d")
|
||
except Exception:
|
||
return ""
|
||
|
||
def write_episode_nfo(media_path: Path, meta: dict, transcript_text: str | None = None) -> Path:
|
||
"""Write a minimal Kodi/Plex-compatible NFO next to the media file.
|
||
`meta` may include: title, show, plot, pubdate, duration_sec, thumb, guid.
|
||
"""
|
||
try:
|
||
title = _first_nonempty(meta.get("episode_title"), meta.get("title"), media_path.stem) or media_path.stem
|
||
show = _first_nonempty(meta.get("show"), meta.get("podcast_title"), meta.get("feed_title"), media_path.parent.name) or media_path.parent.name
|
||
plot = _first_nonempty(meta.get("description"), meta.get("content"), meta.get("summary"), "") or ""
|
||
# Optionally append transcript preview to plot
|
||
if transcript_text:
|
||
preview = transcript_text.strip()
|
||
if preview:
|
||
preview = (preview[:1800] + "…") if len(preview) > 1800 else preview
|
||
plot = (plot + "\n\n" if plot else "") + preview
|
||
aired = _coerce_aired(_first_nonempty(meta.get("pubdate_iso"), meta.get("pubdate")))
|
||
guid = _first_nonempty(meta.get("guid"), meta.get("id"), "") or ""
|
||
thumb = _first_nonempty(meta.get("image"), meta.get("image_url"), meta.get("thumbnail"), "") or ""
|
||
dur_s = meta.get("duration_sec") or meta.get("duration") or 0
|
||
try:
|
||
dur_min = int(round(float(dur_s) / 60.0)) if dur_s else 0
|
||
except Exception:
|
||
dur_min = 0
|
||
|
||
# Build XML
|
||
xml = ["<episodedetails>"]
|
||
xml.append(f" <title>{xml_escape(title)}</title>")
|
||
xml.append(f" <showtitle>{xml_escape(show)}</showtitle>")
|
||
if plot:
|
||
xml.append(f" <plot>{xml_escape(plot)}</plot>")
|
||
if aired:
|
||
xml.append(f" <aired>{xml_escape(aired)}</aired>")
|
||
if guid:
|
||
xml.append(f" <uniqueid type=\"guid\" default=\"true\">{xml_escape(guid)}</uniqueid>")
|
||
if dur_min:
|
||
xml.append(f" <runtime>{dur_min}</runtime>")
|
||
if thumb:
|
||
xml.append(f" <thumb>{xml_escape(thumb)}</thumb>")
|
||
xml.append("</episodedetails>\n")
|
||
nfo_path = media_path.with_suffix(".nfo")
|
||
nfo_path.write_text("\n".join(xml), encoding="utf-8")
|
||
return nfo_path
|
||
except Exception:
|
||
return media_path.with_suffix(".nfo")
|
||
|
||
def write_plain_transcript(media_path: Path, text: str, language: str = "en") -> Path:
|
||
"""Write minimal transcript artifacts (.txt + .json) from plain text (no timestamps)."""
|
||
title = media_path.stem
|
||
base = TRN / title
|
||
base.parent.mkdir(parents=True, exist_ok=True)
|
||
(base.with_suffix(".txt")).write_text(text, encoding="utf-8")
|
||
(base.with_suffix(".json")).write_bytes(orjson.dumps({
|
||
"file": str(media_path),
|
||
"language": language,
|
||
"segments": [{"start": 0.0, "end": 0.0, "text": text}]
|
||
}))
|
||
return base
|
||
|
||
def yt_dlp(url, outdir):
|
||
# 1) Normalize YouTube Music URLs to standard YouTube
|
||
yurl = url
|
||
if 'music.youtube.com' in yurl:
|
||
yurl = yurl.replace('music.youtube.com', 'www.youtube.com')
|
||
|
||
outtmpl = str(outdir / "%(uploader)s/%(upload_date)s - %(title)s.%(ext)s")
|
||
|
||
base_cmd = [
|
||
"yt-dlp", "-o", outtmpl,
|
||
"-f", "bv*+ba/best",
|
||
"--write-info-json",
|
||
"--write-thumbnail",
|
||
"--convert-thumbnails", "jpg",
|
||
"--write-subs", "--write-auto-subs",
|
||
"--sub-langs", os.getenv("YTDLP_SUBS_LANGS", "en.*,en"),
|
||
"--convert-subs", "srt",
|
||
"--no-playlist", "--no-warnings", "--restrict-filenames",
|
||
]
|
||
|
||
# 3) Optional cookies (set YTDLP_COOKIES=/path/to/cookies.txt in .env and mount it)
|
||
cookies_path = os.getenv("YTDLP_COOKIES", "").strip()
|
||
if cookies_path:
|
||
base_cmd += ["--cookies", cookies_path]
|
||
|
||
# Primary attempt
|
||
try:
|
||
subprocess.check_call(base_cmd + [yurl])
|
||
except subprocess.CalledProcessError:
|
||
# 2) Retry with Android client + mobile UA
|
||
retry_cmd = base_cmd + [
|
||
"--extractor-args", "youtube:player_client=android",
|
||
"--user-agent", "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Mobile Safari/537.36",
|
||
yurl,
|
||
]
|
||
subprocess.check_call(retry_cmd)
|
||
|
||
media = (
|
||
list(outdir.rglob("*.[mM][pP]4")) +
|
||
list(outdir.rglob("*.mkv")) +
|
||
list(outdir.rglob("*.webm")) +
|
||
list(outdir.rglob("*.m4a")) +
|
||
list(outdir.rglob("*.mp3"))
|
||
)
|
||
return sorted(media, key=lambda p: p.stat().st_mtime)[-1:]
|
||
|
||
def extract_audio(src: Path, outdir: Path) -> Path:
|
||
"""Extract mono 16kHz WAV for robust transcription (handles odd containers/codecs)."""
|
||
outdir.mkdir(parents=True, exist_ok=True)
|
||
wav_path = outdir / (src.stem + ".wav")
|
||
# Force audio-only, mono, 16kHz WAV
|
||
cmd = [
|
||
"ffmpeg", "-nostdin", "-y",
|
||
"-threads", str(FFMPEG_THREADS),
|
||
"-i", str(src),
|
||
"-vn", "-ac", "1", "-ar", "16000",
|
||
"-f", "wav", str(wav_path),
|
||
]
|
||
try:
|
||
subprocess.check_output(cmd, stderr=subprocess.STDOUT)
|
||
except subprocess.CalledProcessError as e:
|
||
raise RuntimeError(f"ffmpeg extract failed: {e.output.decode(errors='ignore')}")
|
||
return wav_path
|
||
|
||
# --- WAV trimming helper ---
|
||
def trim_wav(src_wav: Path, start_sec: float, outdir: Path) -> Path:
|
||
"""Return a trimmed 16k mono WAV starting at start_sec from src_wav."""
|
||
outdir.mkdir(parents=True, exist_ok=True)
|
||
if not start_sec or start_sec <= 0.0:
|
||
return src_wav
|
||
dst = outdir / (src_wav.stem + f".from_{int(start_sec)}s.wav")
|
||
try:
|
||
subprocess.check_output([
|
||
"ffmpeg", "-nostdin", "-y",
|
||
"-ss", str(max(0.0, float(start_sec))),
|
||
"-i", str(src_wav),
|
||
"-vn", "-ac", "1", "-ar", "16000",
|
||
"-f", "wav", str(dst),
|
||
], stderr=subprocess.STDOUT)
|
||
return dst
|
||
except subprocess.CalledProcessError as e:
|
||
# If trimming fails, fall back to full file
|
||
print(f"[whisper] trim failed, using full WAV: {e.output.decode(errors='ignore')}", flush=True)
|
||
return src_wav
|
||
|
||
def media_duration_seconds(path: Path) -> float:
|
||
"""Return duration in seconds using ffprobe; fallback to 0.0 on error."""
|
||
try:
|
||
out = subprocess.check_output([
|
||
"ffprobe", "-v", "error", "-show_entries", "format=duration",
|
||
"-of", "default=nokey=1:noprint_wrappers=1", str(path)
|
||
], stderr=subprocess.STDOUT, text=True).strip()
|
||
return float(out) if out else 0.0
|
||
except Exception:
|
||
return 0.0
|
||
|
||
# --- Partial transcript helpers ---
|
||
def _partial_paths(title: str) -> tuple[Path, Path]:
|
||
base = TRN / title
|
||
return base.with_suffix(".partial.json"), base.with_suffix(".partial.txt")
|
||
|
||
def _save_partial(title: str, language: str, segs: list[dict]):
|
||
pjson, ptxt = _partial_paths(title)
|
||
try:
|
||
# Save JSON
|
||
pjson.write_bytes(orjson.dumps({"file": str((TRN / title).with_suffix('.wav')), "language": language, "segments": segs}))
|
||
except Exception as e:
|
||
print(f"[whisper] partial json save failed: {e}", flush=True)
|
||
try:
|
||
# Save TXT snapshot
|
||
ptxt.write_text(" ".join(s.get("text","") for s in segs), encoding="utf-8")
|
||
except Exception as e:
|
||
print(f"[whisper] partial txt save failed: {e}", flush=True)
|
||
|
||
def transcribe(media_path: Path):
|
||
print(f"[whisper] start transcribe: {media_path}", flush=True)
|
||
# If paused, abort before any heavy work (no ffmpeg, no model load)
|
||
if transcribe_paused():
|
||
print(f"[pause] transcribe: pause active before heavy work; aborting {media_path}", flush=True)
|
||
raise PauseInterrupt("pause requested before start")
|
||
# 1) Robustly extract audio to 16k mono WAV (fixes pyAV/webm edge cases)
|
||
wav = extract_audio(media_path, TMP)
|
||
# Check again after extraction to avoid loading the model if a pause was requested meanwhile
|
||
if transcribe_paused():
|
||
try:
|
||
if wav.exists():
|
||
wav.unlink()
|
||
except Exception:
|
||
pass
|
||
print(f"[pause] transcribe: pause activated; stopping before model load for {media_path}", flush=True)
|
||
raise PauseInterrupt("pause requested after extract")
|
||
|
||
title = media_path.stem
|
||
base = TRN / title
|
||
|
||
# Resume support: if a partial checkpoint exists, load it and trim input
|
||
resume_segments = []
|
||
resume_offset = 0.0
|
||
language_hint = None
|
||
if WHISPER_RESUME:
|
||
pjson, ptxt = _partial_paths(title)
|
||
if pjson.exists():
|
||
try:
|
||
pdata = json.loads(pjson.read_text(encoding="utf-8"))
|
||
resume_segments = pdata.get("segments", []) or []
|
||
if resume_segments:
|
||
resume_offset = float(resume_segments[-1].get("end", 0.0))
|
||
language_hint = pdata.get("language")
|
||
print(f"[whisper] resuming from ~{resume_offset:.2f}s with {len(resume_segments)} segments", flush=True)
|
||
except Exception as e:
|
||
print(f"[whisper] failed to load partial: {e}", flush=True)
|
||
|
||
# If resuming, trim WAV from last end time
|
||
wav_for_run = trim_wav(wav, resume_offset, TMP)
|
||
|
||
# 2) Language selection
|
||
lang = None if WHISPER_LANGUAGE.lower() == "auto" else WHISPER_LANGUAGE
|
||
if language_hint and WHISPER_LANGUAGE.lower() == "auto":
|
||
# carry hint forward if available
|
||
lang = language_hint
|
||
|
||
# 3) Transcribe
|
||
segments, info = run_transcribe_with_fallback(wav_for_run, lang)
|
||
|
||
# Determine duration for progress; use full WAV duration for consistent % regardless of resume
|
||
dur = media_duration_seconds(wav) or 0.0
|
||
# Start wall clock timer for speed/ETA
|
||
start_wall = time.time()
|
||
if WHISPER_RESUME and resume_offset and dur and resume_offset >= dur:
|
||
print(f"[whisper] resume offset {resume_offset:.2f}s >= duration {dur:.2f}s; resetting resume.", flush=True)
|
||
resume_offset = 0.0
|
||
last_pct = -1
|
||
|
||
segs = list(resume_segments) # start with what we already have
|
||
text_parts = [s.get("text","") for s in resume_segments]
|
||
|
||
# Walk new segments; shift their timestamps by resume_offset if trimmed
|
||
seg_count_since_save = 0
|
||
seg_index = len(resume_segments)
|
||
for s in segments:
|
||
seg_index += 1
|
||
start = (s.start or 0.0) + resume_offset
|
||
end = (s.end or 0.0) + resume_offset
|
||
seg = {"start": start, "end": end, "text": s.text}
|
||
segs.append(seg)
|
||
text_parts.append(s.text)
|
||
|
||
# --- Cooperative pause: save checkpoint and abort as soon as pause is requested ---
|
||
if transcribe_paused():
|
||
try:
|
||
pct = int(min(100, max(0, (end / dur) * 100))) if dur > 0 else 0
|
||
except Exception:
|
||
pct = 0
|
||
_save_partial(title, info.language or (WHISPER_LANGUAGE if WHISPER_LANGUAGE.lower() != "auto" else "en"), segs)
|
||
log({
|
||
"status": "paused",
|
||
"path": str(media_path),
|
||
"title": title,
|
||
"progress": pct
|
||
})
|
||
print(f"[pause] transcribe: pause requested mid-run; aborting at ~{end:.2f}s for {media_path}", flush=True)
|
||
raise PauseInterrupt("pause requested")
|
||
|
||
if WHISPER_LOG_SEGMENTS:
|
||
print(f"[whisper] {start:8.2f}–{end:8.2f} {s.text.strip()}", flush=True)
|
||
|
||
# progress logging every +5%
|
||
if dur > 0 and end is not None:
|
||
pct = int(min(100, max(0, (end / dur) * 100)))
|
||
if pct >= last_pct + 5:
|
||
log({
|
||
"status": "transcribing",
|
||
"path": str(media_path),
|
||
"title": title,
|
||
"progress": pct
|
||
})
|
||
last_pct = pct
|
||
|
||
# compute realtime speed and ETA for console logs
|
||
try:
|
||
elapsed = max(0.001, time.time() - start_wall)
|
||
processed = max(0.0, float(end))
|
||
speed = (processed / elapsed) if elapsed > 0 else 0.0 # seconds processed per second
|
||
# represent as X real-time factor
|
||
rtf = speed # 1.0 == real-time
|
||
eta = ((dur - processed) / speed) if (speed > 0 and dur > 0) else 0
|
||
print(f"[whisper] progress {pct:3d}% seg={seg_index:5d} rtf={rtf:0.2f}x eta={_fmt_eta(eta)}", flush=True)
|
||
# also mirror to feed log with speed/eta
|
||
try:
|
||
log({
|
||
"status": "transcribing",
|
||
"path": str(media_path),
|
||
"title": title,
|
||
"progress": pct,
|
||
"speed_rtf": round(rtf, 2),
|
||
"eta_sec": int(max(0, eta))
|
||
})
|
||
except Exception:
|
||
pass
|
||
except Exception:
|
||
pass
|
||
|
||
# periodic partial save
|
||
seg_count_since_save += 1
|
||
if WHISPER_RESUME and seg_count_since_save >= PARTIAL_SAVE_EVERY_SEGS:
|
||
_save_partial(title, info.language or (WHISPER_LANGUAGE if WHISPER_LANGUAGE.lower() != "auto" else "en"), segs)
|
||
seg_count_since_save = 0
|
||
|
||
# ensure we mark 100% on completion
|
||
if last_pct < 100:
|
||
log({"status": "transcribing", "path": str(media_path), "title": title, "progress": 100})
|
||
|
||
txt = " ".join(text_parts).strip()
|
||
|
||
# Write final transcript artifacts
|
||
(base.with_suffix(".json")).write_bytes(orjson.dumps({
|
||
"file": str(media_path),
|
||
"language": info.language,
|
||
"segments": segs
|
||
}))
|
||
(base.with_suffix(".txt")).write_text(txt, encoding="utf-8")
|
||
|
||
def fmt_ts(t):
|
||
h=int(t//3600); m=int((t%3600)//60); s=t-(h*3600+m*60)
|
||
return f"{h:02}:{m:02}:{s:06.3f}".replace('.',',')
|
||
|
||
with open(base.with_suffix(".srt"), "w", encoding="utf-8") as srt:
|
||
for i,s in enumerate(segs,1):
|
||
srt.write(f"{i}\n{fmt_ts(s['start'])} --> {fmt_ts(s['end'])}\n{s['text'].strip()}\n\n")
|
||
|
||
with open(base.with_suffix(".vtt"), "w", encoding="utf-8") as vtt:
|
||
vtt.write("WEBVTT\n\n")
|
||
for s in segs:
|
||
vtt.write(f"{fmt_ts(s['start']).replace(',', '.')} --> {fmt_ts(s['end']).replace(',', '.')} \n{s['text'].strip()}\n\n")
|
||
|
||
# 4) Copy SRT next to media for Plex (language-suffixed)
|
||
try:
|
||
lang_code = (info.language or (WHISPER_LANGUAGE if WHISPER_LANGUAGE.lower() != 'auto' else 'en')).lower()
|
||
srt_src = base.with_suffix(".srt")
|
||
srt_dst = media_path.with_suffix(f".{lang_code}.srt")
|
||
shutil.copy2(srt_src, srt_dst)
|
||
except Exception as e:
|
||
print(f"[post] could not copy srt -> {srt_dst}: {e}", flush=True)
|
||
|
||
# Write Kodi/Plex-compatible NFO using enhanced metadata (same as before)
|
||
try:
|
||
fallback = {
|
||
"title": title,
|
||
"episode_title": title,
|
||
"show": media_path.parent.name,
|
||
"description": "",
|
||
"pubdate": _extract_date_from_stem(title),
|
||
"duration_sec": media_duration_seconds(media_path),
|
||
"image": "",
|
||
"guid": "",
|
||
}
|
||
meta = build_meta_from_sources(media_path, media_path.parent.name, fallback, ep=None)
|
||
ttxt = (TRN / title).with_suffix(".txt").read_text(encoding="utf-8")
|
||
write_episode_nfo(media_path, meta, ttxt)
|
||
try:
|
||
save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
|
||
except Exception:
|
||
pass
|
||
except Exception as e:
|
||
print(f"[post] NFO write failed: {e}", flush=True)
|
||
|
||
# Cleanup temp WAVs
|
||
try:
|
||
if wav_for_run != wav and wav_for_run.exists():
|
||
wav_for_run.unlink()
|
||
if wav.exists():
|
||
wav.unlink()
|
||
except Exception:
|
||
pass
|
||
|
||
# Remove partial checkpoints on success
|
||
if WHISPER_RESUME:
|
||
try:
|
||
pjson, ptxt = _partial_paths(title)
|
||
if pjson.exists(): pjson.unlink()
|
||
if ptxt.exists(): ptxt.unlink()
|
||
except Exception:
|
||
pass
|
||
|
||
# Final average speed over whole transcription
|
||
try:
|
||
total_elapsed = max(0.001, time.time() - start_wall)
|
||
avg_rtf = (dur / total_elapsed) if total_elapsed > 0 else 0.0
|
||
print(f"[whisper] avg speed ~{avg_rtf:0.2f}x (audio_seconds / wall_seconds)", flush=True)
|
||
except Exception:
|
||
pass
|
||
print(f"[whisper] finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s", flush=True)
|
||
return base
|
||
|
||
|
||
# --- Meilisearch helpers ---
|
||
def _safe_doc_id(s: str) -> str:
|
||
"""
|
||
Meilisearch document IDs must be [A-Za-z0-9_-]. Convert the title to a safe slug.
|
||
If the result is empty, fall back to a short SHA1 hash.
|
||
"""
|
||
import hashlib
|
||
slug = re.sub(r"\s+", "_", (s or "").strip())
|
||
slug = re.sub(r"[^A-Za-z0-9_-]", "", slug)
|
||
if not slug:
|
||
slug = hashlib.sha1((s or "").encode("utf-8", errors="ignore")).hexdigest()[:16]
|
||
return slug
|
||
|
||
|
||
def ensure_meili_index():
|
||
"""Create index 'library' with primaryKey 'id' if it does not already exist."""
|
||
try:
|
||
r = requests.get(f"{MEILI_URL}/indexes/library",
|
||
headers={"Authorization": f"Bearer {MEILI_KEY}"}, timeout=10)
|
||
if r.status_code == 200:
|
||
return
|
||
# Attempt to create it
|
||
cr = requests.post(
|
||
f"{MEILI_URL}/indexes",
|
||
headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type": "application/json"},
|
||
data=orjson.dumps({"uid": "library", "primaryKey": "id"}),
|
||
timeout=10,
|
||
)
|
||
# Ignore errors if another process created it first
|
||
try:
|
||
cr.raise_for_status()
|
||
except Exception:
|
||
pass
|
||
except Exception:
|
||
# Non-fatal; indexing will fail later if the index truly doesn't exist
|
||
pass
|
||
|
||
|
||
def index_meili(json_path: Path):
|
||
# Make sure the index exists and is configured with a primary key
|
||
ensure_meili_index()
|
||
|
||
doc = json.loads(open(json_path, "r", encoding="utf-8").read())
|
||
file_field = doc.get("file", "")
|
||
title = Path(file_field).stem if file_field else json_path.stem
|
||
|
||
# Build a Meili-safe document ID
|
||
doc_id = _safe_doc_id(title)
|
||
|
||
# Extract a YYYYMMDD date if present
|
||
m = re.search(r"\b(\d{8})\b", title)
|
||
date = m.group(1) if m else ""
|
||
|
||
payload = {
|
||
"id": doc_id,
|
||
"type": "podcast",
|
||
"title": title,
|
||
"date": date,
|
||
"source": str(Path(LIB, Path(file_field or title).name)),
|
||
"text": " ".join(s.get("text", "") for s in doc.get("segments", [])),
|
||
"segments": doc.get("segments", []),
|
||
"meta": {"language": doc.get("language", "")},
|
||
}
|
||
|
||
for attempt in range(5):
|
||
try:
|
||
r = requests.post(
|
||
f"{MEILI_URL}/indexes/library/documents",
|
||
headers={
|
||
"Authorization": f"Bearer {MEILI_KEY}",
|
||
"Content-Type": "application/json",
|
||
},
|
||
data=orjson.dumps(payload),
|
||
timeout=15,
|
||
)
|
||
r.raise_for_status()
|
||
break
|
||
except Exception:
|
||
if attempt == 4:
|
||
raise
|
||
time.sleep(2 * (attempt + 1))
|
||
|
||
import tldextract, trafilatura, requests as _requests
|
||
|
||
def slugify(text):
|
||
text = re.sub(r'[^A-Za-z0-9\-._ ]+', '', text).strip().replace(' ', '_')
|
||
return text[:120] or 'page'
|
||
|
||
def _norm(s: str | None) -> str:
|
||
"""Normalize strings for stable comparisons across Unicode lookalikes and stray whitespace."""
|
||
if s is None:
|
||
return ""
|
||
try:
|
||
return unicodedata.normalize("NFKC", s).strip()
|
||
except Exception:
|
||
return (s or "").strip()
|
||
|
||
def save_web_snapshot(url: str):
|
||
r = _requests.get(url, timeout=30, headers={"User-Agent":"Mozilla/5.0"})
|
||
r.raise_for_status()
|
||
html = r.text
|
||
downloaded = trafilatura.load_html(html, url=url)
|
||
text = trafilatura.extract(downloaded, include_comments=False, include_images=False, with_metadata=True) or ""
|
||
meta = trafilatura.metadata.extract_metadata(downloaded) or None
|
||
title = (meta.title if meta and getattr(meta, 'title', None) else None) or (re.search(r'<title[^>]*>(.*?)</title>', html, re.I|re.S).group(1).strip() if re.search(r'<title[^>]*>(.*?)</title>', html, re.I|re.S) else url)
|
||
date = (meta.date if meta and getattr(meta, 'date', None) else "")
|
||
parts = tldextract.extract(url)
|
||
domain = ".".join([p for p in [parts.domain, parts.suffix] if p])
|
||
slug = slugify(title)
|
||
outdir = LIB / "web" / domain
|
||
outdir.mkdir(parents=True, exist_ok=True)
|
||
base = outdir / slug
|
||
open(base.with_suffix(".html"), "w", encoding="utf-8", errors="ignore").write(html)
|
||
open(base.with_suffix(".txt"), "w", encoding="utf-8", errors="ignore").write(text)
|
||
return base, title, domain, date, text
|
||
|
||
def index_web(base: Path, title: str, domain: str, date: str, text: str, url: str):
|
||
payload = {
|
||
"id": f"web:{domain}:{base.stem}",
|
||
"type": "web",
|
||
"title": title,
|
||
"date": re.sub(r'[^0-9]', '', date)[:8] if date else "",
|
||
"source": f"file://{str(base.with_suffix('.html'))}",
|
||
"text": text,
|
||
"segments": [],
|
||
"meta": {"url": url, "domain": domain}
|
||
}
|
||
r = requests.post(f"{MEILI_URL}/indexes/library/documents",
|
||
headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type":"application/json"},
|
||
data=orjson.dumps(payload))
|
||
r.raise_for_status()
|
||
|
||
def is_media_url(url: str):
|
||
lowered = url.lower()
|
||
media_hosts = ["youtube.com","youtu.be","rumble.com","vimeo.com","soundcloud.com","spotify.com","podbean.com","buzzsprout.com"]
|
||
return any(h in lowered for h in media_hosts)
|
||
|
||
def owui_headers():
|
||
return {"Authorization": f"Bearer {OWUI_KEY}"} if OWUI_KEY else {}
|
||
|
||
def owui_get_or_create_kb():
|
||
"""Return a KB id for OWUI_KB without creating duplicates.
|
||
Honors OPENWEBUI_KB_ID, and tolerates both list and {"data": ...} response shapes.
|
||
"""
|
||
if not OWUI_URL or not OWUI_KEY:
|
||
return None
|
||
|
||
# 1) If an explicit id is provided, trust it
|
||
forced = os.getenv("OPENWEBUI_KB_ID", "").strip()
|
||
if forced:
|
||
return forced
|
||
|
||
# 2) List and try to find an exact name match
|
||
try:
|
||
r = requests.get(f"{OWUI_URL}/api/v1/knowledge/list", headers=owui_headers(), timeout=15)
|
||
r.raise_for_status()
|
||
body = r.json()
|
||
items = body if isinstance(body, list) else body.get("data", [])
|
||
# Prefer exact normalized name match; if multiple, pick the most recently updated
|
||
kb_target = _norm(OWUI_KB)
|
||
matches = [kb for kb in items if _norm(kb.get("name")) == kb_target]
|
||
if matches:
|
||
try:
|
||
matches.sort(key=lambda k: k.get("updated_at") or 0, reverse=True)
|
||
except Exception:
|
||
pass
|
||
return matches[0].get("id")
|
||
except Exception:
|
||
pass
|
||
|
||
# 3) Create only if not found
|
||
cr = requests.post(
|
||
f"{OWUI_URL}/api/v1/knowledge/create",
|
||
headers={**owui_headers(), "Content-Type": "application/json"},
|
||
data=orjson.dumps({"name": OWUI_KB, "description": "All local content indexed by podx"}),
|
||
timeout=15,
|
||
)
|
||
cr.raise_for_status()
|
||
created = cr.json()
|
||
if isinstance(created, dict) and created.get("id"):
|
||
return created["id"]
|
||
if isinstance(created, dict) and created.get("data") and created["data"].get("id"):
|
||
return created["data"]["id"]
|
||
# Fallback: try to resolve again by name
|
||
try:
|
||
rr = requests.get(f"{OWUI_URL}/api/v1/knowledge/list", headers=owui_headers(), timeout=15)
|
||
rr.raise_for_status()
|
||
body = rr.json()
|
||
items = body if isinstance(body, list) else body.get("data", [])
|
||
kb_target = _norm(OWUI_KB)
|
||
for kb in items:
|
||
if _norm(kb.get("name")) == kb_target:
|
||
return kb.get("id")
|
||
except Exception:
|
||
pass
|
||
return None
|
||
|
||
def owui_upload_and_attach(path: Path, kb_id: str):
|
||
with open(path, "rb") as f:
|
||
r = requests.post(f"{OWUI_URL}/api/v1/files/", headers=owui_headers(), files={"file": (path.name, f)}, timeout=60*10)
|
||
r.raise_for_status()
|
||
up = r.json()
|
||
file_id = (up.get("id") or (up.get("data") or {}).get("id"))
|
||
if not file_id:
|
||
raise RuntimeError(f"OWUI upload: could not get file id from response: {up}")
|
||
r = requests.post(
|
||
f"{OWUI_URL}/api/v1/knowledge/{kb_id}/file/add",
|
||
headers={**owui_headers(), "Content-Type": "application/json"},
|
||
data=orjson.dumps({"file_id": file_id}),
|
||
timeout=180,
|
||
)
|
||
r.raise_for_status()
|
||
try:
|
||
time.sleep(0.5)
|
||
except Exception:
|
||
pass
|
||
return True
|
||
|
||
def publish_to_openwebui(paths):
|
||
if not OWUI_URL or not OWUI_KEY:
|
||
return
|
||
try:
|
||
kb_id = owui_get_or_create_kb()
|
||
if not kb_id:
|
||
print("[owui] KB resolve failed; skipping attach to avoid accidental duplicates", flush=True)
|
||
return
|
||
for p in paths:
|
||
p = Path(p)
|
||
if not p.exists():
|
||
continue
|
||
try:
|
||
owui_upload_and_attach(p, kb_id)
|
||
except Exception as e:
|
||
log({"url": str(p), "status": "owui_error", "error": str(e)})
|
||
except Exception as e:
|
||
log({"status": "owui_error", "error": str(e)})
|
||
|
||
# --------- Post-transcribe pipeline and job/queue helpers ---------
|
||
|
||
def _postprocess_after_transcribe(media_path: Path, base: Path):
|
||
"""Common steps after we have a `base` transcript path: index, publish, NFO, artwork."""
|
||
try:
|
||
index_meili(base.with_suffix(".json"))
|
||
except Exception as e:
|
||
print(f"[post] meili index failed: {e}", flush=True)
|
||
try:
|
||
publish_to_openwebui([base.with_suffix(".txt")])
|
||
except Exception as e:
|
||
print(f"[post] owui publish failed: {e}", flush=True)
|
||
# Build metadata using existing helper
|
||
try:
|
||
title = media_path.stem
|
||
fallback = {
|
||
"title": title,
|
||
"episode_title": title,
|
||
"show": media_path.parent.name,
|
||
"description": "",
|
||
"pubdate": _extract_date_from_stem(title),
|
||
"duration_sec": media_duration_seconds(media_path),
|
||
"image": "",
|
||
"guid": "",
|
||
}
|
||
meta = build_meta_from_sources(media_path, media_path.parent.name, fallback, ep=None)
|
||
ttxt = (TRN / title).with_suffix(".txt").read_text(encoding="utf-8")
|
||
write_episode_nfo(media_path, meta, ttxt)
|
||
try:
|
||
save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
|
||
except Exception:
|
||
pass
|
||
except Exception as e:
|
||
print(f"[post] NFO write failed: {e}", flush=True)
|
||
|
||
def transcribe_job(path_str: str):
|
||
"""RQ job: heavy transcription only. Safe to import by dotted path 'worker.transcribe_job'."""
|
||
# Do NOT block when paused; exit quickly so CPU-heavy work stops ASAP.
|
||
if transcribe_paused():
|
||
print(f"[pause] transcribe_job: pause is active; skipping start for {path_str}", flush=True)
|
||
return "paused"
|
||
p = Path(path_str)
|
||
try:
|
||
base = transcribe(p)
|
||
except PauseInterrupt:
|
||
print(f"[pause] transcribe_job: paused mid-run for {p}", flush=True)
|
||
return "paused"
|
||
_postprocess_after_transcribe(p, base)
|
||
return str(base)
|
||
|
||
def enqueue_transcribe(path: Path) -> bool:
|
||
"""Enqueue a transcription job to the 'transcribe' queue. Returns True on success."""
|
||
try:
|
||
if transcribe_paused():
|
||
print(f"[queue] pause flag present; enqueuing job for {path} but workers will wait", flush=True)
|
||
conn = Redis.from_url(REDIS_URL)
|
||
q = Queue("transcribe", connection=conn, default_timeout=60*60*24)
|
||
# Use dotted path so workers in other processes can import
|
||
q.enqueue("worker.transcribe_job", str(path), job_timeout=60*60*24)
|
||
print(f"[queue] enqueued transcribe job for {path}", flush=True)
|
||
return True
|
||
except Exception as e:
|
||
print(f"[queue] enqueue failed, will transcribe inline: {e}", flush=True)
|
||
return False
|
||
|
||
def handle_local_file(path_str: str):
|
||
"""Transcribe & index a local media file that already exists in /library.
|
||
If a sidecar .txt/.srt/.vtt exists, use it instead of running Whisper.
|
||
Safe to call repeatedly; it skips if transcript JSON already exists.
|
||
"""
|
||
try:
|
||
p = Path(path_str)
|
||
if not p.exists():
|
||
log({"url": path_str, "status": "error", "error": "file_not_found"})
|
||
return
|
||
if WORKER_MODE == "transcribe":
|
||
print(f"[mode] transcribe-only worker handling local file: {p}", flush=True)
|
||
|
||
title = p.stem
|
||
base_json = TRN / f"{title}.json"
|
||
if base_json.exists():
|
||
log({"url": path_str, "status": "skip", "reason": "already_transcribed"})
|
||
return
|
||
|
||
info = {"url": path_str, "status": "transcribing", "title": title,
|
||
"uploader": p.parent.name, "date": "", "path": str(p), "progress": 0}
|
||
log(info)
|
||
|
||
# 0) Try RSS resolver first: if episode with transcript exists, use it (skip Whisper)
|
||
try:
|
||
ep = match_media_to_rss(p)
|
||
except Exception as _e:
|
||
ep = None
|
||
if ep:
|
||
base = use_rss_transcript(p, ep)
|
||
if base:
|
||
index_meili(base.with_suffix(".json"))
|
||
publish_to_openwebui([base.with_suffix(".txt")])
|
||
log({**info, **{"status": "done", "note": "used_rss_transcript"}})
|
||
return
|
||
|
||

        # 1) Prefer an existing transcript sidecar if present
        sidecar = find_sidecar_transcript(p)
        if sidecar:
            plain = transcript_text_from_file(sidecar)
            lang = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
            base = write_plain_transcript(p, plain, language=lang)
            ensure_sidecar_next_to_media(sidecar, p, lang=lang)
            index_meili(base.with_suffix(".json"))
            publish_to_openwebui([base.with_suffix(".txt")])
            try:
                # Use info.json (if present) to enrich metadata
                fallback = {
                    "title": title,
                    "episode_title": title,
                    "show": p.parent.name,
                    "description": "",
                    "pubdate": _extract_date_from_stem(title),
                    "duration_sec": media_duration_seconds(p),
                    "image": "",
                    "guid": "",
                }
                meta = build_meta_from_sources(p, p.parent.name, fallback, ep=None)
                ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
                write_episode_nfo(p, meta, ttxt)
                # Try to fetch and save artwork locally
                try:
                    save_episode_artwork(meta.get("image"), p, meta.get("show"))
                except Exception:
                    pass
            except Exception as e:
                print(f"[post] NFO write failed: {e}", flush=True)
            log({**info, **{"status": "done", "note": "used_existing_transcript"}})
            return

        # 1.5) Reuse a transcript that exists in the repository for a matching episode
        repo_json = find_repo_transcript_for_media(p)
        if repo_json:
            base = reuse_repo_transcript(p, repo_json)
            if base:
                index_meili(base.with_suffix(".json"))
                publish_to_openwebui([base.with_suffix(".txt")])
                try:
                    data = json.loads((base.with_suffix(".json")).read_text(encoding="utf-8"))
                    # Start with repo metadata, then enrich from yt-dlp info.json if any
                    meta_repo = {
                        "title": data.get("title") or title,
                        "episode_title": data.get("title") or title,
                        "show": data.get("show") or p.parent.name,
                        "description": data.get("description") or "",
                        "pubdate": data.get("pubdate") or _extract_date_from_stem(title),
                        "duration_sec": media_duration_seconds(p),
                        "image": data.get("image"),
                        "guid": data.get("guid") or data.get("id"),
                    }
                    meta = build_meta_from_sources(p, p.parent.name, meta_repo, ep=None)
                    ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
                    write_episode_nfo(p, meta, ttxt)
                    try:
                        save_episode_artwork(meta.get("image"), p, meta.get("show"))
                    except Exception:
                        pass
                except Exception as e:
                    print(f"[post] NFO write failed: {e}", flush=True)
                log({**info, **{"status": "done", "note": "reused_repo_transcript"}})
                return

        # 2) Otherwise, run transcription (offload to queue if enabled and not in transcribe-only worker)
        # If paused, do not block; either enqueue (so the worker will pause) or skip for now.
        if transcribe_paused():
            if OFFLOAD_TRANSCRIBE and WORKER_MODE != "transcribe" and enqueue_transcribe(p):
                log({**info, **{"status": "queued_transcribe"}})
                return
            log({**info, **{"status": "paused"}})
            print(f"[pause] handle_local_file: pause active; not starting {p}", flush=True)
            return
        if OFFLOAD_TRANSCRIBE and WORKER_MODE != "transcribe" and enqueue_transcribe(p):
            log({**info, **{"status": "queued_transcribe"}})
            return
        base = transcribe(p)
        _postprocess_after_transcribe(p, base)
        log({**info, **{"status": "done"}})
    except Exception as e:
        log({"url": path_str, "status": "error", "error": str(e)})
        raise
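
# Illustrative only: handle_local_file() is written so it can be called directly
# or enqueued by dotted path like the other jobs in this module. The queue name
# below is an assumption for the sketch, not something this file defines.
#
#     from redis import Redis
#     from rq import Queue
#     Queue("default", connection=Redis.from_url(REDIS_URL)).enqueue(
#         "worker.handle_local_file", "/library/Some Show/Some Episode.mp3")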

# --- Refresh sidecar metadata and subtitles for an already-downloaded media file ---
def refresh_media(path_str: str):
    """
    Refresh sidecar metadata (info.json, thumbnail) and subtitles for an already-downloaded media file.
    Requires a companion .info.json next to the media (to supply the original URL). No media re-download.
    """
    try:
        p = Path(path_str)
        if not p.exists() or not p.is_file():
            log({"url": path_str, "status": "error", "error": "file_not_found"})
            return

        # Locate existing info.json to get the original URL
        info_json = None
        for cand in [p.parent / f"{p.name}.info.json", p.parent / f"{p.stem}.info.json"]:
            if cand.exists():
                info_json = cand
                break

        if not info_json:
            log({"path": str(p), "status": "refresh-skip", "reason": "no_info_json"})
            print(f"[refresh] skip: no info.json next to {p}", flush=True)
            return

        info = load_info_json(info_json) or {}
        url = info.get("webpage_url") or info.get("original_url") or info.get("url")
        if not url:
            log({"path": str(p), "status": "refresh-skip", "reason": "no_url_in_info"})
            print(f"[refresh] skip: no URL in {info_json}", flush=True)
            return

        # Prepare yt-dlp command to refresh sidecars only, writing files exactly next to the media
        outtmpl = str(p.with_suffix(".%(ext)s"))
        sub_langs = os.getenv("YTDLP_SUBS_LANGS", "en.*,en")

        cmd = [
            "yt-dlp",
            "--skip-download",
            "--write-info-json",
            "--write-thumbnail",
            "--convert-thumbnails", "jpg",
            "--write-subs", "--write-auto-subs",
            "--sub-langs", sub_langs,
            "--convert-subs", "srt",
            "-o", outtmpl,
            url,
        ]

        print(f"[refresh] refreshing sidecars for {p} via yt-dlp", flush=True)
        try:
            subprocess.check_call(cmd)
        except subprocess.CalledProcessError as e:
            print(f"[refresh] yt-dlp failed: {e}", flush=True)
            raise
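
        # With the output template above ("<media stem>.%(ext)s"), yt-dlp is expected
        # to drop sidecars next to the media, e.g. for "/lib/Show/Ep.mp3":
        #   /lib/Show/Ep.info.json   (--write-info-json)
        #   /lib/Show/Ep.jpg         (--write-thumbnail + --convert-thumbnails jpg)
        #   /lib/Show/Ep.en.srt      (--write-subs/--write-auto-subs + --convert-subs srt)
        # Exact subtitle names depend on which languages yt-dlp finds; the paths here
        # are illustrative, not guaranteed.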

        # Ensure a language-suffixed SRT exists (Plex-friendly) if any subs were fetched
        try:
            # Pick any .srt just fetched that matches the base name
            for s in p.parent.glob(f"{p.stem}*.srt"):
                # Already language-suffixed files are kept as-is; when only a plain
                # "<stem>.srt" exists, copy it to "<stem>.en.srt" as well
                if s.name == f"{p.stem}.srt":
                    shutil.copy2(s, p.with_suffix(".en.srt"))
        except Exception:
            pass

        # Rebuild NFO using fresh info.json (and RSS if available)
        try:
            # Try RSS match to enrich metadata (non-fatal if not present)
            ep = None
            try:
                ep = match_media_to_rss(p)
            except Exception:
                ep = None

            fallback = {
                "title": p.stem,
                "episode_title": p.stem,
                "show": p.parent.name,
                "description": "",
                "pubdate": _extract_date_from_stem(p.stem),
                "duration_sec": media_duration_seconds(p),
                "image": "",
                "guid": "",
            }
            meta = build_meta_from_sources(p, p.parent.name, fallback, ep)
            # Save local artwork too
            try:
                save_episode_artwork(meta.get("image"), p, meta.get("show"))
            except Exception:
                pass

            # If a transcript already exists, include it in the NFO plot preview
            ttxt_path = (TRN / p.stem).with_suffix(".txt")
            ttxt = ttxt_path.read_text(encoding="utf-8") if ttxt_path.exists() else None
            write_episode_nfo(p, meta, ttxt)
        except Exception as e:
            print(f"[refresh] NFO/artwork update failed: {e}", flush=True)

        log({"path": str(p), "status": "refresh-done"})
        print(f"[refresh] done for {p}", flush=True)

    except Exception as e:
        log({"path": path_str, "status": "error", "error": str(e)})
        raise
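
# Illustrative only: refresh_media() takes the path of a media file already in the
# library; it can be called directly or enqueued by dotted path like the other
# handlers here (whether refreshes go through a queue is an assumption).
#
#     refresh_media("/library/Some Show/Some Episode.mp3")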

def handle_web(url: str):
    if not _mode_allows("web"):
        log({"url": url, "status": "skip", "reason": "mode_transcribe_only"})
        print(f"[mode] transcribe-only: skipping web snapshot job: {url}", flush=True)
        return
    info = {"url": url, "status":"web-downloading", "title":"", "uploader":"", "date":"", "path":""}
    log(info)
    base, title, domain, date, text = save_web_snapshot(url)
    info.update({"title": title, "uploader": domain, "date": date, "path": str(base.with_suffix('.html'))})
    log({**info, **{"status":"web-indexing"}})
    index_web(base, title, domain, date, text, url)
    push = [p for p in [base.with_suffix('.txt'), base.with_suffix('.html')] if p.exists()]
    publish_to_openwebui(push)
    log({**info, **{"status":"done"}})

def handle_url(url: str):
    try:
        # In transcribe-only mode, refuse non-local/download jobs
        if not _mode_allows("download"):
            # Only permit local file paths in this mode
            if url.startswith("/") or url.startswith("file://"):
                return handle_local_file(url[7:] if url.startswith("file://") else url)
            log({"url": url, "status": "skip", "reason": "mode_transcribe_only"})
            print(f"[mode] transcribe-only: skipping non-local job: {url}", flush=True)
            return
        # If a local file path (or file:// URL) is provided, process it directly
        if url.startswith("file://"):
            return handle_local_file(url[7:])
        if url.startswith("/") and Path(url).exists():
            return handle_local_file(url)
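
        # Payload routing (illustrative examples, not an exhaustive contract):
        #   "file:///library/Show/Ep.mp3"  -> handle_local_file("/library/Show/Ep.mp3")
        #   "/library/Show/Ep.mp3"         -> handle_local_file(...) if the path exists
        #   "https://example.com/article"  -> handle_web(...) when not a media URL
        #   "https://example.com/video"    -> yt-dlp download + transcription below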

        if not is_media_url(url):
            handle_web(url)
            return
        info = {"url": url, "status":"queued", "title":"", "uploader":"", "date":"", "path":""}
        log({**info, **{"status":"downloading"}})
        files = yt_dlp(url, TMP)
        for f in files:
            parts = f.relative_to(TMP).parts
            uploader = sanitize(parts[0]) if len(parts)>1 else "Unknown"
            dest_dir = LIB / uploader
            dest_dir.mkdir(parents=True, exist_ok=True)
            dest = dest_dir / sanitize(f.name)
            shutil.move(str(f), dest)
            # Move companion files produced by yt-dlp (info.json, thumbnail, subtitles)
            try:
                companions = find_companion_files(f)
                # info.json -> prefer "<dest.name>.info.json", fallback to "<dest.stem>.info.json"
                if companions.get("info") and companions["info"].exists():
                    dest_info = dest.parent / f"{dest.name}.info.json"
                    try:
                        shutil.move(str(companions["info"]), dest_info)
                    except Exception:
                        # fallback naming without extension
                        dest_info2 = dest.parent / f"{dest.stem}.info.json"
                        try:
                            shutil.move(str(companions['info']), dest_info2)
                        except Exception:
                            pass
                # thumbnail -> "<dest>.jpg"
                if companions.get("thumb") and companions["thumb"].exists():
                    try:
                        shutil.move(str(companions["thumb"]), str(dest.with_suffix(".jpg")))
                    except Exception:
                        pass
                # subtitles -> preserve language suffix: "<dest.stem><suffix>"
                for s in companions.get("subs", []):
                    if not s.exists():
                        continue
                    suffix_tail = ""
                    s_name = s.name
                    f_stem = f.stem
                    if s_name.startswith(f_stem):
                        suffix_tail = s_name[len(f_stem):]  # includes leading dot if present
                    else:
                        suffix_tail = s.suffix
                    dest_sub = dest.parent / f"{dest.stem}{suffix_tail}"
                    try:
                        shutil.move(str(s), str(dest_sub))
                    except Exception:
                        pass
            except Exception:
                pass
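
            # Example of the suffix handling above (illustrative filenames): a download
            # named "Ep.mp4" with subtitle "Ep.en.srt" keeps its ".en.srt" tail, so the
            # moved file becomes "<dest.stem>.en.srt" next to the media in the library.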
            info.update({"title": dest.stem, "uploader": uploader,
                         "date": (re.findall(r"\b(\d{8})\b", dest.stem)[0] if re.findall(r"\b(\d{8})\b", dest.stem) else ""),
                         "path": str(dest)})
            log({**info, **{"status":"transcribing", "progress": 0}})
            # Try RSS transcript resolver first
            ep = None
            try:
                ep = match_media_to_rss(dest)
            except Exception:
                ep = None
            if ep:
                base = use_rss_transcript(dest, ep)
            else:
                base = None
            # 1.5) If we didn't get an RSS transcript and there is a matching one already in the repo, reuse it
            if not base:
                repo_json = find_repo_transcript_for_media(dest)
                if repo_json:
                    base = reuse_repo_transcript(dest, repo_json)
            if not base:
                # If paused, do not block; either enqueue (so the worker will pause) or skip for now.
                if transcribe_paused():
                    if OFFLOAD_TRANSCRIBE and WORKER_MODE != "transcribe" and enqueue_transcribe(dest):
                        log({**info, **{"status": "queued_transcribe"}})
                        continue
                    log({**info, **{"status": "paused"}})
                    print(f"[pause] handle_url: pause active; not starting {dest}", flush=True)
                    continue
                if OFFLOAD_TRANSCRIBE and WORKER_MODE != "transcribe" and enqueue_transcribe(dest):
                    log({**info, **{"status": "queued_transcribe"}})
                    continue
                base = transcribe(dest)
            _postprocess_after_transcribe(dest, base)
            log({**info, **{"status":"done"}})
    except Exception as e:
        log({"url": url, "status":"error", "error": str(e)})
        raise