Adding second worker

2025-09-07 17:30:45 +02:00
parent f4b456ccae
commit bd846a8d9f
3 changed files with 560 additions and 50 deletions

View File

@@ -20,6 +20,11 @@ MEDIA_EXTS = {".mp3", ".m4a", ".flac", ".wav", ".ogg", ".opus", ".mp4", ".m4v",
# Fuzzy title match threshold for media ↔ transcript pairing
TITLE_MATCH_THRESHOLD = float(os.getenv("RSS_TITLE_MATCH_THRESHOLD", "0.60"))
# Download podcast audio (enclosures) to a local library
PODCASTS_ROOT = Path(os.getenv("PODCASTS_ROOT", str(LIB / "Podcasts")))
PODCASTS_PER_SHOW = os.getenv("PODCASTS_PER_SHOW", "true").lower() in {"1","true","yes","y"}
DOWNLOAD_AUDIO = os.getenv("RSS_DOWNLOAD_AUDIO", "true").lower() in {"1","true","yes","y"}
# Namespace map (extend as needed)
NS = {
"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
@@ -31,6 +36,7 @@ NS = {
TRN.mkdir(parents=True, exist_ok=True)
OUT_INDEX.parent.mkdir(parents=True, exist_ok=True)
PODCASTS_ROOT.mkdir(parents=True, exist_ok=True)
def _text(el):
@@ -235,6 +241,7 @@ def parse_feed(feed_url: str):
duration_sec = _parse_duration(dur) or None
enclosure = _find_ns(it, "enclosure")
audio_url = enclosure.get("url") if enclosure is not None else ""
audio_type = enclosure.get("type") if enclosure is not None else ""
if not audio_url:
for mc in _findall_ns(it, "media:content"):
@@ -253,6 +260,7 @@ def parse_feed(feed_url: str):
"date": date,
"duration_sec": duration_sec,
"audio_url": audio_url,
"audio_type": audio_type,
"language": DEFAULT_LANG,
"transcripts": transcripts,
}
@@ -276,6 +284,40 @@ def parse_feed(feed_url: str):
if created:
t["sidecars"] = created
# Optionally download podcast audio locally
local_audio_path = None
if DOWNLOAD_AUDIO and audio_url:
show_dir = PODCASTS_ROOT / _slug(show_title or "Podcast") if PODCASTS_PER_SHOW else PODCASTS_ROOT
base_name = f"{(date or '00000000')} - {_slug(title or guid or 'episode')}"
ext = _guess_audio_ext(audio_type, audio_url)
target = (show_dir / base_name).with_suffix(ext)
# Avoid re-downloading if the file already exists
if not target.exists():
saved = _download_stream(audio_url, target)
if saved is None:
# Try a non-streaming fallback
saved = _download(audio_url, target)
else:
saved = target
if saved and saved.exists():
local_audio_path = saved
# If we previously downloaded transcript sidecars, try to place them next to this audio
for t in item_rec.get("transcripts", []) or []:
lp = t.get("local_path")
if lp:
try:
lp = Path(lp)
if lp.exists() and lp.suffix.lower() in {'.srt','.vtt','.txt'}:
sc = _sidecar_path_for(local_audio_path, t.get('language') or DEFAULT_LANG, lp.suffix.lower())
if not sc.exists():
sc.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(lp, sc)
t.setdefault("sidecars", []).append(str(sc))
except Exception:
pass
if local_audio_path:
item_rec["local_audio"] = str(local_audio_path)
items.append(item_rec)
return {"feed_url": feed_url, "show": show_title, "episodes": items}

View File

@@ -2,6 +2,7 @@ import os, subprocess, shutil, json, re, orjson, requests
from pathlib import Path
import math
import difflib
import time
from faster_whisper import WhisperModel
from xml.sax.saxutils import escape as xml_escape
@@ -11,10 +12,21 @@ MEILI_KEY = os.getenv("MEILI_KEY", "")
LIB = Path(os.getenv("LIBRARY_ROOT", "/library"))
TRN = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
TMP = Path(os.getenv("TMP_ROOT", "/tmpdl"))
MODEL_NAME = os.getenv("WHISPER_MODEL","large-v3")
COMPUTE = os.getenv("WHISPER_PRECISION","int8")
WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "auto").strip()
# Whisper device/config controls
WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "auto").strip()
WHISPER_DEVICE_INDEX = int(os.getenv("WHISPER_DEVICE_INDEX", "0"))
WHISPER_CPU_THREADS = int(os.getenv("WHISPER_CPU_THREADS", "4"))
# Whisper logging & resume controls
WHISPER_LOG_SEGMENTS = os.getenv("WHISPER_LOG_SEGMENTS", "1") not in ("0", "false", "False")
WHISPER_RESUME = os.getenv("WHISPER_RESUME", "1") not in ("0", "false", "False")
PARTIAL_SAVE_EVERY_SEGS = int(os.getenv("WHISPER_PARTIAL_SAVE_EVERY_SEGS", "20"))
# RSS resolver config
RSS_INDEX_PATH = Path(os.getenv("RSS_INDEX_PATH", "/transcripts/rss_index.json"))
RSS_DURATION_TOLERANCE = int(os.getenv("RSS_DURATION_TOLERANCE", "150")) # seconds
@@ -34,9 +46,55 @@ _model = None
def get_model():
global _model
if _model is None:
print(f"[whisper] loading model='{MODEL_NAME}' device='{WHISPER_DEVICE}' idx={WHISPER_DEVICE_INDEX} compute='{COMPUTE}' threads={WHISPER_CPU_THREADS}", flush=True)
_model = WhisperModel(
MODEL_NAME,
device=WHISPER_DEVICE,
device_index=WHISPER_DEVICE_INDEX,
compute_type=COMPUTE,
cpu_threads=WHISPER_CPU_THREADS,
)
return _model
# --- Helper: Reset model with new device and device_index ---
def reset_model(device: str, device_index: int | None = None):
"""Reset the global _model to a new WhisperModel with the given device and device_index."""
global _model
idx = device_index if device_index is not None else WHISPER_DEVICE_INDEX
print(f"[whisper] resetting model='{MODEL_NAME}' device='{device}' idx={idx} compute='{COMPUTE}' threads={WHISPER_CPU_THREADS}", flush=True)
_model = WhisperModel(
MODEL_NAME,
device=device,
device_index=idx,
compute_type=COMPUTE,
cpu_threads=WHISPER_CPU_THREADS,
)
# --- Helper: Run transcribe with fallback to CPU on GPU/oom errors ---
def run_transcribe_with_fallback(wav_path: Path, lang):
"""
Try to transcribe with current model; on GPU/CUDA/HIP/ROCm/OOM errors, reset to CPU and retry once.
Returns (segments, info) or raises exception.
"""
model = get_model()
try:
return model.transcribe(str(wav_path), vad_filter=True, language=lang)
except Exception as e:
msg = str(e)
gpu_errs = [
"CUDA", "cublas", "out of memory", "HIP", "ROCm", "device-side assert", "CUDNN", "cudaError", "cuda runtime", "cudaMalloc"
]
if any(err.lower() in msg.lower() for err in gpu_errs):
print(f"[whisper] GPU error detected: '{msg}'. Retrying on CPU...", flush=True)
reset_model("cpu", 0)
try:
model = get_model()
return model.transcribe(str(wav_path), vad_filter=True, language=lang)
except Exception as e2:
print(f"[whisper] CPU fallback also failed: {e2}", flush=True)
raise
raise
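Note that reset_model swaps the global _model in place, so after one GPU failure every later get_model() call returns the CPU instance until the process restarts. Callers therefore need no device handling of their own; a hypothetical invocation:

segments, info = run_transcribe_with_fallback(Path("/tmpdl/episode.wav"), lang=None)  # path is illustrative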
def log(feed):
try:
with open(TRN / "_feed.log", "a", encoding="utf-8") as f:
@@ -216,6 +274,11 @@ def use_rss_transcript(media_path: Path, ep: dict) -> Path | None:
txt_path = base.with_suffix(".txt")
transcript_text = txt_path.read_text(encoding="utf-8") if txt_path.exists() else None
write_episode_nfo(media_path, meta, transcript_text)
# Save local artwork for Plex/Kodi
try:
save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
except Exception:
pass
except Exception as e:
print(f"[post] NFO write failed: {e}", flush=True)
return base
@@ -351,6 +414,11 @@ def reuse_repo_transcript(media_path: Path, repo_json: Path) -> Path | None:
txtp = new_base.with_suffix(".txt")
ttxt = txtp.read_text(encoding="utf-8") if txtp.exists() else None
write_episode_nfo(media_path, meta, ttxt)
# Save local artwork for Plex/Kodi
try:
save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
except Exception:
pass
except Exception as e:
print(f"[post] NFO write failed: {e}", flush=True)
@@ -403,6 +471,161 @@ def ensure_sidecar_next_to_media(sidecar: Path, media_path: Path, lang: str = "e
print(f"[post] sidecar copy/convert failed: {e}", flush=True)
# --- small helpers for progress/ETA formatting ---
def _fmt_eta(sec: float) -> str:
try:
sec = max(0, int(sec))
h, rem = divmod(sec, 3600)
m, s = divmod(rem, 60)
if h:
return f"{h}h {m}m {s}s"
if m:
return f"{m}m {s}s"
return f"{s}s"
except Exception:
return ""
def save_episode_artwork(image_url: str | None, media_path: Path, show_title: str | None = None):
"""Download episode artwork from image_url and save next to the media as '<basename>.jpg'.
Also drop a folder-level 'poster.jpg' for the show directory if not present.
Best-effort; failures are logged but non-fatal.
"""
if not image_url:
return
try:
resp = requests.get(image_url, timeout=30, stream=True)
resp.raise_for_status()
# Determine content-type and write a temporary file
ctype = (resp.headers.get("Content-Type") or "").lower()
tmp_file = media_path.with_suffix(".art.tmp")
with open(tmp_file, "wb") as out:
for chunk in resp.iter_content(chunk_size=8192):
if chunk:
out.write(chunk)
# Always provide a .jpg next to the media for Plex
episode_jpg = media_path.with_suffix(".jpg")
if "image/jpeg" in ctype:
# Already JPEG
shutil.move(str(tmp_file), str(episode_jpg))
else:
# Try converting to JPEG with ffmpeg; if it fails, keep bytes as-is
try:
subprocess.run(
["ffmpeg", "-nostdin", "-y", "-i", str(tmp_file), str(episode_jpg)],
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True
)
try:
tmp_file.unlink()
except Exception:
pass
except Exception:
shutil.move(str(tmp_file), str(episode_jpg))
# Also drop a folder poster once per show (helps Plex folder views)
try:
show_poster = media_path.parent / "poster.jpg"
if not show_poster.exists():
shutil.copy2(episode_jpg, show_poster)
except Exception:
pass
except Exception as e:
print(f"[post] artwork download failed: {e}", flush=True)
def find_companion_files(src: Path) -> dict:
"""Return likely yt-dlp companion files for a downloaded media file."""
out = {}
# info.json can be either "<name>.<ext>.info.json" or "<name>.info.json"
cands_info = [
src.parent / f"{src.name}.info.json",
src.parent / f"{src.stem}.info.json",
]
out["info"] = next((p for p in cands_info if p.exists()), None)
# thumbnails may be "<name>.<ext>.jpg" or "<name>.jpg" (we convert to jpg)
cand_thumbs = [
src.parent / f"{src.name}.jpg",
src.parent / f"{src.stem}.jpg",
src.parent / f"{src.stem}.jpeg",
src.parent / f"{src.stem}.png",
src.parent / f"{src.stem}.webp",
]
out["thumb"] = next((p for p in cand_thumbs if p.exists()), None)
# subtitles (keep multiple)
subs = []
for s in src.parent.glob(f"{src.stem}*.srt"):
subs.append(s)
for s in src.parent.glob(f"{src.stem}*.vtt"):
subs.append(s)
out["subs"] = subs
return out
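A quick illustration of the candidate order, under a hypothetical yt-dlp output layout:

# Hypothetical layout:
#   /library/Show/Ep.m4a
#   /library/Show/Ep.m4a.info.json   -> matched by the first info candidate
#   /library/Show/Ep.jpg             -> picked up as the thumbnail
#   /library/Show/Ep.en.srt          -> collected by the "Ep*.srt" glob
companions = find_companion_files(Path("/library/Show/Ep.m4a"))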
def load_info_json(path: Path) -> dict | None:
try:
return json.loads(path.read_text(encoding="utf-8"))
except Exception:
return None
def _iso_from_yyyymmdd(s: str | None) -> str | None:
if not s or not re.match(r"^\d{8}$", s):
return None
return f"{s[0:4]}-{s[4:6]}-{s[6:8]}"
def build_meta_from_sources(media_path: Path, uploader: str, fallback_meta: dict, ep: dict | None = None) -> dict:
"""
Merge metadata from (priority): RSS episode `ep` -> yt-dlp info.json (if present) -> fallback.
Returns a dict compatible with write_episode_nfo().
"""
# Start with fallback
meta = dict(fallback_meta)
# Augment from info.json if present
info = None
for cand in [
media_path.parent / f"{media_path.name}.info.json",
media_path.parent / f"{media_path.stem}.info.json",
]:
if cand.exists():
info = load_info_json(cand)
break
if info:
meta.setdefault("title", info.get("title"))
meta.setdefault("episode_title", info.get("title"))
meta.setdefault("description", info.get("description") or info.get("fulltitle"))
# upload_date is YYYYMMDD
iso = _iso_from_yyyymmdd(info.get("upload_date"))
if iso:
meta["pubdate_iso"] = iso
# Prefer video duration if present
if not meta.get("duration_sec") and info.get("duration"):
meta["duration_sec"] = info.get("duration")
# thumbnail URL
if not meta.get("image"):
meta["image"] = info.get("thumbnail")
# show/uploader
if not meta.get("show"):
meta["show"] = info.get("uploader") or uploader
# Finally, layer RSS data on top if available (most authoritative for podcasts)
if ep:
meta.update({
"title": ep.get("title") or meta.get("title"),
"episode_title": ep.get("title") or meta.get("episode_title"),
"show": ep.get("podcast_title") or ep.get("feed_title") or ep.get("show") or meta.get("show") or uploader,
"description": ep.get("description") or ep.get("content") or meta.get("description", ""),
"pubdate": ep.get("pubdate") or meta.get("pubdate", ""),
"pubdate_iso": ep.get("date_iso") or meta.get("pubdate_iso", meta.get("pubdate")),
"duration_sec": ep.get("duration_sec") or ep.get("duration") or meta.get("duration_sec"),
"image": ep.get("image") or ep.get("image_url") or meta.get("image", ""),
"guid": ep.get("guid") or meta.get("guid", ""),
})
return meta
# ---------- Kodi/Plex NFO writer ----------
from datetime import datetime
@@ -505,8 +728,12 @@ def yt_dlp(url, outdir):
base_cmd = [
"yt-dlp", "-o", outtmpl,
"-f", "bv*+ba/best",
"-x", "--audio-format", "m4a",
"--write-info-json",
"--write-thumbnail",
"--convert-thumbnails", "jpg",
"--write-subs", "--write-auto-subs",
"--sub-langs", os.getenv("YTDLP_SUBS_LANGS", "en.*,en"),
"--convert-subs", "srt",
"--no-playlist", "--no-warnings", "--restrict-filenames",
]
@@ -527,10 +754,13 @@ def yt_dlp(url, outdir):
]
subprocess.check_call(retry_cmd)
media = (
list(outdir.rglob("*.[mM][pP]4")) +
list(outdir.rglob("*.mkv")) +
list(outdir.rglob("*.webm")) +
list(outdir.rglob("*.m4a")) +
list(outdir.rglob("*.mp3"))
)
return sorted(media, key=lambda p: p.stat().st_mtime)[-1:]
def extract_audio(src: Path, outdir: Path) -> Path:
@@ -550,6 +780,27 @@ def extract_audio(src: Path, outdir: Path) -> Path:
raise RuntimeError(f"ffmpeg extract failed: {e.output.decode(errors='ignore')}")
return wav_path
# --- WAV trimming helper ---
def trim_wav(src_wav: Path, start_sec: float, outdir: Path) -> Path:
"""Return a trimmed 16k mono WAV starting at start_sec from src_wav."""
outdir.mkdir(parents=True, exist_ok=True)
if not start_sec or start_sec <= 0.0:
return src_wav
dst = outdir / (src_wav.stem + f".from_{int(start_sec)}s.wav")
try:
subprocess.check_output([
"ffmpeg", "-nostdin", "-y",
"-ss", str(max(0.0, float(start_sec))),
"-i", str(src_wav),
"-vn", "-ac", "1", "-ar", "16000",
"-f", "wav", str(dst),
], stderr=subprocess.STDOUT)
return dst
except subprocess.CalledProcessError as e:
# If trimming fails, fall back to full file
print(f"[whisper] trim failed, using full WAV: {e.output.decode(errors='ignore')}", flush=True)
return src_wav
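Putting -ss before -i makes ffmpeg seek on the input side, which is fast and skips decoding the already-transcribed prefix, while -ac 1 -ar 16000 keeps the trimmed file in the 16 kHz mono format the Whisper pipeline expects.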
def media_duration_seconds(path: Path) -> float:
"""Return duration in seconds using ffprobe; fallback to 0.0 on error."""
try:
@@ -561,32 +812,91 @@ def media_duration_seconds(path: Path) -> float:
except Exception:
return 0.0
# --- Partial transcript helpers ---
def _partial_paths(title: str) -> tuple[Path, Path]:
base = TRN / title
return base.with_suffix(".partial.json"), base.with_suffix(".partial.txt")
def _save_partial(title: str, language: str, segs: list[dict]):
pjson, ptxt = _partial_paths(title)
try:
# Save JSON
pjson.write_bytes(orjson.dumps({"file": str((TRN / title).with_suffix('.wav')), "language": language, "segments": segs}))
except Exception as e:
print(f"[whisper] partial json save failed: {e}", flush=True)
try:
# Save TXT snapshot
ptxt.write_text(" ".join(s.get("text","") for s in segs), encoding="utf-8")
except Exception as e:
print(f"[whisper] partial txt save failed: {e}", flush=True)
def transcribe(media_path: Path):
model = get_model()
print(f"[whisper] start transcribe: {media_path}", flush=True)
# 1) Robustly extract audio to 16k mono WAV (fixes pyAV/webm edge cases)
wav = extract_audio(media_path, TMP)
title = media_path.stem
base = TRN / title
# Resume support: if a partial checkpoint exists, load it and trim input
resume_segments = []
resume_offset = 0.0
language_hint = None
if WHISPER_RESUME:
pjson, ptxt = _partial_paths(title)
if pjson.exists():
try:
pdata = json.loads(pjson.read_text(encoding="utf-8"))
resume_segments = pdata.get("segments", []) or []
if resume_segments:
resume_offset = float(resume_segments[-1].get("end", 0.0))
language_hint = pdata.get("language")
print(f"[whisper] resuming from ~{resume_offset:.2f}s with {len(resume_segments)} segments", flush=True)
except Exception as e:
print(f"[whisper] failed to load partial: {e}", flush=True)
# If resuming, trim WAV from last end time
wav_for_run = trim_wav(wav, resume_offset, TMP)
# 2) Language selection
lang = None if WHISPER_LANGUAGE.lower() == "auto" else WHISPER_LANGUAGE
if language_hint and WHISPER_LANGUAGE.lower() == "auto":
# carry hint forward if available
lang = language_hint
# 3) Transcribe
segments, info = run_transcribe_with_fallback(wav_for_run, lang)
# Determine duration for progress; use full WAV duration for consistent % regardless of resume
dur = media_duration_seconds(wav) or 0.0
# Start wall clock timer for speed/ETA
start_wall = time.time()
if WHISPER_RESUME and resume_offset and dur and resume_offset >= dur:
print(f"[whisper] resume offset {resume_offset:.2f}s >= duration {dur:.2f}s; resetting resume.", flush=True)
resume_offset = 0.0
last_pct = -1
segs = list(resume_segments) # start with what we already have
text_parts = [s.get("text","") for s in resume_segments]
# Walk new segments; shift their timestamps by resume_offset if trimmed
seg_count_since_save = 0
seg_index = len(resume_segments)
for s in segments:
seg = {"start": s.start, "end": s.end, "text": s.text}
seg_index += 1
start = (s.start or 0.0) + resume_offset
end = (s.end or 0.0) + resume_offset
seg = {"start": start, "end": end, "text": s.text}
segs.append(seg)
text_parts.append(s.text)
if WHISPER_LOG_SEGMENTS:
print(f"[whisper] {start:8.2f}{end:8.2f} {s.text.strip()}", flush=True)
# progress logging every +5%
if dur > 0 and end is not None:
pct = int(min(100, max(0, (end / dur) * 100)))
if pct >= last_pct + 5:
log({
"status": "transcribing",
@@ -595,19 +905,50 @@ def transcribe(media_path: Path):
"progress": pct
})
last_pct = pct
# compute realtime speed and ETA for console logs
try:
elapsed = max(0.001, time.time() - start_wall)
processed = max(0.0, float(end))
speed = (processed / elapsed) if elapsed > 0 else 0.0 # seconds processed per second
# represent as X real-time factor
rtf = speed # 1.0 == real-time
eta = ((dur - processed) / speed) if (speed > 0 and dur > 0) else 0
print(f"[whisper] progress {pct:3d}% seg={seg_index:5d} rtf={rtf:0.2f}x eta={_fmt_eta(eta)}", flush=True)
# also mirror to feed log with speed/eta
try:
log({
"status": "transcribing",
"path": str(media_path),
"title": title,
"progress": pct,
"speed_rtf": round(rtf, 2),
"eta_sec": int(max(0, eta))
})
except Exception:
pass
except Exception:
pass
# periodic partial save
seg_count_since_save += 1
if WHISPER_RESUME and seg_count_since_save >= PARTIAL_SAVE_EVERY_SEGS:
_save_partial(title, info.language or (WHISPER_LANGUAGE if WHISPER_LANGUAGE.lower() != "auto" else "en"), segs)
seg_count_since_save = 0
# ensure we mark 100% on completion
if last_pct < 100:
log({"status": "transcribing", "path": str(media_path), "title": title, "progress": 100})
txt = " ".join(text_parts).strip()
# Write final transcript artifacts
(base.with_suffix(".json")).write_bytes(orjson.dumps({
"file": str(media_path),
"language": info.language,
"segments": segs
}))
open(base.with_suffix(".txt"), "w", encoding="utf-8").write(txt)
(base.with_suffix(".txt")).write_text(txt, encoding="utf-8")
def fmt_ts(t):
h=int(t//3600); m=int((t%3600)//60); s=t-(h*3600+m*60)
@@ -630,9 +971,10 @@ def transcribe(media_path: Path):
shutil.copy2(srt_src, srt_dst)
except Exception as e:
print(f"[post] could not copy srt -> {srt_dst}: {e}", flush=True)
# Write Kodi/Plex-compatible NFO using enhanced metadata (same as before)
try:
fallback = {
"title": title,
"episode_title": title,
"show": media_path.parent.name,
@@ -642,18 +984,42 @@ def transcribe(media_path: Path):
"image": "",
"guid": "",
}
meta = build_meta_from_sources(media_path, media_path.parent.name, fallback, ep=None)
ttxt = (TRN / title).with_suffix(".txt").read_text(encoding="utf-8")
write_episode_nfo(media_path, meta, ttxt)
try:
save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
except Exception:
pass
except Exception as e:
print(f"[post] NFO write failed: {e}", flush=True)
# Cleanup temp WAVs
try:
if wav_for_run != wav and wav_for_run.exists():
wav_for_run.unlink()
if wav.exists():
wav.unlink()
except Exception:
pass
# Remove partial checkpoints on success
if WHISPER_RESUME:
try:
pjson, ptxt = _partial_paths(title)
if pjson.exists(): pjson.unlink()
if ptxt.exists(): ptxt.unlink()
except Exception:
pass
# Final average speed over whole transcription
try:
total_elapsed = max(0.001, time.time() - start_wall)
avg_rtf = (dur / total_elapsed) if total_elapsed > 0 else 0.0
print(f"[whisper] avg speed ~{avg_rtf:0.2f}x (audio_seconds / wall_seconds)", flush=True)
except Exception:
pass
print(f"[whisper] finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s", flush=True)
return base
def index_meili(json_path: Path):
@@ -829,7 +1195,8 @@ def handle_local_file(path_str: str):
index_meili(base.with_suffix(".json"))
publish_to_openwebui([base.with_suffix(".txt")])
try:
# Use info.json (if present) to enrich metadata
fallback = {
"title": title,
"episode_title": title,
"show": p.parent.name,
@@ -839,8 +1206,14 @@ def handle_local_file(path_str: str):
"image": "",
"guid": "",
}
meta = build_meta_from_sources(p, p.parent.name, fallback, ep=None)
ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
write_episode_nfo(p, meta, ttxt)
# Try to fetch and save artwork locally
try:
save_episode_artwork(meta.get("image"), p, meta.get("show"))
except Exception:
pass
except Exception as e:
print(f"[post] NFO write failed: {e}", flush=True)
log({**info, **{"status": "done", "note": "used_existing_transcript"}})
@@ -855,7 +1228,8 @@ def handle_local_file(path_str: str):
publish_to_openwebui([base.with_suffix(".txt")])
try:
data = json.loads((base.with_suffix(".json")).read_text(encoding="utf-8"))
# Start with repo metadata, then enrich from yt-dlp info.json if any
meta_repo = {
"title": data.get("title") or title,
"episode_title": data.get("title") or title,
"show": data.get("show") or p.parent.name,
@@ -865,8 +1239,13 @@ def handle_local_file(path_str: str):
"image": data.get("image"),
"guid": data.get("guid") or data.get("id"),
}
meta = build_meta_from_sources(p, p.parent.name, meta_repo, ep=None)
ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
write_episode_nfo(p, meta, ttxt)
try:
save_episode_artwork(meta.get("image"), p, meta.get("show"))
except Exception:
pass
except Exception as e:
print(f"[post] NFO write failed: {e}", flush=True)
log({**info, **{"status": "done", "note": "reused_repo_transcript"}})
@@ -876,6 +1255,26 @@ def handle_local_file(path_str: str):
base = transcribe(p)
index_meili(base.with_suffix(".json"))
publish_to_openwebui([base.with_suffix(".txt")])
try:
fallback = {
"title": title,
"episode_title": title,
"show": p.parent.name,
"description": "",
"pubdate": _extract_date_from_stem(title),
"duration_sec": media_duration_seconds(p),
"image": "",
"guid": "",
}
meta = build_meta_from_sources(p, p.parent.name, fallback, ep=None)
ttxt = (TRN / title).with_suffix(".txt").read_text(encoding="utf-8")
write_episode_nfo(p, meta, ttxt)
try:
save_episode_artwork(meta.get("image"), p, meta.get("show"))
except Exception:
pass
except Exception as e:
print(f"[post] NFO write failed: {e}", flush=True)
log({**info, **{"status": "done"}})
except Exception as e:
log({"url": path_str, "status": "error", "error": str(e)})
@@ -913,6 +1312,45 @@ def handle_url(url: str):
dest_dir.mkdir(parents=True, exist_ok=True)
dest = dest_dir / sanitize(f.name)
shutil.move(str(f), dest)
# Move companion files produced by yt-dlp (info.json, thumbnail, subtitles)
try:
companions = find_companion_files(f)
# info.json -> prefer "<dest.name>.info.json", fallback to "<dest.stem>.info.json"
if companions.get("info") and companions["info"].exists():
dest_info = dest.parent / f"{dest.name}.info.json"
try:
shutil.move(str(companions["info"]), dest_info)
except Exception:
# fallback naming without extension
dest_info2 = dest.parent / f"{dest.stem}.info.json"
try:
shutil.move(str(companions['info']), dest_info2)
except Exception:
pass
# thumbnail -> "<dest>.jpg"
if companions.get("thumb") and companions["thumb"].exists():
try:
shutil.move(str(companions["thumb"]), str(dest.with_suffix(".jpg")))
except Exception:
pass
# subtitles -> preserve language suffix: "<dest.stem><suffix>"
for s in companions.get("subs", []):
if not s.exists():
continue
suffix_tail = ""
s_name = s.name
f_stem = f.stem
if s_name.startswith(f_stem):
suffix_tail = s_name[len(f_stem):] # includes leading dot if present
else:
suffix_tail = s.suffix
dest_sub = dest.parent / f"{dest.stem}{suffix_tail}"
try:
shutil.move(str(s), str(dest_sub))
except Exception:
pass
except Exception:
pass
info.update({"title": dest.stem, "uploader": uploader,
"date": (re.findall(r"\b(\d{8})\b", dest.stem)[0] if re.findall(r"\b(\d{8})\b", dest.stem) else ""),
"path": str(dest)})
@@ -937,31 +1375,25 @@ def handle_url(url: str):
index_meili(base.with_suffix(".json"))
publish_to_openwebui([base.with_suffix(".txt")])
try:
# Build metadata from RSS (if matched), yt-dlp info.json, and sensible fallbacks
fallback = {
"title": dest.stem,
"episode_title": dest.stem,
"show": uploader,
"description": "",
"pubdate": _extract_date_from_stem(dest.stem),
"duration_sec": media_duration_seconds(dest),
"image": "",
"guid": "",
}
meta = build_meta_from_sources(dest, uploader, fallback, ep if 'ep' in locals() else None)
ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
write_episode_nfo(dest, meta, ttxt)
# Save local artwork for Plex/Kodi from meta image url
try:
save_episode_artwork(meta.get("image"), dest, meta.get("show"))
except Exception:
pass
except Exception as e:
print(f"[post] NFO write failed: {e}", flush=True)
log({**info, **{"status":"done"}})
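Both handle_local_file and handle_url call _extract_date_from_stem, which is defined outside the hunks shown here. Judging from the \b(\d{8})\b pattern applied to dest.stem above, a plausible sketch (assumed, not the committed implementation):

def _extract_date_from_stem(stem: str) -> str:
    # Assumed helper: return the first YYYYMMDD token in a basename, else "".
    m = re.search(r"\b(\d{8})\b", stem)
    return m.group(1) if m else ""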

View File

@@ -40,6 +40,42 @@ services:
TMP_ROOT: /tmpdl
WHISPER_MODEL: large-v3
WHISPER_PRECISION: int8
WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
WHISPER_RESUME: ${WHISPER_RESUME:-1}
WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
PYTHONPATH: /app
JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
JOB_TTL: ${JOB_TTL:-86400}
RESULT_TTL: ${RESULT_TTL:-86400}
FAILURE_TTL: ${FAILURE_TTL:-86400}
volumes:
- ${LIBRARY_HOST_DIR:-./library}:/library
- ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts
- ${TMP_HOST_DIR:-./tmp}:/tmpdl
- ${MODELS_HOST_DIR:-./models}:/root/.cache/huggingface
depends_on: [meili, redis]
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "exit 0"]
extra_hosts:
- host.docker.internal:host-gateway
podx-worker-transcribe:
build: ./app
container_name: podx-worker-transcribe
command: ["rq", "worker", "-u", "redis://redis:6379/0", "transcribe"]
env_file: [.env]
environment:
MEILI_URL: http://meili:7700
REDIS_URL: redis://redis:6379/0
LIBRARY_ROOT: /library
TRANSCRIPT_ROOT: /transcripts
TMP_ROOT: /tmpdl
WHISPER_MODEL: large-v3
WHISPER_PRECISION: int8
WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
WHISPER_RESUME: ${WHISPER_RESUME:-1}
WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
PYTHONPATH: /app
JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
JOB_TTL: ${JOB_TTL:-86400}
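The new podx-worker-transcribe service consumes only the transcribe queue, so downloads and transcription can run in parallel. For it to receive work, the enqueueing side must target that queue by name; a minimal rq sketch assuming the same Redis URL (the job function path is hypothetical):

from redis import Redis
from rq import Queue

q = Queue("transcribe", connection=Redis.from_url("redis://redis:6379/0"))
q.enqueue("worker.transcribe", "/library/Show/episode.m4a")  # hypothetical function path and argument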
@@ -90,7 +126,7 @@ services:
# - COOKIE_FILE=/config/cookies.txt
# Optional: yt-dlp options (JSON). Example enables Android client fallback
# - YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}}}
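# These keys mirror the yt-dlp flags the worker's yt_dlp() now passes (--write-subs, --write-auto-subs, --sub-langs, --convert-subs, --write-info-json, --write-thumbnail, --convert-thumbnails jpg)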
- YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}},"extract_flat":"in_playlist","concurrent_fragment_downloads":1}
- YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}},"extract_flat":"in_playlist","concurrent_fragment_downloads":1,"writesubtitles":true,"writeautomaticsub":true,"subtitleslangs":["en.*"],"convertsubs":"srt","writeinfojson":true,"writethumbnail":true,"converttumbnails":"jpg"}
volumes:
- ${LIBRARY_HOST_DIR:-./library}:/downloads
# Optional cookies file on host → /config/cookies.txt inside container