Adding second worker
@@ -20,6 +20,11 @@ MEDIA_EXTS = {".mp3", ".m4a", ".flac", ".wav", ".ogg", ".opus", ".mp4", ".m4v",
 # Fuzzy title match threshold for media ↔ transcript pairing
 TITLE_MATCH_THRESHOLD = float(os.getenv("RSS_TITLE_MATCH_THRESHOLD", "0.60"))
 
+# Download podcast audio (enclosures) to a local library
+PODCASTS_ROOT = Path(os.getenv("PODCASTS_ROOT", str(LIB / "Podcasts")))
+PODCASTS_PER_SHOW = os.getenv("PODCASTS_PER_SHOW", "true").lower() in {"1","true","yes","y"}
+DOWNLOAD_AUDIO = os.getenv("RSS_DOWNLOAD_AUDIO", "true").lower() in {"1","true","yes","y"}
+
 # Namespace map (extend as needed)
 NS = {
     "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
@@ -31,6 +36,7 @@ NS = {
 
 TRN.mkdir(parents=True, exist_ok=True)
 OUT_INDEX.parent.mkdir(parents=True, exist_ok=True)
+PODCASTS_ROOT.mkdir(parents=True, exist_ok=True)
 
 
 def _text(el):
@@ -235,6 +241,7 @@ def parse_feed(feed_url: str):
         duration_sec = _parse_duration(dur) or None
         enclosure = _find_ns(it, "enclosure")
         audio_url = enclosure.get("url") if enclosure is not None else ""
+        audio_type = enclosure.get("type") if enclosure is not None else ""
 
         if not audio_url:
             for mc in _findall_ns(it, "media:content"):
@@ -253,6 +260,7 @@ def parse_feed(feed_url: str):
             "date": date,
             "duration_sec": duration_sec,
             "audio_url": audio_url,
+            "audio_type": audio_type,
             "language": DEFAULT_LANG,
             "transcripts": transcripts,
         }
@@ -276,6 +284,40 @@ def parse_feed(feed_url: str):
             if created:
                 t["sidecars"] = created
 
+        # Optionally download podcast audio locally
+        local_audio_path = None
+        if DOWNLOAD_AUDIO and audio_url:
+            show_dir = PODCASTS_ROOT / _slug(show_title or "Podcast") if PODCASTS_PER_SHOW else PODCASTS_ROOT
+            base_name = f"{(date or '00000000')} - {_slug(title or guid or 'episode')}"
+            ext = _guess_audio_ext(audio_type, audio_url)
+            target = (show_dir / base_name).with_suffix(ext)
+            # Avoid re-download if already exists
+            if not target.exists():
+                saved = _download_stream(audio_url, target)
+                if saved is None:
+                    # Try a non-streaming fallback
+                    saved = _download(audio_url, target)
+            else:
+                saved = target
+            if saved and saved.exists():
+                local_audio_path = saved
+                # If we previously downloaded transcript sidecars, try to place them next to this audio
+                for t in item_rec.get("transcripts", []) or []:
+                    lp = t.get("local_path")
+                    if lp:
+                        try:
+                            lp = Path(lp)
+                            if lp.exists() and lp.suffix.lower() in {'.srt','.vtt','.txt'}:
+                                sc = _sidecar_path_for(local_audio_path, t.get('language') or DEFAULT_LANG, lp.suffix.lower())
+                                if not sc.exists():
+                                    sc.parent.mkdir(parents=True, exist_ok=True)
+                                    shutil.copy2(lp, sc)
+                                t.setdefault("sidecars", []).append(str(sc))
+                        except Exception:
+                            pass
+        if local_audio_path:
+            item_rec["local_audio"] = str(local_audio_path)
+
         items.append(item_rec)
 
     return {"feed_url": feed_url, "show": show_title, "episodes": items}
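Side note: the download block above calls a _guess_audio_ext(audio_type, audio_url) helper that lives elsewhere in this file and is not shown in the hunk. A minimal sketch of what such a helper presumably does (map the enclosure MIME type to a file suffix, falling back to the URL's extension); only the signature is taken from the call site, the body is an assumption:

    from pathlib import Path

    def _guess_audio_ext(audio_type: str, audio_url: str) -> str:
        # Hypothetical sketch: common podcast enclosure MIME types -> suffixes.
        mime_map = {
            "audio/mpeg": ".mp3", "audio/mp4": ".m4a", "audio/x-m4a": ".m4a",
            "audio/ogg": ".ogg", "audio/opus": ".opus", "audio/flac": ".flac", "audio/wav": ".wav",
        }
        ext = mime_map.get((audio_type or "").split(";")[0].strip().lower())
        if ext:
            return ext
        # Otherwise use whatever extension the enclosure URL carries, defaulting to .mp3.
        suffix = Path(audio_url.split("?")[0]).suffix.lower()
        return suffix or ".mp3"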
app/worker.py
@@ -2,6 +2,7 @@ import os, subprocess, shutil, json, re, orjson, requests
 from pathlib import Path
 import math
 import difflib
+import time
 from faster_whisper import WhisperModel
 
 from xml.sax.saxutils import escape as xml_escape
@@ -11,10 +12,21 @@ MEILI_KEY = os.getenv("MEILI_KEY", "")
 LIB = Path(os.getenv("LIBRARY_ROOT", "/library"))
 TRN = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
 TMP = Path(os.getenv("TMP_ROOT", "/tmpdl"))
 
 MODEL_NAME = os.getenv("WHISPER_MODEL","large-v3")
 COMPUTE = os.getenv("WHISPER_PRECISION","int8")
 WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "auto").strip()
+
+# Whisper device/config controls
+WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "auto").strip()
+WHISPER_DEVICE_INDEX = int(os.getenv("WHISPER_DEVICE_INDEX", "0"))
+WHISPER_CPU_THREADS = int(os.getenv("WHISPER_CPU_THREADS", "4"))
+
+# Whisper logging & resume controls
+WHISPER_LOG_SEGMENTS = os.getenv("WHISPER_LOG_SEGMENTS", "1") not in ("0", "false", "False")
+WHISPER_RESUME = os.getenv("WHISPER_RESUME", "1") not in ("0", "false", "False")
+PARTIAL_SAVE_EVERY_SEGS = int(os.getenv("WHISPER_PARTIAL_SAVE_EVERY_SEGS", "20"))
+
 # RSS resolver config
 RSS_INDEX_PATH = Path(os.getenv("RSS_INDEX_PATH", "/transcripts/rss_index.json"))
 RSS_DURATION_TOLERANCE = int(os.getenv("RSS_DURATION_TOLERANCE", "150")) # seconds
@@ -34,9 +46,55 @@ _model = None
 def get_model():
     global _model
     if _model is None:
-        _model = WhisperModel(MODEL_NAME, compute_type=COMPUTE)
+        print(f"[whisper] loading model='{MODEL_NAME}' device='{WHISPER_DEVICE}' idx={WHISPER_DEVICE_INDEX} compute='{COMPUTE}' threads={WHISPER_CPU_THREADS}", flush=True)
+        _model = WhisperModel(
+            MODEL_NAME,
+            device=WHISPER_DEVICE,
+            device_index=WHISPER_DEVICE_INDEX,
+            compute_type=COMPUTE,
+            cpu_threads=WHISPER_CPU_THREADS,
+        )
     return _model
 
+# --- Helper: Reset model with new device and device_index ---
+def reset_model(device: str, device_index: int | None = None):
+    """Reset the global _model to a new WhisperModel with the given device and device_index."""
+    global _model
+    idx = device_index if device_index is not None else WHISPER_DEVICE_INDEX
+    print(f"[whisper] resetting model='{MODEL_NAME}' device='{device}' idx={idx} compute='{COMPUTE}' threads={WHISPER_CPU_THREADS}", flush=True)
+    _model = WhisperModel(
+        MODEL_NAME,
+        device=device,
+        device_index=idx,
+        compute_type=COMPUTE,
+        cpu_threads=WHISPER_CPU_THREADS,
+    )
+
+# --- Helper: Run transcribe with fallback to CPU on GPU/oom errors ---
+def run_transcribe_with_fallback(wav_path: Path, lang):
+    """
+    Try to transcribe with current model; on GPU/CUDA/HIP/ROCm/OOM errors, reset to CPU and retry once.
+    Returns (segments, info) or raises exception.
+    """
+    model = get_model()
+    try:
+        return model.transcribe(str(wav_path), vad_filter=True, language=lang)
+    except Exception as e:
+        msg = str(e)
+        gpu_errs = [
+            "CUDA", "cublas", "out of memory", "HIP", "ROCm", "device-side assert", "CUDNN", "cudaError", "cuda runtime", "cudaMalloc"
+        ]
+        if any(err.lower() in msg.lower() for err in gpu_errs):
+            print(f"[whisper] GPU error detected: '{msg}'. Retrying on CPU...", flush=True)
+            reset_model("cpu", 0)
+            try:
+                model = get_model()
+                return model.transcribe(str(wav_path), vad_filter=True, language=lang)
+            except Exception as e2:
+                print(f"[whisper] CPU fallback also failed: {e2}", flush=True)
+                raise
+        raise
+
 def log(feed):
     try:
         with open(TRN / "_feed.log", "a", encoding="utf-8") as f:
@@ -216,6 +274,11 @@ def use_rss_transcript(media_path: Path, ep: dict) -> Path | None:
         txt_path = base.with_suffix(".txt")
         transcript_text = txt_path.read_text(encoding="utf-8") if txt_path.exists() else None
         write_episode_nfo(media_path, meta, transcript_text)
+        # Save local artwork for Plex/Kodi
+        try:
+            save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
+        except Exception:
+            pass
     except Exception as e:
         print(f"[post] NFO write failed: {e}", flush=True)
     return base
@@ -351,6 +414,11 @@ def reuse_repo_transcript(media_path: Path, repo_json: Path) -> Path | None:
         txtp = new_base.with_suffix(".txt")
         ttxt = txtp.read_text(encoding="utf-8") if txtp.exists() else None
         write_episode_nfo(media_path, meta, ttxt)
+        # Save local artwork for Plex/Kodi
+        try:
+            save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
+        except Exception:
+            pass
     except Exception as e:
         print(f"[post] NFO write failed: {e}", flush=True)
 
@@ -403,6 +471,161 @@ def ensure_sidecar_next_to_media(sidecar: Path, media_path: Path, lang: str = "e
         print(f"[post] sidecar copy/convert failed: {e}", flush=True)
 
 
+# --- small helpers for progress/ETA formatting ---
+def _fmt_eta(sec: float) -> str:
+    try:
+        sec = max(0, int(sec))
+        h, rem = divmod(sec, 3600)
+        m, s = divmod(rem, 60)
+        if h:
+            return f"{h}h {m}m {s}s"
+        if m:
+            return f"{m}m {s}s"
+        return f"{s}s"
+    except Exception:
+        return ""
+
+
+def save_episode_artwork(image_url: str | None, media_path: Path, show_title: str | None = None):
+    """Download episode artwork from image_url and save next to the media as '<basename>.jpg'.
+    Also drop a folder-level 'poster.jpg' for the show directory if not present.
+    Best-effort; failures are logged but non-fatal.
+    """
+    if not image_url:
+        return
+    try:
+        resp = requests.get(image_url, timeout=30, stream=True)
+        resp.raise_for_status()
+        # Determine content-type and write a temporary file
+        ctype = (resp.headers.get("Content-Type") or "").lower()
+        tmp_file = media_path.with_suffix(".art.tmp")
+        with open(tmp_file, "wb") as out:
+            for chunk in resp.iter_content(chunk_size=8192):
+                if chunk:
+                    out.write(chunk)
+
+        # Always provide a .jpg next to the media for Plex
+        episode_jpg = media_path.with_suffix(".jpg")
+        if "image/jpeg" in ctype:
+            # Already JPEG
+            shutil.move(str(tmp_file), str(episode_jpg))
+        else:
+            # Try converting to JPEG with ffmpeg; if it fails, keep bytes as-is
+            try:
+                subprocess.run(
+                    ["ffmpeg", "-nostdin", "-y", "-i", str(tmp_file), str(episode_jpg)],
+                    stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True
+                )
+                try:
+                    tmp_file.unlink()
+                except Exception:
+                    pass
+            except Exception:
+                shutil.move(str(tmp_file), str(episode_jpg))
+
+        # Also drop a folder poster once per show (helps Plex folder views)
+        try:
+            show_poster = media_path.parent / "poster.jpg"
+            if not show_poster.exists():
+                shutil.copy2(episode_jpg, show_poster)
+        except Exception:
+            pass
+
+    except Exception as e:
+        print(f"[post] artwork download failed: {e}", flush=True)
+
+
+def find_companion_files(src: Path) -> dict:
+    """Return likely yt-dlp companion files for a downloaded media file."""
+    out = {}
+    # info.json can be either "<name>.<ext>.info.json" or "<name>.info.json"
+    cands_info = [
+        src.parent / f"{src.name}.info.json",
+        src.parent / f"{src.stem}.info.json",
+    ]
+    out["info"] = next((p for p in cands_info if p.exists()), None)
+
+    # thumbnails may be "<name>.<ext>.jpg" or "<name>.jpg" (we convert to jpg)
+    cand_thumbs = [
+        src.parent / f"{src.name}.jpg",
+        src.parent / f"{src.stem}.jpg",
+        src.parent / f"{src.stem}.jpeg",
+        src.parent / f"{src.stem}.png",
+        src.parent / f"{src.stem}.webp",
+    ]
+    out["thumb"] = next((p for p in cand_thumbs if p.exists()), None)
+
+    # subtitles (keep multiple)
+    subs = []
+    for s in src.parent.glob(f"{src.stem}*.srt"):
+        subs.append(s)
+    for s in src.parent.glob(f"{src.stem}*.vtt"):
+        subs.append(s)
+    out["subs"] = subs
+    return out
+
+def load_info_json(path: Path) -> dict | None:
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except Exception:
+        return None
+
+def _iso_from_yyyymmdd(s: str | None) -> str | None:
+    if not s or not re.match(r"^\d{8}$", s):
+        return None
+    return f"{s[0:4]}-{s[4:6]}-{s[6:8]}"
+
+def build_meta_from_sources(media_path: Path, uploader: str, fallback_meta: dict, ep: dict | None = None) -> dict:
+    """
+    Merge metadata from (priority): RSS episode `ep` -> yt-dlp info.json (if present) -> fallback.
+    Returns a dict compatible with write_episode_nfo().
+    """
+    # Start with fallback
+    meta = dict(fallback_meta)
+
+    # Augment from info.json if present
+    info = None
+    for cand in [
+        media_path.parent / f"{media_path.name}.info.json",
+        media_path.parent / f"{media_path.stem}.info.json",
+    ]:
+        if cand.exists():
+            info = load_info_json(cand)
+            break
+    if info:
+        meta.setdefault("title", info.get("title"))
+        meta.setdefault("episode_title", info.get("title"))
+        meta.setdefault("description", info.get("description") or info.get("fulltitle"))
+        # upload_date is YYYYMMDD
+        iso = _iso_from_yyyymmdd(info.get("upload_date"))
+        if iso:
+            meta["pubdate_iso"] = iso
+        # Prefer video duration if present
+        if not meta.get("duration_sec") and info.get("duration"):
+            meta["duration_sec"] = info.get("duration")
+        # thumbnail URL
+        if not meta.get("image"):
+            meta["image"] = info.get("thumbnail")
+        # show/uploader
+        if not meta.get("show"):
+            meta["show"] = info.get("uploader") or uploader
+
+    # Finally, layer RSS data on top if available (most authoritative for podcasts)
+    if ep:
+        meta.update({
+            "title": ep.get("title") or meta.get("title"),
+            "episode_title": ep.get("title") or meta.get("episode_title"),
+            "show": ep.get("podcast_title") or ep.get("feed_title") or ep.get("show") or meta.get("show") or uploader,
+            "description": ep.get("description") or ep.get("content") or meta.get("description", ""),
+            "pubdate": ep.get("pubdate") or meta.get("pubdate", ""),
+            "pubdate_iso": ep.get("date_iso") or meta.get("pubdate_iso", meta.get("pubdate")),
+            "duration_sec": ep.get("duration_sec") or ep.get("duration") or meta.get("duration_sec"),
+            "image": ep.get("image") or ep.get("image_url") or meta.get("image", ""),
+            "guid": ep.get("guid") or meta.get("guid", ""),
+        })
+
+    return meta
+
 # ---------- Kodi/Plex NFO writer ----------
 from datetime import datetime
@@ -505,8 +728,12 @@ def yt_dlp(url, outdir):
     base_cmd = [
         "yt-dlp", "-o", outtmpl,
         "-f", "bv*+ba/best",
-        "-x", "--audio-format", "m4a",
+        "--write-info-json",
         "--write-thumbnail",
+        "--convert-thumbnails", "jpg",
+        "--write-subs", "--write-auto-subs",
+        "--sub-langs", os.getenv("YTDLP_SUBS_LANGS", "en.*,en"),
+        "--convert-subs", "srt",
         "--no-playlist", "--no-warnings", "--restrict-filenames",
     ]
 
@@ -527,10 +754,13 @@ def yt_dlp(url, outdir):
         ]
         subprocess.check_call(retry_cmd)
 
-    media = (list(outdir.rglob("*.[mM][pP]4")) +
-             list(outdir.rglob("*.mkv")) +
-             list(outdir.rglob("*.m4a")) +
-             list(outdir.rglob("*.mp3")))
+    media = (
+        list(outdir.rglob("*.[mM][pP]4")) +
+        list(outdir.rglob("*.mkv")) +
+        list(outdir.rglob("*.webm")) +
+        list(outdir.rglob("*.m4a")) +
+        list(outdir.rglob("*.mp3"))
+    )
     return sorted(media, key=lambda p: p.stat().st_mtime)[-1:]
 
 def extract_audio(src: Path, outdir: Path) -> Path:
@@ -550,6 +780,27 @@ def extract_audio(src: Path, outdir: Path) -> Path:
         raise RuntimeError(f"ffmpeg extract failed: {e.output.decode(errors='ignore')}")
     return wav_path
 
+# --- WAV trimming helper ---
+def trim_wav(src_wav: Path, start_sec: float, outdir: Path) -> Path:
+    """Return a trimmed 16k mono WAV starting at start_sec from src_wav."""
+    outdir.mkdir(parents=True, exist_ok=True)
+    if not start_sec or start_sec <= 0.0:
+        return src_wav
+    dst = outdir / (src_wav.stem + f".from_{int(start_sec)}s.wav")
+    try:
+        subprocess.check_output([
+            "ffmpeg", "-nostdin", "-y",
+            "-ss", str(max(0.0, float(start_sec))),
+            "-i", str(src_wav),
+            "-vn", "-ac", "1", "-ar", "16000",
+            "-f", "wav", str(dst),
+        ], stderr=subprocess.STDOUT)
+        return dst
+    except subprocess.CalledProcessError as e:
+        # If trimming fails, fall back to full file
+        print(f"[whisper] trim failed, using full WAV: {e.output.decode(errors='ignore')}", flush=True)
+        return src_wav
+
 def media_duration_seconds(path: Path) -> float:
     """Return duration in seconds using ffprobe; fallback to 0.0 on error."""
     try:
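Side note: the effect of trim_wav() is easiest to see with concrete values; with a non-positive offset it returns the source WAV untouched, otherwise it re-encodes from the offset into a sibling file named after the integer offset. A small check of that naming, using only the logic shown above (paths are illustrative):

    from pathlib import Path

    src = Path("/tmpdl/episode.wav")
    # With a positive offset the helper targets "<stem>.from_<int(offset)>s.wav" in outdir.
    expected = Path("/tmpdl") / (src.stem + f".from_{int(1234.5)}s.wav")
    assert expected.name == "episode.from_1234s.wav"
    # With start_sec <= 0 (or falsy) trim_wav() returns src unchanged, so no ffmpeg call is made.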
@@ -561,32 +812,91 @@ def media_duration_seconds(path: Path) -> float:
     except Exception:
         return 0.0
 
+# --- Partial transcript helpers ---
+def _partial_paths(title: str) -> tuple[Path, Path]:
+    base = TRN / title
+    return base.with_suffix(".partial.json"), base.with_suffix(".partial.txt")
+
+def _save_partial(title: str, language: str, segs: list[dict]):
+    pjson, ptxt = _partial_paths(title)
+    try:
+        # Save JSON
+        pjson.write_bytes(orjson.dumps({"file": str((TRN / title).with_suffix('.wav')), "language": language, "segments": segs}))
+    except Exception as e:
+        print(f"[whisper] partial json save failed: {e}", flush=True)
+    try:
+        # Save TXT snapshot
+        ptxt.write_text(" ".join(s.get("text","") for s in segs), encoding="utf-8")
+    except Exception as e:
+        print(f"[whisper] partial txt save failed: {e}", flush=True)
+
 def transcribe(media_path: Path):
     model = get_model()
+    print(f"[whisper] start transcribe: {media_path}", flush=True)
     # 1) Robustly extract audio to 16k mono WAV (fixes pyAV/webm edge cases)
     wav = extract_audio(media_path, TMP)
 
-    # 2) Language
-    lang = None if WHISPER_LANGUAGE.lower() == "auto" else WHISPER_LANGUAGE
-
-    # 3) Transcribe
-    segments, info = model.transcribe(str(wav), vad_filter=True, language=lang)
-
     title = media_path.stem
     base = TRN / title
 
-    # Determine duration for progress; use extracted WAV (accurate for transcription input)
+    # Resume support: if a partial checkpoint exists, load it and trim input
+    resume_segments = []
+    resume_offset = 0.0
+    language_hint = None
+    if WHISPER_RESUME:
+        pjson, ptxt = _partial_paths(title)
+        if pjson.exists():
+            try:
+                pdata = json.loads(pjson.read_text(encoding="utf-8"))
+                resume_segments = pdata.get("segments", []) or []
+                if resume_segments:
+                    resume_offset = float(resume_segments[-1].get("end", 0.0))
+                language_hint = pdata.get("language")
+                print(f"[whisper] resuming from ~{resume_offset:.2f}s with {len(resume_segments)} segments", flush=True)
+            except Exception as e:
+                print(f"[whisper] failed to load partial: {e}", flush=True)
+
+    # If resuming, trim WAV from last end time
+    wav_for_run = trim_wav(wav, resume_offset, TMP)
+
+    # 2) Language selection
+    lang = None if WHISPER_LANGUAGE.lower() == "auto" else WHISPER_LANGUAGE
+    if language_hint and WHISPER_LANGUAGE.lower() == "auto":
+        # carry hint forward if available
+        lang = language_hint
+
+    # 3) Transcribe
+    segments, info = run_transcribe_with_fallback(wav_for_run, lang)
+
+    # Determine duration for progress; use full WAV duration for consistent % regardless of resume
     dur = media_duration_seconds(wav) or 0.0
+    # Start wall clock timer for speed/ETA
+    start_wall = time.time()
+    if WHISPER_RESUME and resume_offset and dur and resume_offset >= dur:
+        print(f"[whisper] resume offset {resume_offset:.2f}s >= duration {dur:.2f}s; resetting resume.", flush=True)
+        resume_offset = 0.0
     last_pct = -1
 
-    segs, text_parts = [], []
+    segs = list(resume_segments)  # start with what we already have
+    text_parts = [s.get("text","") for s in resume_segments]
+
+    # Walk new segments; shift their timestamps by resume_offset if trimmed
+    seg_count_since_save = 0
+    seg_index = len(resume_segments)
    for s in segments:
-        seg = {"start": s.start, "end": s.end, "text": s.text}
+        seg_index += 1
+        start = (s.start or 0.0) + resume_offset
+        end = (s.end or 0.0) + resume_offset
+        seg = {"start": start, "end": end, "text": s.text}
         segs.append(seg)
         text_parts.append(s.text)
+
+        if WHISPER_LOG_SEGMENTS:
+            print(f"[whisper] {start:8.2f}–{end:8.2f} {s.text.strip()}", flush=True)
+
         # progress logging every +5%
-        if dur > 0 and s.end is not None:
-            pct = int(min(100, max(0, (s.end / dur) * 100)))
+        if dur > 0 and end is not None:
+            pct = int(min(100, max(0, (end / dur) * 100)))
             if pct >= last_pct + 5:
                 log({
                     "status": "transcribing",
@@ -595,19 +905,50 @@ def transcribe(media_path: Path):
                     "progress": pct
                 })
                 last_pct = pct
+
+                # compute realtime speed and ETA for console logs
+                try:
+                    elapsed = max(0.001, time.time() - start_wall)
+                    processed = max(0.0, float(end))
+                    speed = (processed / elapsed) if elapsed > 0 else 0.0 # seconds processed per second
+                    # represent as X real-time factor
+                    rtf = speed # 1.0 == real-time
+                    eta = ((dur - processed) / speed) if (speed > 0 and dur > 0) else 0
+                    print(f"[whisper] progress {pct:3d}% seg={seg_index:5d} rtf={rtf:0.2f}x eta={_fmt_eta(eta)}", flush=True)
+                    # also mirror to feed log with speed/eta
+                    try:
+                        log({
+                            "status": "transcribing",
+                            "path": str(media_path),
+                            "title": title,
+                            "progress": pct,
+                            "speed_rtf": round(rtf, 2),
+                            "eta_sec": int(max(0, eta))
+                        })
+                    except Exception:
+                        pass
+                except Exception:
+                    pass
+
+        # periodic partial save
+        seg_count_since_save += 1
+        if WHISPER_RESUME and seg_count_since_save >= PARTIAL_SAVE_EVERY_SEGS:
+            _save_partial(title, info.language or (WHISPER_LANGUAGE if WHISPER_LANGUAGE.lower() != "auto" else "en"), segs)
+            seg_count_since_save = 0
+
     # ensure we mark 100% on completion
     if last_pct < 100:
         log({"status": "transcribing", "path": str(media_path), "title": title, "progress": 100})
 
     txt = " ".join(text_parts).strip()
 
-    # Write transcript artifacts
-    open(base.with_suffix(".json"), "wb").write(orjson.dumps({
+    # Write final transcript artifacts
+    (base.with_suffix(".json")).write_bytes(orjson.dumps({
         "file": str(media_path),
         "language": info.language,
         "segments": segs
    }))
-    open(base.with_suffix(".txt"), "w", encoding="utf-8").write(txt)
+    (base.with_suffix(".txt")).write_text(txt, encoding="utf-8")
 
 def fmt_ts(t):
     h=int(t//3600); m=int((t%3600)//60); s=t-(h*3600+m*60)
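Side note: the speed/ETA logic above is plain real-time-factor arithmetic, so a quick worked check of the same formulas makes the numbers easy to sanity-check:

    # 600 s of audio processed in 120 s of wall time -> rtf 5.0x;
    # with 3600 s of total audio, 3000 s remain -> eta 600 s, i.e. "10m 0s".
    processed, elapsed, dur = 600.0, 120.0, 3600.0
    speed = processed / elapsed            # 5.0x real-time
    eta = (dur - processed) / speed        # 600.0 seconds
    assert (speed, eta) == (5.0, 600.0)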
@@ -630,9 +971,10 @@ def transcribe(media_path: Path):
            shutil.copy2(srt_src, srt_dst)
        except Exception as e:
            print(f"[post] could not copy srt -> {srt_dst}: {e}", flush=True)
-    # Write Kodi/Plex-compatible NFO using basic metadata
+
+    # Write Kodi/Plex-compatible NFO using enhanced metadata (same as before)
     try:
-        meta = {
+        fallback = {
             "title": title,
             "episode_title": title,
             "show": media_path.parent.name,
@@ -642,18 +984,42 @@ def transcribe(media_path: Path):
             "image": "",
             "guid": "",
         }
+        meta = build_meta_from_sources(media_path, media_path.parent.name, fallback, ep=None)
         ttxt = (TRN / title).with_suffix(".txt").read_text(encoding="utf-8")
         write_episode_nfo(media_path, meta, ttxt)
+        try:
+            save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
+        except Exception:
+            pass
     except Exception as e:
         print(f"[post] NFO write failed: {e}", flush=True)
 
-    # Optional: cleanup temporary WAV
+    # Cleanup temp WAVs
     try:
+        if wav_for_run != wav and wav_for_run.exists():
+            wav_for_run.unlink()
         if wav.exists():
             wav.unlink()
     except Exception:
         pass
 
+    # Remove partial checkpoints on success
+    if WHISPER_RESUME:
+        try:
+            pjson, ptxt = _partial_paths(title)
+            if pjson.exists(): pjson.unlink()
+            if ptxt.exists(): ptxt.unlink()
+        except Exception:
+            pass
+
+    # Final average speed over whole transcription
+    try:
+        total_elapsed = max(0.001, time.time() - start_wall)
+        avg_rtf = (dur / total_elapsed) if total_elapsed > 0 else 0.0
+        print(f"[whisper] avg speed ~{avg_rtf:0.2f}x (audio_seconds / wall_seconds)", flush=True)
+    except Exception:
+        pass
+    print(f"[whisper] finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s", flush=True)
     return base
 
 def index_meili(json_path: Path):
@@ -829,7 +1195,8 @@ def handle_local_file(path_str: str):
         index_meili(base.with_suffix(".json"))
         publish_to_openwebui([base.with_suffix(".txt")])
         try:
-            meta = {
+            # Use info.json (if present) to enrich metadata
+            fallback = {
                 "title": title,
                 "episode_title": title,
                 "show": p.parent.name,
@@ -839,8 +1206,14 @@ def handle_local_file(path_str: str):
                 "image": "",
                 "guid": "",
             }
+            meta = build_meta_from_sources(p, p.parent.name, fallback, ep=None)
             ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
             write_episode_nfo(p, meta, ttxt)
+            # Try to fetch and save artwork locally
+            try:
+                save_episode_artwork(meta.get("image"), p, meta.get("show"))
+            except Exception:
+                pass
         except Exception as e:
             print(f"[post] NFO write failed: {e}", flush=True)
         log({**info, **{"status": "done", "note": "used_existing_transcript"}})
@@ -855,7 +1228,8 @@ def handle_local_file(path_str: str):
         publish_to_openwebui([base.with_suffix(".txt")])
         try:
             data = json.loads((base.with_suffix(".json")).read_text(encoding="utf-8"))
-            meta = {
+            # Start with repo metadata, then enrich from yt-dlp info.json if any
+            meta_repo = {
                 "title": data.get("title") or title,
                 "episode_title": data.get("title") or title,
                 "show": data.get("show") or p.parent.name,
@@ -865,8 +1239,13 @@ def handle_local_file(path_str: str):
                 "image": data.get("image"),
                 "guid": data.get("guid") or data.get("id"),
             }
+            meta = build_meta_from_sources(p, p.parent.name, meta_repo, ep=None)
             ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
             write_episode_nfo(p, meta, ttxt)
+            try:
+                save_episode_artwork(meta.get("image"), p, meta.get("show"))
+            except Exception:
+                pass
         except Exception as e:
             print(f"[post] NFO write failed: {e}", flush=True)
         log({**info, **{"status": "done", "note": "reused_repo_transcript"}})
@@ -876,6 +1255,26 @@ def handle_local_file(path_str: str):
         base = transcribe(p)
         index_meili(base.with_suffix(".json"))
         publish_to_openwebui([base.with_suffix(".txt")])
+        try:
+            fallback = {
+                "title": title,
+                "episode_title": title,
+                "show": p.parent.name,
+                "description": "",
+                "pubdate": _extract_date_from_stem(title),
+                "duration_sec": media_duration_seconds(p),
+                "image": "",
+                "guid": "",
+            }
+            meta = build_meta_from_sources(p, p.parent.name, fallback, ep=None)
+            ttxt = (TRN / title).with_suffix(".txt").read_text(encoding="utf-8")
+            write_episode_nfo(p, meta, ttxt)
+            try:
+                save_episode_artwork(meta.get("image"), p, meta.get("show"))
+            except Exception:
+                pass
+        except Exception as e:
+            print(f"[post] NFO write failed: {e}", flush=True)
         log({**info, **{"status": "done"}})
     except Exception as e:
         log({"url": path_str, "status": "error", "error": str(e)})
@@ -913,6 +1312,45 @@ def handle_url(url: str):
         dest_dir.mkdir(parents=True, exist_ok=True)
         dest = dest_dir / sanitize(f.name)
         shutil.move(str(f), dest)
+        # Move companion files produced by yt-dlp (info.json, thumbnail, subtitles)
+        try:
+            companions = find_companion_files(f)
+            # info.json -> prefer "<dest.name>.info.json", fallback to "<dest.stem>.info.json"
+            if companions.get("info") and companions["info"].exists():
+                dest_info = dest.parent / f"{dest.name}.info.json"
+                try:
+                    shutil.move(str(companions["info"]), dest_info)
+                except Exception:
+                    # fallback naming without extension
+                    dest_info2 = dest.parent / f"{dest.stem}.info.json"
+                    try:
+                        shutil.move(str(companions['info']), dest_info2)
+                    except Exception:
+                        pass
+            # thumbnail -> "<dest>.jpg"
+            if companions.get("thumb") and companions["thumb"].exists():
+                try:
+                    shutil.move(str(companions["thumb"]), str(dest.with_suffix(".jpg")))
+                except Exception:
+                    pass
+            # subtitles -> preserve language suffix: "<dest.stem><suffix>"
+            for s in companions.get("subs", []):
+                if not s.exists():
+                    continue
+                suffix_tail = ""
+                s_name = s.name
+                f_stem = f.stem
+                if s_name.startswith(f_stem):
+                    suffix_tail = s_name[len(f_stem):] # includes leading dot if present
+                else:
+                    suffix_tail = s.suffix
+                dest_sub = dest.parent / f"{dest.stem}{suffix_tail}"
+                try:
+                    shutil.move(str(s), str(dest_sub))
+                except Exception:
+                    pass
+        except Exception:
+            pass
         info.update({"title": dest.stem, "uploader": uploader,
                      "date": (re.findall(r"\b(\d{8})\b", dest.stem)[0] if re.findall(r"\b(\d{8})\b", dest.stem) else ""),
                      "path": str(dest)})
@@ -937,34 +1375,28 @@ def handle_url(url: str):
         index_meili(base.with_suffix(".json"))
         publish_to_openwebui([base.with_suffix(".txt")])
         try:
-            if 'ep' in locals() and ep:
-                meta = {
-                    "title": ep.get("title"),
-                    "episode_title": ep.get("title"),
-                    "show": ep.get("podcast_title") or ep.get("feed_title") or ep.get("show") or uploader,
-                    "description": ep.get("description") or ep.get("content"),
-                    "pubdate": ep.get("pubdate"),
-                    "pubdate_iso": ep.get("date_iso"),
-                    "duration_sec": ep.get("duration_sec") or ep.get("duration") or media_duration_seconds(dest),
-                    "image": ep.get("image") or ep.get("image_url"),
-                    "guid": ep.get("guid"),
-                }
-            else:
-                meta = {
-                    "title": dest.stem,
-                    "episode_title": dest.stem,
-                    "show": uploader,
-                    "description": "",
-                    "pubdate": _extract_date_from_stem(dest.stem),
-                    "duration_sec": media_duration_seconds(dest),
-                    "image": "",
-                    "guid": "",
-                }
+            # Build metadata from RSS (if matched), yt-dlp info.json, and sensible fallbacks
+            fallback = {
+                "title": dest.stem,
+                "episode_title": dest.stem,
+                "show": uploader,
+                "description": "",
+                "pubdate": _extract_date_from_stem(dest.stem),
+                "duration_sec": media_duration_seconds(dest),
+                "image": "",
+                "guid": "",
+            }
+            meta = build_meta_from_sources(dest, uploader, fallback, ep if 'ep' in locals() else None)
             ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
             write_episode_nfo(dest, meta, ttxt)
+            # Save local artwork for Plex/Kodi from meta image url
+            try:
+                save_episode_artwork(meta.get("image"), dest, meta.get("show"))
+            except Exception:
+                pass
         except Exception as e:
             print(f"[post] NFO write failed: {e}", flush=True)
         log({**info, **{"status":"done"}})
     except Exception as e:
         log({"url": url, "status":"error", "error": str(e)})
         raise
@@ -40,6 +40,42 @@ services:
       TMP_ROOT: /tmpdl
       WHISPER_MODEL: large-v3
       WHISPER_PRECISION: int8
+      WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
+      WHISPER_RESUME: ${WHISPER_RESUME:-1}
+      WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
+      PYTHONPATH: /app
+      JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
+      JOB_TTL: ${JOB_TTL:-86400}
+      RESULT_TTL: ${RESULT_TTL:-86400}
+      FAILURE_TTL: ${FAILURE_TTL:-86400}
+    volumes:
+      - ${LIBRARY_HOST_DIR:-./library}:/library
+      - ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts
+      - ${TMP_HOST_DIR:-./tmp}:/tmpdl
+      - ${MODELS_HOST_DIR:-./models}:/root/.cache/huggingface
+    depends_on: [meili, redis]
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD-SHELL", "exit 0"]
+    extra_hosts:
+      - host.docker.internal:host-gateway
+
+  podx-worker-transcribe:
+    build: ./app
+    container_name: podx-worker-transcribe
+    command: ["rq", "worker", "-u", "redis://redis:6379/0", "transcribe"]
+    env_file: [.env]
+    environment:
+      MEILI_URL: http://meili:7700
+      REDIS_URL: redis://redis:6379/0
+      LIBRARY_ROOT: /library
+      TRANSCRIPT_ROOT: /transcripts
+      TMP_ROOT: /tmpdl
+      WHISPER_MODEL: large-v3
+      WHISPER_PRECISION: int8
+      WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
+      WHISPER_RESUME: ${WHISPER_RESUME:-1}
+      WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
       PYTHONPATH: /app
       JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
       JOB_TTL: ${JOB_TTL:-86400}
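Side note: with the second service consuming only the "transcribe" queue, jobs have to be enqueued onto that queue by name for it to pick them up. A minimal sketch with RQ; the dotted job path and sample arguments are assumptions (the actual enqueue call lives outside this diff), only the queue name and Redis URL come from the compose file above:

    from redis import Redis
    from rq import Queue

    # Same Redis instance and queue name the new worker listens on.
    q = Queue("transcribe", connection=Redis.from_url("redis://redis:6379/0"))
    # Hypothetical job path/argument for illustration only.
    q.enqueue("worker.handle_local_file", "/library/Podcasts/Show/episode.mp3", job_timeout=14400)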
@@ -90,7 +126,7 @@ services:
      # - COOKIE_FILE=/config/cookies.txt
      # Optional: yt-dlp options (JSON). Example enables Android client fallback
      # - YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}}}
-      - YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}},"extract_flat":"in_playlist","concurrent_fragment_downloads":1}
+      - YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}},"extract_flat":"in_playlist","concurrent_fragment_downloads":1,"writesubtitles":true,"writeautomaticsub":true,"subtitleslangs":["en.*"],"convertsubs":"srt","writeinfojson":true,"writethumbnail":true,"convertthumbnails":"jpg"}
     volumes:
       - ${LIBRARY_HOST_DIR:-./library}:/downloads
       # Optional cookies file on host → /config/cookies.txt inside container