From 399af7a205551d2f732c6a69185bdf6b21d74a5a Mon Sep 17 00:00:00 2001
From: Tomas Kracmar
Date: Sun, 7 Sep 2025 17:41:58 +0200
Subject: [PATCH] Added the ability to refresh metadata

---
 app/scanner.py |  49 ++++++++++++++++++++++-
 app/worker.py  | 106 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 153 insertions(+), 2 deletions(-)

diff --git a/app/scanner.py b/app/scanner.py
index 24ab518..984acc4 100644
--- a/app/scanner.py
+++ b/app/scanner.py
@@ -1,6 +1,7 @@
 import os
+from typing import Set
 import time
 import signal
 import sys
 
 
@@ -20,13 +21,19 @@
 JOB_TTL = int(os.getenv("JOB_TTL", "86400")) # 24 hours
 RESULT_TTL = int(os.getenv("RESULT_TTL", "86400")) # 24 hours
 FAILURE_TTL = int(os.getenv("FAILURE_TTL", "86400")) # 24 hours
 
+# Optional refresh of existing items to fetch metadata/subtitles/thumbnails
+REFRESH_EXISTING = os.getenv("REFRESH_EXISTING", "1").strip() not in ("0", "false", "False", "")
+REFRESH_TTL = int(os.getenv("REFRESH_TTL", "21600")) # 6 hours
+REFRESH_FAILURE_TTL = int(os.getenv("REFRESH_FAILURE_TTL", "21600"))
+
 # Media types to track
 MEDIA_EXT = {
     ".mp3", ".m4a", ".mp4", ".mkv", ".wav", ".flac", ".webm", ".ogg", ".opus"
 }
 
 # In-memory seen set to avoid re-enqueueing during a single run
-_seen: set[str] = set()
+_seen: Set[str] = set()
+_seen_refresh: Set[str] = set()
 
 def already_transcribed(p: Path) -> bool:
@@ -35,6 +42,28 @@ def already_transcribed(p: Path) -> bool:
     return base_json.exists()
 
 
+# Helper to decide when to refresh sidecars
+def needs_refresh(p: Path) -> bool:
+    """
+    Decide whether to refresh sidecars for a media file:
+    - If metadata (*.info.json) is missing
+    - If no subtitle SRT is present next to media (either .srt or .en.srt)
+    - If no thumbnail JPG/PNG is present next to media
+    """
+    stem = p.with_suffix("")
+    info_json = stem.with_suffix(".info.json")
+    # Accept any language-suffixed SRT as well
+    srt_plain = stem.with_suffix(".srt")
+    srt_en = p.with_suffix(".en.srt")
+    has_any_srt = srt_plain.exists() or srt_en.exists() or any(p.parent.glob(p.stem + ".*.srt"))
+    thumb_jpg = stem.with_suffix(".jpg")
+    thumb_png = stem.with_suffix(".png")
+    missing_info = not info_json.exists()
+    missing_subs = not has_any_srt
+    missing_thumb = not (thumb_jpg.exists() or thumb_png.exists())
+    return missing_info or missing_subs or missing_thumb
+
+
 def iter_media_files(root: Path):
     for path in root.rglob("*"):
         if not path.is_file():
@@ -57,7 +86,23 @@ def enqueue_new_files():
             continue
         if already_transcribed(p):
             _seen.add(key)
-            print(f"[scanner] Skip (already transcribed): {p}", flush=True)
+            if REFRESH_EXISTING and needs_refresh(p):
+                if key not in _seen_refresh:
+                    # Ask worker to refresh metadata/subtitles/thumbnails without redownloading media
+                    q.enqueue(
+                        "worker.refresh_media",
+                        key,
+                        job_timeout=JOB_TIMEOUT,
+                        ttl=REFRESH_TTL,
+                        result_ttl=RESULT_TTL,
+                        failure_ttl=REFRESH_FAILURE_TTL,
+                    )
+                    _seen_refresh.add(key)
+                    print(f"[scanner] Refresh enqueued: {p}", flush=True)
+                else:
+                    print(f"[scanner] Skip (already queued refresh): {p}", flush=True)
+            else:
+                print(f"[scanner] Skip (already transcribed): {p}", flush=True)
             continue
         # Enqueue the worker to process this local file (with generous timeouts)
         q.enqueue(
diff --git a/app/worker.py b/app/worker.py
index 6e051ac..f6a0226 100644
--- a/app/worker.py
+++ b/app/worker.py
@@ -1280,6 +1280,112 @@ def handle_local_file(path_str: str):
         log({"url": path_str, "status": "error", "error": str(e)})
         raise
+
+# --- Refresh sidecar metadata and subtitles for an already-downloaded media file ---
+def refresh_media(path_str: str):
+    """
+    Refresh sidecar metadata (info.json, thumbnail) and subtitles for an already-downloaded media file.
+    Requires a companion .info.json next to the media (to supply the original URL). No media re-download.
+    """
+    try:
+        p = Path(path_str)
+        if not p.exists() or not p.is_file():
+            log({"url": path_str, "status": "error", "error": "file_not_found"})
+            return
+
+        # Locate existing info.json to get the original URL
+        info_json = None
+        for cand in [p.parent / f"{p.name}.info.json", p.parent / f"{p.stem}.info.json"]:
+            if cand.exists():
+                info_json = cand
+                break
+
+        if not info_json:
+            log({"path": str(p), "status": "refresh-skip", "reason": "no_info_json"})
+            print(f"[refresh] skip: no info.json next to {p}", flush=True)
+            return
+
+        info = load_info_json(info_json) or {}
+        url = info.get("webpage_url") or info.get("original_url") or info.get("url")
+        if not url:
+            log({"path": str(p), "status": "refresh-skip", "reason": "no_url_in_info"})
+            print(f"[refresh] skip: no URL in {info_json}", flush=True)
+            return
+
+        # Prepare yt-dlp command to refresh sidecars only, writing files exactly next to the media
+        outtmpl = str(p.with_suffix(".%(ext)s"))
+        sub_langs = os.getenv("YTDLP_SUBS_LANGS", "en.*,en")
+
+        cmd = [
+            "yt-dlp",
+            "--skip-download",
+            "--write-info-json",
+            "--write-thumbnail",
+            "--convert-thumbnails", "jpg",
+            "--write-subs", "--write-auto-subs",
+            "--sub-langs", sub_langs,
+            "--convert-subs", "srt",
+            "-o", outtmpl,
+            url,
+        ]
+
+        print(f"[refresh] refreshing sidecars for {p} via yt-dlp", flush=True)
+        try:
+            subprocess.check_call(cmd)
+        except subprocess.CalledProcessError as e:
+            print(f"[refresh] yt-dlp failed: {e}", flush=True)
+            raise
+
+        # Ensure language-suffixed SRT exists (Plex-friendly) if any subs were fetched
+        try:
+            # Pick any .srt just fetched that matches base
+            for s in p.parent.glob(f"{p.stem}*.srt"):
+                # If it's already lang-suffixed, keep; also copy to .en.srt when only plain .srt exists
+                if s.name == f"{p.stem}.srt":
+                    shutil.copy2(s, p.with_suffix(".en.srt"))
+        except Exception:
+            pass
+
+        # Rebuild NFO using fresh info.json (and RSS if available)
+        try:
+            # Try RSS match to enrich metadata (non-fatal if not present)
+            ep = None
+            try:
+                ep = match_media_to_rss(p)
+            except Exception:
+                ep = None
+
+            fallback = {
+                "title": p.stem,
+                "episode_title": p.stem,
+                "show": p.parent.name,
+                "description": "",
+                "pubdate": _extract_date_from_stem(p.stem),
+                "duration_sec": media_duration_seconds(p),
+                "image": "",
+                "guid": "",
+            }
+            meta = build_meta_from_sources(p, p.parent.name, fallback, ep)
+            # Save local artwork too
+            try:
+                save_episode_artwork(meta.get("image"), p, meta.get("show"))
+            except Exception:
+                pass
+
+            # If a transcript already exists, include it in the NFO plot preview
+            ttxt_path = (TRN / p.stem).with_suffix(".txt")
+            ttxt = ttxt_path.read_text(encoding="utf-8") if ttxt_path.exists() else None
+            write_episode_nfo(p, meta, ttxt)
+        except Exception as e:
+            print(f"[refresh] NFO/artwork update failed: {e}", flush=True)
+
+        log({"path": str(p), "status": "refresh-done"})
+        print(f"[refresh] done for {p}", flush=True)
+
+    except Exception as e:
+        log({"path": path_str, "status": "error", "error": str(e)})
+        raise
+
 
 def handle_web(url: str):
     info = {"url": url, "status":"web-downloading", "title":"", "uploader":"", "date":"", "path":""}
     log(info)
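
Usage note: a minimal sketch of how the new refresh job can be enqueued by hand through RQ, mirroring the scanner's q.enqueue call in this patch. The Redis URL, queue name, media path, and job_timeout below are assumptions for illustration; only the dotted job path "worker.refresh_media" and the TTL defaults come from the patch itself.

# Hypothetical one-off trigger for the refresh job via RQ.
# REDIS_URL, RQ_QUEUE, the media path, and job_timeout are assumptions;
# "worker.refresh_media" and the TTL values mirror this patch.
import os

from redis import Redis
from rq import Queue

redis_conn = Redis.from_url(os.getenv("REDIS_URL", "redis://localhost:6379/0"))
q = Queue(os.getenv("RQ_QUEUE", "default"), connection=redis_conn)

job = q.enqueue(
    "worker.refresh_media",
    "/data/media/Show Name/episode.mp3",  # hypothetical already-downloaded file
    job_timeout=3600,                     # assumption; the scanner passes JOB_TIMEOUT
    ttl=21600,                            # REFRESH_TTL default (6 hours)
    result_ttl=86400,                     # RESULT_TTL default (24 hours)
    failure_ttl=21600,                    # REFRESH_FAILURE_TTL default
)
print("enqueued", job.id)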