From 399af7a205551d2f732c6a69185bdf6b21d74a5a Mon Sep 17 00:00:00 2001
From: Tomas Kracmar
Date: Sun, 7 Sep 2025 17:41:58 +0200
Subject: [PATCH] Added the ability to refresh metadata

---
 app/scanner.py |  49 ++++++++++++++++++++++-
 app/worker.py  | 106 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 153 insertions(+), 2 deletions(-)

diff --git a/app/scanner.py b/app/scanner.py
index 24ab518..984acc4 100644
--- a/app/scanner.py
+++ b/app/scanner.py
@@ -1,6 +1,7 @@
 import os
+from typing import Set
 import time
 import signal
 import sys
 
 
@@ -20,13 +21,19 @@
 JOB_TTL = int(os.getenv("JOB_TTL", "86400")) # 24 hours
 RESULT_TTL = int(os.getenv("RESULT_TTL", "86400")) # 24 hours
 FAILURE_TTL = int(os.getenv("FAILURE_TTL", "86400")) # 24 hours
 
+# Optional refresh of existing items to fetch metadata/subtitles/thumbnails
+REFRESH_EXISTING = os.getenv("REFRESH_EXISTING", "1").strip() not in ("0", "false", "False", "")
+REFRESH_TTL = int(os.getenv("REFRESH_TTL", "21600")) # 6 hours
+REFRESH_FAILURE_TTL = int(os.getenv("REFRESH_FAILURE_TTL", "21600"))
+
 # Media types to track
 MEDIA_EXT = {
     ".mp3", ".m4a", ".mp4", ".mkv", ".wav", ".flac", ".webm", ".ogg", ".opus"
 }
 
 # In-memory seen set to avoid re-enqueueing during a single run
-_seen: set[str] = set()
+_seen: Set[str] = set()
+_seen_refresh: Set[str] = set()
 
 def already_transcribed(p: Path) -> bool:
@@ -35,6 +42,28 @@ def already_transcribed(p: Path) -> bool:
     return base_json.exists()
 
 
+# Helper to decide when to refresh sidecars
+def needs_refresh(p: Path) -> bool:
+    """
+    Decide whether to refresh sidecars for a media file:
+    - If metadata (*.info.json) is missing
+    - If no subtitle SRT is present next to media (either .srt or .en.srt)
+    - If no thumbnail JPG/PNG is present next to media
+    """
+    stem = p.with_suffix("")
+    info_json = stem.with_suffix(".info.json")
+    # Accept any language-suffixed SRT as well
+    srt_plain = stem.with_suffix(".srt")
+    srt_en = p.with_suffix(".en.srt")
+    has_any_srt = srt_plain.exists() or srt_en.exists() or any(p.parent.glob(p.stem + ".*.srt"))
+    thumb_jpg = stem.with_suffix(".jpg")
+    thumb_png = stem.with_suffix(".png")
+    missing_info = not info_json.exists()
+    missing_subs = not has_any_srt
+    missing_thumb = not (thumb_jpg.exists() or thumb_png.exists())
+    return missing_info or missing_subs or missing_thumb
+
+
 def iter_media_files(root: Path):
     for path in root.rglob("*"):
         if not path.is_file():
@@ -57,7 +86,23 @@ def enqueue_new_files():
             continue
         if already_transcribed(p):
             _seen.add(key)
-            print(f"[scanner] Skip (already transcribed): {p}", flush=True)
+            if REFRESH_EXISTING and needs_refresh(p):
+                if key not in _seen_refresh:
+                    # Ask worker to refresh metadata/subtitles/thumbnails without redownloading media
+                    q.enqueue(
+                        "worker.refresh_media",
+                        key,
+                        job_timeout=JOB_TIMEOUT,
+                        ttl=REFRESH_TTL,
+                        result_ttl=RESULT_TTL,
+                        failure_ttl=REFRESH_FAILURE_TTL,
+                    )
+                    _seen_refresh.add(key)
+                    print(f"[scanner] Refresh enqueued: {p}", flush=True)
+                else:
+                    print(f"[scanner] Skip (already queued refresh): {p}", flush=True)
+            else:
+                print(f"[scanner] Skip (already transcribed): {p}", flush=True)
             continue
         # Enqueue the worker to process this local file (with generous timeouts)
         q.enqueue(
diff --git a/app/worker.py b/app/worker.py
index 6e051ac..f6a0226 100644
--- a/app/worker.py
+++ b/app/worker.py
@@ -1280,6 +1280,112 @@ def handle_local_file(path_str: str):
         log({"url": path_str, "status": "error", "error": str(e)})
         raise
+
+# --- Refresh sidecar metadata and subtitles for an already-downloaded media file ---
+def refresh_media(path_str: str):
+    """
+    Refresh sidecar metadata (info.json, thumbnail) and subtitles for an already-downloaded media file.
+    Requires a companion .info.json next to the media (to supply the original URL). No media re-download.
+    """
+    try:
+        p = Path(path_str)
+        if not p.exists() or not p.is_file():
+            log({"url": path_str, "status": "error", "error": "file_not_found"})
+            return
+
+        # Locate existing info.json to get the original URL
+        info_json = None
+        for cand in [p.parent / f"{p.name}.info.json", p.parent / f"{p.stem}.info.json"]:
+            if cand.exists():
+                info_json = cand
+                break
+
+        if not info_json:
+            log({"path": str(p), "status": "refresh-skip", "reason": "no_info_json"})
+            print(f"[refresh] skip: no info.json next to {p}", flush=True)
+            return
+
+        info = load_info_json(info_json) or {}
+        url = info.get("webpage_url") or info.get("original_url") or info.get("url")
+        if not url:
+            log({"path": str(p), "status": "refresh-skip", "reason": "no_url_in_info"})
+            print(f"[refresh] skip: no URL in {info_json}", flush=True)
+            return
+
+        # Prepare yt-dlp command to refresh sidecars only, writing files exactly next to the media
+        outtmpl = str(p.with_suffix(".%(ext)s"))
+        sub_langs = os.getenv("YTDLP_SUBS_LANGS", "en.*,en")
+
+        cmd = [
+            "yt-dlp",
+            "--skip-download",
+            "--write-info-json",
+            "--write-thumbnail",
+            "--convert-thumbnails", "jpg",
+            "--write-subs", "--write-auto-subs",
+            "--sub-langs", sub_langs,
+            "--convert-subs", "srt",
+            "-o", outtmpl,
+            url,
+        ]
+
+        print(f"[refresh] refreshing sidecars for {p} via yt-dlp", flush=True)
+        try:
+            subprocess.check_call(cmd)
+        except subprocess.CalledProcessError as e:
+            print(f"[refresh] yt-dlp failed: {e}", flush=True)
+            raise
+
+        # Ensure language-suffixed SRT exists (Plex-friendly) if any subs were fetched
+        try:
+            # Pick any .srt just fetched that matches base
+            for s in p.parent.glob(f"{p.stem}*.srt"):
+                # If it's already lang-suffixed, keep; also copy to .en.srt when only plain .srt exists
+                if s.name == f"{p.stem}.srt":
+                    shutil.copy2(s, p.with_suffix(".en.srt"))
+        except Exception:
+            pass
+
+        # Rebuild NFO using fresh info.json (and RSS if available)
+        try:
+            # Try RSS match to enrich metadata (non-fatal if not present)
+            ep = None
+            try:
+                ep = match_media_to_rss(p)
+            except Exception:
+                ep = None
+
+            fallback = {
+                "title": p.stem,
+                "episode_title": p.stem,
+                "show": p.parent.name,
+                "description": "",
+                "pubdate": _extract_date_from_stem(p.stem),
+                "duration_sec": media_duration_seconds(p),
+                "image": "",
+                "guid": "",
+            }
+            meta = build_meta_from_sources(p, p.parent.name, fallback, ep)
+            # Save local artwork too
+            try:
+                save_episode_artwork(meta.get("image"), p, meta.get("show"))
+            except Exception:
+                pass
+
+            # If a transcript already exists, include it in the NFO plot preview
+            ttxt_path = (TRN / p.stem).with_suffix(".txt")
+            ttxt = ttxt_path.read_text(encoding="utf-8") if ttxt_path.exists() else None
+            write_episode_nfo(p, meta, ttxt)
+        except Exception as e:
+            print(f"[refresh] NFO/artwork update failed: {e}", flush=True)
+
+        log({"path": str(p), "status": "refresh-done"})
+        print(f"[refresh] done for {p}", flush=True)
+
+    except Exception as e:
+        log({"path": path_str, "status": "error", "error": str(e)})
+        raise
+
 
 def handle_web(url: str):
     info = {"url": url, "status":"web-downloading", "title":"", "uploader":"", "date":"", "path":""}
     log(info)
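
Usage note: a minimal sketch of how the new refresh job can be enqueued by hand through RQ, mirroring the scanner's q.enqueue call in this patch. The Redis URL, queue name, media path, and job_timeout below are assumptions for illustration; only the dotted job path "worker.refresh_media" and the TTL defaults come from the patch itself.

# Hypothetical one-off trigger for the refresh job via RQ.
# REDIS_URL, RQ_QUEUE, the media path, and job_timeout are assumptions;
# "worker.refresh_media" and the TTL values mirror this patch.
import os

from redis import Redis
from rq import Queue

redis_conn = Redis.from_url(os.getenv("REDIS_URL", "redis://localhost:6379/0"))
q = Queue(os.getenv("RQ_QUEUE", "default"), connection=redis_conn)

job = q.enqueue(
    "worker.refresh_media",
    "/data/media/Show Name/episode.mp3",  # hypothetical already-downloaded file
    job_timeout=3600,                     # assumption; the scanner passes JOB_TIMEOUT
    ttl=21600,                            # REFRESH_TTL default (6 hours)
    result_ttl=86400,                     # RESULT_TTL default (24 hours)
    failure_ttl=21600,                    # REFRESH_FAILURE_TTL default
)
print("enqueued", job.id)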