Added the ability to refresh metadata, subtitles, and thumbnails for existing media
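When the scanner finds media that is already transcribed, it now checks whether the sidecar files (.info.json, an SRT subtitle, a JPG/PNG thumbnail) are present; if any are missing and REFRESH_EXISTING is enabled, it enqueues a worker.refresh_media job, de-duplicated per run via _seen_refresh. The worker re-runs yt-dlp with --skip-download against the URL stored in the existing .info.json to re-fetch those sidecars, then rebuilds the NFO and artwork. (The _seen annotation also switches from set[str] to typing.Set[str], presumably for pre-3.9 Python compatibility.)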
@@ -1,6 +1,7 @@
 import os
+from typing import Set
 import time
 import signal
 import sys
@@ -20,13 +21,19 @@ JOB_TTL = int(os.getenv("JOB_TTL", "86400")) # 24 hours
 RESULT_TTL = int(os.getenv("RESULT_TTL", "86400")) # 24 hours
 FAILURE_TTL = int(os.getenv("FAILURE_TTL", "86400")) # 24 hours
 
+# Optional refresh of existing items to fetch metadata/subtitles/thumbnails
+REFRESH_EXISTING = os.getenv("REFRESH_EXISTING", "1").strip() not in ("0", "false", "False", "")
+REFRESH_TTL = int(os.getenv("REFRESH_TTL", "21600")) # 6 hours
+REFRESH_FAILURE_TTL = int(os.getenv("REFRESH_FAILURE_TTL", "21600"))
+
 # Media types to track
 MEDIA_EXT = {
     ".mp3", ".m4a", ".mp4", ".mkv", ".wav", ".flac", ".webm", ".ogg", ".opus"
 }
 
 # In-memory seen set to avoid re-enqueueing during a single run
-_seen: set[str] = set()
+_seen: Set[str] = set()
+_seen_refresh: Set[str] = set()
 
 
 def already_transcribed(p: Path) -> bool:
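A quick sanity check of the REFRESH_EXISTING parsing rule above (the expression is copied from the diff; the small harness around it is mine). Note that any unrecognized value, such as "no" or "TRUE", also counts as enabled:

    def parse_refresh_existing(raw: str) -> bool:
        # Same expression the scanner uses: anything other than "0", "false",
        # "False", or an empty/whitespace-only value enables refresh.
        return raw.strip() not in ("0", "false", "False", "")

    for raw in ("1", "true", "0", "false", "False", "", "   "):
        print(f"{raw!r} -> {parse_refresh_existing(raw)}")
    # "1"/"true" -> True; "0"/"false"/"False"/""/"   " -> False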
@@ -35,6 +42,28 @@ def already_transcribed(p: Path) -> bool:
     return base_json.exists()
 
 
+# Helper to decide when to refresh sidecars
+def needs_refresh(p: Path) -> bool:
+    """
+    Decide whether to refresh sidecars for a media file:
+    - If metadata (*.info.json) is missing
+    - If no subtitle SRT is present next to the media (either .srt or .en.srt)
+    - If no thumbnail JPG/PNG is present next to the media
+    """
+    stem = p.with_suffix("")
+    info_json = stem.with_suffix(".info.json")
+    # Accept any language-suffixed SRT as well
+    srt_plain = stem.with_suffix(".srt")
+    srt_en = p.with_suffix(".en.srt")
+    has_any_srt = srt_plain.exists() or srt_en.exists() or any(p.parent.glob(p.stem + ".*.srt"))
+    thumb_jpg = stem.with_suffix(".jpg")
+    thumb_png = stem.with_suffix(".png")
+    missing_info = not info_json.exists()
+    missing_subs = not has_any_srt
+    missing_thumb = not (thumb_jpg.exists() or thumb_png.exists())
+    return missing_info or missing_subs or missing_thumb
+
+
 def iter_media_files(root: Path):
     for path in root.rglob("*"):
         if not path.is_file():
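A minimal way to exercise needs_refresh against a scratch directory (a sketch: it assumes the scanner module is importable as scanner, which the diff does not confirm):

    import tempfile
    from pathlib import Path

    from scanner import needs_refresh  # assumed module name

    with tempfile.TemporaryDirectory() as d:
        media = Path(d) / "episode.mp3"
        media.touch()
        print(needs_refresh(media))   # True: no sidecars yet
        (Path(d) / "episode.info.json").touch()
        (Path(d) / "episode.en.srt").touch()
        (Path(d) / "episode.jpg").touch()
        print(needs_refresh(media))   # False: info.json, an SRT, and a thumbnail all present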
@@ -57,7 +86,23 @@ def enqueue_new_files():
             continue
         if already_transcribed(p):
             _seen.add(key)
-            print(f"[scanner] Skip (already transcribed): {p}", flush=True)
+            if REFRESH_EXISTING and needs_refresh(p):
+                if key not in _seen_refresh:
+                    # Ask worker to refresh metadata/subtitles/thumbnails without redownloading media
+                    q.enqueue(
+                        "worker.refresh_media",
+                        key,
+                        job_timeout=JOB_TIMEOUT,
+                        ttl=REFRESH_TTL,
+                        result_ttl=RESULT_TTL,
+                        failure_ttl=REFRESH_FAILURE_TTL,
+                    )
+                    _seen_refresh.add(key)
+                    print(f"[scanner] Refresh enqueued: {p}", flush=True)
+                else:
+                    print(f"[scanner] Skip (already queued refresh): {p}", flush=True)
+            else:
+                print(f"[scanner] Skip (already transcribed): {p}", flush=True)
             continue
         # Enqueue the worker to process this local file (with generous timeouts)
         q.enqueue(
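For context, the q.enqueue call above passes the job function as a dotted string, which RQ resolves on the worker side, so the scanner never has to import worker.py. A sketch of the setup this assumes (the queue name, Redis connection, and literal values stand in for the real config):

    from redis import Redis
    from rq import Queue

    q = Queue("default", connection=Redis())  # queue name is an assumption

    job = q.enqueue(
        "worker.refresh_media",
        "/library/Show/episode.mp3",  # hypothetical media path (the scanner passes `key`)
        job_timeout=3600,             # stands in for JOB_TIMEOUT
        ttl=21600,                    # REFRESH_TTL: discard the job if not started within 6 h
        result_ttl=86400,             # RESULT_TTL
        failure_ttl=21600,            # REFRESH_FAILURE_TTL
    )
    print(job.id)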
app/worker.py (+106 lines)
@@ -1280,6 +1280,112 @@ def handle_local_file(path_str: str):
         log({"url": path_str, "status": "error", "error": str(e)})
         raise
 
+
+# --- Refresh sidecar metadata and subtitles for an already-downloaded media file ---
+def refresh_media(path_str: str):
+    """
+    Refresh sidecar metadata (info.json, thumbnail) and subtitles for an already-downloaded media file.
+    Requires a companion .info.json next to the media (to supply the original URL). No media re-download.
+    """
+    try:
+        p = Path(path_str)
+        if not p.exists() or not p.is_file():
+            log({"url": path_str, "status": "error", "error": "file_not_found"})
+            return
+
+        # Locate existing info.json to get the original URL
+        info_json = None
+        for cand in [p.parent / f"{p.name}.info.json", p.parent / f"{p.stem}.info.json"]:
+            if cand.exists():
+                info_json = cand
+                break
+
+        if not info_json:
+            log({"path": str(p), "status": "refresh-skip", "reason": "no_info_json"})
+            print(f"[refresh] skip: no info.json next to {p}", flush=True)
+            return
+
+        info = load_info_json(info_json) or {}
+        url = info.get("webpage_url") or info.get("original_url") or info.get("url")
+        if not url:
+            log({"path": str(p), "status": "refresh-skip", "reason": "no_url_in_info"})
+            print(f"[refresh] skip: no URL in {info_json}", flush=True)
+            return
+
+        # Prepare yt-dlp command to refresh sidecars only, writing files exactly next to the media
+        outtmpl = str(p.with_suffix(".%(ext)s"))
+        sub_langs = os.getenv("YTDLP_SUBS_LANGS", "en.*,en")
+
+        cmd = [
+            "yt-dlp",
+            "--skip-download",
+            "--write-info-json",
+            "--write-thumbnail",
+            "--convert-thumbnails", "jpg",
+            "--write-subs", "--write-auto-subs",
+            "--sub-langs", sub_langs,
+            "--convert-subs", "srt",
+            "-o", outtmpl,
+            url,
+        ]
+
+        print(f"[refresh] refreshing sidecars for {p} via yt-dlp", flush=True)
+        try:
+            subprocess.check_call(cmd)
+        except subprocess.CalledProcessError as e:
+            print(f"[refresh] yt-dlp failed: {e}", flush=True)
+            raise
+
+        # Ensure a language-suffixed SRT exists (Plex-friendly) if any subs were fetched
+        try:
+            # Pick any .srt just fetched that matches the base name
+            for s in p.parent.glob(f"{p.stem}*.srt"):
+                # If it's already lang-suffixed, keep it; copy a plain .srt to .en.srt
+                if s.name == f"{p.stem}.srt":
+                    shutil.copy2(s, p.with_suffix(".en.srt"))
+        except Exception:
+            pass
+
+        # Rebuild NFO using the fresh info.json (and RSS if available)
+        try:
+            # Try an RSS match to enrich metadata (non-fatal if not present)
+            ep = None
+            try:
+                ep = match_media_to_rss(p)
+            except Exception:
+                ep = None
+
+            fallback = {
+                "title": p.stem,
+                "episode_title": p.stem,
+                "show": p.parent.name,
+                "description": "",
+                "pubdate": _extract_date_from_stem(p.stem),
+                "duration_sec": media_duration_seconds(p),
+                "image": "",
+                "guid": "",
+            }
+            meta = build_meta_from_sources(p, p.parent.name, fallback, ep)
+            # Save local artwork too
+            try:
+                save_episode_artwork(meta.get("image"), p, meta.get("show"))
+            except Exception:
+                pass
+
+            # If a transcript already exists, include it in the NFO plot preview
+            ttxt_path = (TRN / p.stem).with_suffix(".txt")
+            ttxt = ttxt_path.read_text(encoding="utf-8") if ttxt_path.exists() else None
+            write_episode_nfo(p, meta, ttxt)
+        except Exception as e:
+            print(f"[refresh] NFO/artwork update failed: {e}", flush=True)
+
+        log({"path": str(p), "status": "refresh-done"})
+        print(f"[refresh] done for {p}", flush=True)
+
+    except Exception as e:
+        log({"path": path_str, "status": "error", "error": str(e)})
+        raise
+
 def handle_web(url: str):
     info = {"url": url, "status":"web-downloading", "title":"", "uploader":"", "date":"", "path":""}
     log(info)
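refresh_media leans on helpers defined elsewhere in worker.py that this hunk does not show (load_info_json, match_media_to_rss, build_meta_from_sources, write_episode_nfo, save_episode_artwork). For reference, a minimal stand-in for load_info_json consistent with the `load_info_json(info_json) or {}` call above; the real implementation may differ:

    import json
    from pathlib import Path
    from typing import Optional

    def load_info_json(path: Path) -> Optional[dict]:
        # Return the parsed yt-dlp .info.json, or None if it is missing or
        # corrupt, letting callers fall back with `load_info_json(p) or {}`.
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except (OSError, json.JSONDecodeError):
            return None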