Added the ability to refresh metadata

2025-09-07 17:41:58 +02:00
parent bd846a8d9f
commit 399af7a205
2 changed files with 153 additions and 2 deletions

View File

@@ -1,6 +1,7 @@
import os
from typing import Set
import time
import signal
import sys
@@ -20,13 +21,19 @@ JOB_TTL = int(os.getenv("JOB_TTL", "86400")) # 24 hours
RESULT_TTL = int(os.getenv("RESULT_TTL", "86400")) # 24 hours
FAILURE_TTL = int(os.getenv("FAILURE_TTL", "86400")) # 24 hours
# Optional refresh of existing items to fetch metadata/subtitles/thumbnails
REFRESH_EXISTING = os.getenv("REFRESH_EXISTING", "1").strip() not in ("0", "false", "False", "")
REFRESH_TTL = int(os.getenv("REFRESH_TTL", "21600")) # 6 hours
REFRESH_FAILURE_TTL = int(os.getenv("REFRESH_FAILURE_TTL", "21600"))
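# Illustrative examples (not part of this diff): REFRESH_EXISTING=0 disables
# refresh scans entirely; REFRESH_TTL=3600 would discard a refresh job that
# sits unstarted in the queue for more than an hour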
# Media types to track
MEDIA_EXT = {
".mp3", ".m4a", ".mp4", ".mkv", ".wav", ".flac", ".webm", ".ogg", ".opus"
}
# In-memory seen set to avoid re-enqueueing during a single run
_seen: Set[str] = set()
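# Same-run guard for refresh jobs, mirroring _seen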
_seen_refresh: Set[str] = set()
def already_transcribed(p: Path) -> bool:
@@ -35,6 +42,28 @@ def already_transcribed(p: Path) -> bool:
    return base_json.exists()
# Helper to decide when to refresh sidecars
def needs_refresh(p: Path) -> bool:
"""
Decide whether to refresh sidecars for a media file:
- If metadata (*.info.json) is missing
- If no subtitle SRT is present next to media (either .srt or .en.srt)
- If no thumbnail JPG/PNG is present next to media
"""
stem = p.with_suffix("")
info_json = stem.with_suffix(".info.json")
# Accept any language-suffixed SRT as well
srt_plain = stem.with_suffix(".srt")
srt_en = p.with_suffix(".en.srt")
has_any_srt = srt_plain.exists() or srt_en.exists() or any(p.parent.glob(p.stem + ".*.srt"))
thumb_jpg = stem.with_suffix(".jpg")
thumb_png = stem.with_suffix(".png")
missing_info = not info_json.exists()
missing_subs = not has_any_srt
missing_thumb = not (thumb_jpg.exists() or thumb_png.exists())
return missing_info or missing_subs or missing_thumb
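# Illustrative example: for /library/Show/ep1.mp4, needs_refresh() returns
# False only when ep1.info.json, some ep1 SRT, and ep1.jpg/.png all exist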
def iter_media_files(root: Path):
    for path in root.rglob("*"):
        if not path.is_file():
@@ -57,7 +86,23 @@ def enqueue_new_files():
            continue
        if already_transcribed(p):
            _seen.add(key)
print(f"[scanner] Skip (already transcribed): {p}", flush=True)
            if REFRESH_EXISTING and needs_refresh(p):
                if key not in _seen_refresh:
                    # Ask worker to refresh metadata/subtitles/thumbnails
                    # without re-downloading the media itself
                    q.enqueue(
                        "worker.refresh_media",
                        key,
                        job_timeout=JOB_TIMEOUT,
                        ttl=REFRESH_TTL,
                        result_ttl=RESULT_TTL,
                        failure_ttl=REFRESH_FAILURE_TTL,
                    )
                    _seen_refresh.add(key)
                    print(f"[scanner] Refresh enqueued: {p}", flush=True)
                else:
                    print(f"[scanner] Skip (already queued refresh): {p}", flush=True)
            else:
                print(f"[scanner] Skip (already transcribed): {p}", flush=True)
            continue
        # Enqueue the worker to process this local file (with generous timeouts)
        q.enqueue(

View File

@@ -1280,6 +1280,112 @@ def handle_local_file(path_str: str):
log({"url": path_str, "status": "error", "error": str(e)})
raise
# --- Refresh sidecar metadata and subtitles for an already-downloaded media file ---
def refresh_media(path_str: str):
"""
Refresh sidecar metadata (info.json, thumbnail) and subtitles for an already-downloaded media file.
Requires a companion .info.json next to the media (to supply the original URL). No media re-download.
"""
try:
p = Path(path_str)
if not p.exists() or not p.is_file():
log({"url": path_str, "status": "error", "error": "file_not_found"})
return
# Locate existing info.json to get the original URL
info_json = None
for cand in [p.parent / f"{p.name}.info.json", p.parent / f"{p.stem}.info.json"]:
if cand.exists():
info_json = cand
break
if not info_json:
log({"path": str(p), "status": "refresh-skip", "reason": "no_info_json"})
print(f"[refresh] skip: no info.json next to {p}", flush=True)
return
info = load_info_json(info_json) or {}
url = info.get("webpage_url") or info.get("original_url") or info.get("url")
if not url:
log({"path": str(p), "status": "refresh-skip", "reason": "no_url_in_info"})
print(f"[refresh] skip: no URL in {info_json}", flush=True)
return
# Prepare yt-dlp command to refresh sidecars only, writing files exactly next to the media
outtmpl = str(p.with_suffix(".%(ext)s"))
sub_langs = os.getenv("YTDLP_SUBS_LANGS", "en.*,en")
cmd = [
"yt-dlp",
"--skip-download",
"--write-info-json",
"--write-thumbnail",
"--convert-thumbnails", "jpg",
"--write-subs", "--write-auto-subs",
"--sub-langs", sub_langs,
"--convert-subs", "srt",
"-o", outtmpl,
url,
]
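        # For reference, the equivalent CLI call (illustrative paths, with
        # this repo's default sub-langs) is roughly:
        #   yt-dlp --skip-download --write-info-json --write-thumbnail \
        #     --convert-thumbnails jpg --write-subs --write-auto-subs \
        #     --sub-langs "en.*,en" --convert-subs srt \
        #     -o "/library/Show/episode.%(ext)s" URL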
print(f"[refresh] refreshing sidecars for {p} via yt-dlp", flush=True)
try:
subprocess.check_call(cmd)
except subprocess.CalledProcessError as e:
print(f"[refresh] yt-dlp failed: {e}", flush=True)
raise
# Ensure language-suffixed SRT exists (Plex-friendly) if any subs were fetched
try:
# Pick any .srt just fetched that matches base
for s in p.parent.glob(f"{p.stem}*.srt"):
# If it's already lang-suffixed, keep; also copy to .en.srt when only plain .srt exists
if s.name == f"{p.stem}.srt":
shutil.copy2(s, p.with_suffix(".en.srt"))
except Exception:
pass
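        # Plex picks up local sidecar subtitles named <media>.<lang>.srt next
        # to the media file, which is why .en.srt is preferred here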
        # Rebuild NFO using fresh info.json (and RSS if available)
        try:
            # Try RSS match to enrich metadata (non-fatal if not present)
            ep = None
            try:
                ep = match_media_to_rss(p)
            except Exception:
                ep = None
            fallback = {
                "title": p.stem,
                "episode_title": p.stem,
                "show": p.parent.name,
                "description": "",
                "pubdate": _extract_date_from_stem(p.stem),
                "duration_sec": media_duration_seconds(p),
                "image": "",
                "guid": "",
            }
            meta = build_meta_from_sources(p, p.parent.name, fallback, ep)
            # Save local artwork too
            try:
                save_episode_artwork(meta.get("image"), p, meta.get("show"))
            except Exception:
                pass
            # If a transcript already exists, include it in the NFO plot preview
            ttxt_path = (TRN / p.stem).with_suffix(".txt")
            ttxt = ttxt_path.read_text(encoding="utf-8") if ttxt_path.exists() else None
            write_episode_nfo(p, meta, ttxt)
        except Exception as e:
            print(f"[refresh] NFO/artwork update failed: {e}", flush=True)
        log({"path": str(p), "status": "refresh-done"})
        print(f"[refresh] done for {p}", flush=True)
    except Exception as e:
        log({"path": path_str, "status": "error", "error": str(e)})
        raise
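# Manual test sketch (assumptions: Redis reachable on localhost and the
# scanner's default RQ queue; adjust names to this repo's actual setup):
#   from redis import Redis
#   from rq import Queue
#   Queue(connection=Redis()).enqueue(
#       "worker.refresh_media", "/library/Show/episode.mp4",
#       job_timeout=3600, ttl=21600,
#   )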
def handle_web(url: str):
info = {"url": url, "status":"web-downloading", "title":"", "uploader":"", "date":"", "path":""}
log(info)