Added the ability to refresh metadata

This commit is contained in:
2025-09-07 17:41:58 +02:00
parent bd846a8d9f
commit 399af7a205
2 changed files with 153 additions and 2 deletions

View File

@@ -1,6 +1,7 @@
import os
from typing import Set
import time
import signal
import sys
@@ -20,13 +21,19 @@ JOB_TTL = int(os.getenv("JOB_TTL", "86400")) # 24 hours
RESULT_TTL = int(os.getenv("RESULT_TTL", "86400")) # 24 hours
FAILURE_TTL = int(os.getenv("FAILURE_TTL", "86400")) # 24 hours
# Optional refresh of existing items to fetch metadata/subtitles/thumbnails.
# REFRESH_EXISTING is parsed case-insensitively: "0", "false", "no", "off" or
# an empty value disable the refresh; any other value (default "1") enables it.
REFRESH_EXISTING = os.getenv("REFRESH_EXISTING", "1").strip().lower() not in (
    "0",
    "false",
    "no",
    "off",
    "",
)
REFRESH_TTL = int(os.getenv("REFRESH_TTL", "21600"))  # 6 hours
REFRESH_FAILURE_TTL = int(os.getenv("REFRESH_FAILURE_TTL", "21600"))  # 6 hours
# Media types to track
MEDIA_EXT = {
".mp3", ".m4a", ".mp4", ".mkv", ".wav", ".flac", ".webm", ".ogg", ".opus"
}
# In-memory seen set to avoid re-enqueueing during a single run
_seen: set[str] = set()
_seen: Set[str] = set()
_seen_refresh: Set[str] = set()
def already_transcribed(p: Path) -> bool:
@@ -35,6 +42,28 @@ def already_transcribed(p: Path) -> bool:
return base_json.exists()
# Helper to decide when to refresh sidecars
def needs_refresh(p: Path) -> bool:
    """Return True when any sidecar file for media file *p* is missing.

    Sidecars are looked up in the same directory as *p*. The base name is the
    media file name with only its final extension removed, so dotted names
    such as ``My.Video.Title.mp4`` keep their full base name:

    - metadata: ``<base>.info.json``
    - subtitles: ``<base>.srt``, ``<base>.en.srt`` or any ``<base>.<tag>.srt``
    - thumbnail: ``<base>.jpg`` or ``<base>.png``

    Args:
        p: Path of the media file.

    Returns:
        True if the metadata file, every subtitle variant, or every thumbnail
        variant is absent; False when all three kinds are present.
    """
    from glob import escape as glob_escape

    base = p.stem

    def sidecar(suffix: str) -> Path:
        # Append to the base name rather than chaining with_suffix(): a second
        # with_suffix() would replace the last dotted component of the base
        # (e.g. "a.b.mp4" -> "a.info.json" instead of "a.b.info.json").
        return p.with_name(base + suffix)

    # Escape the base name so characters like "[" or "*" in a file name are
    # matched literally instead of being read as glob syntax.
    tagged_srt_pattern = glob_escape(base) + ".*.srt"
    has_any_srt = (
        sidecar(".srt").exists()
        or sidecar(".en.srt").exists()
        or any(p.parent.glob(tagged_srt_pattern))
    )

    missing_info = not sidecar(".info.json").exists()
    missing_subs = not has_any_srt
    missing_thumb = not (sidecar(".jpg").exists() or sidecar(".png").exists())
    return missing_info or missing_subs or missing_thumb
def iter_media_files(root: Path):
for path in root.rglob("*"):
if not path.is_file():
@@ -57,7 +86,23 @@ def enqueue_new_files():
continue
if already_transcribed(p):
_seen.add(key)
print(f"[scanner] Skip (already transcribed): {p}", flush=True)
if REFRESH_EXISTING and needs_refresh(p):
if key not in _seen_refresh:
# Ask worker to refresh metadata/subtitles/thumbnails without redownloading media
q.enqueue(
"worker.refresh_media",
key,
job_timeout=JOB_TIMEOUT,
ttl=REFRESH_TTL,
result_ttl=RESULT_TTL,
failure_ttl=REFRESH_FAILURE_TTL,
)
_seen_refresh.add(key)
print(f"[scanner] Refresh enqueued: {p}", flush=True)
else:
print(f"[scanner] Skip (already queued refresh): {p}", flush=True)
else:
print(f"[scanner] Skip (already transcribed): {p}", flush=True)
continue
# Enqueue the worker to process this local file (with generous timeouts)
q.enqueue(