Files
podx/app/scanner.py
2025-09-07 14:33:45 +02:00

107 lines
2.9 KiB
Python

import os
import time
import signal
import sys
from pathlib import Path
from redis import Redis
from rq import Queue
# ---------------------------------------------------------------------------
# Settings, all overridable through environment variables. The defaults are
# meant to line up with the docker-compose setup (per the original note).
# ---------------------------------------------------------------------------

# Directory scanned for media, and directory checked for finished transcripts.
LIB = Path(os.environ.get("LIBRARY_ROOT", "/library"))
TRN = Path(os.environ.get("TRANSCRIPT_ROOT", "/transcripts"))

# Redis instance backing the RQ queue.
REDIS_URL = os.environ.get("REDIS_URL", "redis://redis:6379/0")

# Pause between two scan passes, in seconds.
SCAN_INTERVAL = int(os.environ.get("SCAN_INTERVAL", "30"))

# Options handed to RQ for every enqueued job (all in seconds).
JOB_TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "14400"))  # 4 hours
JOB_TTL = int(os.environ.get("JOB_TTL", "86400"))  # 24 hours
RESULT_TTL = int(os.environ.get("RESULT_TTL", "86400"))  # 24 hours
FAILURE_TTL = int(os.environ.get("FAILURE_TTL", "86400"))  # 24 hours

# Lower-case file extensions that count as media.
MEDIA_EXT = {
    ".mp3",
    ".m4a",
    ".mp4",
    ".mkv",
    ".wav",
    ".flac",
    ".webm",
    ".ogg",
    ".opus",
}

# Resolved paths already handled by this process (enqueued or found to be
# transcribed). Lives in memory only, so it is empty again after a restart.
_seen: set[str] = set()
def already_transcribed(p: Path) -> bool:
    """Return True when a transcript for *p* appears to exist already.

    This is a heuristic: it only looks for ``<stem>.json`` directly inside
    the transcript directory ``TRN``, where ``<stem>`` is the file name of
    *p* without its final extension. The directory *p* lives in is not
    taken into account.
    """
    transcript_name = p.stem + ".json"
    return (TRN / transcript_name).exists()
def iter_media_files(root: Path):
    """Lazily yield the media files found anywhere below *root*.

    Walks *root* recursively and yields each entry that is a file and whose
    suffix, compared in lower case, is listed in ``MEDIA_EXT``.
    """
    for candidate in root.rglob("*"):
        # Check that it is a file first, then test the extension.
        if candidate.is_file() and candidate.suffix.lower() in MEDIA_EXT:
            yield candidate
def enqueue_new_files():
    """Run one scan pass and enqueue a job for every not-yet-handled file.

    For each media file under ``LIB`` the resolved path is looked up in the
    in-memory ``_seen`` set. Unseen files that already have a transcript are
    only recorded and logged; all other unseen files are enqueued for
    ``worker.handle_local_file`` (referenced by dotted name) and recorded.

    Returns:
        The number of jobs enqueued during this pass.
    """
    queue = Queue(connection=Redis.from_url(REDIS_URL))

    # Both directories must exist before the library is walked.
    for directory in (TRN, LIB):
        directory.mkdir(parents=True, exist_ok=True)

    # Generous limits shared by every job enqueued in this pass.
    job_options = {
        "job_timeout": JOB_TIMEOUT,
        "ttl": JOB_TTL,
        "result_ttl": RESULT_TTL,
        "failure_ttl": FAILURE_TTL,
    }

    enqueued = 0
    for media in iter_media_files(LIB):
        resolved = str(media.resolve())
        if resolved in _seen:
            continue

        if already_transcribed(media):
            _seen.add(resolved)
            print(f"[scanner] Skip (already transcribed): {media}", flush=True)
            continue

        # The worker receives the resolved path of the local file.
        queue.enqueue("worker.handle_local_file", resolved, **job_options)
        _seen.add(resolved)
        enqueued += 1
        print(f"[scanner] Enqueued: {media}", flush=True)

    return enqueued
# Set to True by the signal handler below to request a stop.
_shutdown = False


def _handle_sig(sig, frame):
    """Signal handler that flags the process for shutdown.

    It does nothing except set the module-level ``_shutdown`` flag; both
    arguments (signal number and current stack frame) are ignored.
    """
    global _shutdown
    _shutdown = True
def main():
    """Run the scanner loop until SIGINT or SIGTERM is received.

    Installs ``_handle_sig`` for both signals, then alternates between a
    scan pass (``enqueue_new_files``) and a pause of ``SCAN_INTERVAL``
    seconds. Any ``Exception`` raised during a pass is reported on stderr
    and the loop carries on with the next pass.
    """
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, _handle_sig)

    print(f"[scanner] Watching {LIB} → transcripts in {TRN}; interval={SCAN_INTERVAL}s", flush=True)

    while not _shutdown:
        try:
            count = enqueue_new_files()
            if count:
                print(f"[scanner] Enqueued {count} new file(s)", flush=True)
        except Exception as exc:
            print(f"[scanner] Error: {exc}", file=sys.stderr, flush=True)

        # Pause in one-second slices so a shutdown request is noticed quickly.
        remaining = SCAN_INTERVAL
        while remaining > 0 and not _shutdown:
            time.sleep(1)
            remaining -= 1

    print("[scanner] Shutting down", flush=True)


if __name__ == "__main__":
    main()