diff --git a/app/scanner.py b/app/scanner.py
new file mode 100644
index 0000000..33d0e8f
--- /dev/null
+++ b/app/scanner.py
@@ -0,0 +1,112 @@
+"""Poll the media library and enqueue untranscribed files for the worker."""
+
+import functools
+import os
+import signal
+import sys
+import time
+from collections.abc import Iterator
+from pathlib import Path
+
+from redis import Redis
+from rq import Queue
+
+# Config via env (matches docker-compose)
+LIB = Path(os.getenv("LIBRARY_ROOT", "/library"))
+TRN = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
+REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
+# Seconds between scans; clamped to >= 1 so that a zero or negative value
+# cannot turn the main loop into a busy loop.
+SCAN_INTERVAL = max(1, int(os.getenv("SCAN_INTERVAL", "30")))
+
+# Media types to track
+MEDIA_EXT = {
+    ".mp3", ".m4a", ".mp4", ".mkv", ".wav", ".flac", ".webm", ".ogg", ".opus"
+}
+
+# In-memory seen set to avoid re-enqueueing during a single run
+_seen: set[str] = set()
+
+
+@functools.lru_cache(maxsize=1)
+def _get_queue() -> Queue:
+    """Return the process-wide RQ queue, creating it on first use.
+
+    Cached so each scan pass reuses one Redis client instead of building a
+    new one every SCAN_INTERVAL seconds.
+    """
+    return Queue(connection=Redis.from_url(REDIS_URL))
+
+
+def already_transcribed(p: Path) -> bool:
+    """Heuristic: if <stem>.json exists in transcripts, consider it done."""
+    return (TRN / f"{p.stem}.json").exists()
+
+
+def iter_media_files(root: Path) -> Iterator[Path]:
+    """Yield every regular file under *root* with a tracked media suffix."""
+    for path in root.rglob("*"):
+        # Test the suffix first: it is a plain string check, whereas
+        # is_file() costs a stat() call for every directory entry.
+        if path.suffix.lower() in MEDIA_EXT and path.is_file():
+            yield path
+
+
+def enqueue_new_files() -> int:
+    """Enqueue a worker job for each new media file; return the job count."""
+    q = _get_queue()
+
+    # Ensure target dirs exist
+    TRN.mkdir(parents=True, exist_ok=True)
+    LIB.mkdir(parents=True, exist_ok=True)
+
+    new_jobs = 0
+    for p in iter_media_files(LIB):
+        key = str(p.resolve())
+        if key in _seen:
+            continue
+        if not already_transcribed(p):
+            # Enqueue the worker to process this local file
+            q.enqueue("worker.handle_local_file", key)
+            new_jobs += 1
+        # Reached only if enqueue() did not raise, so a file whose enqueue
+        # failed is retried on the next pass.
+        _seen.add(key)
+    return new_jobs
+
+
+# Set by the signal handler and polled by main() for a clean exit.
+_shutdown = False
+
+
+def _handle_sig(sig, frame):
+    """Signal handler: request shutdown of the main loop."""
+    global _shutdown
+    _shutdown = True
+
+
+def main():
+    """Scan the library every SCAN_INTERVAL seconds until SIGINT/SIGTERM."""
+    signal.signal(signal.SIGINT, _handle_sig)
+    signal.signal(signal.SIGTERM, _handle_sig)
+
+    print(f"[scanner] Watching {LIB} → transcripts in {TRN}; interval={SCAN_INTERVAL}s", flush=True)
+    while not _shutdown:
+        try:
+            jobs = enqueue_new_files()
+            if jobs:
+                print(f"[scanner] Enqueued {jobs} new file(s)", flush=True)
+        except Exception as e:
+            # Keep the scanner running when a pass fails; report and retry.
+            print(f"[scanner] Error: {e}", file=sys.stderr, flush=True)
+        # Sleep in 1 s slices so a shutdown signal is honoured promptly
+        for _ in range(SCAN_INTERVAL):
+            if _shutdown:
+                break
+            time.sleep(1)
+
+    print("[scanner] Shutting down", flush=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/app/worker.py b/app/worker.py
index 56d2237..88d9cd9 100644
--- a/app/worker.py
+++ b/app/worker.py
@@ -231,6 +231,35 @@ def publish_to_openwebui(paths):
     except Exception as e:
         log({"status": "owui_error", "error": str(e)})
 
+from pathlib import Path
+
+def handle_local_file(path_str: str):
+    """Transcribe & index a local media file that already exists in /library.
+
+    Safe to call repeatedly; it skips if transcript JSON already exists.
+    Any exception is logged with status "error" and then re-raised.
+    """
+    try:
+        p = Path(path_str)
+        # is_file() rather than exists(): a directory cannot be transcribed.
+        if not p.is_file():
+            log({"url": path_str, "status": "error", "error": "file_not_found"})
+            return
+        title = p.stem
+        base_json = TRN / f"{title}.json"
+        if base_json.exists():
+            log({"url": path_str, "status": "skip", "reason": "already_transcribed"})
+            return
+        info = {"url": path_str, "status": "transcribing", "title": title, "uploader": p.parent.name, "date": "", "path": str(p)}
+        log(info)
+        base = transcribe(p)
+        index_meili(base.with_suffix(".json"))
+        publish_to_openwebui([base.with_suffix(".txt")])
+        log({**info, "status": "done"})
+    except Exception as e:
+        log({"url": path_str, "status": "error", "error": str(e)})
+        raise
+
 def handle_web(url: str):
     info = {"url": url, "status":"web-downloading", "title":"", "uploader":"", "date":"", "path":""}
     log(info)
@@ -244,6 +273,12 @@ def handle_web(url: str):
 
 def handle_url(url: str):
     try:
+        # If a local file path (or file:// URL) is provided, process it directly
+        if url.startswith("file://"):
+            return handle_local_file(url[7:])
+        if url.startswith("/") and Path(url).exists():
+            return handle_local_file(url)
+
         if not is_media_url(url):
             handle_web(url)
             return
diff --git a/docker-compose.yml b/docker-compose.yml
index 96621cb..9ac0a91 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -63,3 +63,41 @@ services:
     volumes:
       - ${REDIS_DATA_HOST_DIR:-./data/redis}:/data
     restart: unless-stopped
+
+  metube:
+    image: alexta69/metube:latest
+    container_name: metube
+    ports:
+      - "8081:8081"
+    environment:
+      - PUID=1000
+      - PGID=1000
+      - TZ=Europe/Prague
+      - DOWNLOAD_DIR=/downloads
+      - OUTPUT_TEMPLATE=%(uploader)s/%(upload_date)s - %(title)s.%(ext)s
+      # Optional: pass a cookies file to bypass consent/age walls
+      # - COOKIE_FILE=/config/cookies.txt
+      # Optional: yt-dlp options (JSON). Example enables Android client fallback
+      # - YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}}}
+    volumes:
+      - ${LIBRARY_HOST_DIR:-./library}:/downloads
+      # Optional cookies file on host → /config/cookies.txt inside container
+      # - /mnt/secure/cookies.txt:/config/cookies.txt:ro
+    restart: unless-stopped
+
+  podx-scanner:
+    build: ./app
+    container_name: podx-scanner
+    command: ["python", "scanner.py"]
+    env_file: [.env]
+    environment:
+      MEILI_URL: http://meili:7700
+      REDIS_URL: redis://redis:6379/0
+      LIBRARY_ROOT: /library
+      TRANSCRIPT_ROOT: /transcripts
+      SCAN_INTERVAL: 30
+    volumes:
+      - ${LIBRARY_HOST_DIR:-./library}:/library
+      - ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts
+    depends_on: [redis]
+    restart: unless-stopped