Actual worker differentiation

2025-09-08 16:02:42 +02:00
parent ccd676b390
commit 688d0406c0
2 changed files with 101 additions and 53 deletions

View File

@@ -1,4 +1,6 @@
import os, subprocess, shutil, json, re, orjson, requests
import os, subprocess, shutil, json, re, orjson, requests, unicodedata
from rq import Queue
from redis import Redis
from pathlib import Path
import math
import difflib
@@ -36,6 +38,10 @@ OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/")
OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "")
OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library")
# Redis-backed job queue settings and offload toggle
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0").strip()
OFFLOAD_TRANSCRIBE = os.getenv("OFFLOAD_TRANSCRIBE", "1").lower() not in ("0", "false", "no")
# Worker role selection
WORKER_MODE = os.getenv("WORKER_MODE", "all").strip().lower() # 'all' or 'transcribe'
JOB_QUEUES = [q.strip() for q in os.getenv("JOB_QUEUES", "default").split(",") if q.strip()]
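# Example pairing (assumed, matching the compose comments below): the main
# worker runs WORKER_MODE=all with JOB_QUEUES=default, while a Whisper-only
# worker runs WORKER_MODE=transcribe with JOB_QUEUES=transcribe.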
@@ -1122,6 +1128,15 @@ def slugify(text):
text = re.sub(r'[^A-Za-z0-9\-._ ]+', '', text).strip().replace(' ', '_')
return text[:120] or 'page'
def _norm(s: str | None) -> str:
"""Normalize strings for stable comparisons across Unicode lookalikes and stray whitespace."""
if s is None:
return ""
try:
return unicodedata.normalize("NFKC", s).strip()
except Exception:
return (s or "").strip()
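# Illustrative calls (hypothetical inputs) showing what NFKC plus strip buys:
#   _norm("Homelab\u00a0Library")  -> "Homelab Library"   (no-break space folded)
#   _norm("\uff28omelab Library")  -> "Homelab Library"   (full-width letter folded)
#   _norm("  Homelab Library\n")   -> "Homelab Library"   (stray whitespace stripped)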
def save_web_snapshot(url: str):
r = _requests.get(url, timeout=30, headers={"User-Agent":"Mozilla/5.0"})
r.raise_for_status()
@@ -1183,8 +1198,9 @@ def owui_get_or_create_kb():
r.raise_for_status()
body = r.json()
items = body if isinstance(body, list) else body.get("data", [])
# Prefer exact name match; if multiple, pick the most recently updated
matches = [kb for kb in items if (kb.get("name") or "") == OWUI_KB]
# Prefer exact normalized name match; if multiple, pick the most recently updated
kb_target = _norm(OWUI_KB)
matches = [kb for kb in items if _norm(kb.get("name")) == kb_target]
if matches:
try:
matches.sort(key=lambda k: k.get("updated_at") or 0, reverse=True)
@@ -1213,8 +1229,9 @@ def owui_get_or_create_kb():
rr.raise_for_status()
body = rr.json()
items = body if isinstance(body, list) else body.get("data", [])
kb_target = _norm(OWUI_KB)
for kb in items:
if (kb.get("name") or "") == OWUI_KB:
if _norm(kb.get("name")) == kb_target:
return kb.get("id")
except Exception:
pass
@@ -1224,14 +1241,21 @@ def owui_upload_and_attach(path: Path, kb_id: str):
with open(path, "rb") as f:
r = requests.post(f"{OWUI_URL}/api/v1/files/", headers=owui_headers(), files={"file": (path.name, f)}, timeout=60*10)
r.raise_for_status()
file_id = r.json()["data"]["id"]
up = r.json()
file_id = (up.get("id") or (up.get("data") or {}).get("id"))
if not file_id:
raise RuntimeError(f"OWUI upload: could not get file id from response: {up}")
r = requests.post(
f"{OWUI_URL}/api/v1/knowledge/{kb_id}/file/add",
headers={**owui_headers(), "Content-Type": "application/json"},
data=orjson.dumps({"file_id": file_id}),
timeout=60,
timeout=180,
)
r.raise_for_status()
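# Brief settle pause (assumed intent: give OWUI a moment to register the
# attach before the next upload; the sleep is wrapped so it can never fail the job).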
try:
time.sleep(0.5)
except Exception:
pass
return True
def publish_to_openwebui(paths):
@@ -1239,6 +1263,9 @@ def publish_to_openwebui(paths):
return
try:
kb_id = owui_get_or_create_kb()
if not kb_id:
print("[owui] KB resolve failed; skipping attach to avoid accidental duplicates", flush=True)
return
for p in paths:
p = Path(p)
if not p.exists():
@@ -1250,6 +1277,61 @@ def publish_to_openwebui(paths):
except Exception as e:
log({"status": "owui_error", "error": str(e)})
# --------- Post-transcribe pipeline and job/queue helpers ---------
def _postprocess_after_transcribe(media_path: Path, base: Path):
"""Common steps after we have a `base` transcript path: index, publish, NFO, artwork."""
try:
index_meili(base.with_suffix(".json"))
except Exception as e:
print(f"[post] meili index failed: {e}", flush=True)
try:
publish_to_openwebui([base.with_suffix(".txt")])
except Exception as e:
print(f"[post] owui publish failed: {e}", flush=True)
# Build metadata using existing helper
try:
title = media_path.stem
fallback = {
"title": title,
"episode_title": title,
"show": media_path.parent.name,
"description": "",
"pubdate": _extract_date_from_stem(title),
"duration_sec": media_duration_seconds(media_path),
"image": "",
"guid": "",
}
meta = build_meta_from_sources(media_path, media_path.parent.name, fallback, ep=None)
# Read the transcript keyed off `base` rather than re-deriving it from the media stem
ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
write_episode_nfo(media_path, meta, ttxt)
try:
save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
except Exception:
pass
except Exception as e:
print(f"[post] NFO write failed: {e}", flush=True)
def transcribe_job(path_str: str):
"""RQ job: heavy transcription only. Safe to import by dotted path 'worker.transcribe_job'."""
p = Path(path_str)
base = transcribe(p)
_postprocess_after_transcribe(p, base)
return str(base)
def enqueue_transcribe(path: Path) -> bool:
"""Enqueue a transcription job to the 'transcribe' queue. Returns True on success."""
try:
conn = Redis.from_url(REDIS_URL)
q = Queue("transcribe", connection=conn, default_timeout=60*60*24)
# Use dotted path so workers in other processes can import
q.enqueue("worker.transcribe_job", str(path), job_timeout=60*60*24)
print(f"[queue] enqueued transcribe job for {path}", flush=True)
return True
except Exception as e:
print(f"[queue] enqueue failed, will transcribe inline: {e}", flush=True)
return False
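# Consumer side (sketch): a transcribe-only worker can drain this queue with a
# plain RQ worker, assuming this module is importable as 'worker' on its path:
#   rq worker --url "$REDIS_URL" transcribe
# or programmatically:
#   from rq import Worker
#   Worker(["transcribe"], connection=Redis.from_url(REDIS_URL)).work()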
def handle_local_file(path_str: str):
"""Transcribe & index a local media file that already exists in /library.
If a sidecar .txt/.srt/.vtt exists, use it instead of running Whisper.
@@ -1352,30 +1434,12 @@ def handle_local_file(path_str: str):
log({**info, **{"status": "done", "note": "reused_repo_transcript"}})
return
# 2) Otherwise, run transcription
# 2) Otherwise, run transcription (offload to the queue if enabled and this worker is not the transcribe-only role)
if OFFLOAD_TRANSCRIBE and WORKER_MODE != "transcribe" and enqueue_transcribe(p):
log({**info, **{"status": "queued_transcribe"}})
return
base = transcribe(p)
index_meili(base.with_suffix(".json"))
publish_to_openwebui([base.with_suffix(".txt")])
try:
fallback = {
"title": title,
"episode_title": title,
"show": p.parent.name,
"description": "",
"pubdate": _extract_date_from_stem(title),
"duration_sec": media_duration_seconds(p),
"image": "",
"guid": "",
}
meta = build_meta_from_sources(p, p.parent.name, fallback, ep=None)
ttxt = (TRN / title).with_suffix(".txt").read_text(encoding="utf-8")
write_episode_nfo(p, meta, ttxt)
try:
save_episode_artwork(meta.get("image"), p, meta.get("show"))
except Exception:
pass
except Exception as e:
print(f"[post] NFO write failed: {e}", flush=True)
_postprocess_after_transcribe(p, base)
log({**info, **{"status": "done"}})
except Exception as e:
log({"url": path_str, "status": "error", "error": str(e)})
@@ -1590,31 +1654,11 @@ def handle_url(url: str):
if repo_json:
base = reuse_repo_transcript(dest, repo_json)
if not base:
if OFFLOAD_TRANSCRIBE and WORKER_MODE != "transcribe" and enqueue_transcribe(dest):
log({**info, **{"status": "queued_transcribe"}})
continue
base = transcribe(dest)
index_meili(base.with_suffix(".json"))
publish_to_openwebui([base.with_suffix(".txt")])
try:
# Build metadata from RSS (if matched), yt-dlp info.json, and sensible fallbacks
fallback = {
"title": dest.stem,
"episode_title": dest.stem,
"show": uploader,
"description": "",
"pubdate": _extract_date_from_stem(dest.stem),
"duration_sec": media_duration_seconds(dest),
"image": "",
"guid": "",
}
meta = build_meta_from_sources(dest, uploader, fallback, ep if 'ep' in locals() else None)
ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
write_episode_nfo(dest, meta, ttxt)
# Save local artwork for Plex/Kodi from meta image url
try:
save_episode_artwork(meta.get("image"), dest, meta.get("show"))
except Exception:
pass
except Exception as e:
print(f"[post] NFO write failed: {e}", flush=True)
_postprocess_after_transcribe(dest, base)
log({**info, **{"status":"done"}})
except Exception as e:
log({"url": url, "status":"error", "error": str(e)})

View File

@@ -32,6 +32,7 @@ services:
timeout: 5s
retries: 3
# Main worker: handles downloads, indexing, RSS, OWUI, etc. (no heavy Whisper)
podx-worker:
build: ./app
container_name: podx-worker
@@ -71,6 +72,7 @@ services:
extra_hosts:
- host.docker.internal:host-gateway
# Transcribe-only worker: listens to the "transcribe" queue and runs Whisper jobs
podx-worker-transcribe:
build: ./app
container_name: podx-worker-transcribe
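# (sketch: assumed environment for this service, mirroring the WORKER_MODE /
# JOB_QUEUES handling in the worker code above)
# environment:
#   WORKER_MODE: transcribe
#   JOB_QUEUES: transcribe
#   REDIS_URL: redis://redis:6379/0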
@@ -150,6 +152,7 @@ services:
# - /mnt/secure/cookies.txt:/config/cookies.txt:ro
restart: unless-stopped
# Scanner: watches /library and enqueues jobs (heavy jobs go to "transcribe" queue)
podx-scanner:
build: ./app
container_name: podx-scanner
@@ -160,6 +163,7 @@ services:
REDIS_URL: redis://redis:6379/0
LIBRARY_ROOT: /library
TRANSCRIPT_ROOT: /transcripts
TRANSCRIBE_QUEUE: transcribe
SCAN_INTERVAL: 30
JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
JOB_TTL: ${JOB_TTL:-86400}
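# JOB_TIMEOUT / JOB_TTL are assumed to be seconds (RQ convention):
# 14400 s = 4 h per job, 86400 s = 24 h of result retention.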