From ccd676b39051f3b9c4a18ebfcd8a8b8bb2cee67f Mon Sep 17 00:00:00 2001 From: Tomas Kracmar Date: Mon, 8 Sep 2025 15:32:13 +0200 Subject: [PATCH] Real worker differentiation --- app/worker.py | 72 ++++++++++++++++++++++++++++++++++++++++++---- docker-compose.yml | 14 +++++++++ 2 files changed, 80 insertions(+), 6 deletions(-) diff --git a/app/worker.py b/app/worker.py index b7aee73..981a80c 100644 --- a/app/worker.py +++ b/app/worker.py @@ -36,6 +36,17 @@ OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/") OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "") OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library") +# Worker role selection +WORKER_MODE = os.getenv("WORKER_MODE", "all").strip().lower() # 'all' or 'transcribe' +JOB_QUEUES = [q.strip() for q in os.getenv("JOB_QUEUES", "default").split(",") if q.strip()] + +def _mode_allows(task: str) -> bool: + """Gate tasks by worker role. In 'transcribe' mode only allow transcription of local files + (including indexing and OWUI publish). "task" is one of: 'download','web','local','transcribe'.""" + if WORKER_MODE == "transcribe": + return task in {"local", "transcribe"} + return True + TRN.mkdir(parents=True, exist_ok=True) LIB.mkdir(parents=True, exist_ok=True) TMP.mkdir(parents=True, exist_ok=True) @@ -1155,24 +1166,59 @@ def owui_headers(): return {"Authorization": f"Bearer {OWUI_KEY}"} if OWUI_KEY else {} def owui_get_or_create_kb(): + """Return a KB id for OWUI_KB without creating duplicates. + Honors OPENWEBUI_KB_ID, and tolerates both list and {"data": ...} response shapes. + """ if not OWUI_URL or not OWUI_KEY: return None + + # 1) If an explicit id is provided, trust it + forced = os.getenv("OPENWEBUI_KB_ID", "").strip() + if forced: + return forced + + # 2) List and try to find an exact name match try: r = requests.get(f"{OWUI_URL}/api/v1/knowledge/list", headers=owui_headers(), timeout=15) r.raise_for_status() - for kb in r.json().get("data", []): - if kb.get("name") == OWUI_KB: - return kb["id"] + body = r.json() + items = body if isinstance(body, list) else body.get("data", []) + # Prefer exact name match; if multiple, pick the most recently updated + matches = [kb for kb in items if (kb.get("name") or "") == OWUI_KB] + if matches: + try: + matches.sort(key=lambda k: k.get("updated_at") or 0, reverse=True) + except Exception: + pass + return matches[0].get("id") except Exception: pass - r = requests.post( + + # 3) Create only if not found + cr = requests.post( f"{OWUI_URL}/api/v1/knowledge/create", headers={**owui_headers(), "Content-Type": "application/json"}, data=orjson.dumps({"name": OWUI_KB, "description": "All local content indexed by podx"}), timeout=15, ) - r.raise_for_status() - return r.json()["data"]["id"] + cr.raise_for_status() + created = cr.json() + if isinstance(created, dict) and created.get("id"): + return created["id"] + if isinstance(created, dict) and created.get("data") and created["data"].get("id"): + return created["data"]["id"] + # Fallback: try to resolve again by name + try: + rr = requests.get(f"{OWUI_URL}/api/v1/knowledge/list", headers=owui_headers(), timeout=15) + rr.raise_for_status() + body = rr.json() + items = body if isinstance(body, list) else body.get("data", []) + for kb in items: + if (kb.get("name") or "") == OWUI_KB: + return kb.get("id") + except Exception: + pass + return None def owui_upload_and_attach(path: Path, kb_id: str): with open(path, "rb") as f: @@ -1214,6 +1260,8 @@ def handle_local_file(path_str: str): if not p.exists(): log({"url": path_str, "status": "error", "error": "file_not_found"}) return + if WORKER_MODE == "transcribe": + print(f"[mode] transcribe-only worker handling local file: {p}", flush=True) title = p.stem base_json = TRN / f"{title}.json" @@ -1440,6 +1488,10 @@ def refresh_media(path_str: str): raise def handle_web(url: str): + if not _mode_allows("web"): + log({"url": url, "status": "skip", "reason": "mode_transcribe_only"}) + print(f"[mode] transcribe-only: skipping web snapshot job: {url}", flush=True) + return info = {"url": url, "status":"web-downloading", "title":"", "uploader":"", "date":"", "path":""} log(info) base, title, domain, date, text = save_web_snapshot(url) @@ -1452,6 +1504,14 @@ def handle_web(url: str): def handle_url(url: str): try: + # In transcribe-only mode, refuse non-local/download jobs + if not _mode_allows("download"): + # Only permit local file paths in this mode + if url.startswith("/") or url.startswith("file://"): + return handle_local_file(url[7:] if url.startswith("file://") else url) + log({"url": url, "status": "skip", "reason": "mode_transcribe_only"}) + print(f"[mode] transcribe-only: skipping non-local job: {url}", flush=True) + return # If a local file path (or file:// URL) is provided, process it directly if url.startswith("file://"): return handle_local_file(url[7:]) diff --git a/docker-compose.yml b/docker-compose.yml index 3a86e95..33103c4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,6 +11,10 @@ services: TMP_ROOT: /tmpdl WHISPER_MODEL: large-v3 WHISPER_PRECISION: int8 + OPENWEBUI_URL: ${OPENWEBUI_URL} + OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY} + OPENWEBUI_KB_NAME: ${OPENWEBUI_KB_NAME:-Homelab Library} + OPENWEBUI_KB_ID: ${OPENWEBUI_KB_ID:-} volumes: - ${LIBRARY_HOST_DIR:-./library}:/library - ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts @@ -44,6 +48,11 @@ services: WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1} WHISPER_RESUME: ${WHISPER_RESUME:-1} WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20} + WORKER_MODE: all + OPENWEBUI_URL: ${OPENWEBUI_URL} + OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY} + OPENWEBUI_KB_NAME: ${OPENWEBUI_KB_NAME:-Homelab Library} + OPENWEBUI_KB_ID: ${OPENWEBUI_KB_ID:-} PYTHONPATH: /app JOB_TIMEOUT: ${JOB_TIMEOUT:-14400} JOB_TTL: ${JOB_TTL:-86400} @@ -78,6 +87,11 @@ services: WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1} WHISPER_RESUME: ${WHISPER_RESUME:-1} WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20} + WORKER_MODE: transcribe + OPENWEBUI_URL: ${OPENWEBUI_URL} + OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY} + OPENWEBUI_KB_NAME: ${OPENWEBUI_KB_NAME:-Homelab Library} + OPENWEBUI_KB_ID: ${OPENWEBUI_KB_ID:-} PYTHONPATH: /app JOB_TIMEOUT: ${JOB_TIMEOUT:-14400} JOB_TTL: ${JOB_TTL:-86400}