Real worker differentiation

This commit is contained in:
2025-09-08 15:32:13 +02:00
parent 3a4dc5bfcc
commit ccd676b390
2 changed files with 80 additions and 6 deletions

View File

@@ -36,6 +36,17 @@ OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/")
OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "")
OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library")
# Worker role selection
WORKER_MODE = os.getenv("WORKER_MODE", "all").strip().lower() # 'all' or 'transcribe'
JOB_QUEUES = [q.strip() for q in os.getenv("JOB_QUEUES", "default").split(",") if q.strip()]
def _mode_allows(task: str) -> bool:
"""Gate tasks by worker role. In 'transcribe' mode only allow transcription of local files
(including indexing and OWUI publish). "task" is one of: 'download','web','local','transcribe'."""
if WORKER_MODE == "transcribe":
return task in {"local", "transcribe"}
return True
TRN.mkdir(parents=True, exist_ok=True)
LIB.mkdir(parents=True, exist_ok=True)
TMP.mkdir(parents=True, exist_ok=True)
@@ -1155,24 +1166,59 @@ def owui_headers():
return {"Authorization": f"Bearer {OWUI_KEY}"} if OWUI_KEY else {}
def owui_get_or_create_kb():
"""Return a KB id for OWUI_KB without creating duplicates.
Honors OPENWEBUI_KB_ID, and tolerates both list and {"data": ...} response shapes.
"""
if not OWUI_URL or not OWUI_KEY:
return None
# 1) If an explicit id is provided, trust it
forced = os.getenv("OPENWEBUI_KB_ID", "").strip()
if forced:
return forced
# 2) List and try to find an exact name match
try:
r = requests.get(f"{OWUI_URL}/api/v1/knowledge/list", headers=owui_headers(), timeout=15)
r.raise_for_status()
for kb in r.json().get("data", []):
if kb.get("name") == OWUI_KB:
return kb["id"]
body = r.json()
items = body if isinstance(body, list) else body.get("data", [])
# Prefer exact name match; if multiple, pick the most recently updated
matches = [kb for kb in items if (kb.get("name") or "") == OWUI_KB]
if matches:
try:
matches.sort(key=lambda k: k.get("updated_at") or 0, reverse=True)
except Exception:
pass
return matches[0].get("id")
except Exception:
pass
r = requests.post(
# 3) Create only if not found
cr = requests.post(
f"{OWUI_URL}/api/v1/knowledge/create",
headers={**owui_headers(), "Content-Type": "application/json"},
data=orjson.dumps({"name": OWUI_KB, "description": "All local content indexed by podx"}),
timeout=15,
)
r.raise_for_status()
return r.json()["data"]["id"]
cr.raise_for_status()
created = cr.json()
if isinstance(created, dict) and created.get("id"):
return created["id"]
if isinstance(created, dict) and created.get("data") and created["data"].get("id"):
return created["data"]["id"]
# Fallback: try to resolve again by name
try:
rr = requests.get(f"{OWUI_URL}/api/v1/knowledge/list", headers=owui_headers(), timeout=15)
rr.raise_for_status()
body = rr.json()
items = body if isinstance(body, list) else body.get("data", [])
for kb in items:
if (kb.get("name") or "") == OWUI_KB:
return kb.get("id")
except Exception:
pass
return None
def owui_upload_and_attach(path: Path, kb_id: str):
with open(path, "rb") as f:
@@ -1214,6 +1260,8 @@ def handle_local_file(path_str: str):
if not p.exists():
log({"url": path_str, "status": "error", "error": "file_not_found"})
return
if WORKER_MODE == "transcribe":
print(f"[mode] transcribe-only worker handling local file: {p}", flush=True)
title = p.stem
base_json = TRN / f"{title}.json"
@@ -1440,6 +1488,10 @@ def refresh_media(path_str: str):
raise
def handle_web(url: str):
if not _mode_allows("web"):
log({"url": url, "status": "skip", "reason": "mode_transcribe_only"})
print(f"[mode] transcribe-only: skipping web snapshot job: {url}", flush=True)
return
info = {"url": url, "status":"web-downloading", "title":"", "uploader":"", "date":"", "path":""}
log(info)
base, title, domain, date, text = save_web_snapshot(url)
@@ -1452,6 +1504,14 @@ def handle_web(url: str):
def handle_url(url: str):
try:
# In transcribe-only mode, refuse non-local/download jobs
if not _mode_allows("download"):
# Only permit local file paths in this mode
if url.startswith("/") or url.startswith("file://"):
return handle_local_file(url[7:] if url.startswith("file://") else url)
log({"url": url, "status": "skip", "reason": "mode_transcribe_only"})
print(f"[mode] transcribe-only: skipping non-local job: {url}", flush=True)
return
# If a local file path (or file:// URL) is provided, process it directly
if url.startswith("file://"):
return handle_local_file(url[7:])

View File

@@ -11,6 +11,10 @@ services:
TMP_ROOT: /tmpdl
WHISPER_MODEL: large-v3
WHISPER_PRECISION: int8
OPENWEBUI_URL: ${OPENWEBUI_URL}
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}
OPENWEBUI_KB_NAME: ${OPENWEBUI_KB_NAME:-Homelab Library}
OPENWEBUI_KB_ID: ${OPENWEBUI_KB_ID:-}
volumes:
- ${LIBRARY_HOST_DIR:-./library}:/library
- ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts
@@ -44,6 +48,11 @@ services:
WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
WHISPER_RESUME: ${WHISPER_RESUME:-1}
WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
WORKER_MODE: all
OPENWEBUI_URL: ${OPENWEBUI_URL}
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}
OPENWEBUI_KB_NAME: ${OPENWEBUI_KB_NAME:-Homelab Library}
OPENWEBUI_KB_ID: ${OPENWEBUI_KB_ID:-}
PYTHONPATH: /app
JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
JOB_TTL: ${JOB_TTL:-86400}
@@ -78,6 +87,11 @@ services:
WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
WHISPER_RESUME: ${WHISPER_RESUME:-1}
WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
WORKER_MODE: transcribe
OPENWEBUI_URL: ${OPENWEBUI_URL}
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}
OPENWEBUI_KB_NAME: ${OPENWEBUI_KB_NAME:-Homelab Library}
OPENWEBUI_KB_ID: ${OPENWEBUI_KB_ID:-}
PYTHONPATH: /app
JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
JOB_TTL: ${JOB_TTL:-86400}