Real worker differentiation
This commit is contained in:
@@ -36,6 +36,17 @@ OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/")
|
|||||||
OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "")
|
OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "")
|
||||||
OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library")
|
OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library")
|
||||||
|
|
||||||
|
# Worker role selection
|
||||||
|
WORKER_MODE = os.getenv("WORKER_MODE", "all").strip().lower() # 'all' or 'transcribe'
|
||||||
|
JOB_QUEUES = [q.strip() for q in os.getenv("JOB_QUEUES", "default").split(",") if q.strip()]
|
||||||
|
|
||||||
|
def _mode_allows(task: str) -> bool:
|
||||||
|
"""Gate tasks by worker role. In 'transcribe' mode only allow transcription of local files
|
||||||
|
(including indexing and OWUI publish). "task" is one of: 'download','web','local','transcribe'."""
|
||||||
|
if WORKER_MODE == "transcribe":
|
||||||
|
return task in {"local", "transcribe"}
|
||||||
|
return True
|
||||||
|
|
||||||
TRN.mkdir(parents=True, exist_ok=True)
|
TRN.mkdir(parents=True, exist_ok=True)
|
||||||
LIB.mkdir(parents=True, exist_ok=True)
|
LIB.mkdir(parents=True, exist_ok=True)
|
||||||
TMP.mkdir(parents=True, exist_ok=True)
|
TMP.mkdir(parents=True, exist_ok=True)
|
||||||
@@ -1155,24 +1166,59 @@ def owui_headers():
|
|||||||
return {"Authorization": f"Bearer {OWUI_KEY}"} if OWUI_KEY else {}
|
return {"Authorization": f"Bearer {OWUI_KEY}"} if OWUI_KEY else {}
|
||||||
|
|
||||||
def owui_get_or_create_kb():
|
def owui_get_or_create_kb():
|
||||||
|
"""Return a KB id for OWUI_KB without creating duplicates.
|
||||||
|
Honors OPENWEBUI_KB_ID, and tolerates both list and {"data": ...} response shapes.
|
||||||
|
"""
|
||||||
if not OWUI_URL or not OWUI_KEY:
|
if not OWUI_URL or not OWUI_KEY:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# 1) If an explicit id is provided, trust it
|
||||||
|
forced = os.getenv("OPENWEBUI_KB_ID", "").strip()
|
||||||
|
if forced:
|
||||||
|
return forced
|
||||||
|
|
||||||
|
# 2) List and try to find an exact name match
|
||||||
try:
|
try:
|
||||||
r = requests.get(f"{OWUI_URL}/api/v1/knowledge/list", headers=owui_headers(), timeout=15)
|
r = requests.get(f"{OWUI_URL}/api/v1/knowledge/list", headers=owui_headers(), timeout=15)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
for kb in r.json().get("data", []):
|
body = r.json()
|
||||||
if kb.get("name") == OWUI_KB:
|
items = body if isinstance(body, list) else body.get("data", [])
|
||||||
return kb["id"]
|
# Prefer exact name match; if multiple, pick the most recently updated
|
||||||
|
matches = [kb for kb in items if (kb.get("name") or "") == OWUI_KB]
|
||||||
|
if matches:
|
||||||
|
try:
|
||||||
|
matches.sort(key=lambda k: k.get("updated_at") or 0, reverse=True)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return matches[0].get("id")
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
r = requests.post(
|
|
||||||
|
# 3) Create only if not found
|
||||||
|
cr = requests.post(
|
||||||
f"{OWUI_URL}/api/v1/knowledge/create",
|
f"{OWUI_URL}/api/v1/knowledge/create",
|
||||||
headers={**owui_headers(), "Content-Type": "application/json"},
|
headers={**owui_headers(), "Content-Type": "application/json"},
|
||||||
data=orjson.dumps({"name": OWUI_KB, "description": "All local content indexed by podx"}),
|
data=orjson.dumps({"name": OWUI_KB, "description": "All local content indexed by podx"}),
|
||||||
timeout=15,
|
timeout=15,
|
||||||
)
|
)
|
||||||
r.raise_for_status()
|
cr.raise_for_status()
|
||||||
return r.json()["data"]["id"]
|
created = cr.json()
|
||||||
|
if isinstance(created, dict) and created.get("id"):
|
||||||
|
return created["id"]
|
||||||
|
if isinstance(created, dict) and created.get("data") and created["data"].get("id"):
|
||||||
|
return created["data"]["id"]
|
||||||
|
# Fallback: try to resolve again by name
|
||||||
|
try:
|
||||||
|
rr = requests.get(f"{OWUI_URL}/api/v1/knowledge/list", headers=owui_headers(), timeout=15)
|
||||||
|
rr.raise_for_status()
|
||||||
|
body = rr.json()
|
||||||
|
items = body if isinstance(body, list) else body.get("data", [])
|
||||||
|
for kb in items:
|
||||||
|
if (kb.get("name") or "") == OWUI_KB:
|
||||||
|
return kb.get("id")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return None
|
||||||
|
|
||||||
def owui_upload_and_attach(path: Path, kb_id: str):
|
def owui_upload_and_attach(path: Path, kb_id: str):
|
||||||
with open(path, "rb") as f:
|
with open(path, "rb") as f:
|
||||||
@@ -1214,6 +1260,8 @@ def handle_local_file(path_str: str):
|
|||||||
if not p.exists():
|
if not p.exists():
|
||||||
log({"url": path_str, "status": "error", "error": "file_not_found"})
|
log({"url": path_str, "status": "error", "error": "file_not_found"})
|
||||||
return
|
return
|
||||||
|
if WORKER_MODE == "transcribe":
|
||||||
|
print(f"[mode] transcribe-only worker handling local file: {p}", flush=True)
|
||||||
|
|
||||||
title = p.stem
|
title = p.stem
|
||||||
base_json = TRN / f"{title}.json"
|
base_json = TRN / f"{title}.json"
|
||||||
@@ -1440,6 +1488,10 @@ def refresh_media(path_str: str):
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
def handle_web(url: str):
|
def handle_web(url: str):
|
||||||
|
if not _mode_allows("web"):
|
||||||
|
log({"url": url, "status": "skip", "reason": "mode_transcribe_only"})
|
||||||
|
print(f"[mode] transcribe-only: skipping web snapshot job: {url}", flush=True)
|
||||||
|
return
|
||||||
info = {"url": url, "status":"web-downloading", "title":"", "uploader":"", "date":"", "path":""}
|
info = {"url": url, "status":"web-downloading", "title":"", "uploader":"", "date":"", "path":""}
|
||||||
log(info)
|
log(info)
|
||||||
base, title, domain, date, text = save_web_snapshot(url)
|
base, title, domain, date, text = save_web_snapshot(url)
|
||||||
@@ -1452,6 +1504,14 @@ def handle_web(url: str):
|
|||||||
|
|
||||||
def handle_url(url: str):
|
def handle_url(url: str):
|
||||||
try:
|
try:
|
||||||
|
# In transcribe-only mode, refuse non-local/download jobs
|
||||||
|
if not _mode_allows("download"):
|
||||||
|
# Only permit local file paths in this mode
|
||||||
|
if url.startswith("/") or url.startswith("file://"):
|
||||||
|
return handle_local_file(url[7:] if url.startswith("file://") else url)
|
||||||
|
log({"url": url, "status": "skip", "reason": "mode_transcribe_only"})
|
||||||
|
print(f"[mode] transcribe-only: skipping non-local job: {url}", flush=True)
|
||||||
|
return
|
||||||
# If a local file path (or file:// URL) is provided, process it directly
|
# If a local file path (or file:// URL) is provided, process it directly
|
||||||
if url.startswith("file://"):
|
if url.startswith("file://"):
|
||||||
return handle_local_file(url[7:])
|
return handle_local_file(url[7:])
|
||||||
|
@@ -11,6 +11,10 @@ services:
|
|||||||
TMP_ROOT: /tmpdl
|
TMP_ROOT: /tmpdl
|
||||||
WHISPER_MODEL: large-v3
|
WHISPER_MODEL: large-v3
|
||||||
WHISPER_PRECISION: int8
|
WHISPER_PRECISION: int8
|
||||||
|
OPENWEBUI_URL: ${OPENWEBUI_URL}
|
||||||
|
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}
|
||||||
|
OPENWEBUI_KB_NAME: ${OPENWEBUI_KB_NAME:-Homelab Library}
|
||||||
|
OPENWEBUI_KB_ID: ${OPENWEBUI_KB_ID:-}
|
||||||
volumes:
|
volumes:
|
||||||
- ${LIBRARY_HOST_DIR:-./library}:/library
|
- ${LIBRARY_HOST_DIR:-./library}:/library
|
||||||
- ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts
|
- ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts
|
||||||
@@ -44,6 +48,11 @@ services:
|
|||||||
WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
|
WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
|
||||||
WHISPER_RESUME: ${WHISPER_RESUME:-1}
|
WHISPER_RESUME: ${WHISPER_RESUME:-1}
|
||||||
WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
|
WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
|
||||||
|
WORKER_MODE: all
|
||||||
|
OPENWEBUI_URL: ${OPENWEBUI_URL}
|
||||||
|
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}
|
||||||
|
OPENWEBUI_KB_NAME: ${OPENWEBUI_KB_NAME:-Homelab Library}
|
||||||
|
OPENWEBUI_KB_ID: ${OPENWEBUI_KB_ID:-}
|
||||||
PYTHONPATH: /app
|
PYTHONPATH: /app
|
||||||
JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
|
JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
|
||||||
JOB_TTL: ${JOB_TTL:-86400}
|
JOB_TTL: ${JOB_TTL:-86400}
|
||||||
@@ -78,6 +87,11 @@ services:
|
|||||||
WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
|
WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
|
||||||
WHISPER_RESUME: ${WHISPER_RESUME:-1}
|
WHISPER_RESUME: ${WHISPER_RESUME:-1}
|
||||||
WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
|
WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
|
||||||
|
WORKER_MODE: transcribe
|
||||||
|
OPENWEBUI_URL: ${OPENWEBUI_URL}
|
||||||
|
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}
|
||||||
|
OPENWEBUI_KB_NAME: ${OPENWEBUI_KB_NAME:-Homelab Library}
|
||||||
|
OPENWEBUI_KB_ID: ${OPENWEBUI_KB_ID:-}
|
||||||
PYTHONPATH: /app
|
PYTHONPATH: /app
|
||||||
JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
|
JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
|
||||||
JOB_TTL: ${JOB_TTL:-86400}
|
JOB_TTL: ${JOB_TTL:-86400}
|
||||||
|
Reference in New Issue
Block a user