From 6240e866501755bddecc9154cc3af34545e84f8c Mon Sep 17 00:00:00 2001 From: Tomas Kracmar Date: Wed, 24 Sep 2025 11:38:53 +0200 Subject: [PATCH] Fixing metadata import errors --- .env.example | 3 + README.md | 2 + app/worker.py | 77 +++++++++++++- docker-compose.yml | 6 ++ scripts/backfill_openwebui.py | 67 +++++++++++- scripts/podx-tools.sh | 192 +++++++++++++++++++++++++--------- 6 files changed, 294 insertions(+), 53 deletions(-) diff --git a/.env.example b/.env.example index fe8ff41..8d5e3bc 100644 --- a/.env.example +++ b/.env.example @@ -9,6 +9,9 @@ OPENWEBUI_URL=http://openwebui:3000 OPENWEBUI_API_KEY=put_your_openwebui_api_key_here OPENWEBUI_KB_NAME=Homelab Library OPENWEBUI_KB_ID=your_kb_uuid_here +OPENWEBUI_AUTO_FIX_METADATA=1 +# Optional: JSON string to enforce as metadata template when auto-fix runs +# OPENWEBUI_METADATA_TEMPLATE_JSON={} # Transcription backend (local Whisper by default) TRANSCRIBE_BACKEND=local diff --git a/README.md b/README.md index f4f557b..ebc3401 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,8 @@ Note: `.env.example` includes placeholders for both **Meili** and **OpenWebUI** - `OPENWEBUI_API_KEY`: API key for authenticating PodX workers with OpenWebUI. - `OPENWEBUI_KB_NAME`: Human-readable Knowledge Base name to attach documents to. - `OPENWEBUI_KB_ID`: Fixed UUID of the Knowledge Base (avoids duplicate KBs on restart). +- `OPENWEBUI_AUTO_FIX_METADATA` (default `1`): When enabled, PodX clears/overrides the Knowledge Base metadata template before uploads to prevent ingestion crashes from invalid templates. +- `OPENWEBUI_METADATA_TEMPLATE_JSON`: Optional JSON applied when the auto-fix runs (defaults to `{}`, i.e., no custom metadata template). ## RSS Ingestion diff --git a/app/worker.py b/app/worker.py index 3830249..c16a50b 100644 --- a/app/worker.py +++ b/app/worker.py @@ -97,6 +97,10 @@ DEFAULT_TRANSCRIPT_LANG = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/") OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "") OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library") +OWUI_AUTO_FIX_METADATA = os.getenv("OPENWEBUI_AUTO_FIX_METADATA", "1").strip().lower() not in ("0", "false", "no") +OWUI_METADATA_TEMPLATE_JSON = os.getenv("OPENWEBUI_METADATA_TEMPLATE_JSON", "").strip() + +_OWUI_TEMPLATE_PATCHED: set[str] = set() # Redis-backed job queue settings and offload toggle REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0").strip() @@ -1355,6 +1359,56 @@ def is_media_url(url: str): def owui_headers(): return {"Authorization": f"Bearer {OWUI_KEY}"} if OWUI_KEY else {} + +def _owui_metadata_template_payload(): + """Return the metadata template payload to apply when auto-fix is enabled.""" + if not OWUI_METADATA_TEMPLATE_JSON: + return {} + try: + return json.loads(OWUI_METADATA_TEMPLATE_JSON) + except Exception: + # Treat value as a raw string template if parsing fails + return OWUI_METADATA_TEMPLATE_JSON + + +def owui_fix_metadata_template(kb_id: str, force: bool = False) -> bool: + """Ensure the target knowledge base has a safe metadata template. + + Attempts PATCH/PUT with either a user-provided template or an empty object. + Returns True if an update succeeded; False otherwise. + """ + if not OWUI_AUTO_FIX_METADATA or not OWUI_URL or not OWUI_KEY or not kb_id: + return False + if not force and kb_id in _OWUI_TEMPLATE_PATCHED: + return False + + payload_variants: list[object] = [] + template_payload = _owui_metadata_template_payload() + payload_variants.append({"metadata_template": template_payload}) + if template_payload not in ({}, "", None): + payload_variants.append({"metadata_template": {}}) + payload_variants.append({"metadata_template": None}) + + headers = {**owui_headers(), "Content-Type": "application/json"} + url = f"{OWUI_URL}/api/v1/knowledge/{kb_id}" + success_codes = {200, 201, 202, 204} + + for payload in payload_variants: + try: + body = orjson.dumps(payload) + except Exception: + body = json.dumps(payload).encode("utf-8") + for method in ("PATCH", "PUT"): + try: + resp = requests.request(method, url, headers=headers, data=body, timeout=15) + except Exception: + continue + if resp.status_code in success_codes: + print(f"[owui] metadata template adjusted via {method} for KB {kb_id}", flush=True) + _OWUI_TEMPLATE_PATCHED.add(kb_id) + return True + return False + def owui_get_or_create_kb(): """Return a KB id for OWUI_KB without creating duplicates. Honors OPENWEBUI_KB_ID, and tolerates both list and {"data": ...} response shapes. @@ -1413,6 +1467,8 @@ def owui_get_or_create_kb(): return None def owui_upload_and_attach(path: Path, kb_id: str): + if OWUI_AUTO_FIX_METADATA: + owui_fix_metadata_template(kb_id) with open(path, "rb") as f: r = requests.post(f"{OWUI_URL}/api/v1/files/", headers=owui_headers(), files={"file": (path.name, f)}, timeout=60*10) r.raise_for_status() @@ -1420,12 +1476,28 @@ def owui_upload_and_attach(path: Path, kb_id: str): file_id = (up.get("id") or (up.get("data") or {}).get("id")) if not file_id: raise RuntimeError(f"OWUI upload: could not get file id from response: {up}") + payload = {"file_id": file_id} + attach_headers = {**owui_headers(), "Content-Type": "application/json"} + body = orjson.dumps(payload) r = requests.post( f"{OWUI_URL}/api/v1/knowledge/{kb_id}/file/add", - headers={**owui_headers(), "Content-Type": "application/json"}, - data=orjson.dumps({"file_id": file_id}), + headers=attach_headers, + data=body, timeout=180, ) + if r.status_code == 400 and OWUI_AUTO_FIX_METADATA: + txt = "" + try: + txt = r.text.lower() + except Exception: + txt = str(r.content).lower() + if "metadata" in txt and owui_fix_metadata_template(kb_id, force=True): + r = requests.post( + f"{OWUI_URL}/api/v1/knowledge/{kb_id}/file/add", + headers=attach_headers, + data=body, + timeout=180, + ) r.raise_for_status() try: time.sleep(0.5) @@ -1441,6 +1513,7 @@ def publish_to_openwebui(paths): if not kb_id: print("[owui] KB resolve failed; skipping attach to avoid accidental duplicates", flush=True) return + owui_fix_metadata_template(kb_id) for p in paths: p = Path(p) if not p.exists(): diff --git a/docker-compose.yml b/docker-compose.yml index 98ad1a8..71d894e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -16,6 +16,8 @@ services: OPENAI_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1} OPENAI_TRANSCRIBE_MODEL: ${OPENAI_TRANSCRIBE_MODEL:-whisper-1} OPENAI_TRANSCRIBE_TIMEOUT: ${OPENAI_TRANSCRIBE_TIMEOUT:-600} + OPENWEBUI_AUTO_FIX_METADATA: ${OPENWEBUI_AUTO_FIX_METADATA:-1} + OPENWEBUI_METADATA_TEMPLATE_JSON: ${OPENWEBUI_METADATA_TEMPLATE_JSON:-} OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080} OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY} OPENWEBUI_KB_NAME: ${OPENWEBUI_KB_NAME:-Homelab Library} @@ -59,6 +61,8 @@ services: OPENAI_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1} OPENAI_TRANSCRIBE_MODEL: ${OPENAI_TRANSCRIBE_MODEL:-whisper-1} OPENAI_TRANSCRIBE_TIMEOUT: ${OPENAI_TRANSCRIBE_TIMEOUT:-600} + OPENWEBUI_AUTO_FIX_METADATA: ${OPENWEBUI_AUTO_FIX_METADATA:-1} + OPENWEBUI_METADATA_TEMPLATE_JSON: ${OPENWEBUI_METADATA_TEMPLATE_JSON:-} WORKER_MODE: all OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080} OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY} @@ -104,6 +108,8 @@ services: OPENAI_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1} OPENAI_TRANSCRIBE_MODEL: ${OPENAI_TRANSCRIBE_MODEL:-whisper-1} OPENAI_TRANSCRIBE_TIMEOUT: ${OPENAI_TRANSCRIBE_TIMEOUT:-600} + OPENWEBUI_AUTO_FIX_METADATA: ${OPENWEBUI_AUTO_FIX_METADATA:-1} + OPENWEBUI_METADATA_TEMPLATE_JSON: ${OPENWEBUI_METADATA_TEMPLATE_JSON:-} WORKER_MODE: transcribe OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080} OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY} diff --git a/scripts/backfill_openwebui.py b/scripts/backfill_openwebui.py index a6cb0ca..78c492a 100755 --- a/scripts/backfill_openwebui.py +++ b/scripts/backfill_openwebui.py @@ -1,11 +1,15 @@ #!/usr/bin/env python3 -import os, sys +import os, sys, json from pathlib import Path import requests, orjson OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/") OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "") OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library") +OWUI_AUTO_FIX_METADATA = os.getenv("OPENWEBUI_AUTO_FIX_METADATA", "1").strip().lower() not in ("0", "false", "no") +OWUI_METADATA_TEMPLATE_JSON = os.getenv("OPENWEBUI_METADATA_TEMPLATE_JSON", "").strip() + +_TEMPLATE_PATCHED = False LIB = Path(os.getenv("LIBRARY_ROOT", "./library")) TRN = Path(os.getenv("TRANSCRIPT_ROOT", "./transcripts")) @@ -28,19 +32,76 @@ def get_or_create_kb(): r.raise_for_status() return r.json()["data"]["id"] + +def metadata_template_payload(): + if not OWUI_METADATA_TEMPLATE_JSON: + return {} + try: + return json.loads(OWUI_METADATA_TEMPLATE_JSON) + except Exception: + return OWUI_METADATA_TEMPLATE_JSON + + +def ensure_metadata_template(kb_id: str, force: bool = False) -> bool: + global _TEMPLATE_PATCHED + if not OWUI_AUTO_FIX_METADATA or not kb_id: + return False + if not force and _TEMPLATE_PATCHED: + return False + + payload_variants = [] + template_payload = metadata_template_payload() + payload_variants.append({"metadata_template": template_payload}) + if template_payload not in ({}, "", None): + payload_variants.append({"metadata_template": {}}) + payload_variants.append({"metadata_template": None}) + + hdrs = {**headers(), "Content-Type": "application/json"} + url = f"{OWUI_URL}/api/v1/knowledge/{kb_id}" + for payload in payload_variants: + try: + body = orjson.dumps(payload) + except Exception: + body = json.dumps(payload).encode("utf-8") + for method in ("PATCH", "PUT"): + try: + resp = requests.request(method, url, headers=hdrs, data=body, timeout=10) + except Exception: + continue + if resp.status_code in (200, 201, 202, 204): + _TEMPLATE_PATCHED = True + print(f"Applied metadata template via {method} for KB {kb_id}") + return True + return False + def upload_and_attach(path: Path, kb_id: str): + ensure_metadata_template(kb_id) with open(path, "rb") as f: r = requests.post(f"{OWUI_URL}/api/v1/files/", headers=headers(), files={"file": (path.name, f)}, timeout=60*10) r.raise_for_status() file_id = r.json()["data"]["id"] + payload = {"file_id": file_id} + body = orjson.dumps(payload) + hdrs = {**headers(), "Content-Type": "application/json"} r = requests.post(f"{OWUI_URL}/api/v1/knowledge/{kb_id}/file/add", - headers={**headers(), "Content-Type":"application/json"}, - data=orjson.dumps({"file_id": file_id}), timeout=60) + headers=hdrs, + data=body, timeout=60) + if r.status_code == 400 and OWUI_AUTO_FIX_METADATA: + txt = "" + try: + txt = r.text.lower() + except Exception: + txt = str(r.content).lower() + if "metadata" in txt and ensure_metadata_template(kb_id, force=True): + r = requests.post(f"{OWUI_URL}/api/v1/knowledge/{kb_id}/file/add", + headers=hdrs, + data=body, timeout=60) r.raise_for_status() print(f"Uploaded {path}") def main(): kb_id = get_or_create_kb() + ensure_metadata_template(kb_id) # transcripts for txt in TRN.glob("*.txt"): upload_and_attach(txt, kb_id) diff --git a/scripts/podx-tools.sh b/scripts/podx-tools.sh index 2cd2a69..5e51d98 100755 --- a/scripts/podx-tools.sh +++ b/scripts/podx-tools.sh @@ -63,6 +63,10 @@ fi : "${OPENWEBUI_API_KEY:=}" : "${OPENWEBUI_KB_ID:=}" : "${OPENWEBUI_WAIT_SECS:=180}" +: "${OPENWEBUI_AUTO_FIX_METADATA:=1}" +: "${OPENWEBUI_METADATA_TEMPLATE_JSON:=}" + +__OWUI_METADATA_PATCHED="" # ------------------------------ Helpers ------------------------------ _require() { @@ -142,6 +146,59 @@ PY printf '%s' "${__id:-}" } +_owui_metadata_template_payload() { + python3 - "$OPENWEBUI_METADATA_TEMPLATE_JSON" <<'PY' +import sys, json +raw = sys.argv[1] if len(sys.argv) > 1 else "" +raw = (raw or "").strip() +if not raw: + payload = {"metadata_template": {}} +else: + try: + payload = {"metadata_template": json.loads(raw)} + except Exception: + payload = {"metadata_template": raw} +print(json.dumps(payload)) +PY +} + +_owui_fix_metadata_template() { + local kb_id="$1" force="${2:-0}" + case "${OPENWEBUI_AUTO_FIX_METADATA,,}" in + 0|false|no) return 1 ;; + esac + [ -z "$kb_id" ] && return 1 + if [ "$force" != "1" ]; then + for existing in $__OWUI_METADATA_PATCHED; do + [ "$existing" = "$kb_id" ] && return 0 + done + fi + + local payload methods http_code tmp_body tmp_code + payload="$(_owui_metadata_template_payload)" + methods=(PATCH PUT) + for method in "${methods[@]}"; do + tmp_body="$(_mktemp)"; tmp_code="$(_mktemp)" + curl -sS -X "$method" \ + -H "Authorization: Bearer $OPENWEBUI_API_KEY" \ + -H "Content-Type: application/json" \ + -d "$payload" \ + -w "%{http_code}" --output "$tmp_body" \ + "$(_owui_url)/api/v1/knowledge/$kb_id" >"$tmp_code" || true + http_code="$(cat "$tmp_code" 2>/dev/null || echo 0)" + rm -f "$tmp_body" "$tmp_code" + case "$http_code" in + 200|201|202|204) + __OWUI_METADATA_PATCHED="${__OWUI_METADATA_PATCHED} $kb_id" + echo "[owui] metadata template adjusted via $method for KB $kb_id" + return 0 + ;; + 0|405) ;; + esac + done + return 1 +} + # ------------------------------ OWUI file helpers ------------------------------ _owui_file_get() { local fid="$1" @@ -551,33 +608,52 @@ PY KB_ID="$(_kb_id_by_name "$kb_name")" echo "[owui] attaching to KB: $kb_name (id: ${KB_ID:-})" [ -z "$KB_ID" ] && { echo "KB '$kb_name' not found (or ambiguous)." >&2; exit 1; } + _owui_fix_metadata_template "$KB_ID" || true - tmp_body="$(_mktemp)"; tmp_code="$(_mktemp)"; tmp_hdrs="$(_mktemp)" - curl -sS -X POST \ - -H "Authorization: Bearer $OPENWEBUI_API_KEY" \ - -H "Content-Type: application/json" \ - -d "{\"file_id\":\"$FILE_ID\"}" \ - -D "$tmp_hdrs" \ - -w "%{http_code}" --output "$tmp_body" \ - "$(_owui_url)/api/v1/knowledge/$KB_ID/file/add" >"$tmp_code" || true - curl_exit=$?; http_code="$(cat "$tmp_code" 2>/dev/null || echo 0)" - echo "[owui] response headers:"; sed -n '1,5p' "$tmp_hdrs" || true - RESP="$(cat "$tmp_body")" - echo "$RESP" | ppjson - rm -f "$tmp_body" "$tmp_code" "$tmp_hdrs" - [ -n "${TMP_EXTRACT:-}" ] && rm -f "$TMP_EXTRACT" || true + attach_payload="{\"file_id\":\"$FILE_ID\"}" + attempt=0 + while :; do + attempt=$((attempt+1)) + tmp_body="$(_mktemp)"; tmp_code="$(_mktemp)"; tmp_hdrs="$(_mktemp)" + curl -sS -X POST \ + -H "Authorization: Bearer $OPENWEBUI_API_KEY" \ + -H "Content-Type: application/json" \ + -d "$attach_payload" \ + -D "$tmp_hdrs" \ + -w "%{http_code}" --output "$tmp_body" \ + "$(_owui_url)/api/v1/knowledge/$KB_ID/file/add" >"$tmp_code" || true + curl_exit=$? + http_code="$(cat "$tmp_code" 2>/dev/null || echo 0)" + echo "[owui] response headers:"; sed -n '1,5p' "$tmp_hdrs" || true + RESP="$(cat "$tmp_body")" + echo "$RESP" | ppjson + rm -f "$tmp_body" "$tmp_code" "$tmp_hdrs" - [ $curl_exit -ne 0 ] && { echo "Attach failed: curl exit $curl_exit" >&2; exit $curl_exit; } - [ -z "$http_code" ] || [ "$http_code" = "000" ] && { echo "Attach failed: no HTTP code returned" >&2; exit 1; } - case "$http_code" in - 200|201|204) : ;; - *) - if printf '%s' "$RESP" | grep -qi "Duplicate content"; then - echo "[owui] duplicate content — already indexed. Treating as success."; exit 0 + [ $curl_exit -ne 0 ] && { echo "Attach failed: curl exit $curl_exit" >&2; exit $curl_exit; } + [ -z "$http_code" ] || [ "$http_code" = "000" ] && { echo "Attach failed: no HTTP code returned" >&2; exit 1; } + + if [ "$http_code" = "400" ] && printf '%s' "$RESP" | grep -qi "metadata"; then + if [ "$attempt" -lt 3 ] && _owui_fix_metadata_template "$KB_ID" 1; then + echo "[owui] retrying attach after metadata template fix" + continue fi - echo "Attach failed (HTTP $http_code)" >&2; exit 1 - ;; - esac + fi + + case "$http_code" in + 200|201|204) + break + ;; + *) + if printf '%s' "$RESP" | grep -qi "Duplicate content"; then + echo "[owui] duplicate content — already indexed. Treating as success." + break + fi + echo "Attach failed (HTTP $http_code)" >&2; exit 1 + ;; + esac + done + + [ -n "${TMP_EXTRACT:-}" ] && rm -f "$TMP_EXTRACT" || true ;; owui-attach-id) shift || true; kb_id="${1:-}"; file="${2:-}" @@ -627,31 +703,51 @@ PY echo "[owui] WARNING: timed out waiting for file extraction; attach may fail" >&2 fi - tmp_body="$(_mktemp)"; tmp_code="$(_mktemp)"; tmp_hdrs="$(_mktemp)" - curl -sS -X POST \ - -H "Authorization: Bearer $OPENWEBUI_API_KEY" \ - -H "Content-Type: application/json" \ - -d "{\"file_id\":\"$FILE_ID\"}" \ - -D "$tmp_hdrs" \ - -w "%{http_code}" --output "$tmp_body" \ - "$(_owui_url)/api/v1/knowledge/$kb_id/file/add" >"$tmp_code" || true - curl_exit=$?; http_code="$(cat "$tmp_code" 2>/dev/null || echo 0)" - echo "[owui] response headers:"; sed -n '1,5p' "$tmp_hdrs" || true - RESP="$(cat "$tmp_body")"; echo "$RESP" | ppjson - rm -f "$tmp_body" "$tmp_code" "$tmp_hdrs" - [ -n "${TMP_EXTRACT:-}" ] && rm -f "$TMP_EXTRACT" || true + _owui_fix_metadata_template "$kb_id" || true - [ $curl_exit -ne 0 ] && { echo "Attach failed: curl exit $curl_exit" >&2; exit $curl_exit; } - [ -z "$http_code" ] || [ "$http_code" = "000" ] && { echo "Attach failed: no HTTP code returned" >&2; exit 1; } - case "$http_code" in - 200|201|204) : ;; - *) - if printf '%s' "$RESP" | grep -qi "Duplicate content"; then - echo "[owui] duplicate content — already indexed. Treating as success."; exit 0 + attach_payload="{\"file_id\":\"$FILE_ID\"}" + attempt=0 + while :; do + attempt=$((attempt+1)) + tmp_body="$(_mktemp)"; tmp_code="$(_mktemp)"; tmp_hdrs="$(_mktemp)" + curl -sS -X POST \ + -H "Authorization: Bearer $OPENWEBUI_API_KEY" \ + -H "Content-Type: application/json" \ + -d "$attach_payload" \ + -D "$tmp_hdrs" \ + -w "%{http_code}" --output "$tmp_body" \ + "$(_owui_url)/api/v1/knowledge/$kb_id/file/add" >"$tmp_code" || true + curl_exit=$? + http_code="$(cat "$tmp_code" 2>/dev/null || echo 0)" + echo "[owui] response headers:"; sed -n '1,5p' "$tmp_hdrs" || true + RESP="$(cat "$tmp_body")"; echo "$RESP" | ppjson + rm -f "$tmp_body" "$tmp_code" "$tmp_hdrs" + + [ $curl_exit -ne 0 ] && { echo "Attach failed: curl exit $curl_exit" >&2; exit $curl_exit; } + [ -z "$http_code" ] || [ "$http_code" = "000" ] && { echo "Attach failed: no HTTP code returned" >&2; exit 1; } + + if [ "$http_code" = "400" ] && printf '%s' "$RESP" | grep -qi "metadata"; then + if [ "$attempt" -lt 3 ] && _owui_fix_metadata_template "$kb_id" 1; then + echo "[owui] retrying attach after metadata template fix" + continue fi - echo "Attach failed (HTTP $http_code)" >&2; exit 1 - ;; - esac + fi + + case "$http_code" in + 200|201|204) + break + ;; + *) + if printf '%s' "$RESP" | grep -qi "Duplicate content"; then + echo "[owui] duplicate content — already indexed. Treating as success." + break + fi + echo "Attach failed (HTTP $http_code)" >&2; exit 1 + ;; + esac + done + + [ -n "${TMP_EXTRACT:-}" ] && rm -f "$TMP_EXTRACT" || true ;; owui-kb-files) shift || true; kb_name="${1:-}" @@ -753,4 +849,4 @@ PY _help exit 1 ;; -esac \ No newline at end of file +esac