Fixing metadata import errors

This commit is contained in:
2025-09-24 11:38:53 +02:00
parent 73e89b9a67
commit 6240e86650
6 changed files with 294 additions and 53 deletions

View File

@@ -9,6 +9,9 @@ OPENWEBUI_URL=http://openwebui:3000
OPENWEBUI_API_KEY=put_your_openwebui_api_key_here
OPENWEBUI_KB_NAME=Homelab Library
OPENWEBUI_KB_ID=your_kb_uuid_here
OPENWEBUI_AUTO_FIX_METADATA=1
# Optional: JSON string to enforce as metadata template when auto-fix runs
# OPENWEBUI_METADATA_TEMPLATE_JSON={}
# Transcription backend (local Whisper by default)
TRANSCRIBE_BACKEND=local

View File

@@ -41,6 +41,8 @@ Note: `.env.example` includes placeholders for both **Meili** and **OpenWebUI**
- `OPENWEBUI_API_KEY`: API key for authenticating PodX workers with OpenWebUI.
- `OPENWEBUI_KB_NAME`: Human-readable Knowledge Base name to attach documents to.
- `OPENWEBUI_KB_ID`: Fixed UUID of the Knowledge Base (avoids duplicate KBs on restart).
- `OPENWEBUI_AUTO_FIX_METADATA` (default `1`): When enabled, PodX clears/overrides the Knowledge Base metadata template before uploads so that an invalid template cannot crash ingestion. Set it to `0`, `false`, or `no` to disable.
- `OPENWEBUI_METADATA_TEMPLATE_JSON`: Optional JSON applied when the auto-fix runs (defaults to `{}`, i.e., no custom metadata template). A value that is not valid JSON is sent as a raw string template.
## RSS Ingestion

View File

@@ -97,6 +97,10 @@ DEFAULT_TRANSCRIPT_LANG = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or
OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/")
OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "")
OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library")
OWUI_AUTO_FIX_METADATA = os.getenv("OPENWEBUI_AUTO_FIX_METADATA", "1").strip().lower() not in ("0", "false", "no")
OWUI_METADATA_TEMPLATE_JSON = os.getenv("OPENWEBUI_METADATA_TEMPLATE_JSON", "").strip()
_OWUI_TEMPLATE_PATCHED: set[str] = set()
# Redis-backed job queue settings and offload toggle
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0").strip()
@@ -1355,6 +1359,56 @@ def is_media_url(url: str):
def owui_headers():
    """Build the auth headers for OpenWebUI requests.

    Returns a dict carrying a Bearer ``Authorization`` header when
    ``OWUI_KEY`` is set, and an empty dict otherwise.
    """
    if not OWUI_KEY:
        return {}
    return {"Authorization": f"Bearer {OWUI_KEY}"}
def _owui_metadata_template_payload():
    """Return the metadata template payload to apply when auto-fix is enabled.

    A blank ``OWUI_METADATA_TEMPLATE_JSON`` yields ``{}``. A value that parses
    as JSON is returned decoded; anything else is returned unchanged so it can
    be sent as a raw string template.
    """
    if not OWUI_METADATA_TEMPLATE_JSON:
        return {}
    try:
        return json.loads(OWUI_METADATA_TEMPLATE_JSON)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError: the value is not JSON,
        # so treat it as a raw string template instead of failing.
        return OWUI_METADATA_TEMPLATE_JSON
def owui_fix_metadata_template(kb_id: str, force: bool = False) -> bool:
    """Ensure the target knowledge base has a safe metadata template.

    Sends the configured template (``{}`` when none is configured) to
    ``{OWUI_URL}/api/v1/knowledge/{kb_id}``, trying PATCH and then PUT for
    each payload. An empty object (only when a custom template is configured)
    and finally ``null`` are tried as fallbacks.

    Args:
        kb_id: Id of the knowledge base to update.
        force: Re-apply even if this KB was already patched by this process.

    Returns:
        True if a request made by this call succeeded. False when auto-fix is
        disabled, the URL/API key/KB id is missing, the KB was already patched
        (and ``force`` is false), or every attempt failed.
    """
    if not (OWUI_AUTO_FIX_METADATA and OWUI_URL and OWUI_KEY and kb_id):
        return False
    if not force and kb_id in _OWUI_TEMPLATE_PATCHED:
        return False

    template_payload = _owui_metadata_template_payload()
    payload_variants: list[object] = [{"metadata_template": template_payload}]
    if template_payload not in ({}, "", None):
        payload_variants.append({"metadata_template": {}})
    payload_variants.append({"metadata_template": None})

    headers = {**owui_headers(), "Content-Type": "application/json"}
    url = f"{OWUI_URL}/api/v1/knowledge/{kb_id}"
    success_codes = {200, 201, 202, 204}
    last_error = "no request attempted"

    for payload in payload_variants:
        try:
            body = orjson.dumps(payload)
        except TypeError:
            # orjson.JSONEncodeError subclasses TypeError; fall back to the
            # stdlib encoder for values orjson refuses.
            body = json.dumps(payload).encode("utf-8")
        for method in ("PATCH", "PUT"):
            try:
                resp = requests.request(method, url, headers=headers, data=body, timeout=15)
            except requests.RequestException as exc:
                # Best-effort: remember the failure and try the next method.
                last_error = f"{method} request failed: {exc}"
                continue
            if resp.status_code in success_codes:
                print(f"[owui] metadata template adjusted via {method} for KB {kb_id}", flush=True)
                _OWUI_TEMPLATE_PATCHED.add(kb_id)
                return True
            last_error = f"{method} returned HTTP {resp.status_code}"

    # Surface the failure so a later ingestion error can be traced back here.
    print(f"[owui] metadata template update failed for KB {kb_id}: {last_error}", flush=True)
    return False
def owui_get_or_create_kb():
"""Return a KB id for OWUI_KB without creating duplicates.
Honors OPENWEBUI_KB_ID, and tolerates both list and {"data": ...} response shapes.
@@ -1413,6 +1467,8 @@ def owui_get_or_create_kb():
return None
def owui_upload_and_attach(path: Path, kb_id: str):
if OWUI_AUTO_FIX_METADATA:
owui_fix_metadata_template(kb_id)
with open(path, "rb") as f:
r = requests.post(f"{OWUI_URL}/api/v1/files/", headers=owui_headers(), files={"file": (path.name, f)}, timeout=60*10)
r.raise_for_status()
@@ -1420,12 +1476,28 @@ def owui_upload_and_attach(path: Path, kb_id: str):
file_id = (up.get("id") or (up.get("data") or {}).get("id"))
if not file_id:
raise RuntimeError(f"OWUI upload: could not get file id from response: {up}")
payload = {"file_id": file_id}
attach_headers = {**owui_headers(), "Content-Type": "application/json"}
body = orjson.dumps(payload)
r = requests.post(
f"{OWUI_URL}/api/v1/knowledge/{kb_id}/file/add",
headers={**owui_headers(), "Content-Type": "application/json"},
data=orjson.dumps({"file_id": file_id}),
headers=attach_headers,
data=body,
timeout=180,
)
if r.status_code == 400 and OWUI_AUTO_FIX_METADATA:
txt = ""
try:
txt = r.text.lower()
except Exception:
txt = str(r.content).lower()
if "metadata" in txt and owui_fix_metadata_template(kb_id, force=True):
r = requests.post(
f"{OWUI_URL}/api/v1/knowledge/{kb_id}/file/add",
headers=attach_headers,
data=body,
timeout=180,
)
r.raise_for_status()
try:
time.sleep(0.5)
@@ -1441,6 +1513,7 @@ def publish_to_openwebui(paths):
if not kb_id:
print("[owui] KB resolve failed; skipping attach to avoid accidental duplicates", flush=True)
return
owui_fix_metadata_template(kb_id)
for p in paths:
p = Path(p)
if not p.exists():

View File

@@ -16,6 +16,8 @@ services:
OPENAI_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1}
OPENAI_TRANSCRIBE_MODEL: ${OPENAI_TRANSCRIBE_MODEL:-whisper-1}
OPENAI_TRANSCRIBE_TIMEOUT: ${OPENAI_TRANSCRIBE_TIMEOUT:-600}
OPENWEBUI_AUTO_FIX_METADATA: ${OPENWEBUI_AUTO_FIX_METADATA:-1}
OPENWEBUI_METADATA_TEMPLATE_JSON: ${OPENWEBUI_METADATA_TEMPLATE_JSON:-}
OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080}
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}
OPENWEBUI_KB_NAME: ${OPENWEBUI_KB_NAME:-Homelab Library}
@@ -59,6 +61,8 @@ services:
OPENAI_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1}
OPENAI_TRANSCRIBE_MODEL: ${OPENAI_TRANSCRIBE_MODEL:-whisper-1}
OPENAI_TRANSCRIBE_TIMEOUT: ${OPENAI_TRANSCRIBE_TIMEOUT:-600}
OPENWEBUI_AUTO_FIX_METADATA: ${OPENWEBUI_AUTO_FIX_METADATA:-1}
OPENWEBUI_METADATA_TEMPLATE_JSON: ${OPENWEBUI_METADATA_TEMPLATE_JSON:-}
WORKER_MODE: all
OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080}
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}
@@ -104,6 +108,8 @@ services:
OPENAI_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1}
OPENAI_TRANSCRIBE_MODEL: ${OPENAI_TRANSCRIBE_MODEL:-whisper-1}
OPENAI_TRANSCRIBE_TIMEOUT: ${OPENAI_TRANSCRIBE_TIMEOUT:-600}
OPENWEBUI_AUTO_FIX_METADATA: ${OPENWEBUI_AUTO_FIX_METADATA:-1}
OPENWEBUI_METADATA_TEMPLATE_JSON: ${OPENWEBUI_METADATA_TEMPLATE_JSON:-}
WORKER_MODE: transcribe
OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080}
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}

View File

@@ -1,11 +1,15 @@
#!/usr/bin/env python3
import os, sys
import os, sys, json
from pathlib import Path
import requests, orjson
OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/")
OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "")
OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library")
OWUI_AUTO_FIX_METADATA = os.getenv("OPENWEBUI_AUTO_FIX_METADATA", "1").strip().lower() not in ("0", "false", "no")
OWUI_METADATA_TEMPLATE_JSON = os.getenv("OPENWEBUI_METADATA_TEMPLATE_JSON", "").strip()
_TEMPLATE_PATCHED = False
LIB = Path(os.getenv("LIBRARY_ROOT", "./library"))
TRN = Path(os.getenv("TRANSCRIPT_ROOT", "./transcripts"))
@@ -28,19 +32,76 @@ def get_or_create_kb():
r.raise_for_status()
return r.json()["data"]["id"]
def metadata_template_payload():
    """Return the metadata template to send when the auto-fix runs.

    A blank ``OWUI_METADATA_TEMPLATE_JSON`` yields ``{}``. A value that parses
    as JSON is returned decoded; anything else is returned unchanged so it can
    be sent as a raw string template.
    """
    if not OWUI_METADATA_TEMPLATE_JSON:
        return {}
    try:
        return json.loads(OWUI_METADATA_TEMPLATE_JSON)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError: not JSON, so pass the
        # raw string through instead of failing.
        return OWUI_METADATA_TEMPLATE_JSON
def ensure_metadata_template(kb_id: str, force: bool = False) -> bool:
    """Apply a safe metadata template to the knowledge base, once per run.

    Sends the configured template (``{}`` when none is configured) to
    ``{OWUI_URL}/api/v1/knowledge/{kb_id}``, trying PATCH and then PUT for
    each payload. An empty object (only when a custom template is configured)
    and finally ``null`` are tried as fallbacks.

    Args:
        kb_id: Id of the knowledge base to update.
        force: Re-apply even if a template was already applied in this run.

    Returns:
        True if a request made by this call succeeded. False when auto-fix is
        disabled, the URL or KB id is missing, a template was already applied
        (and ``force`` is false), or every attempt failed.
    """
    global _TEMPLATE_PATCHED
    # Without a base URL every request below would fail; bail out early.
    if not (OWUI_AUTO_FIX_METADATA and OWUI_URL and kb_id):
        return False
    if not force and _TEMPLATE_PATCHED:
        return False

    template_payload = metadata_template_payload()
    payload_variants = [{"metadata_template": template_payload}]
    if template_payload not in ({}, "", None):
        payload_variants.append({"metadata_template": {}})
    payload_variants.append({"metadata_template": None})

    hdrs = {**headers(), "Content-Type": "application/json"}
    url = f"{OWUI_URL}/api/v1/knowledge/{kb_id}"
    last_error = "no request attempted"

    for payload in payload_variants:
        try:
            body = orjson.dumps(payload)
        except TypeError:
            # orjson.JSONEncodeError subclasses TypeError; fall back to the
            # stdlib encoder for values orjson refuses.
            body = json.dumps(payload).encode("utf-8")
        for method in ("PATCH", "PUT"):
            try:
                resp = requests.request(method, url, headers=hdrs, data=body, timeout=10)
            except requests.RequestException as exc:
                # Best-effort: remember the failure and try the next method.
                last_error = f"{method} request failed: {exc}"
                continue
            if resp.status_code in (200, 201, 202, 204):
                _TEMPLATE_PATCHED = True
                print(f"Applied metadata template via {method} for KB {kb_id}")
                return True
            last_error = f"{method} returned HTTP {resp.status_code}"

    # Report the failure instead of returning False silently.
    print(f"Metadata template update failed for KB {kb_id}: {last_error}")
    return False
def upload_and_attach(path: Path, kb_id: str):
    """Upload *path* to OpenWebUI and attach the file to knowledge base *kb_id*.

    The metadata template fix is applied before uploading. If the attach call
    is rejected with HTTP 400 and the response mentions "metadata", the fix is
    forced once more and the attach is retried a single time.

    Raises:
        requests.HTTPError: If the upload or the final attach request fails.
        RuntimeError: If the upload response does not contain a file id.
    """
    ensure_metadata_template(kb_id)
    with open(path, "rb") as f:
        r = requests.post(f"{OWUI_URL}/api/v1/files/", headers=headers(), files={"file": (path.name, f)}, timeout=60*10)
    r.raise_for_status()

    # Accept both a top-level "id" and a {"data": {"id": ...}} envelope.
    up = r.json()
    file_id = up.get("id") or (up.get("data") or {}).get("id")
    if not file_id:
        raise RuntimeError(f"OWUI upload: could not get file id from response: {up}")

    attach_url = f"{OWUI_URL}/api/v1/knowledge/{kb_id}/file/add"
    hdrs = {**headers(), "Content-Type": "application/json"}
    body = orjson.dumps({"file_id": file_id})

    def _attach():
        # One definition of the attach request, shared by first try and retry.
        return requests.post(attach_url, headers=hdrs, data=body, timeout=60)

    r = _attach()
    if r.status_code == 400 and OWUI_AUTO_FIX_METADATA:
        try:
            txt = r.text.lower()
        except Exception:
            txt = str(r.content).lower()
        # Retry only when the rejection looks template-related and the forced
        # fix actually went through.
        if "metadata" in txt and ensure_metadata_template(kb_id, force=True):
            r = _attach()
    r.raise_for_status()
    print(f"Uploaded {path}")
def main():
kb_id = get_or_create_kb()
ensure_metadata_template(kb_id)
# transcripts
for txt in TRN.glob("*.txt"):
upload_and_attach(txt, kb_id)

View File

@@ -63,6 +63,10 @@ fi
: "${OPENWEBUI_API_KEY:=}"
: "${OPENWEBUI_KB_ID:=}"
: "${OPENWEBUI_WAIT_SECS:=180}"
: "${OPENWEBUI_AUTO_FIX_METADATA:=1}"
: "${OPENWEBUI_METADATA_TEMPLATE_JSON:=}"
__OWUI_METADATA_PATCHED=""
# ------------------------------ Helpers ------------------------------
_require() {
@@ -142,6 +146,59 @@ PY
printf '%s' "${__id:-}"
}
# Print the JSON request body for a metadata-template update on stdout.
# Reads OPENWEBUI_METADATA_TEMPLATE_JSON:
#   blank       -> {"metadata_template": {}}
#   valid JSON  -> the decoded value under "metadata_template"
#   anything else -> the raw string under "metadata_template"
_owui_metadata_template_payload() {
  # ${...:-} keeps this safe under `set -u` when the variable is unset.
  python3 - "${OPENWEBUI_METADATA_TEMPLATE_JSON:-}" <<'PY'
import sys, json

raw = sys.argv[1] if len(sys.argv) > 1 else ""
raw = (raw or "").strip()
if not raw:
    payload = {"metadata_template": {}}
else:
    try:
        payload = {"metadata_template": json.loads(raw)}
    except ValueError:
        # Not valid JSON: pass the value through as a raw string template.
        payload = {"metadata_template": raw}
print(json.dumps(payload))
PY
}
# Apply the configured metadata template to a knowledge base (PATCH, then PUT).
#   $1  knowledge base id
#   $2  pass 1 to re-apply even if this KB was already patched in this run
# Returns 0 on success or when the KB was already patched; returns 1 when the
# auto-fix is disabled, the KB id is empty, or neither method succeeded.
_owui_fix_metadata_template() {
  local kb_id="${1:-}" force="${2:-0}"
  local payload method http_code=""

  # Portable case-insensitive match (no bash-4-only ${var,,} expansion).
  case "${OPENWEBUI_AUTO_FIX_METADATA:-1}" in
    0|[Ff][Aa][Ll][Ss][Ee]|[Nn][Oo]) return 1 ;;
  esac
  [ -z "$kb_id" ] && return 1

  if [ "$force" != "1" ]; then
    case " ${__OWUI_METADATA_PATCHED:-} " in
      *" $kb_id "*) return 0 ;;
    esac
  fi

  payload="$(_owui_metadata_template_payload)"
  for method in PATCH PUT; do
    # Only the status code matters; discard the body instead of using temp files.
    http_code="$(curl -sS -X "$method" \
      -H "Authorization: Bearer $OPENWEBUI_API_KEY" \
      -H "Content-Type: application/json" \
      -d "$payload" \
      -o /dev/null -w "%{http_code}" \
      "$(_owui_url)/api/v1/knowledge/$kb_id" || true)"
    case "$http_code" in
      200|201|202|204)
        # Record the KB once, even when a forced re-apply succeeds again.
        case " ${__OWUI_METADATA_PATCHED:-} " in
          *" $kb_id "*) ;;
          *) __OWUI_METADATA_PATCHED="${__OWUI_METADATA_PATCHED:-} $kb_id" ;;
        esac
        echo "[owui] metadata template adjusted via $method for KB $kb_id"
        return 0
        ;;
    esac
  done

  echo "[owui] metadata template update failed for KB $kb_id (last HTTP ${http_code:-000})" >&2
  return 1
}
# ------------------------------ OWUI file helpers ------------------------------
_owui_file_get() {
local fid="$1"
@@ -551,33 +608,52 @@ PY
KB_ID="$(_kb_id_by_name "$kb_name")"
echo "[owui] attaching to KB: $kb_name (id: ${KB_ID:-<none>})"
[ -z "$KB_ID" ] && { echo "KB '$kb_name' not found (or ambiguous)." >&2; exit 1; }
_owui_fix_metadata_template "$KB_ID" || true
tmp_body="$(_mktemp)"; tmp_code="$(_mktemp)"; tmp_hdrs="$(_mktemp)"
curl -sS -X POST \
-H "Authorization: Bearer $OPENWEBUI_API_KEY" \
-H "Content-Type: application/json" \
-d "{\"file_id\":\"$FILE_ID\"}" \
-D "$tmp_hdrs" \
-w "%{http_code}" --output "$tmp_body" \
"$(_owui_url)/api/v1/knowledge/$KB_ID/file/add" >"$tmp_code" || true
curl_exit=$?; http_code="$(cat "$tmp_code" 2>/dev/null || echo 0)"
echo "[owui] response headers:"; sed -n '1,5p' "$tmp_hdrs" || true
RESP="$(cat "$tmp_body")"
echo "$RESP" | ppjson
rm -f "$tmp_body" "$tmp_code" "$tmp_hdrs"
[ -n "${TMP_EXTRACT:-}" ] && rm -f "$TMP_EXTRACT" || true
attach_payload="{\"file_id\":\"$FILE_ID\"}"
attempt=0
while :; do
attempt=$((attempt+1))
tmp_body="$(_mktemp)"; tmp_code="$(_mktemp)"; tmp_hdrs="$(_mktemp)"
curl -sS -X POST \
-H "Authorization: Bearer $OPENWEBUI_API_KEY" \
-H "Content-Type: application/json" \
-d "$attach_payload" \
-D "$tmp_hdrs" \
-w "%{http_code}" --output "$tmp_body" \
"$(_owui_url)/api/v1/knowledge/$KB_ID/file/add" >"$tmp_code" || true
curl_exit=$?
http_code="$(cat "$tmp_code" 2>/dev/null || echo 0)"
echo "[owui] response headers:"; sed -n '1,5p' "$tmp_hdrs" || true
RESP="$(cat "$tmp_body")"
echo "$RESP" | ppjson
rm -f "$tmp_body" "$tmp_code" "$tmp_hdrs"
[ $curl_exit -ne 0 ] && { echo "Attach failed: curl exit $curl_exit" >&2; exit $curl_exit; }
[ -z "$http_code" ] || [ "$http_code" = "000" ] && { echo "Attach failed: no HTTP code returned" >&2; exit 1; }
case "$http_code" in
200|201|204) : ;;
*)
if printf '%s' "$RESP" | grep -qi "Duplicate content"; then
echo "[owui] duplicate content — already indexed. Treating as success."; exit 0
[ $curl_exit -ne 0 ] && { echo "Attach failed: curl exit $curl_exit" >&2; exit $curl_exit; }
[ -z "$http_code" ] || [ "$http_code" = "000" ] && { echo "Attach failed: no HTTP code returned" >&2; exit 1; }
if [ "$http_code" = "400" ] && printf '%s' "$RESP" | grep -qi "metadata"; then
if [ "$attempt" -lt 3 ] && _owui_fix_metadata_template "$KB_ID" 1; then
echo "[owui] retrying attach after metadata template fix"
continue
fi
echo "Attach failed (HTTP $http_code)" >&2; exit 1
;;
esac
fi
case "$http_code" in
200|201|204)
break
;;
*)
if printf '%s' "$RESP" | grep -qi "Duplicate content"; then
echo "[owui] duplicate content — already indexed. Treating as success."
break
fi
echo "Attach failed (HTTP $http_code)" >&2; exit 1
;;
esac
done
[ -n "${TMP_EXTRACT:-}" ] && rm -f "$TMP_EXTRACT" || true
;;
owui-attach-id)
shift || true; kb_id="${1:-}"; file="${2:-}"
@@ -627,31 +703,51 @@ PY
echo "[owui] WARNING: timed out waiting for file extraction; attach may fail" >&2
fi
tmp_body="$(_mktemp)"; tmp_code="$(_mktemp)"; tmp_hdrs="$(_mktemp)"
curl -sS -X POST \
-H "Authorization: Bearer $OPENWEBUI_API_KEY" \
-H "Content-Type: application/json" \
-d "{\"file_id\":\"$FILE_ID\"}" \
-D "$tmp_hdrs" \
-w "%{http_code}" --output "$tmp_body" \
"$(_owui_url)/api/v1/knowledge/$kb_id/file/add" >"$tmp_code" || true
curl_exit=$?; http_code="$(cat "$tmp_code" 2>/dev/null || echo 0)"
echo "[owui] response headers:"; sed -n '1,5p' "$tmp_hdrs" || true
RESP="$(cat "$tmp_body")"; echo "$RESP" | ppjson
rm -f "$tmp_body" "$tmp_code" "$tmp_hdrs"
[ -n "${TMP_EXTRACT:-}" ] && rm -f "$TMP_EXTRACT" || true
_owui_fix_metadata_template "$kb_id" || true
[ $curl_exit -ne 0 ] && { echo "Attach failed: curl exit $curl_exit" >&2; exit $curl_exit; }
[ -z "$http_code" ] || [ "$http_code" = "000" ] && { echo "Attach failed: no HTTP code returned" >&2; exit 1; }
case "$http_code" in
200|201|204) : ;;
*)
if printf '%s' "$RESP" | grep -qi "Duplicate content"; then
echo "[owui] duplicate content — already indexed. Treating as success."; exit 0
attach_payload="{\"file_id\":\"$FILE_ID\"}"
attempt=0
while :; do
attempt=$((attempt+1))
tmp_body="$(_mktemp)"; tmp_code="$(_mktemp)"; tmp_hdrs="$(_mktemp)"
curl -sS -X POST \
-H "Authorization: Bearer $OPENWEBUI_API_KEY" \
-H "Content-Type: application/json" \
-d "$attach_payload" \
-D "$tmp_hdrs" \
-w "%{http_code}" --output "$tmp_body" \
"$(_owui_url)/api/v1/knowledge/$kb_id/file/add" >"$tmp_code" || true
curl_exit=$?
http_code="$(cat "$tmp_code" 2>/dev/null || echo 0)"
echo "[owui] response headers:"; sed -n '1,5p' "$tmp_hdrs" || true
RESP="$(cat "$tmp_body")"; echo "$RESP" | ppjson
rm -f "$tmp_body" "$tmp_code" "$tmp_hdrs"
[ $curl_exit -ne 0 ] && { echo "Attach failed: curl exit $curl_exit" >&2; exit $curl_exit; }
[ -z "$http_code" ] || [ "$http_code" = "000" ] && { echo "Attach failed: no HTTP code returned" >&2; exit 1; }
if [ "$http_code" = "400" ] && printf '%s' "$RESP" | grep -qi "metadata"; then
if [ "$attempt" -lt 3 ] && _owui_fix_metadata_template "$kb_id" 1; then
echo "[owui] retrying attach after metadata template fix"
continue
fi
echo "Attach failed (HTTP $http_code)" >&2; exit 1
;;
esac
fi
case "$http_code" in
200|201|204)
break
;;
*)
if printf '%s' "$RESP" | grep -qi "Duplicate content"; then
echo "[owui] duplicate content — already indexed. Treating as success."
break
fi
echo "Attach failed (HTTP $http_code)" >&2; exit 1
;;
esac
done
[ -n "${TMP_EXTRACT:-}" ] && rm -f "$TMP_EXTRACT" || true
;;
owui-kb-files)
shift || true; kb_name="${1:-}"
@@ -753,4 +849,4 @@ PY
_help
exit 1
;;
esac
esac