Fix document id in Meilisearch JSON payload

2025-09-07 20:19:17 +02:00
parent 70b7449bc1
commit 69a61f7350
1 changed files with 62 additions and 9 deletions
--- a/app/worker.py
+++ b/app/worker.py
@@ -1022,26 +1022,79 @@ def transcribe(media_path: Path):
    print(f"[whisper] finished: {media_path}  lang={info.language}  segments={len(segs)}  dur={dur:.2f}s", flush=True)
    return base

+
+# --- Meilisearch helpers ---
+def _safe_doc_id(s: str) -> str:
+    """Return a Meilisearch-safe document ID ([A-Za-z0-9_-] only) derived from ``s``.
+
+    Whitespace runs become "_", every other disallowed character is dropped, and
+    an empty result is replaced by the first 16 hex digits of SHA-1 of the input.
+    """
+    import hashlib
+    raw = s or ""
+    collapsed = re.sub(r"\s+", "_", raw.strip())
+    cleaned = re.sub(r"[^A-Za-z0-9_-]", "", collapsed)
+    return cleaned or hashlib.sha1(raw.encode("utf-8", errors="ignore")).hexdigest()[:16]
+
+
+def ensure_meili_index():
+    """Best-effort: create index 'library' (primaryKey 'id') unless GET reports it exists.
+
+    Swallows every Exception; failures are only printed. An existing index is left untouched.
+    """
+    try:
+        auth = {"Authorization": f"Bearer {MEILI_KEY}"}
+        r = requests.get(f"{MEILI_URL}/indexes/library", headers=auth, timeout=10)
+        if r.status_code == 200:
+            return
+        cr = requests.post(
+            f"{MEILI_URL}/indexes",
+            headers={**auth, "Content-Type": "application/json"},
+            data=orjson.dumps({"uid": "library", "primaryKey": "id"}),
+            timeout=10,
+        )
+        if not cr.ok:
+            # May just mean another process created it first, so warn instead of raising.
+            print(f"[meili] create index 'library' returned HTTP {cr.status_code}", flush=True)
+    except Exception as exc:
+        # Non-fatal; indexing will fail later if the index truly doesn't exist
+        print(f"[meili] could not ensure index 'library': {exc}", flush=True)
+
+
 def index_meili(json_path: Path):
+    # Make sure the index exists and is configured with a primary key
+    ensure_meili_index()
+
    doc = json.loads(open(json_path, "r", encoding="utf-8").read())
-    title = Path(doc["file"]).stem
-    date = re.findall(r"\b(\d{8})\b", title)
+    file_field = doc.get("file", "")
+    title = Path(file_field).stem if file_field else json_path.stem
+
+    # Build a Meili-safe document ID
+    doc_id = _safe_doc_id(title)
+
+    # Extract a YYYYMMDD date if present
+    m = re.search(r"\b(\d{8})\b", title)
+    date = m.group(1) if m else ""
+
    payload = {
-        "id": title,
+        "id": doc_id,
        "type": "podcast",
        "title": title,
-        "date": date[0] if date else "",
-        "source": str(Path(LIB, Path(doc["file"]).name)),
-        "text": " ".join(s["text"] for s in doc.get("segments", [])),
+        "date": date,
+        "source": str(Path(LIB, Path(file_field or title).name)),
+        "text": " ".join(s.get("text", "") for s in doc.get("segments", [])),
        "segments": doc.get("segments", []),
-        "meta": {"language": doc.get("language", "")}
+        "meta": {"language": doc.get("language", "")},
    }
-    import time
+
    for attempt in range(5):
        try:
            r = requests.post(
                f"{MEILI_URL}/indexes/library/documents",
-                headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type":"application/json"},
+                headers={
+                    "Authorization": f"Bearer {MEILI_KEY}",
+                    "Content-Type": "application/json",
+                },
                data=orjson.dumps(payload),
                timeout=15,
            )