Fix id in json

2025-09-07 20:19:17 +02:00
parent 70b7449bc1
commit 69a61f7350
1 changed files with 62 additions and 9 deletions
--- a/app/worker.py
+++ b/app/worker.py
@@ -1022,26 +1022,79 @@ def transcribe(media_path: Path):
    print(f"[whisper] finished: {media_path}  lang={info.language}  segments={len(segs)}  dur={dur:.2f}s", flush=True)
    return base
 # --- Meilisearch helpers ---
 def _safe_doc_id(s: str) -> str:
    """
    Reduce an arbitrary title to a slug made only of [A-Za-z0-9_-], the
    character set Meilisearch accepts for document IDs.

    Surrounding whitespace is dropped, every inner whitespace run becomes a
    single underscore, and any remaining character outside the allowed set is
    removed. If nothing survives, the first 16 hex digits of the SHA1 of the
    original text are returned instead. A falsy ``s`` is treated as "".
    """
    import hashlib

    raw = s or ""
    underscored = re.sub(r"\s+", "_", raw.strip())
    cleaned = re.sub(r"[^A-Za-z0-9_-]", "", underscored)
    if cleaned:
        return cleaned
    # Nothing usable left: derive a short, deterministic ID from the input.
    digest = hashlib.sha1(raw.encode("utf-8", errors="ignore")).hexdigest()
    return digest[:16]
 def ensure_meili_index():
    """
    Create index 'library' with primaryKey 'id' if it does not already exist.

    Best-effort: this function never raises. A GET on the index that returns
    200 means there is nothing to do; any other status triggers a create
    request. Failures are printed instead of being silently dropped, so an
    unreachable server or a rejected key shows up in the worker output rather
    than only as a later indexing error.
    """
    auth = {"Authorization": f"Bearer {MEILI_KEY}"}
    try:
        r = requests.get(f"{MEILI_URL}/indexes/library", headers=auth, timeout=10)
        if r.status_code == 200:
            return
        # Attempt to create it
        cr = requests.post(
            f"{MEILI_URL}/indexes",
            headers={**auth, "Content-Type": "application/json"},
            data=orjson.dumps({"uid": "library", "primaryKey": "id"}),
            timeout=10,
        )
        # An "already exists" answer just means another process created it
        # first; any other error status is reported but stays non-fatal.
        if not cr.ok and "index_already_exists" not in cr.text:
            print(
                f"[meili] creating index 'library' failed: "
                f"HTTP {cr.status_code} {cr.text[:200]}",
                flush=True,
            )
    except Exception as exc:
        # Non-fatal; indexing will fail later if the index truly doesn't exist
        print(f"[meili] ensure_meili_index skipped: {exc!r}", flush=True)
 def index_meili(json_path: Path):
    # Make sure the index exists and is configured with a primary key
    ensure_meili_index()
    doc = json.loads(open(json_path, "r", encoding="utf-8").read())
-    title = Path(doc["file"]).stem
+    file_field = doc.get("file", "")
-    date = re.findall(r"\b(\d{8})\b", title)
+    title = Path(file_field).stem if file_field else json_path.stem
    # Build a Meili-safe document ID
    doc_id = _safe_doc_id(title)
    # Extract a YYYYMMDD date if present
    m = re.search(r"\b(\d{8})\b", title)
    date = m.group(1) if m else ""
    payload = {
-        "id": title,
+        "id": doc_id,
        "type": "podcast",
        "title": title,
-        "date": date[0] if date else "",
+        "date": date,
-        "source": str(Path(LIB, Path(doc["file"]).name)),
+        "source": str(Path(LIB, Path(file_field or title).name)),
-        "text": " ".join(s["text"] for s in doc.get("segments", [])),
+        "text": " ".join(s.get("text", "") for s in doc.get("segments", [])),
        "segments": doc.get("segments", []),
-        "meta": {"language": doc.get("language", "")}
+        "meta": {"language": doc.get("language", "")},
    }
-    import time
+
    for attempt in range(5):
        try:
            r = requests.post(
                f"{MEILI_URL}/indexes/library/documents",
-                headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type":"application/json"},
+                headers={
                    "Authorization": f"Bearer {MEILI_KEY}",
                    "Content-Type": "application/json",
                },
                data=orjson.dumps(payload),
                timeout=15,
            )