From 69a61f73509976ffd094f223d955c9e7368fbc46 Mon Sep 17 00:00:00 2001 From: Tomas Kracmar Date: Sun, 7 Sep 2025 20:19:17 +0200 Subject: [PATCH] Fix id in json --- app/worker.py | 71 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 9 deletions(-) diff --git a/app/worker.py b/app/worker.py index f6a0226..b7aee73 100644 --- a/app/worker.py +++ b/app/worker.py @@ -1022,26 +1022,79 @@ def transcribe(media_path: Path): print(f"[whisper] finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s", flush=True) return base + +# --- Meilisearch helpers --- +def _safe_doc_id(s: str) -> str: + """ + Meilisearch document IDs must be [A-Za-z0-9_-]. Convert the title to a safe slug. + If the result is empty, fall back to a short SHA1 hash. + """ + import hashlib + slug = re.sub(r"\s+", "_", (s or "").strip()) + slug = re.sub(r"[^A-Za-z0-9_-]", "", slug) + if not slug: + slug = hashlib.sha1((s or "").encode("utf-8", errors="ignore")).hexdigest()[:16] + return slug + + +def ensure_meili_index(): + """Create index 'library' with primaryKey 'id' if it does not already exist.""" + try: + r = requests.get(f"{MEILI_URL}/indexes/library", + headers={"Authorization": f"Bearer {MEILI_KEY}"}, timeout=10) + if r.status_code == 200: + return + # Attempt to create it + cr = requests.post( + f"{MEILI_URL}/indexes", + headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type": "application/json"}, + data=orjson.dumps({"uid": "library", "primaryKey": "id"}), + timeout=10, + ) + # Ignore errors if another process created it first + try: + cr.raise_for_status() + except Exception: + pass + except Exception: + # Non-fatal; indexing will fail later if the index truly doesn't exist + pass + + def index_meili(json_path: Path): + # Make sure the index exists and is configured with a primary key + ensure_meili_index() + doc = json.loads(open(json_path, "r", encoding="utf-8").read()) - title = Path(doc["file"]).stem - date = re.findall(r"\b(\d{8})\b", title) + file_field = doc.get("file", "") + title = Path(file_field).stem if file_field else json_path.stem + + # Build a Meili-safe document ID + doc_id = _safe_doc_id(title) + + # Extract a YYYYMMDD date if present + m = re.search(r"\b(\d{8})\b", title) + date = m.group(1) if m else "" + payload = { - "id": title, + "id": doc_id, "type": "podcast", "title": title, - "date": date[0] if date else "", - "source": str(Path(LIB, Path(doc["file"]).name)), - "text": " ".join(s["text"] for s in doc.get("segments", [])), + "date": date, + "source": str(Path(LIB, Path(file_field or title).name)), + "text": " ".join(s.get("text", "") for s in doc.get("segments", [])), "segments": doc.get("segments", []), - "meta": {"language": doc.get("language", "")} + "meta": {"language": doc.get("language", "")}, } - import time + for attempt in range(5): try: r = requests.post( f"{MEILI_URL}/indexes/library/documents", - headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type":"application/json"}, + headers={ + "Authorization": f"Bearer {MEILI_KEY}", + "Content-Type": "application/json", + }, data=orjson.dumps(payload), timeout=15, )