Fix id in json

This commit is contained in:
2025-09-07 20:19:17 +02:00
parent 70b7449bc1
commit 69a61f7350

View File

@@ -1022,26 +1022,79 @@ def transcribe(media_path: Path):
print(f"[whisper] finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s", flush=True) print(f"[whisper] finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s", flush=True)
return base return base
# --- Meilisearch helpers ---
def _safe_doc_id(s: str) -> str:
    """Return a Meilisearch-safe document id derived from *s*.

    Meilisearch document ids may only contain ``[A-Za-z0-9_-]`` and must not
    exceed 511 bytes. Runs of whitespace become a single underscore and every
    other disallowed character is dropped.

    * If nothing survives the filtering (empty input, or a title made up
      entirely of disallowed characters), the id is the first 16 hex digits
      of the SHA-1 of the original string.
    * If the slug is longer than the size limit, it is truncated and the same
      16-digit hash is appended, so long titles sharing a prefix still get
      distinct ids.

    Slugs that already fit within the limit are returned unchanged.
    """
    import hashlib

    max_len = 511  # Meilisearch's documented upper bound for a document id
    hash_len = 16

    raw = s or ""
    slug = re.sub(r"\s+", "_", raw.strip())
    slug = re.sub(r"[^A-Za-z0-9_-]", "", slug)

    # SHA-1 is used purely as a stable fingerprint here, not for security.
    digest = hashlib.sha1(raw.encode("utf-8", errors="ignore")).hexdigest()[:hash_len]

    if not slug:
        return digest
    # The slug is pure ASCII at this point, so len() equals its byte length.
    if len(slug) > max_len:
        keep = max_len - hash_len - 1  # leave room for "_" + digest
        slug = f"{slug[:keep]}_{digest}"
    return slug
def ensure_meili_index():
    """Create index 'library' with primaryKey 'id' if it does not already exist.

    Best-effort: this function never raises. It first probes
    ``GET {MEILI_URL}/indexes/library`` and returns immediately on HTTP 200.
    For any other status it sends a create request to ``POST
    {MEILI_URL}/indexes``. Failures (network errors, rejected requests) are
    reported on stdout instead of being swallowed silently, so a bad URL or
    API key is visible in the logs; indexing itself will still fail later if
    the index truly does not exist.
    """
    auth = {"Authorization": f"Bearer {MEILI_KEY}"}
    try:
        probe = requests.get(
            f"{MEILI_URL}/indexes/library",
            headers=auth,
            timeout=10,
        )
        if probe.status_code == 200:
            return

        # Index not reported as present: attempt to create it.
        created = requests.post(
            f"{MEILI_URL}/indexes",
            headers={**auth, "Content-Type": "application/json"},
            data=orjson.dumps({"uid": "library", "primaryKey": "id"}),
            timeout=10,
        )
        if not created.ok:
            # Not fatal: another process may have created the index first.
            print(
                f"[meili] create index 'library' returned HTTP {created.status_code}",
                flush=True,
            )
    except Exception as exc:
        # Deliberately broad: this helper must stay non-fatal for the caller.
        print(f"[meili] could not ensure index 'library': {exc}", flush=True)
def index_meili(json_path: Path): def index_meili(json_path: Path):
# Make sure the index exists and is configured with a primary key
ensure_meili_index()
doc = json.loads(open(json_path, "r", encoding="utf-8").read()) doc = json.loads(open(json_path, "r", encoding="utf-8").read())
title = Path(doc["file"]).stem file_field = doc.get("file", "")
date = re.findall(r"\b(\d{8})\b", title) title = Path(file_field).stem if file_field else json_path.stem
# Build a Meili-safe document ID
doc_id = _safe_doc_id(title)
# Extract a YYYYMMDD date if present
m = re.search(r"\b(\d{8})\b", title)
date = m.group(1) if m else ""
payload = { payload = {
"id": title, "id": doc_id,
"type": "podcast", "type": "podcast",
"title": title, "title": title,
"date": date[0] if date else "", "date": date,
"source": str(Path(LIB, Path(doc["file"]).name)), "source": str(Path(LIB, Path(file_field or title).name)),
"text": " ".join(s["text"] for s in doc.get("segments", [])), "text": " ".join(s.get("text", "") for s in doc.get("segments", [])),
"segments": doc.get("segments", []), "segments": doc.get("segments", []),
"meta": {"language": doc.get("language", "")} "meta": {"language": doc.get("language", "")},
} }
import time
for attempt in range(5): for attempt in range(5):
try: try:
r = requests.post( r = requests.post(
f"{MEILI_URL}/indexes/library/documents", f"{MEILI_URL}/indexes/library/documents",
headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type":"application/json"}, headers={
"Authorization": f"Bearer {MEILI_KEY}",
"Content-Type": "application/json",
},
data=orjson.dumps(payload), data=orjson.dumps(payload),
timeout=15, timeout=15,
) )