Fix id in json
This commit is contained in:
@@ -1022,26 +1022,79 @@ def transcribe(media_path: Path):
|
||||
print(f"[whisper] finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s", flush=True)
|
||||
return base
|
||||
|
||||
|
||||
# --- Meilisearch helpers ---
|
||||
def _safe_doc_id(s: str) -> str:
    """Turn *s* into a string usable as a Meilisearch document ID.

    Meilisearch string IDs may only contain ``[A-Za-z0-9_-]`` and must not
    exceed 511 bytes.  Runs of whitespace become a single underscore and
    every other disallowed character is dropped.

    Args:
        s: Source text (typically a title).  ``None`` or an empty string is
            treated as ``""``.

    Returns:
        The slug when it is non-empty and within the length limit.  If the
        slug is empty, the first 16 hex digits of the SHA-1 of *s*.  If the
        slug is too long, a truncated slug followed by ``_`` and that same
        16-digit hash, exactly 511 characters in total.
    """
    import hashlib

    raw = s or ""
    max_len = 511  # Meilisearch limit; the slug is pure ASCII so chars == bytes

    slug = re.sub(r"\s+", "_", raw.strip())
    slug = re.sub(r"[^A-Za-z0-9_-]", "", slug)
    if slug and len(slug) <= max_len:
        return slug

    digest = hashlib.sha1(raw.encode("utf-8", errors="ignore")).hexdigest()[:16]
    if not slug:
        return digest

    # Too long: keep a readable prefix and append the hash so that distinct
    # long inputs sharing a prefix do not collapse onto the same ID.
    return f"{slug[:max_len - len(digest) - 1]}_{digest}"
|
||||
|
||||
|
||||
def ensure_meili_index():
    """Create index 'library' with primaryKey 'id' if it does not already exist.

    Sends a GET for ``{MEILI_URL}/indexes/library``; a 200 response means the
    index is present and nothing else is done.  Otherwise a POST to
    ``{MEILI_URL}/indexes`` asks for the index to be created.

    This is best-effort: no exception escapes.  Failures are reported on
    stdout and otherwise ignored, because indexing will fail later if the
    index truly doesn't exist.
    """
    try:
        auth = {"Authorization": f"Bearer {MEILI_KEY}"}

        probe = requests.get(
            f"{MEILI_URL}/indexes/library",
            headers=auth,
            timeout=10,
        )
        if probe.status_code == 200:
            return

        # Attempt to create it
        created = requests.post(
            f"{MEILI_URL}/indexes",
            headers={**auth, "Content-Type": "application/json"},
            data=orjson.dumps({"uid": "library", "primaryKey": "id"}),
            timeout=10,
        )
        # An error status here is tolerated (another process may have created
        # the index first), but it is reported rather than silently dropped.
        if not created.ok:
            print(
                f"[meili] create index 'library' returned HTTP {created.status_code}",
                flush=True,
            )
    except Exception as exc:
        # Non-fatal; indexing will fail later if the index truly doesn't exist
        print(f"[meili] could not ensure index 'library': {exc}", flush=True)
|
||||
|
||||
|
||||
def index_meili(json_path: Path):
|
||||
# Make sure the index exists and is configured with a primary key
|
||||
ensure_meili_index()
|
||||
|
||||
doc = json.loads(open(json_path, "r", encoding="utf-8").read())
|
||||
title = Path(doc["file"]).stem
|
||||
date = re.findall(r"\b(\d{8})\b", title)
|
||||
file_field = doc.get("file", "")
|
||||
title = Path(file_field).stem if file_field else json_path.stem
|
||||
|
||||
# Build a Meili-safe document ID
|
||||
doc_id = _safe_doc_id(title)
|
||||
|
||||
# Extract a YYYYMMDD date if present
|
||||
m = re.search(r"\b(\d{8})\b", title)
|
||||
date = m.group(1) if m else ""
|
||||
|
||||
payload = {
|
||||
"id": title,
|
||||
"id": doc_id,
|
||||
"type": "podcast",
|
||||
"title": title,
|
||||
"date": date[0] if date else "",
|
||||
"source": str(Path(LIB, Path(doc["file"]).name)),
|
||||
"text": " ".join(s["text"] for s in doc.get("segments", [])),
|
||||
"date": date,
|
||||
"source": str(Path(LIB, Path(file_field or title).name)),
|
||||
"text": " ".join(s.get("text", "") for s in doc.get("segments", [])),
|
||||
"segments": doc.get("segments", []),
|
||||
"meta": {"language": doc.get("language", "")}
|
||||
"meta": {"language": doc.get("language", "")},
|
||||
}
|
||||
import time
|
||||
|
||||
for attempt in range(5):
|
||||
try:
|
||||
r = requests.post(
|
||||
f"{MEILI_URL}/indexes/library/documents",
|
||||
headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type":"application/json"},
|
||||
headers={
|
||||
"Authorization": f"Bearer {MEILI_KEY}",
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
data=orjson.dumps(payload),
|
||||
timeout=15,
|
||||
)
|
||||
|
Reference in New Issue
Block a user