Fix id in json
This commit is contained in:
@@ -1022,26 +1022,79 @@ def transcribe(media_path: Path):
|
|||||||
print(f"[whisper] finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s", flush=True)
|
print(f"[whisper] finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s", flush=True)
|
||||||
return base
|
return base
|
||||||
|
|
||||||
|
|
||||||
|
# --- Meilisearch helpers ---
|
||||||
|
def _safe_doc_id(s: str) -> str:
    """
    Convert a title into a Meilisearch-safe document ID.

    Meilisearch document IDs must be [A-Za-z0-9_-] and at most 511 bytes.
    Runs of whitespace become a single underscore; every other character
    outside the allowed set is dropped.

    If the result is empty, fall back to a short SHA1 hash of the input.
    If the result is longer than 511 characters, it is truncated and suffixed
    with the same short hash so that distinct long titles remain distinct.

    Note: distinct titles can still map to the same slug when they differ only
    in dropped characters (e.g. "a!b" and "ab").
    """
    import hashlib

    max_len = 511  # Meilisearch limit for string document IDs
    text = s or ""

    def short_hash() -> str:
        # Hash the raw (unstripped) input, as the empty-slug fallback always did.
        return hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest()[:16]

    slug = re.sub(r"\s+", "_", text.strip())
    slug = re.sub(r"[^A-Za-z0-9_-]", "", slug)
    if not slug:
        return short_hash()
    if len(slug) > max_len:
        # The slug is pure ASCII at this point, so characters == bytes.
        suffix = short_hash()
        slug = f"{slug[:max_len - len(suffix) - 1]}_{suffix}"
    return slug
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_meili_index():
    """Create index 'library' with primaryKey 'id' if it does not already exist.

    Best-effort: this function never raises. If the existence check or the
    creation request fails, a diagnostic line is printed and the caller carries
    on; indexing will fail later if the index truly doesn't exist.
    """
    auth = {"Authorization": f"Bearer {MEILI_KEY}"}
    try:
        r = requests.get(f"{MEILI_URL}/indexes/library", headers=auth, timeout=10)
        if r.status_code == 200:
            return
        # Attempt to create it
        cr = requests.post(
            f"{MEILI_URL}/indexes",
            headers={**auth, "Content-Type": "application/json"},
            data=orjson.dumps({"uid": "library", "primaryKey": "id"}),
            timeout=10,
        )
        # Still non-fatal (another process may have created the index first),
        # but surface HTTP errors such as a bad key instead of hiding them.
        if cr.status_code >= 400:
            print(
                f"[meili] could not create index 'library': "
                f"HTTP {cr.status_code} {cr.text[:200]}",
                flush=True,
            )
    except Exception as exc:
        # Non-fatal; indexing will fail later if the index truly doesn't exist
        print(f"[meili] ensure index failed: {exc}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
def index_meili(json_path: Path):
|
def index_meili(json_path: Path):
|
||||||
|
# Make sure the index exists and is configured with a primary key
|
||||||
|
ensure_meili_index()
|
||||||
|
|
||||||
doc = json.loads(open(json_path, "r", encoding="utf-8").read())
|
doc = json.loads(open(json_path, "r", encoding="utf-8").read())
|
||||||
title = Path(doc["file"]).stem
|
file_field = doc.get("file", "")
|
||||||
date = re.findall(r"\b(\d{8})\b", title)
|
title = Path(file_field).stem if file_field else json_path.stem
|
||||||
|
|
||||||
|
# Build a Meili-safe document ID
|
||||||
|
doc_id = _safe_doc_id(title)
|
||||||
|
|
||||||
|
# Extract a YYYYMMDD date if present
|
||||||
|
m = re.search(r"\b(\d{8})\b", title)
|
||||||
|
date = m.group(1) if m else ""
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"id": title,
|
"id": doc_id,
|
||||||
"type": "podcast",
|
"type": "podcast",
|
||||||
"title": title,
|
"title": title,
|
||||||
"date": date[0] if date else "",
|
"date": date,
|
||||||
"source": str(Path(LIB, Path(doc["file"]).name)),
|
"source": str(Path(LIB, Path(file_field or title).name)),
|
||||||
"text": " ".join(s["text"] for s in doc.get("segments", [])),
|
"text": " ".join(s.get("text", "") for s in doc.get("segments", [])),
|
||||||
"segments": doc.get("segments", []),
|
"segments": doc.get("segments", []),
|
||||||
"meta": {"language": doc.get("language", "")}
|
"meta": {"language": doc.get("language", "")},
|
||||||
}
|
}
|
||||||
import time
|
|
||||||
for attempt in range(5):
|
for attempt in range(5):
|
||||||
try:
|
try:
|
||||||
r = requests.post(
|
r = requests.post(
|
||||||
f"{MEILI_URL}/indexes/library/documents",
|
f"{MEILI_URL}/indexes/library/documents",
|
||||||
headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type":"application/json"},
|
headers={
|
||||||
|
"Authorization": f"Bearer {MEILI_KEY}",
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
},
|
||||||
data=orjson.dumps(payload),
|
data=orjson.dumps(payload),
|
||||||
timeout=15,
|
timeout=15,
|
||||||
)
|
)
|
||||||
|
Reference in New Issue
Block a user