Moar fixes

2025-09-07 13:00:30 +02:00
parent 8c84e27a0a
commit 5c68154775
4 changed files with 57 additions and 20 deletions
--- a/app/Dockerfile
+++ b/app/Dockerfile
@@ -1,11 +1,16 @@
 FROM python:3.11-slim

-# Keep python fast and quiet, and pip lean
+# Keep python fast/quiet and pip lean
 ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
-    PIP_NO_CACHE_DIR=1
+    PIP_NO_CACHE_DIR=1 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8 \
+    # sensible defaults (can be overridden by .env)
+    WHISPER_MODEL=large-v3 \
+    WHISPER_PRECISION=int8

-# System deps (ffmpeg for media, jq for scripts, poppler-utils for PDFs, curl for healthcheck)
+# System deps: ffmpeg for media, curl for healthcheck, jq for scripts, poppler-utils for PDFs
 RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    curl \
@@ -15,18 +20,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends \

 WORKDIR /app

-# Upgrade pip first, then install deps
+# Upgrade pip toolchain, then install Python deps; only the advisory `pip check` may fail without breaking the build
 COPY requirements.txt .
-RUN python -m pip install --upgrade pip \
- && pip install --no-cache-dir -r requirements.txt \
- && pip check || true
+RUN python -m pip install --upgrade pip setuptools wheel \
+ && pip install --no-cache-dir -r requirements.txt \
+ && (pip check || true)

 # App code
 COPY app.py worker.py ./
 RUN pip install --no-cache-dir gunicorn==22.0.0

-# Basic container health check (relies on curl)
-HEALTHCHECK --interval=30s --timeout=5s --retries=3 CMD curl -fsS http://127.0.0.1:8080/ || exit 1
+# Healthcheck against the app's /health endpoint
+HEALTHCHECK --interval=30s --timeout=5s --retries=3 CMD curl -fsS http://127.0.0.1:8080/health || exit 1

 EXPOSE 8080
 CMD ["gunicorn", "-b", "0.0.0.0:8080", "app:app", "--workers", "2", "--threads", "4"]
--- a/app/app.py
+++ b/app/app.py
@@ -68,11 +68,24 @@ doSearch();
 """

 def meili_search(qstr, limit=30):
-    if not qstr.strip(): return []
-    r = requests.post(f"{MEILI_URL}/indexes/library/search",
-                      headers={"Authorization": f"Bearer {MEILI_KEY}","Content-Type":"application/json"},
-                      data=json.dumps({"q": qstr, "limit": limit}))
-    return r.json().get("hits", [])
+    """Best-effort Meilisearch query: return up to *limit* hits for *qstr*, or [] on failure."""
+    if not qstr.strip():
+        return []
+    try:
+        r = requests.post(
+            f"{MEILI_URL}/indexes/library/search",
+            headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type": "application/json"},
+            data=json.dumps({"q": qstr, "limit": limit}),
+            timeout=5,
+        )
+        return r.json().get("hits", []) if r.status_code == 200 else []
+    except (requests.RequestException, ValueError):
+        # Search is non-critical: degrade to "no hits" on transport or JSON errors only.
+        return []
+
+@app.get("/health")
+def health():
+    return "ok"

@app.get("/")
 def index():
--- a/app/worker.py
+++ b/app/worker.py
@@ -18,7 +18,14 @@ TRN.mkdir(parents=True, exist_ok=True)
 LIB.mkdir(parents=True, exist_ok=True)
 TMP.mkdir(parents=True, exist_ok=True)

-model = WhisperModel(MODEL_NAME, compute_type=COMPUTE)
+# Lazy Whisper model loader so the worker can start even if model download/setup is slow
+_model = None
+
+def get_model():
+    global _model
+    if _model is None:
+        _model = WhisperModel(MODEL_NAME, compute_type=COMPUTE)
+    return _model

 def log(feed):
    try:
@@ -45,6 +52,7 @@ def yt_dlp(url, outdir):
    return sorted(media, key=lambda p: p.stat().st_mtime)[-1:]

 def transcribe(media_path: Path):
+    model = get_model()
-    segments, info = model.transcribe(str(media_path), vad_filter=True, language="auto")
+    segments, info = model.transcribe(str(media_path), vad_filter=True, language=None)  # None = auto-detect; "auto" is not a valid language code
    title = media_path.stem
    base = TRN / title
@@ -83,10 +91,21 @@ def index_meili(json_path: Path):
        "segments": doc.get("segments", []),
        "meta": {"language": doc.get("language", "")}
    }
-    r = requests.post(f"{MEILI_URL}/indexes/library/documents",
-                      headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type":"application/json"},
-                      data=orjson.dumps(payload))
-    r.raise_for_status()
+    import time
+    for attempt in range(5):
+        try:
+            r = requests.post(
+                f"{MEILI_URL}/indexes/library/documents",
+                headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type":"application/json"},
+                data=orjson.dumps(payload),
+                timeout=15,
+            )
+            r.raise_for_status()
+            break
+        except Exception:
+            if attempt == 4:
+                raise
+            time.sleep(2 * (attempt + 1))

 import tldextract, trafilatura, requests as _requests