Subject: Harden container build, search, indexing and worker startup

app/Dockerfile: sets a UTF-8 locale and Whisper defaults via ENV, upgrades
  setuptools/wheel together with pip, and points HEALTHCHECK at /health.
  Review fix: "pip check || true" is now grouped in parentheses. Ungrouped,
  "A && B && C || true" also swallowed a failing "pip install", so the image
  could build successfully with missing dependencies.
app/app.py: meili_search() gets a 5 s timeout and returns [] on a non-200
  response; adds a GET /health route that returns "ok".
  Review fix: the handler catches requests.RequestException and ValueError
  (transport and JSON-decoding failures) instead of every Exception.
app/worker.py: the Whisper model is created on first use by get_model();
  index_meili() posts with a 15 s timeout and makes up to 5 attempts,
  sleeping 2, 4, 6 and 8 s between them.
  Review fix: only requests.RequestException is retried; any other error
  propagates immediately instead of being retried.
docker-compose.yml: drops the "version" key, runs the worker as
  "rq worker ... default", and sets PYTHONPATH=/app.

NOTE(review): the whitespace of this patch was reconstructed; indentation of
context lines is assumed. If a hunk fails to match, apply with
"git apply --ignore-whitespace" and confirm against the target files.

diff --git a/app/Dockerfile b/app/Dockerfile
index 142fa61..10cd030 100644
--- a/app/Dockerfile
+++ b/app/Dockerfile
@@ -1,11 +1,16 @@
 FROM python:3.11-slim
 
-# Keep python fast and quiet, and pip lean
+# Keep python fast/quiet and pip lean
 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1 \
-    PIP_NO_CACHE_DIR=1
+    PIP_NO_CACHE_DIR=1 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8 \
+    # sensible defaults (can be overridden by .env)
+    WHISPER_MODEL=large-v3 \
+    WHISPER_PRECISION=int8
 
-# System deps (ffmpeg for media, jq for scripts, poppler-utils for PDFs, curl for healthcheck)
+# System deps: ffmpeg for media, curl for healthcheck, jq for scripts, poppler-utils for PDFs
 RUN apt-get update && apt-get install -y --no-install-recommends \
     ffmpeg \
     curl \
@@ -15,18 +20,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 
 WORKDIR /app
 
-# Upgrade pip first, then install deps
+# Upgrade pip toolchain then install Python deps
 COPY requirements.txt .
-RUN python -m pip install --upgrade pip \
- && pip install --no-cache-dir -r requirements.txt \
- && pip check || true
+RUN python -m pip install --upgrade pip setuptools wheel \
+    && pip install --no-cache-dir -r requirements.txt \
+    && (pip check || true)
 
 # App code
 COPY app.py worker.py ./
 RUN pip install --no-cache-dir gunicorn==22.0.0
 
-# Basic container health check (relies on curl)
-HEALTHCHECK --interval=30s --timeout=5s --retries=3 CMD curl -fsS http://127.0.0.1:8080/ || exit 1
+# Healthcheck against the app's /health endpoint
+HEALTHCHECK --interval=30s --timeout=5s --retries=3 CMD curl -fsS http://127.0.0.1:8080/health || exit 1
 
 EXPOSE 8080
 CMD ["gunicorn", "-b", "0.0.0.0:8080", "app:app", "--workers", "2", "--threads", "4"]
\ No newline at end of file
diff --git a/app/app.py b/app/app.py
index 46051d1..7c90a76 100644
--- a/app/app.py
+++ b/app/app.py
@@ -68,11 +68,24 @@ doSearch();
 """
 
 def meili_search(qstr, limit=30):
-    if not qstr.strip(): return []
-    r = requests.post(f"{MEILI_URL}/indexes/library/search",
-                      headers={"Authorization": f"Bearer {MEILI_KEY}","Content-Type":"application/json"},
-                      data=json.dumps({"q": qstr, "limit": limit}))
-    return r.json().get("hits", [])
+    if not qstr.strip():
+        return []
+    try:
+        r = requests.post(
+            f"{MEILI_URL}/indexes/library/search",
+            headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type": "application/json"},
+            data=json.dumps({"q": qstr, "limit": limit}),
+            timeout=5,
+        )
+        if r.status_code != 200:
+            return []
+        return r.json().get("hits", [])
+    except (requests.RequestException, ValueError):
+        return []
+
+@app.get("/health")
+def health():
+    return "ok"
 
 @app.get("/")
 def index():
diff --git a/app/worker.py b/app/worker.py
index 5e04436..bfac00b 100644
--- a/app/worker.py
+++ b/app/worker.py
@@ -18,7 +18,14 @@ TRN.mkdir(parents=True, exist_ok=True)
 LIB.mkdir(parents=True, exist_ok=True)
 TMP.mkdir(parents=True, exist_ok=True)
 
-model = WhisperModel(MODEL_NAME, compute_type=COMPUTE)
+# Lazy Whisper model loader so the worker can start even if model download/setup is slow
+_model = None
+
+def get_model():
+    global _model
+    if _model is None:
+        _model = WhisperModel(MODEL_NAME, compute_type=COMPUTE)
+    return _model
 
 def log(feed):
     try:
@@ -45,6 +52,7 @@ def yt_dlp(url, outdir):
     return sorted(media, key=lambda p: p.stat().st_mtime)[-1:]
 
 def transcribe(media_path: Path):
+    model = get_model()
     segments, info = model.transcribe(str(media_path), vad_filter=True, language="auto")
     title = media_path.stem
     base = TRN / title
@@ -83,10 +91,21 @@ def index_meili(json_path: Path):
         "segments": doc.get("segments", []),
         "meta": {"language": doc.get("language", "")}
     }
-    r = requests.post(f"{MEILI_URL}/indexes/library/documents",
-                      headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type":"application/json"},
-                      data=orjson.dumps(payload))
-    r.raise_for_status()
+    import time
+    for attempt in range(5):
+        try:
+            r = requests.post(
+                f"{MEILI_URL}/indexes/library/documents",
+                headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type":"application/json"},
+                data=orjson.dumps(payload),
+                timeout=15,
+            )
+            r.raise_for_status()
+            break
+        except requests.RequestException:
+            if attempt == 4:
+                raise
+            time.sleep(2 * (attempt + 1))
 
 import tldextract, trafilatura, requests as _requests
 
diff --git a/docker-compose.yml b/docker-compose.yml
index fdf95cd..96621cb 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,4 +1,3 @@
-version: "3.9"
 services:
   podx-web:
     build: ./app
@@ -26,7 +25,7 @@ services:
   podx-worker:
     build: ./app
     container_name: podx-worker
-    command: ["python", "worker.py"]
+    command: ["rq", "worker", "-u", "redis://redis:6379/0", "default"]
     env_file: [.env]
     environment:
       MEILI_URL: http://meili:7700
@@ -36,6 +35,7 @@ services:
       TMP_ROOT: /tmpdl
       WHISPER_MODEL: large-v3
       WHISPER_PRECISION: int8
+      PYTHONPATH: /app
     volumes:
       - ${LIBRARY_HOST_DIR:-./library}:/library
       - ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts