More fixes

This commit is contained in:
2025-09-07 13:00:30 +02:00
parent 8c84e27a0a
commit 5c68154775
4 changed files with 57 additions and 20 deletions

View File

@@ -1,11 +1,16 @@
FROM python:3.11-slim FROM python:3.11-slim
# Keep python fast and quiet, and pip lean # Keep python fast/quiet and pip lean
ENV PYTHONDONTWRITEBYTECODE=1 \ ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \ PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1 PIP_NO_CACHE_DIR=1 \
LANG=C.UTF-8 \
LC_ALL=C.UTF-8 \
# sensible defaults (can be overridden by .env)
WHISPER_MODEL=large-v3 \
WHISPER_PRECISION=int8
# System deps (ffmpeg for media, jq for scripts, poppler-utils for PDFs, curl for healthcheck) # System deps: ffmpeg for media, curl for healthcheck, jq for scripts, poppler-utils for PDFs
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \ ffmpeg \
curl \ curl \
@@ -15,9 +20,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
WORKDIR /app WORKDIR /app
# Upgrade pip first, then install deps # Upgrade pip toolchain then install Python deps
COPY requirements.txt . COPY requirements.txt .
RUN python -m pip install --upgrade pip \ RUN python -m pip install --upgrade pip setuptools wheel \
&& pip install --no-cache-dir -r requirements.txt \ && pip install --no-cache-dir -r requirements.txt \
&& pip check || true && pip check || true
@@ -25,8 +30,8 @@ RUN python -m pip install --upgrade pip \
COPY app.py worker.py ./ COPY app.py worker.py ./
RUN pip install --no-cache-dir gunicorn==22.0.0 RUN pip install --no-cache-dir gunicorn==22.0.0
# Basic container health check (relies on curl) # Healthcheck against the app's /health endpoint
HEALTHCHECK --interval=30s --timeout=5s --retries=3 CMD curl -fsS http://127.0.0.1:8080/ || exit 1 HEALTHCHECK --interval=30s --timeout=5s --retries=3 CMD curl -fsS http://127.0.0.1:8080/health || exit 1
EXPOSE 8080 EXPOSE 8080
CMD ["gunicorn", "-b", "0.0.0.0:8080", "app:app", "--workers", "2", "--threads", "4"] CMD ["gunicorn", "-b", "0.0.0.0:8080", "app:app", "--workers", "2", "--threads", "4"]

View File

@@ -68,11 +68,24 @@ doSearch();
""" """
def meili_search(qstr, limit=30): def meili_search(qstr, limit=30):
if not qstr.strip(): return [] if not qstr.strip():
r = requests.post(f"{MEILI_URL}/indexes/library/search", return []
try:
r = requests.post(
f"{MEILI_URL}/indexes/library/search",
headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type": "application/json"}, headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type": "application/json"},
data=json.dumps({"q": qstr, "limit": limit})) data=json.dumps({"q": qstr, "limit": limit}),
timeout=5,
)
if r.status_code != 200:
return []
return r.json().get("hits", []) return r.json().get("hits", [])
except Exception:
return []
@app.get("/health")
def health():
return "ok"
@app.get("/") @app.get("/")
def index(): def index():

View File

@@ -18,7 +18,14 @@ TRN.mkdir(parents=True, exist_ok=True)
LIB.mkdir(parents=True, exist_ok=True) LIB.mkdir(parents=True, exist_ok=True)
TMP.mkdir(parents=True, exist_ok=True) TMP.mkdir(parents=True, exist_ok=True)
model = WhisperModel(MODEL_NAME, compute_type=COMPUTE) # Lazy Whisper model loader so the worker can start even if model download/setup is slow
_model = None
def get_model():
global _model
if _model is None:
_model = WhisperModel(MODEL_NAME, compute_type=COMPUTE)
return _model
def log(feed): def log(feed):
try: try:
@@ -45,6 +52,7 @@ def yt_dlp(url, outdir):
return sorted(media, key=lambda p: p.stat().st_mtime)[-1:] return sorted(media, key=lambda p: p.stat().st_mtime)[-1:]
def transcribe(media_path: Path): def transcribe(media_path: Path):
model = get_model()
segments, info = model.transcribe(str(media_path), vad_filter=True, language="auto") segments, info = model.transcribe(str(media_path), vad_filter=True, language="auto")
title = media_path.stem title = media_path.stem
base = TRN / title base = TRN / title
@@ -83,10 +91,21 @@ def index_meili(json_path: Path):
"segments": doc.get("segments", []), "segments": doc.get("segments", []),
"meta": {"language": doc.get("language", "")} "meta": {"language": doc.get("language", "")}
} }
r = requests.post(f"{MEILI_URL}/indexes/library/documents", import time
for attempt in range(5):
try:
r = requests.post(
f"{MEILI_URL}/indexes/library/documents",
headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type":"application/json"}, headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type":"application/json"},
data=orjson.dumps(payload)) data=orjson.dumps(payload),
timeout=15,
)
r.raise_for_status() r.raise_for_status()
break
except Exception:
if attempt == 4:
raise
time.sleep(2 * (attempt + 1))
import tldextract, trafilatura, requests as _requests import tldextract, trafilatura, requests as _requests

View File

@@ -1,4 +1,3 @@
version: "3.9"
services: services:
podx-web: podx-web:
build: ./app build: ./app
@@ -26,7 +25,7 @@ services:
podx-worker: podx-worker:
build: ./app build: ./app
container_name: podx-worker container_name: podx-worker
command: ["python", "worker.py"] command: ["rq", "worker", "-u", "redis://redis:6379/0", "default"]
env_file: [.env] env_file: [.env]
environment: environment:
MEILI_URL: http://meili:7700 MEILI_URL: http://meili:7700
@@ -36,6 +35,7 @@ services:
TMP_ROOT: /tmpdl TMP_ROOT: /tmpdl
WHISPER_MODEL: large-v3 WHISPER_MODEL: large-v3
WHISPER_PRECISION: int8 WHISPER_PRECISION: int8
PYTHONPATH: /app
volumes: volumes:
- ${LIBRARY_HOST_DIR:-./library}:/library - ${LIBRARY_HOST_DIR:-./library}:/library
- ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts - ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts