Adding GPU support for transcription

2025-10-05 12:01:22 +02:00
parent 51624bd066
commit a450ab5c70
5 changed files with 82 additions and 14 deletions
@@ -1,4 +1,7 @@
-FROM python:3.11-slim
+## GPU-ready base image with CUDA 12 + cuDNN 9 runtime
+# If you don't have an NVIDIA GPU or the NVIDIA Container Toolkit, this image still runs on CPU.
+# For smaller CPU-only images, you can switch back to python:3.11-slim.
+FROM nvidia/cuda:12.4.1-cudnn9-runtime-ubuntu22.04

 # Keep python fast/quiet and pip lean
 ENV PYTHONDONTWRITEBYTECODE=1 \
@@ -12,6 +15,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 \

 # System deps: ffmpeg for media, curl for healthcheck, jq for scripts, poppler-utils for PDFs
 RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3 python3-pip python3-venv \
    ffmpeg \
    curl \
    jq \
@@ -22,15 +26,15 @@ WORKDIR /app

 # Upgrade pip toolchain then install Python deps
 COPY requirements.txt .
-RUN python -m pip install --upgrade pip setuptools wheel \
- && pip install --no-cache-dir -r requirements.txt \
- && pip check || true
+RUN python3 -m pip install --upgrade pip setuptools wheel \
+ && pip3 install --no-cache-dir -r requirements.txt \
+ && (pip3 check || true)

 # App code
 COPY app.py worker.py scanner.py ./
-RUN pip install --no-cache-dir gunicorn==22.0.0
+RUN pip3 install --no-cache-dir gunicorn==22.0.0

 # Healthcheck against the app's /health endpoint

 EXPOSE 8080
-CMD ["gunicorn", "-b", "0.0.0.0:8080", "app:app", "--workers", "2", "--threads", "4"]
+CMD ["gunicorn", "-b", "0.0.0.0:8080", "app:app", "--workers", "2", "--threads", "4"]
@@ -162,13 +162,33 @@ def get_model():
    global _model
    if _model is None:
        print(f"[whisper] loading model='{MODEL_NAME}' device='{WHISPER_DEVICE}' idx={WHISPER_DEVICE_INDEX} compute='{COMPUTE}' threads={WHISPER_CPU_THREADS}", flush=True)
-        _model = WhisperModel(
-            MODEL_NAME,
-            device=WHISPER_DEVICE,
-            device_index=WHISPER_DEVICE_INDEX,
-            compute_type=COMPUTE,
-            cpu_threads=WHISPER_CPU_THREADS,
-        )
+        try:
+            _model = WhisperModel(
+                MODEL_NAME,
+                device=WHISPER_DEVICE,
+                device_index=WHISPER_DEVICE_INDEX,
+                compute_type=COMPUTE,
+                cpu_threads=WHISPER_CPU_THREADS,
+            )
+        except Exception as e:
+            # If a GPU device is selected/auto-selected but unavailable, loading CUDA/cuDNN fails in some
+            # environments; retry on CPU. NOTE(review): COMPUTE is reused as-is; confirm it is valid on CPU.
+            msg = str(e).lower()
+            gpu_markers = [
+                "cuda", "cublas", "cudnn", "hip", "rocm", "nvrtc", "gpu",
+                "unable to load any of {libcudnn", "cannot load symbol cudnncreatetensordescriptor",
+            ]
+            if WHISPER_DEVICE.lower() != "cpu" and any(m in msg for m in gpu_markers):
+                print(f"[whisper] model init failed on device '{WHISPER_DEVICE}': {e}. Falling back to CPU…", flush=True)
+                _model = WhisperModel(
+                    MODEL_NAME,
+                    device="cpu",
+                    device_index=0,
+                    compute_type=COMPUTE,
+                    cpu_threads=WHISPER_CPU_THREADS,
+                )
+            else:
+                raise
    return _model

 # --- Helper: Reset model with new device and device_index ---
@@ -191,8 +211,8 @@ def run_transcribe_with_fallback(wav_path: Path, lang):
    Try to transcribe with current model; on GPU/CUDA/HIP/ROCm/OOM errors, reset to CPU and retry once.
    Returns (segments, info) or raises exception.
    """
-    model = get_model()
    try:
+        model = get_model()
        return model.transcribe(str(wav_path), vad_filter=True, language=lang)
    except Exception as e:
        msg = str(e)