From a450ab5c70aeb38ae6d57f5ffca28b2e4dee4f9e Mon Sep 17 00:00:00 2001
From: Tomas Kracmar
Date: Sun, 5 Oct 2025 12:01:22 +0200
Subject: [PATCH] Add GPU support for transcription

---
 .env.example       | 20 ++++++++++++++++++++
 README.md          | 18 ++++++++++++++++++
 app/Dockerfile     | 16 ++++++++++------
 app/worker.py      | 36 ++++++++++++++++++++++++++++--------
 docker-compose.yml |  6 ++++++
 5 files changed, 82 insertions(+), 14 deletions(-)

diff --git a/.env.example b/.env.example
index 970c5bc..b5de885 100644
--- a/.env.example
+++ b/.env.example
@@ -35,6 +35,26 @@ OPENAI_API_KEY=
 # OPENAI_TRANSCRIBE_MODEL=whisper-1
 # OPENAI_TRANSCRIBE_TIMEOUT=600
 
+# Local Whisper settings
+# Choose CPU explicitly unless you have a working GPU runtime in Docker
+WHISPER_DEVICE=cpu
+# Model and precision (large-v3 int8 is accurate but heavy; consider medium/small for speed)
+WHISPER_MODEL=large-v3
+WHISPER_PRECISION=int8
+# Threads for CPU inference
+WHISPER_CPU_THREADS=4
+
+# --- GPU (CUDA) optional setup ---
+# To enable NVIDIA GPU acceleration:
+# 1) Install the NVIDIA driver on the host and the NVIDIA Container Toolkit
+# 2) Set the Docker runtime to NVIDIA for the worker containers
+#    DOCKER_GPU_RUNTIME=nvidia
+# 3) Ensure GPU visibility (default is all)
+#    NVIDIA_VISIBLE_DEVICES=all
+# 4) Use a GPU-friendly device and precision
+#    WHISPER_DEVICE=cuda
+#    WHISPER_PRECISION=float16
+
 # Docker volumes paths
 LIBRARY_HOST_DIR=/mnt/nfs/library
 TRANSCRIPTS_HOST_DIR=/mnt/nfs/transcripts
diff --git a/README.md b/README.md
index f739610..45bc042 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,22 @@ The worker reaches OpenWebUI at `$OPENWEBUI_URL` (default: http://host.docker.in
 
 Note: `.env.example` includes placeholders for both **Meili** and **OpenWebUI** configuration. Be sure to set `OPENWEBUI_URL` to point to your OpenWebUI container accordingly.
 
+## GPU (CUDA) Setup
+
+To run Whisper on an NVIDIA GPU:
+
+- Install the NVIDIA driver on the host and the NVIDIA Container Toolkit.
+- Copy `.env.example` to `.env` and set:
+  - `DOCKER_GPU_RUNTIME=nvidia`
+  - `NVIDIA_VISIBLE_DEVICES=all` (or a specific GPU index)
+  - `WHISPER_DEVICE=cuda` (or `auto`)
+  - `WHISPER_PRECISION=float16` (recommended for GPU)
+- Rebuild and start:
+  - `docker compose up -d --build`
+- Check logs for `device='cuda'` when the transcribe worker loads the model.
+
+This repo's app image is based on `nvidia/cuda:12.4.1-cudnn9-runtime-ubuntu22.04`, which includes the CUDA and cuDNN user-space libraries that faster-whisper requires. On non-GPU hosts it still runs on CPU.
+
 ## Components Overview
 
 - **scanner**: Scans your media folders (`library` and `transcripts`) for new or updated files, triggering ingestion and processing workflows.
@@ -33,6 +49,8 @@ Note: `.env.example` includes placeholders for both **Meili** and **OpenWebUI**
 - `WHISPER_MODEL`: Whisper model variant to use for transcription (e.g., `small`, `medium`, `large`).
 - `WHISPER_PRECISION`: Precision setting for Whisper inference (`float32` or `float16`).
 - `WHISPER_LANGUAGE`: Language code for Whisper to use during transcription (e.g., `en` for English).
+- `WHISPER_DEVICE`: Device selection for faster-whisper (`cpu`, `cuda`, or `auto`). Defaults to `cpu` in docker-compose to avoid GPU library issues on non-GPU hosts.
+- `WHISPER_CPU_THREADS`: CPU threads used for Whisper when `WHISPER_DEVICE=cpu` (default `4`).
 - `TRANSCRIBE_BACKEND` (default `local`): Set to `openai` to offload Whisper transcription to the OpenAI API instead of running locally.
 - `OPENAI_API_KEY`: Required when `TRANSCRIBE_BACKEND=openai`; API key used for authenticated requests.
 - `OPENAI_BASE_URL`, `OPENAI_TRANSCRIBE_MODEL`, `OPENAI_TRANSCRIBE_TIMEOUT`: Optional overrides for the OpenAI transcription endpoint, model and request timeout.
diff --git a/app/Dockerfile b/app/Dockerfile
index 6f00a25..6f3de09 100644
--- a/app/Dockerfile
+++ b/app/Dockerfile
@@ -1,4 +1,7 @@
-FROM python:3.11-slim
+## GPU-ready base image with CUDA 12 + cuDNN 9 runtime
+# If you don't have an NVIDIA GPU or the NVIDIA Container Toolkit, this image still runs on CPU.
+# For smaller CPU-only images, you can switch back to python:3.11-slim.
+FROM nvidia/cuda:12.4.1-cudnn9-runtime-ubuntu22.04
 
 # Keep python fast/quiet and pip lean
 ENV PYTHONDONTWRITEBYTECODE=1 \
@@ -12,6 +15,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
 
 # System deps: ffmpeg for media, curl for healthcheck, jq for scripts, poppler-utils for PDFs
 RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3 python3-pip python3-venv \
     ffmpeg \
     curl \
     jq \
@@ -22,15 +26,15 @@ WORKDIR /app
 
 # Upgrade pip toolchain then install Python deps
 COPY requirements.txt .
-RUN python -m pip install --upgrade pip setuptools wheel \
-    && pip install --no-cache-dir -r requirements.txt \
-    && pip check || true
+RUN python3 -m pip install --upgrade pip setuptools wheel \
+    && pip3 install --no-cache-dir -r requirements.txt \
+    && pip3 check || true
 
 # App code
 COPY app.py worker.py scanner.py ./
-RUN pip install --no-cache-dir gunicorn==22.0.0
+RUN pip3 install --no-cache-dir gunicorn==22.0.0
 
 # Healthcheck against the app's /health endpoint
 EXPOSE 8080
 
-CMD ["gunicorn", "-b", "0.0.0.0:8080", "app:app", "--workers", "2", "--threads", "4"]
\ No newline at end of file
+CMD ["gunicorn", "-b", "0.0.0.0:8080", "app:app", "--workers", "2", "--threads", "4"]
diff --git a/app/worker.py b/app/worker.py
index 733f761..4f1ac5c 100644
--- a/app/worker.py
+++ b/app/worker.py
@@ -162,13 +162,33 @@ def get_model():
     global _model
     if _model is None:
         print(f"[whisper] loading model='{MODEL_NAME}' device='{WHISPER_DEVICE}' idx={WHISPER_DEVICE_INDEX} compute='{COMPUTE}' threads={WHISPER_CPU_THREADS}", flush=True)
-        _model = WhisperModel(
-            MODEL_NAME,
-            device=WHISPER_DEVICE,
-            device_index=WHISPER_DEVICE_INDEX,
-            compute_type=COMPUTE,
-            cpu_threads=WHISPER_CPU_THREADS,
-        )
+        try:
+            _model = WhisperModel(
+                MODEL_NAME,
+                device=WHISPER_DEVICE,
+                device_index=WHISPER_DEVICE_INDEX,
+                compute_type=COMPUTE,
+                cpu_threads=WHISPER_CPU_THREADS,
+            )
+        except Exception as e:
+            # If GPU is selected/auto-selected but not available, some environments try to load
+            # CUDA/cuDNN and fail. Fall back to CPU automatically.
+            msg = str(e).lower()
+            gpu_markers = [
+                "cuda", "cublas", "cudnn", "hip", "rocm", "nvrtc", "gpu",
+                "unable to load any of {libcudnn", "cannot load symbol cudnncreatetensordescriptor",
+            ]
+            if WHISPER_DEVICE.lower() != "cpu" and any(m in msg for m in gpu_markers):
+                print(f"[whisper] model init failed on device '{WHISPER_DEVICE}': {e}. Falling back to CPU…", flush=True)
+                _model = WhisperModel(
+                    MODEL_NAME,
+                    device="cpu",
+                    device_index=0,
+                    compute_type=COMPUTE,
+                    cpu_threads=WHISPER_CPU_THREADS,
+                )
+            else:
+                raise
     return _model
 
 # --- Helper: Reset model with new device and device_index ---
@@ -191,8 +211,8 @@ def run_transcribe_with_fallback(wav_path: Path, lang):
     Try to transcribe with current model; on GPU/CUDA/HIP/ROCm/OOM errors, reset to CPU and retry once.
     Returns (segments, info) or raises exception.
""" - model = get_model() try: + model = get_model() return model.transcribe(str(wav_path), vad_filter=True, language=lang) except Exception as e: msg = str(e) diff --git a/docker-compose.yml b/docker-compose.yml index db98458..67088f2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,6 +11,8 @@ services: TMP_ROOT: /tmpdl WHISPER_MODEL: large-v3 WHISPER_PRECISION: int8 + WHISPER_DEVICE: ${WHISPER_DEVICE:-cpu} + WHISPER_CPU_THREADS: ${WHISPER_CPU_THREADS:-4} TRANSCRIBE_BACKEND: ${TRANSCRIBE_BACKEND:-local} OPENAI_API_KEY: ${OPENAI_API_KEY:-} OPENAI_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1} @@ -66,6 +68,8 @@ services: TMP_ROOT: /tmpdl WHISPER_MODEL: large-v3 WHISPER_PRECISION: int8 + WHISPER_DEVICE: ${WHISPER_DEVICE:-cpu} + WHISPER_CPU_THREADS: ${WHISPER_CPU_THREADS:-4} WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1} WHISPER_RESUME: ${WHISPER_RESUME:-1} WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20} @@ -129,6 +133,8 @@ services: TMP_ROOT: /tmpdl WHISPER_MODEL: large-v3 WHISPER_PRECISION: int8 + WHISPER_DEVICE: ${WHISPER_DEVICE:-cpu} + WHISPER_CPU_THREADS: ${WHISPER_CPU_THREADS:-4} WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1} WHISPER_RESUME: ${WHISPER_RESUME:-1} WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}