diff --git a/.env.example b/.env.example index a623ade..fe8ff41 100644 --- a/.env.example +++ b/.env.example @@ -10,6 +10,14 @@ OPENWEBUI_API_KEY=put_your_openwebui_api_key_here OPENWEBUI_KB_NAME=Homelab Library OPENWEBUI_KB_ID=your_kb_uuid_here +# Transcription backend (local Whisper by default) +TRANSCRIBE_BACKEND=local +OPENAI_API_KEY= +# Uncomment to customize OpenAI settings when offloading transcription +# OPENAI_BASE_URL=https://api.openai.com/v1 +# OPENAI_TRANSCRIBE_MODEL=whisper-1 +# OPENAI_TRANSCRIBE_TIMEOUT=600 + # Docker volumes paths LIBRARY_HOST_DIR=/mnt/nfs/library TRANSCRIPTS_HOST_DIR=/mnt/nfs/transcripts @@ -17,4 +25,4 @@ TRANSCRIPTS_HOST_DIR=/mnt/nfs/transcripts # TMP_HOST_DIR=./tmp # MODELS_HOST_DIR=./models # MEILI_DATA_HOST_DIR=./data/meili -# REDIS_DATA_HOST_DIR=./data/redis \ No newline at end of file +# REDIS_DATA_HOST_DIR=./data/redis diff --git a/README.md b/README.md index 59ed83c..f4f557b 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,9 @@ Note: `.env.example` includes placeholders for both **Meili** and **OpenWebUI** - `WHISPER_MODEL`: Whisper model variant to use for transcription (e.g., `small`, `medium`, `large`). - `WHISPER_PRECISION`: Precision setting for Whisper inference (`float32` or `float16`). - `WHISPER_LANGUAGE`: Language code for Whisper to use during transcription (e.g., `en` for English). +- `TRANSCRIBE_BACKEND` (default `local`): Set to `openai` to offload Whisper transcription to the OpenAI API instead of running locally. +- `OPENAI_API_KEY`: Required when `TRANSCRIBE_BACKEND=openai`; API key used for authenticated requests. +- `OPENAI_BASE_URL`, `OPENAI_TRANSCRIBE_MODEL`, `OPENAI_TRANSCRIBE_TIMEOUT`: Optional overrides for the OpenAI transcription endpoint, model and request timeout. - `YTDLP_COOKIES`: Path to YouTube-DL cookies file for accessing age-restricted or private videos. - `OPENWEBUI_URL`: Base URL of the OpenWebUI API (default depends on platform). - `OPENWEBUI_API_KEY`: API key for authenticating PodX workers with OpenWebUI. diff --git a/app/worker.py b/app/worker.py index 3bb8857..3830249 100644 --- a/app/worker.py +++ b/app/worker.py @@ -1,4 +1,5 @@ import os, subprocess, shutil, json, re, orjson, requests, unicodedata +from types import SimpleNamespace from rq import Queue from redis import Redis from pathlib import Path @@ -105,6 +106,13 @@ OFFLOAD_TRANSCRIBE = os.getenv("OFFLOAD_TRANSCRIBE", "1").lower() not in ("0", " WORKER_MODE = os.getenv("WORKER_MODE", "all").strip().lower() # 'all' or 'transcribe' JOB_QUEUES = [q.strip() for q in os.getenv("JOB_QUEUES", "default").split(",") if q.strip()] +# Remote transcription (OpenAI) configuration +TRANSCRIBE_BACKEND = os.getenv("TRANSCRIBE_BACKEND", "local").strip().lower() +OPENAI_API_KEY = (os.getenv("OPENAI_API_KEY", "") or "").strip() +OPENAI_BASE_URL = (os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1") or "https://api.openai.com/v1").rstrip("/") +OPENAI_TRANSCRIBE_MODEL = os.getenv("OPENAI_TRANSCRIBE_MODEL", "whisper-1").strip() +OPENAI_TRANSCRIBE_TIMEOUT = int(os.getenv("OPENAI_TRANSCRIBE_TIMEOUT", "600")) + def _mode_allows(task: str) -> bool: """Gate tasks by worker role. In 'transcribe' mode only allow transcription of local files (including indexing and OWUI publish). "task" is one of: 'download','web','local','transcribe'.""" @@ -171,6 +179,73 @@ def run_transcribe_with_fallback(wav_path: Path, lang): raise raise + +def run_transcribe_openai(wav_path: Path, lang_hint: str | None): + """Transcribe audio via OpenAI's Whisper API, returning (segments, info, raw_payload).""" + if not OPENAI_API_KEY: + raise RuntimeError("OPENAI_API_KEY must be set when TRANSCRIBE_BACKEND is 'openai'") + + url = f"{OPENAI_BASE_URL}/audio/transcriptions" + headers = {"Authorization": f"Bearer {OPENAI_API_KEY}"} + data: dict[str, str] = { + "model": OPENAI_TRANSCRIBE_MODEL or "whisper-1", + "response_format": "verbose_json", + } + if lang_hint: + data["language"] = lang_hint + + start = time.time() + with open(wav_path, "rb") as fh: + files = {"file": (wav_path.name, fh, "audio/wav")} + resp = requests.post( + url, + headers=headers, + data=data, + files=files, + timeout=OPENAI_TRANSCRIBE_TIMEOUT, + ) + elapsed = time.time() - start + + try: + resp.raise_for_status() + except requests.HTTPError as exc: + print(f"[openai] transcription failed ({exc}); response={resp.text[:400]}", flush=True) + raise + + payload = resp.json() + segments_raw = payload.get("segments") or [] + + seg_objs: list[SimpleNamespace] = [] + for seg in segments_raw: + seg_objs.append( + SimpleNamespace( + start=float(seg.get("start") or 0.0), + end=float(seg.get("end") or 0.0), + text=str(seg.get("text") or ""), + ) + ) + + if not seg_objs and payload.get("text"): + duration = float(payload.get("duration") or 0.0) + seg_objs.append( + SimpleNamespace( + start=0.0, + end=duration, + text=str(payload.get("text") or ""), + ) + ) + + language = payload.get("language") or lang_hint or "" + info = SimpleNamespace(language=language) + + print( + f"[openai] transcribed {wav_path.name} via {OPENAI_TRANSCRIBE_MODEL or 'whisper-1'} " + f"in {elapsed:.1f}s; segments={len(seg_objs)} lang={language or 'unknown'}", + flush=True, + ) + + return seg_objs, info, payload + def log(feed): try: with open(TRN / "_feed.log", "a", encoding="utf-8") as f: @@ -908,7 +983,8 @@ def _save_partial(title: str, language: str, segs: list[dict]): print(f"[whisper] partial txt save failed: {e}", flush=True) def transcribe(media_path: Path): - print(f"[whisper] start transcribe: {media_path}", flush=True) + backend = TRANSCRIBE_BACKEND + print(f"[transcribe] start backend={backend}: {media_path}", flush=True) # If paused, abort before any heavy work (no ffmpeg, no model load) if transcribe_paused(): print(f"[pause] transcribe: pause active before heavy work; aborting {media_path}", flush=True) @@ -927,12 +1003,13 @@ def transcribe(media_path: Path): title = media_path.stem base = TRN / title + resume_enabled = (backend != "openai") and WHISPER_RESUME # Resume support: if a partial checkpoint exists, load it and trim input resume_segments = [] resume_offset = 0.0 language_hint = None - if WHISPER_RESUME: + if resume_enabled: pjson, ptxt = _partial_paths(title) if pjson.exists(): try: @@ -946,7 +1023,10 @@ def transcribe(media_path: Path): print(f"[whisper] failed to load partial: {e}", flush=True) # If resuming, trim WAV from last end time - wav_for_run = trim_wav(wav, resume_offset, TMP) + if resume_enabled and resume_offset > 0.0: + wav_for_run = trim_wav(wav, resume_offset, TMP) + else: + wav_for_run = wav # 2) Language selection lang = None if WHISPER_LANGUAGE.lower() == "auto" else WHISPER_LANGUAGE @@ -954,14 +1034,18 @@ def transcribe(media_path: Path): # carry hint forward if available lang = language_hint - # 3) Transcribe - segments, info = run_transcribe_with_fallback(wav_for_run, lang) + # 3) Transcribe (local Whisper or OpenAI backend) + payload = None + if backend == "openai": + segments, info, payload = run_transcribe_openai(wav_for_run, lang) + else: + segments, info = run_transcribe_with_fallback(wav_for_run, lang) # Determine duration for progress; use full WAV duration for consistent % regardless of resume dur = media_duration_seconds(wav) or 0.0 # Start wall clock timer for speed/ETA start_wall = time.time() - if WHISPER_RESUME and resume_offset and dur and resume_offset >= dur: + if resume_enabled and resume_offset and dur and resume_offset >= dur: print(f"[whisper] resume offset {resume_offset:.2f}s >= duration {dur:.2f}s; resetting resume.", flush=True) resume_offset = 0.0 last_pct = -1 @@ -981,7 +1065,7 @@ def transcribe(media_path: Path): text_parts.append(s.text) # --- Cooperative pause: save checkpoint and abort as soon as pause is requested --- - if transcribe_paused(): + if resume_enabled and transcribe_paused(): try: pct = int(min(100, max(0, (end / dur) * 100))) if dur > 0 else 0 except Exception: @@ -1037,7 +1121,7 @@ def transcribe(media_path: Path): # periodic partial save seg_count_since_save += 1 - if WHISPER_RESUME and seg_count_since_save >= PARTIAL_SAVE_EVERY_SEGS: + if resume_enabled and seg_count_since_save >= PARTIAL_SAVE_EVERY_SEGS: _save_partial(title, info.language or (WHISPER_LANGUAGE if WHISPER_LANGUAGE.lower() != "auto" else "en"), segs) seg_count_since_save = 0 @@ -1109,7 +1193,7 @@ def transcribe(media_path: Path): pass # Remove partial checkpoints on success - if WHISPER_RESUME: + if resume_enabled: try: pjson, ptxt = _partial_paths(title) if pjson.exists(): pjson.unlink() @@ -1124,7 +1208,10 @@ def transcribe(media_path: Path): print(f"[whisper] avg speed ~{avg_rtf:0.2f}x (audio_seconds / wall_seconds)", flush=True) except Exception: pass - print(f"[whisper] finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s", flush=True) + print( + f"[transcribe] backend={backend} finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s", + flush=True, + ) return base @@ -1776,4 +1863,4 @@ def handle_url(url: str): log({**info, **{"status":"done"}}) except Exception as e: log({"url": url, "status":"error", "error": str(e)}) - raise \ No newline at end of file + raise diff --git a/docker-compose.yml b/docker-compose.yml index 78a7727..98ad1a8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,6 +11,11 @@ services: TMP_ROOT: /tmpdl WHISPER_MODEL: large-v3 WHISPER_PRECISION: int8 + TRANSCRIBE_BACKEND: ${TRANSCRIBE_BACKEND:-local} + OPENAI_API_KEY: ${OPENAI_API_KEY:-} + OPENAI_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1} + OPENAI_TRANSCRIBE_MODEL: ${OPENAI_TRANSCRIBE_MODEL:-whisper-1} + OPENAI_TRANSCRIBE_TIMEOUT: ${OPENAI_TRANSCRIBE_TIMEOUT:-600} OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080} OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY} OPENWEBUI_KB_NAME: ${OPENWEBUI_KB_NAME:-Homelab Library} @@ -49,6 +54,11 @@ services: WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1} WHISPER_RESUME: ${WHISPER_RESUME:-1} WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20} + TRANSCRIBE_BACKEND: ${TRANSCRIBE_BACKEND:-local} + OPENAI_API_KEY: ${OPENAI_API_KEY:-} + OPENAI_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1} + OPENAI_TRANSCRIBE_MODEL: ${OPENAI_TRANSCRIBE_MODEL:-whisper-1} + OPENAI_TRANSCRIBE_TIMEOUT: ${OPENAI_TRANSCRIBE_TIMEOUT:-600} WORKER_MODE: all OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080} OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY} @@ -89,6 +99,11 @@ services: WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1} WHISPER_RESUME: ${WHISPER_RESUME:-1} WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20} + TRANSCRIBE_BACKEND: ${TRANSCRIBE_BACKEND:-local} + OPENAI_API_KEY: ${OPENAI_API_KEY:-} + OPENAI_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1} + OPENAI_TRANSCRIBE_MODEL: ${OPENAI_TRANSCRIBE_MODEL:-whisper-1} + OPENAI_TRANSCRIBE_TIMEOUT: ${OPENAI_TRANSCRIBE_TIMEOUT:-600} WORKER_MODE: transcribe OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080} OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}