Introducing OpenAI offloaded transcription

This commit is contained in:
2025-09-24 09:36:00 +02:00
parent 47d13cde83
commit 73e89b9a67
4 changed files with 125 additions and 12 deletions

View File

@@ -10,6 +10,14 @@ OPENWEBUI_API_KEY=put_your_openwebui_api_key_here
OPENWEBUI_KB_NAME=Homelab Library OPENWEBUI_KB_NAME=Homelab Library
OPENWEBUI_KB_ID=your_kb_uuid_here OPENWEBUI_KB_ID=your_kb_uuid_here
# Transcription backend (local Whisper by default)
TRANSCRIBE_BACKEND=local
OPENAI_API_KEY=
# Uncomment to customize OpenAI settings when offloading transcription
# OPENAI_BASE_URL=https://api.openai.com/v1
# OPENAI_TRANSCRIBE_MODEL=whisper-1
# OPENAI_TRANSCRIBE_TIMEOUT=600
# Docker volumes paths # Docker volumes paths
LIBRARY_HOST_DIR=/mnt/nfs/library LIBRARY_HOST_DIR=/mnt/nfs/library
TRANSCRIPTS_HOST_DIR=/mnt/nfs/transcripts TRANSCRIPTS_HOST_DIR=/mnt/nfs/transcripts
@@ -17,4 +25,4 @@ TRANSCRIPTS_HOST_DIR=/mnt/nfs/transcripts
# TMP_HOST_DIR=./tmp # TMP_HOST_DIR=./tmp
# MODELS_HOST_DIR=./models # MODELS_HOST_DIR=./models
# MEILI_DATA_HOST_DIR=./data/meili # MEILI_DATA_HOST_DIR=./data/meili
# REDIS_DATA_HOST_DIR=./data/redis # REDIS_DATA_HOST_DIR=./data/redis

View File

@@ -33,6 +33,9 @@ Note: `.env.example` includes placeholders for both **Meili** and **OpenWebUI**
- `WHISPER_MODEL`: Whisper model variant to use for transcription (e.g., `small`, `medium`, `large`). - `WHISPER_MODEL`: Whisper model variant to use for transcription (e.g., `small`, `medium`, `large`).
- `WHISPER_PRECISION`: Precision setting for Whisper inference (`float32` or `float16`). - `WHISPER_PRECISION`: Precision setting for Whisper inference (`float32` or `float16`).
- `WHISPER_LANGUAGE`: Language code for Whisper to use during transcription (e.g., `en` for English). - `WHISPER_LANGUAGE`: Language code for Whisper to use during transcription (e.g., `en` for English).
- `TRANSCRIBE_BACKEND` (default `local`): Set to `openai` to offload Whisper transcription to the OpenAI API instead of running locally.
- `OPENAI_API_KEY`: Required when `TRANSCRIBE_BACKEND=openai`; API key used for authenticated requests.
- `OPENAI_BASE_URL`, `OPENAI_TRANSCRIBE_MODEL`, `OPENAI_TRANSCRIBE_TIMEOUT`: Optional overrides for the OpenAI transcription endpoint, model and request timeout.
- `YTDLP_COOKIES`: Path to YouTube-DL cookies file for accessing age-restricted or private videos. - `YTDLP_COOKIES`: Path to YouTube-DL cookies file for accessing age-restricted or private videos.
- `OPENWEBUI_URL`: Base URL of the OpenWebUI API (default depends on platform). - `OPENWEBUI_URL`: Base URL of the OpenWebUI API (default depends on platform).
- `OPENWEBUI_API_KEY`: API key for authenticating PodX workers with OpenWebUI. - `OPENWEBUI_API_KEY`: API key for authenticating PodX workers with OpenWebUI.

View File

@@ -1,4 +1,5 @@
import os, subprocess, shutil, json, re, orjson, requests, unicodedata import os, subprocess, shutil, json, re, orjson, requests, unicodedata
from types import SimpleNamespace
from rq import Queue from rq import Queue
from redis import Redis from redis import Redis
from pathlib import Path from pathlib import Path
@@ -105,6 +106,13 @@ OFFLOAD_TRANSCRIBE = os.getenv("OFFLOAD_TRANSCRIBE", "1").lower() not in ("0", "
WORKER_MODE = os.getenv("WORKER_MODE", "all").strip().lower() # 'all' or 'transcribe' WORKER_MODE = os.getenv("WORKER_MODE", "all").strip().lower() # 'all' or 'transcribe'
JOB_QUEUES = [q.strip() for q in os.getenv("JOB_QUEUES", "default").split(",") if q.strip()] JOB_QUEUES = [q.strip() for q in os.getenv("JOB_QUEUES", "default").split(",") if q.strip()]
# Remote transcription (OpenAI) configuration
# TRANSCRIBE_BACKEND: 'local' (default) runs Whisper in-process; 'openai' offloads
# audio to the OpenAI transcription API (requires OPENAI_API_KEY).
TRANSCRIBE_BACKEND = os.getenv("TRANSCRIBE_BACKEND", "local").strip().lower()
OPENAI_API_KEY = (os.getenv("OPENAI_API_KEY", "") or "").strip()
# Normalized with no trailing slash so endpoint paths can be appended directly.
OPENAI_BASE_URL = (os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1") or "https://api.openai.com/v1").rstrip("/")
OPENAI_TRANSCRIBE_MODEL = os.getenv("OPENAI_TRANSCRIBE_MODEL", "whisper-1").strip()
# Per-request timeout (seconds) for the upload + transcription round trip.
# A malformed env value must not crash the worker at import time, so fall
# back to the default instead of letting int() raise ValueError.
try:
    OPENAI_TRANSCRIBE_TIMEOUT = int(os.getenv("OPENAI_TRANSCRIBE_TIMEOUT", "600"))
except ValueError:
    print("[openai] invalid OPENAI_TRANSCRIBE_TIMEOUT; falling back to 600s", flush=True)
    OPENAI_TRANSCRIBE_TIMEOUT = 600
def _mode_allows(task: str) -> bool: def _mode_allows(task: str) -> bool:
"""Gate tasks by worker role. In 'transcribe' mode only allow transcription of local files """Gate tasks by worker role. In 'transcribe' mode only allow transcription of local files
(including indexing and OWUI publish). "task" is one of: 'download','web','local','transcribe'.""" (including indexing and OWUI publish). "task" is one of: 'download','web','local','transcribe'."""
@@ -171,6 +179,73 @@ def run_transcribe_with_fallback(wav_path: Path, lang):
raise raise
raise raise
def run_transcribe_openai(wav_path: Path, lang_hint: str | None):
"""Transcribe audio via OpenAI's Whisper API, returning (segments, info, raw_payload)."""
if not OPENAI_API_KEY:
raise RuntimeError("OPENAI_API_KEY must be set when TRANSCRIBE_BACKEND is 'openai'")
url = f"{OPENAI_BASE_URL}/audio/transcriptions"
headers = {"Authorization": f"Bearer {OPENAI_API_KEY}"}
data: dict[str, str] = {
"model": OPENAI_TRANSCRIBE_MODEL or "whisper-1",
"response_format": "verbose_json",
}
if lang_hint:
data["language"] = lang_hint
start = time.time()
with open(wav_path, "rb") as fh:
files = {"file": (wav_path.name, fh, "audio/wav")}
resp = requests.post(
url,
headers=headers,
data=data,
files=files,
timeout=OPENAI_TRANSCRIBE_TIMEOUT,
)
elapsed = time.time() - start
try:
resp.raise_for_status()
except requests.HTTPError as exc:
print(f"[openai] transcription failed ({exc}); response={resp.text[:400]}", flush=True)
raise
payload = resp.json()
segments_raw = payload.get("segments") or []
seg_objs: list[SimpleNamespace] = []
for seg in segments_raw:
seg_objs.append(
SimpleNamespace(
start=float(seg.get("start") or 0.0),
end=float(seg.get("end") or 0.0),
text=str(seg.get("text") or ""),
)
)
if not seg_objs and payload.get("text"):
duration = float(payload.get("duration") or 0.0)
seg_objs.append(
SimpleNamespace(
start=0.0,
end=duration,
text=str(payload.get("text") or ""),
)
)
language = payload.get("language") or lang_hint or ""
info = SimpleNamespace(language=language)
print(
f"[openai] transcribed {wav_path.name} via {OPENAI_TRANSCRIBE_MODEL or 'whisper-1'} "
f"in {elapsed:.1f}s; segments={len(seg_objs)} lang={language or 'unknown'}",
flush=True,
)
return seg_objs, info, payload
def log(feed): def log(feed):
try: try:
with open(TRN / "_feed.log", "a", encoding="utf-8") as f: with open(TRN / "_feed.log", "a", encoding="utf-8") as f:
@@ -908,7 +983,8 @@ def _save_partial(title: str, language: str, segs: list[dict]):
print(f"[whisper] partial txt save failed: {e}", flush=True) print(f"[whisper] partial txt save failed: {e}", flush=True)
def transcribe(media_path: Path): def transcribe(media_path: Path):
print(f"[whisper] start transcribe: {media_path}", flush=True) backend = TRANSCRIBE_BACKEND
print(f"[transcribe] start backend={backend}: {media_path}", flush=True)
# If paused, abort before any heavy work (no ffmpeg, no model load) # If paused, abort before any heavy work (no ffmpeg, no model load)
if transcribe_paused(): if transcribe_paused():
print(f"[pause] transcribe: pause active before heavy work; aborting {media_path}", flush=True) print(f"[pause] transcribe: pause active before heavy work; aborting {media_path}", flush=True)
@@ -927,12 +1003,13 @@ def transcribe(media_path: Path):
title = media_path.stem title = media_path.stem
base = TRN / title base = TRN / title
resume_enabled = (backend != "openai") and WHISPER_RESUME
# Resume support: if a partial checkpoint exists, load it and trim input # Resume support: if a partial checkpoint exists, load it and trim input
resume_segments = [] resume_segments = []
resume_offset = 0.0 resume_offset = 0.0
language_hint = None language_hint = None
if WHISPER_RESUME: if resume_enabled:
pjson, ptxt = _partial_paths(title) pjson, ptxt = _partial_paths(title)
if pjson.exists(): if pjson.exists():
try: try:
@@ -946,7 +1023,10 @@ def transcribe(media_path: Path):
print(f"[whisper] failed to load partial: {e}", flush=True) print(f"[whisper] failed to load partial: {e}", flush=True)
# If resuming, trim WAV from last end time # If resuming, trim WAV from last end time
wav_for_run = trim_wav(wav, resume_offset, TMP) if resume_enabled and resume_offset > 0.0:
wav_for_run = trim_wav(wav, resume_offset, TMP)
else:
wav_for_run = wav
# 2) Language selection # 2) Language selection
lang = None if WHISPER_LANGUAGE.lower() == "auto" else WHISPER_LANGUAGE lang = None if WHISPER_LANGUAGE.lower() == "auto" else WHISPER_LANGUAGE
@@ -954,14 +1034,18 @@ def transcribe(media_path: Path):
# carry hint forward if available # carry hint forward if available
lang = language_hint lang = language_hint
# 3) Transcribe # 3) Transcribe (local Whisper or OpenAI backend)
segments, info = run_transcribe_with_fallback(wav_for_run, lang) payload = None
if backend == "openai":
segments, info, payload = run_transcribe_openai(wav_for_run, lang)
else:
segments, info = run_transcribe_with_fallback(wav_for_run, lang)
# Determine duration for progress; use full WAV duration for consistent % regardless of resume # Determine duration for progress; use full WAV duration for consistent % regardless of resume
dur = media_duration_seconds(wav) or 0.0 dur = media_duration_seconds(wav) or 0.0
# Start wall clock timer for speed/ETA # Start wall clock timer for speed/ETA
start_wall = time.time() start_wall = time.time()
if WHISPER_RESUME and resume_offset and dur and resume_offset >= dur: if resume_enabled and resume_offset and dur and resume_offset >= dur:
print(f"[whisper] resume offset {resume_offset:.2f}s >= duration {dur:.2f}s; resetting resume.", flush=True) print(f"[whisper] resume offset {resume_offset:.2f}s >= duration {dur:.2f}s; resetting resume.", flush=True)
resume_offset = 0.0 resume_offset = 0.0
last_pct = -1 last_pct = -1
@@ -981,7 +1065,7 @@ def transcribe(media_path: Path):
text_parts.append(s.text) text_parts.append(s.text)
# --- Cooperative pause: save checkpoint and abort as soon as pause is requested --- # --- Cooperative pause: save checkpoint and abort as soon as pause is requested ---
if transcribe_paused(): if resume_enabled and transcribe_paused():
try: try:
pct = int(min(100, max(0, (end / dur) * 100))) if dur > 0 else 0 pct = int(min(100, max(0, (end / dur) * 100))) if dur > 0 else 0
except Exception: except Exception:
@@ -1037,7 +1121,7 @@ def transcribe(media_path: Path):
# periodic partial save # periodic partial save
seg_count_since_save += 1 seg_count_since_save += 1
if WHISPER_RESUME and seg_count_since_save >= PARTIAL_SAVE_EVERY_SEGS: if resume_enabled and seg_count_since_save >= PARTIAL_SAVE_EVERY_SEGS:
_save_partial(title, info.language or (WHISPER_LANGUAGE if WHISPER_LANGUAGE.lower() != "auto" else "en"), segs) _save_partial(title, info.language or (WHISPER_LANGUAGE if WHISPER_LANGUAGE.lower() != "auto" else "en"), segs)
seg_count_since_save = 0 seg_count_since_save = 0
@@ -1109,7 +1193,7 @@ def transcribe(media_path: Path):
pass pass
# Remove partial checkpoints on success # Remove partial checkpoints on success
if WHISPER_RESUME: if resume_enabled:
try: try:
pjson, ptxt = _partial_paths(title) pjson, ptxt = _partial_paths(title)
if pjson.exists(): pjson.unlink() if pjson.exists(): pjson.unlink()
@@ -1124,7 +1208,10 @@ def transcribe(media_path: Path):
print(f"[whisper] avg speed ~{avg_rtf:0.2f}x (audio_seconds / wall_seconds)", flush=True) print(f"[whisper] avg speed ~{avg_rtf:0.2f}x (audio_seconds / wall_seconds)", flush=True)
except Exception: except Exception:
pass pass
print(f"[whisper] finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s", flush=True) print(
f"[transcribe] backend={backend} finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s",
flush=True,
)
return base return base
@@ -1776,4 +1863,4 @@ def handle_url(url: str):
log({**info, **{"status":"done"}}) log({**info, **{"status":"done"}})
except Exception as e: except Exception as e:
log({"url": url, "status":"error", "error": str(e)}) log({"url": url, "status":"error", "error": str(e)})
raise raise

View File

@@ -11,6 +11,11 @@ services:
TMP_ROOT: /tmpdl TMP_ROOT: /tmpdl
WHISPER_MODEL: large-v3 WHISPER_MODEL: large-v3
WHISPER_PRECISION: int8 WHISPER_PRECISION: int8
TRANSCRIBE_BACKEND: ${TRANSCRIBE_BACKEND:-local}
OPENAI_API_KEY: ${OPENAI_API_KEY:-}
OPENAI_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1}
OPENAI_TRANSCRIBE_MODEL: ${OPENAI_TRANSCRIBE_MODEL:-whisper-1}
OPENAI_TRANSCRIBE_TIMEOUT: ${OPENAI_TRANSCRIBE_TIMEOUT:-600}
OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080} OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080}
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY} OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}
OPENWEBUI_KB_NAME: ${OPENWEBUI_KB_NAME:-Homelab Library} OPENWEBUI_KB_NAME: ${OPENWEBUI_KB_NAME:-Homelab Library}
@@ -49,6 +54,11 @@ services:
WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1} WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
WHISPER_RESUME: ${WHISPER_RESUME:-1} WHISPER_RESUME: ${WHISPER_RESUME:-1}
WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20} WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
TRANSCRIBE_BACKEND: ${TRANSCRIBE_BACKEND:-local}
OPENAI_API_KEY: ${OPENAI_API_KEY:-}
OPENAI_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1}
OPENAI_TRANSCRIBE_MODEL: ${OPENAI_TRANSCRIBE_MODEL:-whisper-1}
OPENAI_TRANSCRIBE_TIMEOUT: ${OPENAI_TRANSCRIBE_TIMEOUT:-600}
WORKER_MODE: all WORKER_MODE: all
OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080} OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080}
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY} OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}
@@ -89,6 +99,11 @@ services:
WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1} WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
WHISPER_RESUME: ${WHISPER_RESUME:-1} WHISPER_RESUME: ${WHISPER_RESUME:-1}
WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20} WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
TRANSCRIBE_BACKEND: ${TRANSCRIBE_BACKEND:-local}
OPENAI_API_KEY: ${OPENAI_API_KEY:-}
OPENAI_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1}
OPENAI_TRANSCRIBE_MODEL: ${OPENAI_TRANSCRIBE_MODEL:-whisper-1}
OPENAI_TRANSCRIBE_TIMEOUT: ${OPENAI_TRANSCRIBE_TIMEOUT:-600}
WORKER_MODE: transcribe WORKER_MODE: transcribe
OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080} OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080}
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY} OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}