Introducing OpenAI offloaded transcription
This commit is contained in:
10
.env.example
10
.env.example
@@ -10,6 +10,14 @@ OPENWEBUI_API_KEY=put_your_openwebui_api_key_here
|
|||||||
OPENWEBUI_KB_NAME=Homelab Library
|
OPENWEBUI_KB_NAME=Homelab Library
|
||||||
OPENWEBUI_KB_ID=your_kb_uuid_here
|
OPENWEBUI_KB_ID=your_kb_uuid_here
|
||||||
|
|
||||||
|
# Transcription backend (local Whisper by default)
|
||||||
|
TRANSCRIBE_BACKEND=local
|
||||||
|
OPENAI_API_KEY=
|
||||||
|
# Uncomment to customize OpenAI settings when offloading transcription
|
||||||
|
# OPENAI_BASE_URL=https://api.openai.com/v1
|
||||||
|
# OPENAI_TRANSCRIBE_MODEL=whisper-1
|
||||||
|
# OPENAI_TRANSCRIBE_TIMEOUT=600
|
||||||
|
|
||||||
# Docker volumes paths
|
# Docker volumes paths
|
||||||
LIBRARY_HOST_DIR=/mnt/nfs/library
|
LIBRARY_HOST_DIR=/mnt/nfs/library
|
||||||
TRANSCRIPTS_HOST_DIR=/mnt/nfs/transcripts
|
TRANSCRIPTS_HOST_DIR=/mnt/nfs/transcripts
|
||||||
@@ -17,4 +25,4 @@ TRANSCRIPTS_HOST_DIR=/mnt/nfs/transcripts
|
|||||||
# TMP_HOST_DIR=./tmp
|
# TMP_HOST_DIR=./tmp
|
||||||
# MODELS_HOST_DIR=./models
|
# MODELS_HOST_DIR=./models
|
||||||
# MEILI_DATA_HOST_DIR=./data/meili
|
# MEILI_DATA_HOST_DIR=./data/meili
|
||||||
# REDIS_DATA_HOST_DIR=./data/redis
|
# REDIS_DATA_HOST_DIR=./data/redis
|
||||||
|
@@ -33,6 +33,9 @@ Note: `.env.example` includes placeholders for both **Meili** and **OpenWebUI**
|
|||||||
- `WHISPER_MODEL`: Whisper model variant to use for transcription (e.g., `small`, `medium`, `large`).
|
- `WHISPER_MODEL`: Whisper model variant to use for transcription (e.g., `small`, `medium`, `large`).
|
||||||
- `WHISPER_PRECISION`: Precision setting for Whisper inference (`float32` or `float16`).
|
- `WHISPER_PRECISION`: Precision setting for Whisper inference (`float32` or `float16`).
|
||||||
- `WHISPER_LANGUAGE`: Language code for Whisper to use during transcription (e.g., `en` for English).
|
- `WHISPER_LANGUAGE`: Language code for Whisper to use during transcription (e.g., `en` for English).
|
||||||
|
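- `TRANSCRIBE_BACKEND` (default `local`): Set to `openai` to offload Whisper transcription to the OpenAI API instead of running locally; any other value uses the local Whisper model. With `openai`, partial-checkpoint resume (`WHISPER_RESUME`) is not used.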
|
||||||
|
- `OPENAI_API_KEY`: Required when `TRANSCRIBE_BACKEND=openai`; API key used for authenticated requests.
|
||||||
|
- `OPENAI_BASE_URL`, `OPENAI_TRANSCRIBE_MODEL`, `OPENAI_TRANSCRIBE_TIMEOUT`: Optional overrides for the OpenAI transcription endpoint (default `https://api.openai.com/v1`), model (default `whisper-1`) and request timeout in seconds (default `600`).
|
||||||
- `YTDLP_COOKIES`: Path to YouTube-DL cookies file for accessing age-restricted or private videos.
|
- `YTDLP_COOKIES`: Path to YouTube-DL cookies file for accessing age-restricted or private videos.
|
||||||
- `OPENWEBUI_URL`: Base URL of the OpenWebUI API (default depends on platform).
|
- `OPENWEBUI_URL`: Base URL of the OpenWebUI API (default depends on platform).
|
||||||
- `OPENWEBUI_API_KEY`: API key for authenticating PodX workers with OpenWebUI.
|
- `OPENWEBUI_API_KEY`: API key for authenticating PodX workers with OpenWebUI.
|
||||||
|
109
app/worker.py
109
app/worker.py
@@ -1,4 +1,5 @@
|
|||||||
import os, subprocess, shutil, json, re, orjson, requests, unicodedata
|
import os, subprocess, shutil, json, re, orjson, requests, unicodedata
|
||||||
|
from types import SimpleNamespace
|
||||||
from rq import Queue
|
from rq import Queue
|
||||||
from redis import Redis
|
from redis import Redis
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -105,6 +106,13 @@ OFFLOAD_TRANSCRIBE = os.getenv("OFFLOAD_TRANSCRIBE", "1").lower() not in ("0", "
|
|||||||
WORKER_MODE = os.getenv("WORKER_MODE", "all").strip().lower() # 'all' or 'transcribe'
|
WORKER_MODE = os.getenv("WORKER_MODE", "all").strip().lower() # 'all' or 'transcribe'
|
||||||
JOB_QUEUES = [q.strip() for q in os.getenv("JOB_QUEUES", "default").split(",") if q.strip()]
|
JOB_QUEUES = [q.strip() for q in os.getenv("JOB_QUEUES", "default").split(",") if q.strip()]
|
||||||
|
|
||||||
|
# Remote transcription (OpenAI) configuration
def _positive_int_env(name: str, default: int) -> int:
    """Read environment variable *name* as a positive int, falling back to *default*.

    A missing, blank, non-numeric or non-positive value must not crash the
    worker at import time (the setting only matters when the OpenAI backend is
    selected), so such values are reported and replaced by *default*.
    """
    raw = (os.getenv(name) or "").strip()
    if not raw:
        return default
    try:
        value = int(raw)
    except ValueError:
        print(f"[config] {name}={raw!r} is not an integer; using {default}", flush=True)
        return default
    if value <= 0:
        print(f"[config] {name}={value} must be positive; using {default}", flush=True)
        return default
    return value


# Backend selector; a blank value is normalised to the default "local".
TRANSCRIBE_BACKEND = (os.getenv("TRANSCRIBE_BACKEND") or "local").strip().lower() or "local"
# API key sent as a Bearer token; empty means "not configured".
OPENAI_API_KEY = (os.getenv("OPENAI_API_KEY", "") or "").strip()
# Base URL without a trailing slash so endpoint paths can be appended with "/".
OPENAI_BASE_URL = (
    (os.getenv("OPENAI_BASE_URL") or "").strip().rstrip("/") or "https://api.openai.com/v1"
)
# Model name sent with each transcription request; blank falls back to the default.
OPENAI_TRANSCRIBE_MODEL = (os.getenv("OPENAI_TRANSCRIBE_MODEL") or "").strip() or "whisper-1"
# Request timeout in seconds, passed to requests.post(timeout=...).
OPENAI_TRANSCRIBE_TIMEOUT = _positive_int_env("OPENAI_TRANSCRIBE_TIMEOUT", 600)
|
||||||
|
|
||||||
def _mode_allows(task: str) -> bool:
|
def _mode_allows(task: str) -> bool:
|
||||||
"""Gate tasks by worker role. In 'transcribe' mode only allow transcription of local files
|
"""Gate tasks by worker role. In 'transcribe' mode only allow transcription of local files
|
||||||
(including indexing and OWUI publish). "task" is one of: 'download','web','local','transcribe'."""
|
(including indexing and OWUI publish). "task" is one of: 'download','web','local','transcribe'."""
|
||||||
@@ -171,6 +179,73 @@ def run_transcribe_with_fallback(wav_path: Path, lang):
|
|||||||
raise
|
raise
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
def _openai_segments_from_payload(payload: dict) -> list[SimpleNamespace]:
    """Convert a ``verbose_json`` transcription payload into segment objects.

    Each returned object exposes ``start``, ``end`` and ``text`` attributes.
    Entries that are not JSON objects are skipped instead of aborting the
    whole transcription.
    """
    seg_objs: list[SimpleNamespace] = []
    for seg in payload.get("segments") or []:
        if not isinstance(seg, dict):
            continue
        seg_objs.append(
            SimpleNamespace(
                start=float(seg.get("start") or 0.0),
                end=float(seg.get("end") or 0.0),
                text=str(seg.get("text") or ""),
            )
        )

    if not seg_objs and payload.get("text"):
        # No per-segment timing was returned: emit a single segment covering
        # the reported duration so callers still receive the full text.
        seg_objs.append(
            SimpleNamespace(
                start=0.0,
                end=float(payload.get("duration") or 0.0),
                text=str(payload.get("text") or ""),
            )
        )
    return seg_objs


def run_transcribe_openai(wav_path: Path, lang_hint: str | None):
    """Transcribe audio via OpenAI's Whisper API, returning (segments, info, raw_payload).

    Args:
        wav_path: Audio file to upload; it is sent as ``audio/wav``.
        lang_hint: Optional language passed to the API as ``language``; also
            used as the reported language when the response carries none.

    Returns:
        A tuple ``(segments, info, payload)``: ``segments`` is a list of
        objects with ``start``/``end``/``text`` attributes, ``info`` has a
        ``language`` attribute, and ``payload`` is the decoded JSON response.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is empty or the response body is
            not a JSON object.
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status (logged, then re-raised).
        ValueError: If the response body is not valid JSON (logged, then
            re-raised).
    """
    if not OPENAI_API_KEY:
        raise RuntimeError("OPENAI_API_KEY must be set when TRANSCRIBE_BACKEND is 'openai'")

    model = OPENAI_TRANSCRIBE_MODEL or "whisper-1"
    url = f"{OPENAI_BASE_URL}/audio/transcriptions"
    headers = {"Authorization": f"Bearer {OPENAI_API_KEY}"}
    data: dict[str, str] = {
        "model": model,
        "response_format": "verbose_json",
    }
    if lang_hint:
        data["language"] = lang_hint

    start = time.time()
    try:
        with open(wav_path, "rb") as fh:
            resp = requests.post(
                url,
                headers=headers,
                data=data,
                files={"file": (wav_path.name, fh, "audio/wav")},
                timeout=OPENAI_TRANSCRIBE_TIMEOUT,
            )
    except requests.RequestException as exc:
        # Timeouts and connection errors never reach raise_for_status(), so
        # log them here to keep every remote failure visible under [openai].
        print(
            f"[openai] transcription request failed after {time.time() - start:.1f}s ({exc})",
            flush=True,
        )
        raise
    elapsed = time.time() - start

    try:
        resp.raise_for_status()
    except requests.HTTPError as exc:
        print(f"[openai] transcription failed ({exc}); response={resp.text[:400]}", flush=True)
        raise

    try:
        payload = resp.json()
    except ValueError as exc:
        print(
            f"[openai] transcription returned a non-JSON body ({exc}); response={resp.text[:400]}",
            flush=True,
        )
        raise
    if not isinstance(payload, dict):
        raise RuntimeError(
            f"[openai] unexpected transcription payload type: {type(payload).__name__}"
        )

    seg_objs = _openai_segments_from_payload(payload)

    language = payload.get("language") or lang_hint or ""
    info = SimpleNamespace(language=language)

    print(
        f"[openai] transcribed {wav_path.name} via {model} "
        f"in {elapsed:.1f}s; segments={len(seg_objs)} lang={language or 'unknown'}",
        flush=True,
    )

    return seg_objs, info, payload
|
||||||
|
|
||||||
def log(feed):
|
def log(feed):
|
||||||
try:
|
try:
|
||||||
with open(TRN / "_feed.log", "a", encoding="utf-8") as f:
|
with open(TRN / "_feed.log", "a", encoding="utf-8") as f:
|
||||||
@@ -908,7 +983,8 @@ def _save_partial(title: str, language: str, segs: list[dict]):
|
|||||||
print(f"[whisper] partial txt save failed: {e}", flush=True)
|
print(f"[whisper] partial txt save failed: {e}", flush=True)
|
||||||
|
|
||||||
def transcribe(media_path: Path):
|
def transcribe(media_path: Path):
|
||||||
print(f"[whisper] start transcribe: {media_path}", flush=True)
|
backend = TRANSCRIBE_BACKEND
|
||||||
|
print(f"[transcribe] start backend={backend}: {media_path}", flush=True)
|
||||||
# If paused, abort before any heavy work (no ffmpeg, no model load)
|
# If paused, abort before any heavy work (no ffmpeg, no model load)
|
||||||
if transcribe_paused():
|
if transcribe_paused():
|
||||||
print(f"[pause] transcribe: pause active before heavy work; aborting {media_path}", flush=True)
|
print(f"[pause] transcribe: pause active before heavy work; aborting {media_path}", flush=True)
|
||||||
@@ -927,12 +1003,13 @@ def transcribe(media_path: Path):
|
|||||||
|
|
||||||
title = media_path.stem
|
title = media_path.stem
|
||||||
base = TRN / title
|
base = TRN / title
|
||||||
|
resume_enabled = (backend != "openai") and WHISPER_RESUME
|
||||||
|
|
||||||
# Resume support: if a partial checkpoint exists, load it and trim input
|
# Resume support: if a partial checkpoint exists, load it and trim input
|
||||||
resume_segments = []
|
resume_segments = []
|
||||||
resume_offset = 0.0
|
resume_offset = 0.0
|
||||||
language_hint = None
|
language_hint = None
|
||||||
if WHISPER_RESUME:
|
if resume_enabled:
|
||||||
pjson, ptxt = _partial_paths(title)
|
pjson, ptxt = _partial_paths(title)
|
||||||
if pjson.exists():
|
if pjson.exists():
|
||||||
try:
|
try:
|
||||||
@@ -946,7 +1023,10 @@ def transcribe(media_path: Path):
|
|||||||
print(f"[whisper] failed to load partial: {e}", flush=True)
|
print(f"[whisper] failed to load partial: {e}", flush=True)
|
||||||
|
|
||||||
# If resuming, trim WAV from last end time
|
# If resuming, trim WAV from last end time
|
||||||
wav_for_run = trim_wav(wav, resume_offset, TMP)
|
if resume_enabled and resume_offset > 0.0:
|
||||||
|
wav_for_run = trim_wav(wav, resume_offset, TMP)
|
||||||
|
else:
|
||||||
|
wav_for_run = wav
|
||||||
|
|
||||||
# 2) Language selection
|
# 2) Language selection
|
||||||
lang = None if WHISPER_LANGUAGE.lower() == "auto" else WHISPER_LANGUAGE
|
lang = None if WHISPER_LANGUAGE.lower() == "auto" else WHISPER_LANGUAGE
|
||||||
@@ -954,14 +1034,18 @@ def transcribe(media_path: Path):
|
|||||||
# carry hint forward if available
|
# carry hint forward if available
|
||||||
lang = language_hint
|
lang = language_hint
|
||||||
|
|
||||||
# 3) Transcribe
|
# 3) Transcribe (local Whisper or OpenAI backend)
|
||||||
segments, info = run_transcribe_with_fallback(wav_for_run, lang)
|
payload = None
|
||||||
|
if backend == "openai":
|
||||||
|
segments, info, payload = run_transcribe_openai(wav_for_run, lang)
|
||||||
|
else:
|
||||||
|
segments, info = run_transcribe_with_fallback(wav_for_run, lang)
|
||||||
|
|
||||||
# Determine duration for progress; use full WAV duration for consistent % regardless of resume
|
# Determine duration for progress; use full WAV duration for consistent % regardless of resume
|
||||||
dur = media_duration_seconds(wav) or 0.0
|
dur = media_duration_seconds(wav) or 0.0
|
||||||
# Start wall clock timer for speed/ETA
|
# Start wall clock timer for speed/ETA
|
||||||
start_wall = time.time()
|
start_wall = time.time()
|
||||||
if WHISPER_RESUME and resume_offset and dur and resume_offset >= dur:
|
if resume_enabled and resume_offset and dur and resume_offset >= dur:
|
||||||
print(f"[whisper] resume offset {resume_offset:.2f}s >= duration {dur:.2f}s; resetting resume.", flush=True)
|
print(f"[whisper] resume offset {resume_offset:.2f}s >= duration {dur:.2f}s; resetting resume.", flush=True)
|
||||||
resume_offset = 0.0
|
resume_offset = 0.0
|
||||||
last_pct = -1
|
last_pct = -1
|
||||||
@@ -981,7 +1065,7 @@ def transcribe(media_path: Path):
|
|||||||
text_parts.append(s.text)
|
text_parts.append(s.text)
|
||||||
|
|
||||||
# --- Cooperative pause: save checkpoint and abort as soon as pause is requested ---
|
# --- Cooperative pause: save checkpoint and abort as soon as pause is requested ---
|
||||||
if transcribe_paused():
|
if resume_enabled and transcribe_paused():
|
||||||
try:
|
try:
|
||||||
pct = int(min(100, max(0, (end / dur) * 100))) if dur > 0 else 0
|
pct = int(min(100, max(0, (end / dur) * 100))) if dur > 0 else 0
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -1037,7 +1121,7 @@ def transcribe(media_path: Path):
|
|||||||
|
|
||||||
# periodic partial save
|
# periodic partial save
|
||||||
seg_count_since_save += 1
|
seg_count_since_save += 1
|
||||||
if WHISPER_RESUME and seg_count_since_save >= PARTIAL_SAVE_EVERY_SEGS:
|
if resume_enabled and seg_count_since_save >= PARTIAL_SAVE_EVERY_SEGS:
|
||||||
_save_partial(title, info.language or (WHISPER_LANGUAGE if WHISPER_LANGUAGE.lower() != "auto" else "en"), segs)
|
_save_partial(title, info.language or (WHISPER_LANGUAGE if WHISPER_LANGUAGE.lower() != "auto" else "en"), segs)
|
||||||
seg_count_since_save = 0
|
seg_count_since_save = 0
|
||||||
|
|
||||||
@@ -1109,7 +1193,7 @@ def transcribe(media_path: Path):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
# Remove partial checkpoints on success
|
# Remove partial checkpoints on success
|
||||||
if WHISPER_RESUME:
|
if resume_enabled:
|
||||||
try:
|
try:
|
||||||
pjson, ptxt = _partial_paths(title)
|
pjson, ptxt = _partial_paths(title)
|
||||||
if pjson.exists(): pjson.unlink()
|
if pjson.exists(): pjson.unlink()
|
||||||
@@ -1124,7 +1208,10 @@ def transcribe(media_path: Path):
|
|||||||
print(f"[whisper] avg speed ~{avg_rtf:0.2f}x (audio_seconds / wall_seconds)", flush=True)
|
print(f"[whisper] avg speed ~{avg_rtf:0.2f}x (audio_seconds / wall_seconds)", flush=True)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
print(f"[whisper] finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s", flush=True)
|
print(
|
||||||
|
f"[transcribe] backend={backend} finished: {media_path} lang={info.language} segments={len(segs)} dur={dur:.2f}s",
|
||||||
|
flush=True,
|
||||||
|
)
|
||||||
return base
|
return base
|
||||||
|
|
||||||
|
|
||||||
@@ -1776,4 +1863,4 @@ def handle_url(url: str):
|
|||||||
log({**info, **{"status":"done"}})
|
log({**info, **{"status":"done"}})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log({"url": url, "status":"error", "error": str(e)})
|
log({"url": url, "status":"error", "error": str(e)})
|
||||||
raise
|
raise
|
||||||
|
@@ -11,6 +11,11 @@ services:
|
|||||||
TMP_ROOT: /tmpdl
|
TMP_ROOT: /tmpdl
|
||||||
WHISPER_MODEL: large-v3
|
WHISPER_MODEL: large-v3
|
||||||
WHISPER_PRECISION: int8
|
WHISPER_PRECISION: int8
|
||||||
|
TRANSCRIBE_BACKEND: ${TRANSCRIBE_BACKEND:-local}
|
||||||
|
OPENAI_API_KEY: ${OPENAI_API_KEY:-}
|
||||||
|
OPENAI_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1}
|
||||||
|
OPENAI_TRANSCRIBE_MODEL: ${OPENAI_TRANSCRIBE_MODEL:-whisper-1}
|
||||||
|
OPENAI_TRANSCRIBE_TIMEOUT: ${OPENAI_TRANSCRIBE_TIMEOUT:-600}
|
||||||
OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080}
|
OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080}
|
||||||
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}
|
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}
|
||||||
OPENWEBUI_KB_NAME: ${OPENWEBUI_KB_NAME:-Homelab Library}
|
OPENWEBUI_KB_NAME: ${OPENWEBUI_KB_NAME:-Homelab Library}
|
||||||
@@ -49,6 +54,11 @@ services:
|
|||||||
WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
|
WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
|
||||||
WHISPER_RESUME: ${WHISPER_RESUME:-1}
|
WHISPER_RESUME: ${WHISPER_RESUME:-1}
|
||||||
WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
|
WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
|
||||||
|
TRANSCRIBE_BACKEND: ${TRANSCRIBE_BACKEND:-local}
|
||||||
|
OPENAI_API_KEY: ${OPENAI_API_KEY:-}
|
||||||
|
OPENAI_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1}
|
||||||
|
OPENAI_TRANSCRIBE_MODEL: ${OPENAI_TRANSCRIBE_MODEL:-whisper-1}
|
||||||
|
OPENAI_TRANSCRIBE_TIMEOUT: ${OPENAI_TRANSCRIBE_TIMEOUT:-600}
|
||||||
WORKER_MODE: all
|
WORKER_MODE: all
|
||||||
OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080}
|
OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080}
|
||||||
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}
|
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}
|
||||||
@@ -89,6 +99,11 @@ services:
|
|||||||
WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
|
WHISPER_LOG_SEGMENTS: ${WHISPER_LOG_SEGMENTS:-1}
|
||||||
WHISPER_RESUME: ${WHISPER_RESUME:-1}
|
WHISPER_RESUME: ${WHISPER_RESUME:-1}
|
||||||
WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
|
WHISPER_PARTIAL_SAVE_EVERY_SEGS: ${WHISPER_PARTIAL_SAVE_EVERY_SEGS:-20}
|
||||||
|
TRANSCRIBE_BACKEND: ${TRANSCRIBE_BACKEND:-local}
|
||||||
|
OPENAI_API_KEY: ${OPENAI_API_KEY:-}
|
||||||
|
OPENAI_BASE_URL: ${OPENAI_BASE_URL:-https://api.openai.com/v1}
|
||||||
|
OPENAI_TRANSCRIBE_MODEL: ${OPENAI_TRANSCRIBE_MODEL:-whisper-1}
|
||||||
|
OPENAI_TRANSCRIBE_TIMEOUT: ${OPENAI_TRANSCRIBE_TIMEOUT:-600}
|
||||||
WORKER_MODE: transcribe
|
WORKER_MODE: transcribe
|
||||||
OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080}
|
OPENWEBUI_URL: ${OPENWEBUI_CONTAINER_URL:-http://open-webui:8080}
|
||||||
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}
|
OPENWEBUI_API_KEY: ${OPENWEBUI_API_KEY}
|
||||||
|
Reference in New Issue
Block a user