Actual worker differentiation
This commit is contained in:
150
app/worker.py
150
app/worker.py
@@ -1,4 +1,6 @@
|
||||
import os, subprocess, shutil, json, re, orjson, requests
|
||||
import os, subprocess, shutil, json, re, orjson, requests, unicodedata
|
||||
from rq import Queue
|
||||
from redis import Redis
|
||||
from pathlib import Path
|
||||
import math
|
||||
import difflib
|
||||
@@ -36,6 +38,10 @@ OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/")
|
||||
OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "")
|
||||
OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library")
|
||||
|
||||
# Redis-backed job queue settings and offload toggle
|
||||
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0").strip()
|
||||
OFFLOAD_TRANSCRIBE = os.getenv("OFFLOAD_TRANSCRIBE", "1").lower() not in ("0", "false", "no")
|
||||
|
||||
# Worker role selection
|
||||
WORKER_MODE = os.getenv("WORKER_MODE", "all").strip().lower() # 'all' or 'transcribe'
|
||||
JOB_QUEUES = [q.strip() for q in os.getenv("JOB_QUEUES", "default").split(",") if q.strip()]
|
||||
@@ -1122,6 +1128,15 @@ def slugify(text):
|
||||
text = re.sub(r'[^A-Za-z0-9\-._ ]+', '', text).strip().replace(' ', '_')
|
||||
return text[:120] or 'page'
|
||||
|
||||
def _norm(s: str | None) -> str:
|
||||
"""Normalize strings for stable comparisons across Unicode lookalikes and stray whitespace."""
|
||||
if s is None:
|
||||
return ""
|
||||
try:
|
||||
return unicodedata.normalize("NFKC", s).strip()
|
||||
except Exception:
|
||||
return (s or "").strip()
|
||||
|
||||
def save_web_snapshot(url: str):
|
||||
r = _requests.get(url, timeout=30, headers={"User-Agent":"Mozilla/5.0"})
|
||||
r.raise_for_status()
|
||||
@@ -1183,8 +1198,9 @@ def owui_get_or_create_kb():
|
||||
r.raise_for_status()
|
||||
body = r.json()
|
||||
items = body if isinstance(body, list) else body.get("data", [])
|
||||
# Prefer exact name match; if multiple, pick the most recently updated
|
||||
matches = [kb for kb in items if (kb.get("name") or "") == OWUI_KB]
|
||||
# Prefer exact normalized name match; if multiple, pick the most recently updated
|
||||
kb_target = _norm(OWUI_KB)
|
||||
matches = [kb for kb in items if _norm(kb.get("name")) == kb_target]
|
||||
if matches:
|
||||
try:
|
||||
matches.sort(key=lambda k: k.get("updated_at") or 0, reverse=True)
|
||||
@@ -1213,8 +1229,9 @@ def owui_get_or_create_kb():
|
||||
rr.raise_for_status()
|
||||
body = rr.json()
|
||||
items = body if isinstance(body, list) else body.get("data", [])
|
||||
kb_target = _norm(OWUI_KB)
|
||||
for kb in items:
|
||||
if (kb.get("name") or "") == OWUI_KB:
|
||||
if _norm(kb.get("name")) == kb_target:
|
||||
return kb.get("id")
|
||||
except Exception:
|
||||
pass
|
||||
@@ -1224,14 +1241,21 @@ def owui_upload_and_attach(path: Path, kb_id: str):
|
||||
with open(path, "rb") as f:
|
||||
r = requests.post(f"{OWUI_URL}/api/v1/files/", headers=owui_headers(), files={"file": (path.name, f)}, timeout=60*10)
|
||||
r.raise_for_status()
|
||||
file_id = r.json()["data"]["id"]
|
||||
up = r.json()
|
||||
file_id = (up.get("id") or (up.get("data") or {}).get("id"))
|
||||
if not file_id:
|
||||
raise RuntimeError(f"OWUI upload: could not get file id from response: {up}")
|
||||
r = requests.post(
|
||||
f"{OWUI_URL}/api/v1/knowledge/{kb_id}/file/add",
|
||||
headers={**owui_headers(), "Content-Type": "application/json"},
|
||||
data=orjson.dumps({"file_id": file_id}),
|
||||
timeout=60,
|
||||
timeout=180,
|
||||
)
|
||||
r.raise_for_status()
|
||||
try:
|
||||
time.sleep(0.5)
|
||||
except Exception:
|
||||
pass
|
||||
return True
|
||||
|
||||
def publish_to_openwebui(paths):
|
||||
@@ -1239,6 +1263,9 @@ def publish_to_openwebui(paths):
|
||||
return
|
||||
try:
|
||||
kb_id = owui_get_or_create_kb()
|
||||
if not kb_id:
|
||||
print("[owui] KB resolve failed; skipping attach to avoid accidental duplicates", flush=True)
|
||||
return
|
||||
for p in paths:
|
||||
p = Path(p)
|
||||
if not p.exists():
|
||||
@@ -1250,6 +1277,61 @@ def publish_to_openwebui(paths):
|
||||
except Exception as e:
|
||||
log({"status": "owui_error", "error": str(e)})
|
||||
|
||||
# --------- Post-transcribe pipeline and job/queue helpers ---------
|
||||
|
||||
def _postprocess_after_transcribe(media_path: Path, base: Path):
    """Common steps after we have a `base` transcript path: index, publish, NFO, artwork.

    Every stage is best-effort: a failure in one stage is logged to stdout and
    does not block the remaining stages.
    """
    # 1) Push the JSON transcript into the Meilisearch index.
    try:
        index_meili(base.with_suffix(".json"))
    except Exception as e:
        print(f"[post] meili index failed: {e}", flush=True)
    # 2) Attach the plain-text transcript to the Open WebUI knowledge base.
    try:
        publish_to_openwebui([base.with_suffix(".txt")])
    except Exception as e:
        print(f"[post] owui publish failed: {e}", flush=True)
    # 3) Write NFO metadata (and optionally artwork) from filename-derived fallbacks.
    try:
        stem = media_path.stem
        show_name = media_path.parent.name
        fallback = {
            "title": stem,
            "episode_title": stem,
            "show": show_name,
            "description": "",
            "pubdate": _extract_date_from_stem(stem),
            "duration_sec": media_duration_seconds(media_path),
            "image": "",
            "guid": "",
        }
        meta = build_meta_from_sources(media_path, show_name, fallback, ep=None)
        # NOTE(review): reads the transcript from TRN/<stem>.txt rather than
        # base.with_suffix(".txt") — assumes the two coincide; confirm upstream.
        transcript_text = (TRN / stem).with_suffix(".txt").read_text(encoding="utf-8")
        write_episode_nfo(media_path, meta, transcript_text)
        try:
            save_episode_artwork(meta.get("image"), media_path, meta.get("show"))
        except Exception:
            # Artwork is optional; never fail the NFO step over it.
            pass
    except Exception as e:
        print(f"[post] NFO write failed: {e}", flush=True)
|
||||
|
||||
def transcribe_job(path_str: str):
    """RQ job: heavy transcription only. Safe to import by dotted path 'worker.transcribe_job'.

    Runs Whisper on the given media file, then the shared post-processing
    pipeline (index / publish / NFO / artwork). Returns the transcript base
    path as a string so it lands in the RQ job result.
    """
    media = Path(path_str)
    transcript_base = transcribe(media)
    _postprocess_after_transcribe(media, transcript_base)
    return str(transcript_base)
|
||||
|
||||
def enqueue_transcribe(path: Path) -> bool:
    """Enqueue a transcription job to the 'transcribe' queue. Returns True on success.

    Any failure (Redis down, serialization, etc.) is logged and reported as
    False so the caller can fall back to transcribing inline.
    """
    try:
        redis_conn = Redis.from_url(REDIS_URL)
        queue = Queue("transcribe", connection=redis_conn, default_timeout=60*60*24)
        # Use dotted path so workers in other processes can import
        queue.enqueue("worker.transcribe_job", str(path), job_timeout=60*60*24)
        print(f"[queue] enqueued transcribe job for {path}", flush=True)
        return True
    except Exception as e:
        print(f"[queue] enqueue failed, will transcribe inline: {e}", flush=True)
        return False
|
||||
|
||||
def handle_local_file(path_str: str):
|
||||
"""Transcribe & index a local media file that already exists in /library.
|
||||
If a sidecar .txt/.srt/.vtt exists, use it instead of running Whisper.
|
||||
@@ -1352,30 +1434,12 @@ def handle_local_file(path_str: str):
|
||||
log({**info, **{"status": "done", "note": "reused_repo_transcript"}})
|
||||
return
|
||||
|
||||
# 2) Otherwise, run transcription
|
||||
# 2) Otherwise, run transcription (offload to queue if enabled and not in transcribe-only worker)
|
||||
if OFFLOAD_TRANSCRIBE and WORKER_MODE != "transcribe" and enqueue_transcribe(p):
|
||||
log({**info, **{"status": "queued_transcribe"}})
|
||||
return
|
||||
base = transcribe(p)
|
||||
index_meili(base.with_suffix(".json"))
|
||||
publish_to_openwebui([base.with_suffix(".txt")])
|
||||
try:
|
||||
fallback = {
|
||||
"title": title,
|
||||
"episode_title": title,
|
||||
"show": p.parent.name,
|
||||
"description": "",
|
||||
"pubdate": _extract_date_from_stem(title),
|
||||
"duration_sec": media_duration_seconds(p),
|
||||
"image": "",
|
||||
"guid": "",
|
||||
}
|
||||
meta = build_meta_from_sources(p, p.parent.name, fallback, ep=None)
|
||||
ttxt = (TRN / title).with_suffix(".txt").read_text(encoding="utf-8")
|
||||
write_episode_nfo(p, meta, ttxt)
|
||||
try:
|
||||
save_episode_artwork(meta.get("image"), p, meta.get("show"))
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as e:
|
||||
print(f"[post] NFO write failed: {e}", flush=True)
|
||||
_postprocess_after_transcribe(p, base)
|
||||
log({**info, **{"status": "done"}})
|
||||
except Exception as e:
|
||||
log({"url": path_str, "status": "error", "error": str(e)})
|
||||
@@ -1590,31 +1654,11 @@ def handle_url(url: str):
|
||||
if repo_json:
|
||||
base = reuse_repo_transcript(dest, repo_json)
|
||||
if not base:
|
||||
if OFFLOAD_TRANSCRIBE and WORKER_MODE != "transcribe" and enqueue_transcribe(dest):
|
||||
log({**info, **{"status": "queued_transcribe"}})
|
||||
continue
|
||||
base = transcribe(dest)
|
||||
index_meili(base.with_suffix(".json"))
|
||||
publish_to_openwebui([base.with_suffix(".txt")])
|
||||
try:
|
||||
# Build metadata from RSS (if matched), yt-dlp info.json, and sensible fallbacks
|
||||
fallback = {
|
||||
"title": dest.stem,
|
||||
"episode_title": dest.stem,
|
||||
"show": uploader,
|
||||
"description": "",
|
||||
"pubdate": _extract_date_from_stem(dest.stem),
|
||||
"duration_sec": media_duration_seconds(dest),
|
||||
"image": "",
|
||||
"guid": "",
|
||||
}
|
||||
meta = build_meta_from_sources(dest, uploader, fallback, ep if 'ep' in locals() else None)
|
||||
ttxt = base.with_suffix(".txt").read_text(encoding="utf-8")
|
||||
write_episode_nfo(dest, meta, ttxt)
|
||||
# Save local artwork for Plex/Kodi from meta image url
|
||||
try:
|
||||
save_episode_artwork(meta.get("image"), dest, meta.get("show"))
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as e:
|
||||
print(f"[post] NFO write failed: {e}", flush=True)
|
||||
_postprocess_after_transcribe(dest, base)
|
||||
log({**info, **{"status":"done"}})
|
||||
except Exception as e:
|
||||
log({"url": url, "status":"error", "error": str(e)})
|
||||
|
@@ -32,6 +32,7 @@ services:
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
# Main worker: handles downloads, indexing, RSS, OWUI, etc. (no heavy Whisper)
|
||||
podx-worker:
|
||||
build: ./app
|
||||
container_name: podx-worker
|
||||
@@ -71,6 +72,7 @@ services:
|
||||
extra_hosts:
|
||||
- host.docker.internal:host-gateway
|
||||
|
||||
# Transcribe-only worker: listens to the "transcribe" queue and runs Whisper jobs
|
||||
podx-worker-transcribe:
|
||||
build: ./app
|
||||
container_name: podx-worker-transcribe
|
||||
@@ -150,6 +152,7 @@ services:
|
||||
# - /mnt/secure/cookies.txt:/config/cookies.txt:ro
|
||||
restart: unless-stopped
|
||||
|
||||
# Scanner: watches /library and enqueues jobs (heavy jobs go to "transcribe" queue)
|
||||
podx-scanner:
|
||||
build: ./app
|
||||
container_name: podx-scanner
|
||||
@@ -160,6 +163,7 @@ services:
|
||||
REDIS_URL: redis://redis:6379/0
|
||||
LIBRARY_ROOT: /library
|
||||
TRANSCRIPT_ROOT: /transcripts
|
||||
TRANSCRIBE_QUEUE: transcribe
|
||||
SCAN_INTERVAL: 30
|
||||
JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
|
||||
JOB_TTL: ${JOB_TTL:-86400}
|
||||
|
Reference in New Issue
Block a user