Podcast sync

105   app/resolver.py   (new file)
@@ -0,0 +1,105 @@
+# resolver.py
+from __future__ import annotations
+
+import json, os, re, subprocess
+from pathlib import Path
+from datetime import datetime, timezone
+from typing import Optional, Dict, Any, Tuple, List
+
+try:
+    from rapidfuzz import fuzz, process
+except Exception:
+    fuzz = None
+    process = None
+
+
+def _norm(s: str) -> str:
+    s = s.lower()
+    s = re.sub(r"[\[\]\(\)\{\}|_]+", " ", s)
+    s = re.sub(r"[^0-9a-zá-žà-ÿ\u00C0-\u024F\s]+", " ", s)  # keep latin accents, cz/diacritics
+    s = re.sub(r"\s+", " ", s).strip()
+    return s
+
+
+def _title_from_filename(p: Path) -> str:
+    name = p.stem  # drop extension
+    # common yt-dlp patterns like "YYYYMMDD - Title"
+    name = re.sub(r"^\d{8}\s*-\s*", "", name)
+    return name
+
+
+def _ffprobe_duration_seconds(p: Path) -> Optional[int]:
+    try:
+        out = subprocess.check_output([
+            "ffprobe", "-v", "error", "-show_entries", "format=duration",
+            "-of", "default=nw=1:nk=1", str(p)
+        ], stderr=subprocess.STDOUT, text=True).strip()
+        return int(float(out))
+    except Exception:
+        return None
+
+
+def load_index(index_path: Path) -> List[Dict[str, Any]]:
+    if not index_path.exists():
+        return []
+    with index_path.open("r", encoding="utf-8") as f:
+        data = json.load(f)
+    # expected per-item keys:
+    #   title, pubdate_ts (int), duration_s (int or null),
+    #   transcript_urls: {"srt": str|None, "vtt": str|None, "txt": str|None},
+    #   audio_url, guid, feed_url
+    return data if isinstance(data, list) else []
+
+
+def match_episode(
+    media_path: Path,
+    index_items: List[Dict[str, Any]],
+    duration_tolerance_s: int = 120,
+    min_ratio: int = 82,
+    date_window_days: int = 14,
+) -> Optional[Dict[str, Any]]:
+    title_guess = _title_from_filename(media_path)
+    tnorm = _norm(title_guess)
+    if not tnorm:
+        return None
+
+    media_secs = _ffprobe_duration_seconds(media_path)
+    media_date = None
+    # try to parse upload date prefix in filename if present
+    m = re.search(r"(\d{8})", media_path.stem)
+    if m:
+        try:
+            media_date = datetime.strptime(m.group(1), "%Y%m%d").replace(tzinfo=timezone.utc)
+        except Exception:
+            media_date = None
+
+    candidates = []
+    for item in index_items:
+        item_title = _norm(item.get("title", ""))
+        if not item_title:
+            continue
+        ratio = (fuzz.token_sort_ratio(tnorm, item_title) if fuzz else (100 if tnorm == item_title else 0))
+        if ratio < min_ratio:
+            continue
+
+        # duration filter (if both known)
+        ok_duration = True
+        if media_secs and item.get("duration_s"):
+            ok_duration = abs(media_secs - int(item["duration_s"])) <= duration_tolerance_s
+
+        # date window (if both known)
+        ok_date = True
+        if media_date and item.get("pubdate_ts"):
+            dt_item = datetime.fromtimestamp(int(item["pubdate_ts"]), tz=timezone.utc)
+            delta_days = abs((media_date - dt_item).days)
+            ok_date = delta_days <= date_window_days
+
+        if ok_duration and ok_date:
+            candidates.append((ratio, item))
+
+    if not candidates:
+        return None
+    candidates.sort(key=lambda x: x[0], reverse=True)
+    return candidates[0][1]
+
+
+def choose_transcript_url(item: Dict[str, Any]) -> Optional[Tuple[str, str]]:
+    urls = item.get("transcript_urls") or {}
+    # prefer text/plain, then VTT, then SRT:
+    if urls.get("txt"): return (urls["txt"], "txt")
+    if urls.get("vtt"): return (urls["vtt"], "vtt")
+    if urls.get("srt"): return (urls["srt"], "srt")
+    return None
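A minimal usage sketch (not part of the commit): how these helpers might be chained for one downloaded file. The import path, index location, and media filename are assumptions for illustration; load_index() expects the flat-list format described in its own comment (title, pubdate_ts, duration_s, transcript_urls, ...).

    from pathlib import Path
    from app.resolver import load_index, match_episode, choose_transcript_url

    # Hypothetical paths; point these at wherever the episode index and media actually live.
    index = load_index(Path("/transcripts/episodes.json"))
    media = Path("/library/Some Show/20240101 - Some Episode.mp3")

    episode = match_episode(media, index)          # fuzzy title + duration + date window
    if episode:
        picked = choose_transcript_url(episode)    # (url, "txt"|"vtt"|"srt") or None
        if picked:
            url, kind = picked
            print(f"transcript ({kind}): {url}")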
@@ -1,388 +1,315 @@
-#!/usr/bin/env python3
-"""
-RSS ingester for PodX
-- Reads feeds from env var RSS_FEEDS (comma-separated) *and*/or from a file (FEEDS_FILE, default /library/feeds.txt)
-- Fetches RSS with ETag/Last-Modified caching to avoid re-downloading unchanged feeds
-- Saves audio to LIBRARY_ROOT/<podcast>/<YYYYMMDD - title>.<ext>
-- Saves transcript sidecars when `<podcast:transcript>` links are present (prefers TextWithTimestamps → WebVTT → SRT → TXT)
-- Enqueues `worker.handle_local_file` for indexing/transcription (worker will skip Whisper if a transcript sidecar exists)
-- Keeps a small state JSON with per-feed ETag/Last-Modified and per-item processed GUIDs to avoid duplicate work
-
-Environment variables (with sane defaults):
-  MEILI_URL            (unused directly here, but kept for parity)
-  REDIS_URL            redis://redis:6379/0
-  LIBRARY_ROOT         /library
-  TRANSCRIPT_ROOT      /transcripts
-  RSS_FEEDS            ""  (comma-separated list)
-  FEEDS_FILE           /library/feeds.txt
-  RSS_SCAN_MINUTES     120
-  RSS_ONCE             0  ("1" to run once and exit)
-  USER_AGENT           podx-rss/1.0 (+local-archive)
-  RSS_STATE_FILE       /library/.rss_state.json
-  RSS_CONNECT_TIMEOUT  15 (seconds)
-  RSS_READ_TIMEOUT     60 (seconds)
-  AUDIO_MAX_MB         4096 (skip larger-than if HEAD reveals size > max, 0 = unlimited)
-"""
-import os
-import re
-import sys
-import json
-import time
-import logging
-import itertools
-from datetime import datetime
-from pathlib import Path
-from typing import Dict, List, Tuple, Optional
-from urllib.parse import urlparse
-
-import requests
-import xml.etree.ElementTree as ET
-import redis
-from rq import Queue
-
-logging.basicConfig(level=logging.INFO, format='[rss] %(message)s')
-log = logging.getLogger("rss")
-
-# Config
-MEILI_URL = os.getenv("MEILI_URL", "http://meili:7700")
-REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
-LIBRARY_ROOT = Path(os.getenv("LIBRARY_ROOT", "/library"))
-TRANSCRIPT_ROOT = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
-RSS_FEEDS_ENV = [s.strip() for s in os.getenv("RSS_FEEDS", "").split(",") if s.strip()]
-FEEDS_FILE = Path(os.getenv("FEEDS_FILE", str(LIBRARY_ROOT / "feeds.txt")))
-RSS_SCAN_MINUTES = int(os.getenv("RSS_SCAN_MINUTES", "120"))
-RSS_ONCE = os.getenv("RSS_ONCE", "0") == "1"
-USER_AGENT = os.getenv("USER_AGENT", "podx-rss/1.0 (+local-archive)")
-STATE_FILE = Path(os.getenv("RSS_STATE_FILE", str(LIBRARY_ROOT / ".rss_state.json")))
-CONNECT_TIMEOUT = float(os.getenv("RSS_CONNECT_TIMEOUT", "15"))
-READ_TIMEOUT = float(os.getenv("RSS_READ_TIMEOUT", "60"))
-AUDIO_MAX_MB = int(os.getenv("AUDIO_MAX_MB", "4096"))
-
-# Redis queue
-r = redis.from_url(REDIS_URL)
-q = Queue("default", connection=r)
-
-# HTTP session
-SESSION = requests.Session()
-SESSION.headers.update({"User-Agent": USER_AGENT})
-
-# Namespaces commonly used in podcast RSS
-NS = {
-    "podcast": "https://podcastindex.org/namespace/1.0",
-    "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
-    "media": "http://search.yahoo.com/mrss/",
-    "content": "http://purl.org/rss/1.0/modules/content/",
-    "dc": "http://purl.org/dc/elements/1.1/",
-}
-
-# ----------------- helpers -----------------
-
-def safe(s: str) -> str:
-    s = re.sub(r"[\\/:*?\"<>|]", "_", s)
-    # collapse whitespace and trim
-    s = re.sub(r"\s+", " ", s).strip()
-    # guard against very long filenames
-    return s[:200] if len(s) > 200 else s
-
-
-def load_state() -> Dict:
-    if STATE_FILE.exists():
-        try:
-            return json.loads(STATE_FILE.read_text("utf-8"))
-        except Exception:
-            log.warning("State file unreadable, starting fresh")
-    return {"feeds": {}, "items": {}}  # items keyed by GUID / enclosure URL
-
-
-def save_state(state: Dict) -> None:
-    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
-    tmp = STATE_FILE.with_suffix(STATE_FILE.suffix + ".tmp")
-    tmp.write_text(json.dumps(state, ensure_ascii=False, indent=2))
-    tmp.replace(STATE_FILE)
-
-
-def load_feeds() -> List[str]:
-    feeds = list(RSS_FEEDS_ENV)
-    if FEEDS_FILE.exists():
-        try:
-            for line in FEEDS_FILE.read_text("utf-8").splitlines():
-                s = line.strip()
-                if not s or s.startswith("#"):  # allow comments
-                    continue
-                feeds.append(s)
-        except Exception as e:
-            log.warning(f"Failed to read {FEEDS_FILE}: {e}")
-    # de-dup preserving order
-    seen = set()
-    uniq = []
-    for f in feeds:
-        if f not in seen:
-            uniq.append(f)
-            seen.add(f)
-    return uniq
-
-
-def fetch(url: str, *, etag: Optional[str]=None, modified: Optional[str]=None, as_text=False):
-    headers = {}
-    if etag:
-        headers["If-None-Match"] = etag
-    if modified:
-        headers["If-Modified-Since"] = modified
-    resp = SESSION.get(url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
-    if resp.status_code == 304:
-        return None, 304, None, None
-    resp.raise_for_status()
-    new_etag = resp.headers.get("ETag")
-    new_mod = resp.headers.get("Last-Modified")
-    return (resp.text if as_text else resp.content), resp.status_code, new_etag, new_mod
-
-
-def head_size(url: str) -> Optional[int]:
-    try:
-        h = SESSION.head(url, allow_redirects=True, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
-        if h.ok:
-            cl = h.headers.get("Content-Length")
-            return int(cl) if cl and cl.isdigit() else None
-    except Exception:
-        return None
-    return None
-
-
-def best_transcript_links(item) -> List[str]:
-    links: List[Tuple[int, str, str]] = []
-    # Try explicit QName first
-    for tag in item.findall(".//{https://podcastindex.org/namespace/1.0}transcript"):
-        t = (tag.attrib.get("type") or "").lower()
-        url = tag.attrib.get("url")
-        if url:
-            links.append((0, t, url))
-    # Namespace-prefixed fallback
-    for tag in item.findall(".//podcast:transcript", NS):
-        t = (tag.attrib.get("type") or "").lower()
-        url = tag.attrib.get("url")
-        if url:
-            links.append((0, t, url))
-
-    order = [
-        "text/plain",  # often used for TextWithTimestamps
-        "application/json",
-        "text/vtt",
-        "application/srt",
-        "application/x-subrip",
-        "application/text",
-        "text/plain; charset=utf-8",
-    ]
-    key = {v: i for i, v in enumerate(order)}
-    ranked = []
-    for _, t, url in links:
-        ranked.append((key.get(t, 999), t, url))
-    ranked.sort()
-    return [u for _, _, u in ranked]
-
-
-def get_enclosure(item) -> Optional[str]:
-    enc = item.find("enclosure")
-    if enc is not None and enc.attrib.get("url"):
-        return enc.attrib["url"]
-    mc = item.find("media:content", NS)
-    if mc is not None and mc.attrib.get("url"):
-        return mc.attrib["url"]
-    return None
-
-
-def parse_pubdate(item) -> datetime:
-    # Try common fields
-    candidates = [
-        item.findtext("pubDate"),
-        item.findtext("dc:date", namespaces=NS),
-        item.findtext("{http://purl.org/dc/elements/1.1/}date"),
-    ]
-    for pd in filter(None, candidates):
-        s = pd.strip()
-        # Try several common formats
-        for fmt in [
-            "%a, %d %b %Y %H:%M:%S %z",
-            "%a, %d %b %Y %H:%M:%S",
-            "%Y-%m-%dT%H:%M:%S%z",
-            "%Y-%m-%dT%H:%M:%S",
-        ]:
-            try:
-                return datetime.strptime(s, fmt)
-            except Exception:
-                pass
-    return datetime.utcnow()
-
-
-def save_bytes(path: Path, data: bytes) -> None:
-    path.parent.mkdir(parents=True, exist_ok=True)
-    tmp = path.with_suffix(path.suffix + ".part")
-    tmp.write_bytes(data)
-    tmp.replace(path)
-
-
-def decide_audio_ext(audio_url: str) -> str:
-    p = urlparse(audio_url)
-    low = p.path.lower()
-    if low.endswith(".m4a"):
-        return ".m4a"
-    if low.endswith(".mp3"):
-        return ".mp3"
-    if low.endswith(".ogg") or low.endswith(".oga"):
-        return ".ogg"
-    if low.endswith(".aac"):
-        return ".aac"
-    if low.endswith(".wav"):
-        return ".wav"
-    return ".mp3"
-
-
-def item_key(item) -> str:
-    # Prefer GUID value, else enclosure URL, else title+date
-    guid = item.findtext("guid")
-    if guid:
-        return guid.strip()
-    enc = get_enclosure(item)
-    if enc:
-        return enc
-    title = item.findtext("title") or "Episode"
-    pub = parse_pubdate(item).strftime("%Y%m%d")
-    return f"{pub}:{title}"
-
-
-# ----------------- core ingest -----------------
-
-def ingest_feed(url: str, state: Dict) -> int:
-    fstate = state.setdefault("feeds", {}).setdefault(url, {})
-    etag = fstate.get("etag")
-    mod = fstate.get("modified")
-
-    log.info(f"Fetch RSS: {url}")
-    try:
-        data, status, new_etag, new_mod = fetch(url, etag=etag, modified=mod, as_text=True)
-    except Exception as e:
-        log.error(f"Fetch failed: {e}")
-        return 0
-
-    if status == 304:
-        log.info("Not modified")
-        return 0
-
-    if new_etag:
-        fstate["etag"] = new_etag
-    if new_mod:
-        fstate["modified"] = new_mod
-
-    try:
-        root = ET.fromstring(data)
-    except Exception as e:
-        log.error(f"XML parse error: {e}")
-        return 0
-
-    channel_title = safe((root.findtext("channel/title") or "Podcast"))
-
-    new_items = 0
-    for item in root.findall("channel/item"):
-        key = item_key(item)
-        already = state.setdefault("items", {})
-        if already.get(key):
-            continue
-
-        title = safe(item.findtext("title") or "Episode")
-        pub = parse_pubdate(item)
-        date_prefix = pub.strftime("%Y%m%d")
-        base = f"{date_prefix} - {title}"
-
-        audio_url = get_enclosure(item)
-        if not audio_url:
-            log.info(f"Skip (no enclosure): {title}")
-            already[key] = {"skipped": "no_enclosure"}
-            continue
-
-        # HEAD size guard (optional)
-        if AUDIO_MAX_MB > 0:
-            size = head_size(audio_url)
-            if size and size > AUDIO_MAX_MB * 1024 * 1024:
-                log.info(f"Skip (size>{AUDIO_MAX_MB}MB): {title}")
-                already[key] = {"skipped": "too_large", "size": size}
-                continue
-
-        path_ext = decide_audio_ext(audio_url)
-        audio_out = LIBRARY_ROOT / channel_title / f"{base}{path_ext}"
-        transcript_links = best_transcript_links(item)
-
-        # If audio exists and a transcript sidecar exists → just enqueue index
-        sidecars = list((TRANSCRIPT_ROOT / channel_title).glob(f"{base}.*"))
-        have_transcript = len(sidecars) > 0
-        if audio_out.exists() and have_transcript:
-            log.info(f"Skip download, enqueue index (have audio+transcript): {audio_out.name}")
-            try:
-                q.enqueue("worker.handle_local_file", str(audio_out), job_timeout=4*3600, result_ttl=86400, failure_ttl=86400)
-            except Exception as e:
-                log.warning(f"Enqueue failed: {e}")
-            already[key] = {"done": True, "audio": str(audio_out)}
-            new_items += 1
-            continue
-
-        # Download audio
-        try:
-            log.info(f"Downloading audio → {audio_out}")
-            content, _, _, _ = fetch(audio_url, as_text=False)
-            save_bytes(audio_out, content)
-        except Exception as e:
-            log.warning(f"Audio failed: {e}")
-            already[key] = {"error": f"audio:{e}"}
-            continue
-
-        # Download transcript if present (take first best)
-        transcript_out = None
-        for turl in transcript_links:
-            try:
-                ext = ".vtt" if "vtt" in turl.lower() else ".srt" if "srt" in turl.lower() else ".txt"
-                tout = TRANSCRIPT_ROOT / channel_title / f"{base}{ext}"
-                log.info(f"Downloading transcript → {tout} ({turl})")
-                tdata, _, _, _ = fetch(turl, as_text=False)
-                save_bytes(tout, tdata)
-                transcript_out = tout
-                break
-            except Exception as e:
-                log.warning(f"Transcript fetch failed ({turl}): {e}")
-                continue
-
-        # Enqueue for indexing/transcription
-        try:
-            q.enqueue("worker.handle_local_file", str(audio_out), job_timeout=4*3600, result_ttl=86400, failure_ttl=86400)
-        except Exception as e:
-            log.warning(f"Enqueue failed: {e}")
-
-        already[key] = {"done": True, "audio": str(audio_out), "transcript": str(transcript_out) if transcript_out else None}
-        new_items += 1
-    return new_items
-
-
-# ----------------- main loop -----------------
-
-def main():
-    while True:
-        feeds = load_feeds()
-        if not feeds:
-            log.error("No RSS feeds configured. Set RSS_FEEDS or create feeds.txt.")
-            sys.exit(1)
-
-        state = load_state()
-        total_new = 0
-        for url in feeds:
-            try:
-                added = ingest_feed(url, state)
-                total_new += added
-                save_state(state)
-            except Exception as e:
-                log.error(f"Feed error: {url} -> {e}")
-        log.info(f"Cycle complete. New items: {total_new}")
-        if RSS_ONCE:
-            break
-        time.sleep(RSS_SCAN_MINUTES * 60)
-
-
-if __name__ == "__main__":
-    main()
+import os, re, json, time, shutil, difflib
+from pathlib import Path
+from urllib.parse import urlparse
+
+import requests
+import xml.etree.ElementTree as ET
+
+# ---- Config ----
+TRN = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
+OUT_INDEX = Path(os.getenv("RSS_INDEX_PATH", str(TRN / "rss_index.json")))
+FEEDS_FILE = Path(os.getenv("RSS_FEEDS_FILE", "/app/config/feeds.txt"))
+FEEDS_ENV = os.getenv("RSS_FEEDS", "").strip()
+TIMEOUT = int(os.getenv("RSS_HTTP_TIMEOUT", "30"))
+DOWNLOAD_TRANSCRIPTS = os.getenv("RSS_DOWNLOAD_TRANSCRIPTS", "true").lower() in {"1", "true", "yes", "y"}
+DEFAULT_LANG = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
+
+# Where media files live; used to sidecar RSS transcripts next to matching media
+LIB = Path(os.getenv("LIBRARY_ROOT", "/library"))
+MEDIA_EXTS = {".mp3", ".m4a", ".flac", ".wav", ".ogg", ".opus", ".mp4", ".m4v", ".mkv", ".webm", ".mov", ".avi"}
+
+# Fuzzy title match threshold for media ↔ transcript pairing
+TITLE_MATCH_THRESHOLD = float(os.getenv("RSS_TITLE_MATCH_THRESHOLD", "0.60"))
+
+# Namespace map (extend as needed)
+NS = {
+    "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
+    "content": "http://purl.org/rss/1.0/modules/content/",
+    "media": "http://search.yahoo.com/mrss/",
+    "podcast": "https://podcastindex.org/namespace/1.0",
+    "atom": "http://www.w3.org/2005/Atom",
+}
+
+TRN.mkdir(parents=True, exist_ok=True)
+OUT_INDEX.parent.mkdir(parents=True, exist_ok=True)
+
+
+def _text(el):
+    return (el.text or "").strip() if el is not None else ""
+
+
+def _parse_duration(d):
+    if not d:
+        return None
+    s = str(d).strip()
+    if s.isdigit():
+        return int(s)
+    parts = [p for p in s.split(":") if p != ""]
+    try:
+        if len(parts) == 3:
+            h, m, sec = parts
+            return int(h) * 3600 + int(m) * 60 + int(float(sec))
+        if len(parts) == 2:
+            m, sec = parts
+            return int(m) * 60 + int(float(sec))
+        return int(float(parts[0]))
+    except Exception:
+        return None
+
+
+def _slug(text: str) -> str:
+    text = re.sub(r"\s+", " ", text).strip()
+    text = re.sub(r"[^A-Za-z0-9\-._ ]+", "", text)
+    return (text[:120] or "episode").replace(" ", "_")
+
+
+def _yymmdd_from_pubdate(pubdate: str) -> str:
+    try:
+        from email.utils import parsedate_to_datetime
+
+        dt = parsedate_to_datetime(pubdate)
+        if dt is not None:
+            return dt.strftime("%Y%m%d")
+    except Exception:
+        pass
+    m = re.search(r"(\d{4})[-/](\d{1,2})[-/](\d{1,2})", pubdate or "")
+    if m:
+        y, mo, d = m.groups()
+        return f"{int(y):04d}{int(mo):02d}{int(d):02d}"
+    return ""
+
+
+def _iter_items(channel: ET.Element):
+    for tag in ["item", "{http://www.w3.org/2005/Atom}entry"]:
+        for it in channel.findall(tag):
+            yield it
+
+
+def _findall_ns(el, path):
+    res = el.findall(path, NS)
+    if not res and ":" in path:
+        last = path.split("/")[-1]
+        res = el.findall(last)
+    return res
+
+
+def _find_ns(el, path):
+    found = el.find(path, NS)
+    if found is None and ":" in path:
+        found = el.find(path.split("/")[-1])
+    return found
+
+
+def _download(url: str, dst: Path) -> Path | None:
+    try:
+        r = requests.get(url, timeout=TIMEOUT, headers={"User-Agent": "podx/1.0"})
+        r.raise_for_status()
+        dst.parent.mkdir(parents=True, exist_ok=True)
+        with open(dst, "wb") as f:
+            f.write(r.content)
+        return dst
+    except Exception:
+        return None
+
+
+def _guess_ext_from_type(mime: str) -> str:
+    if not mime:
+        return ".txt"
+    mime = mime.lower()
+    if "vtt" in mime:
+        return ".vtt"
+    if "srt" in mime or "subrip" in mime:
+        return ".srt"
+    if "json" in mime:
+        return ".json"
+    return ".txt"
+
+
+def _norm_text(s: str) -> str:
+    s = (s or "").lower()
+    s = re.sub(r"\s+", " ", s)
+    s = re.sub(r"[^a-z0-9 _.-]+", "", s)
+    return s.strip()
+
+
+def _strip_leading_date(basename: str) -> str:
+    m = re.match(r"^(\d{8})\s*-\s*(.+)$", basename)
+    return m.group(2) if m else basename
+
+
+def _find_matching_media(date: str, title: str) -> list[Path]:
+    """Find media files in LIB that likely correspond to this episode.
+    Strategy:
+      1) If YYYYMMDD is present, prefer files starting with that date.
+      2) Otherwise, fuzzy title match using difflib on stems (date stripped).
+    """
+    matches: list[Path] = []
+    # 1) Date-based scan
+    if date:
+        for p in LIB.rglob(f"{date} - *"):
+            if p.is_file() and p.suffix.lower() in MEDIA_EXTS:
+                matches.append(p)
+        if matches:
+            return matches
+
+    # 2) Fuzzy title scan (can be expensive on huge libraries)
+    tkey = _norm_text(title)
+    if not tkey:
+        return matches
+    for p in LIB.rglob("*"):
+        if not (p.is_file() and p.suffix.lower() in MEDIA_EXTS):
+            continue
+        stem = _strip_leading_date(p.stem)
+        fkey = _norm_text(stem)
+        if not fkey:
+            continue
+        if difflib.SequenceMatcher(None, tkey, fkey).ratio() >= TITLE_MATCH_THRESHOLD:
+            matches.append(p)
+    return matches
+
+
+def _sidecar_path_for(media_path: Path, lang: str, ext: str) -> Path:
+    base = media_path.with_suffix("")
+    lang = (lang or DEFAULT_LANG or "en").lower().replace("_", "-")
+    # Prefer language-suffixed sidecars (e.g., episode.en.srt)
+    return base.with_name(f"{base.name}.{lang}{ext}")
+
+
+def _propagate_transcript_to_media(local_tr_path: Path, lang: str, date: str, title: str) -> list[str]:
+    """Copy a downloaded transcript next to any matching media under LIB.
+    Returns a list of created sidecar file paths (as strings)."""
+    created: list[str] = []
+    if not local_tr_path.exists():
+        return created
+    ext = local_tr_path.suffix.lower()
+    if ext not in {".srt", ".vtt", ".txt"}:
+        # Unknown/unsupported transcript type for sidecar; skip silently
+        return created
+    for media in _find_matching_media(date, title):
+        dst = _sidecar_path_for(media, lang, ext)
+        try:
+            dst.parent.mkdir(parents=True, exist_ok=True)
+            if not dst.exists():
+                shutil.copy2(local_tr_path, dst)
+                created.append(str(dst))
+        except Exception:
+            # best-effort; continue
+            pass
+    return created
+
+
+def _gather_transcripts(item: ET.Element):
+    transcripts = []
+    # podcast:transcript elements
+    for t in _findall_ns(item, "podcast:transcript"):
+        url = t.get("url") or t.get("href")
+        ttype = t.get("type") or ""
+        lang = t.get("language") or t.get("lang") or ""
+        if url:
+            transcripts.append({"url": url, "type": ttype, "language": lang})
+    # Atom-style transcript link
+    for link in _findall_ns(item, "atom:link"):
+        if (link.get("rel") or "").lower() == "transcript" and link.get("href"):
+            transcripts.append(
+                {
+                    "url": link.get("href"),
+                    "type": link.get("type") or "",
+                    "language": link.get("hreflang") or "",
+                }
+            )
+    return transcripts
+
+
+def parse_feed(feed_url: str):
+    items = []
+    try:
+        r = requests.get(feed_url, timeout=TIMEOUT, headers={"User-Agent": "podx/1.0"})
+        r.raise_for_status()
+        root = ET.fromstring(r.content)
+
+        channel = root.find("channel") or root
+        show_title = _text(_find_ns(channel, "title")) or _text(_find_ns(root, "title"))
+
+        for it in _iter_items(channel):
+            title = _text(_find_ns(it, "title"))
+            guid = _text(_find_ns(it, "guid")) or _text(_find_ns(it, "id"))
+            link = _text(_find_ns(it, "link"))
+            pub = _text(_find_ns(it, "pubDate")) or _text(_find_ns(it, "published"))
+            date = _yymmdd_from_pubdate(pub)
+            dur = _text(_find_ns(it, "itunes:duration"))
+            duration_sec = _parse_duration(dur) or None
+            enclosure = _find_ns(it, "enclosure")
+            audio_url = enclosure.get("url") if enclosure is not None else ""
+
+            if not audio_url:
+                for mc in _findall_ns(it, "media:content"):
+                    if (mc.get("type") or "").startswith("audio") and mc.get("url"):
+                        audio_url = mc.get("url")
+
+            transcripts = _gather_transcripts(it)
+
+            item_rec = {
+                "show": show_title,
+                "feed_url": feed_url,
+                "title": title,
+                "guid": guid,
+                "link": link,
+                "date": date,
+                "duration_sec": duration_sec,
+                "audio_url": audio_url,
+                "language": DEFAULT_LANG,
+                "transcripts": transcripts,
+            }
+
+            # Optionally download transcripts locally
+            if DOWNLOAD_TRANSCRIPTS and transcripts:
+                base_name = f"{date or '00000000'} - {_slug(title)}"
+                for t in item_rec["transcripts"]:
+                    ext = _guess_ext_from_type(t.get("type", ""))
+                    parsed = urlparse(t["url"])
+                    url_ext = Path(parsed.path).suffix.lower()
+                    if url_ext in {".vtt", ".srt", ".txt", ".json"}:
+                        ext = url_ext
+                    local_file = (TRN / base_name).with_suffix(ext)
+                    saved = _download(t["url"], local_file)
+                    if saved:
+                        t["local_path"] = str(saved)
+                        # If we saved a readable sidecar type, try to place it next to matching media
+                        if ext in {".vtt", ".srt", ".txt"}:
+                            created = _propagate_transcript_to_media(saved, t.get("language") or DEFAULT_LANG, date, title)
+                            if created:
+                                t["sidecars"] = created
+
+            items.append(item_rec)
+
+        return {"feed_url": feed_url, "show": show_title, "episodes": items}
+    except Exception as e:
+        return {"feed_url": feed_url, "error": str(e), "episodes": []}
+
+
+def load_feeds_list():
+    feeds = []
+    if FEEDS_ENV:
+        feeds.extend([u.strip() for u in FEEDS_ENV.split(",") if u.strip()])
+    if FEEDS_FILE.exists():
+        try:
+            for line in FEEDS_FILE.read_text(encoding="utf-8").splitlines():
+                line = line.strip()
+                if not line or line.startswith("#"):
+                    continue
+                feeds.append(line)
+        except Exception:
+            pass
+    # unique, keep order
+    return sorted(list(dict.fromkeys(feeds)))
+
+
+def build_index():
+    feeds = load_feeds_list()
+    out = {"generated_at": int(time.time()), "feeds": feeds, "episodes": []}
+    for url in feeds:
+        data = parse_feed(url)
+        out["episodes"].extend(data.get("episodes", []))
+    OUT_INDEX.write_text(json.dumps(out, ensure_ascii=False, indent=2))
+    print(f"[rss] wrote index with {len(out['episodes'])} episode(s) -> {OUT_INDEX}")
+    return OUT_INDEX
+
+
+if __name__ == "__main__":
+    build_index()
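For orientation only, a sketch of the index this ingester writes, assuming one pass over a single feed. The top-level keys (generated_at, feeds, episodes) and the per-episode keys mirror item_rec in parse_feed() above; the concrete values are invented.

    import json
    from pathlib import Path

    out_path = build_index()   # writes OUT_INDEX, /transcripts/rss_index.json by default
    data = json.loads(Path(out_path).read_text(encoding="utf-8"))
    ep = data["episodes"][0]
    # ep resembles:
    # {"show": "Some Show", "feed_url": "https://example.org/feed.xml",
    #  "title": "Some Episode", "guid": "...", "link": "...", "date": "20240101",
    #  "duration_sec": 3605, "audio_url": "https://example.org/ep.mp3", "language": "en",
    #  "transcripts": [{"url": "https://example.org/ep.vtt", "type": "text/vtt",
    #                   "language": "en", "local_path": "/transcripts/20240101 - Some_Episode.vtt"}]}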

307   app/worker.py

@@ -1,6 +1,7 @@
 import os, subprocess, shutil, json, re, orjson, requests
 from pathlib import Path
 import math
+import difflib
 from faster_whisper import WhisperModel

 MEILI_URL = os.getenv("MEILI_URL", "http://meili:7700")
@@ -12,6 +13,11 @@ MODEL_NAME = os.getenv("WHISPER_MODEL","large-v3")
 COMPUTE = os.getenv("WHISPER_PRECISION","int8")
 WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "auto").strip()
+
+# RSS resolver config
+RSS_INDEX_PATH = Path(os.getenv("RSS_INDEX_PATH", "/transcripts/rss_index.json"))
+RSS_DURATION_TOLERANCE = int(os.getenv("RSS_DURATION_TOLERANCE", "150"))  # seconds
+DEFAULT_TRANSCRIPT_LANG = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
 OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/")
 OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "")
 OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library")
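A small check, illustrative only: with the defaults above, the worker reads the same index file the ingester writes (RSS_INDEX_PATH on this side, OUT_INDEX on the ingester side), so no extra configuration is needed when both run with TRANSCRIPT_ROOT=/transcripts.

    import os
    from pathlib import Path

    worker_index = Path(os.getenv("RSS_INDEX_PATH", "/transcripts/rss_index.json"))
    ingester_index = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts")) / "rss_index.json"
    assert worker_index == ingester_index   # holds under the defaults; override both together if you move the index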
@@ -39,6 +45,160 @@ def log(feed):
 def sanitize(name):
     return re.sub(r'[\\/:"*?<>|]+', ' ', name).strip()
+
+
+# ---------- RSS transcript resolver ----------
+
+def _normalize_title(t: str) -> str:
+    t = (t or "").lower()
+    t = re.sub(r"\s+", " ", t)
+    # remove punctuation-ish
+    t = re.sub(r"[^a-z0-9 _-]+", "", t)
+    return t.strip()
+
+
+def _stem_without_date(stem: str) -> str:
+    # drop leading YYYYMMDD - from filenames created by yt-dlp template
+    m = re.match(r"^\d{8}\s*-\s*(.*)$", stem)
+    return m.group(1) if m else stem
+
+
+def _extract_date_from_stem(stem: str) -> str | None:
+    m = re.search(r"\b(\d{8})\b", stem)
+    return m.group(1) if m else None
+
+
+def _best_title_match(title: str, candidates: list[str]) -> tuple[str, float]:
+    """Return (best_title, score 0..1) using difflib SequenceMatcher."""
+    if not candidates:
+        return "", 0.0
+    norm_title = _normalize_title(title)
+    best = ("", 0.0)
+    for c in candidates:
+        score = difflib.SequenceMatcher(None, norm_title, _normalize_title(c)).ratio()
+        if score > best[1]:
+            best = (c, score)
+    return best
+
+
+def _load_rss_index() -> list[dict]:
+    try:
+        if RSS_INDEX_PATH.exists():
+            data = json.loads(RSS_INDEX_PATH.read_text(encoding="utf-8"))
+            # supports {"episodes":[...]} or a flat list
+            if isinstance(data, dict) and "episodes" in data:
+                return data["episodes"] or []
+            if isinstance(data, list):
+                return data
+    except Exception as e:
+        print(f"[resolver] failed to load RSS index: {e}", flush=True)
+    return []
+
+
+def match_media_to_rss(media_path: Path) -> dict | None:
+    """Try to match a local media file to an RSS episode entry."""
+    episodes = _load_rss_index()
+    if not episodes:
+        return None
+
+    stem = media_path.stem
+    title_no_date = _stem_without_date(stem)
+    file_date = _extract_date_from_stem(stem)
+    # duration tolerance
+    media_dur = media_duration_seconds(media_path)
+
+    # Candidates: filter by date if present, else all
+    if file_date:
+        pool = [e for e in episodes if (str(e.get("date", "")) == file_date or str(e.get("pubdate", "")) == file_date)]
+        if not pool:
+            pool = episodes
+    else:
+        pool = episodes
+
+    # Pick best by (title similarity, duration proximity)
+    best_ep, best_score = None, -1.0
+    for ep in pool:
+        ep_title = ep.get("title") or ep.get("itunes_title") or ""
+        sim = _best_title_match(title_no_date, [ep_title])[1]
+        dur = float(ep.get("duration_sec") or ep.get("duration") or 0.0)
+        dur_ok = True
+        if media_dur and dur:
+            dur_ok = abs(media_dur - dur) <= RSS_DURATION_TOLERANCE
+        score = sim + (0.1 if dur_ok else 0.0)
+        if score > best_score:
+            best_score, best_ep = score, ep
+
+    if best_ep and best_score >= 0.5:
+        print(f"[resolver] matched '{stem}' -> '{best_ep.get('title','')}' score={best_score:.2f}", flush=True)
+        return best_ep
+    return None
+
+
+def _choose_transcript_url(ep: dict) -> tuple[str, str] | tuple[None, None]:
+    """Return (url, kind) preferring txt, vtt, then srt. 'kind' in {'txt','vtt','srt'}."""
+    # unified structure from rss_ingest.py: ep["transcripts"] = [{"url":..., "type": ...}, ...]
+    items = ep.get("transcripts") or []
+    # some ingesters store separate keys
+    if not items:
+        for key, kind in [("transcript_txt","txt"), ("transcript_vtt","vtt"), ("transcript_srt","srt")]:
+            if ep.get(key):
+                items.append({"url": ep[key], "type": kind})
+    # preference order
+    for kind in ["txt", "vtt", "srt"]:
+        for it in items:
+            t = (it.get("type") or "").lower()
+            u = it.get("url") or ""
+            if u and (kind in t or (kind == "txt" and t in ["text","plain","text/plain"]) or (kind in u.lower())):
+                return u, kind
+    return (None, None)
+
+
+def fetch_rss_transcript(ep: dict, dest_dir: Path) -> Path | None:
+    """Download transcript to dest_dir and return local Path; convert VTT->SRT if needed."""
+    url, kind = _choose_transcript_url(ep)
+    if not url:
+        return None
+    dest_dir.mkdir(parents=True, exist_ok=True)
+    # filename from episode title
+    safe = sanitize(ep.get("title") or ep.get("guid") or "episode")
+    path = dest_dir / f"{safe}.{kind if kind!='txt' else 'txt'}"
+    try:
+        r = requests.get(url, timeout=30)
+        r.raise_for_status()
+        mode = "wb" if kind in ("vtt","srt") else "w"
+        if mode == "wb":
+            path.write_bytes(r.content)
+        else:
+            path.write_text(r.text, encoding="utf-8")
+        print(f"[resolver] downloaded transcript ({kind}) from {url}", flush=True)
+        return path
+    except Exception as e:
+        print(f"[resolver] failed to fetch transcript: {e}", flush=True)
+        return None
+
+
+def use_rss_transcript(media_path: Path, ep: dict) -> Path | None:
+    """Create standard transcript artifacts from an RSS transcript (txt/vtt/srt)."""
+    # Prefer direct download; else if rss_ingest already saved a local file path, try that.
+    sidecar = None
+    local_hint = ep.get("transcript_local")
+    if local_hint:
+        p = Path(local_hint)
+        if p.exists():
+            sidecar = p
+    if sidecar is None:
+        sidecar = fetch_rss_transcript(ep, TMP)
+
+    if not sidecar or not sidecar.exists():
+        return None
+
+    # Convert to plain text
+    plain = transcript_text_from_file(sidecar)
+    lang = (ep.get("language") or ep.get("lang") or DEFAULT_TRANSCRIPT_LANG).split("-")[0]
+    base = write_plain_transcript(media_path, plain, language=lang)
+    # Place an SRT next to video for Plex
+    ensure_sidecar_next_to_media(sidecar, media_path, lang=lang)
+    # Write provenance sidecar
+    (base.with_suffix(".prov.json")).write_bytes(orjson.dumps({
+        "source": "rss",
+        "feed": ep.get("feed_url"),
+        "guid": ep.get("guid"),
+        "episode_title": ep.get("title"),
+        "transcript_kind": sidecar.suffix.lower().lstrip("."),
+        "transcript_url": _choose_transcript_url(ep)[0] or "",
+    }))
+    return base
+
+
 def find_sidecar_transcript(media_path: Path) -> Path | None:
     """Return a .txt/.srt/.vtt transcript file sitting next to media, if any.
     Tries common variants including language-suffixed SRT/VTT.
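Illustrative only (the titles are made up): roughly how the SequenceMatcher score behaves for a yt-dlp-style stem against an RSS title. match_media_to_rss() above accepts a candidate once the similarity plus the optional 0.1 duration bonus reaches 0.5.

    import difflib, re

    def norm(t):  # same normalisation as _normalize_title()
        t = re.sub(r"\s+", " ", (t or "").lower())
        return re.sub(r"[^a-z0-9 _-]+", "", t).strip()

    file_title = "Episode 42 Interview with Jane Doe"   # stem with the YYYYMMDD prefix stripped
    rss_title = "Ep. 42: Interview with Jane Doe"
    score = difflib.SequenceMatcher(None, norm(file_title), norm(rss_title)).ratio()
    print(round(score, 2))   # well above the 0.5 acceptance threshold for titles this similar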
@@ -57,6 +217,110 @@ def find_sidecar_transcript(media_path: Path) -> Path | None:
     return candidates[0] if candidates else None
+
+
+# ---------- Transcript repository reuse helpers ----------
+
+def find_repo_transcript_for_media(media_path: Path) -> Path | None:
+    """Search the transcript repository (/transcripts) for an existing transcript
+    that likely belongs to this media file (match by YYYYMMDD in filename and/or
+    fuzzy title similarity). Returns a path to a matching .json if found."""
+    try:
+        stem = media_path.stem
+        title_no_date = _stem_without_date(stem)
+        file_date = _extract_date_from_stem(stem)
+        best_json, best_score = None, 0.0
+        for j in TRN.glob("*.json"):
+            try:
+                data = json.loads(j.read_text(encoding="utf-8"))
+            except Exception:
+                continue
+            other_file = Path(data.get("file", ""))
+            other_stem = other_file.stem if other_file else j.stem
+            other_date = _extract_date_from_stem(other_stem)
+            # If both have dates and they differ a lot, skip
+            if file_date and other_date and file_date != other_date:
+                continue
+            # Compare titles (without dates)
+            sim = difflib.SequenceMatcher(
+                None,
+                _normalize_title(title_no_date),
+                _normalize_title(_stem_without_date(other_stem)),
+            ).ratio()
+            # Nudge score when dates match
+            if file_date and other_date and file_date == other_date:
+                sim += 0.1
+            if sim > best_score:
+                best_score, best_json = sim, j
+        # Require a reasonable similarity
+        return best_json if best_json and best_score >= 0.60 else None
+    except Exception:
+        return None
+
+
+def reuse_repo_transcript(media_path: Path, repo_json: Path) -> Path | None:
+    """Copy/retarget an existing transcript JSON/TXT (and make SRT/VTT if possible)
+    from the repository so that it belongs to the provided media_path. Returns
+    the new base path in /transcripts or None."""
+    try:
+        # load the source transcript
+        data = json.loads(repo_json.read_text(encoding="utf-8"))
+        src_base = TRN / Path(repo_json).stem
+        src_txt = src_base.with_suffix(".txt")
+        src_srt = src_base.with_suffix(".srt")
+        src_vtt = src_base.with_suffix(".vtt")
+
+        # write the retargeted artifacts
+        new_title = media_path.stem
+        new_base = TRN / new_title
+        new_base.parent.mkdir(parents=True, exist_ok=True)
+
+        # update file path
+        data["file"] = str(media_path)
+        (new_base.with_suffix(".json")).write_bytes(orjson.dumps(data))
+
+        # copy or synthesize TXT
+        if src_txt.exists():
+            shutil.copy2(src_txt, new_base.with_suffix(".txt"))
+        else:
+            # fallback: concatenate segments
+            txt = " ".join(s.get("text", "") for s in data.get("segments", []))
+            (new_base.with_suffix(".txt")).write_text(txt, encoding="utf-8")
+
+        # copy SRT/VTT if present; otherwise synthesize SRT from segments
+        if src_srt.exists():
+            shutil.copy2(src_srt, new_base.with_suffix(".srt"))
+        else:
+            # synthesize SRT
+            def fmt_ts(t):
+                h=int(t//3600); m=int((t%3600)//60); s=t-(h*3600+m*60)
+                return f"{h:02}:{m:02}:{s:06.3f}".replace('.',',')
+            with open(new_base.with_suffix(".srt"), "w", encoding="utf-8") as srt:
+                for i, s in enumerate(data.get("segments", []), 1):
+                    srt.write(f"{i}\n{fmt_ts(s.get('start',0.0))} --> {fmt_ts(s.get('end',0.0))}\n{s.get('text','').strip()}\n\n")
+        if src_vtt.exists():
+            shutil.copy2(src_vtt, new_base.with_suffix(".vtt"))
+        else:
+            # synthesize VTT from segments
+            def fmt_ts_vtt(t):
+                h=int(t//3600); m=int((t%3600)//60); s=t-(h*3600+m*60)
+                return f"{h:02}:{m:02}:{s:06.3f}"
+            with open(new_base.with_suffix(".vtt"), "w", encoding="utf-8") as vtt:
+                vtt.write("WEBVTT\n\n")
+                for s in data.get("segments", []):
+                    vtt.write(f"{fmt_ts_vtt(s.get('start',0.0))} --> {fmt_ts_vtt(s.get('end',0.0))} \n{s.get('text','').strip()}\n\n")
+
+        # ensure sidecar next to media
+        try:
+            lang = (data.get("language") or DEFAULT_TRANSCRIPT_LANG).split("-")[0]
+            ensure_sidecar_next_to_media(new_base.with_suffix(".srt"), media_path, lang=lang)
+        except Exception:
+            pass
+
+        return new_base
+    except Exception as e:
+        print(f"[resolver] failed to reuse repo transcript: {e}", flush=True)
+        return None
+
+
 def transcript_text_from_file(path: Path) -> str:
     """Extract plain text from .txt/.srt/.vtt by stripping timestamps and counters."""
     try:
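A worked example of the timestamp helpers above (input value invented): fmt_ts() produces the comma-before-milliseconds form SRT expects, fmt_ts_vtt() the dot form WebVTT expects.

    def fmt_ts(t):
        h = int(t // 3600); m = int((t % 3600) // 60); s = t - (h * 3600 + m * 60)
        return f"{h:02}:{m:02}:{s:06.3f}".replace('.', ',')

    print(fmt_ts(3661.5))                      # 01:01:01,500  (SRT)
    print(fmt_ts(3661.5).replace(',', '.'))    # 01:01:01.500  (what fmt_ts_vtt writes)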
@@ -84,8 +348,10 @@ def transcript_text_from_file(path: Path) -> str:


 def ensure_sidecar_next_to_media(sidecar: Path, media_path: Path, lang: str = "en") -> None:
-    """Ensure an `.lang.srt` sits next to the media for Plex. Convert VTT→SRT if needed."""
+    """Ensure an `.lang.srt` sits next to the media for Plex. Convert VTT→SRT if needed. If the sidecar is .txt, do nothing."""
     try:
+        if sidecar.suffix.lower() == ".txt":
+            return
         if sidecar.suffix.lower() == ".srt":
             dst = media_path.with_suffix(f".{lang}.srt")
             shutil.copy2(sidecar, dst)
@@ -407,6 +673,19 @@ def handle_local_file(path_str: str):
             "uploader": p.parent.name, "date": "", "path": str(p), "progress": 0}
     log(info)

+    # 0) Try RSS resolver first: if episode with transcript exists, use it (skip Whisper)
+    try:
+        ep = match_media_to_rss(p)
+    except Exception as _e:
+        ep = None
+    if ep:
+        base = use_rss_transcript(p, ep)
+        if base:
+            index_meili(base.with_suffix(".json"))
+            publish_to_openwebui([base.with_suffix(".txt")])
+            log({**info, **{"status": "done", "note": "used_rss_transcript"}})
+            return
+
     # 1) Prefer an existing transcript sidecar if present
     sidecar = find_sidecar_transcript(p)
     if sidecar:
@@ -419,6 +698,16 @@ def handle_local_file(path_str: str):
         log({**info, **{"status": "done", "note": "used_existing_transcript"}})
         return

+    # 1.5) Reuse a transcript that exists in the repository for a matching episode
+    repo_json = find_repo_transcript_for_media(p)
+    if repo_json:
+        base = reuse_repo_transcript(p, repo_json)
+        if base:
+            index_meili(base.with_suffix(".json"))
+            publish_to_openwebui([base.with_suffix(".txt")])
+            log({**info, **{"status": "done", "note": "reused_repo_transcript"}})
+            return
+
     # 2) Otherwise, run transcription
     base = transcribe(p)
     index_meili(base.with_suffix(".json"))
@@ -464,6 +753,22 @@ def handle_url(url: str):
             "date": (re.findall(r"\b(\d{8})\b", dest.stem)[0] if re.findall(r"\b(\d{8})\b", dest.stem) else ""),
             "path": str(dest)})
     log({**info, **{"status":"transcribing", "progress": 0}})
+    # Try RSS transcript resolver first
+    ep = None
+    try:
+        ep = match_media_to_rss(dest)
+    except Exception:
+        ep = None
+    if ep:
+        base = use_rss_transcript(dest, ep)
+    else:
+        base = None
+    # 1.5) If we didn't get an RSS transcript and there is a matching one already in the repo, reuse it
+    if not base:
+        repo_json = find_repo_transcript_for_media(dest)
+        if repo_json:
+            base = reuse_repo_transcript(dest, repo_json)
+    if not base:
         base = transcribe(dest)
     index_meili(base.with_suffix(".json"))
     publish_to_openwebui([base.with_suffix(".txt")])
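Illustrative only: one way to push a file through the updated pipeline by hand. The Redis URL, the "default" queue name and the worker.handle_local_file entry point follow the values used elsewhere in this repository; the file path is hypothetical.

    import redis
    from rq import Queue

    q = Queue("default", connection=redis.from_url("redis://redis:6379/0"))
    q.enqueue("worker.handle_local_file",
              "/library/Some Show/20240101 - Some Episode.mp3",
              job_timeout=4 * 3600)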