Podcast sync

2025-09-07 16:01:59 +02:00
parent e6582e9a6b
commit e66414570a
3 changed files with 689 additions and 352 deletions

app/resolver.py (new file, 105 lines)

@@ -0,0 +1,105 @@
# resolver.py
from __future__ import annotations
import json, os, re, subprocess
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Dict, Any, Tuple, List
try:
from rapidfuzz import fuzz, process
except Exception:
fuzz = None
process = None
def _norm(s: str) -> str:
s = s.lower()
s = re.sub(r"[\[\]\(\)\{\}|_]+", " ", s)
s = re.sub(r"[^0-9a-zá-žà-ÿ\u00C0-\u024F\s]+", " ", s) # keep latin accents, cz/diacritics
s = re.sub(r"\s+", " ", s).strip()
return s
def _title_from_filename(p: Path) -> str:
name = p.stem # drop extension
# common yt-dlp patterns like "YYYYMMDD - Title"
name = re.sub(r"^\d{8}\s*-\s*", "", name)
return name
def _ffprobe_duration_seconds(p: Path) -> Optional[int]:
try:
out = subprocess.check_output([
"ffprobe", "-v", "error", "-show_entries", "format=duration",
"-of", "default=nw=1:nk=1", str(p)
], stderr=subprocess.STDOUT, text=True).strip()
return int(float(out))
except Exception:
return None
def load_index(index_path: Path) -> List[Dict[str, Any]]:
if not index_path.exists():
return []
with index_path.open("r", encoding="utf-8") as f:
data = json.load(f)
# expected per-item keys:
# title, pubdate_ts (int), duration_s (int or null),
# transcript_urls: {"srt": str|None, "vtt": str|None, "txt": str|None},
# audio_url, guid, feed_url
return data if isinstance(data, list) else []
def match_episode(
media_path: Path,
index_items: List[Dict[str, Any]],
duration_tolerance_s: int = 120,
min_ratio: int = 82,
date_window_days: int = 14,
) -> Optional[Dict[str, Any]]:
title_guess = _title_from_filename(media_path)
tnorm = _norm(title_guess)
if not tnorm:
return None
media_secs = _ffprobe_duration_seconds(media_path)
media_date = None
# try to parse upload date prefix in filename if present
m = re.search(r"(\d{8})", media_path.stem)
if m:
try:
media_date = datetime.strptime(m.group(1), "%Y%m%d").replace(tzinfo=timezone.utc)
except Exception:
media_date = None
candidates = []
for item in index_items:
item_title = _norm(item.get("title", ""))
if not item_title:
continue
ratio = (fuzz.token_sort_ratio(tnorm, item_title) if fuzz else (100 if tnorm == item_title else 0))
if ratio < min_ratio:
continue
# duration filter (if both known)
ok_duration = True
if media_secs and item.get("duration_s"):
ok_duration = abs(media_secs - int(item["duration_s"])) <= duration_tolerance_s
# date window (if both known)
ok_date = True
if media_date and item.get("pubdate_ts"):
dt_item = datetime.fromtimestamp(int(item["pubdate_ts"]), tz=timezone.utc)
delta_days = abs((media_date - dt_item).days)
ok_date = delta_days <= date_window_days
if ok_duration and ok_date:
candidates.append((ratio, item))
if not candidates:
return None
candidates.sort(key=lambda x: x[0], reverse=True)
return candidates[0][1]
def choose_transcript_url(item: Dict[str, Any]) -> Optional[Tuple[str, str]]:
urls = item.get("transcript_urls") or {}
# prefer text/plain, then VTT, then SRT:
if urls.get("txt"): return (urls["txt"], "txt")
if urls.get("vtt"): return (urls["vtt"], "vtt")
if urls.get("srt"): return (urls["srt"], "srt")
return None
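# Minimal usage sketch for this module (illustrative only; the index path and the
# media file below are hypothetical examples, not values the project requires):
if __name__ == "__main__":
    items = load_index(Path("/transcripts/rss_index.json"))
    media = Path("/library/Example Show/20250901 - Example Episode.mp3")
    episode = match_episode(media, items)
    if episode:
        picked = choose_transcript_url(episode)
        if picked:
            url, kind = picked
            print(f"matched {episode.get('title')!r}: {kind} transcript at {url}")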


@@ -1,388 +1,315 @@
#!/usr/bin/env python3
"""
RSS ingester for PodX
- Reads feeds from env var RSS_FEEDS (comma-separated) *and*/or from a file (FEEDS_FILE, default /library/feeds.txt)
- Fetches RSS with ETag/Last-Modified caching to avoid re-downloading unchanged feeds
- Saves audio to LIBRARY_ROOT/<podcast>/<YYYYMMDD - title>.<ext>
- Saves transcript sidecars when `<podcast:transcript>` links are present (prefers TextWithTimestamps → WebVTT → SRT → TXT)
- Enqueues `worker.handle_local_file` for indexing/transcription (worker will skip Whisper if a transcript sidecar exists)
- Keeps a small state JSON with per-feed ETag/Last-Modified and per-item processed GUIDs to avoid duplicate work
Environment variables (with sane defaults):
MEILI_URL (unused directly here, but kept for parity)
REDIS_URL redis://redis:6379/0
LIBRARY_ROOT /library
TRANSCRIPT_ROOT /transcripts
RSS_FEEDS "" (comma-separated list)
FEEDS_FILE /library/feeds.txt
RSS_SCAN_MINUTES 120
RSS_ONCE 0 ("1" to run once and exit)
USER_AGENT podx-rss/1.0 (+local-archive)
RSS_STATE_FILE /library/.rss_state.json
RSS_CONNECT_TIMEOUT 15 (seconds)
RSS_READ_TIMEOUT 60 (seconds)
AUDIO_MAX_MB 4096 (skip download if a HEAD request reports size > max; 0 = unlimited)
"""
import os
import re
import sys
import json
import time
import logging
import itertools
from datetime import datetime
import os, re, json, time, shutil, difflib
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from urllib.parse import urlparse
import requests
import xml.etree.ElementTree as ET
import redis
from rq import Queue
logging.basicConfig(level=logging.INFO, format='[rss] %(message)s')
log = logging.getLogger("rss")
# ---- Config ----
TRN = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
OUT_INDEX = Path(os.getenv("RSS_INDEX_PATH", str(TRN / "rss_index.json")))
FEEDS_FILE = Path(os.getenv("RSS_FEEDS_FILE", "/app/config/feeds.txt"))
FEEDS_ENV = os.getenv("RSS_FEEDS", "").strip()
TIMEOUT = int(os.getenv("RSS_HTTP_TIMEOUT", "30"))
DOWNLOAD_TRANSCRIPTS = os.getenv("RSS_DOWNLOAD_TRANSCRIPTS", "true").lower() in {"1", "true", "yes", "y"}
DEFAULT_LANG = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
# Config
MEILI_URL = os.getenv("MEILI_URL", "http://meili:7700")
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
LIBRARY_ROOT = Path(os.getenv("LIBRARY_ROOT", "/library"))
TRANSCRIPT_ROOT = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
RSS_FEEDS_ENV = [s.strip() for s in os.getenv("RSS_FEEDS", "").split(",") if s.strip()]
FEEDS_FILE = Path(os.getenv("FEEDS_FILE", str(LIBRARY_ROOT / "feeds.txt")))
RSS_SCAN_MINUTES = int(os.getenv("RSS_SCAN_MINUTES", "120"))
RSS_ONCE = os.getenv("RSS_ONCE", "0") == "1"
USER_AGENT = os.getenv("USER_AGENT", "podx-rss/1.0 (+local-archive)")
STATE_FILE = Path(os.getenv("RSS_STATE_FILE", str(LIBRARY_ROOT / ".rss_state.json")))
CONNECT_TIMEOUT = float(os.getenv("RSS_CONNECT_TIMEOUT", "15"))
READ_TIMEOUT = float(os.getenv("RSS_READ_TIMEOUT", "60"))
AUDIO_MAX_MB = int(os.getenv("AUDIO_MAX_MB", "4096"))
# Where media files live; used to sidecar RSS transcripts next to matching media
LIB = Path(os.getenv("LIBRARY_ROOT", "/library"))
MEDIA_EXTS = {".mp3", ".m4a", ".flac", ".wav", ".ogg", ".opus", ".mp4", ".m4v", ".mkv", ".webm", ".mov", ".avi"}
# Redis queue
r = redis.from_url(REDIS_URL)
q = Queue("default", connection=r)
# Fuzzy title match threshold for media ↔ transcript pairing
TITLE_MATCH_THRESHOLD = float(os.getenv("RSS_TITLE_MATCH_THRESHOLD", "0.60"))
# HTTP session
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": USER_AGENT})
# Namespaces commonly used in podcast RSS (extend as needed)
NS = {
    "podcast": "https://podcastindex.org/namespace/1.0",
    "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
    "media": "http://search.yahoo.com/mrss/",
    "content": "http://purl.org/rss/1.0/modules/content/",
    "dc": "http://purl.org/dc/elements/1.1/",
    "atom": "http://www.w3.org/2005/Atom",
}
# ----------------- helpers -----------------
def safe(s: str) -> str:
s = re.sub(r"[\\/:*?\"<>|]", "_", s)
# collapse whitespace and trim
s = re.sub(r"\s+", " ", s).strip()
# guard against very long filenames
return s[:200] if len(s) > 200 else s
TRN.mkdir(parents=True, exist_ok=True)
OUT_INDEX.parent.mkdir(parents=True, exist_ok=True)
def load_state() -> Dict:
if STATE_FILE.exists():
try:
return json.loads(STATE_FILE.read_text("utf-8"))
except Exception:
log.warning("State file unreadable, starting fresh")
return {"feeds": {}, "items": {}} # items keyed by GUID / enclosure URL
def _text(el):
return (el.text or "").strip() if el is not None else ""
def save_state(state: Dict) -> None:
STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
tmp = STATE_FILE.with_suffix(STATE_FILE.suffix + ".tmp")
tmp.write_text(json.dumps(state, ensure_ascii=False, indent=2))
tmp.replace(STATE_FILE)
def load_feeds() -> List[str]:
feeds = list(RSS_FEEDS_ENV)
if FEEDS_FILE.exists():
try:
for line in FEEDS_FILE.read_text("utf-8").splitlines():
s = line.strip()
if not s or s.startswith("#"): # allow comments
continue
feeds.append(s)
except Exception as e:
log.warning(f"Failed to read {FEEDS_FILE}: {e}")
# de-dup preserving order
seen = set()
uniq = []
for f in feeds:
if f not in seen:
uniq.append(f)
seen.add(f)
return uniq
def fetch(url: str, *, etag: Optional[str]=None, modified: Optional[str]=None, as_text=False):
headers = {}
if etag:
headers["If-None-Match"] = etag
if modified:
headers["If-Modified-Since"] = modified
resp = SESSION.get(url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
if resp.status_code == 304:
return None, 304, None, None
resp.raise_for_status()
new_etag = resp.headers.get("ETag")
new_mod = resp.headers.get("Last-Modified")
return (resp.text if as_text else resp.content), resp.status_code, new_etag, new_mod
def head_size(url: str) -> Optional[int]:
    try:
        h = SESSION.head(url, allow_redirects=True, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
        if h.ok:
            cl = h.headers.get("Content-Length")
            return int(cl) if cl and cl.isdigit() else None
    except Exception:
        return None
    return None
def _parse_duration(d):
    if not d:
        return None
    s = str(d).strip()
    if s.isdigit():
        return int(s)
    parts = [p for p in s.split(":") if p != ""]
    try:
        if len(parts) == 3:
            h, m, sec = parts
            return int(h) * 3600 + int(m) * 60 + int(float(sec))
        if len(parts) == 2:
            m, sec = parts
            return int(m) * 60 + int(float(sec))
        return int(float(parts[0]))
    except Exception:
        return None
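# e.g. _parse_duration("5400") -> 5400, _parse_duration("1:30:00") -> 5400,
#      _parse_duration("90:15") -> 5415   (illustrative values)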
def best_transcript_links(item) -> List[str]:
links: List[Tuple[int, str, str]] = []
# Try explicit QName first
for tag in item.findall(".//{https://podcastindex.org/namespace/1.0}transcript"):
t = (tag.attrib.get("type") or "").lower()
url = tag.attrib.get("url")
if url:
links.append((0, t, url))
# Namespace-prefixed fallback
for tag in item.findall(".//podcast:transcript", NS):
t = (tag.attrib.get("type") or "").lower()
url = tag.attrib.get("url")
if url:
links.append((0, t, url))
order = [
"text/plain", # often used for TextWithTimestamps
"application/json",
"text/vtt",
"application/srt",
"application/x-subrip",
"application/text",
"text/plain; charset=utf-8",
]
key = {v: i for i, v in enumerate(order)}
ranked = []
for _, t, url in links:
ranked.append((key.get(t, 999), t, url))
ranked.sort()
return [u for _, _, u in ranked]
def _slug(text: str) -> str:
text = re.sub(r"\s+", " ", text).strip()
text = re.sub(r"[^A-Za-z0-9\-._ ]+", "", text)
return (text[:120] or "episode").replace(" ", "_")
def get_enclosure(item) -> Optional[str]:
enc = item.find("enclosure")
if enc is not None and enc.attrib.get("url"):
return enc.attrib["url"]
mc = item.find("media:content", NS)
if mc is not None and mc.attrib.get("url"):
return mc.attrib["url"]
return None
def parse_pubdate(item) -> datetime:
# Try common fields
candidates = [
item.findtext("pubDate"),
item.findtext("dc:date", namespaces=NS),
item.findtext("{http://purl.org/dc/elements/1.1/}date"),
]
for pd in filter(None, candidates):
s = pd.strip()
# Try several common formats
for fmt in [
"%a, %d %b %Y %H:%M:%S %z",
"%a, %d %b %Y %H:%M:%S",
"%Y-%m-%dT%H:%M:%S%z",
"%Y-%m-%dT%H:%M:%S",
]:
try:
return datetime.strptime(s, fmt)
except Exception:
pass
return datetime.utcnow()
def save_bytes(path: Path, data: bytes) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
tmp = path.with_suffix(path.suffix + ".part")
tmp.write_bytes(data)
tmp.replace(path)
def decide_audio_ext(audio_url: str) -> str:
p = urlparse(audio_url)
low = p.path.lower()
if low.endswith(".m4a"):
return ".m4a"
if low.endswith(".mp3"):
return ".mp3"
if low.endswith(".ogg") or low.endswith(".oga"):
return ".ogg"
if low.endswith(".aac"):
return ".aac"
if low.endswith(".wav"):
return ".wav"
return ".mp3"
def item_key(item) -> str:
# Prefer GUID value, else enclosure URL, else title+date
guid = item.findtext("guid")
if guid:
return guid.strip()
enc = get_enclosure(item)
if enc:
return enc
title = item.findtext("title") or "Episode"
pub = parse_pubdate(item).strftime("%Y%m%d")
return f"{pub}:{title}"
# ----------------- core ingest -----------------
def ingest_feed(url: str, state: Dict) -> int:
    fstate = state.setdefault("feeds", {}).setdefault(url, {})
    etag = fstate.get("etag")
    mod = fstate.get("modified")
    log.info(f"Fetch RSS: {url}")
    try:
        data, status, new_etag, new_mod = fetch(url, etag=etag, modified=mod, as_text=True)
    except Exception as e:
        log.error(f"Fetch failed: {e}")
        return 0
    if status == 304:
        log.info("Not modified")
        return 0
    if new_etag:
        fstate["etag"] = new_etag
    if new_mod:
        fstate["modified"] = new_mod
def _yymmdd_from_pubdate(pubdate: str) -> str:
    try:
        from email.utils import parsedate_to_datetime
        dt = parsedate_to_datetime(pubdate)
        if dt is not None:
            return dt.strftime("%Y%m%d")
    except Exception:
        pass
    m = re.search(r"(\d{4})[-/](\d{1,2})[-/](\d{1,2})", pubdate or "")
    if m:
        y, mo, d = m.groups()
        return f"{int(y):04d}{int(mo):02d}{int(d):02d}"
    return ""
def _iter_items(channel: ET.Element):
for tag in ["item", "{http://www.w3.org/2005/Atom}entry"]:
for it in channel.findall(tag):
yield it
def _findall_ns(el, path):
res = el.findall(path, NS)
if not res and ":" in path:
last = path.split("/")[-1]
res = el.findall(last)
return res
def _find_ns(el, path):
found = el.find(path, NS)
if found is None and ":" in path:
found = el.find(path.split("/")[-1])
return found
    try:
        root = ET.fromstring(data)
def _download(url: str, dst: Path) -> Path | None:
    try:
        r = requests.get(url, timeout=TIMEOUT, headers={"User-Agent": "podx/1.0"})
        r.raise_for_status()
        dst.parent.mkdir(parents=True, exist_ok=True)
        with open(dst, "wb") as f:
            f.write(r.content)
        return dst
    except Exception:
        return None
def _guess_ext_from_type(mime: str) -> str:
if not mime:
return ".txt"
mime = mime.lower()
if "vtt" in mime:
return ".vtt"
if "srt" in mime or "subrip" in mime:
return ".srt"
if "json" in mime:
return ".json"
return ".txt"
def _norm_text(s: str) -> str:
s = (s or "").lower()
s = re.sub(r"\s+", " ", s)
s = re.sub(r"[^a-z0-9 _.-]+", "", s)
return s.strip()
def _strip_leading_date(basename: str) -> str:
m = re.match(r"^(\d{8})\s*-\s*(.+)$", basename)
return m.group(2) if m else basename
def _find_matching_media(date: str, title: str) -> list[Path]:
"""Find media files in LIB that likely correspond to this episode.
Strategy:
1) If YYYYMMDD is present, prefer files starting with that date.
2) Otherwise, fuzzy title match using difflib on stems (date stripped).
"""
matches: list[Path] = []
# 1) Date-based scan
if date:
for p in LIB.rglob(f"{date} - *"):
if p.is_file() and p.suffix.lower() in MEDIA_EXTS:
matches.append(p)
if matches:
return matches
# 2) Fuzzy title scan (can be expensive on huge libraries)
tkey = _norm_text(title)
if not tkey:
return matches
for p in LIB.rglob("*"):
if not (p.is_file() and p.suffix.lower() in MEDIA_EXTS):
continue
stem = _strip_leading_date(p.stem)
fkey = _norm_text(stem)
if not fkey:
continue
if difflib.SequenceMatcher(None, tkey, fkey).ratio() >= TITLE_MATCH_THRESHOLD:
matches.append(p)
return matches
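# e.g. _find_matching_media("20250901", "Some Episode") with LIB="/library" returns files like
# "/library/Show/20250901 - Some Episode.mp3" (illustrative); without a date it falls back to
# fuzzy stem-vs-title matching at TITLE_MATCH_THRESHOLD.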
def _sidecar_path_for(media_path: Path, lang: str, ext: str) -> Path:
base = media_path.with_suffix("")
lang = (lang or DEFAULT_LANG or "en").lower().replace("_", "-")
# Prefer language-suffixed sidecars (e.g., episode.en.srt)
return base.with_name(f"{base.name}.{lang}{ext}")
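# e.g. _sidecar_path_for(Path("/library/Show/20250901 - Ep.mp3"), "en", ".srt")
#      -> Path("/library/Show/20250901 - Ep.en.srt")   (illustrative path)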
def _propagate_transcript_to_media(local_tr_path: Path, lang: str, date: str, title: str) -> list[str]:
"""Copy a downloaded transcript next to any matching media under LIB.
Returns a list of created sidecar file paths (as strings)."""
created: list[str] = []
if not local_tr_path.exists():
return created
ext = local_tr_path.suffix.lower()
if ext not in {".srt", ".vtt", ".txt"}:
# Unknown/unsupported transcript type for sidecar; skip silently
return created
for media in _find_matching_media(date, title):
dst = _sidecar_path_for(media, lang, ext)
try:
dst.parent.mkdir(parents=True, exist_ok=True)
if not dst.exists():
shutil.copy2(local_tr_path, dst)
created.append(str(dst))
except Exception:
# best-effort; continue
pass
return created
def _gather_transcripts(item: ET.Element):
transcripts = []
# podcast:transcript elements
for t in _findall_ns(item, "podcast:transcript"):
url = t.get("url") or t.get("href")
ttype = t.get("type") or ""
lang = t.get("language") or t.get("lang") or ""
if url:
transcripts.append({"url": url, "type": ttype, "language": lang})
# Atom-style transcript link
for link in _findall_ns(item, "atom:link"):
if (link.get("rel") or "").lower() == "transcript" and link.get("href"):
transcripts.append(
{
"url": link.get("href"),
"type": link.get("type") or "",
"language": link.get("hreflang") or "",
}
)
return transcripts
def parse_feed(feed_url: str):
items = []
try:
r = requests.get(feed_url, timeout=TIMEOUT, headers={"User-Agent": "podx/1.0"})
r.raise_for_status()
root = ET.fromstring(r.content)
        channel = root.find("channel")
        if channel is None:
            channel = root
show_title = _text(_find_ns(channel, "title")) or _text(_find_ns(root, "title"))
for it in _iter_items(channel):
title = _text(_find_ns(it, "title"))
guid = _text(_find_ns(it, "guid")) or _text(_find_ns(it, "id"))
link = _text(_find_ns(it, "link"))
pub = _text(_find_ns(it, "pubDate")) or _text(_find_ns(it, "published"))
date = _yymmdd_from_pubdate(pub)
dur = _text(_find_ns(it, "itunes:duration"))
duration_sec = _parse_duration(dur) or None
enclosure = _find_ns(it, "enclosure")
audio_url = enclosure.get("url") if enclosure is not None else ""
if not audio_url:
for mc in _findall_ns(it, "media:content"):
if (mc.get("type") or "").startswith("audio") and mc.get("url"):
audio_url = mc.get("url")
break
transcripts = _gather_transcripts(it)
item_rec = {
"show": show_title,
"feed_url": feed_url,
"title": title,
"guid": guid,
"link": link,
"date": date,
"duration_sec": duration_sec,
"audio_url": audio_url,
"language": DEFAULT_LANG,
"transcripts": transcripts,
}
# Optionally download transcripts locally
if DOWNLOAD_TRANSCRIPTS and transcripts:
base_name = f"{date or '00000000'} - {_slug(title)}"
for t in item_rec["transcripts"]:
ext = _guess_ext_from_type(t.get("type", ""))
parsed = urlparse(t["url"])
url_ext = Path(parsed.path).suffix.lower()
if url_ext in {".vtt", ".srt", ".txt", ".json"}:
ext = url_ext
local_file = (TRN / base_name).with_suffix(ext)
saved = _download(t["url"], local_file)
if saved:
t["local_path"] = str(saved)
# If we saved a readable sidecar type, try to place it next to matching media
if ext in {".vtt", ".srt", ".txt"}:
created = _propagate_transcript_to_media(saved, t.get("language") or DEFAULT_LANG, date, title)
if created:
t["sidecars"] = created
items.append(item_rec)
return {"feed_url": feed_url, "show": show_title, "episodes": items}
    except Exception as e:
        log.error(f"XML parse error: {e}")
        return {"feed_url": feed_url, "error": str(e), "episodes": []}
channel_title = safe((root.findtext("channel/title") or "Podcast"))
new_items = 0
for item in root.findall("channel/item"):
key = item_key(item)
already = state.setdefault("items", {})
if already.get(key):
continue
title = safe(item.findtext("title") or "Episode")
pub = parse_pubdate(item)
date_prefix = pub.strftime("%Y%m%d")
base = f"{date_prefix} - {title}"
audio_url = get_enclosure(item)
if not audio_url:
log.info(f"Skip (no enclosure): {title}")
already[key] = {"skipped": "no_enclosure"}
continue
# HEAD size guard (optional)
if AUDIO_MAX_MB > 0:
size = head_size(audio_url)
if size and size > AUDIO_MAX_MB * 1024 * 1024:
log.info(f"Skip (size>{AUDIO_MAX_MB}MB): {title}")
already[key] = {"skipped": "too_large", "size": size}
continue
path_ext = decide_audio_ext(audio_url)
audio_out = LIBRARY_ROOT / channel_title / f"{base}{path_ext}"
transcript_links = best_transcript_links(item)
# If audio exists and a transcript sidecar exists → just enqueue index
sidecars = list((TRANSCRIPT_ROOT / channel_title).glob(f"{base}.*"))
have_transcript = len(sidecars) > 0
if audio_out.exists() and have_transcript:
log.info(f"Skip download, enqueue index (have audio+transcript): {audio_out.name}")
try:
q.enqueue("worker.handle_local_file", str(audio_out), job_timeout=4*3600, result_ttl=86400, failure_ttl=86400)
except Exception as e:
log.warning(f"Enqueue failed: {e}")
already[key] = {"done": True, "audio": str(audio_out)}
new_items += 1
continue
        # Download audio
        try:
            log.info(f"Downloading audio → {audio_out}")
            content, _, _, _ = fetch(audio_url, as_text=False)
            save_bytes(audio_out, content)
        except Exception as e:
            log.warning(f"Audio failed: {e}")
            already[key] = {"error": f"audio:{e}"}
            continue
        # Download transcript if present (take first best)
        transcript_out = None
        for turl in transcript_links:
            try:
                ext = ".vtt" if "vtt" in turl.lower() else ".srt" if "srt" in turl.lower() else ".txt"
                tout = TRANSCRIPT_ROOT / channel_title / f"{base}{ext}"
                log.info(f"Downloading transcript → {tout} ({turl})")
                tdata, _, _, _ = fetch(turl, as_text=False)
                save_bytes(tout, tdata)
                transcript_out = tout
                break
            except Exception as e:
                log.warning(f"Transcript fetch failed ({turl}): {e}")
                continue
        # Enqueue for indexing/transcription
        try:
            q.enqueue("worker.handle_local_file", str(audio_out), job_timeout=4*3600, result_ttl=86400, failure_ttl=86400)
        except Exception as e:
            log.warning(f"Enqueue failed: {e}")
        already[key] = {"done": True, "audio": str(audio_out), "transcript": str(transcript_out) if transcript_out else None}
        new_items += 1
    return new_items
def load_feeds_list():
    feeds = []
    if FEEDS_ENV:
        feeds.extend([u.strip() for u in FEEDS_ENV.split(",") if u.strip()])
    if FEEDS_FILE.exists():
        try:
            for line in FEEDS_FILE.read_text(encoding="utf-8").splitlines():
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                feeds.append(line)
        except Exception:
            pass
    # de-duplicate; sorted for a stable order
    return sorted(dict.fromkeys(feeds))
# ----------------- main loop -----------------
def main():
while True:
feeds = load_feeds()
if not feeds:
log.error("No RSS feeds configured. Set RSS_FEEDS or create feeds.txt.")
sys.exit(1)
state = load_state()
total_new = 0
for url in feeds:
try:
added = ingest_feed(url, state)
total_new += added
save_state(state)
except Exception as e:
log.error(f"Feed error: {url} -> {e}")
log.info(f"Cycle complete. New items: {total_new}")
if RSS_ONCE:
break
time.sleep(RSS_SCAN_MINUTES * 60)
def build_index():
feeds = load_feeds_list()
out = {"generated_at": int(time.time()), "feeds": feeds, "episodes": []}
for url in feeds:
data = parse_feed(url)
out["episodes"].extend(data.get("episodes", []))
OUT_INDEX.write_text(json.dumps(out, ensure_ascii=False, indent=2))
print(f"[rss] wrote index with {len(out['episodes'])} episode(s) -> {OUT_INDEX}")
return OUT_INDEX
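# Shape of the JSON that build_index() writes (keys mirror item_rec above; values are illustrative):
# {
#   "generated_at": 1757253719,
#   "feeds": ["https://example.com/feed.xml"],
#   "episodes": [
#     {"show": "Example Show", "feed_url": "https://example.com/feed.xml",
#      "title": "Example Episode", "guid": "tag:example.com,2025:ep1", "link": "https://example.com/ep1",
#      "date": "20250901", "duration_sec": 3600, "audio_url": "https://example.com/ep1.mp3",
#      "language": "en",
#      "transcripts": [{"url": "https://example.com/ep1.vtt", "type": "text/vtt", "language": "en",
#                       "local_path": "/transcripts/20250901 - Example_Episode.vtt"}]}
#   ]
# }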
if __name__ == "__main__":
    build_index()


@@ -1,6 +1,7 @@
import os, subprocess, shutil, json, re, orjson, requests
from pathlib import Path
import math
import difflib
from faster_whisper import WhisperModel
MEILI_URL = os.getenv("MEILI_URL", "http://meili:7700")
@@ -12,6 +13,11 @@ MODEL_NAME = os.getenv("WHISPER_MODEL","large-v3")
COMPUTE = os.getenv("WHISPER_PRECISION","int8")
WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "auto").strip()
# RSS resolver config
RSS_INDEX_PATH = Path(os.getenv("RSS_INDEX_PATH", "/transcripts/rss_index.json"))
RSS_DURATION_TOLERANCE = int(os.getenv("RSS_DURATION_TOLERANCE", "150")) # seconds
DEFAULT_TRANSCRIPT_LANG = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/")
OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "")
OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library")
@@ -39,6 +45,160 @@ def log(feed):
def sanitize(name):
return re.sub(r'[\\/:"*?<>|]+', ' ', name).strip()
# ---------- RSS transcript resolver ----------
def _normalize_title(t: str) -> str:
t = (t or "").lower()
t = re.sub(r"\s+", " ", t)
# remove punctuation-ish
t = re.sub(r"[^a-z0-9 _-]+", "", t)
return t.strip()
def _stem_without_date(stem: str) -> str:
# drop leading YYYYMMDD - from filenames created by yt-dlp template
m = re.match(r"^\d{8}\s*-\s*(.*)$", stem)
return m.group(1) if m else stem
def _extract_date_from_stem(stem: str) -> str | None:
m = re.search(r"\b(\d{8})\b", stem)
return m.group(1) if m else None
def _best_title_match(title: str, candidates: list[str]) -> tuple[str, float]:
"""Return (best_title, score 0..1) using difflib SequenceMatcher."""
if not candidates:
return "", 0.0
norm_title = _normalize_title(title)
best = ("", 0.0)
for c in candidates:
score = difflib.SequenceMatcher(None, norm_title, _normalize_title(c)).ratio()
if score > best[1]:
best = (c, score)
return best
def _load_rss_index() -> list[dict]:
try:
if RSS_INDEX_PATH.exists():
data = json.loads(RSS_INDEX_PATH.read_text(encoding="utf-8"))
# supports {"episodes":[...]} or a flat list
if isinstance(data, dict) and "episodes" in data:
return data["episodes"] or []
if isinstance(data, list):
return data
except Exception as e:
print(f"[resolver] failed to load RSS index: {e}", flush=True)
return []
def match_media_to_rss(media_path: Path) -> dict | None:
"""Try to match a local media file to an RSS episode entry."""
episodes = _load_rss_index()
if not episodes:
return None
stem = media_path.stem
title_no_date = _stem_without_date(stem)
file_date = _extract_date_from_stem(stem)
# duration tolerance
media_dur = media_duration_seconds(media_path)
# Candidates: filter by date if present, else all
if file_date:
pool = [e for e in episodes if (str(e.get("date", "")) == file_date or str(e.get("pubdate", "")) == file_date)]
if not pool:
pool = episodes
else:
pool = episodes
# Pick best by (title similarity, duration proximity)
best_ep, best_score = None, -1.0
for ep in pool:
ep_title = ep.get("title") or ep.get("itunes_title") or ""
sim = _best_title_match(title_no_date, [ep_title])[1]
dur = float(ep.get("duration_sec") or ep.get("duration") or 0.0)
dur_ok = True
if media_dur and dur:
dur_ok = abs(media_dur - dur) <= RSS_DURATION_TOLERANCE
score = sim + (0.1 if dur_ok else 0.0)
if score > best_score:
best_score, best_ep = score, ep
if best_ep and best_score >= 0.5:
print(f"[resolver] matched '{stem}' -> '{best_ep.get('title','')}' score={best_score:.2f}", flush=True)
return best_ep
return None
def _choose_transcript_url(ep: dict) -> tuple[str, str] | tuple[None, None]:
"""Return (url, kind) preferring txt, vtt, then srt. 'kind' in {'txt','vtt','srt'}."""
# unified structure from rss_ingest.py: ep["transcripts"] = [{"url":..., "type": ...}, ...]
items = ep.get("transcripts") or []
# some ingesters store separate keys
if not items:
for key, kind in [("transcript_txt","txt"), ("transcript_vtt","vtt"), ("transcript_srt","srt")]:
if ep.get(key):
items.append({"url": ep[key], "type": kind})
# preference order
for kind in ["txt", "vtt", "srt"]:
for it in items:
t = (it.get("type") or "").lower()
u = it.get("url") or ""
if u and (kind in t or (kind == "txt" and t in ["text","plain","text/plain"]) or (kind in u.lower())):
return u, kind
return (None, None)
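# e.g. _choose_transcript_url({"transcripts": [{"url": "https://host.example/ep.vtt", "type": "text/vtt"}]})
#      -> ("https://host.example/ep.vtt", "vtt")   (illustrative)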
def fetch_rss_transcript(ep: dict, dest_dir: Path) -> Path | None:
"""Download transcript to dest_dir and return local Path; convert VTT->SRT if needed."""
url, kind = _choose_transcript_url(ep)
if not url:
return None
dest_dir.mkdir(parents=True, exist_ok=True)
# filename from episode title
safe = sanitize(ep.get("title") or ep.get("guid") or "episode")
    path = dest_dir / f"{safe}.{kind}"
try:
r = requests.get(url, timeout=30)
r.raise_for_status()
mode = "wb" if kind in ("vtt","srt") else "w"
if mode == "wb":
path.write_bytes(r.content)
else:
path.write_text(r.text, encoding="utf-8")
print(f"[resolver] downloaded transcript ({kind}) from {url}", flush=True)
return path
except Exception as e:
print(f"[resolver] failed to fetch transcript: {e}", flush=True)
return None
def use_rss_transcript(media_path: Path, ep: dict) -> Path | None:
"""Create standard transcript artifacts from an RSS transcript (txt/vtt/srt)."""
# Prefer direct download; else if rss_ingest already saved a local file path, try that.
sidecar = None
local_hint = ep.get("transcript_local")
if local_hint:
p = Path(local_hint)
if p.exists():
sidecar = p
if sidecar is None:
sidecar = fetch_rss_transcript(ep, TMP)
if not sidecar or not sidecar.exists():
return None
# Convert to plain text
plain = transcript_text_from_file(sidecar)
lang = (ep.get("language") or ep.get("lang") or DEFAULT_TRANSCRIPT_LANG).split("-")[0]
base = write_plain_transcript(media_path, plain, language=lang)
# Place an SRT next to video for Plex
ensure_sidecar_next_to_media(sidecar, media_path, lang=lang)
# Write provenance sidecar
(base.with_suffix(".prov.json")).write_bytes(orjson.dumps({
"source": "rss",
"feed": ep.get("feed_url"),
"guid": ep.get("guid"),
"episode_title": ep.get("title"),
"transcript_kind": sidecar.suffix.lower().lstrip("."),
"transcript_url": _choose_transcript_url(ep)[0] or "",
}))
return base
def find_sidecar_transcript(media_path: Path) -> Path | None:
"""Return a .txt/.srt/.vtt transcript file sitting next to media, if any.
Tries common variants including language-suffixed SRT/VTT.
@@ -57,6 +217,110 @@ def find_sidecar_transcript(media_path: Path) -> Path | None:
return candidates[0] if candidates else None
# ---------- Transcript repository reuse helpers ----------
def find_repo_transcript_for_media(media_path: Path) -> Path | None:
"""Search the transcript repository (/transcripts) for an existing transcript
that likely belongs to this media file (match by YYYYMMDD in filename and/or
fuzzy title similarity). Returns a path to a matching .json if found."""
try:
stem = media_path.stem
title_no_date = _stem_without_date(stem)
file_date = _extract_date_from_stem(stem)
best_json, best_score = None, 0.0
for j in TRN.glob("*.json"):
try:
data = json.loads(j.read_text(encoding="utf-8"))
except Exception:
continue
            other_file = data.get("file") or ""
            other_stem = Path(other_file).stem if other_file else j.stem
            other_date = _extract_date_from_stem(other_stem)
            # If both have dates and they differ, skip
            if file_date and other_date and file_date != other_date:
                continue
# Compare titles (without dates)
sim = difflib.SequenceMatcher(
None,
_normalize_title(title_no_date),
_normalize_title(_stem_without_date(other_stem)),
).ratio()
# Nudge score when dates match
if file_date and other_date and file_date == other_date:
sim += 0.1
if sim > best_score:
best_score, best_json = sim, j
# Require a reasonable similarity
return best_json if best_json and best_score >= 0.60 else None
except Exception:
return None
def reuse_repo_transcript(media_path: Path, repo_json: Path) -> Path | None:
"""Copy/retarget an existing transcript JSON/TXT (and make SRT/VTT if possible)
from the repository so that it belongs to the provided media_path. Returns
the new base path in /transcripts or None."""
try:
# load the source transcript
data = json.loads(repo_json.read_text(encoding="utf-8"))
src_base = TRN / Path(repo_json).stem
src_txt = src_base.with_suffix(".txt")
src_srt = src_base.with_suffix(".srt")
src_vtt = src_base.with_suffix(".vtt")
# write the retargeted artifacts
new_title = media_path.stem
new_base = TRN / new_title
new_base.parent.mkdir(parents=True, exist_ok=True)
# update file path
data["file"] = str(media_path)
(new_base.with_suffix(".json")).write_bytes(orjson.dumps(data))
# copy or synthesize TXT
if src_txt.exists():
shutil.copy2(src_txt, new_base.with_suffix(".txt"))
else:
# fallback: concatenate segments
txt = " ".join(s.get("text", "") for s in data.get("segments", []))
(new_base.with_suffix(".txt")).write_text(txt, encoding="utf-8")
# copy SRT/VTT if present; otherwise synthesize SRT from segments
if src_srt.exists():
shutil.copy2(src_srt, new_base.with_suffix(".srt"))
else:
# synthesize SRT
def fmt_ts(t):
h=int(t//3600); m=int((t%3600)//60); s=t-(h*3600+m*60)
return f"{h:02}:{m:02}:{s:06.3f}".replace('.',',')
with open(new_base.with_suffix(".srt"), "w", encoding="utf-8") as srt:
for i, s in enumerate(data.get("segments", []), 1):
srt.write(f"{i}\n{fmt_ts(s.get('start',0.0))} --> {fmt_ts(s.get('end',0.0))}\n{s.get('text','').strip()}\n\n")
if src_vtt.exists():
shutil.copy2(src_vtt, new_base.with_suffix(".vtt"))
else:
# synthesize VTT from segments
def fmt_ts_vtt(t):
h=int(t//3600); m=int((t%3600)//60); s=t-(h*3600+m*60)
return f"{h:02}:{m:02}:{s:06.3f}"
with open(new_base.with_suffix(".vtt"), "w", encoding="utf-8") as vtt:
vtt.write("WEBVTT\n\n")
for s in data.get("segments", []):
vtt.write(f"{fmt_ts_vtt(s.get('start',0.0))} --> {fmt_ts_vtt(s.get('end',0.0))} \n{s.get('text','').strip()}\n\n")
# ensure sidecar next to media
try:
lang = (data.get("language") or DEFAULT_TRANSCRIPT_LANG).split("-")[0]
ensure_sidecar_next_to_media(new_base.with_suffix(".srt"), media_path, lang=lang)
except Exception:
pass
return new_base
except Exception as e:
print(f"[resolver] failed to reuse repo transcript: {e}", flush=True)
return None
def transcript_text_from_file(path: Path) -> str:
"""Extract plain text from .txt/.srt/.vtt by stripping timestamps and counters."""
try:
@@ -84,8 +348,10 @@ def transcript_text_from_file(path: Path) -> str:
def ensure_sidecar_next_to_media(sidecar: Path, media_path: Path, lang: str = "en") -> None:
"""Ensure an `.lang.srt` sits next to the media for Plex. Convert VTT→SRT if needed."""
"""Ensure an `.lang.srt` sits next to the media for Plex. Convert VTT→SRT if needed. If the sidecar is .txt, do nothing."""
try:
if sidecar.suffix.lower() == ".txt":
return
if sidecar.suffix.lower() == ".srt":
dst = media_path.with_suffix(f".{lang}.srt")
shutil.copy2(sidecar, dst)
@@ -407,6 +673,19 @@ def handle_local_file(path_str: str):
"uploader": p.parent.name, "date": "", "path": str(p), "progress": 0}
log(info)
# 0) Try RSS resolver first: if episode with transcript exists, use it (skip Whisper)
try:
ep = match_media_to_rss(p)
except Exception as _e:
ep = None
if ep:
base = use_rss_transcript(p, ep)
if base:
index_meili(base.with_suffix(".json"))
publish_to_openwebui([base.with_suffix(".txt")])
log({**info, **{"status": "done", "note": "used_rss_transcript"}})
return
# 1) Prefer an existing transcript sidecar if present
sidecar = find_sidecar_transcript(p)
if sidecar:
@@ -419,6 +698,16 @@ def handle_local_file(path_str: str):
log({**info, **{"status": "done", "note": "used_existing_transcript"}})
return
# 1.5) Reuse a transcript that exists in the repository for a matching episode
repo_json = find_repo_transcript_for_media(p)
if repo_json:
base = reuse_repo_transcript(p, repo_json)
if base:
index_meili(base.with_suffix(".json"))
publish_to_openwebui([base.with_suffix(".txt")])
log({**info, **{"status": "done", "note": "reused_repo_transcript"}})
return
# 2) Otherwise, run transcription
base = transcribe(p)
index_meili(base.with_suffix(".json"))
@@ -464,7 +753,23 @@ def handle_url(url: str):
"date": (re.findall(r"\b(\d{8})\b", dest.stem)[0] if re.findall(r"\b(\d{8})\b", dest.stem) else ""),
"path": str(dest)})
log({**info, **{"status":"transcribing", "progress": 0}})
# Try RSS transcript resolver first
ep = None
try:
ep = match_media_to_rss(dest)
except Exception:
ep = None
if ep:
base = use_rss_transcript(dest, ep)
else:
base = None
# 1.5) If we didn't get an RSS transcript and there is a matching one already in the repo, reuse it
if not base:
repo_json = find_repo_transcript_for_media(dest)
if repo_json:
base = reuse_repo_transcript(dest, repo_json)
if not base:
base = transcribe(dest)
index_meili(base.with_suffix(".json"))
publish_to_openwebui([base.with_suffix(".txt")])
log({**info, **{"status":"done"}})