Add RSS feed downloading

.gitignore (vendored): +3
@@ -1,6 +1,9 @@
 # Local env and secrets
 .env
 
+# RSS Feeds
+feeds.txt
+
 # Runtime data
 data/
 models/

app/rss_ingest.py (new file): +388
@@ -0,0 +1,388 @@
#!/usr/bin/env python3
"""
RSS ingester for PodX
- Reads feeds from env var RSS_FEEDS (comma-separated) *and*/or from a file (FEEDS_FILE, default /library/feeds.txt)
- Fetches RSS with ETag/Last-Modified caching to avoid re-downloading unchanged feeds
- Saves audio to LIBRARY_ROOT/<podcast>/<YYYYMMDD - title>.<ext>
- Saves transcript sidecars when `<podcast:transcript>` links are present (prefers TextWithTimestamps → WebVTT → SRT → TXT)
- Enqueues `worker.handle_local_file` for indexing/transcription (worker will skip Whisper if a transcript sidecar exists)
- Keeps a small state JSON with per-feed ETag/Last-Modified and per-item processed GUIDs to avoid duplicate work

Environment variables (with sane defaults):
  MEILI_URL            (unused directly here, but kept for parity)
  REDIS_URL            redis://redis:6379/0
  LIBRARY_ROOT         /library
  TRANSCRIPT_ROOT      /transcripts
  RSS_FEEDS            "" (comma-separated list)
  FEEDS_FILE           /library/feeds.txt
  RSS_SCAN_MINUTES     120
  RSS_ONCE             0 ("1" to run once and exit)
  USER_AGENT           podx-rss/1.0 (+local-archive)
  RSS_STATE_FILE       /library/.rss_state.json
  RSS_CONNECT_TIMEOUT  15 (seconds)
  RSS_READ_TIMEOUT     60 (seconds)
  AUDIO_MAX_MB         4096 (skip if a HEAD request reveals size > max; 0 = unlimited)
"""
import os
import re
import sys
import json
import time
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from urllib.parse import urlparse

import requests
import xml.etree.ElementTree as ET
import redis
from rq import Queue

logging.basicConfig(level=logging.INFO, format='[rss] %(message)s')
log = logging.getLogger("rss")

# Config
MEILI_URL = os.getenv("MEILI_URL", "http://meili:7700")
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
LIBRARY_ROOT = Path(os.getenv("LIBRARY_ROOT", "/library"))
TRANSCRIPT_ROOT = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
RSS_FEEDS_ENV = [s.strip() for s in os.getenv("RSS_FEEDS", "").split(",") if s.strip()]
FEEDS_FILE = Path(os.getenv("FEEDS_FILE", str(LIBRARY_ROOT / "feeds.txt")))
RSS_SCAN_MINUTES = int(os.getenv("RSS_SCAN_MINUTES", "120"))
RSS_ONCE = os.getenv("RSS_ONCE", "0") == "1"
USER_AGENT = os.getenv("USER_AGENT", "podx-rss/1.0 (+local-archive)")
STATE_FILE = Path(os.getenv("RSS_STATE_FILE", str(LIBRARY_ROOT / ".rss_state.json")))
CONNECT_TIMEOUT = float(os.getenv("RSS_CONNECT_TIMEOUT", "15"))
READ_TIMEOUT = float(os.getenv("RSS_READ_TIMEOUT", "60"))
AUDIO_MAX_MB = int(os.getenv("AUDIO_MAX_MB", "4096"))

# Redis queue
r = redis.from_url(REDIS_URL)
q = Queue("default", connection=r)

# HTTP session
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": USER_AGENT})

# Namespaces commonly used in podcast RSS
NS = {
    "podcast": "https://podcastindex.org/namespace/1.0",
    "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
    "media": "http://search.yahoo.com/mrss/",
    "content": "http://purl.org/rss/1.0/modules/content/",
    "dc": "http://purl.org/dc/elements/1.1/",
}

# ----------------- helpers -----------------

def safe(s: str) -> str:
    s = re.sub(r"[\\/:*?\"<>|]", "_", s)
    # collapse whitespace and trim
    s = re.sub(r"\s+", " ", s).strip()
    # guard against very long filenames
    return s[:200] if len(s) > 200 else s


def load_state() -> Dict:
    if STATE_FILE.exists():
        try:
            return json.loads(STATE_FILE.read_text("utf-8"))
        except Exception:
            log.warning("State file unreadable, starting fresh")
    return {"feeds": {}, "items": {}}  # items keyed by GUID / enclosure URL


def save_state(state: Dict) -> None:
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    tmp = STATE_FILE.with_suffix(STATE_FILE.suffix + ".tmp")
    tmp.write_text(json.dumps(state, ensure_ascii=False, indent=2))
    tmp.replace(STATE_FILE)

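# Shape of the state file managed above, for illustration (field values are made
# up; only the top-level "feeds"/"items" keys are fixed by load_state/save_state):
#
#   {
#     "feeds": {
#       "https://example.com/feed.xml": {"etag": "\"abc123\"",
#                                        "modified": "Sat, 01 Jun 2024 00:00:00 GMT"}
#     },
#     "items": {
#       "some-guid": {"done": true, "audio": "/library/Show/20240601 - Episode.mp3"}
#     }
#   }
#
# save_state() writes a ".tmp" sibling first and then replaces the target, so a
# crash mid-write cannot leave a truncated JSON behind.
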
def load_feeds() -> List[str]:
    feeds = list(RSS_FEEDS_ENV)
    if FEEDS_FILE.exists():
        try:
            for line in FEEDS_FILE.read_text("utf-8").splitlines():
                s = line.strip()
                if not s or s.startswith("#"):  # allow comments
                    continue
                feeds.append(s)
        except Exception as e:
            log.warning(f"Failed to read {FEEDS_FILE}: {e}")
    # de-dup preserving order
    seen = set()
    uniq = []
    for f in feeds:
        if f not in seen:
            uniq.append(f)
            seen.add(f)
    return uniq

def fetch(url: str, *, etag: Optional[str] = None, modified: Optional[str] = None, as_text: bool = False):
    headers = {}
    if etag:
        headers["If-None-Match"] = etag
    if modified:
        headers["If-Modified-Since"] = modified
    resp = SESSION.get(url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
    if resp.status_code == 304:
        return None, 304, None, None
    resp.raise_for_status()
    new_etag = resp.headers.get("ETag")
    new_mod = resp.headers.get("Last-Modified")
    return (resp.text if as_text else resp.content), resp.status_code, new_etag, new_mod

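# Conditional-GET flow enabled by fetch(), sketched with hypothetical values:
#
#   body, status, etag, mod = fetch("https://example.com/feed.xml", as_text=True)
#   # first call: 200 plus validators; later calls send them back:
#   body, status, _, _ = fetch("https://example.com/feed.xml", etag=etag, modified=mod, as_text=True)
#   # -> (None, 304, None, None) when unchanged, so the caller skips parsing
#   #    and downloading entirely.
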
def head_size(url: str) -> Optional[int]:
    try:
        h = SESSION.head(url, allow_redirects=True, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
        if h.ok:
            cl = h.headers.get("Content-Length")
            return int(cl) if cl and cl.isdigit() else None
    except Exception:
        return None
    return None

def best_transcript_links(item) -> List[str]:
    links: List[Tuple[int, str, str]] = []
    # The explicit QName and the namespace-prefixed path match the same elements,
    # so collect each URL once rather than appending duplicates.
    for path in (".//{https://podcastindex.org/namespace/1.0}transcript", ".//podcast:transcript"):
        for tag in item.findall(path, NS):
            t = (tag.attrib.get("type") or "").lower()
            url = tag.attrib.get("url")
            if url and not any(u == url for _, _, u in links):
                links.append((0, t, url))

    order = [
        "text/plain",  # often used for TextWithTimestamps
        "application/json",
        "text/vtt",
        "application/srt",
        "application/x-subrip",
        "application/text",
        "text/plain; charset=utf-8",
    ]
    key = {v: i for i, v in enumerate(order)}
    ranked = []
    for _, t, url in links:
        ranked.append((key.get(t, 999), t, url))
    ranked.sort()
    return [u for _, _, u in ranked]

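# Ranking illustration (made-up item): with transcripts typed "application/srt"
# and "text/vtt", the order table ranks "text/vtt" (index 2) ahead of
# "application/srt" (index 3), so the VTT URL is returned first; types not in
# the table fall to rank 999 and are tried last.
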
def get_enclosure(item) -> Optional[str]:
    enc = item.find("enclosure")
    if enc is not None and enc.attrib.get("url"):
        return enc.attrib["url"]
    mc = item.find("media:content", NS)
    if mc is not None and mc.attrib.get("url"):
        return mc.attrib["url"]
    return None

def parse_pubdate(item) -> datetime:
    # Try common fields
    candidates = [
        item.findtext("pubDate"),
        item.findtext("dc:date", namespaces=NS),
        item.findtext("{http://purl.org/dc/elements/1.1/}date"),
    ]
    for pd in filter(None, candidates):
        s = pd.strip()
        # Try several common formats
        for fmt in [
            "%a, %d %b %Y %H:%M:%S %z",
            "%a, %d %b %Y %H:%M:%S",
            "%Y-%m-%dT%H:%M:%S%z",
            "%Y-%m-%dT%H:%M:%S",
        ]:
            try:
                return datetime.strptime(s, fmt)
            except Exception:
                pass
    return datetime.utcnow()

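# Examples (hypothetical feed values): "Wed, 05 Jun 2024 08:30:00 +0000" matches
# the first, RFC-2822-style format; an ISO stamp like "2024-06-05T08:30:00"
# matches a later pattern; anything unparseable falls back to datetime.utcnow().
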
def save_bytes(path: Path, data: bytes) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + ".part")
    tmp.write_bytes(data)
    tmp.replace(path)

def decide_audio_ext(audio_url: str) -> str:
    p = urlparse(audio_url)
    low = p.path.lower()
    if low.endswith(".m4a"):
        return ".m4a"
    if low.endswith(".mp3"):
        return ".mp3"
    if low.endswith((".ogg", ".oga")):
        return ".ogg"
    if low.endswith(".aac"):
        return ".aac"
    if low.endswith(".wav"):
        return ".wav"
    return ".mp3"

def item_key(item) -> str:
    # Prefer GUID value, else enclosure URL, else title+date
    guid = item.findtext("guid")
    if guid:
        return guid.strip()
    enc = get_enclosure(item)
    if enc:
        return enc
    title = item.findtext("title") or "Episode"
    pub = parse_pubdate(item).strftime("%Y%m%d")
    return f"{pub}:{title}"

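# Dedup-key illustration (made-up items): an item with <guid>ep-42</guid> keys on
# "ep-42"; lacking a GUID, the enclosure URL is used; lacking both, a date+title
# key such as "20240605:Episode Title" serves as a last resort.
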
# ----------------- core ingest -----------------

def ingest_feed(url: str, state: Dict) -> int:
    fstate = state.setdefault("feeds", {}).setdefault(url, {})
    etag = fstate.get("etag")
    mod = fstate.get("modified")

    log.info(f"Fetch RSS: {url}")
    try:
        data, status, new_etag, new_mod = fetch(url, etag=etag, modified=mod, as_text=True)
    except Exception as e:
        log.error(f"Fetch failed: {e}")
        return 0

    if status == 304:
        log.info("Not modified")
        return 0

    if new_etag:
        fstate["etag"] = new_etag
    if new_mod:
        fstate["modified"] = new_mod

    try:
        root = ET.fromstring(data)
    except Exception as e:
        log.error(f"XML parse error: {e}")
        return 0

    channel_title = safe(root.findtext("channel/title") or "Podcast")

    new_items = 0
    for item in root.findall("channel/item"):
        key = item_key(item)
        already = state.setdefault("items", {})
        if already.get(key):
            continue

        title = safe(item.findtext("title") or "Episode")
        pub = parse_pubdate(item)
        date_prefix = pub.strftime("%Y%m%d")
        base = f"{date_prefix} - {title}"

        audio_url = get_enclosure(item)
        if not audio_url:
            log.info(f"Skip (no enclosure): {title}")
            already[key] = {"skipped": "no_enclosure"}
            continue

        # HEAD size guard (optional)
        if AUDIO_MAX_MB > 0:
            size = head_size(audio_url)
            if size and size > AUDIO_MAX_MB * 1024 * 1024:
                log.info(f"Skip (size>{AUDIO_MAX_MB}MB): {title}")
                already[key] = {"skipped": "too_large", "size": size}
                continue

        path_ext = decide_audio_ext(audio_url)
        audio_out = LIBRARY_ROOT / channel_title / f"{base}{path_ext}"
        transcript_links = best_transcript_links(item)

        # If audio exists and a transcript sidecar exists → just enqueue index
        sidecars = list((TRANSCRIPT_ROOT / channel_title).glob(f"{base}.*"))
        have_transcript = len(sidecars) > 0
        if audio_out.exists() and have_transcript:
            log.info(f"Skip download, enqueue index (have audio+transcript): {audio_out.name}")
            try:
                q.enqueue("worker.handle_local_file", str(audio_out), job_timeout=4 * 3600, result_ttl=86400, failure_ttl=86400)
            except Exception as e:
                log.warning(f"Enqueue failed: {e}")
            already[key] = {"done": True, "audio": str(audio_out)}
            new_items += 1
            continue

        # Download audio
        try:
            log.info(f"Downloading audio → {audio_out}")
            content, _, _, _ = fetch(audio_url, as_text=False)
            save_bytes(audio_out, content)
        except Exception as e:
            log.warning(f"Audio failed: {e}")
            already[key] = {"error": f"audio:{e}"}
            continue

        # Download transcript if present (take first best)
        transcript_out = None
        for turl in transcript_links:
            try:
                ext = ".vtt" if "vtt" in turl.lower() else ".srt" if "srt" in turl.lower() else ".txt"
                tout = TRANSCRIPT_ROOT / channel_title / f"{base}{ext}"
                log.info(f"Downloading transcript → {tout} ({turl})")
                tdata, _, _, _ = fetch(turl, as_text=False)
                save_bytes(tout, tdata)
                transcript_out = tout
                break
            except Exception as e:
                log.warning(f"Transcript fetch failed ({turl}): {e}")
                continue

        # Enqueue for indexing/transcription
        try:
            q.enqueue("worker.handle_local_file", str(audio_out), job_timeout=4 * 3600, result_ttl=86400, failure_ttl=86400)
        except Exception as e:
            log.warning(f"Enqueue failed: {e}")

        already[key] = {"done": True, "audio": str(audio_out), "transcript": str(transcript_out) if transcript_out else None}
        new_items += 1

    return new_items

# ----------------- main loop -----------------

def main():
    while True:
        feeds = load_feeds()
        if not feeds:
            log.error("No RSS feeds configured. Set RSS_FEEDS or create feeds.txt.")
            sys.exit(1)

        state = load_state()
        total_new = 0
        for url in feeds:
            try:
                added = ingest_feed(url, state)
                total_new += added
                save_state(state)
            except Exception as e:
                log.error(f"Feed error: {url} -> {e}")
        log.info(f"Cycle complete. New items: {total_new}")
        if RSS_ONCE:
            break
        time.sleep(RSS_SCAN_MINUTES * 60)


if __name__ == "__main__":
    main()
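
For a quick smoke test outside Docker, the ingester can be run once against a throwaway library (the URL and paths below are placeholders, and Redis at REDIS_URL must be reachable for enqueueing to succeed):

    RSS_ONCE=1 RSS_FEEDS=https://example.com/feed.xml \
    LIBRARY_ROOT=/tmp/library TRANSCRIPT_ROOT=/tmp/transcripts \
    python app/rss_ingest.py

With RSS_ONCE=1 the main loop performs a single cycle and exits instead of sleeping for RSS_SCAN_MINUTES.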

app/worker.py (modified)

@@ -39,6 +39,78 @@ def log(feed):
 def sanitize(name):
     return re.sub(r'[\\/:"*?<>|]+', ' ', name).strip()
 
+
+def find_sidecar_transcript(media_path: Path) -> Path | None:
+    """Return a .txt/.srt/.vtt transcript file sitting next to media, if any.
+    Tries common variants including language-suffixed SRT/VTT.
+    """
+    candidates: list[Path] = []
+    # exact same stem in same folder
+    for ext in [".txt", ".srt", ".vtt"]:
+        p = media_path.parent / (media_path.stem + ext)
+        if p.exists():
+            candidates.append(p)
+    # language-suffixed near the media file (e.g., .en.srt)
+    for ext in [".srt", ".vtt"]:
+        p = media_path.with_suffix(f".en{ext}")
+        if p.exists() and p not in candidates:
+            candidates.append(p)
+    return candidates[0] if candidates else None
+
+
+def transcript_text_from_file(path: Path) -> str:
+    """Extract plain text from .txt/.srt/.vtt by stripping timestamps and counters."""
+    try:
+        raw = path.read_text(encoding="utf-8", errors="ignore")
+    except Exception:
+        raw = path.read_text(errors="ignore")
+
+    if path.suffix.lower() == ".txt":
+        return raw.strip()
+
+    # For SRT/VTT, drop timestamp lines, cue numbers and headers
+    lines: list[str] = []
+    for line in raw.splitlines():
+        ls = line.strip()
+        if not ls:
+            continue
+        if "-->" in ls:  # timestamp line
+            continue
+        if ls.upper().startswith("WEBVTT"):
+            continue
+        if re.match(r"^\d+$", ls):  # cue index
+            continue
+        lines.append(ls)
+    return " ".join(lines)
+
+
+def ensure_sidecar_next_to_media(sidecar: Path, media_path: Path, lang: str = "en") -> None:
+    """Ensure a `.lang.srt` sits next to the media for Plex. Convert VTT→SRT if needed."""
+    try:
+        if sidecar.suffix.lower() == ".srt":
+            dst = media_path.with_suffix(f".{lang}.srt")
+            shutil.copy2(sidecar, dst)
+        elif sidecar.suffix.lower() == ".vtt":
+            tmp_srt = sidecar.with_suffix(".srt")
+            subprocess.run(["ffmpeg", "-nostdin", "-y", "-i", str(sidecar), str(tmp_srt)], check=True)
+            dst = media_path.with_suffix(f".{lang}.srt")
+            shutil.move(str(tmp_srt), dst)
+    except Exception as e:
+        print(f"[post] sidecar copy/convert failed: {e}", flush=True)
+
+
+def write_plain_transcript(media_path: Path, text: str, language: str = "en") -> Path:
+    """Write minimal transcript artifacts (.txt + .json) from plain text (no timestamps)."""
+    title = media_path.stem
+    base = TRN / title
+    base.parent.mkdir(parents=True, exist_ok=True)
+    (base.with_suffix(".txt")).write_text(text, encoding="utf-8")
+    (base.with_suffix(".json")).write_bytes(orjson.dumps({
+        "file": str(media_path),
+        "language": language,
+        "segments": [{"start": 0.0, "end": 0.0, "text": text}]
+    }))
+    return base
+
 def yt_dlp(url, outdir):
     # 1) Normalize YouTube Music URLs to standard YouTube
     yurl = url
@@ -316,6 +388,7 @@ def publish_to_openwebui(paths):
 
 def handle_local_file(path_str: str):
     """Transcribe & index a local media file that already exists in /library.
+    If a sidecar .txt/.srt/.vtt exists, use it instead of running Whisper.
     Safe to call repeatedly; it skips if transcript JSON already exists.
     """
     try:
@@ -323,13 +396,30 @@ def handle_local_file(path_str: str):
         if not p.exists():
             log({"url": path_str, "status": "error", "error": "file_not_found"})
             return
 
         title = p.stem
         base_json = TRN / f"{title}.json"
         if base_json.exists():
             log({"url": path_str, "status": "skip", "reason": "already_transcribed"})
             return
-        info = {"url": path_str, "status": "transcribing", "title": title, "uploader": p.parent.name, "date": "", "path": str(p), "progress": 0}
+
+        info = {"url": path_str, "status": "transcribing", "title": title,
+                "uploader": p.parent.name, "date": "", "path": str(p), "progress": 0}
         log(info)
 
+        # 1) Prefer an existing transcript sidecar if present
+        sidecar = find_sidecar_transcript(p)
+        if sidecar:
+            plain = transcript_text_from_file(sidecar)
+            lang = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
+            base = write_plain_transcript(p, plain, language=lang)
+            ensure_sidecar_next_to_media(sidecar, p, lang=lang)
+            index_meili(base.with_suffix(".json"))
+            publish_to_openwebui([base.with_suffix(".txt")])
+            log({**info, **{"status": "done", "note": "used_existing_transcript"}})
+            return
+
+        # 2) Otherwise, run transcription
         base = transcribe(p)
         index_meili(base.with_suffix(".json"))
         publish_to_openwebui([base.with_suffix(".txt")])
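
To see what the sidecar filtering in `transcript_text_from_file` above produces, here is a self-contained sketch of the same logic applied to a made-up SRT snippet (input text is illustrative only):

    import re

    raw = "1\n00:00:01,000 --> 00:00:03,000\nHello there.\n\n2\n00:00:04,000 --> 00:00:06,000\nWelcome back.\n"
    lines = []
    for line in raw.splitlines():
        ls = line.strip()
        # same filters as the worker hunk: blanks, '-->' timestamp lines, bare cue numbers
        if not ls or "-->" in ls or re.match(r"^\d+$", ls):
            continue
        lines.append(ls)
    print(" ".join(lines))  # -> Hello there. Welcome back.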

docker-compose.yml (modified)

@@ -41,6 +41,10 @@ services:
       WHISPER_MODEL: large-v3
       WHISPER_PRECISION: int8
       PYTHONPATH: /app
+      JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
+      JOB_TTL: ${JOB_TTL:-86400}
+      RESULT_TTL: ${RESULT_TTL:-86400}
+      FAILURE_TTL: ${FAILURE_TTL:-86400}
     volumes:
       - ${LIBRARY_HOST_DIR:-./library}:/library
       - ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts
@@ -86,7 +90,7 @@ services:
       # - COOKIE_FILE=/config/cookies.txt
       # Optional: yt-dlp options (JSON). Example enables Android client fallback
       # - YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}}}
-      - YTDL_OPTIONS={"extract_flat":"in_playlist","concurrent_fragment_downloads":1}
+      - YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}},"extract_flat":"in_playlist","concurrent_fragment_downloads":1}
     volumes:
       - ${LIBRARY_HOST_DIR:-./library}:/downloads
       # Optional cookies file on host → /config/cookies.txt inside container
@@ -115,3 +119,32 @@ services:
     healthcheck:
       test: ["CMD-SHELL", "exit 0"]
     restart: unless-stopped
+
+  podx-rss:
+    build: ./app
+    container_name: podx-rss
+    command: ["python", "rss_ingest.py"]
+    env_file: [.env]
+    environment:
+      MEILI_URL: http://meili:7700
+      REDIS_URL: redis://redis:6379/0
+      LIBRARY_ROOT: /library
+      TRANSCRIPT_ROOT: /transcripts
+      FEEDS_FILE: /library/feeds.txt
+      RSS_STATE_FILE: /library/.rss_state.json
+      RSS_SCAN_MINUTES: ${RSS_SCAN_MINUTES:-120}
+      RSS_CONNECT_TIMEOUT: ${RSS_CONNECT_TIMEOUT:-15}
+      RSS_READ_TIMEOUT: ${RSS_READ_TIMEOUT:-60}
+      AUDIO_MAX_MB: ${AUDIO_MAX_MB:-4096}
+      USER_AGENT: ${USER_AGENT:-podx-rss/1.0 (+local-archive)}
+      RSS_ONCE: ${RSS_ONCE:-0}
+    volumes:
+      - ${LIBRARY_HOST_DIR:-./library}:/library
+      - ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts
+    depends_on: [redis]
+    healthcheck:
+      test: ["CMD-SHELL", "python - <<'PY'\nimport os,sys; p=os.getenv('FEEDS_FILE',''); sys.exit(0 if (p and os.path.exists(p)) else 1)\nPY"]
+      interval: 60s
+      timeout: 5s
+      retries: 3
+    restart: unless-stopped
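
The podx-rss healthcheck above packs a Python heredoc into a single YAML string; unrolled for readability, the equivalent script is:

    import os
    import sys

    # healthy only when FEEDS_FILE is set and the file exists inside the container
    p = os.getenv("FEEDS_FILE", "")
    sys.exit(0 if (p and os.path.exists(p)) else 1)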

feeds.txt.example (new file): +6
@@ -0,0 +1,6 @@
feeds.txt
---------
# Apple Podcasts / Omny show
https://www.omnycontent.com/d/playlist/....../podcast.rss
# Another RSS
https://example.com/feed.xml