#!/usr/bin/env python3
"""
RSS ingester for PodX

- Reads feeds from env var RSS_FEEDS (comma-separated) *and*/or from a file
  (FEEDS_FILE, default /library/feeds.txt)
- Fetches RSS with ETag/Last-Modified caching to avoid re-downloading unchanged feeds
- Saves audio to LIBRARY_ROOT/<channel title>/<YYYYMMDD - episode title>.<ext>
- Saves transcript sidecars when <podcast:transcript> links are present
  (prefers TextWithTimestamps → WebVTT → SRT → TXT)
- Enqueues `worker.handle_local_file` for indexing/transcription
  (worker will skip Whisper if a transcript sidecar exists)
- Keeps a small state JSON with per-feed ETag/Last-Modified and per-item processed
  GUIDs to avoid duplicate work

Environment variables (with sane defaults):
  MEILI_URL            (unused directly here, but kept for parity)
  REDIS_URL            redis://redis:6379/0
  LIBRARY_ROOT         /library
  TRANSCRIPT_ROOT      /transcripts
  RSS_FEEDS            "" (comma-separated list)
  FEEDS_FILE           /library/feeds.txt
  RSS_SCAN_MINUTES     120
  RSS_ONCE             0 ("1" to run once and exit)
  USER_AGENT           podx-rss/1.0 (+local-archive)
  RSS_STATE_FILE       /library/.rss_state.json
  RSS_CONNECT_TIMEOUT  15 (seconds)
  RSS_READ_TIMEOUT     60 (seconds)
  AUDIO_MAX_MB         4096 (skip download if a HEAD request reveals size > max; 0 = unlimited)
"""
import os
import re
import sys
import json
import time
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from urllib.parse import urlparse

import requests
import xml.etree.ElementTree as ET
import redis
from rq import Queue

logging.basicConfig(level=logging.INFO, format='[rss] %(message)s')
log = logging.getLogger("rss")

# Config
MEILI_URL = os.getenv("MEILI_URL", "http://meili:7700")
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
LIBRARY_ROOT = Path(os.getenv("LIBRARY_ROOT", "/library"))
TRANSCRIPT_ROOT = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
RSS_FEEDS_ENV = [s.strip() for s in os.getenv("RSS_FEEDS", "").split(",") if s.strip()]
FEEDS_FILE = Path(os.getenv("FEEDS_FILE", str(LIBRARY_ROOT / "feeds.txt")))
RSS_SCAN_MINUTES = int(os.getenv("RSS_SCAN_MINUTES", "120"))
RSS_ONCE = os.getenv("RSS_ONCE", "0") == "1"
USER_AGENT = os.getenv("USER_AGENT", "podx-rss/1.0 (+local-archive)")
STATE_FILE = Path(os.getenv("RSS_STATE_FILE", str(LIBRARY_ROOT / ".rss_state.json")))
CONNECT_TIMEOUT = float(os.getenv("RSS_CONNECT_TIMEOUT", "15"))
READ_TIMEOUT = float(os.getenv("RSS_READ_TIMEOUT", "60"))
AUDIO_MAX_MB = int(os.getenv("AUDIO_MAX_MB", "4096"))

# Redis queue
r = redis.from_url(REDIS_URL)
q = Queue("default", connection=r)

# HTTP session
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": USER_AGENT})

# Namespaces commonly used in podcast RSS
NS = {
    "podcast": "https://podcastindex.org/namespace/1.0",
    "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
    "media": "http://search.yahoo.com/mrss/",
    "content": "http://purl.org/rss/1.0/modules/content/",
    "dc": "http://purl.org/dc/elements/1.1/",
}

# ----------------- helpers -----------------

def safe(s: str) -> str:
    # replace characters that are unsafe in filenames
    s = re.sub(r"[\\/:*?\"<>|]", "_", s)
    # collapse whitespace and trim
    s = re.sub(r"\s+", " ", s).strip()
    # guard against very long filenames
    return s[:200] if len(s) > 200 else s


def load_state() -> Dict:
    if STATE_FILE.exists():
        try:
            return json.loads(STATE_FILE.read_text("utf-8"))
        except Exception:
            log.warning("State file unreadable, starting fresh")
    return {"feeds": {}, "items": {}}  # items keyed by GUID / enclosure URL


def save_state(state: Dict) -> None:
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    tmp = STATE_FILE.with_suffix(STATE_FILE.suffix + ".tmp")
    tmp.write_text(json.dumps(state, ensure_ascii=False, indent=2))
    tmp.replace(STATE_FILE)
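
# A minimal sketch of the shape the state JSON above ends up with (values here are
# illustrative only; the real content is whatever load_state()/save_state() and
# ingest_feed() read and write):
#
#   {
#     "feeds": {
#       "https://example.com/feed.xml": {"etag": "\"abc123\"", "modified": "Wed, 01 Jan 2025 00:00:00 GMT"}
#     },
#     "items": {
#       "some-guid-or-enclosure-url": {
#         "done": true,
#         "audio": "/library/Some Show/20250101 - Episode Title.mp3",
#         "transcript": "/transcripts/Some Show/20250101 - Episode Title.vtt"
#       }
#     }
#   }
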

def load_feeds() -> List[str]:
    feeds = list(RSS_FEEDS_ENV)
    if FEEDS_FILE.exists():
        try:
            for line in FEEDS_FILE.read_text("utf-8").splitlines():
                s = line.strip()
                if not s or s.startswith("#"):  # allow comments
                    continue
                feeds.append(s)
        except Exception as e:
            log.warning(f"Failed to read {FEEDS_FILE}: {e}")
    # de-dup preserving order
    seen = set()
    uniq = []
    for f in feeds:
        if f not in seen:
            uniq.append(f)
            seen.add(f)
    return uniq


def fetch(url: str, *, etag: Optional[str] = None, modified: Optional[str] = None, as_text=False):
    headers = {}
    if etag:
        headers["If-None-Match"] = etag
    if modified:
        headers["If-Modified-Since"] = modified
    resp = SESSION.get(url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
    if resp.status_code == 304:
        return None, 304, None, None
    resp.raise_for_status()
    new_etag = resp.headers.get("ETag")
    new_mod = resp.headers.get("Last-Modified")
    return (resp.text if as_text else resp.content), resp.status_code, new_etag, new_mod


def head_size(url: str) -> Optional[int]:
    try:
        h = SESSION.head(url, allow_redirects=True, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
        if h.ok:
            cl = h.headers.get("Content-Length")
            return int(cl) if cl and cl.isdigit() else None
    except Exception:
        return None
    return None


def best_transcript_links(item) -> List[str]:
    links: List[Tuple[int, str, str]] = []
    # Try explicit QName first
    for tag in item.findall(".//{https://podcastindex.org/namespace/1.0}transcript"):
        t = (tag.attrib.get("type") or "").lower()
        url = tag.attrib.get("url")
        if url:
            links.append((0, t, url))
    # Namespace-prefixed fallback
    for tag in item.findall(".//podcast:transcript", NS):
        t = (tag.attrib.get("type") or "").lower()
        url = tag.attrib.get("url")
        if url:
            links.append((0, t, url))
    order = [
        "text/plain",  # often used for TextWithTimestamps
        "application/json",
        "text/vtt",
        "application/srt",
        "application/x-subrip",
        "application/text",
        "text/plain; charset=utf-8",
    ]
    key = {v: i for i, v in enumerate(order)}
    ranked = []
    for _, t, url in links:
        ranked.append((key.get(t, 999), t, url))
    ranked.sort()
    return [u for _, _, u in ranked]


def get_enclosure(item) -> Optional[str]:
    enc = item.find("enclosure")
    if enc is not None and enc.attrib.get("url"):
        return enc.attrib["url"]
    mc = item.find("media:content", NS)
    if mc is not None and mc.attrib.get("url"):
        return mc.attrib["url"]
    return None


def parse_pubdate(item) -> datetime:
    # Try common fields
    candidates = [
        item.findtext("pubDate"),
        item.findtext("dc:date", namespaces=NS),
        item.findtext("{http://purl.org/dc/elements/1.1/}date"),
    ]
    for pd in filter(None, candidates):
        s = pd.strip()
        # Try several common formats
        for fmt in [
            "%a, %d %b %Y %H:%M:%S %z",
            "%a, %d %b %Y %H:%M:%S",
            "%Y-%m-%dT%H:%M:%S%z",
            "%Y-%m-%dT%H:%M:%S",
        ]:
            try:
                return datetime.strptime(s, fmt)
            except Exception:
                pass
    return datetime.utcnow()


def save_bytes(path: Path, data: bytes) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + ".part")
    tmp.write_bytes(data)
    tmp.replace(path)


def decide_audio_ext(audio_url: str) -> str:
    p = urlparse(audio_url)
    low = p.path.lower()
    if low.endswith(".m4a"):
        return ".m4a"
    if low.endswith(".mp3"):
        return ".mp3"
    if low.endswith(".ogg") or low.endswith(".oga"):
        return ".ogg"
    if low.endswith(".aac"):
        return ".aac"
    if low.endswith(".wav"):
        return ".wav"
    return ".mp3"
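
# For reference, a hypothetical RSS <item> showing the elements the helpers above
# look for: get_enclosure() reads <enclosure>/<media:content>, best_transcript_links()
# reads <podcast:transcript>, parse_pubdate() reads <pubDate>. URLs and types are
# placeholders, not taken from any real feed:
#
#   <item>
#     <title>Episode Title</title>
#     <guid>some-guid</guid>
#     <pubDate>Wed, 01 Jan 2025 00:00:00 +0000</pubDate>
#     <enclosure url="https://example.com/ep1.mp3" type="audio/mpeg" length="12345678"/>
#     <podcast:transcript url="https://example.com/ep1.vtt" type="text/vtt"/>
#   </item>
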

def item_key(item) -> str:
    # Prefer GUID value, else enclosure URL, else title+date
    guid = item.findtext("guid")
    if guid:
        return guid.strip()
    enc = get_enclosure(item)
    if enc:
        return enc
    title = item.findtext("title") or "Episode"
    pub = parse_pubdate(item).strftime("%Y%m%d")
    return f"{pub}:{title}"

# ----------------- core ingest -----------------

def ingest_feed(url: str, state: Dict) -> int:
    fstate = state.setdefault("feeds", {}).setdefault(url, {})
    etag = fstate.get("etag")
    mod = fstate.get("modified")
    log.info(f"Fetch RSS: {url}")
    try:
        data, status, new_etag, new_mod = fetch(url, etag=etag, modified=mod, as_text=True)
    except Exception as e:
        log.error(f"Fetch failed: {e}")
        return 0
    if status == 304:
        log.info("Not modified")
        return 0
    if new_etag:
        fstate["etag"] = new_etag
    if new_mod:
        fstate["modified"] = new_mod

    try:
        root = ET.fromstring(data)
    except Exception as e:
        log.error(f"XML parse error: {e}")
        return 0

    channel_title = safe(root.findtext("channel/title") or "Podcast")
    new_items = 0

    for item in root.findall("channel/item"):
        key = item_key(item)
        already = state.setdefault("items", {})
        if already.get(key):
            continue

        title = safe(item.findtext("title") or "Episode")
        pub = parse_pubdate(item)
        date_prefix = pub.strftime("%Y%m%d")
        base = f"{date_prefix} - {title}"

        audio_url = get_enclosure(item)
        if not audio_url:
            log.info(f"Skip (no enclosure): {title}")
            already[key] = {"skipped": "no_enclosure"}
            continue

        # HEAD size guard (optional)
        if AUDIO_MAX_MB > 0:
            size = head_size(audio_url)
            if size and size > AUDIO_MAX_MB * 1024 * 1024:
                log.info(f"Skip (size>{AUDIO_MAX_MB}MB): {title}")
                already[key] = {"skipped": "too_large", "size": size}
                continue

        path_ext = decide_audio_ext(audio_url)
        audio_out = LIBRARY_ROOT / channel_title / f"{base}{path_ext}"
        transcript_links = best_transcript_links(item)

        # If audio exists and a transcript sidecar exists → just enqueue index
        sidecars = list((TRANSCRIPT_ROOT / channel_title).glob(f"{base}.*"))
        have_transcript = len(sidecars) > 0
        if audio_out.exists() and have_transcript:
            log.info(f"Skip download, enqueue index (have audio+transcript): {audio_out.name}")
            try:
                q.enqueue("worker.handle_local_file", str(audio_out),
                          job_timeout=4 * 3600, result_ttl=86400, failure_ttl=86400)
            except Exception as e:
                log.warning(f"Enqueue failed: {e}")
            already[key] = {"done": True, "audio": str(audio_out)}
            new_items += 1
            continue

        # Download audio
        try:
            log.info(f"Downloading audio → {audio_out}")
            content, _, _, _ = fetch(audio_url, as_text=False)
            save_bytes(audio_out, content)
        except Exception as e:
            log.warning(f"Audio failed: {e}")
            already[key] = {"error": f"audio:{e}"}
            continue

        # Download transcript if present (take first best)
        transcript_out = None
        for turl in transcript_links:
            try:
                ext = ".vtt" if "vtt" in turl.lower() else ".srt" if "srt" in turl.lower() else ".txt"
                tout = TRANSCRIPT_ROOT / channel_title / f"{base}{ext}"
                log.info(f"Downloading transcript → {tout} ({turl})")
                tdata, _, _, _ = fetch(turl, as_text=False)
                save_bytes(tout, tdata)
                transcript_out = tout
                break
            except Exception as e:
                log.warning(f"Transcript fetch failed ({turl}): {e}")
                continue

        # Enqueue for indexing/transcription
        try:
            q.enqueue("worker.handle_local_file", str(audio_out),
                      job_timeout=4 * 3600, result_ttl=86400, failure_ttl=86400)
        except Exception as e:
            log.warning(f"Enqueue failed: {e}")

        already[key] = {
            "done": True,
            "audio": str(audio_out),
            "transcript": str(transcript_out) if transcript_out else None,
        }
        new_items += 1

    return new_items
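
# An illustrative feeds.txt for the main loop below, matching what load_feeds()
# accepts (one URL per line; blank lines and lines starting with "#" are skipped).
# The URLs are placeholders:
#
#   # news shows
#   https://example.com/show-a/feed.xml
#   https://example.org/show-b/rss
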

# ----------------- main loop -----------------

def main():
    while True:
        feeds = load_feeds()
        if not feeds:
            log.error("No RSS feeds configured. Set RSS_FEEDS or create feeds.txt.")
            sys.exit(1)
        state = load_state()
        total_new = 0
        for url in feeds:
            try:
                added = ingest_feed(url, state)
                total_new += added
                save_state(state)
            except Exception as e:
                log.error(f"Feed error: {url} -> {e}")
        log.info(f"Cycle complete. New items: {total_new}")
        if RSS_ONCE:
            break
        time.sleep(RSS_SCAN_MINUTES * 60)


if __name__ == "__main__":
    main()