diff --git a/app/rss_ingest.py b/app/rss_ingest.py
index c4043dc..bb3e999 100644
--- a/app/rss_ingest.py
+++ b/app/rss_ingest.py
@@ -7,11 +7,14 @@ import xml.etree.ElementTree as ET
 # ---- Config ----
 TRN = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
 OUT_INDEX = Path(os.getenv("RSS_INDEX_PATH", str(TRN / "rss_index.json")))
-FEEDS_FILE = Path(os.getenv("RSS_FEEDS_FILE", "/app/config/feeds.txt"))
+FEEDS_FILE = Path(os.getenv("RSS_FEEDS_FILE", "/library/feeds.txt"))
 FEEDS_ENV = os.getenv("RSS_FEEDS", "").strip()
 TIMEOUT = int(os.getenv("RSS_HTTP_TIMEOUT", "30"))
 DOWNLOAD_TRANSCRIPTS = os.getenv("RSS_DOWNLOAD_TRANSCRIPTS", "true").lower() in {"1", "true", "yes", "y"}
 DEFAULT_LANG = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
+RSS_SCAN_MINUTES = int(os.getenv("RSS_SCAN_MINUTES", "15"))  # pause between scans when looping
+RSS_ONCE = os.getenv("RSS_ONCE", "0").lower() in {"1", "true", "yes", "y"}  # scan once, then exit
+AUDIO_MAX_MB = int(os.getenv("RSS_AUDIO_MAX_MB", "0"))  # 0 = unlimited
 
 # Where media files live; used to sidecar RSS transcripts next to matching media
 LIB = Path(os.getenv("LIBRARY_ROOT", "/library"))
@@ -130,6 +133,72 @@ def _guess_ext_from_type(mime: str) -> str:
     return ".txt"
 
 
+def _guess_audio_ext(mime: str, url: str) -> str:
+    """Return a file extension (with leading dot) for an audio enclosure.
+
+    The MIME type is preferred; the suffix of the URL path is the fallback and
+    ``.mp3`` is the last-resort default. AAC is reported as ``.m4a``.
+    """
+    mime = (mime or "").lower()
+    essence = mime.split(";", 1)[0].strip()
+    # "audio/mpeg" is the registered MP3 type but does not contain "mp3".
+    if "mp3" in mime or essence in {"audio/mpeg", "audio/mpeg3", "audio/x-mpeg"}:
+        return ".mp3"
+    # "audio/mp4" is matched exactly so that video/mp4 is not treated as audio.
+    if essence == "audio/mp4" or any(tag in mime for tag in ("aac", "mp4a", "m4a")):
+        return ".m4a"
+    # Order matters: "audio/ogg; codecs=opus" must keep resolving to .ogg.
+    for tag in ("ogg", "opus", "flac", "wav"):
+        if tag in mime:
+            return f".{tag}"
+    # Fall back to the suffix of the URL path (the query string is not part of it).
+    suf = Path(urlparse(url or "").path).suffix.lower()
+    if suf in {".mp3", ".m4a", ".aac", ".ogg", ".opus", ".flac", ".wav"}:
+        return ".m4a" if suf == ".aac" else suf
+    return ".mp3"
+
+
+def _download_stream(url: str, dst: Path) -> Path | None:
+    """Stream *url* to *dst*; return *dst* on success or ``None`` on any failure.
+
+    Data goes to a sibling ``NAME.part`` file that is renamed onto *dst* only
+    after the whole body has arrived, so a dropped connection or the
+    ``RSS_AUDIO_MAX_MB`` cap never leaves a truncated file at *dst*.
+    """
+    max_bytes = AUDIO_MAX_MB * 1024 * 1024 if AUDIO_MAX_MB > 0 else None
+    tmp = dst.with_name(dst.name + ".part")
+    try:
+        dst.parent.mkdir(parents=True, exist_ok=True)
+        with requests.get(url, timeout=TIMEOUT, headers={"User-Agent": "podx/1.0"}, stream=True) as r:
+            r.raise_for_status()
+            # Skip the transfer entirely when the server announces an oversized body.
+            declared = r.headers.get("Content-Length", "")
+            if max_bytes and declared.isdigit() and int(declared) > max_bytes:
+                print(f"[rss] skipping {url}: {declared} bytes exceeds RSS_AUDIO_MAX_MB", flush=True)
+                return None
+            total = 0
+            with open(tmp, "wb") as f:
+                for chunk in r.iter_content(chunk_size=1024 * 256):
+                    if not chunk:
+                        continue
+                    total += len(chunk)
+                    if max_bytes and total > max_bytes:
+                        print(f"[rss] aborting {url}: larger than RSS_AUDIO_MAX_MB", flush=True)
+                        return None
+                    f.write(chunk)
+        tmp.replace(dst)
+        return dst
+    except Exception as e:
+        print(f"[rss] download failed for {url}: {e}", flush=True)
+        return None
+    finally:
+        # No-op after a successful rename; otherwise removes the partial file.
+        try:
+            tmp.unlink(missing_ok=True)
+        except OSError:
+            pass
+
+
 def _norm_text(s: str) -> str:
     s = (s or "").lower()
     s = re.sub(r"\s+", " ", s)
@@ -224,12 +293,15 @@ def _gather_transcripts(item: ET.Element):
 def parse_feed(feed_url: str):
     items = []
     try:
+        print(f"[rss] fetching {feed_url}", flush=True)
         r = requests.get(feed_url, timeout=TIMEOUT, headers={"User-Agent": "podx/1.0"})
         r.raise_for_status()
         root = ET.fromstring(r.content)
 
         channel = root.find("channel") or root
         show_title = _text(_find_ns(channel, "title")) or _text(_find_ns(root, "title"))
+        if not show_title:
+            show_title = ""
 
         for it in _iter_items(channel):
             title = _text(_find_ns(it, "title"))
@@ -320,12 +392,15 @@ def parse_feed(feed_url: str):
 
             items.append(item_rec)
 
+        print(f"[rss] parsed {len(items)} episode(s) from {show_title or feed_url}", flush=True)
         return {"feed_url": feed_url, "show": show_title, "episodes": items}
     except Exception as e:
+        print(f"[rss] ERROR parsing {feed_url}: {e}", flush=True)
         return {"feed_url": feed_url, "error": str(e), "episodes": []}
 
 
 def load_feeds_list():
+    print(f"[rss] FEEDS_FILE={FEEDS_FILE} FEEDS_ENV={'set' if bool(FEEDS_ENV) else 'unset'}", flush=True)
     feeds = []
     if FEEDS_ENV:
         feeds.extend([u.strip() for u in FEEDS_ENV.split(",") if u.strip()])
@@ -338,8 +413,12 @@ def load_feeds_list():
                 feeds.append(line)
         except Exception:
             pass
+    else:
+        print(f"[rss] feeds file not found: {FEEDS_FILE}", flush=True)
-    # unique, keep order
-    return sorted(list(dict.fromkeys(feeds)))
+    # de-duplicate, then sort so every scan visits the feeds in the same order
+    feeds = sorted(dict.fromkeys(feeds))
+    print(f"[rss] parsed {len(feeds)} feed URL(s)", flush=True)
+    return feeds
 
 
 def build_index():
@@ -354,4 +433,17 @@ def build_index():
 
 
 if __name__ == "__main__":
-    build_index()
\ No newline at end of file
+    # NOTE(review): imported here because the module's import block is outside
+    # this patch and may not provide `time`; harmless if it already does.
+    import time
+
+    # Rescan forever unless RSS_ONCE is set; a failed build must not end the loop.
+    while True:
+        try:
+            build_index()
+        except Exception as e:
+            print(f"[rss] build error: {e}", flush=True)
+        if RSS_ONCE:
+            break
+        # Clamp to at least one minute so a zero/negative setting cannot busy-loop.
+        time.sleep(max(1, RSS_SCAN_MINUTES) * 60)