Bug fixes

2025-09-08 09:44:03 +02:00
parent 13b4ebf63f
commit d4f4a93acf


@@ -7,11 +7,14 @@ import xml.etree.ElementTree as ET
 # ---- Config ----
 TRN = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
 OUT_INDEX = Path(os.getenv("RSS_INDEX_PATH", str(TRN / "rss_index.json")))
-FEEDS_FILE = Path(os.getenv("RSS_FEEDS_FILE", "/app/config/feeds.txt"))
+FEEDS_FILE = Path(os.getenv("RSS_FEEDS_FILE", "/library/feeds.txt"))
 FEEDS_ENV = os.getenv("RSS_FEEDS", "").strip()
 TIMEOUT = int(os.getenv("RSS_HTTP_TIMEOUT", "30"))
 DOWNLOAD_TRANSCRIPTS = os.getenv("RSS_DOWNLOAD_TRANSCRIPTS", "true").lower() in {"1", "true", "yes", "y"}
 DEFAULT_LANG = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
+RSS_SCAN_MINUTES = int(os.getenv("RSS_SCAN_MINUTES", "15"))
+RSS_ONCE = os.getenv("RSS_ONCE", "0").lower() in {"1", "true", "yes", "y"}
+AUDIO_MAX_MB = int(os.getenv("RSS_AUDIO_MAX_MB", "0"))  # 0 = unlimited
 # Where media files live; used to sidecar RSS transcripts next to matching media
 LIB = Path(os.getenv("LIBRARY_ROOT", "/library"))
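The truthy-flag idiom appears twice in this config block (DOWNLOAD_TRANSCRIPTS and the new RSS_ONCE). A minimal sketch of how it could be factored out; _env_flag is a hypothetical helper, not part of this commit:

    import os

    def _env_flag(name: str, default: str = "0") -> bool:
        # Truthy iff the value is one of "1", "true", "yes", "y" (any case).
        return os.getenv(name, default).lower() in {"1", "true", "yes", "y"}

    RSS_ONCE = _env_flag("RSS_ONCE")                                      # off by default
    DOWNLOAD_TRANSCRIPTS = _env_flag("RSS_DOWNLOAD_TRANSCRIPTS", "true")  # on by default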
@@ -130,6 +133,58 @@ def _guess_ext_from_type(mime: str) -> str:
return ".txt"
def _guess_audio_ext(mime: str, url: str) -> str:
# Prefer by MIME; fall back to URL suffix
mime = (mime or "").lower()
if "mp3" in mime:
return ".mp3"
if "aac" in mime or "mp4a" in mime:
return ".m4a"
if "m4a" in mime:
return ".m4a"
if "ogg" in mime:
return ".ogg"
if "opus" in mime:
return ".opus"
if "flac" in mime:
return ".flac"
if "wav" in mime:
return ".wav"
# fallback by URL
suf = Path(urlparse(url).path).suffix.lower()
if suf in {".mp3", ".m4a", ".aac", ".ogg", ".opus", ".flac", ".wav"}:
return ".m4a" if suf == ".aac" else suf
return ".mp3"
def _download_stream(url: str, dst: Path) -> Path | None:
try:
dst.parent.mkdir(parents=True, exist_ok=True)
with requests.get(url, timeout=TIMEOUT, headers={"User-Agent": "podx/1.0"}, stream=True) as r:
r.raise_for_status()
max_bytes = AUDIO_MAX_MB * 1024 * 1024 if AUDIO_MAX_MB > 0 else None
total = 0
with open(dst, "wb") as f:
for chunk in r.iter_content(chunk_size=1024 * 256):
if not chunk:
continue
f.write(chunk)
total += len(chunk)
if max_bytes and total > max_bytes:
# stop and remove partial
try:
f.close()
except Exception:
pass
try:
dst.unlink(missing_ok=True)
except Exception:
pass
return None
return dst
except Exception:
return None
def _norm_text(s: str) -> str:
s = (s or "").lower()
s = re.sub(r"\s+", " ", s)
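A quick sanity check of the new extension helper, assuming it is imported from this module; the URLs are illustrative. Note that _download_stream returns None both on HTTP errors and when the AUDIO_MAX_MB cap is exceeded, so callers cannot tell the two apart:

    print(_guess_audio_ext("audio/mpeg", "https://cdn.example.com/ep1"))  # .mp3 (by MIME)
    print(_guess_audio_ext("", "https://cdn.example.com/ep2.aac"))        # .m4a (aac remapped)
    print(_guess_audio_ext("text/plain", "https://cdn.example.com/ep3"))  # .mp3 (final fallback)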
@@ -224,12 +279,15 @@ def _gather_transcripts(item: ET.Element):
 def parse_feed(feed_url: str):
     items = []
     try:
+        print(f"[rss] fetching {feed_url}", flush=True)
         r = requests.get(feed_url, timeout=TIMEOUT, headers={"User-Agent": "podx/1.0"})
         r.raise_for_status()
         root = ET.fromstring(r.content)
         channel = root.find("channel") or root
         show_title = _text(_find_ns(channel, "title")) or _text(_find_ns(root, "title"))
+        if not show_title:
+            show_title = ""
         for it in _iter_items(channel):
             title = _text(_find_ns(it, "title"))
@@ -320,12 +378,15 @@ def parse_feed(feed_url: str):
             items.append(item_rec)
+        print(f"[rss] parsed {len(items)} episode(s) from {show_title or feed_url}", flush=True)
         return {"feed_url": feed_url, "show": show_title, "episodes": items}
     except Exception as e:
+        print(f"[rss] ERROR parsing {feed_url}: {e}", flush=True)
         return {"feed_url": feed_url, "error": str(e), "episodes": []}

 def load_feeds_list():
+    print(f"[rss] FEEDS_FILE={FEEDS_FILE} FEEDS_ENV={'set' if bool(FEEDS_ENV) else 'unset'}", flush=True)
     feeds = []
     if FEEDS_ENV:
         feeds.extend([u.strip() for u in FEEDS_ENV.split(",") if u.strip()])
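parse_feed never raises: on failure it returns a record with an "error" key and an empty episode list, so callers can branch on that key. A hedged caller sketch with an illustrative URL:

    feed = parse_feed("https://example.com/podcast.xml")
    if feed.get("error"):
        print(f"skipping {feed['feed_url']}: {feed['error']}")
    else:
        print(f"{feed['show'] or feed['feed_url']}: {len(feed['episodes'])} episode(s)")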
@@ -338,8 +399,12 @@ def load_feeds_list():
                 feeds.append(line)
         except Exception:
             pass
+    else:
+        print(f"[rss] feeds file not found: {FEEDS_FILE}", flush=True)
     # de-duplicate (keep first occurrence), then sort
-    return sorted(list(dict.fromkeys(feeds)))
+    feeds = sorted(list(dict.fromkeys(feeds)))
+    print(f"[rss] parsed {len(feeds)} feed URL(s)", flush=True)
+    return feeds

 def build_index():
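Worth noting: the merged list is sorted, not kept in insertion order — dict.fromkeys de-duplicates (first occurrence wins) and sorted() then orders the result. The idiom in isolation, with illustrative URLs:

    feeds = ["https://b.example/feed.xml", "https://a.example/feed.xml", "https://b.example/feed.xml"]
    print(sorted(dict.fromkeys(feeds)))
    # ['https://a.example/feed.xml', 'https://b.example/feed.xml']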
@@ -354,4 +419,11 @@ def build_index():
 if __name__ == "__main__":
-    build_index()
+    while True:
+        try:
+            build_index()
+        except Exception as e:
+            print(f"[rss] build error: {e}", flush=True)
+        if RSS_ONCE:
+            break
+        time.sleep(max(1, RSS_SCAN_MINUTES) * 60)
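The loop is plain scheduling around build_index(): RSS_ONCE=1 makes it a single pass (handy under cron), and max(1, RSS_SCAN_MINUTES) guarantees at least a one-minute sleep even if RSS_SCAN_MINUTES is set to 0. A one-shot pass is also possible by importing the module; the name rss_index is hypothetical, since the file name is not shown in this diff:

    import rss_index         # hypothetical module name
    rss_index.build_index()  # one indexing pass; the __main__ loop is not entered on import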