Bug fixes
@@ -7,11 +7,14 @@ import xml.etree.ElementTree as ET
 # ---- Config ----
 TRN = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
 OUT_INDEX = Path(os.getenv("RSS_INDEX_PATH", str(TRN / "rss_index.json")))
-FEEDS_FILE = Path(os.getenv("RSS_FEEDS_FILE", "/app/config/feeds.txt"))
+FEEDS_FILE = Path(os.getenv("RSS_FEEDS_FILE", "/library/feeds.txt"))
 FEEDS_ENV = os.getenv("RSS_FEEDS", "").strip()
 TIMEOUT = int(os.getenv("RSS_HTTP_TIMEOUT", "30"))
 DOWNLOAD_TRANSCRIPTS = os.getenv("RSS_DOWNLOAD_TRANSCRIPTS", "true").lower() in {"1", "true", "yes", "y"}
 DEFAULT_LANG = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
+RSS_SCAN_MINUTES = int(os.getenv("RSS_SCAN_MINUTES", "15"))
+RSS_ONCE = os.getenv("RSS_ONCE", "0").lower() in {"1", "true", "yes", "y"}
+AUDIO_MAX_MB = int(os.getenv("RSS_AUDIO_MAX_MB", "0"))  # 0 = unlimited
 
 # Where media files live; used to sidecar RSS transcripts next to matching media
 LIB = Path(os.getenv("LIBRARY_ROOT", "/library"))
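The three new settings above feed the scan loop and the audio size cap added later in this commit. As a quick standalone sketch of how those values are interpreted (same parsing conventions as the config block; the variable names here are only illustrative):

import os

# Boolean flags use the same truthy set as above: "1", "true", "yes", "y" (case-insensitive).
once = os.getenv("RSS_ONCE", "0").lower() in {"1", "true", "yes", "y"}

# RSS_AUDIO_MAX_MB of 0 means "no cap"; _download_stream translates it to max_bytes = None.
audio_max_mb = int(os.getenv("RSS_AUDIO_MAX_MB", "0"))
max_bytes = audio_max_mb * 1024 * 1024 if audio_max_mb > 0 else None
print(once, max_bytes)  # with nothing set: False None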
@@ -130,6 +133,58 @@ def _guess_ext_from_type(mime: str) -> str:
     return ".txt"
 
 
+def _guess_audio_ext(mime: str, url: str) -> str:
+    # Prefer by MIME; fall back to URL suffix
+    mime = (mime or "").lower()
+    if "mp3" in mime:
+        return ".mp3"
+    if "aac" in mime or "mp4a" in mime:
+        return ".m4a"
+    if "m4a" in mime:
+        return ".m4a"
+    if "ogg" in mime:
+        return ".ogg"
+    if "opus" in mime:
+        return ".opus"
+    if "flac" in mime:
+        return ".flac"
+    if "wav" in mime:
+        return ".wav"
+    # fallback by URL
+    suf = Path(urlparse(url).path).suffix.lower()
+    if suf in {".mp3", ".m4a", ".aac", ".ogg", ".opus", ".flac", ".wav"}:
+        return ".m4a" if suf == ".aac" else suf
+    return ".mp3"
+
+def _download_stream(url: str, dst: Path) -> Path | None:
+    try:
+        dst.parent.mkdir(parents=True, exist_ok=True)
+        with requests.get(url, timeout=TIMEOUT, headers={"User-Agent": "podx/1.0"}, stream=True) as r:
+            r.raise_for_status()
+            max_bytes = AUDIO_MAX_MB * 1024 * 1024 if AUDIO_MAX_MB > 0 else None
+            total = 0
+            with open(dst, "wb") as f:
+                for chunk in r.iter_content(chunk_size=1024 * 256):
+                    if not chunk:
+                        continue
+                    f.write(chunk)
+                    total += len(chunk)
+                    if max_bytes and total > max_bytes:
+                        # stop and remove partial
+                        try:
+                            f.close()
+                        except Exception:
+                            pass
+                        try:
+                            dst.unlink(missing_ok=True)
+                        except Exception:
+                            pass
+                        return None
+        return dst
+    except Exception:
+        return None
+
+
 def _norm_text(s: str) -> str:
     s = (s or "").lower()
     s = re.sub(r"\s+", " ", s)
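For orientation, a rough sketch of how the two new helpers are meant to compose when saving an enclosure; the URL, MIME type, and call site below are hypothetical, not taken from elsewhere in this file:

# Hypothetical caller pairing the new helpers (illustrative only).
enclosure_url = "https://example.com/episodes/42/audio.mp3?token=abc"
enclosure_type = "audio/mpeg"

ext = _guess_audio_ext(enclosure_type, enclosure_url)   # ".mp3" via the URL-suffix fallback, since "audio/mpeg" does not contain "mp3"
dst = LIB / "Some Show" / f"episode-42{ext}"            # assumed destination under LIBRARY_ROOT
saved = _download_stream(enclosure_url, dst)            # Path on success; None on error or when RSS_AUDIO_MAX_MB is exceeded
if saved is None:
    print("[rss] enclosure skipped (download failed or exceeded size cap)")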
@@ -224,12 +279,15 @@ def _gather_transcripts(item: ET.Element):
 def parse_feed(feed_url: str):
     items = []
     try:
+        print(f"[rss] fetching {feed_url}", flush=True)
         r = requests.get(feed_url, timeout=TIMEOUT, headers={"User-Agent": "podx/1.0"})
         r.raise_for_status()
         root = ET.fromstring(r.content)
 
         channel = root.find("channel") or root
         show_title = _text(_find_ns(channel, "title")) or _text(_find_ns(root, "title"))
+        if not show_title:
+            show_title = ""
 
         for it in _iter_items(channel):
             title = _text(_find_ns(it, "title"))
@@ -320,12 +378,15 @@ def parse_feed(feed_url: str):
 
             items.append(item_rec)
 
+        print(f"[rss] parsed {len(items)} episode(s) from {show_title or feed_url}", flush=True)
         return {"feed_url": feed_url, "show": show_title, "episodes": items}
     except Exception as e:
+        print(f"[rss] ERROR parsing {feed_url}: {e}", flush=True)
         return {"feed_url": feed_url, "error": str(e), "episodes": []}
 
 
 def load_feeds_list():
+    print(f"[rss] FEEDS_FILE={FEEDS_FILE} FEEDS_ENV={'set' if bool(FEEDS_ENV) else 'unset'}", flush=True)
     feeds = []
     if FEEDS_ENV:
         feeds.extend([u.strip() for u in FEEDS_ENV.split(",") if u.strip()])
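Both return paths of parse_feed now emit a log line; the returned shape is unchanged. A short illustrative check of how a caller can tell them apart (the feed URL is made up):

result = parse_feed("https://example.net/possibly-broken-feed.xml")
if result.get("error"):
    # parse failure: "show" is absent and "episodes" is an empty list
    print(f"skipping {result['feed_url']}: {result['error']}")
else:
    print(f"{result['show']}: {len(result['episodes'])} episode(s)")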
@@ -338,8 +399,12 @@ def load_feeds_list():
                 feeds.append(line)
         except Exception:
             pass
+    else:
+        print(f"[rss] feeds file not found: {FEEDS_FILE}", flush=True)
     # unique, keep order
-    return sorted(list(dict.fromkeys(feeds)))
+    feeds = sorted(list(dict.fromkeys(feeds)))
+    print(f"[rss] parsed {len(feeds)} feed URL(s)", flush=True)
+    return feeds
 
 
 def build_index():
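A small standalone check of the dedupe-then-sort step above: dict.fromkeys drops duplicates while keeping first-seen order, and the final sorted() returns the URLs alphabetically. The sample URLs are made up:

feeds = [
    "https://example.org/feed.xml",
    "https://example.com/rss",
    "https://example.org/feed.xml",  # duplicate, dropped by dict.fromkeys
]
print(sorted(list(dict.fromkeys(feeds))))
# ['https://example.com/rss', 'https://example.org/feed.xml']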
@@ -354,4 +419,11 @@ def build_index():
 
 
 if __name__ == "__main__":
-    build_index()
+    while True:
+        try:
+            build_index()
+        except Exception as e:
+            print(f"[rss] build error: {e}", flush=True)
+        if RSS_ONCE:
+            break
+        time.sleep(max(1, RSS_SCAN_MINUTES) * 60)
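The reworked entry point scans on an interval instead of running once: each pass calls build_index(), errors are logged rather than fatal, RSS_ONCE turns the loop into a single pass, and the sleep never drops below one minute. A tiny sketch of the interval math, with example values:

# Illustrative only: the loop sleeps max(1, RSS_SCAN_MINUTES) * 60 seconds between scans.
for scan_minutes in (0, 15):
    print(scan_minutes, "->", max(1, scan_minutes) * 60, "seconds")
# 0 -> 60 seconds (floor of one minute even if misconfigured)
# 15 -> 900 seconds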