Bug fixes

2025-09-08 09:44:03 +02:00
parent 13b4ebf63f
commit d4f4a93acf


@@ -7,11 +7,14 @@ import xml.etree.ElementTree as ET
 # ---- Config ----
 TRN = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
 OUT_INDEX = Path(os.getenv("RSS_INDEX_PATH", str(TRN / "rss_index.json")))
-FEEDS_FILE = Path(os.getenv("RSS_FEEDS_FILE", "/app/config/feeds.txt"))
+FEEDS_FILE = Path(os.getenv("RSS_FEEDS_FILE", "/library/feeds.txt"))
 FEEDS_ENV = os.getenv("RSS_FEEDS", "").strip()
 TIMEOUT = int(os.getenv("RSS_HTTP_TIMEOUT", "30"))
 DOWNLOAD_TRANSCRIPTS = os.getenv("RSS_DOWNLOAD_TRANSCRIPTS", "true").lower() in {"1", "true", "yes", "y"}
 DEFAULT_LANG = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
+RSS_SCAN_MINUTES = int(os.getenv("RSS_SCAN_MINUTES", "15"))
+RSS_ONCE = os.getenv("RSS_ONCE", "0").lower() in {"1", "true", "yes", "y"}
+AUDIO_MAX_MB = int(os.getenv("RSS_AUDIO_MAX_MB", "0"))  # 0 = unlimited
 # Where media files live; used to sidecar RSS transcripts next to matching media
 LIB = Path(os.getenv("LIBRARY_ROOT", "/library"))
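The truthy-flag idiom appears twice in this config block (DOWNLOAD_TRANSCRIPTS and the new RSS_ONCE). A minimal sketch of how it could be factored out; _env_flag is a hypothetical helper, not part of this commit:

    import os

    def _env_flag(name: str, default: str = "0") -> bool:
        # Truthy iff the value is one of "1", "true", "yes", "y" (any case).
        return os.getenv(name, default).lower() in {"1", "true", "yes", "y"}

    RSS_ONCE = _env_flag("RSS_ONCE")                                      # off by default
    DOWNLOAD_TRANSCRIPTS = _env_flag("RSS_DOWNLOAD_TRANSCRIPTS", "true")  # on by default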
@@ -130,6 +133,58 @@ def _guess_ext_from_type(mime: str) -> str:
return ".txt"
def _guess_audio_ext(mime: str, url: str) -> str:
# Prefer by MIME; fall back to URL suffix
mime = (mime or "").lower()
if "mp3" in mime:
return ".mp3"
if "aac" in mime or "mp4a" in mime:
return ".m4a"
if "m4a" in mime:
return ".m4a"
if "ogg" in mime:
return ".ogg"
if "opus" in mime:
return ".opus"
if "flac" in mime:
return ".flac"
if "wav" in mime:
return ".wav"
# fallback by URL
suf = Path(urlparse(url).path).suffix.lower()
if suf in {".mp3", ".m4a", ".aac", ".ogg", ".opus", ".flac", ".wav"}:
return ".m4a" if suf == ".aac" else suf
return ".mp3"
def _download_stream(url: str, dst: Path) -> Path | None:
try:
dst.parent.mkdir(parents=True, exist_ok=True)
with requests.get(url, timeout=TIMEOUT, headers={"User-Agent": "podx/1.0"}, stream=True) as r:
r.raise_for_status()
max_bytes = AUDIO_MAX_MB * 1024 * 1024 if AUDIO_MAX_MB > 0 else None
total = 0
with open(dst, "wb") as f:
for chunk in r.iter_content(chunk_size=1024 * 256):
if not chunk:
continue
f.write(chunk)
total += len(chunk)
if max_bytes and total > max_bytes:
# stop and remove partial
try:
f.close()
except Exception:
pass
try:
dst.unlink(missing_ok=True)
except Exception:
pass
return None
return dst
except Exception:
return None
def _norm_text(s: str) -> str:
s = (s or "").lower()
s = re.sub(r"\s+", " ", s)
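A quick sanity check of the new extension helper, assuming it is imported from this module; the URLs are illustrative. Note that _download_stream returns None both on HTTP errors and when the AUDIO_MAX_MB cap is exceeded, so callers cannot tell the two apart:

    print(_guess_audio_ext("audio/mpeg", "https://cdn.example.com/ep1"))  # .mp3 (by MIME)
    print(_guess_audio_ext("", "https://cdn.example.com/ep2.aac"))        # .m4a (aac remapped)
    print(_guess_audio_ext("text/plain", "https://cdn.example.com/ep3"))  # .mp3 (final fallback)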
@@ -224,12 +279,15 @@ def _gather_transcripts(item: ET.Element):
 def parse_feed(feed_url: str):
     items = []
     try:
+        print(f"[rss] fetching {feed_url}", flush=True)
         r = requests.get(feed_url, timeout=TIMEOUT, headers={"User-Agent": "podx/1.0"})
         r.raise_for_status()
         root = ET.fromstring(r.content)
         channel = root.find("channel") or root
         show_title = _text(_find_ns(channel, "title")) or _text(_find_ns(root, "title"))
+        if not show_title:
+            show_title = ""
         for it in _iter_items(channel):
             title = _text(_find_ns(it, "title"))
@@ -320,12 +378,15 @@ def parse_feed(feed_url: str):
             items.append(item_rec)
+        print(f"[rss] parsed {len(items)} episode(s) from {show_title or feed_url}", flush=True)
         return {"feed_url": feed_url, "show": show_title, "episodes": items}
     except Exception as e:
+        print(f"[rss] ERROR parsing {feed_url}: {e}", flush=True)
         return {"feed_url": feed_url, "error": str(e), "episodes": []}

 def load_feeds_list():
+    print(f"[rss] FEEDS_FILE={FEEDS_FILE} FEEDS_ENV={'set' if bool(FEEDS_ENV) else 'unset'}", flush=True)
     feeds = []
     if FEEDS_ENV:
         feeds.extend([u.strip() for u in FEEDS_ENV.split(",") if u.strip()])
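parse_feed never raises: on failure it returns a record with an "error" key and an empty episode list, so callers can branch on that key. A hedged caller sketch with an illustrative URL:

    feed = parse_feed("https://example.com/podcast.xml")
    if feed.get("error"):
        print(f"skipping {feed['feed_url']}: {feed['error']}")
    else:
        print(f"{feed['show'] or feed['feed_url']}: {len(feed['episodes'])} episode(s)")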
@@ -338,8 +399,12 @@ def load_feeds_list():
                 feeds.append(line)
         except Exception:
             pass
+    else:
+        print(f"[rss] feeds file not found: {FEEDS_FILE}", flush=True)
     # de-duplicate (keep first occurrence), then sort
-    return sorted(list(dict.fromkeys(feeds)))
+    feeds = sorted(list(dict.fromkeys(feeds)))
+    print(f"[rss] parsed {len(feeds)} feed URL(s)", flush=True)
+    return feeds

 def build_index():
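Worth noting: the merged list is sorted, not kept in insertion order — dict.fromkeys de-duplicates (first occurrence wins) and sorted() then orders the result. The idiom in isolation, with illustrative URLs:

    feeds = ["https://b.example/feed.xml", "https://a.example/feed.xml", "https://b.example/feed.xml"]
    print(sorted(dict.fromkeys(feeds)))
    # ['https://a.example/feed.xml', 'https://b.example/feed.xml']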
@@ -354,4 +419,11 @@ def build_index():
 if __name__ == "__main__":
-    build_index()
+    while True:
+        try:
+            build_index()
+        except Exception as e:
+            print(f"[rss] build error: {e}", flush=True)
+        if RSS_ONCE:
+            break
+        time.sleep(max(1, RSS_SCAN_MINUTES) * 60)
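The loop is plain scheduling around build_index(): RSS_ONCE=1 makes it a single pass (handy under cron), and max(1, RSS_SCAN_MINUTES) guarantees at least a one-minute sleep even if RSS_SCAN_MINUTES is set to 0. A one-shot pass is also possible by importing the module; the name rss_index is hypothetical, since the file name is not shown in this diff:

    import rss_index         # hypothetical module name
    rss_index.build_index()  # one indexing pass; the __main__ loop is not entered on import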