Add RSS feed downloading

2025-09-07 15:30:22 +02:00
parent bcd874ecf8
commit e6582e9a6b
5 changed files with 522 additions and 2 deletions

.gitignore (3 changes)

@@ -1,6 +1,9 @@
# Local env and secrets
.env
# RSS Feeds
feeds.txt
# Runtime data
data/
models/

app/rss_ingest.py (new file, 388 lines)

@@ -0,0 +1,388 @@
#!/usr/bin/env python3
"""
RSS ingester for PodX
- Reads feeds from the env var RSS_FEEDS (comma-separated) and/or from a file (FEEDS_FILE, default /library/feeds.txt)
- Fetches RSS with ETag/Last-Modified caching to avoid re-downloading unchanged feeds
- Saves audio to LIBRARY_ROOT/<podcast>/<YYYYMMDD - title>.<ext>
- Saves transcript sidecars when `<podcast:transcript>` links are present (prefers TextWithTimestamps → WebVTT → SRT → TXT)
- Enqueues `worker.handle_local_file` for indexing/transcription (worker will skip Whisper if a transcript sidecar exists)
- Keeps a small state JSON with per-feed ETag/Last-Modified and per-item processed GUIDs to avoid duplicate work
Environment variables (with sane defaults):
MEILI_URL (unused directly here, but kept for parity)
REDIS_URL redis://redis:6379/0
LIBRARY_ROOT /library
TRANSCRIPT_ROOT /transcripts
RSS_FEEDS "" (comma-separated list)
FEEDS_FILE /library/feeds.txt
RSS_SCAN_MINUTES 120
RSS_ONCE 0 ("1" to run once and exit)
USER_AGENT podx-rss/1.0 (+local-archive)
RSS_STATE_FILE /library/.rss_state.json
RSS_CONNECT_TIMEOUT 15 (seconds)
RSS_READ_TIMEOUT 60 (seconds)
AUDIO_MAX_MB 4096 (skip an episode if a HEAD request reports a larger size; 0 = unlimited)
"""
import os
import re
import sys
import json
import time
import logging
import itertools
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from urllib.parse import urlparse
import requests
import xml.etree.ElementTree as ET
import redis
from rq import Queue
logging.basicConfig(level=logging.INFO, format='[rss] %(message)s')
log = logging.getLogger("rss")
# Config
MEILI_URL = os.getenv("MEILI_URL", "http://meili:7700")
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
LIBRARY_ROOT = Path(os.getenv("LIBRARY_ROOT", "/library"))
TRANSCRIPT_ROOT = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
RSS_FEEDS_ENV = [s.strip() for s in os.getenv("RSS_FEEDS", "").split(",") if s.strip()]
FEEDS_FILE = Path(os.getenv("FEEDS_FILE", str(LIBRARY_ROOT / "feeds.txt")))
RSS_SCAN_MINUTES = int(os.getenv("RSS_SCAN_MINUTES", "120"))
RSS_ONCE = os.getenv("RSS_ONCE", "0") == "1"
USER_AGENT = os.getenv("USER_AGENT", "podx-rss/1.0 (+local-archive)")
STATE_FILE = Path(os.getenv("RSS_STATE_FILE", str(LIBRARY_ROOT / ".rss_state.json")))
CONNECT_TIMEOUT = float(os.getenv("RSS_CONNECT_TIMEOUT", "15"))
READ_TIMEOUT = float(os.getenv("RSS_READ_TIMEOUT", "60"))
AUDIO_MAX_MB = int(os.getenv("AUDIO_MAX_MB", "4096"))
# Redis queue
r = redis.from_url(REDIS_URL)
q = Queue("default", connection=r)
# HTTP session
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": USER_AGENT})
# Namespaces commonly used in podcast RSS
NS = {
"podcast": "https://podcastindex.org/namespace/1.0",
"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
"media": "http://search.yahoo.com/mrss/",
"content": "http://purl.org/rss/1.0/modules/content/",
"dc": "http://purl.org/dc/elements/1.1/",
}
# ----------------- helpers -----------------
def safe(s: str) -> str:
s = re.sub(r"[\\/:*?\"<>|]", "_", s)
# collapse whitespace and trim
s = re.sub(r"\s+", " ", s).strip()
# guard against very long filenames
return s[:200] if len(s) > 200 else s
def load_state() -> Dict:
if STATE_FILE.exists():
try:
return json.loads(STATE_FILE.read_text("utf-8"))
except Exception:
log.warning("State file unreadable, starting fresh")
return {"feeds": {}, "items": {}} # items keyed by GUID / enclosure URL
def save_state(state: Dict) -> None:
STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
tmp = STATE_FILE.with_suffix(STATE_FILE.suffix + ".tmp")
    tmp.write_text(json.dumps(state, ensure_ascii=False, indent=2), encoding="utf-8")
tmp.replace(STATE_FILE)
def load_feeds() -> List[str]:
feeds = list(RSS_FEEDS_ENV)
if FEEDS_FILE.exists():
try:
for line in FEEDS_FILE.read_text("utf-8").splitlines():
s = line.strip()
if not s or s.startswith("#"): # allow comments
continue
feeds.append(s)
except Exception as e:
log.warning(f"Failed to read {FEEDS_FILE}: {e}")
# de-dup preserving order
seen = set()
uniq = []
for f in feeds:
if f not in seen:
uniq.append(f)
seen.add(f)
return uniq
def fetch(url: str, *, etag: Optional[str]=None, modified: Optional[str]=None, as_text=False):
headers = {}
if etag:
headers["If-None-Match"] = etag
if modified:
headers["If-Modified-Since"] = modified
resp = SESSION.get(url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
if resp.status_code == 304:
return None, 304, None, None
resp.raise_for_status()
new_etag = resp.headers.get("ETag")
new_mod = resp.headers.get("Last-Modified")
return (resp.text if as_text else resp.content), resp.status_code, new_etag, new_mod
def head_size(url: str) -> Optional[int]:
try:
h = SESSION.head(url, allow_redirects=True, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
if h.ok:
cl = h.headers.get("Content-Length")
return int(cl) if cl and cl.isdigit() else None
except Exception:
return None
return None
def best_transcript_links(item) -> List[str]:
    links: List[Tuple[int, str, str]] = []
    seen_urls = set()
    # The explicit Clark-notation lookup and the prefixed lookup resolve to the
    # same elements, so iterate over both and de-duplicate by URL.
    for tag in itertools.chain(
        item.findall(".//{https://podcastindex.org/namespace/1.0}transcript"),
        item.findall(".//podcast:transcript", NS),
    ):
        t = (tag.attrib.get("type") or "").lower()
        url = tag.attrib.get("url")
        if url and url not in seen_urls:
            seen_urls.add(url)
            links.append((0, t, url))
order = [
"text/plain", # often used for TextWithTimestamps
"application/json",
"text/vtt",
"application/srt",
"application/x-subrip",
"application/text",
"text/plain; charset=utf-8",
]
key = {v: i for i, v in enumerate(order)}
ranked = []
for _, t, url in links:
ranked.append((key.get(t, 999), t, url))
ranked.sort()
return [u for _, _, u in ranked]
def get_enclosure(item) -> Optional[str]:
enc = item.find("enclosure")
if enc is not None and enc.attrib.get("url"):
return enc.attrib["url"]
mc = item.find("media:content", NS)
if mc is not None and mc.attrib.get("url"):
return mc.attrib["url"]
return None
def parse_pubdate(item) -> datetime:
# Try common fields
candidates = [
item.findtext("pubDate"),
item.findtext("dc:date", namespaces=NS),
item.findtext("{http://purl.org/dc/elements/1.1/}date"),
]
for pd in filter(None, candidates):
s = pd.strip()
        # Try several common formats; RFC 822 dates often end in "GMT"/"UTC",
        # which %z does not accept, so also try %Z.
        for fmt in [
            "%a, %d %b %Y %H:%M:%S %z",
            "%a, %d %b %Y %H:%M:%S %Z",
            "%a, %d %b %Y %H:%M:%S",
            "%Y-%m-%dT%H:%M:%S%z",
            "%Y-%m-%dT%H:%M:%S",
        ]:
try:
return datetime.strptime(s, fmt)
except Exception:
pass
return datetime.utcnow()
def save_bytes(path: Path, data: bytes) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
tmp = path.with_suffix(path.suffix + ".part")
tmp.write_bytes(data)
tmp.replace(path)
def decide_audio_ext(audio_url: str) -> str:
p = urlparse(audio_url)
low = p.path.lower()
if low.endswith(".m4a"):
return ".m4a"
if low.endswith(".mp3"):
return ".mp3"
if low.endswith(".ogg") or low.endswith(".oga"):
return ".ogg"
if low.endswith(".aac"):
return ".aac"
if low.endswith(".wav"):
return ".wav"
return ".mp3"
def item_key(item) -> str:
# Prefer GUID value, else enclosure URL, else title+date
guid = item.findtext("guid")
if guid:
return guid.strip()
enc = get_enclosure(item)
if enc:
return enc
title = item.findtext("title") or "Episode"
pub = parse_pubdate(item).strftime("%Y%m%d")
return f"{pub}:{title}"
# ----------------- core ingest -----------------
def ingest_feed(url: str, state: Dict) -> int:
fstate = state.setdefault("feeds", {}).setdefault(url, {})
etag = fstate.get("etag")
mod = fstate.get("modified")
log.info(f"Fetch RSS: {url}")
try:
data, status, new_etag, new_mod = fetch(url, etag=etag, modified=mod, as_text=True)
except Exception as e:
log.error(f"Fetch failed: {e}")
return 0
if status == 304:
log.info("Not modified")
return 0
if new_etag:
fstate["etag"] = new_etag
if new_mod:
fstate["modified"] = new_mod
try:
root = ET.fromstring(data)
except Exception as e:
log.error(f"XML parse error: {e}")
return 0
channel_title = safe((root.findtext("channel/title") or "Podcast"))
new_items = 0
for item in root.findall("channel/item"):
key = item_key(item)
already = state.setdefault("items", {})
if already.get(key):
continue
title = safe(item.findtext("title") or "Episode")
pub = parse_pubdate(item)
date_prefix = pub.strftime("%Y%m%d")
base = f"{date_prefix} - {title}"
audio_url = get_enclosure(item)
if not audio_url:
log.info(f"Skip (no enclosure): {title}")
already[key] = {"skipped": "no_enclosure"}
continue
# HEAD size guard (optional)
if AUDIO_MAX_MB > 0:
size = head_size(audio_url)
if size and size > AUDIO_MAX_MB * 1024 * 1024:
log.info(f"Skip (size>{AUDIO_MAX_MB}MB): {title}")
already[key] = {"skipped": "too_large", "size": size}
continue
path_ext = decide_audio_ext(audio_url)
audio_out = LIBRARY_ROOT / channel_title / f"{base}{path_ext}"
transcript_links = best_transcript_links(item)
# If audio exists and a transcript sidecar exists → just enqueue index
sidecars = list((TRANSCRIPT_ROOT / channel_title).glob(f"{base}.*"))
have_transcript = len(sidecars) > 0
if audio_out.exists() and have_transcript:
log.info(f"Skip download, enqueue index (have audio+transcript): {audio_out.name}")
try:
q.enqueue("worker.handle_local_file", str(audio_out), job_timeout=4*3600, result_ttl=86400, failure_ttl=86400)
except Exception as e:
log.warning(f"Enqueue failed: {e}")
already[key] = {"done": True, "audio": str(audio_out)}
new_items += 1
continue
# Download audio
try:
log.info(f"Downloading audio → {audio_out}")
content, _, _, _ = fetch(audio_url, as_text=False)
save_bytes(audio_out, content)
except Exception as e:
log.warning(f"Audio failed: {e}")
already[key] = {"error": f"audio:{e}"}
continue
# Download transcript if present (take first best)
transcript_out = None
for turl in transcript_links:
try:
ext = ".vtt" if "vtt" in turl.lower() else ".srt" if "srt" in turl.lower() else ".txt"
tout = TRANSCRIPT_ROOT / channel_title / f"{base}{ext}"
log.info(f"Downloading transcript → {tout} ({turl})")
tdata, _, _, _ = fetch(turl, as_text=False)
save_bytes(tout, tdata)
transcript_out = tout
break
except Exception as e:
log.warning(f"Transcript fetch failed ({turl}): {e}")
continue
# Enqueue for indexing/transcription
try:
q.enqueue("worker.handle_local_file", str(audio_out), job_timeout=4*3600, result_ttl=86400, failure_ttl=86400)
except Exception as e:
log.warning(f"Enqueue failed: {e}")
already[key] = {"done": True, "audio": str(audio_out), "transcript": str(transcript_out) if transcript_out else None}
new_items += 1
return new_items
# ----------------- main loop -----------------
def main():
while True:
feeds = load_feeds()
if not feeds:
log.error("No RSS feeds configured. Set RSS_FEEDS or create feeds.txt.")
sys.exit(1)
state = load_state()
total_new = 0
for url in feeds:
try:
added = ingest_feed(url, state)
total_new += added
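                # saving after each feed keeps state current if a later feed fails mid-cycle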
save_state(state)
except Exception as e:
log.error(f"Feed error: {url} -> {e}")
log.info(f"Cycle complete. New items: {total_new}")
if RSS_ONCE:
break
time.sleep(RSS_SCAN_MINUTES * 60)
if __name__ == "__main__":
main()
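
For a quick sanity check outside Docker, the ingester can be driven for a single pass. A minimal sketch (assumptions: a Redis instance reachable on localhost, writable scratch directories, and an illustrative feed URL; none of this is part of the commit):

import os
# rss_ingest reads its configuration at import time, so set the environment first.
os.environ["RSS_FEEDS"] = "https://example.com/feed.xml"
os.environ["RSS_ONCE"] = "1"
os.environ["LIBRARY_ROOT"] = "/tmp/podx-library"
os.environ["TRANSCRIPT_ROOT"] = "/tmp/podx-transcripts"
os.environ["REDIS_URL"] = "redis://localhost:6379/0"

import rss_ingest
rss_ingest.main()  # one scan cycle, then returns because RSS_ONCE=1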

View File

@@ -39,6 +39,78 @@ def log(feed):
def sanitize(name):
return re.sub(r'[\\/:"*?<>|]+', ' ', name).strip()
def find_sidecar_transcript(media_path: Path) -> Path | None:
"""Return a .txt/.srt/.vtt transcript file sitting next to media, if any.
Tries common variants including language-suffixed SRT/VTT.
"""
candidates: list[Path] = []
# exact same stem in same folder
for ext in [".txt", ".srt", ".vtt"]:
p = media_path.parent / (media_path.stem + ext)
if p.exists():
candidates.append(p)
# language-suffixed near the media file (e.g., .en.srt)
for ext in [".srt", ".vtt"]:
p = media_path.with_suffix(f".en{ext}")
if p.exists() and p not in candidates:
candidates.append(p)
return candidates[0] if candidates else None
def transcript_text_from_file(path: Path) -> str:
"""Extract plain text from .txt/.srt/.vtt by stripping timestamps and counters."""
try:
raw = path.read_text(encoding="utf-8", errors="ignore")
except Exception:
raw = path.read_text(errors="ignore")
if path.suffix.lower() == ".txt":
return raw.strip()
# For SRT/VTT, drop timestamp lines, cue numbers and headers
lines: list[str] = []
for line in raw.splitlines():
ls = line.strip()
if not ls:
continue
if "-->" in ls: # timestamp line
continue
if ls.upper().startswith("WEBVTT"):
continue
if re.match(r"^\d+$", ls): # cue index
continue
lines.append(ls)
return " ".join(lines)
def ensure_sidecar_next_to_media(sidecar: Path, media_path: Path, lang: str = "en") -> None:
"""Ensure an `.lang.srt` sits next to the media for Plex. Convert VTT→SRT if needed."""
try:
if sidecar.suffix.lower() == ".srt":
dst = media_path.with_suffix(f".{lang}.srt")
shutil.copy2(sidecar, dst)
elif sidecar.suffix.lower() == ".vtt":
tmp_srt = sidecar.with_suffix(".srt")
subprocess.run(["ffmpeg", "-nostdin", "-y", "-i", str(sidecar), str(tmp_srt)], check=True)
dst = media_path.with_suffix(f".{lang}.srt")
shutil.move(str(tmp_srt), dst)
except Exception as e:
print(f"[post] sidecar copy/convert failed: {e}", flush=True)
def write_plain_transcript(media_path: Path, text: str, language: str = "en") -> Path:
"""Write minimal transcript artifacts (.txt + .json) from plain text (no timestamps)."""
title = media_path.stem
base = TRN / title
base.parent.mkdir(parents=True, exist_ok=True)
(base.with_suffix(".txt")).write_text(text, encoding="utf-8")
(base.with_suffix(".json")).write_bytes(orjson.dumps({
"file": str(media_path),
"language": language,
"segments": [{"start": 0.0, "end": 0.0, "text": text}]
}))
return base
def yt_dlp(url, outdir):
# 1) Normalize YouTube Music URLs to standard YouTube
yurl = url
@@ -316,6 +388,7 @@ def publish_to_openwebui(paths):
def handle_local_file(path_str: str):
"""Transcribe & index a local media file that already exists in /library.
If a sidecar .txt/.srt/.vtt exists, use it instead of running Whisper.
Safe to call repeatedly; it skips if transcript JSON already exists.
"""
try:
@@ -323,13 +396,30 @@ def handle_local_file(path_str: str):
if not p.exists():
log({"url": path_str, "status": "error", "error": "file_not_found"})
return
title = p.stem
base_json = TRN / f"{title}.json"
if base_json.exists():
log({"url": path_str, "status": "skip", "reason": "already_transcribed"})
return
info = {"url": path_str, "status": "transcribing", "title": title, "uploader": p.parent.name, "date": "", "path": str(p), "progress": 0}
info = {"url": path_str, "status": "transcribing", "title": title,
"uploader": p.parent.name, "date": "", "path": str(p), "progress": 0}
log(info)
# 1) Prefer an existing transcript sidecar if present
sidecar = find_sidecar_transcript(p)
if sidecar:
plain = transcript_text_from_file(sidecar)
lang = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
base = write_plain_transcript(p, plain, language=lang)
ensure_sidecar_next_to_media(sidecar, p, lang=lang)
index_meili(base.with_suffix(".json"))
publish_to_openwebui([base.with_suffix(".txt")])
log({**info, **{"status": "done", "note": "used_existing_transcript"}})
return
# 2) Otherwise, run transcription
base = transcribe(p)
index_meili(base.with_suffix(".json"))
publish_to_openwebui([base.with_suffix(".txt")])
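
As a concrete illustration of the sidecar path (hypothetical input, not part of the commit): transcript_text_from_file flattens an SRT by dropping cue numbers and timestamp lines, e.g.

from pathlib import Path

srt = (
    "1\n"
    "00:00:00,000 --> 00:00:02,500\n"
    "Welcome back to the show.\n"
    "\n"
    "2\n"
    "00:00:02,500 --> 00:00:05,000\n"
    "Today: downloading RSS feeds.\n"
)
path = Path("/tmp/Example Episode.srt")
path.write_text(srt, encoding="utf-8")
# transcript_text_from_file(path) would return:
# "Welcome back to the show. Today: downloading RSS feeds."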

View File

@@ -41,6 +41,10 @@ services:
WHISPER_MODEL: large-v3
WHISPER_PRECISION: int8
PYTHONPATH: /app
JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
JOB_TTL: ${JOB_TTL:-86400}
RESULT_TTL: ${RESULT_TTL:-86400}
FAILURE_TTL: ${FAILURE_TTL:-86400}
volumes:
- ${LIBRARY_HOST_DIR:-./library}:/library
- ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts
@@ -86,7 +90,7 @@ services:
# - COOKIE_FILE=/config/cookies.txt
# Optional: yt-dlp options (JSON). Example enables Android client fallback
# - YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}}}
- YTDL_OPTIONS={"extract_flat":"in_playlist","concurrent_fragment_downloads":1}
- YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}},"extract_flat":"in_playlist","concurrent_fragment_downloads":1}
volumes:
- ${LIBRARY_HOST_DIR:-./library}:/downloads
# Optional cookies file on host → /config/cookies.txt inside container
@@ -115,3 +119,32 @@ services:
healthcheck:
test: ["CMD-SHELL", "exit 0"]
restart: unless-stopped
podx-rss:
build: ./app
container_name: podx-rss
command: ["python", "rss_ingest.py"]
env_file: [.env]
environment:
MEILI_URL: http://meili:7700
REDIS_URL: redis://redis:6379/0
LIBRARY_ROOT: /library
TRANSCRIPT_ROOT: /transcripts
FEEDS_FILE: /library/feeds.txt
RSS_STATE_FILE: /library/.rss_state.json
RSS_SCAN_MINUTES: ${RSS_SCAN_MINUTES:-120}
RSS_CONNECT_TIMEOUT: ${RSS_CONNECT_TIMEOUT:-15}
RSS_READ_TIMEOUT: ${RSS_READ_TIMEOUT:-60}
AUDIO_MAX_MB: ${AUDIO_MAX_MB:-4096}
USER_AGENT: ${USER_AGENT:-podx-rss/1.0 (+local-archive)}
RSS_ONCE: ${RSS_ONCE:-0}
volumes:
- ${LIBRARY_HOST_DIR:-./library}:/library
- ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts
depends_on: [redis]
healthcheck:
test: ["CMD-SHELL", "python - <<'PY'\nimport os,sys; p=os.getenv('FEEDS_FILE',''); sys.exit(0 if (p and os.path.exists(p)) else 1)\nPY"]
interval: 60s
timeout: 5s
retries: 3
restart: unless-stopped
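
Note that the podx-rss healthcheck only verifies that FEEDS_FILE exists inside the container, so the service will report unhealthy until a feeds.txt is present in the mounted library directory, even if feeds are supplied via RSS_FEEDS.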

feeds.txt.example (new file, 6 lines)

@@ -0,0 +1,6 @@
# feeds.txt: one feed URL per line; blank lines and lines starting with "#" are ignored
#
# Apple Podcasts / Omny show
https://www.omnycontent.com/d/playlist/....../podcast.rss
# Another RSS
https://example.com/feed.xml
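
To use the example, copy feeds.txt.example to feeds.txt inside the mounted library directory (FEEDS_FILE defaults to /library/feeds.txt); the real feeds.txt is kept out of version control by the .gitignore entry added above.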