Adding second worker

This commit is contained in:
2025-09-07 17:30:45 +02:00
parent f4b456ccae
commit bd846a8d9f
3 changed files with 560 additions and 50 deletions

View File

@@ -20,6 +20,11 @@ MEDIA_EXTS = {".mp3", ".m4a", ".flac", ".wav", ".ogg", ".opus", ".mp4", ".m4v",
def _env_flag(name: str, default: str = "true") -> bool:
    """Return True when environment variable *name* holds a truthy value.

    Truthy spellings are "1", "true", "yes" and "y", compared
    case-insensitively and with surrounding whitespace ignored, so a value
    such as "true " from a .env file is not silently read as False.
    *default* is the string used when the variable is unset.
    """
    return os.getenv(name, default).strip().lower() in {"1", "true", "yes", "y"}


# Fuzzy title match threshold for media ↔ transcript pairing
# (override with RSS_TITLE_MATCH_THRESHOLD; parsed as a float, default 0.60).
TITLE_MATCH_THRESHOLD = float(os.getenv("RSS_TITLE_MATCH_THRESHOLD", "0.60"))

# Download podcast audio (enclosures) to a local library.
# Root directory of that library (override with PODCASTS_ROOT);
# defaults to the "Podcasts" folder under LIB.
PODCASTS_ROOT = Path(os.getenv("PODCASTS_ROOT", str(LIB / "Podcasts")))
# When true, each show gets its own sub-folder under PODCASTS_ROOT;
# when false, all episodes are written directly into PODCASTS_ROOT.
PODCASTS_PER_SHOW = _env_flag("PODCASTS_PER_SHOW")
# Master switch for downloading episode audio (env: RSS_DOWNLOAD_AUDIO).
DOWNLOAD_AUDIO = _env_flag("RSS_DOWNLOAD_AUDIO")
# Namespace map (extend as needed)
NS = {
"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
@@ -31,6 +36,7 @@ NS = {
# Create the output directories at import time if they are missing:
# TRN, the folder that holds OUT_INDEX, and the podcast library root.
for _needed_dir in (TRN, OUT_INDEX.parent, PODCASTS_ROOT):
    _needed_dir.mkdir(parents=True, exist_ok=True)
del _needed_dir  # keep the module namespace free of the loop variable
def _text(el):
@@ -235,6 +241,7 @@ def parse_feed(feed_url: str):
duration_sec = _parse_duration(dur) or None
enclosure = _find_ns(it, "enclosure")
audio_url = enclosure.get("url") if enclosure is not None else ""
audio_type = enclosure.get("type") if enclosure is not None else ""
if not audio_url:
for mc in _findall_ns(it, "media:content"):
@@ -253,6 +260,7 @@ def parse_feed(feed_url: str):
"date": date,
"duration_sec": duration_sec,
"audio_url": audio_url,
"audio_type": audio_type,
"language": DEFAULT_LANG,
"transcripts": transcripts,
}
@@ -276,6 +284,40 @@ def parse_feed(feed_url: str):
if created:
t["sidecars"] = created
# Optionally download podcast audio locally
local_audio_path = None
if DOWNLOAD_AUDIO and audio_url:
show_dir = PODCASTS_ROOT / _slug(show_title or "Podcast") if PODCASTS_PER_SHOW else PODCASTS_ROOT
base_name = f"{(date or '00000000')} - {_slug(title or guid or 'episode')}"
ext = _guess_audio_ext(audio_type, audio_url)
target = (show_dir / base_name).with_suffix(ext)
# Skip the download when the target file already exists
if not target.exists():
saved = _download_stream(audio_url, target)
if saved is None:
# Try a non-streaming fallback
saved = _download(audio_url, target)
else:
saved = target
if saved and saved.exists():
local_audio_path = saved
# If transcript files were downloaded earlier, copy them next to this audio file as sidecars
for t in item_rec.get("transcripts", []) or []:
lp = t.get("local_path")
if lp:
try:
lp = Path(lp)
if lp.exists() and lp.suffix.lower() in {'.srt','.vtt','.txt'}:
sc = _sidecar_path_for(local_audio_path, t.get('language') or DEFAULT_LANG, lp.suffix.lower())
if not sc.exists():
sc.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(lp, sc)
t.setdefault("sidecars", []).append(str(sc))
except Exception:
pass
if local_audio_path:
item_rec["local_audio"] = str(local_audio_path)
items.append(item_rec)
return {"feed_url": feed_url, "show": show_title, "episodes": items}