Adding second worker
This commit is contained in:
@@ -20,6 +20,11 @@ MEDIA_EXTS = {".mp3", ".m4a", ".flac", ".wav", ".ogg", ".opus", ".mp4", ".m4v",
|
||||
# Fuzzy title match threshold for media ↔ transcript pairing
|
||||
# Overridable through the RSS_TITLE_MATCH_THRESHOLD environment variable; defaults to 0.60.
TITLE_MATCH_THRESHOLD = float(os.environ.get("RSS_TITLE_MATCH_THRESHOLD", "0.60"))
|
||||
|
||||
# Download podcast audio (enclosures) to a local library.
# All three settings below can be overridden through environment variables.
_TRUTHY_ENV_VALUES = frozenset({"1", "true", "yes", "y"})


def _env_flag(name: str, default: str = "true") -> bool:
    """Return True when environment variable *name* holds a truthy value.

    The comparison is case-insensitive and ignores surrounding whitespace.
    An unset variable falls back to *default*.
    """
    return os.getenv(name, default).strip().lower() in _TRUTHY_ENV_VALUES


# Root directory of the local podcast library (defaults to LIB / "Podcasts").
PODCASTS_ROOT = Path(os.getenv("PODCASTS_ROOT", str(LIB / "Podcasts")))
# When true, each show is stored in its own sub-directory of PODCASTS_ROOT.
PODCASTS_PER_SHOW = _env_flag("PODCASTS_PER_SHOW")
# When true, episode enclosures are downloaded into the local library.
DOWNLOAD_AUDIO = _env_flag("RSS_DOWNLOAD_AUDIO")
|
||||
|
||||
# Namespace map (extend as needed)
|
||||
NS = {
|
||||
"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
|
||||
@@ -31,6 +36,7 @@ NS = {
|
||||
|
||||
# Make sure the transcript, index and podcast directories exist at import time.
for _required_dir in (TRN, OUT_INDEX.parent, PODCASTS_ROOT):
    _required_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def _text(el):
|
||||
@@ -235,6 +241,7 @@ def parse_feed(feed_url: str):
|
||||
duration_sec = _parse_duration(dur) or None
|
||||
enclosure = _find_ns(it, "enclosure")
|
||||
audio_url = enclosure.get("url") if enclosure is not None else ""
|
||||
audio_type = enclosure.get("type") if enclosure is not None else ""
|
||||
|
||||
if not audio_url:
|
||||
for mc in _findall_ns(it, "media:content"):
|
||||
@@ -253,6 +260,7 @@ def parse_feed(feed_url: str):
|
||||
"date": date,
|
||||
"duration_sec": duration_sec,
|
||||
"audio_url": audio_url,
|
||||
"audio_type": audio_type,
|
||||
"language": DEFAULT_LANG,
|
||||
"transcripts": transcripts,
|
||||
}
|
||||
@@ -276,6 +284,40 @@ def parse_feed(feed_url: str):
|
||||
if created:
|
||||
t["sidecars"] = created
|
||||
|
||||
# Optionally download podcast audio locally
|
||||
local_audio_path = None
|
||||
if DOWNLOAD_AUDIO and audio_url:
|
||||
show_dir = PODCASTS_ROOT / _slug(show_title or "Podcast") if PODCASTS_PER_SHOW else PODCASTS_ROOT
|
||||
base_name = f"{(date or '00000000')} - {_slug(title or guid or 'episode')}"
|
||||
ext = _guess_audio_ext(audio_type, audio_url)
|
||||
target = (show_dir / base_name).with_suffix(ext)
|
||||
# Avoid re-download if already exists
|
||||
if not target.exists():
|
||||
saved = _download_stream(audio_url, target)
|
||||
if saved is None:
|
||||
# Try a non-streaming fallback
|
||||
saved = _download(audio_url, target)
|
||||
else:
|
||||
saved = target
|
||||
if saved and saved.exists():
|
||||
local_audio_path = saved
|
||||
# If we previously downloaded transcript sidecars, try to place them next to this audio
|
||||
for t in item_rec.get("transcripts", []) or []:
|
||||
lp = t.get("local_path")
|
||||
if lp:
|
||||
try:
|
||||
lp = Path(lp)
|
||||
if lp.exists() and lp.suffix.lower() in {'.srt','.vtt','.txt'}:
|
||||
sc = _sidecar_path_for(local_audio_path, t.get('language') or DEFAULT_LANG, lp.suffix.lower())
|
||||
if not sc.exists():
|
||||
sc.parent.mkdir(parents=True, exist_ok=True)
|
||||
shutil.copy2(lp, sc)
|
||||
t.setdefault("sidecars", []).append(str(sc))
|
||||
except Exception:
|
||||
pass
|
||||
if local_audio_path:
|
||||
item_rec["local_audio"] = str(local_audio_path)
|
||||
|
||||
items.append(item_rec)
|
||||
|
||||
return {"feed_url": feed_url, "show": show_title, "episodes": items}
|
||||
|
Reference in New Issue
Block a user