Add RSS feed downloading

.gitignore (vendored): +3
@@ -1,6 +1,9 @@
 # Local env and secrets
 .env
 
+# RSS Feeds
+feeds.txt
+
 # Runtime data
 data/
 models/

app/rss_ingest.py (new file): +388
@@ -0,0 +1,388 @@
#!/usr/bin/env python3
"""
RSS ingester for PodX
- Reads feeds from env var RSS_FEEDS (comma-separated) *and*/or from a file (FEEDS_FILE, default /library/feeds.txt)
- Fetches RSS with ETag/Last-Modified caching to avoid re-downloading unchanged feeds
- Saves audio to LIBRARY_ROOT/<podcast>/<YYYYMMDD - title>.<ext>
- Saves transcript sidecars when `<podcast:transcript>` links are present (prefers TextWithTimestamps → WebVTT → SRT → TXT)
- Enqueues `worker.handle_local_file` for indexing/transcription (worker will skip Whisper if a transcript sidecar exists)
- Keeps a small state JSON with per-feed ETag/Last-Modified and per-item processed GUIDs to avoid duplicate work

Environment variables (with sane defaults):
  MEILI_URL            (unused directly here, but kept for parity)
  REDIS_URL            redis://redis:6379/0
  LIBRARY_ROOT         /library
  TRANSCRIPT_ROOT      /transcripts
  RSS_FEEDS            "" (comma-separated list)
  FEEDS_FILE           /library/feeds.txt
  RSS_SCAN_MINUTES     120
  RSS_ONCE             0 ("1" to run once and exit)
  USER_AGENT           podx-rss/1.0 (+local-archive)
  RSS_STATE_FILE       /library/.rss_state.json
  RSS_CONNECT_TIMEOUT  15 (seconds)
  RSS_READ_TIMEOUT     60 (seconds)
  AUDIO_MAX_MB         4096 (skip if a HEAD request reveals size > max; 0 = unlimited)
"""
import os
import re
import sys
import json
import time
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from urllib.parse import urlparse

import requests
import xml.etree.ElementTree as ET
import redis
from rq import Queue

logging.basicConfig(level=logging.INFO, format='[rss] %(message)s')
log = logging.getLogger("rss")

# Config
MEILI_URL = os.getenv("MEILI_URL", "http://meili:7700")
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
LIBRARY_ROOT = Path(os.getenv("LIBRARY_ROOT", "/library"))
TRANSCRIPT_ROOT = Path(os.getenv("TRANSCRIPT_ROOT", "/transcripts"))
RSS_FEEDS_ENV = [s.strip() for s in os.getenv("RSS_FEEDS", "").split(",") if s.strip()]
FEEDS_FILE = Path(os.getenv("FEEDS_FILE", str(LIBRARY_ROOT / "feeds.txt")))
RSS_SCAN_MINUTES = int(os.getenv("RSS_SCAN_MINUTES", "120"))
RSS_ONCE = os.getenv("RSS_ONCE", "0") == "1"
USER_AGENT = os.getenv("USER_AGENT", "podx-rss/1.0 (+local-archive)")
STATE_FILE = Path(os.getenv("RSS_STATE_FILE", str(LIBRARY_ROOT / ".rss_state.json")))
CONNECT_TIMEOUT = float(os.getenv("RSS_CONNECT_TIMEOUT", "15"))
READ_TIMEOUT = float(os.getenv("RSS_READ_TIMEOUT", "60"))
AUDIO_MAX_MB = int(os.getenv("AUDIO_MAX_MB", "4096"))

# Redis queue
r = redis.from_url(REDIS_URL)
q = Queue("default", connection=r)

# HTTP session
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": USER_AGENT})

# Namespaces commonly used in podcast RSS
NS = {
    "podcast": "https://podcastindex.org/namespace/1.0",
    "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
    "media": "http://search.yahoo.com/mrss/",
    "content": "http://purl.org/rss/1.0/modules/content/",
    "dc": "http://purl.org/dc/elements/1.1/",
}

# ----------------- helpers -----------------

def safe(s: str) -> str:
    s = re.sub(r"[\\/:*?\"<>|]", "_", s)
    # collapse whitespace and trim
    s = re.sub(r"\s+", " ", s).strip()
    # guard against very long filenames
    return s[:200] if len(s) > 200 else s


def load_state() -> Dict:
    if STATE_FILE.exists():
        try:
            return json.loads(STATE_FILE.read_text("utf-8"))
        except Exception:
            log.warning("State file unreadable, starting fresh")
    return {"feeds": {}, "items": {}}  # items keyed by GUID / enclosure URL


def save_state(state: Dict) -> None:
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    tmp = STATE_FILE.with_suffix(STATE_FILE.suffix + ".tmp")
    tmp.write_text(json.dumps(state, ensure_ascii=False, indent=2))
    tmp.replace(STATE_FILE)

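# Shape of the state file managed above, for illustration (field values are made
# up; only the top-level "feeds"/"items" keys are fixed by load_state/save_state):
#
#   {
#     "feeds": {
#       "https://example.com/feed.xml": {"etag": "\"abc123\"",
#                                        "modified": "Sat, 01 Jun 2024 00:00:00 GMT"}
#     },
#     "items": {
#       "some-guid": {"done": true, "audio": "/library/Show/20240601 - Episode.mp3"}
#     }
#   }
#
# save_state() writes a ".tmp" sibling first and then replaces the target, so a
# crash mid-write cannot leave a truncated JSON behind.
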
def load_feeds() -> List[str]:
    feeds = list(RSS_FEEDS_ENV)
    if FEEDS_FILE.exists():
        try:
            for line in FEEDS_FILE.read_text("utf-8").splitlines():
                s = line.strip()
                if not s or s.startswith("#"):  # allow comments
                    continue
                feeds.append(s)
        except Exception as e:
            log.warning(f"Failed to read {FEEDS_FILE}: {e}")
    # de-dup preserving order
    seen = set()
    uniq = []
    for f in feeds:
        if f not in seen:
            uniq.append(f)
            seen.add(f)
    return uniq

def fetch(url: str, *, etag: Optional[str] = None, modified: Optional[str] = None, as_text: bool = False):
    headers = {}
    if etag:
        headers["If-None-Match"] = etag
    if modified:
        headers["If-Modified-Since"] = modified
    resp = SESSION.get(url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
    if resp.status_code == 304:
        return None, 304, None, None
    resp.raise_for_status()
    new_etag = resp.headers.get("ETag")
    new_mod = resp.headers.get("Last-Modified")
    return (resp.text if as_text else resp.content), resp.status_code, new_etag, new_mod

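# Conditional-GET flow enabled by fetch(), sketched with hypothetical values:
#
#   body, status, etag, mod = fetch("https://example.com/feed.xml", as_text=True)
#   # first call: 200 plus validators; later calls send them back:
#   body, status, _, _ = fetch("https://example.com/feed.xml", etag=etag, modified=mod, as_text=True)
#   # -> (None, 304, None, None) when unchanged, so the caller skips parsing
#   #    and downloading entirely.
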
def head_size(url: str) -> Optional[int]:
    try:
        h = SESSION.head(url, allow_redirects=True, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
        if h.ok:
            cl = h.headers.get("Content-Length")
            return int(cl) if cl and cl.isdigit() else None
    except Exception:
        return None
    return None

def best_transcript_links(item) -> List[str]:
    links: List[Tuple[int, str, str]] = []
    # The explicit QName and the namespace-prefixed path match the same elements,
    # so collect each URL once rather than appending duplicates.
    for path in (".//{https://podcastindex.org/namespace/1.0}transcript", ".//podcast:transcript"):
        for tag in item.findall(path, NS):
            t = (tag.attrib.get("type") or "").lower()
            url = tag.attrib.get("url")
            if url and not any(u == url for _, _, u in links):
                links.append((0, t, url))

    order = [
        "text/plain",  # often used for TextWithTimestamps
        "application/json",
        "text/vtt",
        "application/srt",
        "application/x-subrip",
        "application/text",
        "text/plain; charset=utf-8",
    ]
    key = {v: i for i, v in enumerate(order)}
    ranked = []
    for _, t, url in links:
        ranked.append((key.get(t, 999), t, url))
    ranked.sort()
    return [u for _, _, u in ranked]

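# Ranking illustration (made-up item): with transcripts typed "application/srt"
# and "text/vtt", the order table ranks "text/vtt" (index 2) ahead of
# "application/srt" (index 3), so the VTT URL is returned first; types not in
# the table fall to rank 999 and are tried last.
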
def get_enclosure(item) -> Optional[str]:
    enc = item.find("enclosure")
    if enc is not None and enc.attrib.get("url"):
        return enc.attrib["url"]
    mc = item.find("media:content", NS)
    if mc is not None and mc.attrib.get("url"):
        return mc.attrib["url"]
    return None

def parse_pubdate(item) -> datetime:
    # Try common fields
    candidates = [
        item.findtext("pubDate"),
        item.findtext("dc:date", namespaces=NS),
        item.findtext("{http://purl.org/dc/elements/1.1/}date"),
    ]
    for pd in filter(None, candidates):
        s = pd.strip()
        # Try several common formats
        for fmt in [
            "%a, %d %b %Y %H:%M:%S %z",
            "%a, %d %b %Y %H:%M:%S",
            "%Y-%m-%dT%H:%M:%S%z",
            "%Y-%m-%dT%H:%M:%S",
        ]:
            try:
                return datetime.strptime(s, fmt)
            except Exception:
                pass
    return datetime.utcnow()

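# Examples (hypothetical feed values): "Wed, 05 Jun 2024 08:30:00 +0000" matches
# the first, RFC-2822-style format; an ISO stamp like "2024-06-05T08:30:00"
# matches a later pattern; anything unparseable falls back to datetime.utcnow().
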
def save_bytes(path: Path, data: bytes) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + ".part")
    tmp.write_bytes(data)
    tmp.replace(path)

def decide_audio_ext(audio_url: str) -> str:
    p = urlparse(audio_url)
    low = p.path.lower()
    if low.endswith(".m4a"):
        return ".m4a"
    if low.endswith(".mp3"):
        return ".mp3"
    if low.endswith((".ogg", ".oga")):
        return ".ogg"
    if low.endswith(".aac"):
        return ".aac"
    if low.endswith(".wav"):
        return ".wav"
    return ".mp3"

def item_key(item) -> str:
    # Prefer GUID value, else enclosure URL, else title+date
    guid = item.findtext("guid")
    if guid:
        return guid.strip()
    enc = get_enclosure(item)
    if enc:
        return enc
    title = item.findtext("title") or "Episode"
    pub = parse_pubdate(item).strftime("%Y%m%d")
    return f"{pub}:{title}"

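# Dedup-key illustration (made-up items): an item with <guid>ep-42</guid> keys on
# "ep-42"; lacking a GUID, the enclosure URL is used; lacking both, a date+title
# key such as "20240605:Episode Title" serves as a last resort.
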
# ----------------- core ingest -----------------

def ingest_feed(url: str, state: Dict) -> int:
    fstate = state.setdefault("feeds", {}).setdefault(url, {})
    etag = fstate.get("etag")
    mod = fstate.get("modified")

    log.info(f"Fetch RSS: {url}")
    try:
        data, status, new_etag, new_mod = fetch(url, etag=etag, modified=mod, as_text=True)
    except Exception as e:
        log.error(f"Fetch failed: {e}")
        return 0

    if status == 304:
        log.info("Not modified")
        return 0

    if new_etag:
        fstate["etag"] = new_etag
    if new_mod:
        fstate["modified"] = new_mod

    try:
        root = ET.fromstring(data)
    except Exception as e:
        log.error(f"XML parse error: {e}")
        return 0

    channel_title = safe(root.findtext("channel/title") or "Podcast")

    new_items = 0
    for item in root.findall("channel/item"):
        key = item_key(item)
        already = state.setdefault("items", {})
        if already.get(key):
            continue

        title = safe(item.findtext("title") or "Episode")
        pub = parse_pubdate(item)
        date_prefix = pub.strftime("%Y%m%d")
        base = f"{date_prefix} - {title}"

        audio_url = get_enclosure(item)
        if not audio_url:
            log.info(f"Skip (no enclosure): {title}")
            already[key] = {"skipped": "no_enclosure"}
            continue

        # HEAD size guard (optional)
        if AUDIO_MAX_MB > 0:
            size = head_size(audio_url)
            if size and size > AUDIO_MAX_MB * 1024 * 1024:
                log.info(f"Skip (size>{AUDIO_MAX_MB}MB): {title}")
                already[key] = {"skipped": "too_large", "size": size}
                continue

        path_ext = decide_audio_ext(audio_url)
        audio_out = LIBRARY_ROOT / channel_title / f"{base}{path_ext}"
        transcript_links = best_transcript_links(item)

        # If audio exists and a transcript sidecar exists → just enqueue index
        sidecars = list((TRANSCRIPT_ROOT / channel_title).glob(f"{base}.*"))
        have_transcript = len(sidecars) > 0
        if audio_out.exists() and have_transcript:
            log.info(f"Skip download, enqueue index (have audio+transcript): {audio_out.name}")
            try:
                q.enqueue("worker.handle_local_file", str(audio_out), job_timeout=4 * 3600, result_ttl=86400, failure_ttl=86400)
            except Exception as e:
                log.warning(f"Enqueue failed: {e}")
            already[key] = {"done": True, "audio": str(audio_out)}
            new_items += 1
            continue

        # Download audio
        try:
            log.info(f"Downloading audio → {audio_out}")
            content, _, _, _ = fetch(audio_url, as_text=False)
            save_bytes(audio_out, content)
        except Exception as e:
            log.warning(f"Audio failed: {e}")
            already[key] = {"error": f"audio:{e}"}
            continue

        # Download transcript if present (take first best)
        transcript_out = None
        for turl in transcript_links:
            try:
                ext = ".vtt" if "vtt" in turl.lower() else ".srt" if "srt" in turl.lower() else ".txt"
                tout = TRANSCRIPT_ROOT / channel_title / f"{base}{ext}"
                log.info(f"Downloading transcript → {tout} ({turl})")
                tdata, _, _, _ = fetch(turl, as_text=False)
                save_bytes(tout, tdata)
                transcript_out = tout
                break
            except Exception as e:
                log.warning(f"Transcript fetch failed ({turl}): {e}")
                continue

        # Enqueue for indexing/transcription
        try:
            q.enqueue("worker.handle_local_file", str(audio_out), job_timeout=4 * 3600, result_ttl=86400, failure_ttl=86400)
        except Exception as e:
            log.warning(f"Enqueue failed: {e}")

        already[key] = {"done": True, "audio": str(audio_out), "transcript": str(transcript_out) if transcript_out else None}
        new_items += 1

    return new_items

# ----------------- main loop -----------------

def main():
    while True:
        feeds = load_feeds()
        if not feeds:
            log.error("No RSS feeds configured. Set RSS_FEEDS or create feeds.txt.")
            sys.exit(1)

        state = load_state()
        total_new = 0
        for url in feeds:
            try:
                added = ingest_feed(url, state)
                total_new += added
                save_state(state)
            except Exception as e:
                log.error(f"Feed error: {url} -> {e}")
        log.info(f"Cycle complete. New items: {total_new}")
        if RSS_ONCE:
            break
        time.sleep(RSS_SCAN_MINUTES * 60)


if __name__ == "__main__":
    main()
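
For a quick smoke test outside Docker, the ingester can be run once against a throwaway library (the URL and paths below are placeholders, and Redis at REDIS_URL must be reachable for enqueueing to succeed):

    RSS_ONCE=1 RSS_FEEDS=https://example.com/feed.xml \
    LIBRARY_ROOT=/tmp/library TRANSCRIPT_ROOT=/tmp/transcripts \
    python app/rss_ingest.py

With RSS_ONCE=1 the main loop performs a single cycle and exits instead of sleeping for RSS_SCAN_MINUTES.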

app/worker.py (modified)

@@ -39,6 +39,78 @@ def log(feed):
 def sanitize(name):
     return re.sub(r'[\\/:"*?<>|]+', ' ', name).strip()
 
+
+def find_sidecar_transcript(media_path: Path) -> Path | None:
+    """Return a .txt/.srt/.vtt transcript file sitting next to media, if any.
+    Tries common variants including language-suffixed SRT/VTT.
+    """
+    candidates: list[Path] = []
+    # exact same stem in same folder
+    for ext in [".txt", ".srt", ".vtt"]:
+        p = media_path.parent / (media_path.stem + ext)
+        if p.exists():
+            candidates.append(p)
+    # language-suffixed near the media file (e.g., .en.srt)
+    for ext in [".srt", ".vtt"]:
+        p = media_path.with_suffix(f".en{ext}")
+        if p.exists() and p not in candidates:
+            candidates.append(p)
+    return candidates[0] if candidates else None
+
+
+def transcript_text_from_file(path: Path) -> str:
+    """Extract plain text from .txt/.srt/.vtt by stripping timestamps and counters."""
+    try:
+        raw = path.read_text(encoding="utf-8", errors="ignore")
+    except Exception:
+        raw = path.read_text(errors="ignore")
+
+    if path.suffix.lower() == ".txt":
+        return raw.strip()
+
+    # For SRT/VTT, drop timestamp lines, cue numbers and headers
+    lines: list[str] = []
+    for line in raw.splitlines():
+        ls = line.strip()
+        if not ls:
+            continue
+        if "-->" in ls:  # timestamp line
+            continue
+        if ls.upper().startswith("WEBVTT"):
+            continue
+        if re.match(r"^\d+$", ls):  # cue index
+            continue
+        lines.append(ls)
+    return " ".join(lines)
+
+
+def ensure_sidecar_next_to_media(sidecar: Path, media_path: Path, lang: str = "en") -> None:
+    """Ensure a `.lang.srt` sits next to the media for Plex. Convert VTT→SRT if needed."""
+    try:
+        if sidecar.suffix.lower() == ".srt":
+            dst = media_path.with_suffix(f".{lang}.srt")
+            shutil.copy2(sidecar, dst)
+        elif sidecar.suffix.lower() == ".vtt":
+            tmp_srt = sidecar.with_suffix(".srt")
+            subprocess.run(["ffmpeg", "-nostdin", "-y", "-i", str(sidecar), str(tmp_srt)], check=True)
+            dst = media_path.with_suffix(f".{lang}.srt")
+            shutil.move(str(tmp_srt), dst)
+    except Exception as e:
+        print(f"[post] sidecar copy/convert failed: {e}", flush=True)
+
+
+def write_plain_transcript(media_path: Path, text: str, language: str = "en") -> Path:
+    """Write minimal transcript artifacts (.txt + .json) from plain text (no timestamps)."""
+    title = media_path.stem
+    base = TRN / title
+    base.parent.mkdir(parents=True, exist_ok=True)
+    (base.with_suffix(".txt")).write_text(text, encoding="utf-8")
+    (base.with_suffix(".json")).write_bytes(orjson.dumps({
+        "file": str(media_path),
+        "language": language,
+        "segments": [{"start": 0.0, "end": 0.0, "text": text}]
+    }))
+    return base
+
 def yt_dlp(url, outdir):
     # 1) Normalize YouTube Music URLs to standard YouTube
     yurl = url
@@ -316,6 +388,7 @@ def publish_to_openwebui(paths):
 
 def handle_local_file(path_str: str):
     """Transcribe & index a local media file that already exists in /library.
+    If a sidecar .txt/.srt/.vtt exists, use it instead of running Whisper.
     Safe to call repeatedly; it skips if transcript JSON already exists.
     """
     try:
@@ -323,13 +396,30 @@ def handle_local_file(path_str: str):
         if not p.exists():
             log({"url": path_str, "status": "error", "error": "file_not_found"})
             return
 
         title = p.stem
         base_json = TRN / f"{title}.json"
         if base_json.exists():
             log({"url": path_str, "status": "skip", "reason": "already_transcribed"})
             return
-        info = {"url": path_str, "status": "transcribing", "title": title, "uploader": p.parent.name, "date": "", "path": str(p), "progress": 0}
+
+        info = {"url": path_str, "status": "transcribing", "title": title,
+                "uploader": p.parent.name, "date": "", "path": str(p), "progress": 0}
         log(info)
 
+        # 1) Prefer an existing transcript sidecar if present
+        sidecar = find_sidecar_transcript(p)
+        if sidecar:
+            plain = transcript_text_from_file(sidecar)
+            lang = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
+            base = write_plain_transcript(p, plain, language=lang)
+            ensure_sidecar_next_to_media(sidecar, p, lang=lang)
+            index_meili(base.with_suffix(".json"))
+            publish_to_openwebui([base.with_suffix(".txt")])
+            log({**info, **{"status": "done", "note": "used_existing_transcript"}})
+            return
+
+        # 2) Otherwise, run transcription
         base = transcribe(p)
         index_meili(base.with_suffix(".json"))
         publish_to_openwebui([base.with_suffix(".txt")])
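
To see what the sidecar filtering in `transcript_text_from_file` above produces, here is a self-contained sketch of the same logic applied to a made-up SRT snippet (input text is illustrative only):

    import re

    raw = "1\n00:00:01,000 --> 00:00:03,000\nHello there.\n\n2\n00:00:04,000 --> 00:00:06,000\nWelcome back.\n"
    lines = []
    for line in raw.splitlines():
        ls = line.strip()
        # same filters as the worker hunk: blanks, '-->' timestamp lines, bare cue numbers
        if not ls or "-->" in ls or re.match(r"^\d+$", ls):
            continue
        lines.append(ls)
    print(" ".join(lines))  # -> Hello there. Welcome back.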

docker-compose.yml (modified)

@@ -41,6 +41,10 @@ services:
       WHISPER_MODEL: large-v3
       WHISPER_PRECISION: int8
       PYTHONPATH: /app
+      JOB_TIMEOUT: ${JOB_TIMEOUT:-14400}
+      JOB_TTL: ${JOB_TTL:-86400}
+      RESULT_TTL: ${RESULT_TTL:-86400}
+      FAILURE_TTL: ${FAILURE_TTL:-86400}
     volumes:
       - ${LIBRARY_HOST_DIR:-./library}:/library
       - ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts
@@ -86,7 +90,7 @@ services:
       # - COOKIE_FILE=/config/cookies.txt
       # Optional: yt-dlp options (JSON). Example enables Android client fallback
       # - YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}}}
-      - YTDL_OPTIONS={"extract_flat":"in_playlist","concurrent_fragment_downloads":1}
+      - YTDL_OPTIONS={"extractor_args":{"youtube":{"player_client":"android"}},"extract_flat":"in_playlist","concurrent_fragment_downloads":1}
     volumes:
       - ${LIBRARY_HOST_DIR:-./library}:/downloads
       # Optional cookies file on host → /config/cookies.txt inside container
@@ -115,3 +119,32 @@ services:
     healthcheck:
       test: ["CMD-SHELL", "exit 0"]
     restart: unless-stopped
+
+  podx-rss:
+    build: ./app
+    container_name: podx-rss
+    command: ["python", "rss_ingest.py"]
+    env_file: [.env]
+    environment:
+      MEILI_URL: http://meili:7700
+      REDIS_URL: redis://redis:6379/0
+      LIBRARY_ROOT: /library
+      TRANSCRIPT_ROOT: /transcripts
+      FEEDS_FILE: /library/feeds.txt
+      RSS_STATE_FILE: /library/.rss_state.json
+      RSS_SCAN_MINUTES: ${RSS_SCAN_MINUTES:-120}
+      RSS_CONNECT_TIMEOUT: ${RSS_CONNECT_TIMEOUT:-15}
+      RSS_READ_TIMEOUT: ${RSS_READ_TIMEOUT:-60}
+      AUDIO_MAX_MB: ${AUDIO_MAX_MB:-4096}
+      USER_AGENT: ${USER_AGENT:-podx-rss/1.0 (+local-archive)}
+      RSS_ONCE: ${RSS_ONCE:-0}
+    volumes:
+      - ${LIBRARY_HOST_DIR:-./library}:/library
+      - ${TRANSCRIPTS_HOST_DIR:-./transcripts}:/transcripts
+    depends_on: [redis]
+    healthcheck:
+      test: ["CMD-SHELL", "python - <<'PY'\nimport os,sys; p=os.getenv('FEEDS_FILE',''); sys.exit(0 if (p and os.path.exists(p)) else 1)\nPY"]
+      interval: 60s
+      timeout: 5s
+      retries: 3
+    restart: unless-stopped
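
The podx-rss healthcheck above packs a Python heredoc into a single YAML string; unrolled for readability, the equivalent script is:

    import os
    import sys

    # healthy only when FEEDS_FILE is set and the file exists inside the container
    p = os.getenv("FEEDS_FILE", "")
    sys.exit(0 if (p and os.path.exists(p)) else 1)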

feeds.txt.example (new file): +6
@@ -0,0 +1,6 @@
feeds.txt
---------
# Apple Podcasts / Omny show
https://www.omnycontent.com/d/playlist/....../podcast.rss
# Another RSS
https://example.com/feed.xml