Add RSS feed downloading

2025-09-07 15:30:22 +02:00
parent bcd874ecf8
commit e6582e9a6b
5 changed files with 522 additions and 2 deletions
--- a/app/worker.py
+++ b/app/worker.py
@@ -39,6 +39,78 @@ def log(feed):
 def sanitize(name):
+   """Replace each run of backslash, slash, colon, double quote, asterisk,
+   question mark, angle bracket or pipe characters in ``name`` with a single
+   space, then strip leading and trailing whitespace from the result.
+   """
    return re.sub(r'[\\/:"*?<>|]+', ' ', name).strip()

+def find_sidecar_transcript(media_path: Path, lang: str = "en") -> Path | None:
+    """Return the transcript file sitting next to ``media_path``, if any.
+
+    Candidates are probed in the media file's own folder, in priority order:
+    ``<stem>.txt``, ``<stem>.srt``, ``<stem>.vtt``, then the language-suffixed
+    ``<stem>.<lang>.srt`` and ``<stem>.<lang>.vtt``.
+
+    Args:
+        media_path: Path of the media file whose folder is searched.
+        lang: Language tag used for the suffixed variants (default ``"en"``).
+            An empty value disables the language-suffixed lookup.
+
+    Returns:
+        The first candidate that is a regular file, or ``None`` if none exists.
+    """
+    folder = media_path.parent
+    stem = media_path.stem
+    names = [f"{stem}{ext}" for ext in (".txt", ".srt", ".vtt")]
+    if lang:
+        names += [f"{stem}.{lang}{ext}" for ext in (".srt", ".vtt")]
+    for name in names:
+        candidate = folder / name
+        # is_file() rather than exists(): a directory that merely carries a
+        # transcript-like name cannot be read as a transcript.
+        if candidate.is_file():
+            return candidate
+    return None
+
+
+def transcript_text_from_file(path: Path) -> str:
+    """Extract plain text from a .txt/.srt/.vtt transcript file.
+
+    ``.txt`` content is returned stripped. For SRT/VTT the timing lines, the
+    numeric cue index preceding each timing line, the WebVTT header and
+    (for ``.vtt``) NOTE/STYLE/REGION blocks are dropped, inline markup such as
+    ``<i>`` or ``<00:00:01.500>`` is removed, and the remaining caption lines
+    are joined with single spaces.
+
+    Args:
+        path: Transcript file; the format is chosen from its suffix.
+
+    Returns:
+        The caption text as one string (empty if nothing is left).
+
+    Raises:
+        OSError: If the file cannot be read.
+    """
+    # utf-8-sig drops a leading BOM, which would otherwise glue itself onto
+    # the first cue index or the WEBVTT header and defeat the checks below.
+    raw = path.read_text(encoding="utf-8-sig", errors="ignore")
+
+    suffix = path.suffix.lower()
+    if suffix == ".txt":
+        return raw.strip()
+
+    is_vtt = suffix == ".vtt"
+    rows = [row.strip() for row in raw.splitlines()]
+    kept: list[str] = []
+    skipping = False  # inside a WebVTT header or NOTE/STYLE/REGION block
+    for pos, row in enumerate(rows):
+        if not row:  # a blank line ends any block being skipped
+            skipping = False
+            continue
+        if "-->" in row:  # timing line; also ends a header lacking a blank line
+            skipping = False
+            continue
+        if skipping:
+            continue
+        if row.upper().startswith("WEBVTT"):
+            # Header metadata (e.g. "Kind:", "Language:") runs to the next blank.
+            skipping = is_vtt
+            continue
+        starts_block = pos == 0 or not rows[pos - 1]
+        if is_vtt and starts_block and re.match(r"(?:NOTE|STYLE|REGION)(?:\s|$)", row):
+            skipping = True
+            continue
+        # Only a number directly above a timing line is a cue index; a caption
+        # that happens to be all digits (e.g. "1984") must be kept.
+        precedes_timing = pos + 1 < len(rows) and "-->" in rows[pos + 1]
+        if precedes_timing and re.fullmatch(r"\d+", row):
+            continue
+        text = re.sub(r"<[^>]*>", "", row).strip()  # strip inline cue markup
+        if text:
+            kept.append(text)
+    return " ".join(kept)
+
+
+def ensure_sidecar_next_to_media(sidecar: Path, media_path: Path, lang: str = "en") -> None:
+    """Ensure a ``<stem>.<lang>.srt`` subtitle sits next to the media file.
+
+    An ``.srt`` sidecar is copied to that name; a ``.vtt`` sidecar is converted
+    to SRT with ``ffmpeg``. Any other suffix is ignored. The operation is
+    best-effort: failures are printed and never raised to the caller.
+
+    Args:
+        sidecar: Existing subtitle file (``.srt`` or ``.vtt``).
+        media_path: Media file the subtitle belongs to.
+        lang: Language tag placed before the ``.srt`` suffix.
+    """
+    kind = sidecar.suffix.lower()
+    if kind not in (".srt", ".vtt"):
+        return
+    try:
+        dst = media_path.with_suffix(f".{lang}.srt")
+        if kind == ".srt":
+            # The sidecar may already be the target (e.g. "<stem>.en.srt");
+            # copying a file onto itself raises SameFileError, so skip it.
+            if sidecar.resolve() != dst.resolve():
+                shutil.copy2(sidecar, dst)
+            return
+        # Convert into a dedicated temp name so an interrupted ffmpeg run
+        # neither clobbers an unrelated "<stem>.srt" nor leaves a partial
+        # subtitle under the final name. It keeps the .srt extension so
+        # ffmpeg still infers the output format.
+        tmp_srt = dst.with_name(f"{dst.stem}.part.srt")
+        try:
+            subprocess.run(["ffmpeg", "-nostdin", "-y", "-i", str(sidecar), str(tmp_srt)], check=True)
+            shutil.move(str(tmp_srt), dst)
+        finally:
+            tmp_srt.unlink(missing_ok=True)  # no-op after a successful move
+    except Exception as e:
+        # Deliberately broad: a missing subtitle copy must not abort the caller.
+        print(f"[post] sidecar copy/convert failed: {e}", flush=True)
+
+
+def write_plain_transcript(media_path: Path, text: str, language: str = "en") -> Path:
+    """Persist a timestamp-less transcript as a .txt/.json pair under ``TRN``.
+
+    The text file holds ``text`` verbatim; the JSON file records the media
+    path, the language and one segment spanning 0.0-0.0 that carries the whole
+    text. Returns the suffix-less base path the two files are derived from.
+    """
+    # NOTE(review): with_suffix() replaces whatever follows the last dot, so a
+    # media stem that itself contains a dot yields truncated file names here.
+    # Confirm against callers that build "<stem>.json" by concatenation.
+    base = TRN / media_path.stem
+    base.parent.mkdir(parents=True, exist_ok=True)
+
+    txt_path = base.with_suffix(".txt")
+    txt_path.write_text(text, encoding="utf-8")
+
+    whole_text_segment = {"start": 0.0, "end": 0.0, "text": text}
+    payload = {
+        "file": str(media_path),
+        "language": language,
+        "segments": [whole_text_segment],
+    }
+    json_path = base.with_suffix(".json")
+    json_path.write_bytes(orjson.dumps(payload))
+    return base
+
 def yt_dlp(url, outdir):
    # 1) Normalize YouTube Music URLs to standard YouTube
    yurl = url
@@ -316,6 +388,7 @@ def publish_to_openwebui(paths):

 def handle_local_file(path_str: str):
    """Transcribe & index a local media file that already exists in /library.
+    If a sidecar .txt/.srt/.vtt exists, use it instead of running Whisper.
    Safe to call repeatedly; it skips if transcript JSON already exists.
    """
    try:
@@ -323,13 +396,30 @@ def handle_local_file(path_str: str):
        if not p.exists():
            log({"url": path_str, "status": "error", "error": "file_not_found"})
            return
+
        title = p.stem
        base_json = TRN / f"{title}.json"
        if base_json.exists():
            log({"url": path_str, "status": "skip", "reason": "already_transcribed"})
            return
-        info = {"url": path_str, "status": "transcribing", "title": title, "uploader": p.parent.name, "date": "", "path": str(p), "progress": 0}
+
+        info = {"url": path_str, "status": "transcribing", "title": title,
+                "uploader": p.parent.name, "date": "", "path": str(p), "progress": 0}
        log(info)
+
+        # 1) Prefer an existing transcript sidecar if present
+        sidecar = find_sidecar_transcript(p)
+        if sidecar:
+            plain = transcript_text_from_file(sidecar)
+            lang = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
+            base = write_plain_transcript(p, plain, language=lang)
+            ensure_sidecar_next_to_media(sidecar, p, lang=lang)
+            index_meili(base.with_suffix(".json"))
+            publish_to_openwebui([base.with_suffix(".txt")])
+            log({**info, **{"status": "done", "note": "used_existing_transcript"}})
+            return
+
+        # 2) Otherwise, run transcription
        base = transcribe(p)
        index_meili(base.with_suffix(".json"))
        publish_to_openwebui([base.with_suffix(".txt")])