Add RSS feed downloading
This commit is contained in:
@@ -39,6 +39,78 @@ def log(feed):
|
||||
def sanitize(name):
    """Replace runs of filename-hostile characters in *name* with one space.

    The characters treated as hostile are ``\\ / : " * ? < > |``. Each
    consecutive run of them collapses to a single space, and leading/trailing
    whitespace is trimmed from the result.
    """
    hostile_run = r'[\\/:"*?<>|]+'
    spaced = re.sub(hostile_run, ' ', name)
    return spaced.strip()
|
||||
|
||||
def find_sidecar_transcript(media_path: Path) -> Path | None:
|
||||
"""Return a .txt/.srt/.vtt transcript file sitting next to media, if any.
|
||||
Tries common variants including language-suffixed SRT/VTT.
|
||||
"""
|
||||
candidates: list[Path] = []
|
||||
# exact same stem in same folder
|
||||
for ext in [".txt", ".srt", ".vtt"]:
|
||||
p = media_path.parent / (media_path.stem + ext)
|
||||
if p.exists():
|
||||
candidates.append(p)
|
||||
# language-suffixed near the media file (e.g., .en.srt)
|
||||
for ext in [".srt", ".vtt"]:
|
||||
p = media_path.with_suffix(f".en{ext}")
|
||||
if p.exists() and p not in candidates:
|
||||
candidates.append(p)
|
||||
return candidates[0] if candidates else None
|
||||
|
||||
|
||||
def transcript_text_from_file(path: Path) -> str:
    """Extract plain text from a .txt/.srt/.vtt file.

    ``.txt`` files are returned as-is apart from trimming surrounding
    whitespace. For any other suffix the content is treated as SRT/VTT:
    blank lines, timestamp lines (containing ``-->``), lines starting with
    ``WEBVTT`` and purely numeric cue-index lines are dropped, and the
    remaining lines are joined with single spaces.

    Args:
        path: Transcript or subtitle file to read.

    Returns:
        The extracted text; an empty string if nothing remains.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # "utf-8-sig" decodes ordinary UTF-8 and additionally drops a leading BOM.
    # With plain "utf-8" the BOM stays glued to the first line, so a first cue
    # index ("\ufeff1") or header ("\ufeffWEBVTT") is not recognised and leaks
    # into the text. Undecodable bytes are still skipped via errors="ignore".
    raw = path.read_text(encoding="utf-8-sig", errors="ignore")

    if path.suffix.lower() == ".txt":
        return raw.strip()

    # For SRT/VTT, drop timestamp lines, cue numbers and headers.
    kept: list[str] = []
    for line in raw.splitlines():
        ls = line.strip()
        if not ls:
            continue
        if "-->" in ls:  # timestamp line
            continue
        if ls.upper().startswith("WEBVTT"):  # VTT file header
            continue
        if re.fullmatch(r"\d+", ls):  # cue index
            continue
        kept.append(ls)
    return " ".join(kept)
|
||||
|
||||
|
||||
def ensure_sidecar_next_to_media(sidecar: Path, media_path: Path, lang: str = "en") -> None:
    """Ensure a ``<media stem>.<lang>.srt`` file sits next to the media.

    * ``.srt`` sidecars are copied to the destination (metadata preserved).
      If the sidecar already *is* the destination file, nothing is done.
    * ``.vtt`` sidecars are converted to SRT with ``ffmpeg`` and the result is
      moved to the destination.
    * Any other suffix is ignored.

    This is best-effort: every failure is reported on stdout and swallowed so
    the caller's pipeline keeps going.

    Args:
        sidecar: Existing subtitle file (.srt or .vtt).
        media_path: Media file the subtitle belongs to.
        lang: Language code used in the destination file name.
    """
    import tempfile  # local import: only needed for the VTT conversion path

    kind = sidecar.suffix.lower()
    if kind not in (".srt", ".vtt"):
        return

    try:
        dst = media_path.with_suffix(f".{lang}.srt")

        if kind == ".srt":
            # Copying a file onto itself raises shutil.SameFileError, which
            # used to surface as a bogus "failed" message when the sidecar
            # was already named <stem>.<lang>.srt.
            if dst.exists() and dst.samefile(sidecar):
                return
            shutil.copy2(sidecar, dst)
            return

        # Convert in a private scratch directory. Writing the intermediate
        # file as sidecar.with_suffix(".srt") could overwrite (ffmpeg -y) and
        # then move away an unrelated .srt already sitting in the media folder.
        with tempfile.TemporaryDirectory() as scratch:
            tmp_srt = Path(scratch) / "converted.srt"
            subprocess.run(
                ["ffmpeg", "-nostdin", "-y", "-i", str(sidecar), str(tmp_srt)],
                check=True,
            )
            shutil.move(str(tmp_srt), dst)
    except Exception as e:
        print(f"[post] sidecar copy/convert failed: {e}", flush=True)
|
||||
|
||||
|
||||
def write_plain_transcript(media_path: Path, text: str, language: str = "en") -> Path:
    """Write minimal transcript artifacts (.txt + .json) from plain text.

    Both files are placed under ``TRN`` (defined elsewhere in this file) and
    named after the media file's stem. The JSON document carries the media
    path, the language and a single segment holding the whole text with zeroed
    timestamps, since plain text has no timing information.

    Args:
        media_path: Media file the transcript belongs to.
        text: Transcript text without timestamps.
        language: Language code recorded in the JSON document.

    Returns:
        The extension-less base path ``TRN / <stem>``; callers derive the
        artifact paths from it with ``with_suffix``.
    """
    base = TRN / media_path.stem
    base.parent.mkdir(parents=True, exist_ok=True)

    # NOTE(review): with_suffix() replaces whatever follows the last dot, so a
    # stem such as "Show.S01E01" yields "Show.txt" / "Show.json". Callers that
    # build paths the same way stay consistent, but distinct media may collide
    # - confirm whether dotted stems occur in practice.
    txt_path = base.with_suffix(".txt")
    txt_path.write_text(text, encoding="utf-8")

    single_segment = {"start": 0.0, "end": 0.0, "text": text}
    document = {
        "file": str(media_path),
        "language": language,
        "segments": [single_segment],
    }
    json_path = base.with_suffix(".json")
    json_path.write_bytes(orjson.dumps(document))

    return base
|
||||
|
||||
def yt_dlp(url, outdir):
|
||||
# 1) Normalize YouTube Music URLs to standard YouTube
|
||||
yurl = url
|
||||
@@ -316,6 +388,7 @@ def publish_to_openwebui(paths):
|
||||
|
||||
def handle_local_file(path_str: str):
|
||||
"""Transcribe & index a local media file that already exists in /library.
|
||||
If a sidecar .txt/.srt/.vtt exists, use it instead of running Whisper.
|
||||
Safe to call repeatedly; it skips if transcript JSON already exists.
|
||||
"""
|
||||
try:
|
||||
@@ -323,13 +396,30 @@ def handle_local_file(path_str: str):
|
||||
if not p.exists():
|
||||
log({"url": path_str, "status": "error", "error": "file_not_found"})
|
||||
return
|
||||
|
||||
title = p.stem
|
||||
base_json = TRN / f"{title}.json"
|
||||
if base_json.exists():
|
||||
log({"url": path_str, "status": "skip", "reason": "already_transcribed"})
|
||||
return
|
||||
info = {"url": path_str, "status": "transcribing", "title": title, "uploader": p.parent.name, "date": "", "path": str(p), "progress": 0}
|
||||
|
||||
info = {"url": path_str, "status": "transcribing", "title": title,
|
||||
"uploader": p.parent.name, "date": "", "path": str(p), "progress": 0}
|
||||
log(info)
|
||||
|
||||
# 1) Prefer an existing transcript sidecar if present
|
||||
sidecar = find_sidecar_transcript(p)
|
||||
if sidecar:
|
||||
plain = transcript_text_from_file(sidecar)
|
||||
lang = os.getenv("DEFAULT_TRANSCRIPT_LANG", "en").strip() or "en"
|
||||
base = write_plain_transcript(p, plain, language=lang)
|
||||
ensure_sidecar_next_to_media(sidecar, p, lang=lang)
|
||||
index_meili(base.with_suffix(".json"))
|
||||
publish_to_openwebui([base.with_suffix(".txt")])
|
||||
log({**info, **{"status": "done", "note": "used_existing_transcript"}})
|
||||
return
|
||||
|
||||
# 2) Otherwise, run transcription
|
||||
base = transcribe(p)
|
||||
index_meili(base.with_suffix(".json"))
|
||||
publish_to_openwebui([base.with_suffix(".txt")])
|
||||
|
Reference in New Issue
Block a user