diff --git a/app/app.py b/app/app.py
index bac7754..2191f3a 100644
--- a/app/app.py
+++ b/app/app.py
@@ -187,10 +187,33 @@ def recent():
     return "\n".join(html)
 
 def _safe_under(base: Path, rel_path: str) -> Path:
-    candidate = (base / rel_path.lstrip('/')).resolve()
-    if not str(candidate).startswith(str(base)):
-        raise FileNotFoundError("Path escapes base")
-    return candidate
+    """
+    Resolve rel_path to a real path that is guaranteed to live under base.
+
+    A relative rel_path is joined to base. An absolute rel_path is accepted
+    only when it already resolves to base itself or to something inside it;
+    it is never re-rooted under base.
+
+    Raises FileNotFoundError when rel_path cannot be turned into a path or
+    when the resolved path escapes base (via "..", a symlink, or an absolute
+    path that points elsewhere).
+    """
+    try:
+        p = Path(rel_path)
+        # resolve() collapses ".." and follows symlinks, so the containment
+        # check below sees the real target rather than the spelling.
+        candidate = p.resolve() if p.is_absolute() else (base / p).resolve()
+    except (TypeError, ValueError, OSError, RuntimeError) as err:
+        # e.g. non-string input, embedded NUL bytes, symlink loops.
+        raise FileNotFoundError("Invalid path") from err
+
+    base_resolved = base.resolve()
+    # Compare path components, not string prefixes: this cannot confuse
+    # "/data" with "/data-evil" and still works when base is the filesystem
+    # root, where a "base + separator" prefix would never match.
+    if candidate == base_resolved or base_resolved in candidate.parents:
+        return candidate
+    raise FileNotFoundError("Path escapes base")
 
 def _vtt_header():
     return "WEBVTT\n\n"
@@ -276,28 +292,46 @@ def _parse_vtt_to_cues(vtt_text: str):
def _load_transcript_variants(basename: str):
"""
Return tuple (kind, content_text, path_used) where kind in {'vtt','srt','json','txt',None}
+ - Tries exact filename matches first.
+ - If not found, falls back to the first file whose name starts with the basename (prefix match).
"""
- # Look under TRANSCRIPT_ROOT securely
root = TRANSCRIPT_ROOT
- cand = [
+
+ def try_read(path: Path, k: str):
+ try:
+ rp = path.resolve()
+ if not str(rp).startswith(str(root)):
+ return None
+ if rp.exists():
+ with open(rp, "r", encoding="utf-8", errors="ignore") as f:
+ return (k, f.read(), str(rp))
+ except Exception:
+ return None
+ return None
+
+ # 1) exact matches
+ exact = [
(root / f"{basename}.vtt", "vtt"),
(root / f"{basename}.srt", "srt"),
(root / f"{basename}.json", "json"),
(root / f"{basename}.txt", "txt"),
]
- for p, k in cand:
+ for p, k in exact:
+ got = try_read(p, k)
+ if got:
+ return got
+
+ # 2) prefix/fuzzy matches (e.g., "
+ + # Remove common timestamp patterns like [00:12:34], (00:12), 00:12:34 - + safe = re.sub(r"\[(\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\]\s*", "", safe) + safe = re.sub(r"\((\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\)\s*", "", safe) + safe = re.sub(r"(?m)^\s*(\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\s*[-–—]?\s*", "", safe) + + # Collapse multiple blank lines + safe = re.sub(r"\n{3,}", "\n\n", safe) + + # Paragraphization: split on blank lines, collapse inner newlines to spaces paras = [p.strip() for p in re.split(r"\n{2,}", safe) if p.strip()] clean_paras = [re.sub(r'[\n\r]+', ' ', p) for p in paras[:2000]] items = "".join(f"
{p}
" for p in clean_paras) - # Build fallback without using backslashes inside an f-string expression + fallback = f"{safe}" body = items if items else fallback return ( "" "