Fixing fixes

This commit is contained in:
2025-09-08 18:41:03 +02:00
parent e2bf3e945d
commit 0603817743

View File

@@ -187,10 +187,26 @@ def recent():
return "\n".join(html)
def _safe_under(base: Path, rel_path: str) -> Path:
    """
    Resolve rel_path safely under base.

    A relative rel_path is joined to base. An absolute rel_path is accepted
    only when it already resolves to base itself or to a location beneath it.
    Symlinks and ".." segments are resolved before the containment check.

    Args:
        base: Directory the result must stay inside.
        rel_path: Relative or absolute path requested by the caller.

    Returns:
        The fully resolved path (base itself or one of its descendants).

    Raises:
        FileNotFoundError: If rel_path cannot be resolved, or if it resolves
            to a location outside base.
    """
    base_resolved = base.resolve()
    try:
        requested = Path(rel_path)
        if requested.is_absolute():
            candidate = requested.resolve()
        else:
            candidate = (base_resolved / requested).resolve()
    except Exception as exc:
        # Callers only expect FileNotFoundError; keep the real cause chained.
        raise FileNotFoundError("Invalid path") from exc
    # Compare path components rather than string prefixes: a prefix test
    # would accept "/srv/data_evil" for base "/srv/data", and appending a
    # separator to the base string breaks when base is the filesystem root.
    try:
        candidate.relative_to(base_resolved)
    except ValueError:
        raise FileNotFoundError("Path escapes base") from None
    return candidate
def _vtt_header():
    """Return the literal "WEBVTT" signature line followed by a blank line."""
    return "WEBVTT\n\n"
@@ -276,28 +292,46 @@ def _parse_vtt_to_cues(vtt_text: str):
def _load_transcript_variants(basename: str):
    """
    Return tuple (kind, content_text, path_used) where kind in {'vtt','srt','json','txt',None}

    Lookup happens under TRANSCRIPT_ROOT in two passes:
    - Exact filename matches first ("<basename>.vtt", ".srt", ".json", ".txt",
      in that order).
    - If none is readable, the first file (in sorted order, per extension)
      whose name starts with the basename (prefix match).

    Any candidate that resolves outside TRANSCRIPT_ROOT, or that cannot be
    read, is skipped. When nothing is found the result is (None, "", "").
    """
    import glob  # local import: only glob.escape is needed here

    if not basename:
        # An empty basename would turn the prefix pass into "*.<ext>" and
        # hand back an arbitrary transcript.
        return (None, "", "")

    root = TRANSCRIPT_ROOT.resolve()
    kinds = ("vtt", "srt", "json", "txt")

    def try_read(path: Path, k: str):
        """Return (k, text, resolved_path) if path is a readable file under root, else None."""
        try:
            rp = path.resolve()
            # Component-wise containment check; a string prefix test would
            # let a sibling such as "<root>_old" through.
            rp.relative_to(root)
            with open(rp, "r", encoding="utf-8", errors="ignore") as f:
                return (k, f.read(), str(rp))
        except (OSError, ValueError, RuntimeError):
            # Missing file, directory, permission problem, or path outside
            # root: this lookup is best-effort, so move on to the next one.
            return None

    # 1) exact matches
    for ext in kinds:
        got = try_read(root / f"{basename}.{ext}", ext)
        if got:
            return got

    # 2) prefix matches (e.g. "<base>*.vtt", "<base>*.txt", ...)
    # Escape the basename so "[", "*" and "?" in a title are matched literally.
    stem = glob.escape(basename)
    for ext in kinds:
        try:
            # sorted() makes "first match" deterministic and also forces the
            # lazy glob to run inside this try block.
            matches = sorted(root.glob(f"{stem}*.{ext}"))
        except (OSError, ValueError, NotImplementedError):
            # e.g. an absolute basename yields a non-relative pattern
            continue
        for gp in matches:
            got = try_read(gp, ext)
            if got:
                return got

    return (None, "", "")
@app.get("/search")
@@ -452,20 +486,35 @@ def subtitle():
)
return html
elif kind == "txt":
# Normalize and lightly beautify plain text transcripts
safe = content.strip()
# Simple paragraphization: collapse >2 newlines, wrap in <p>
# Remove common timestamp patterns like [00:12:34], (00:12), 00:12:34 -
safe = re.sub(r"\[(\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\]\s*", "", safe)
safe = re.sub(r"\((\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\)\s*", "", safe)
safe = re.sub(r"(?m)^\s*(\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\s*[-–—]?\s*", "", safe)
# Collapse multiple blank lines
safe = re.sub(r"\n{3,}", "\n\n", safe)
# Paragraphization: split on blank lines, collapse inner newlines to spaces
paras = [p.strip() for p in re.split(r"\n{2,}", safe) if p.strip()]
clean_paras = [re.sub(r'[\n\r]+', ' ', p) for p in paras[:2000]]
items = "".join(f"<p>{p}</p>" for p in clean_paras)
# Build fallback without using backslashes inside an f-string expression
fallback = f"<pre style='white-space:pre-wrap'>{safe}</pre>"
body = items if items else fallback
return (
"<!doctype html><meta charset='utf-8'>"
"<title>Transcript</title>"
"<style>body{font-family:system-ui;margin:1rem;line-height:1.6;max-width:900px} p{margin:.4rem 0}</style>"
f"<h3>Transcript (plain text): {base}</h3>"
f"{body}"
"<style>"
"body{font-family:system-ui;margin:1rem;line-height:1.7;color:#111}"
".wrap{max-width:900px;margin:0 auto}"
"p{margin:.5rem 0}"
".wrap p{text-wrap:pretty}"
"</style>"
f"<div class='wrap'><h3>Transcript (plain text): {base}</h3>"
f"{body}</div>"
)
else:
return "<small>No transcript found.</small>"