Fixing fixes

This commit is contained in:
2025-09-08 18:41:03 +02:00
parent e2bf3e945d
commit 0603817743

View File

@@ -187,10 +187,26 @@ def recent():
return "\n".join(html) return "\n".join(html)
def _safe_under(base: Path, rel_path: str) -> Path: def _safe_under(base: Path, rel_path: str) -> Path:
candidate = (base / rel_path.lstrip('/')).resolve() """
if not str(candidate).startswith(str(base)): Resolve rel_path safely under base. If an absolute path is provided and it is
raise FileNotFoundError("Path escapes base") already under base, allow it. Otherwise join to base. Reject any path that
escapes base.
"""
try:
p = Path(rel_path)
if p.is_absolute():
candidate = p.resolve()
else:
candidate = (base / rel_path).resolve()
except Exception:
raise FileNotFoundError("Invalid path")
base_str = str(base.resolve())
cand_str = str(candidate)
# allow exact base or any child path
if cand_str == base_str or cand_str.startswith(base_str + os.sep):
return candidate return candidate
raise FileNotFoundError("Path escapes base")
def _vtt_header(): def _vtt_header():
return "WEBVTT\n\n" return "WEBVTT\n\n"
@@ -276,28 +292,46 @@ def _parse_vtt_to_cues(vtt_text: str):
def _load_transcript_variants(basename: str): def _load_transcript_variants(basename: str):
""" """
Return tuple (kind, content_text, path_used) where kind in {'vtt','srt','json','txt',None} Return tuple (kind, content_text, path_used) where kind in {'vtt','srt','json','txt',None}
- Tries exact filename matches first.
- If not found, falls back to the first file whose name starts with the basename (prefix match).
""" """
# Look under TRANSCRIPT_ROOT securely
root = TRANSCRIPT_ROOT root = TRANSCRIPT_ROOT
cand = [
def try_read(path: Path, k: str):
try:
rp = path.resolve()
if not str(rp).startswith(str(root)):
return None
if rp.exists():
with open(rp, "r", encoding="utf-8", errors="ignore") as f:
return (k, f.read(), str(rp))
except Exception:
return None
return None
# 1) exact matches
exact = [
(root / f"{basename}.vtt", "vtt"), (root / f"{basename}.vtt", "vtt"),
(root / f"{basename}.srt", "srt"), (root / f"{basename}.srt", "srt"),
(root / f"{basename}.json", "json"), (root / f"{basename}.json", "json"),
(root / f"{basename}.txt", "txt"), (root / f"{basename}.txt", "txt"),
] ]
for p, k in cand: for p, k in exact:
got = try_read(p, k)
if got:
return got
# 2) prefix/fuzzy matches (e.g., "<base>*.vtt", "<base>*.txt", etc.)
exts = [("vtt","vtt"), ("srt","srt"), ("json","json"), ("txt","txt")]
for ext, k in exts:
try: try:
p = p.resolve() for gp in root.glob(f"{basename}*.{ext}"):
except Exception: got = try_read(gp, k)
continue if got:
if not str(p).startswith(str(root)): return got
continue
if p.exists():
try:
with open(p, "r", encoding="utf-8", errors="ignore") as f:
return (k, f.read(), str(p))
except Exception: except Exception:
continue continue
return (None, "", "") return (None, "", "")
@app.get("/search") @app.get("/search")
@@ -452,20 +486,35 @@ def subtitle():
) )
return html return html
elif kind == "txt": elif kind == "txt":
# Normalize and lightly beautify plain text transcripts
safe = content.strip() safe = content.strip()
# Simple paragraphization: collapse >2 newlines, wrap in <p>
# Remove common timestamp patterns like [00:12:34], (00:12), 00:12:34 -
safe = re.sub(r"\[(\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\]\s*", "", safe)
safe = re.sub(r"\((\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\)\s*", "", safe)
safe = re.sub(r"(?m)^\s*(\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\s*[-–—]?\s*", "", safe)
# Collapse multiple blank lines
safe = re.sub(r"\n{3,}", "\n\n", safe)
# Paragraphization: split on blank lines, collapse inner newlines to spaces
paras = [p.strip() for p in re.split(r"\n{2,}", safe) if p.strip()] paras = [p.strip() for p in re.split(r"\n{2,}", safe) if p.strip()]
clean_paras = [re.sub(r'[\n\r]+', ' ', p) for p in paras[:2000]] clean_paras = [re.sub(r'[\n\r]+', ' ', p) for p in paras[:2000]]
items = "".join(f"<p>{p}</p>" for p in clean_paras) items = "".join(f"<p>{p}</p>" for p in clean_paras)
# Build fallback without using backslashes inside an f-string expression
fallback = f"<pre style='white-space:pre-wrap'>{safe}</pre>" fallback = f"<pre style='white-space:pre-wrap'>{safe}</pre>"
body = items if items else fallback body = items if items else fallback
return ( return (
"<!doctype html><meta charset='utf-8'>" "<!doctype html><meta charset='utf-8'>"
"<title>Transcript</title>" "<title>Transcript</title>"
"<style>body{font-family:system-ui;margin:1rem;line-height:1.6;max-width:900px} p{margin:.4rem 0}</style>" "<style>"
f"<h3>Transcript (plain text): {base}</h3>" "body{font-family:system-ui;margin:1rem;line-height:1.7;color:#111}"
f"{body}" ".wrap{max-width:900px;margin:0 auto}"
"p{margin:.5rem 0}"
".wrap p{text-wrap:pretty}"
"</style>"
f"<div class='wrap'><h3>Transcript (plain text): {base}</h3>"
f"{body}</div>"
) )
else: else:
return "<small>No transcript found.</small>" return "<small>No transcript found.</small>"