Fixing fixes
This commit is contained in:
91
app/app.py
91
app/app.py
@@ -187,10 +187,26 @@ def recent():
|
||||
return "\n".join(html)
|
||||
|
||||
def _safe_under(base: Path, rel_path: str) -> Path:
    """
    Resolve rel_path safely under base.

    An absolute rel_path is resolved as given and accepted only when it is
    already located under base. A relative rel_path is joined to base first.
    Any path that resolves outside base is rejected.

    Args:
        base: Directory the result must stay inside.
        rel_path: Relative or absolute path to resolve.

    Returns:
        The resolved path: base itself or a path below it.

    Raises:
        FileNotFoundError: If rel_path cannot be resolved ("Invalid path") or
            resolves outside base ("Path escapes base").
    """
    try:
        requested = Path(rel_path)
        if not requested.is_absolute():
            requested = base / requested
        candidate = requested.resolve()
    except (TypeError, ValueError, OSError, RuntimeError) as exc:
        # e.g. non-string input, embedded NUL byte, or a symlink loop
        raise FileNotFoundError("Invalid path") from exc

    root = base.resolve()
    # Compare path components, not string prefixes: a prefix test would accept
    # a sibling such as "<base>_evil", and a "<base> + separator" test rejects
    # every child when base is the filesystem root.
    try:
        candidate.relative_to(root)
    except ValueError:
        raise FileNotFoundError("Path escapes base") from None
    return candidate
|
||||
|
||||
def _vtt_header():
    """Return the text that opens a WebVTT document: the magic line plus a blank line."""
    header = "WEBVTT\n\n"
    return header
|
||||
@@ -276,28 +292,46 @@ def _parse_vtt_to_cues(vtt_text: str):
|
||||
def _load_transcript_variants(basename: str):
    """
    Return tuple (kind, content_text, path_used) where kind in {'vtt','srt','json','txt',None}

    - Tries exact filename matches first ("<basename>.vtt", ".srt", ".json", ".txt").
    - If not found, falls back to the first file (in sorted order) whose name
      starts with the basename (prefix match), trying the extensions in the same order.
    - Returns (None, "", "") when nothing readable is found.

    Only files that resolve to a location inside TRANSCRIPT_ROOT are read.
    """
    # Local import: glob.escape is only needed for the prefix search below.
    from glob import escape as glob_escape

    # Resolve once so containment checks compare like with like, even when the
    # configured root is relative or goes through a symlink.
    root = Path(TRANSCRIPT_ROOT).resolve()
    kinds = ("vtt", "srt", "json", "txt")

    def try_read(path: Path, k: str):
        """Return (k, text, resolved_path) for a readable file under root, else None."""
        try:
            rp = path.resolve()
            # Component-wise check; a string-prefix test would accept siblings
            # such as "<root>_private/...".
            rp.relative_to(root)
        except (OSError, ValueError, RuntimeError):
            return None
        if not rp.is_file():
            return None
        try:
            with open(rp, "r", encoding="utf-8", errors="ignore") as f:
                return (k, f.read(), str(rp))
        except OSError:
            return None

    # 1) exact matches
    for k in kinds:
        got = try_read(root / f"{basename}.{k}", k)
        if got:
            return got

    # 2) prefix/fuzzy matches (e.g., "<base>*.vtt", "<base>*.txt", etc.)
    # Escape the basename so characters like "[" or "*" in it are matched
    # literally instead of acting as wildcards.
    prefix = glob_escape(basename)
    for k in kinds:
        try:
            # Sorted so the "first" match is deterministic.
            matches = sorted(root.glob(f"{prefix}*.{k}"))
        except (OSError, ValueError, NotImplementedError):
            # e.g. an absolute basename yields a non-relative pattern
            continue
        for gp in matches:
            got = try_read(gp, k)
            if got:
                return got

    return (None, "", "")
|
||||
|
||||
@app.get("/search")
|
||||
@@ -452,20 +486,35 @@ def subtitle():
|
||||
)
|
||||
return html
|
||||
elif kind == "txt":
|
||||
# Normalize and lightly beautify plain text transcripts
|
||||
safe = content.strip()
|
||||
# Simple paragraphization: collapse >2 newlines, wrap in <p>
|
||||
|
||||
# Remove common timestamp patterns like [00:12:34], (00:12), 00:12:34 -
|
||||
safe = re.sub(r"\[(\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\]\s*", "", safe)
|
||||
safe = re.sub(r"\((\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\)\s*", "", safe)
|
||||
safe = re.sub(r"(?m)^\s*(\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\s*[-–—]?\s*", "", safe)
|
||||
|
||||
# Collapse multiple blank lines
|
||||
safe = re.sub(r"\n{3,}", "\n\n", safe)
|
||||
|
||||
# Paragraphization: split on blank lines, collapse inner newlines to spaces
|
||||
paras = [p.strip() for p in re.split(r"\n{2,}", safe) if p.strip()]
|
||||
clean_paras = [re.sub(r'[\n\r]+', ' ', p) for p in paras[:2000]]
|
||||
items = "".join(f"<p>{p}</p>" for p in clean_paras)
|
||||
# Build fallback without using backslashes inside an f-string expression
|
||||
|
||||
fallback = f"<pre style='white-space:pre-wrap'>{safe}</pre>"
|
||||
body = items if items else fallback
|
||||
return (
|
||||
"<!doctype html><meta charset='utf-8'>"
|
||||
"<title>Transcript</title>"
|
||||
"<style>body{font-family:system-ui;margin:1rem;line-height:1.6;max-width:900px} p{margin:.4rem 0}</style>"
|
||||
f"<h3>Transcript (plain text): {base}</h3>"
|
||||
f"{body}"
|
||||
"<style>"
|
||||
"body{font-family:system-ui;margin:1rem;line-height:1.7;color:#111}"
|
||||
".wrap{max-width:900px;margin:0 auto}"
|
||||
"p{margin:.5rem 0}"
|
||||
".wrap p{text-wrap:pretty}"
|
||||
"</style>"
|
||||
f"<div class='wrap'><h3>Transcript (plain text): {base}</h3>"
|
||||
f"{body}</div>"
|
||||
)
|
||||
else:
|
||||
return "<small>No transcript found.</small>"
|
||||
|
Reference in New Issue
Block a user