Fixing fixes
This commit is contained in:
91
app/app.py
91
app/app.py
@@ -187,10 +187,26 @@ def recent():
|
|||||||
return "\n".join(html)
|
return "\n".join(html)
|
||||||
|
|
||||||
def _safe_under(base: Path, rel_path: str) -> Path:
|
def _safe_under(base: Path, rel_path: str) -> Path:
|
||||||
candidate = (base / rel_path.lstrip('/')).resolve()
|
"""
|
||||||
if not str(candidate).startswith(str(base)):
|
Resolve rel_path safely under base. If an absolute path is provided and it is
|
||||||
raise FileNotFoundError("Path escapes base")
|
already under base, allow it. Otherwise join to base. Reject any path that
|
||||||
return candidate
|
escapes base.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
p = Path(rel_path)
|
||||||
|
if p.is_absolute():
|
||||||
|
candidate = p.resolve()
|
||||||
|
else:
|
||||||
|
candidate = (base / rel_path).resolve()
|
||||||
|
except Exception:
|
||||||
|
raise FileNotFoundError("Invalid path")
|
||||||
|
|
||||||
|
base_str = str(base.resolve())
|
||||||
|
cand_str = str(candidate)
|
||||||
|
# allow exact base or any child path
|
||||||
|
if cand_str == base_str or cand_str.startswith(base_str + os.sep):
|
||||||
|
return candidate
|
||||||
|
raise FileNotFoundError("Path escapes base")
|
||||||
|
|
||||||
def _vtt_header():
|
def _vtt_header():
|
||||||
return "WEBVTT\n\n"
|
return "WEBVTT\n\n"
|
||||||
@@ -276,28 +292,46 @@ def _parse_vtt_to_cues(vtt_text: str):
|
|||||||
def _load_transcript_variants(basename: str):
|
def _load_transcript_variants(basename: str):
|
||||||
"""
|
"""
|
||||||
Return tuple (kind, content_text, path_used) where kind in {'vtt','srt','json','txt',None}
|
Return tuple (kind, content_text, path_used) where kind in {'vtt','srt','json','txt',None}
|
||||||
|
- Tries exact filename matches first.
|
||||||
|
- If not found, falls back to the first file whose name starts with the basename (prefix match).
|
||||||
"""
|
"""
|
||||||
# Look under TRANSCRIPT_ROOT securely
|
|
||||||
root = TRANSCRIPT_ROOT
|
root = TRANSCRIPT_ROOT
|
||||||
cand = [
|
|
||||||
|
def try_read(path: Path, k: str):
|
||||||
|
try:
|
||||||
|
rp = path.resolve()
|
||||||
|
if not str(rp).startswith(str(root)):
|
||||||
|
return None
|
||||||
|
if rp.exists():
|
||||||
|
with open(rp, "r", encoding="utf-8", errors="ignore") as f:
|
||||||
|
return (k, f.read(), str(rp))
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
return None
|
||||||
|
|
||||||
|
# 1) exact matches
|
||||||
|
exact = [
|
||||||
(root / f"{basename}.vtt", "vtt"),
|
(root / f"{basename}.vtt", "vtt"),
|
||||||
(root / f"{basename}.srt", "srt"),
|
(root / f"{basename}.srt", "srt"),
|
||||||
(root / f"{basename}.json", "json"),
|
(root / f"{basename}.json", "json"),
|
||||||
(root / f"{basename}.txt", "txt"),
|
(root / f"{basename}.txt", "txt"),
|
||||||
]
|
]
|
||||||
for p, k in cand:
|
for p, k in exact:
|
||||||
|
got = try_read(p, k)
|
||||||
|
if got:
|
||||||
|
return got
|
||||||
|
|
||||||
|
# 2) prefix/fuzzy matches (e.g., "<base>*.vtt", "<base>*.txt", etc.)
|
||||||
|
exts = [("vtt","vtt"), ("srt","srt"), ("json","json"), ("txt","txt")]
|
||||||
|
for ext, k in exts:
|
||||||
try:
|
try:
|
||||||
p = p.resolve()
|
for gp in root.glob(f"{basename}*.{ext}"):
|
||||||
|
got = try_read(gp, k)
|
||||||
|
if got:
|
||||||
|
return got
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
if not str(p).startswith(str(root)):
|
|
||||||
continue
|
|
||||||
if p.exists():
|
|
||||||
try:
|
|
||||||
with open(p, "r", encoding="utf-8", errors="ignore") as f:
|
|
||||||
return (k, f.read(), str(p))
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
return (None, "", "")
|
return (None, "", "")
|
||||||
|
|
||||||
@app.get("/search")
|
@app.get("/search")
|
||||||
@@ -452,20 +486,35 @@ def subtitle():
|
|||||||
)
|
)
|
||||||
return html
|
return html
|
||||||
elif kind == "txt":
|
elif kind == "txt":
|
||||||
|
# Normalize and lightly beautify plain text transcripts
|
||||||
safe = content.strip()
|
safe = content.strip()
|
||||||
# Simple paragraphization: collapse >2 newlines, wrap in <p>
|
|
||||||
|
# Remove common timestamp patterns like [00:12:34], (00:12), 00:12:34 -
|
||||||
|
safe = re.sub(r"\[(\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\]\s*", "", safe)
|
||||||
|
safe = re.sub(r"\((\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\)\s*", "", safe)
|
||||||
|
safe = re.sub(r"(?m)^\s*(\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\s*[-–—]?\s*", "", safe)
|
||||||
|
|
||||||
|
# Collapse multiple blank lines
|
||||||
|
safe = re.sub(r"\n{3,}", "\n\n", safe)
|
||||||
|
|
||||||
|
# Paragraphization: split on blank lines, collapse inner newlines to spaces
|
||||||
paras = [p.strip() for p in re.split(r"\n{2,}", safe) if p.strip()]
|
paras = [p.strip() for p in re.split(r"\n{2,}", safe) if p.strip()]
|
||||||
clean_paras = [re.sub(r'[\n\r]+', ' ', p) for p in paras[:2000]]
|
clean_paras = [re.sub(r'[\n\r]+', ' ', p) for p in paras[:2000]]
|
||||||
items = "".join(f"<p>{p}</p>" for p in clean_paras)
|
items = "".join(f"<p>{p}</p>" for p in clean_paras)
|
||||||
# Build fallback without using backslashes inside an f-string expression
|
|
||||||
fallback = f"<pre style='white-space:pre-wrap'>{safe}</pre>"
|
fallback = f"<pre style='white-space:pre-wrap'>{safe}</pre>"
|
||||||
body = items if items else fallback
|
body = items if items else fallback
|
||||||
return (
|
return (
|
||||||
"<!doctype html><meta charset='utf-8'>"
|
"<!doctype html><meta charset='utf-8'>"
|
||||||
"<title>Transcript</title>"
|
"<title>Transcript</title>"
|
||||||
"<style>body{font-family:system-ui;margin:1rem;line-height:1.6;max-width:900px} p{margin:.4rem 0}</style>"
|
"<style>"
|
||||||
f"<h3>Transcript (plain text): {base}</h3>"
|
"body{font-family:system-ui;margin:1rem;line-height:1.7;color:#111}"
|
||||||
f"{body}"
|
".wrap{max-width:900px;margin:0 auto}"
|
||||||
|
"p{margin:.5rem 0}"
|
||||||
|
".wrap p{text-wrap:pretty}"
|
||||||
|
"</style>"
|
||||||
|
f"<div class='wrap'><h3>Transcript (plain text): {base}</h3>"
|
||||||
|
f"{body}</div>"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
return "<small>No transcript found.</small>"
|
return "<small>No transcript found.</small>"
|
||||||
|
Reference in New Issue
Block a user