From 060381774372a02e87ec35c2d1dc8e22f344b171 Mon Sep 17 00:00:00 2001 From: Tomas Kracmar Date: Mon, 8 Sep 2025 18:41:03 +0200 Subject: [PATCH] Fixing fixes --- app/app.py | 91 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 70 insertions(+), 21 deletions(-) diff --git a/app/app.py b/app/app.py index bac7754..2191f3a 100644 --- a/app/app.py +++ b/app/app.py @@ -187,10 +187,26 @@ def recent(): return "\n".join(html) def _safe_under(base: Path, rel_path: str) -> Path: - candidate = (base / rel_path.lstrip('/')).resolve() - if not str(candidate).startswith(str(base)): - raise FileNotFoundError("Path escapes base") - return candidate + """ + Resolve rel_path safely under base. If an absolute path is provided and it is + already under base, allow it. Otherwise join to base. Reject any path that + escapes base. + """ + try: + p = Path(rel_path) + if p.is_absolute(): + candidate = p.resolve() + else: + candidate = (base / rel_path).resolve() + except Exception: + raise FileNotFoundError("Invalid path") + + base_str = str(base.resolve()) + cand_str = str(candidate) + # allow exact base or any child path + if cand_str == base_str or cand_str.startswith(base_str + os.sep): + return candidate + raise FileNotFoundError("Path escapes base") def _vtt_header(): return "WEBVTT\n\n" @@ -276,28 +292,46 @@ def _parse_vtt_to_cues(vtt_text: str): def _load_transcript_variants(basename: str): """ Return tuple (kind, content_text, path_used) where kind in {'vtt','srt','json','txt',None} + - Tries exact filename matches first. + - If not found, falls back to the first file whose name starts with the basename (prefix match). """ - # Look under TRANSCRIPT_ROOT securely root = TRANSCRIPT_ROOT - cand = [ + + def try_read(path: Path, k: str): + try: + rp = path.resolve() + if not str(rp).startswith(str(root)): + return None + if rp.exists(): + with open(rp, "r", encoding="utf-8", errors="ignore") as f: + return (k, f.read(), str(rp)) + except Exception: + return None + return None + + # 1) exact matches + exact = [ (root / f"{basename}.vtt", "vtt"), (root / f"{basename}.srt", "srt"), (root / f"{basename}.json", "json"), (root / f"{basename}.txt", "txt"), ] - for p, k in cand: + for p, k in exact: + got = try_read(p, k) + if got: + return got + + # 2) prefix/fuzzy matches (e.g., "*.vtt", "*.txt", etc.) + exts = [("vtt","vtt"), ("srt","srt"), ("json","json"), ("txt","txt")] + for ext, k in exts: try: - p = p.resolve() + for gp in root.glob(f"{basename}*.{ext}"): + got = try_read(gp, k) + if got: + return got except Exception: continue - if not str(p).startswith(str(root)): - continue - if p.exists(): - try: - with open(p, "r", encoding="utf-8", errors="ignore") as f: - return (k, f.read(), str(p)) - except Exception: - continue + return (None, "", "") @app.get("/search") @@ -452,20 +486,35 @@ def subtitle(): ) return html elif kind == "txt": + # Normalize and lightly beautify plain text transcripts safe = content.strip() - # Simple paragraphization: collapse >2 newlines, wrap in

+ + # Remove common timestamp patterns like [00:12:34], (00:12), 00:12:34 - + safe = re.sub(r"\[(\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\]\s*", "", safe) + safe = re.sub(r"\((\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\)\s*", "", safe) + safe = re.sub(r"(?m)^\s*(\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\s*[-–—]?\s*", "", safe) + + # Collapse multiple blank lines + safe = re.sub(r"\n{3,}", "\n\n", safe) + + # Paragraphization: split on blank lines, collapse inner newlines to spaces paras = [p.strip() for p in re.split(r"\n{2,}", safe) if p.strip()] clean_paras = [re.sub(r'[\n\r]+', ' ', p) for p in paras[:2000]] items = "".join(f"

{p}

" for p in clean_paras) - # Build fallback without using backslashes inside an f-string expression + fallback = f"
{safe}
" body = items if items else fallback return ( "" "Transcript" - "" - f"

Transcript (plain text): {base}

" - f"{body}" + "" + f"

Transcript (plain text): {base}

" + f"{body}
" ) else: return "No transcript found."