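Fixing fixes: harden the _safe_under path check, add a prefix-match fallback to transcript lookup, and strip timestamps from plain-text transcripts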

2025-09-08 18:41:03 +02:00
parent e2bf3e945d
commit 0603817743
1 changed file with 70 additions and 21 deletions
@@ -187,10 +187,26 @@ def recent():
    return "\n".join(html)

 def _safe_under(base: Path, rel_path: str) -> Path:
-    candidate = (base / rel_path.lstrip('/')).resolve()
-    if not str(candidate).startswith(str(base)):
-        raise FileNotFoundError("Path escapes base")
-    return candidate
+    """
+    Resolve rel_path safely under base. If an absolute path is provided and it is
+    already under base, allow it. Otherwise join to base. Reject any path that
+    escapes base.
+    """
+    try:
+        p = Path(rel_path)
+        if p.is_absolute():
+            candidate = p.resolve()
+        else:
+            candidate = (base / rel_path).resolve()
+    except Exception:
+        raise FileNotFoundError("Invalid path")
+
+    base_str = str(base.resolve())
+    cand_str = str(candidate)
+    # allow exact base or any child path
+    if cand_str == base_str or cand_str.startswith(base_str + os.sep):
+        return candidate
+    raise FileNotFoundError("Path escapes base")

 def _vtt_header():
    return "WEBVTT\n\n"
@@ -276,28 +292,46 @@ def _parse_vtt_to_cues(vtt_text: str):
 def _load_transcript_variants(basename: str):
    """
    Return tuple (kind, content_text, path_used) where kind in {'vtt','srt','json','txt',None}
+    - Tries exact filename matches first.
+    - If not found, falls back to the first file whose name starts with the basename (prefix match).
    """
-    # Look under TRANSCRIPT_ROOT securely
    root = TRANSCRIPT_ROOT
-    cand = [
+
+    def try_read(path: Path, k: str):
+        try:
+            rp = path.resolve()
+            if not str(rp).startswith(str(root)):
+                return None
+            if rp.exists():
+                with open(rp, "r", encoding="utf-8", errors="ignore") as f:
+                    return (k, f.read(), str(rp))
+        except Exception:
+            return None
+        return None
+
+    # 1) exact matches
+    exact = [
        (root / f"{basename}.vtt", "vtt"),
        (root / f"{basename}.srt", "srt"),
        (root / f"{basename}.json", "json"),
        (root / f"{basename}.txt", "txt"),
    ]
-    for p, k in cand:
+    for p, k in exact:
+        got = try_read(p, k)
+        if got:
+            return got
+
+    # 2) prefix/fuzzy matches (e.g., "<base>*.vtt", "<base>*.txt", etc.)
+    exts = [("vtt","vtt"), ("srt","srt"), ("json","json"), ("txt","txt")]
+    for ext, k in exts:
        try:
-            p = p.resolve()
+            for gp in root.glob(f"{basename}*.{ext}"):
+                got = try_read(gp, k)
+                if got:
+                    return got
        except Exception:
            continue
-        if not str(p).startswith(str(root)):
-            continue
-        if p.exists():
-            try:
-                with open(p, "r", encoding="utf-8", errors="ignore") as f:
-                    return (k, f.read(), str(p))
-            except Exception:
-                continue
+
    return (None, "", "")

@app.get("/search")
@@ -452,20 +486,35 @@ def subtitle():
        )
        return html
    elif kind == "txt":
+        # Normalize and lightly beautify plain text transcripts
        safe = content.strip()
-        # Simple paragraphization: collapse >2 newlines, wrap in <p>
+
+        # Remove common timestamp patterns like [00:12:34], (00:12), 00:12:34 -
+        safe = re.sub(r"\[(\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\]\s*", "", safe)
+        safe = re.sub(r"\((\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\)\s*", "", safe)
+        safe = re.sub(r"(?m)^\s*(\d{1,2}:){1,2}\d{2}(?:\.\d{1,3})?\s*[-–—]?\s*", "", safe)
+
+        # Collapse multiple blank lines
+        safe = re.sub(r"\n{3,}", "\n\n", safe)
+
+        # Paragraphization: split on blank lines, collapse inner newlines to spaces
        paras = [p.strip() for p in re.split(r"\n{2,}", safe) if p.strip()]
        clean_paras = [re.sub(r'[\n\r]+', ' ', p) for p in paras[:2000]]
        items = "".join(f"<p>{p}</p>" for p in clean_paras)
-        # Build fallback without using backslashes inside an f-string expression
+
        fallback = f"<pre style='white-space:pre-wrap'>{safe}</pre>"
        body = items if items else fallback
        return (
            "<!doctype html><meta charset='utf-8'>"
            "<title>Transcript</title>"
-            "<style>body{font-family:system-ui;margin:1rem;line-height:1.6;max-width:900px} p{margin:.4rem 0}</style>"
-            f"<h3>Transcript (plain text): {base}</h3>"
-            f"{body}"
+            "<style>"
+            "body{font-family:system-ui;margin:1rem;line-height:1.7;color:#111}"
+            ".wrap{max-width:900px;margin:0 auto}"
+            "p{margin:.5rem 0}"
+            ".wrap p{text-wrap:pretty}"
+            "</style>"
+            f"<div class='wrap'><h3>Transcript (plain text): {base}</h3>"
+            f"{body}</div>"
        )
    else:
        return "<small>No transcript found.</small>"