From a255be93fe4b650acf7096b05e1f4f5d47e4e5ea Mon Sep 17 00:00:00 2001
From: Tomas Kracmar <tomas.kracmar@cqre.net>
Date: Mon, 20 Apr 2026 16:23:55 +0200
Subject: [PATCH] feat: aggregate large event sets before sending to LLM

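When a query matches more events than were retrieved, and at least 50
were retrieved, the LLM now receives:
- Aggregated counts by service, operation, result, and actor (computed
  over the retrieved events, not the full result set)
- A list of failures (up to 10)
- The 50 most recent raw events as samples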

This scales to thousands of events without blowing the token budget
or losing signal. The LLM gets a bird's-eye view plus concrete examples.

Also updates the system prompt to handle both individual event lists
and aggregated overviews correctly.
---
 VERSION               |  2 +-
 backend/routes/ask.py | 65 ++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/VERSION b/VERSION
index cb174d5..d2d61a7 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.1
\ No newline at end of file
+1.2.2
\ No newline at end of file
diff --git a/backend/routes/ask.py b/backend/routes/ask.py
index d465a8b..0a7f4dd 100644
--- a/backend/routes/ask.py
+++ b/backend/routes/ask.py
@@ -168,25 +168,76 @@ def _build_event_query(
 # ---------------------------------------------------------------------------
 
 _SYSTEM_PROMPT = """You are an IT operations assistant. An administrator has asked a question about audit logs.
-Your job is to read the list of audit events below and write a concise, plain-language answer.
+Your job is to read the data below and write a concise, plain-language answer.
+
+The input may be either:
+- A small list of individual audit events (numbered Event #1, #2, etc.), or
+- An aggregated overview with counts by service, action, result, and actor, plus sample events.
 
 Rules:
 - Assume the reader is a non-expert admin.
-- Group related events together and tell a coherent story.
+- For aggregated overviews: summarise the scale, top patterns, and highlight anomalies or failures.
+- For small event lists: group related events together and tell a coherent story.
 - Highlight anything unusual, failed actions, or privilege escalations.
 - Reference specific event numbers (e.g., "Event #3") when making claims so the user can verify.
-- If the event list is a subset of a larger result set, acknowledge the scale (e.g., "At least 200 events occurred...").
+- If the data is an aggregated subset of a larger result set, acknowledge the scale (e.g., "847 events occurred — the top pattern was...").
 - If there are no events, say so clearly.
 - Keep the answer under 300 words.
-- Do not invent events that are not in the list.
+- Do not invent events or patterns that are not supported by the data.
 """
 
 
+def _aggregate_counts(events: list[dict]) -> dict:
+    """Build lightweight aggregation tables for large result sets."""
+    from collections import Counter
+
+    svc_counts = Counter(e.get("service") or "Unknown" for e in events)
+    op_counts = Counter(e.get("operation") or "Unknown" for e in events)
+    result_counts = Counter(e.get("result") or "Unknown" for e in events)
+    actor_counts = Counter(e.get("actor_display") or "Unknown" for e in events)
+    return {
+        "services": svc_counts.most_common(10),
+        "operations": op_counts.most_common(10),
+        "results": result_counts.most_common(5),
+        "actors": actor_counts.most_common(10),
+    }
+
+
 def _format_events_for_llm(events: list[dict], total: int | None = None) -> str:
     lines = []
-    if total is not None and total > len(events):
-        lines.append(f"Showing {len(events)} of {total} total matching events (most recent first):\n")
-    for i, e in enumerate(events, 1):
+
+    # If we have a large result set, send aggregation + samples instead of raw dump
+    if total is not None and total > len(events) and len(events) >= 50:
+        lines.append(f"Result set overview: {total} total events (showing the {len(events)} most recent).\n")
+        agg = _aggregate_counts(events)
+        lines.append("Breakdown by service:")
+        for svc, cnt in agg["services"]:
+            lines.append(f"  {svc}: {cnt}")
+        lines.append("\nBreakdown by action:")
+        for op, cnt in agg["operations"]:
+            lines.append(f"  {op}: {cnt}")
+        lines.append("\nBreakdown by result:")
+        for res, cnt in agg["results"]:
+            lines.append(f"  {res}: {cnt}")
+        lines.append("\nTop actors:")
+        for actor, cnt in agg["actors"]:
+            lines.append(f"  {actor}: {cnt}")
+        # Include failures and a few recent samples
+        failures = [e for e in events if str(e.get("result") or "").lower() in ("failure", "failed")]
+        if failures:
+            lines.append(f"\nFailures ({len(failures)}):")
+            for e in failures[:10]:
+                ts = e.get("timestamp", "?")[:16].replace("T", " ")
+                op = e.get("operation", "unknown")
+                actor = e.get("actor_display", "unknown")
+                lines.append(f"  {ts} — {op} by {actor}")
+        lines.append("\nMost recent sample events:")
+    else:
+        if total is not None and total > len(events):
+            lines.append(f"Showing {len(events)} of {total} total matching events (most recent first):\n")
+
+    # Always include the first N raw events as detail (up to 50)
+    for i, e in enumerate(events[:50], 1):
         ts = e.get("timestamp") or "unknown time"
         op = e.get("operation") or "unknown action"
         actor = e.get("actor_display") or "unknown actor"