feat: intent-aware querying + smart sampling for large audit datasets
- Add keyword-based intent extraction: 'device' → Intune, 'user' → Directory, etc. - Broad questions without intent auto-exclude noisy services (Exchange, SharePoint) - Smart stratified sampling: failures always included, high-value services prioritised - Fetch up to 1000 events from MongoDB, then curate best 200 for the LLM - Excluded services noted in LLM prompt and query_info so the admin knows the scope
This commit is contained in:
@@ -13,6 +13,129 @@ from models.api import AskRequest, AskResponse
|
||||
router = APIRouter(dependencies=[Depends(require_auth)])
|
||||
logger = structlog.get_logger("aoc.ask")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Intent extraction — map question keywords to relevant audit services
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_SERVICE_INTENTS = {
|
||||
"intune": ["Intune"],
|
||||
"device": ["Intune", "Device"],
|
||||
"laptop": ["Intune", "Device"],
|
||||
"mobile": ["Intune", "Device"],
|
||||
"phone": ["Intune", "Device"],
|
||||
"ipad": ["Intune", "Device"],
|
||||
"app": ["Intune", "ApplicationManagement"],
|
||||
"application": ["Intune", "ApplicationManagement"],
|
||||
"policy": ["Intune", "Policy"],
|
||||
"compliance": ["Intune", "Policy"],
|
||||
"user": ["Directory", "UserManagement"],
|
||||
"group": ["Directory", "GroupManagement"],
|
||||
"role": ["Directory", "RoleManagement"],
|
||||
"permission": ["Directory", "RoleManagement"],
|
||||
"license": ["Directory", "License"],
|
||||
"email": ["Exchange"],
|
||||
"mailbox": ["Exchange"],
|
||||
"mail": ["Exchange"],
|
||||
"message": ["Exchange", "Teams"],
|
||||
"file": ["SharePoint"],
|
||||
"sharepoint": ["SharePoint"],
|
||||
"site": ["SharePoint"],
|
||||
"document": ["SharePoint"],
|
||||
"team": ["Teams"],
|
||||
"channel": ["Teams"],
|
||||
"meeting": ["Teams"],
|
||||
"call": ["Teams"],
|
||||
}
|
||||
|
||||
# Services that are extremely noisy for typical admin questions.
|
||||
# We exclude them by default on broad questions unless the user explicitly mentions them.
|
||||
_NOISY_SERVICES = {"Exchange", "SharePoint"}
|
||||
|
||||
# Services that are generally admin-relevant and kept by default.
|
||||
_DEFAULT_ADMIN_SERVICES = {
|
||||
"Directory",
|
||||
"UserManagement",
|
||||
"GroupManagement",
|
||||
"RoleManagement",
|
||||
"ApplicationManagement",
|
||||
"Intune",
|
||||
"Device",
|
||||
"Policy",
|
||||
"Teams",
|
||||
"License",
|
||||
}
|
||||
|
||||
|
||||
def _extract_intent_services(question: str) -> tuple[list[str] | None, bool]:
|
||||
"""
|
||||
Extract relevant services from the question.
|
||||
|
||||
Returns:
|
||||
(services, is_explicit):
|
||||
- services: list of service names to query, or None for default admin set
|
||||
- is_explicit: True if the user explicitly mentioned a noisy service
|
||||
"""
|
||||
q_lower = question.lower()
|
||||
tokens = set(re.findall(r"\b[a-z]+\b", q_lower))
|
||||
|
||||
matched_services = set()
|
||||
for token, services in _SERVICE_INTENTS.items():
|
||||
if token in tokens:
|
||||
matched_services.update(services)
|
||||
|
||||
if matched_services:
|
||||
# User asked something specific — return exactly what they asked for
|
||||
is_explicit = not matched_services.isdisjoint(_NOISY_SERVICES)
|
||||
return sorted(matched_services), is_explicit
|
||||
|
||||
# Broad question with no clear intent — default to admin-relevant services only
|
||||
return None, False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Smart sampling — stratified by importance so the LLM sees signal, not noise
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _smart_sample(events: list[dict], max_events: int = 200) -> list[dict]:
|
||||
"""
|
||||
Return a curated subset that preserves diversity and prioritises signal.
|
||||
|
||||
Tiers:
|
||||
1. Failures (always valuable)
|
||||
2. High-admin-value services (Intune, Device, Directory, etc.)
|
||||
3. Everything else
|
||||
"""
|
||||
if len(events) <= max_events:
|
||||
return events
|
||||
|
||||
high_value = {
|
||||
"Directory",
|
||||
"UserManagement",
|
||||
"GroupManagement",
|
||||
"RoleManagement",
|
||||
"Intune",
|
||||
"Device",
|
||||
"Policy",
|
||||
"ApplicationManagement",
|
||||
}
|
||||
|
||||
failures = [e for e in events if str(e.get("result") or "").lower() in ("failure", "failed")]
|
||||
high_val = [e for e in events if e.get("service") in high_value and e not in failures]
|
||||
rest = [e for e in events if e not in failures and e not in high_val]
|
||||
|
||||
# Allocate slots: half to failures+high-value, half to rest (but never let rest dominate)
|
||||
slots = max_events
|
||||
failure_cap = min(len(failures), max(10, slots // 4))
|
||||
high_cap = min(len(high_val), max(20, slots // 4))
|
||||
rest_cap = slots - failure_cap - high_cap
|
||||
|
||||
sampled = failures[:failure_cap] + high_val[:high_cap] + rest[:rest_cap]
|
||||
# Sort back to chronological order
|
||||
sampled.sort(key=lambda e: e.get("timestamp") or "", reverse=True)
|
||||
return sampled
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Time-range extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -203,12 +326,16 @@ def _aggregate_counts(events: list[dict]) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def _format_events_for_llm(events: list[dict], total: int | None = None) -> str:
|
||||
def _format_events_for_llm(
|
||||
events: list[dict], total: int | None = None, excluded_services: list[str] | None = None
|
||||
) -> str:
|
||||
lines = []
|
||||
|
||||
# If we have a large result set, send aggregation + samples instead of raw dump
|
||||
if total is not None and total > len(events) and len(events) >= 50:
|
||||
lines.append(f"Result set overview: {total} total events (showing the {len(events)} most recent).\n")
|
||||
lines.append(f"Result set overview: {total} total events (showing a curated sample of {len(events)}).\n")
|
||||
if excluded_services:
|
||||
lines.append(f"Note: high-volume services excluded by default: {', '.join(excluded_services)}.\n")
|
||||
agg = _aggregate_counts(events)
|
||||
lines.append("Breakdown by service:")
|
||||
for svc, cnt in agg["services"]:
|
||||
@@ -267,11 +394,16 @@ def _build_chat_url(base_url: str, api_version: str) -> str:
|
||||
return url
|
||||
|
||||
|
||||
async def _call_llm(question: str, events: list[dict], total: int | None = None) -> str:
|
||||
async def _call_llm(
|
||||
question: str,
|
||||
events: list[dict],
|
||||
total: int | None = None,
|
||||
excluded_services: list[str] | None = None,
|
||||
) -> str:
|
||||
if not LLM_API_KEY:
|
||||
raise RuntimeError("LLM_API_KEY not configured")
|
||||
|
||||
context = _format_events_for_llm(events, total=total)
|
||||
context = _format_events_for_llm(events, total=total, excluded_services=excluded_services)
|
||||
messages = [
|
||||
{"role": "system", "content": _SYSTEM_PROMPT},
|
||||
{
|
||||
@@ -332,6 +464,7 @@ async def ask_question(body: AskRequest, user: dict = Depends(require_auth)):
|
||||
|
||||
start, end = _extract_time_range(question)
|
||||
entity = _extract_entity(question)
|
||||
intent_services, explicit_noisy = _extract_intent_services(question)
|
||||
|
||||
# Default to last 7 days if no time range detected
|
||||
if not start:
|
||||
@@ -339,11 +472,29 @@ async def ask_question(body: AskRequest, user: dict = Depends(require_auth)):
|
||||
start = (now - timedelta(days=7)).isoformat().replace("+00:00", "Z")
|
||||
end = now.isoformat().replace("+00:00", "Z")
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Decide which services to query
|
||||
# -----------------------------------------------------------------------
|
||||
excluded_services: list[str] = []
|
||||
if body.services:
|
||||
# User explicitly filtered via UI — respect that exactly
|
||||
query_services = body.services
|
||||
elif intent_services is not None:
|
||||
# NL question implies specific services
|
||||
query_services = intent_services
|
||||
else:
|
||||
# Broad question with no intent — exclude noisy services by default
|
||||
query_services = sorted(_DEFAULT_ADMIN_SERVICES)
|
||||
excluded_services = sorted(_NOISY_SERVICES)
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Build and run query
|
||||
# -----------------------------------------------------------------------
|
||||
query = _build_event_query(
|
||||
entity,
|
||||
start,
|
||||
end,
|
||||
services=body.services,
|
||||
services=query_services,
|
||||
actor=body.actor,
|
||||
operation=body.operation,
|
||||
result=body.result,
|
||||
@@ -353,21 +504,33 @@ async def ask_question(body: AskRequest, user: dict = Depends(require_auth)):
|
||||
|
||||
try:
|
||||
total = events_collection.count_documents(query)
|
||||
cursor = events_collection.find(query).sort([("timestamp", -1)]).limit(LLM_MAX_EVENTS)
|
||||
events = list(cursor)
|
||||
# Fetch a generous window so we can apply smart sampling in Python
|
||||
cursor = events_collection.find(query).sort([("timestamp", -1)]).limit(1000)
|
||||
raw_events = list(cursor)
|
||||
except Exception as exc:
|
||||
logger.error("Failed to query events for ask", error=str(exc))
|
||||
raise HTTPException(status_code=500, detail=f"Database query failed: {exc}") from exc
|
||||
|
||||
for e in events:
|
||||
for e in raw_events:
|
||||
e["_id"] = str(e.get("_id", ""))
|
||||
|
||||
# Apply smart sampling (preserves failures, prioritises admin-relevant services)
|
||||
events = _smart_sample(raw_events, max_events=LLM_MAX_EVENTS)
|
||||
|
||||
# If no events, return early
|
||||
if not events:
|
||||
return AskResponse(
|
||||
answer="I couldn't find any audit events matching your question. Try broadening the time range or checking the spelling of the device/user name.",
|
||||
events=[],
|
||||
query_info={"entity": entity, "start": start, "end": end, "event_count": 0},
|
||||
query_info={
|
||||
"entity": entity,
|
||||
"start": start,
|
||||
"end": end,
|
||||
"event_count": 0,
|
||||
"total_matched": total,
|
||||
"services_queried": query_services,
|
||||
"excluded_services": excluded_services,
|
||||
},
|
||||
llm_used=False,
|
||||
llm_error="LLM not used — no events found." if not LLM_API_KEY else None,
|
||||
)
|
||||
@@ -380,7 +543,7 @@ async def ask_question(body: AskRequest, user: dict = Depends(require_auth)):
|
||||
llm_error = "LLM_API_KEY is not configured. Set it in your .env to enable AI narrative summarisation."
|
||||
else:
|
||||
try:
|
||||
answer = await _call_llm(question, events, total=total)
|
||||
answer = await _call_llm(question, events, total=total, excluded_services=excluded_services)
|
||||
llm_used = True
|
||||
except Exception as exc:
|
||||
llm_error = f"LLM call failed: {exc}"
|
||||
@@ -388,9 +551,11 @@ async def ask_question(body: AskRequest, user: dict = Depends(require_auth)):
|
||||
|
||||
# Fallback: structured summary if LLM unavailable or failed
|
||||
if not answer:
|
||||
parts = [f"Found {len(events)} event(s)"]
|
||||
parts = [f"Found {total} event(s)"]
|
||||
if entity:
|
||||
parts.append(f"related to **{entity}**")
|
||||
if excluded_services:
|
||||
parts.append(f"(excluding {', '.join(excluded_services)})")
|
||||
parts.append(f"between {start[:10]} and {end[:10]}.\n")
|
||||
|
||||
for i, e in enumerate(events[:10], 1):
|
||||
@@ -415,6 +580,8 @@ async def ask_question(body: AskRequest, user: dict = Depends(require_auth)):
|
||||
"end": end,
|
||||
"event_count": len(events),
|
||||
"total_matched": total,
|
||||
"services_queried": query_services,
|
||||
"excluded_services": excluded_services,
|
||||
"mongo_query": json.dumps(query, default=str),
|
||||
},
|
||||
llm_used=llm_used,
|
||||
|
||||
@@ -236,7 +236,7 @@ class TestAskEndpoint:
|
||||
}
|
||||
)
|
||||
|
||||
async def fake_llm(question, events, total=None):
|
||||
async def fake_llm(question, events, total=None, excluded_services=None):
|
||||
return "The device had a failed wipe attempt."
|
||||
|
||||
monkeypatch.setattr("routes.ask.LLM_API_KEY", "fake-key")
|
||||
|
||||
Reference in New Issue
Block a user