feat: implement Phase 3 scaling

- Replace skip-based pagination with cursor-based pagination (timestamp|_id cursors)
- Add Prometheus /metrics endpoint with request latency, fetch volume, and error counters
- Implement incremental fetch watermarking per source (watermarks collection in MongoDB)
- Add Graph change notification webhook endpoint (/api/webhooks/graph)
- Add correlation ID middleware for distributed tracing (x-request-id header)
- Update frontend to use cursor-based pagination with Prev/Next navigation
- Update tests for cursor pagination, metrics, webhooks, and watermark mocking
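Hedged sketches of the mechanisms listed above follow; module, field, and metric names that do not appear in this commit's diff are assumptions, not the project's actual code.

A minimal sketch of the timestamp|_id cursor scheme, assuming events carry an ISO timestamp field and a Mongo ObjectId:

from bson import ObjectId

def encode_cursor(doc) -> str:
    # Cursor format from the commit message: "<timestamp>|<_id>".
    return f"{doc['timestamp']}|{doc['_id']}"

def decode_cursor(cursor: str) -> tuple[str, ObjectId]:
    ts, oid = cursor.split("|", 1)
    return ts, ObjectId(oid)

def page_filter(cursor: str | None) -> dict:
    # Keyset condition: everything strictly "before" the cursor row,
    # with _id breaking ties between events sharing a timestamp.
    if cursor is None:
        return {}
    ts, oid = decode_cursor(cursor)
    return {"$or": [
        {"timestamp": {"$lt": ts}},
        {"timestamp": ts, "_id": {"$lt": oid}},
    ]}

# Hypothetical usage:
# events_collection.find(page_filter(cursor)).sort([("timestamp", -1), ("_id", -1)]).limit(page_size)

Unlike skip-based pagination, the query cost stays roughly constant no matter how deep the client pages.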
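One plausible shape for the metrics module the fetch router imports (track_fetch, track_fetch_duration, track_fetch_error) together with the /metrics endpoint, using prometheus_client; the metric names are illustrative only:

from fastapi import APIRouter, Response
from prometheus_client import CONTENT_TYPE_LATEST, Counter, Histogram, generate_latest

FETCHED_EVENTS = Counter("audit_events_fetched_total", "Events fetched per source", ["source"])
FETCH_ERRORS = Counter("audit_fetch_errors_total", "Fetch errors per source", ["source"])
FETCH_DURATION = Histogram("audit_fetch_duration_seconds", "Fetch duration per source", ["source"])

def track_fetch(source: str, count: int) -> None:
    FETCHED_EVENTS.labels(source=source).inc(count)

def track_fetch_error(source: str) -> None:
    FETCH_ERRORS.labels(source=source).inc()

def track_fetch_duration(source: str, seconds: float) -> None:
    FETCH_DURATION.labels(source=source).observe(seconds)

router = APIRouter()

@router.get("/metrics")
def metrics() -> Response:
    # Expose the default registry in Prometheus text format.
    return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)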
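The watermark helpers imported by the fetch route are not shown in the diff below; a minimal sketch, assuming a "watermarks" collection keyed by source name and a db handle exported by the database module (both assumptions):

from database import db  # assumed export; the real module is not shown in this commit

watermarks = db["watermarks"]

def get_watermark(source: str) -> str | None:
    # Returns the timestamp of the last successful fetch, or None on first run.
    doc = watermarks.find_one({"_id": source})
    return doc["value"] if doc else None

def set_watermark(source: str, value: str) -> None:
    watermarks.update_one({"_id": source}, {"$set": {"value": value}}, upsert=True)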
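The Graph change-notification endpoint, sketched under the assumption that it performs the standard subscription validation handshake (Graph sends a validationToken query parameter and expects it echoed back as text/plain) before processing notification payloads:

from fastapi import APIRouter, Request, Response

router = APIRouter()

@router.post("/api/webhooks/graph")
async def graph_webhook(request: Request, validationToken: str | None = None):
    # Subscription validation handshake: echo the token back as plain text.
    if validationToken is not None:
        return Response(content=validationToken, media_type="text/plain")
    body = await request.json()
    for notification in body.get("value", []):
        # A real handler would verify clientState and trigger an incremental
        # fetch for the affected source; omitted in this sketch.
        ...
    return Response(status_code=202)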
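A minimal sketch of the correlation ID middleware, assuming the FastAPI @app.middleware("http") style: it reuses an incoming x-request-id header or mints one, and echoes it on the response so requests can be traced across services:

import uuid
from fastapi import FastAPI, Request

app = FastAPI()

@app.middleware("http")
async def correlation_id_middleware(request: Request, call_next):
    # Reuse the caller's x-request-id if present, otherwise generate a new one.
    request_id = request.headers.get("x-request-id") or str(uuid.uuid4())
    request.state.request_id = request_id
    response = await call_next(request)
    response.headers["x-request-id"] = request_id
    return response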
2026-04-14 14:58:50 +02:00
parent 9271b4e461
commit b0198012eb
17 changed files with 402 additions and 147 deletions


@@ -1,31 +1,46 @@
+import time
 from auth import require_auth
 from database import events_collection
 from fastapi import APIRouter, Depends, HTTPException, Query
 from graph.audit_logs import fetch_audit_logs
+from metrics import track_fetch, track_fetch_duration, track_fetch_error
 from models.api import FetchAuditLogsResponse
 from models.event_model import normalize_event
 from pymongo import UpdateOne
 from sources.intune_audit import fetch_intune_audit
 from sources.unified_audit import fetch_unified_audit
+from watermark import get_watermark, set_watermark

 router = APIRouter(dependencies=[Depends(require_auth)])

 def run_fetch(hours: int = 168):
     from datetime import datetime

     window = max(1, min(hours, 720)) # cap to 30 days for sanity
+    now = datetime.utcnow().isoformat() + "Z"
     logs = []
     errors = []

-    def fetch_source(fn, label):
+    def fetch_source(fn, label, source_key):
+        start_time = time.time()
         try:
-            return fn(hours=window)
+            since = get_watermark(source_key)
+            result = fn(since=since) if since else fn(hours=window)
+            set_watermark(source_key, now)
+            track_fetch(source_key, len(result))
+            return result
         except Exception as exc:
             errors.append(f"{label}: {exc}")
+            track_fetch_error(source_key)
             return []
+        finally:
+            track_fetch_duration(source_key, time.time() - start_time)

-    logs.extend(fetch_source(fetch_audit_logs, "Directory audit"))
-    logs.extend(fetch_source(fetch_unified_audit, "Unified audit (Exchange/SharePoint/Teams)"))
-    logs.extend(fetch_source(fetch_intune_audit, "Intune audit"))
+    logs.extend(fetch_source(fetch_audit_logs, "Directory audit", "directory"))
+    logs.extend(fetch_source(fetch_unified_audit, "Unified audit", "unified"))
+    logs.extend(fetch_source(fetch_intune_audit, "Intune audit", "intune"))

     normalized = [normalize_event(e) for e in logs]
     if normalized: