feat: add Grafana alerting rules and documentation (refs #111)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 36s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 51s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m36s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 36s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 51s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m36s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Configure Grafana Unified Alerting with file-based provisioned alert rules, contact points, and notification policies. Add stable UID to Loki datasource for alert rule references. Update LOGGING.md with dashboard descriptions, alerting rules table, and LogQL query reference. Alert rules: Error Rate Spike (critical), Container Silence for backend/postgres/redis (warning), 5xx Response Spike (critical). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
210
config/grafana/alerting/alert-rules.yml
Normal file
210
config/grafana/alerting/alert-rules.yml
Normal file
@@ -0,0 +1,210 @@
---
# Grafana Unified Alerting: file-provisioned alert rules for MotoVaultPro.
# Loaded from /etc/grafana/provisioning/alerting/ at startup.
apiVersion: 1

groups:
  - orgId: 1
    name: MotoVaultPro Alerts
    folder: MotoVaultPro
    # Evaluation interval shared by every rule in this group.
    interval: 1m
    rules:
      # Error Rate Spike - alert when error rate exceeds 5% over 5 minutes.
      - uid: mvp-error-rate-spike
        title: Error Rate Spike
        condition: D
        data:
          # A: count of error-level log lines across all mvp-* containers (last 5m).
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'sum(count_over_time({container=~"mvp-.*"} | json | level=`error` [5m]))'
              queryType: instant
          # B: total count of log lines across all mvp-* containers (last 5m).
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: B
              expr: 'sum(count_over_time({container=~"mvp-.*"} [5m]))'
              queryType: instant
          # C: error percentage = (A / B) * 100, computed server-side.
          - refId: C
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: C
              type: math
              expression: '($A / $B) * 100'
          # D: fire when the error percentage exceeds 5.
          - refId: D
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: D
              type: threshold
              expression: C
              conditions:
                - evaluator:
                    type: gt
                    params:
                      - 5
        noDataState: OK
        execErrState: Error
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Error rate exceeds 5% over 5 minutes across all MotoVaultPro containers
          description: Check the Error Investigation dashboard for details.

      # Container Silence - mvp-backend
      - uid: mvp-silence-backend
        title: "Container Silence: mvp-backend"
        condition: B
        data:
          # A: count of log lines from mvp-backend over the last 5m.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'count_over_time({container="mvp-backend"}[5m])'
              queryType: instant
          # B: fire when fewer than 1 line was logged.
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
        # "No data" also means the container is silent, so treat it as alerting.
        noDataState: Alerting
        execErrState: Error
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: mvp-backend container has stopped producing logs
          description: No logs received from mvp-backend for 5 minutes. The container may be down or stuck.

      # Container Silence - mvp-postgres
      - uid: mvp-silence-postgres
        title: "Container Silence: mvp-postgres"
        condition: B
        data:
          # A: count of log lines from mvp-postgres over the last 5m.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'count_over_time({container="mvp-postgres"}[5m])'
              queryType: instant
          # B: fire when fewer than 1 line was logged.
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
        # "No data" also means the container is silent, so treat it as alerting.
        noDataState: Alerting
        execErrState: Error
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: mvp-postgres container has stopped producing logs
          description: No logs received from mvp-postgres for 5 minutes. The database container may be down.

      # Container Silence - mvp-redis
      - uid: mvp-silence-redis
        title: "Container Silence: mvp-redis"
        condition: B
        data:
          # A: count of log lines from mvp-redis over the last 5m.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'count_over_time({container="mvp-redis"}[5m])'
              queryType: instant
          # B: fire when fewer than 1 line was logged.
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
        # "No data" also means the container is silent, so treat it as alerting.
        noDataState: Alerting
        execErrState: Error
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: mvp-redis container has stopped producing logs
          description: No logs received from mvp-redis for 5 minutes. The cache container may be down.

      # 5xx Spike - alert when 5xx responses exceed threshold.
      - uid: mvp-5xx-spike
        title: 5xx Response Spike
        condition: B
        data:
          # A: count of request-log lines from mvp-backend with status >= 500 (last 5m).
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'sum(count_over_time({container="mvp-backend"} | json | msg=`Request processed` | status >= 500 [5m]))'
              queryType: instant
          # B: fire when more than 10 such responses occurred.
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: gt
                    params:
                      - 10
        noDataState: OK
        execErrState: Error
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High rate of 5xx responses from mvp-backend
          description: More than 10 HTTP 5xx responses in 5 minutes. Check the API Performance and Error Investigation dashboards.
12
config/grafana/alerting/contact-points.yml
Normal file
12
config/grafana/alerting/contact-points.yml
Normal file
@@ -0,0 +1,12 @@
---
# Grafana Unified Alerting: file-provisioned contact points.
apiVersion: 1

contactPoints:
  - orgId: 1
    name: mvp-default
    receivers:
      # Placeholder webhook receiver; replace the URL with a real endpoint
      # (Slack, ntfy, PagerDuty, ...) before relying on notifications.
      - uid: mvp-webhook-placeholder
        type: webhook
        settings:
          url: "https://example.com/mvp-webhook-placeholder"
          httpMethod: POST
        disableResolveMessage: false
11
config/grafana/alerting/notification-policies.yml
Normal file
11
config/grafana/alerting/notification-policies.yml
Normal file
@@ -0,0 +1,11 @@
---
# Grafana Unified Alerting: file-provisioned notification policy tree.
apiVersion: 1

policies:
  - orgId: 1
    # Root route: every alert is delivered to the mvp-default contact point.
    receiver: mvp-default
    group_by:
      - alertname
      - severity
    # Wait 30s before the first notification of a new group, batch follow-ups
    # every 5m, and re-send still-firing alerts every 4h.
    group_wait: 30s
    group_interval: 5m
    repeat_interval: 4h
@@ -2,6 +2,7 @@ apiVersion: 1
datasources:
  - name: Loki
    # Stable UID so provisioned alert rules can reference this datasource
    # by `datasourceUid: loki` instead of an auto-generated UID.
    uid: loki
    type: loki
    access: proxy
    url: http://mvp-loki:3100
Reference in New Issue
Block a user