feat: Add Grafana dashboards and alerting (#105) #112

Merged
egullickson merged 8 commits from issue-105-add-grafana-dashboards into main 2026-02-06 17:44:05 +00:00
6 changed files with 311 additions and 2 deletions
Showing only changes of commit 4b2b318aff - Show all commits

View File

@@ -0,0 +1,210 @@
# Grafana Unified Alerting — file-provisioned alert rules for MotoVaultPro.
# Evaluated every 1m; each rule must hold for 5m ("for:") before firing.
apiVersion: 1
groups:
  - orgId: 1
    name: MotoVaultPro Alerts
    folder: MotoVaultPro
    interval: 1m
    rules:
      # Error Rate Spike - Alert when error rate exceeds 5% over 5 minutes
      - uid: mvp-error-rate-spike
        title: Error Rate Spike
        condition: D
        data:
          # A: count of error-level log lines across all mvp-* containers
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'sum(count_over_time({container=~"mvp-.*"} | json | level=`error` [5m]))'
              queryType: instant
          # B: total log line count (denominator for the rate)
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: B
              expr: 'sum(count_over_time({container=~"mvp-.*"} [5m]))'
              queryType: instant
          # C: error percentage = A / B * 100
          - refId: C
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: C
              type: math
              expression: '($A / $B) * 100'
          # D: fire when the percentage exceeds 5
          - refId: D
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: D
              type: threshold
              expression: C
              conditions:
                - evaluator:
                    type: gt
                    params:
                      - 5
        noDataState: OK
        execErrState: Error
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Error rate exceeds 5% over 5 minutes across all MotoVaultPro containers
          description: Check the Error Investigation dashboard for details.
      # Container Silence - mvp-backend
      - uid: mvp-silence-backend
        title: "Container Silence: mvp-backend"
        condition: B
        data:
          # A: log line count from the backend over the last 5m
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'count_over_time({container="mvp-backend"}[5m])'
              queryType: instant
          # B: fire when fewer than 1 line was seen (noDataState also alerts)
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
        noDataState: Alerting
        execErrState: Error
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: mvp-backend container has stopped producing logs
          description: No logs received from mvp-backend for 5 minutes. The container may be down or stuck.
      # Container Silence - mvp-postgres
      - uid: mvp-silence-postgres
        title: "Container Silence: mvp-postgres"
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'count_over_time({container="mvp-postgres"}[5m])'
              queryType: instant
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
        noDataState: Alerting
        execErrState: Error
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: mvp-postgres container has stopped producing logs
          description: No logs received from mvp-postgres for 5 minutes. The database container may be down.
      # Container Silence - mvp-redis
      - uid: mvp-silence-redis
        title: "Container Silence: mvp-redis"
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'count_over_time({container="mvp-redis"}[5m])'
              queryType: instant
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
        noDataState: Alerting
        execErrState: Error
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: mvp-redis container has stopped producing logs
          description: No logs received from mvp-redis for 5 minutes. The cache container may be down.
      # 5xx Spike - Alert when 5xx responses exceed threshold
      - uid: mvp-5xx-spike
        title: 5xx Response Spike
        condition: B
        data:
          # A: count of structured request logs with HTTP status >= 500
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'sum(count_over_time({container="mvp-backend"} | json | msg=`Request processed` | status >= 500 [5m]))'
              queryType: instant
          # B: fire when more than 10 such responses occurred in 5m
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: gt
                    params:
                      - 10
        noDataState: OK
        execErrState: Error
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High rate of 5xx responses from mvp-backend
          description: More than 10 HTTP 5xx responses in 5 minutes. Check the API Performance and Error Investigation dashboards.

View File

@@ -0,0 +1,12 @@
# Grafana contact points (file provisioning). A single placeholder webhook
# receiver; swap the URL for a real email/Slack/webhook integration later.
apiVersion: 1
contactPoints:
  - orgId: 1
    name: mvp-default
    receivers:
      - uid: mvp-webhook-placeholder
        type: webhook
        settings:
          url: "https://example.com/mvp-webhook-placeholder"
          httpMethod: POST
        # keep resolve notifications enabled so recoveries are reported
        disableResolveMessage: false

View File

@@ -0,0 +1,11 @@
# Grafana notification policy tree (file provisioning). Root policy routes
# everything to the mvp-default contact point, grouped by alert name and
# severity to avoid one notification per rule evaluation.
apiVersion: 1
policies:
  - orgId: 1
    receiver: mvp-default
    group_by:
      - alertname
      - severity
    group_wait: 30s
    group_interval: 5m
    repeat_interval: 4h

View File

@@ -2,6 +2,7 @@ apiVersion: 1
datasources:
  - name: Loki
    uid: loki
    type: loki
    access: proxy
    url: http://mvp-loki:3100

View File

@@ -326,6 +326,7 @@ services:
    volumes:
      - ./config/grafana/datasources:/etc/grafana/provisioning/datasources:ro
      - ./config/grafana/provisioning:/etc/grafana/provisioning/dashboards:ro
      - ./config/grafana/alerting:/etc/grafana/provisioning/alerting:ro
      - ./config/grafana/dashboards:/var/lib/grafana/dashboards:ro
      - mvp_grafana_data:/var/lib/grafana
    networks:

View File

@@ -52,7 +52,39 @@ All logs include a `requestId` field (UUID v4) for tracing requests:
- URL: https://logs.motovaultpro.com
- Default credentials: admin/admin (change on first login)
## Dashboards
Four provisioned dashboards are available in the MotoVaultPro folder:
| Dashboard | Purpose | Key Panels |
|-----------|---------|------------|
| Application Overview | System-wide health at a glance | Container log volume, error rate gauge, log level distribution, container health status, request count |
| API Performance | Backend latency and throughput analysis | Request rate, response time percentiles (p50/p95/p99), status code distribution, slowest endpoints |
| Error Investigation | Debugging and root cause analysis | Error log stream, errors by container/endpoint, stack trace viewer, correlation ID lookup, recent 5xx responses |
| Infrastructure | Container-level logs and platform monitoring | Per-container throughput, PostgreSQL/Redis/Traefik/OCR logs, Loki ingestion rate |
All dashboards refresh every 30 seconds and default to a 1-hour time window. Dashboard JSON files are in `config/grafana/dashboards/` and provisioned via `config/grafana/provisioning/dashboards.yml`.
## Alerting Rules
Grafana Unified Alerting is configured with file-based provisioned rules. Alert rules are evaluated every 1 minute and must fire continuously for 5 minutes before triggering.
| Alert | Severity | Condition | Description |
|-------|----------|-----------|-------------|
| Error Rate Spike | critical | Error rate > 5% over 5m | Fires when the percentage of error-level logs across all mvp-* containers exceeds 5% |
| Container Silence: mvp-backend | warning | No logs for 5m | Fires when the backend container stops producing logs |
| Container Silence: mvp-postgres | warning | No logs for 5m | Fires when the database container stops producing logs |
| Container Silence: mvp-redis | warning | No logs for 5m | Fires when the cache container stops producing logs |
| 5xx Response Spike | critical | > 10 5xx responses in 5m | Fires when the backend produces more than 10 HTTP 5xx responses |
Alert configuration files are in `config/grafana/alerting/`:
- `alert-rules.yml` - Alert rule definitions with LogQL queries
- `contact-points.yml` - Notification endpoints (webhook placeholder for future email/Slack)
- `notification-policies.yml` - Routing rules that group alerts by name and severity
## LogQL Query Reference
### Common Debugging Queries
Query by requestId:
```
@@ -66,7 +98,49 @@ Query all errors:
Query slow requests (>500ms):
```
{container="mvp-backend"} | json | msg="Request processed" | duration > 500
```
### Error Analysis
Count errors per container over time:
```
sum by (container) (count_over_time({container=~"mvp-.*"} | json | level="error" [5m]))
```
Error rate as percentage:
```
sum(count_over_time({container=~"mvp-.*"} | json | level="error" [5m]))
/ sum(count_over_time({container=~"mvp-.*"} [5m])) * 100
```
### HTTP Status Analysis
All 5xx responses:
```
{container="mvp-backend"} | json | msg="Request processed" | status >= 500
```
Request count by status code:
```
sum by (status) (count_over_time({container="mvp-backend"} | json | msg="Request processed" [5m]))
```
### Container-Specific Queries
PostgreSQL errors:
```
{container="mvp-postgres"} |~ "ERROR|FATAL|PANIC"
```
Traefik access logs:
```
{container="mvp-traefik"} | json
```
OCR processing errors:
```
{container="mvp-ocr"} |~ "ERROR|Exception|Traceback"
```
## Configuration