diff --git a/config/grafana/alerting/alert-rules.yml b/config/grafana/alerting/alert-rules.yml
new file mode 100644
index 0000000..d237007
--- /dev/null
+++ b/config/grafana/alerting/alert-rules.yml
@@ -0,0 +1,210 @@
+apiVersion: 1
+
+groups:
+  - orgId: 1
+    name: MotoVaultPro Alerts
+    folder: MotoVaultPro
+    interval: 1m  # every rule in this group is evaluated once per minute
+    rules:
+      # Error Rate Spike - Alert when error rate exceeds 5% over 5 minutes
+      - uid: mvp-error-rate-spike
+        title: Error Rate Spike
+        condition: D  # refId of the threshold step that decides the alert
+        data:
+          - refId: A  # numerator: error-level JSON log lines from all mvp-* containers (5m window)
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: loki  # uid declared in config/grafana/datasources/loki.yml
+            model:
+              refId: A
+              expr: 'sum(count_over_time({container=~"mvp-.*"} | json | level=`error` [5m]))'
+              queryType: instant
+          - refId: B  # denominator: every log line from all mvp-* containers (5m window)
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: loki
+            model:
+              refId: B
+              expr: 'sum(count_over_time({container=~"mvp-.*"} [5m]))'
+              queryType: instant
+          - refId: C  # A expressed as a percentage of B
+            relativeTimeRange:
+              from: 0
+              to: 0
+            datasourceUid: __expr__
+            model:
+              refId: C
+              type: math
+              expression: '($A / $B) * 100'
+          - refId: D  # breaches when C is greater than 5 (percent)
+            relativeTimeRange:
+              from: 0
+              to: 0
+            datasourceUid: __expr__
+            model:
+              refId: D
+              type: threshold
+              expression: C
+              conditions:
+                - evaluator:
+                    type: gt
+                    params:
+                      - 5
+        noDataState: OK  # NOTE(review): presumably covers windows with no error lines at all - confirm
+        execErrState: Error
+        for: 5m  # the breach must persist for 5 minutes before the alert is raised
+        labels:
+          severity: critical
+        annotations:
+          summary: Error rate exceeds 5% over 5 minutes across all MotoVaultPro containers
+          description: Check the Error Investigation dashboard for details.
+
+      # Container Silence - mvp-backend: alert when no mvp-backend log line reached Loki in the last 5 minutes
+      - uid: mvp-silence-backend
+        title: "Container Silence: mvp-backend"
+        condition: B  # refId of the threshold step that decides the alert
+        data:
+          - refId: A  # log lines received from mvp-backend in the 5m window
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: loki
+            model:
+              refId: A
+              # sum by (container): one series for the container, however many log streams it writes to
+              expr: 'sum by (container) (count_over_time({container="mvp-backend"}[5m]))'
+              queryType: instant
+          - refId: B  # backstop: breaches if the count is ever reported as less than 1
+            relativeTimeRange:
+              from: 0
+              to: 0
+            datasourceUid: __expr__
+            model:
+              refId: B
+              type: threshold
+              expression: A
+              conditions:
+                - evaluator:
+                    type: lt
+                    params:
+                      - 1
+        noDataState: Alerting  # main trigger: a silent container yields an empty result, not 0
+        execErrState: Error
+        for: 5m  # silence must persist for 5 minutes before the alert is raised
+        labels:
+          severity: warning
+        annotations:
+          summary: mvp-backend container has stopped producing logs
+          description: No logs received from mvp-backend for 5 minutes. The container may be down or stuck.
+
+      # Container Silence - mvp-postgres: alert when no mvp-postgres log line reached Loki in the last 5 minutes
+      - uid: mvp-silence-postgres
+        title: "Container Silence: mvp-postgres"
+        condition: B  # refId of the threshold step that decides the alert
+        data:
+          - refId: A  # log lines received from mvp-postgres in the 5m window
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: loki
+            model:
+              refId: A
+              # sum by (container): one series for the container, however many log streams it writes to
+              expr: 'sum by (container) (count_over_time({container="mvp-postgres"}[5m]))'
+              queryType: instant
+          - refId: B  # backstop: breaches if the count is ever reported as less than 1
+            relativeTimeRange:
+              from: 0
+              to: 0
+            datasourceUid: __expr__
+            model:
+              refId: B
+              type: threshold
+              expression: A
+              conditions:
+                - evaluator:
+                    type: lt
+                    params:
+                      - 1
+        noDataState: Alerting  # main trigger: a silent container yields an empty result, not 0
+        execErrState: Error
+        for: 5m  # silence must persist for 5 minutes before the alert is raised
+        labels:
+          severity: warning
+        annotations:
+          summary: mvp-postgres container has stopped producing logs
+          description: No logs received from mvp-postgres for 5 minutes. The database container may be down.
+
+      # Container Silence - mvp-redis: alert when no mvp-redis log line reached Loki in the last 5 minutes
+      - uid: mvp-silence-redis
+        title: "Container Silence: mvp-redis"
+        condition: B  # refId of the threshold step that decides the alert
+        data:
+          - refId: A  # log lines received from mvp-redis in the 5m window
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: loki
+            model:
+              refId: A
+              # sum by (container): one series for the container, however many log streams it writes to
+              expr: 'sum by (container) (count_over_time({container="mvp-redis"}[5m]))'
+              queryType: instant
+          - refId: B  # backstop: breaches if the count is ever reported as less than 1
+            relativeTimeRange:
+              from: 0
+              to: 0
+            datasourceUid: __expr__
+            model:
+              refId: B
+              type: threshold
+              expression: A
+              conditions:
+                - evaluator:
+                    type: lt
+                    params:
+                      - 1
+        noDataState: Alerting  # main trigger: a silent container yields an empty result, not 0
+        execErrState: Error
+        for: 5m  # silence must persist for 5 minutes before the alert is raised
+        labels:
+          severity: warning
+        annotations:
+          summary: mvp-redis container has stopped producing logs
+          description: No logs received from mvp-redis for 5 minutes. The cache container may be down.
+
+      # 5xx Spike - Alert when 5xx responses exceed threshold
+      - uid: mvp-5xx-spike
+        title: 5xx Response Spike
+        condition: B  # refId of the threshold step that decides the alert
+        data:
+          - refId: A  # mvp-backend "Request processed" JSON log lines with status >= 500 (5m window)
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: loki
+            model:
+              refId: A
+              expr: 'sum(count_over_time({container="mvp-backend"} | json | msg=`Request processed` | status >= 500 [5m]))'
+              queryType: instant
+          - refId: B  # breaches when A is greater than 10 responses
+            relativeTimeRange:
+              from: 0
+              to: 0
+            datasourceUid: __expr__
+            model:
+              refId: B
+              type: threshold
+              expression: A
+              conditions:
+                - evaluator:
+                    type: gt
+                    params:
+                      - 10
+        noDataState: OK  # NOTE(review): presumably covers windows with no 5xx lines at all - confirm
+        execErrState: Error
+        for: 5m  # the breach must persist for 5 minutes before the alert is raised
+        labels:
+          severity: critical
+        annotations:
+          summary: High rate of 5xx responses from mvp-backend
+          description: More than 10 HTTP 5xx responses in 5 minutes. Check the API Performance and Error Investigation dashboards.
diff --git a/config/grafana/alerting/contact-points.yml b/config/grafana/alerting/contact-points.yml
new file mode 100644
index 0000000..a151276
--- /dev/null
+++ b/config/grafana/alerting/contact-points.yml
@@ -0,0 +1,12 @@
+apiVersion: 1
+
+contactPoints:
+  - orgId: 1
+    name: mvp-default  # named as `receiver` in notification-policies.yml
+    receivers:
+      - uid: mvp-webhook-placeholder
+        type: webhook
+        settings:
+          url: "https://example.com/mvp-webhook-placeholder"  # placeholder endpoint; replace with the real webhook
+          httpMethod: POST
+        disableResolveMessage: false
diff --git a/config/grafana/alerting/notification-policies.yml b/config/grafana/alerting/notification-policies.yml
new file mode 100644
index 0000000..8e6fef7
--- /dev/null
+++ b/config/grafana/alerting/notification-policies.yml
@@ -0,0 +1,11 @@
+apiVersion: 1
+
+policies:
+  - orgId: 1
+    receiver: mvp-default  # contact point defined in contact-points.yml
+    group_by:  # alerts are grouped by rule name and severity
+      - alertname
+      - severity
+    group_wait: 30s
+    group_interval: 5m
+    repeat_interval: 4h
diff --git a/config/grafana/datasources/loki.yml b/config/grafana/datasources/loki.yml
index 7154896..7fa4d24 100644
--- a/config/grafana/datasources/loki.yml
+++ b/config/grafana/datasources/loki.yml
@@ -2,6 +2,7 @@ apiVersion: 1
 
 datasources:
   - name: Loki
+    uid: loki  # fixed uid; alerting/alert-rules.yml refers to it as datasourceUid: loki
     type: loki
     access: proxy
     url: http://mvp-loki:3100
diff --git a/docker-compose.yml b/docker-compose.yml
index 441cb4f..0bef4b2 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -326,6 +326,7 @@ services:
     volumes:
       - ./config/grafana/datasources:/etc/grafana/provisioning/datasources:ro
       - ./config/grafana/provisioning:/etc/grafana/provisioning/dashboards:ro
+      - ./config/grafana/alerting:/etc/grafana/provisioning/alerting:ro  # alert rules, contact points, policies (read-only)
       - ./config/grafana/dashboards:/var/lib/grafana/dashboards:ro
       - mvp_grafana_data:/var/lib/grafana
     networks:
diff --git a/docs/LOGGING.md b/docs/LOGGING.md
index 19d818d..59eacd4 100644
--- a/docs/LOGGING.md
+++ b/docs/LOGGING.md
@@ -52,7 +52,39 @@ All logs include a `requestId` field (UUID v4) for tracing requests:
 - URL: https://logs.motovaultpro.com
 - Default credentials: admin/admin (change on first login)
 
-### 
Example LogQL Queries +## Dashboards + +Four provisioned dashboards are available in the MotoVaultPro folder: + +| Dashboard | Purpose | Key Panels | +|-----------|---------|------------| +| Application Overview | System-wide health at a glance | Container log volume, error rate gauge, log level distribution, container health status, request count | +| API Performance | Backend latency and throughput analysis | Request rate, response time percentiles (p50/p95/p99), status code distribution, slowest endpoints | +| Error Investigation | Debugging and root cause analysis | Error log stream, errors by container/endpoint, stack trace viewer, correlation ID lookup, recent 5xx responses | +| Infrastructure | Container-level logs and platform monitoring | Per-container throughput, PostgreSQL/Redis/Traefik/OCR logs, Loki ingestion rate | + +All dashboards refresh every 30 seconds and default to a 1-hour time window. Dashboard JSON files are in `config/grafana/dashboards/` and provisioned via `config/grafana/provisioning/dashboards.yml`. + +## Alerting Rules + +Grafana Unified Alerting is configured with file-based provisioned rules. Alert rules are evaluated every 1 minute, and a rule's condition must hold continuously for 5 minutes before the alert fires. 
+ +| Alert | Severity | Condition | Description | +|-------|----------|-----------|-------------| +| Error Rate Spike | critical | Error rate > 5% over 5m | Fires when the percentage of error-level logs across all mvp-* containers exceeds 5% | +| Container Silence: mvp-backend | warning | No logs for 5m | Fires when the backend container stops producing logs | +| Container Silence: mvp-postgres | warning | No logs for 5m | Fires when the database container stops producing logs | +| Container Silence: mvp-redis | warning | No logs for 5m | Fires when the cache container stops producing logs | +| 5xx Response Spike | critical | > 10 5xx responses in 5m | Fires when the backend produces more than 10 HTTP 5xx responses | + +Alert configuration files are in `config/grafana/alerting/`: +- `alert-rules.yml` - Alert rule definitions with LogQL queries +- `contact-points.yml` - Notification endpoints (webhook placeholder for future email/Slack) +- `notification-policies.yml` - Routing rules that group alerts by name and severity + +## LogQL Query Reference + +### Common Debugging Queries Query by requestId: ``` @@ -66,7 +98,49 @@ Query all errors: Query slow requests (>500ms): ``` -{container="mvp-backend"} | json | duration > 500 +{container="mvp-backend"} | json | msg="Request processed" | duration > 500 +``` + +### Error Analysis + +Count errors per container over time: +``` +sum by (container) (count_over_time({container=~"mvp-.*"} | json | level="error" [5m])) +``` + +Error rate as percentage: +``` +sum(count_over_time({container=~"mvp-.*"} | json | level="error" [5m])) + / sum(count_over_time({container=~"mvp-.*"} [5m])) * 100 +``` + +### HTTP Status Analysis + +All 5xx responses: +``` +{container="mvp-backend"} | json | msg="Request processed" | status >= 500 +``` + +Request count by status code: +``` +sum by (status) (count_over_time({container="mvp-backend"} | json | msg="Request processed" [5m])) +``` + +### Container-Specific Queries + +PostgreSQL errors: +``` 
+{container="mvp-postgres"} |~ "ERROR|FATAL|PANIC" +``` + +Traefik access logs: +``` +{container="mvp-traefik"} | json +``` + +OCR processing errors: +``` +{container="mvp-ocr"} |~ "ERROR|Exception|Traceback" ``` ## Configuration