feat: Add Grafana dashboards and alerting (#105) #112

Merged
egullickson merged 8 commits from issue-105-add-grafana-dashboards into main 2026-02-06 17:44:05 +00:00
6 changed files with 311 additions and 2 deletions
Showing only changes of commit 4b2b318aff - Show all commits

View File

@@ -0,0 +1,210 @@
# Grafana Unified Alerting — file-provisioned alert rules for MotoVaultPro.
# Evaluated every 1m; each rule must hold for 5m ("for:") before firing.
apiVersion: 1
groups:
  - orgId: 1
    name: MotoVaultPro Alerts
    folder: MotoVaultPro
    interval: 1m
    rules:
      # Error Rate Spike - Alert when error rate exceeds 5% over 5 minutes
      - uid: mvp-error-rate-spike
        title: Error Rate Spike
        condition: D
        data:
          # A: count of error-level log lines across all mvp-* containers
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'sum(count_over_time({container=~"mvp-.*"} | json | level=`error` [5m]))'
              queryType: instant
          # B: total log line count (denominator for the rate)
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: B
              expr: 'sum(count_over_time({container=~"mvp-.*"} [5m]))'
              queryType: instant
          # C: error percentage = A / B * 100
          - refId: C
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: C
              type: math
              expression: '($A / $B) * 100'
          # D: fire when the percentage exceeds 5
          - refId: D
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: D
              type: threshold
              expression: C
              conditions:
                - evaluator:
                    type: gt
                    params:
                      - 5
        noDataState: OK
        execErrState: Error
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Error rate exceeds 5% over 5 minutes across all MotoVaultPro containers
          description: Check the Error Investigation dashboard for details.
      # Container Silence - mvp-backend
      - uid: mvp-silence-backend
        title: "Container Silence: mvp-backend"
        condition: B
        data:
          # A: log line count from the backend over the last 5m
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'count_over_time({container="mvp-backend"}[5m])'
              queryType: instant
          # B: fire when fewer than 1 line was seen (noDataState also alerts)
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
        noDataState: Alerting
        execErrState: Error
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: mvp-backend container has stopped producing logs
          description: No logs received from mvp-backend for 5 minutes. The container may be down or stuck.
      # Container Silence - mvp-postgres
      - uid: mvp-silence-postgres
        title: "Container Silence: mvp-postgres"
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'count_over_time({container="mvp-postgres"}[5m])'
              queryType: instant
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
        noDataState: Alerting
        execErrState: Error
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: mvp-postgres container has stopped producing logs
          description: No logs received from mvp-postgres for 5 minutes. The database container may be down.
      # Container Silence - mvp-redis
      - uid: mvp-silence-redis
        title: "Container Silence: mvp-redis"
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'count_over_time({container="mvp-redis"}[5m])'
              queryType: instant
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
        noDataState: Alerting
        execErrState: Error
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: mvp-redis container has stopped producing logs
          description: No logs received from mvp-redis for 5 minutes. The cache container may be down.
      # 5xx Spike - Alert when 5xx responses exceed threshold
      - uid: mvp-5xx-spike
        title: 5xx Response Spike
        condition: B
        data:
          # A: count of structured request logs with HTTP status >= 500
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'sum(count_over_time({container="mvp-backend"} | json | msg=`Request processed` | status >= 500 [5m]))'
              queryType: instant
          # B: fire when more than 10 such responses occurred in 5m
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: gt
                    params:
                      - 10
        noDataState: OK
        execErrState: Error
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High rate of 5xx responses from mvp-backend
          description: More than 10 HTTP 5xx responses in 5 minutes. Check the API Performance and Error Investigation dashboards.

View File

@@ -0,0 +1,12 @@
# Grafana contact points (file provisioning). A single placeholder webhook
# receiver; swap the URL for a real email/Slack/webhook integration later.
apiVersion: 1
contactPoints:
  - orgId: 1
    name: mvp-default
    receivers:
      - uid: mvp-webhook-placeholder
        type: webhook
        settings:
          url: "https://example.com/mvp-webhook-placeholder"
          httpMethod: POST
        # keep resolve notifications enabled so recoveries are reported
        disableResolveMessage: false

View File

@@ -0,0 +1,11 @@
# Grafana notification policy tree (file provisioning). Root policy routes
# everything to the mvp-default contact point, grouped by alert name and
# severity to avoid one notification per rule evaluation.
apiVersion: 1
policies:
  - orgId: 1
    receiver: mvp-default
    group_by:
      - alertname
      - severity
    group_wait: 30s
    group_interval: 5m
    repeat_interval: 4h

View File

@@ -2,6 +2,7 @@ apiVersion: 1
datasources:
  - name: Loki
    uid: loki
    type: loki
    access: proxy
    url: http://mvp-loki:3100

View File

@@ -326,6 +326,7 @@ services:
    volumes:
      - ./config/grafana/datasources:/etc/grafana/provisioning/datasources:ro
      - ./config/grafana/provisioning:/etc/grafana/provisioning/dashboards:ro
      - ./config/grafana/alerting:/etc/grafana/provisioning/alerting:ro
      - ./config/grafana/dashboards:/var/lib/grafana/dashboards:ro
      - mvp_grafana_data:/var/lib/grafana
    networks:

View File

@@ -52,7 +52,39 @@ All logs include a `requestId` field (UUID v4) for tracing requests:
- URL: https://logs.motovaultpro.com
- Default credentials: admin/admin (change on first login)
## Dashboards
Four provisioned dashboards are available in the MotoVaultPro folder:
| Dashboard | Purpose | Key Panels |
|-----------|---------|------------|
| Application Overview | System-wide health at a glance | Container log volume, error rate gauge, log level distribution, container health status, request count |
| API Performance | Backend latency and throughput analysis | Request rate, response time percentiles (p50/p95/p99), status code distribution, slowest endpoints |
| Error Investigation | Debugging and root cause analysis | Error log stream, errors by container/endpoint, stack trace viewer, correlation ID lookup, recent 5xx responses |
| Infrastructure | Container-level logs and platform monitoring | Per-container throughput, PostgreSQL/Redis/Traefik/OCR logs, Loki ingestion rate |
All dashboards refresh every 30 seconds and default to a 1-hour time window. Dashboard JSON files are in `config/grafana/dashboards/` and provisioned via `config/grafana/provisioning/dashboards.yml`.
## Alerting Rules
Grafana Unified Alerting is configured with file-based provisioned rules. Alert rules are evaluated every 1 minute and must fire continuously for 5 minutes before triggering.
| Alert | Severity | Condition | Description |
|-------|----------|-----------|-------------|
| Error Rate Spike | critical | Error rate > 5% over 5m | Fires when the percentage of error-level logs across all mvp-* containers exceeds 5% |
| Container Silence: mvp-backend | warning | No logs for 5m | Fires when the backend container stops producing logs |
| Container Silence: mvp-postgres | warning | No logs for 5m | Fires when the database container stops producing logs |
| Container Silence: mvp-redis | warning | No logs for 5m | Fires when the cache container stops producing logs |
| 5xx Response Spike | critical | > 10 5xx responses in 5m | Fires when the backend produces more than 10 HTTP 5xx responses |
Alert configuration files are in `config/grafana/alerting/`:
- `alert-rules.yml` - Alert rule definitions with LogQL queries
- `contact-points.yml` - Notification endpoints (webhook placeholder for future email/Slack)
- `notification-policies.yml` - Routing rules that group alerts by name and severity
## LogQL Query Reference
### Common Debugging Queries
Query by requestId:
```
@@ -66,7 +98,49 @@ Query all errors:
Query slow requests (>500ms):
```
{container="mvp-backend"} | json | msg="Request processed" | duration > 500
```
### Error Analysis
Count errors per container over time:
```
sum by (container) (count_over_time({container=~"mvp-.*"} | json | level="error" [5m]))
```
Error rate as percentage:
```
sum(count_over_time({container=~"mvp-.*"} | json | level="error" [5m]))
/ sum(count_over_time({container=~"mvp-.*"} [5m])) * 100
```
### HTTP Status Analysis
All 5xx responses:
```
{container="mvp-backend"} | json | msg="Request processed" | status >= 500
```
Request count by status code:
```
sum by (status) (count_over_time({container="mvp-backend"} | json | msg="Request processed" [5m]))
```
### Container-Specific Queries
PostgreSQL errors:
```
{container="mvp-postgres"} |~ "ERROR|FATAL|PANIC"
```
Traefik access logs:
```
{container="mvp-traefik"} | json
```
OCR processing errors:
```
{container="mvp-ocr"} |~ "ERROR|Exception|Traceback"
```
## Configuration