feat: Add Grafana dashboards and alerting (#105) #112
210
config/grafana/alerting/alert-rules.yml
Normal file
210
config/grafana/alerting/alert-rules.yml
Normal file
@@ -0,0 +1,210 @@
|
|||||||
|
# Grafana file-provisioned alert rules for MotoVaultPro.
# All queries run against the Loki datasource (uid: loki); expression steps
# use the built-in __expr__ datasource.
apiVersion: 1

groups:
  - orgId: 1
    name: MotoVaultPro Alerts
    folder: MotoVaultPro
    # Evaluation interval shared by every rule in this group.
    interval: 1m
    rules:
      # ----------------------------------------------------------------
      # Error Rate Spike
      #   A = error-level log lines over 5m (all mvp-* containers)
      #   B = all log lines over 5m (all mvp-* containers)
      #   C = A / B expressed as a percentage
      #   D = C > 5  (alert condition)
      # ----------------------------------------------------------------
      - uid: mvp-error-rate-spike
        title: Error Rate Spike
        for: 5m
        condition: D
        noDataState: OK
        execErrState: Error
        labels:
          severity: critical
        annotations:
          summary: Error rate exceeds 5% over 5 minutes across all MotoVaultPro containers
          description: Check the Error Investigation dashboard for details.
        data:
          - refId: A
            datasourceUid: loki
            relativeTimeRange:
              from: 600
              to: 0
            model:
              refId: A
              queryType: instant
              expr: 'sum(count_over_time({container=~"mvp-.*"} | json | level=`error` [5m]))'
          - refId: B
            datasourceUid: loki
            relativeTimeRange:
              from: 600
              to: 0
            model:
              refId: B
              queryType: instant
              expr: 'sum(count_over_time({container=~"mvp-.*"} [5m]))'
          - refId: C
            datasourceUid: __expr__
            relativeTimeRange:
              from: 0
              to: 0
            model:
              refId: C
              type: math
              expression: '($A / $B) * 100'
          - refId: D
            datasourceUid: __expr__
            relativeTimeRange:
              from: 0
              to: 0
            model:
              refId: D
              type: threshold
              expression: C
              conditions:
                - evaluator:
                    type: gt
                    params: [5]

      # ----------------------------------------------------------------
      # Container Silence - mvp-backend
      #   A = log line count over 5m, B = A < 1 (alert condition).
      #   An empty query result is also treated as alerting (noDataState).
      # ----------------------------------------------------------------
      - uid: mvp-silence-backend
        title: "Container Silence: mvp-backend"
        for: 5m
        condition: B
        noDataState: Alerting
        execErrState: Error
        labels:
          severity: warning
        annotations:
          summary: mvp-backend container has stopped producing logs
          description: No logs received from mvp-backend for 5 minutes. The container may be down or stuck.
        data:
          - refId: A
            datasourceUid: loki
            relativeTimeRange:
              from: 600
              to: 0
            model:
              refId: A
              queryType: instant
              expr: 'count_over_time({container="mvp-backend"}[5m])'
          - refId: B
            datasourceUid: __expr__
            relativeTimeRange:
              from: 0
              to: 0
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params: [1]

      # ----------------------------------------------------------------
      # Container Silence - mvp-postgres
      #   Same shape as the mvp-backend silence rule.
      # ----------------------------------------------------------------
      - uid: mvp-silence-postgres
        title: "Container Silence: mvp-postgres"
        for: 5m
        condition: B
        noDataState: Alerting
        execErrState: Error
        labels:
          severity: warning
        annotations:
          summary: mvp-postgres container has stopped producing logs
          description: No logs received from mvp-postgres for 5 minutes. The database container may be down.
        data:
          - refId: A
            datasourceUid: loki
            relativeTimeRange:
              from: 600
              to: 0
            model:
              refId: A
              queryType: instant
              expr: 'count_over_time({container="mvp-postgres"}[5m])'
          - refId: B
            datasourceUid: __expr__
            relativeTimeRange:
              from: 0
              to: 0
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params: [1]

      # ----------------------------------------------------------------
      # Container Silence - mvp-redis
      #   Same shape as the mvp-backend silence rule.
      # ----------------------------------------------------------------
      - uid: mvp-silence-redis
        title: "Container Silence: mvp-redis"
        for: 5m
        condition: B
        noDataState: Alerting
        execErrState: Error
        labels:
          severity: warning
        annotations:
          summary: mvp-redis container has stopped producing logs
          description: No logs received from mvp-redis for 5 minutes. The cache container may be down.
        data:
          - refId: A
            datasourceUid: loki
            relativeTimeRange:
              from: 600
              to: 0
            model:
              refId: A
              queryType: instant
              expr: 'count_over_time({container="mvp-redis"}[5m])'
          - refId: B
            datasourceUid: __expr__
            relativeTimeRange:
              from: 0
              to: 0
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params: [1]

      # ----------------------------------------------------------------
      # 5xx Response Spike
      #   A = "Request processed" log lines from mvp-backend with
      #       status >= 500 over 5m, B = A > 10 (alert condition).
      # ----------------------------------------------------------------
      - uid: mvp-5xx-spike
        title: 5xx Response Spike
        for: 5m
        condition: B
        noDataState: OK
        execErrState: Error
        labels:
          severity: critical
        annotations:
          summary: High rate of 5xx responses from mvp-backend
          description: More than 10 HTTP 5xx responses in 5 minutes. Check the API Performance and Error Investigation dashboards.
        data:
          - refId: A
            datasourceUid: loki
            relativeTimeRange:
              from: 600
              to: 0
            model:
              refId: A
              queryType: instant
              expr: 'sum(count_over_time({container="mvp-backend"} | json | msg=`Request processed` | status >= 500 [5m]))'
          - refId: B
            datasourceUid: __expr__
            relativeTimeRange:
              from: 0
              to: 0
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: gt
                    params: [10]
|
||||||
12
config/grafana/alerting/contact-points.yml
Normal file
12
config/grafana/alerting/contact-points.yml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# Grafana file-provisioned contact points for MotoVaultPro alerts.
apiVersion: 1

contactPoints:
  - orgId: 1
    name: mvp-default
    receivers:
      # Placeholder webhook receiver; the URL below is not a real endpoint.
      - uid: mvp-webhook-placeholder
        type: webhook
        disableResolveMessage: false
        settings:
          httpMethod: POST
          url: "https://example.com/mvp-webhook-placeholder"
|
||||||
11
config/grafana/alerting/notification-policies.yml
Normal file
11
config/grafana/alerting/notification-policies.yml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
# Grafana file-provisioned notification policy tree for MotoVaultPro.
apiVersion: 1

policies:
  # Root policy: everything is routed to the mvp-default contact point,
  # grouped by alert name and severity.
  - orgId: 1
    receiver: mvp-default
    group_by: [alertname, severity]
    group_wait: 30s
    group_interval: 5m
    repeat_interval: 4h
|
||||||
@@ -2,6 +2,7 @@ apiVersion: 1
|
|||||||
|
|
||||||
datasources:
|
datasources:
|
||||||
- name: Loki
|
- name: Loki
|
||||||
|
uid: loki
|
||||||
type: loki
|
type: loki
|
||||||
access: proxy
|
access: proxy
|
||||||
url: http://mvp-loki:3100
|
url: http://mvp-loki:3100
|
||||||
|
|||||||
@@ -326,6 +326,7 @@ services:
|
|||||||
volumes:
|
volumes:
|
||||||
- ./config/grafana/datasources:/etc/grafana/provisioning/datasources:ro
|
- ./config/grafana/datasources:/etc/grafana/provisioning/datasources:ro
|
||||||
- ./config/grafana/provisioning:/etc/grafana/provisioning/dashboards:ro
|
- ./config/grafana/provisioning:/etc/grafana/provisioning/dashboards:ro
|
||||||
|
- ./config/grafana/alerting:/etc/grafana/provisioning/alerting:ro
|
||||||
- ./config/grafana/dashboards:/var/lib/grafana/dashboards:ro
|
- ./config/grafana/dashboards:/var/lib/grafana/dashboards:ro
|
||||||
- mvp_grafana_data:/var/lib/grafana
|
- mvp_grafana_data:/var/lib/grafana
|
||||||
networks:
|
networks:
|
||||||
|
|||||||
@@ -52,7 +52,39 @@ All logs include a `requestId` field (UUID v4) for tracing requests:
|
|||||||
- URL: https://logs.motovaultpro.com
|
- URL: https://logs.motovaultpro.com
|
||||||
- Default credentials: admin/admin (change on first login)
|
- Default credentials: admin/admin (change on first login)
|
||||||
|
|
||||||
### Example LogQL Queries
|
## Dashboards
|
||||||
|
|
||||||
|
Four provisioned dashboards are available in the MotoVaultPro folder:
|
||||||
|
|
||||||
|
| Dashboard | Purpose | Key Panels |
|
||||||
|
|-----------|---------|------------|
|
||||||
|
| Application Overview | System-wide health at a glance | Container log volume, error rate gauge, log level distribution, container health status, request count |
|
||||||
|
| API Performance | Backend latency and throughput analysis | Request rate, response time percentiles (p50/p95/p99), status code distribution, slowest endpoints |
|
||||||
|
| Error Investigation | Debugging and root cause analysis | Error log stream, errors by container/endpoint, stack trace viewer, correlation ID lookup, recent 5xx responses |
|
||||||
|
| Infrastructure | Container-level logs and platform monitoring | Per-container throughput, PostgreSQL/Redis/Traefik/OCR logs, Loki ingestion rate |
|
||||||
|
|
||||||
|
All dashboards refresh every 30 seconds and default to a 1-hour time window. Dashboard JSON files are in `config/grafana/dashboards/` and provisioned via `config/grafana/provisioning/dashboards.yml`.
|
||||||
|
|
||||||
|
## Alerting Rules
|
||||||
|
|
||||||
|
Grafana Unified Alerting is configured with file-based provisioned rules. Alert rules are evaluated every 1 minute, and a rule's condition must hold continuously for 5 minutes before the alert fires.
|
||||||
|
|
||||||
|
| Alert | Severity | Condition | Description |
|
||||||
|
|-------|----------|-----------|-------------|
|
||||||
|
| Error Rate Spike | critical | Error rate > 5% over 5m | Fires when the percentage of error-level logs across all mvp-* containers exceeds 5% |
|
||||||
|
| Container Silence: mvp-backend | warning | No logs for 5m | Fires when the backend container stops producing logs |
|
||||||
|
| Container Silence: mvp-postgres | warning | No logs for 5m | Fires when the database container stops producing logs |
|
||||||
|
| Container Silence: mvp-redis | warning | No logs for 5m | Fires when the cache container stops producing logs |
|
||||||
|
| 5xx Response Spike | critical | > 10 5xx responses in 5m | Fires when the backend produces more than 10 HTTP 5xx responses |
|
||||||
|
|
||||||
|
Alert configuration files are in `config/grafana/alerting/`:
|
||||||
|
- `alert-rules.yml` - Alert rule definitions with LogQL queries
|
||||||
|
- `contact-points.yml` - Notification endpoints (webhook placeholder for future email/Slack)
|
||||||
|
- `notification-policies.yml` - Routing rules that group alerts by name and severity
|
||||||
|
|
||||||
|
## LogQL Query Reference
|
||||||
|
|
||||||
|
### Common Debugging Queries
|
||||||
|
|
||||||
Query by requestId:
|
Query by requestId:
|
||||||
```
|
```
|
||||||
@@ -66,7 +98,49 @@ Query all errors:
|
|||||||
|
|
||||||
Query slow requests (>500ms):
|
Query slow requests (>500ms):
|
||||||
```
|
```
|
||||||
{container="mvp-backend"} | json | duration > 500
|
{container="mvp-backend"} | json | msg="Request processed" | duration > 500
|
||||||
|
```
|
||||||
|
|
||||||
|
### Error Analysis
|
||||||
|
|
||||||
|
Count errors per container over time:
|
||||||
|
```
|
||||||
|
sum by (container) (count_over_time({container=~"mvp-.*"} | json | level="error" [5m]))
|
||||||
|
```
|
||||||
|
|
||||||
|
Error rate as percentage:
|
||||||
|
```
|
||||||
|
sum(count_over_time({container=~"mvp-.*"} | json | level="error" [5m]))
|
||||||
|
/ sum(count_over_time({container=~"mvp-.*"} [5m])) * 100
|
||||||
|
```
|
||||||
|
|
||||||
|
### HTTP Status Analysis
|
||||||
|
|
||||||
|
All 5xx responses:
|
||||||
|
```
|
||||||
|
{container="mvp-backend"} | json | msg="Request processed" | status >= 500
|
||||||
|
```
|
||||||
|
|
||||||
|
Request count by status code:
|
||||||
|
```
|
||||||
|
sum by (status) (count_over_time({container="mvp-backend"} | json | msg="Request processed" [5m]))
|
||||||
|
```
|
||||||
|
|
||||||
|
### Container-Specific Queries
|
||||||
|
|
||||||
|
PostgreSQL errors:
|
||||||
|
```
|
||||||
|
{container="mvp-postgres"} |~ "ERROR|FATAL|PANIC"
|
||||||
|
```
|
||||||
|
|
||||||
|
Traefik access logs:
|
||||||
|
```
|
||||||
|
{container="mvp-traefik"} | json
|
||||||
|
```
|
||||||
|
|
||||||
|
OCR processing errors:
|
||||||
|
```
|
||||||
|
{container="mvp-ocr"} |~ "ERROR|Exception|Traceback"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
|
|||||||
Reference in New Issue
Block a user