Files
motovaultpro/config/grafana/alerting/alert-rules.yml
Eric Gullickson c88fbcdc4e
All checks were successful
Deploy to Staging / Build Images (push) Successful in 35s
Deploy to Staging / Deploy to Staging (push) Successful in 51s
Deploy to Staging / Verify Staging (push) Successful in 2m31s
Deploy to Staging / Notify Staging Ready (push) Successful in 7s
Deploy to Staging / Notify Staging Failure (push) Has been skipped
fix: Update grafana dashboards
2026-02-06 13:50:17 -06:00

211 lines
6.2 KiB
YAML

apiVersion: 1
groups:
  # One rule group; every alert rule in this file is listed under its `rules` key.
  # NOTE(review): `interval` presumably sets how often the group's rules are
  # evaluated - confirm against the Grafana provisioning documentation.
  - name: MotoVaultPro Alerts
    orgId: 1
    folder: MotoVaultPro
    interval: 1m
    rules:
# Error Rate Spike: error-level log lines as a percentage of all log lines.
      # Pipeline: A (error lines) and B (all lines) feed C = A / B * 100;
      # D applies the threshold to C and is the rule condition.
      - uid: mvp-error-rate-spike
        title: 'Error Rate Spike'
        condition: D
        for: 5m
        # NOTE(review): presumably a window with no matching lines yields no
        # data for A or B, which this setting maps to OK - confirm.
        noDataState: OK
        execErrState: Error
        labels:
          severity: critical
        annotations:
          summary: Error rate exceeds 5% over 5 minutes across all MotoVaultPro containers
          description: Check the Error Investigation dashboard for details.
        data:
          # A: lines from mvp-* containers whose JSON `level` field is `error`, 5m window.
          - refId: A
            datasourceUid: loki
            relativeTimeRange: {from: 600, to: 0}
            model:
              refId: A
              queryType: instant
              expr: 'sum(count_over_time({container=~"mvp-.*"} | json | level=`error` [5m]))'
          # B: all lines from mvp-* containers, 5m window.
          - refId: B
            datasourceUid: loki
            relativeTimeRange: {from: 600, to: 0}
            model:
              refId: B
              queryType: instant
              expr: 'sum(count_over_time({container=~"mvp-.*"} [5m]))'
          # C: share of error lines, expressed as a percentage.
          - refId: C
            datasourceUid: __expr__
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: C
              type: math
              expression: '($A / $B) * 100'
          # D: threshold on C - greater than 5.
          - refId: D
            datasourceUid: __expr__
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: D
              type: threshold
              expression: C
              conditions:
                - evaluator:
                    type: gt
                    params: [5]
# Container Silence: mvp-backend (the selector also matches mvp-backend-staging).
      - uid: mvp-silence-backend
        title: 'Container Silence: mvp-backend'
        condition: B
        for: 5m
        # NOTE(review): presumably an empty 5m window returns no series rather
        # than 0, in which case this no-data mapping (not the `lt 1` threshold)
        # is what raises the alert - confirm against Loki behaviour.
        noDataState: Alerting
        execErrState: Error
        labels:
          severity: warning
        annotations:
          summary: mvp-backend container has stopped producing logs
          description: No logs received from mvp-backend for 5 minutes. The container may be down or stuck.
        data:
          # A: number of log lines from the backend container(s), 5m window.
          - refId: A
            datasourceUid: loki
            relativeTimeRange: {from: 600, to: 0}
            model:
              refId: A
              queryType: instant
              expr: 'count_over_time({container=~"mvp-backend(-staging)?"}[5m])'
          # B: threshold on A - less than 1.
          - refId: B
            datasourceUid: __expr__
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params: [1]
# Container Silence: mvp-postgres (the selector also matches mvp-postgres-staging).
      - uid: mvp-silence-postgres
        title: 'Container Silence: mvp-postgres'
        condition: B
        for: 5m
        # NOTE(review): presumably an empty 5m window returns no series rather
        # than 0, in which case this no-data mapping (not the `lt 1` threshold)
        # is what raises the alert - confirm against Loki behaviour.
        noDataState: Alerting
        execErrState: Error
        labels:
          severity: warning
        annotations:
          summary: mvp-postgres container has stopped producing logs
          description: No logs received from mvp-postgres for 5 minutes. The database container may be down.
        data:
          # A: number of log lines from the postgres container(s), 5m window.
          - refId: A
            datasourceUid: loki
            relativeTimeRange: {from: 600, to: 0}
            model:
              refId: A
              queryType: instant
              expr: 'count_over_time({container=~"mvp-postgres(-staging)?"}[5m])'
          # B: threshold on A - less than 1.
          - refId: B
            datasourceUid: __expr__
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params: [1]
# Container Silence: mvp-redis (the selector also matches mvp-redis-staging).
      - uid: mvp-silence-redis
        title: 'Container Silence: mvp-redis'
        condition: B
        for: 5m
        # NOTE(review): presumably an empty 5m window returns no series rather
        # than 0, in which case this no-data mapping (not the `lt 1` threshold)
        # is what raises the alert - confirm against Loki behaviour.
        noDataState: Alerting
        execErrState: Error
        labels:
          severity: warning
        annotations:
          summary: mvp-redis container has stopped producing logs
          description: No logs received from mvp-redis for 5 minutes. The cache container may be down.
        data:
          # A: number of log lines from the redis container(s), 5m window.
          - refId: A
            datasourceUid: loki
            relativeTimeRange: {from: 600, to: 0}
            model:
              refId: A
              queryType: instant
              expr: 'count_over_time({container=~"mvp-redis(-staging)?"}[5m])'
          # B: threshold on A - less than 1.
          - refId: B
            datasourceUid: __expr__
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params: [1]
# 5xx Spike: count of backend request log lines with status >= 500.
      - uid: mvp-5xx-spike
        title: '5xx Response Spike'
        condition: B
        for: 5m
        # NOTE(review): presumably a window with no 5xx lines yields no data,
        # which this setting maps to OK - confirm.
        noDataState: OK
        execErrState: Error
        labels:
          severity: critical
        annotations:
          summary: High rate of 5xx responses from mvp-backend
          description: More than 10 HTTP 5xx responses in 5 minutes. Check the API Performance and Error Investigation dashboards.
        data:
          # A: backend lines with msg `Request processed` and JSON `status` >= 500, 5m window.
          - refId: A
            datasourceUid: loki
            relativeTimeRange: {from: 600, to: 0}
            model:
              refId: A
              queryType: instant
              expr: 'sum(count_over_time({container=~"mvp-backend(-staging)?"} | json | msg=`Request processed` | status >= 500 [5m]))'
          # B: threshold on A - greater than 10.
          - refId: B
            datasourceUid: __expr__
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: gt
                    params: [10]