# All checks were successful
# Deploy to Staging / Build Images (push) Successful in 35s
# Deploy to Staging / Deploy to Staging (push) Successful in 51s
# Deploy to Staging / Verify Staging (push) Successful in 2m31s
# Deploy to Staging / Notify Staging Ready (push) Successful in 7s
# Deploy to Staging / Notify Staging Failure (push) Has been skipped
# 211 lines
# 6.2 KiB
# YAML
# 211 lines
# 6.2 KiB
# YAML
# Alert rule provisioning for the "MotoVaultPro" folder.
#
# Layout: one rule group, evaluated every minute, containing five rules:
#   - mvp-error-rate-spike   error-level share of all log lines > 5 %
#   - mvp-silence-backend    no log lines from the backend container
#   - mvp-silence-postgres   no log lines from the postgres container
#   - mvp-silence-redis      no log lines from the redis container
#   - mvp-5xx-spike          more than 10 HTTP 5xx responses in 5 minutes
#
# Every rule queries the datasource with uid `loki` and post-processes the
# result with server-side expressions (datasourceUid `__expr__`). The rule's
# `condition` names the refId whose result decides whether the rule fires.
apiVersion: 1

groups:
  - orgId: 1
    name: MotoVaultPro Alerts
    folder: MotoVaultPro
    interval: 1m
    rules:
      # Error Rate Spike - alert when error-level log lines exceed 5% of all
      # log lines over a 5-minute window, across every container whose name
      # matches `mvp-.*`.
      - uid: mvp-error-rate-spike
        title: Error Rate Spike
        condition: D
        data:
          # A: number of JSON log lines with level=error in the last 5 minutes.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'sum(count_over_time({container=~"mvp-.*"} | json | level=`error` [5m]))'
              queryType: instant
          # B: total number of log lines in the last 5 minutes (the denominator).
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: B
              expr: 'sum(count_over_time({container=~"mvp-.*"} [5m]))'
              queryType: instant
          # C: error share expressed as a percentage.
          - refId: C
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: C
              type: math
              expression: '($A / $B) * 100'
          # D: the alert condition - true when C is greater than 5.
          - refId: D
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: D
              type: threshold
              expression: C
              conditions:
                - evaluator:
                    type: gt
                    params:
                      - 5
        # An empty query result is treated as healthy for this rule.
        noDataState: OK
        execErrState: Error
        # The condition has to stay true for 5 minutes before the rule fires.
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Error rate exceeds 5% over 5 minutes across all MotoVaultPro containers
          description: Check the Error Investigation dashboard for details.

      # Container Silence - mvp-backend
      # The selector matches the container names `mvp-backend` and
      # `mvp-backend-staging`.
      # NOTE(review): unlike the spike rules, the query is not wrapped in
      # sum(), so each matching log stream is evaluated as its own series.
      # NOTE(review): count_over_time normally yields no series (rather than
      # 0) for a stream without lines, so this rule presumably fires through
      # `noDataState: Alerting` and not through the `lt 1` threshold - confirm.
      - uid: mvp-silence-backend
        title: "Container Silence: mvp-backend"
        condition: B
        data:
          # A: log lines per stream in the last 5 minutes.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'count_over_time({container=~"mvp-backend(-staging)?"}[5m])'
              queryType: instant
          # B: the alert condition - true when A is less than 1.
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
        # No data at all is treated as an alerting state for silence rules.
        noDataState: Alerting
        execErrState: Error
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: mvp-backend container has stopped producing logs
          description: No logs received from mvp-backend for 5 minutes. The container may be down or stuck.

      # Container Silence - mvp-postgres
      # Same shape as the mvp-backend silence rule; the selector matches
      # `mvp-postgres` and `mvp-postgres-staging`.
      - uid: mvp-silence-postgres
        title: "Container Silence: mvp-postgres"
        condition: B
        data:
          # A: log lines per stream in the last 5 minutes.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'count_over_time({container=~"mvp-postgres(-staging)?"}[5m])'
              queryType: instant
          # B: the alert condition - true when A is less than 1.
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
        noDataState: Alerting
        execErrState: Error
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: mvp-postgres container has stopped producing logs
          description: No logs received from mvp-postgres for 5 minutes. The database container may be down.

      # Container Silence - mvp-redis
      # Same shape as the mvp-backend silence rule; the selector matches
      # `mvp-redis` and `mvp-redis-staging`.
      - uid: mvp-silence-redis
        title: "Container Silence: mvp-redis"
        condition: B
        data:
          # A: log lines per stream in the last 5 minutes.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'count_over_time({container=~"mvp-redis(-staging)?"}[5m])'
              queryType: instant
          # B: the alert condition - true when A is less than 1.
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
        noDataState: Alerting
        execErrState: Error
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: mvp-redis container has stopped producing logs
          description: No logs received from mvp-redis for 5 minutes. The cache container may be down.

      # 5xx Spike - alert when the backend logs more than 10 responses with
      # status >= 500 within a 5-minute window.
      - uid: mvp-5xx-spike
        title: 5xx Response Spike
        condition: B
        data:
          # A: count of `Request processed` JSON log lines with status >= 500
          # from `mvp-backend` / `mvp-backend-staging` in the last 5 minutes.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'sum(count_over_time({container=~"mvp-backend(-staging)?"} | json | msg=`Request processed` | status >= 500 [5m]))'
              queryType: instant
          # B: the alert condition - true when A is greater than 10.
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: gt
                    params:
                      - 10
        # An empty query result is treated as healthy for this rule.
        noDataState: OK
        execErrState: Error
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High rate of 5xx responses from mvp-backend
          description: More than 10 HTTP 5xx responses in 5 minutes. Check the API Performance and Error Investigation dashboards.