# Grafana alerting provisioning file: one rule group of Loki-backed alert
# rules covering the MotoVaultPro (mvp-*) containers.
#
# Every rule follows the same shape: one or more Loki instant queries,
# followed by server-side expressions (datasourceUid: __expr__) ending in a
# threshold whose refId is named by the rule's `condition`.
apiVersion: 1

groups:
  - orgId: 1
    name: MotoVaultPro Alerts
    folder: MotoVaultPro
    # Evaluation interval shared by every rule in this group.
    interval: 1m
    rules:
      # Error Rate Spike - alert when the error rate exceeds 5% over 5 minutes.
      - uid: mvp-error-rate-spike
        title: Error Rate Spike
        condition: D
        data:
          # A: number of log lines with level=error across all mvp-* containers.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'sum(count_over_time({container=~"mvp-.*"} | json | level=`error` [5m]))'
              queryType: instant
          # B: total number of log lines across all mvp-* containers.
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: B
              expr: 'sum(count_over_time({container=~"mvp-.*"} [5m]))'
              queryType: instant
          # C: error lines as a percentage of all lines.
          - refId: C
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: C
              type: math
              expression: '($A / $B) * 100'
          # D: alert condition - percentage above 5.
          - refId: D
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: D
              type: threshold
              expression: C
              conditions:
                - evaluator:
                    type: gt
                    params:
                      - 5
        # Missing data is treated as healthy for this rule.
        noDataState: OK
        execErrState: Error
        # The condition must hold for 5 minutes before the alert fires.
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Error rate exceeds 5% over 5 minutes across all MotoVaultPro containers
          description: Check the Error Investigation dashboard for details.

      # Container Silence - mvp-backend
      # NOTE(review): when a container logs nothing, Loki is expected to return
      # no series rather than a 0 sample, so the silence rules presumably fire
      # through `noDataState: Alerting` rather than the `lt 1` threshold -
      # confirm before changing noDataState.
      - uid: mvp-silence-backend
        title: "Container Silence: mvp-backend"
        condition: B
        data:
          # A: log lines emitted by mvp-backend in the last 5 minutes.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'count_over_time({container="mvp-backend"}[5m])'
              queryType: instant
          # B: alert condition - fewer than 1 log line.
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
        noDataState: Alerting
        execErrState: Error
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: mvp-backend container has stopped producing logs
          description: No logs received from mvp-backend for 5 minutes. The container may be down or stuck.

      # Container Silence - mvp-postgres
      - uid: mvp-silence-postgres
        title: "Container Silence: mvp-postgres"
        condition: B
        data:
          # A: log lines emitted by mvp-postgres in the last 5 minutes.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'count_over_time({container="mvp-postgres"}[5m])'
              queryType: instant
          # B: alert condition - fewer than 1 log line.
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
        noDataState: Alerting
        execErrState: Error
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: mvp-postgres container has stopped producing logs
          description: No logs received from mvp-postgres for 5 minutes. The database container may be down.

      # Container Silence - mvp-redis
      - uid: mvp-silence-redis
        title: "Container Silence: mvp-redis"
        condition: B
        data:
          # A: log lines emitted by mvp-redis in the last 5 minutes.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'count_over_time({container="mvp-redis"}[5m])'
              queryType: instant
          # B: alert condition - fewer than 1 log line.
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
        noDataState: Alerting
        execErrState: Error
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: mvp-redis container has stopped producing logs
          description: No logs received from mvp-redis for 5 minutes. The cache container may be down.

      # 5xx Spike - alert when mvp-backend logs more than 10 responses with
      # status >= 500 within 5 minutes.
      - uid: mvp-5xx-spike
        title: "5xx Response Spike"
        condition: B
        data:
          # A: "Request processed" log lines from mvp-backend with status >= 500.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: loki
            model:
              refId: A
              expr: 'sum(count_over_time({container="mvp-backend"} | json | msg=`Request processed` | status >= 500 [5m]))'
              queryType: instant
          # B: alert condition - count above 10.
          - refId: B
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: __expr__
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: gt
                    params:
                      - 10
        # Missing data is treated as healthy for this rule.
        noDataState: OK
        execErrState: Error
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High rate of 5xx responses from mvp-backend
          description: More than 10 HTTP 5xx responses in 5 minutes. Check the API Performance and Error Investigation dashboards.