# motovaultpro/config/grafana/alerting/alert-rules.yml

# Alert-rule provisioning document: a single rule group whose rules are
# listed under `rules:` below.
apiVersion: 1

groups:
  # The only group in this file; it belongs to org 1 and is filed under the
  # "MotoVaultPro" folder.
  - orgId: 1
    name: MotoVaultPro Alerts
    folder: MotoVaultPro
    # Group-level interval applied to every rule below.
    # NOTE(review): presumably the evaluation cadence - confirm against the
    # Grafana provisioning docs.
    interval: 1m
    rules:
      # Rule: Error Rate Spike
      # Expresses error-level log lines as a percentage of all log lines from
      # containers matching "mvp-.*" (5m LogQL window) and breaches once that
      # percentage is greater than 5.
      - title: 'Error Rate Spike'
        uid: mvp-error-rate-spike
        labels: {severity: critical}
        annotations:
          summary: >-
            Error rate exceeds 5% over 5 minutes
            across all MotoVaultPro containers
          description: Check the Error Investigation dashboard for details.
        # NOTE(review): presumably the breach must persist this long before the
        # rule fires, on top of the 5m query window - confirm.
        for: 5m
        noDataState: OK
        execErrState: Error
        # refId of the expression whose result decides the alert state.
        condition: D
        data:
          # A - numerator: JSON-parsed lines whose level is `error`.
          - refId: A
            datasourceUid: loki
            relativeTimeRange: {from: 600, to: 0}
            model:
              refId: A
              queryType: instant
              expr: |-
                sum(count_over_time({container=~"mvp-.*"} | json | level=`error` [5m]))
          # B - denominator: every line from the same set of containers.
          - refId: B
            datasourceUid: loki
            relativeTimeRange: {from: 600, to: 0}
            model:
              refId: B
              queryType: instant
              expr: |-
                sum(count_over_time({container=~"mvp-.*"} [5m]))
          # C - A expressed as a percentage of B.
          - refId: C
            datasourceUid: __expr__
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: C
              type: math
              expression: '($A / $B) * 100'
          # D - threshold on C: greater than 5.
          - refId: D
            datasourceUid: __expr__
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: D
              type: threshold
              expression: C
              conditions:
                - evaluator: {type: gt, params: [5]}

      # Rule: Container Silence (mvp-backend)
      # Counts log lines from containers matching "mvp-backend(-staging)?"
      # over a 5m LogQL window; a count below 1, or no data at all, is treated
      # as alerting.
      - title: 'Container Silence: mvp-backend'
        uid: mvp-silence-backend
        labels: {severity: warning}
        annotations:
          summary: mvp-backend container has stopped producing logs
          description: >-
            No logs received from mvp-backend for 5 minutes.
            The container may be down or stuck.
        for: 5m
        # NOTE(review): count_over_time presumably returns no series (rather
        # than 0) when nothing was logged, so this setting - not the `lt 1`
        # threshold - is likely what actually fires the rule. Confirm.
        noDataState: Alerting
        execErrState: Error
        condition: B
        data:
          # A - number of log lines per matching stream in the last 5m.
          - refId: A
            datasourceUid: loki
            relativeTimeRange: {from: 600, to: 0}
            model:
              refId: A
              queryType: instant
              expr: |-
                count_over_time({container=~"mvp-backend(-staging)?"}[5m])
          # B - threshold on A: less than 1.
          - refId: B
            datasourceUid: __expr__
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator: {type: lt, params: [1]}

      # Rule: Container Silence (mvp-postgres)
      # Counts log lines from containers matching "mvp-postgres(-staging)?"
      # over a 5m LogQL window; a count below 1, or no data at all, is treated
      # as alerting.
      - title: 'Container Silence: mvp-postgres'
        uid: mvp-silence-postgres
        labels: {severity: warning}
        annotations:
          summary: mvp-postgres container has stopped producing logs
          description: >-
            No logs received from mvp-postgres for 5 minutes.
            The database container may be down.
        for: 5m
        # NOTE(review): count_over_time presumably returns no series (rather
        # than 0) when nothing was logged, so this setting - not the `lt 1`
        # threshold - is likely what actually fires the rule. Confirm.
        noDataState: Alerting
        execErrState: Error
        condition: B
        data:
          # A - number of log lines per matching stream in the last 5m.
          - refId: A
            datasourceUid: loki
            relativeTimeRange: {from: 600, to: 0}
            model:
              refId: A
              queryType: instant
              expr: |-
                count_over_time({container=~"mvp-postgres(-staging)?"}[5m])
          # B - threshold on A: less than 1.
          - refId: B
            datasourceUid: __expr__
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator: {type: lt, params: [1]}

      # Rule: Container Silence (mvp-redis)
      # Counts log lines from containers matching "mvp-redis(-staging)?" over
      # a 5m LogQL window; a count below 1, or no data at all, is treated as
      # alerting.
      - title: 'Container Silence: mvp-redis'
        uid: mvp-silence-redis
        labels: {severity: warning}
        annotations:
          summary: mvp-redis container has stopped producing logs
          description: >-
            No logs received from mvp-redis for 5 minutes.
            The cache container may be down.
        for: 5m
        # NOTE(review): count_over_time presumably returns no series (rather
        # than 0) when nothing was logged, so this setting - not the `lt 1`
        # threshold - is likely what actually fires the rule. Confirm.
        noDataState: Alerting
        execErrState: Error
        condition: B
        data:
          # A - number of log lines per matching stream in the last 5m.
          - refId: A
            datasourceUid: loki
            relativeTimeRange: {from: 600, to: 0}
            model:
              refId: A
              queryType: instant
              expr: |-
                count_over_time({container=~"mvp-redis(-staging)?"}[5m])
          # B - threshold on A: less than 1.
          - refId: B
            datasourceUid: __expr__
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator: {type: lt, params: [1]}

      # Rule: 5xx Response Spike
      # Counts "Request processed" log lines with status >= 500 from containers
      # matching "mvp-backend(-staging)?" over a 5m LogQL window and breaches
      # when the count is greater than 10.
      - title: '5xx Response Spike'
        uid: mvp-5xx-spike
        labels: {severity: critical}
        annotations:
          summary: High rate of 5xx responses from mvp-backend
          description: >-
            More than 10 HTTP 5xx responses in 5 minutes.
            Check the API Performance and Error Investigation dashboards.
        for: 5m
        # No matching lines means no 5xx responses were logged, so an empty
        # result is treated as healthy.
        noDataState: OK
        execErrState: Error
        condition: B
        data:
          # A - total number of 5xx request log lines in the last 5m.
          - refId: A
            datasourceUid: loki
            relativeTimeRange: {from: 600, to: 0}
            model:
              refId: A
              queryType: instant
              expr: |-
                sum(count_over_time({container=~"mvp-backend(-staging)?"} | json | msg=`Request processed` | status >= 500 [5m]))
          # B - threshold on A: greater than 10.
          - refId: B
            datasourceUid: __expr__
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: B
              type: threshold
              expression: A
              conditions:
                - evaluator: {type: gt, params: [10]}