# NOTE(review): the lines below were scrape residue (a commit message and
# duplicated file metadata) fused onto the top of this file; preserved here
# as comments so the YAML document parses.
#
# Commit note: Config files were previously gitignored, causing the CI/CD
# pipeline to fail because Docker would create directories instead of
# mounting the expected files.
#   - Remove config/** from .gitignore
#   - Track all config files (secrets still ignored)
# Service Health Alerts for MotoVaultPro K8s-equivalent monitoring
# These alerts mirror K8s PrometheusRule patterns for service monitoring
---
groups:
  - name: service-health
    rules:
      # Service availability alerts
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "Service {{ $labels.job }} on {{ $labels.instance }} has been down for more than 1 minute."

      - alert: HighResponseTime
        # Traefik exposes request duration as a *histogram*, not a summary:
        # a {quantile="0.95"} selector matches no series and the alert could
        # never fire. Compute the p95 from the _bucket series instead.
        expr: >-
          histogram_quantile(0.95,
            sum by (le, service) (rate(traefik_service_request_duration_seconds_bucket[5m]))) > 2
        for: 2m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High response time for service {{ $labels.service }}"
          description: "95th percentile response time for {{ $labels.service }} is {{ $value }}s"

      - alert: HighErrorRate
        # Error *ratio* (non-2xx / all requests) so the 0.1 threshold means
        # 10% and agrees with the humanizePercentage in the description; the
        # original compared an absolute req/s rate against 0.1.
        expr: >-
          sum by (service) (rate(traefik_service_requests_total{code!~"2.."}[5m]))
          / sum by (service) (rate(traefik_service_requests_total[5m])) > 0.1
        for: 2m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High error rate for service {{ $labels.service }}"
          description: "Error rate for {{ $labels.service }} is {{ $value | humanizePercentage }}"

  - name: database-health
    rules:
      # Database connection alerts
      - alert: DatabaseConnectionsHigh
        expr: pg_stat_database_numbackends / pg_settings_max_connections > 0.8
        for: 5m
        labels:
          severity: warning
          team: database
        annotations:
          summary: "High database connections for {{ $labels.datname }}"
          description: "Database {{ $labels.datname }} is using {{ $value | humanizePercentage }} of max connections"

      - alert: DatabaseReplicationLag
        expr: pg_stat_replication_lag_bytes > 1073741824  # 1 GiB
        for: 2m
        labels:
          severity: critical
          team: database
        annotations:
          summary: "High replication lag for database {{ $labels.datname }}"
          description: "Replication lag is {{ $value | humanize1024 }}B"

  - name: resource-usage
    rules:
      # Resource usage alerts
      - alert: HighMemoryUsage
        # The (limit > 0) filter drops containers with no memory limit, whose
        # limit series is 0 — the bare division would yield +Inf and fire
        # permanently for every unlimited container.
        expr: container_memory_usage_bytes / (container_spec_memory_limit_bytes > 0) > 0.9
        for: 2m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High memory usage for container {{ $labels.name }}"
          description: "Container {{ $labels.name }} is using {{ $value | humanizePercentage }} of memory limit"

      - alert: HighCPUUsage
        # container_spec_cpu_quota is microseconds of CPU per scheduling
        # period, so the core limit is quota / period; dividing by the raw
        # quota (as the original did) underestimates usage by ~100000x and
        # the alert could never fire.
        expr: >-
          rate(container_cpu_usage_seconds_total[5m])
          / (container_spec_cpu_quota / container_spec_cpu_period) * 100 > 80
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High CPU usage for container {{ $labels.name }}"
          description: "Container {{ $labels.name }} is using {{ $value }}% CPU"

  - name: traefik-health
    rules:
      # Traefik specific alerts
      - alert: TraefikServiceDiscoveryFailure
        expr: increase(traefik_config_reloads_failure_total[5m]) > 0
        for: 1m
        labels:
          severity: warning
          team: infrastructure
        annotations:
          summary: "Traefik configuration reload failures"
          description: "Traefik has {{ $value }} configuration reload failures in the last 5 minutes"

      - alert: TraefikBackendDown
        expr: traefik_service_server_up == 0
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Traefik backend {{ $labels.service }} is down"
          description: "Backend server for service {{ $labels.service }} is unreachable"