# Service Health Alerts for MotoVaultPro K8s-equivalent monitoring
# These alerts mirror K8s PrometheusRule patterns for service monitoring
#
# Layout: one rule group per concern (service, database, container resources,
# Traefik). Every rule carries a `severity` and a `team` label, and a
# `summary`/`description` annotation pair rendered from the alert's labels.

groups:
  - name: service-health
    rules:
      # Service availability alerts

      # Fires when a scrape target has reported up == 0 for a full minute.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "Service {{ $labels.job }} on {{ $labels.instance }} has been down for more than 1 minute."

      # p95 request latency per Traefik service, in seconds.
      # Traefik exports request duration as a histogram, so the quantile is
      # computed from the `_bucket` series; there is no `quantile` label to
      # select on.
      - alert: HighResponseTime
        expr: >-
          histogram_quantile(
          0.95,
          sum by (le, service) (rate(traefik_service_request_duration_seconds_bucket[5m]))
          ) > 2
        for: 2m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High response time for service {{ $labels.service }}"
          description: "95th percentile response time for {{ $labels.service }} is {{ $value }}s"

      # Share of requests answered with a 4xx/5xx status, per service.
      # The value is a 0..1 ratio so that `humanizePercentage` in the
      # description renders correctly and the 0.1 threshold means 10 %.
      # Informational (1xx) and redirect (3xx) responses are not counted.
      - alert: HighErrorRate
        expr: >-
          sum by (service) (rate(traefik_service_requests_total{code=~"[45].."}[5m]))
          /
          sum by (service) (rate(traefik_service_requests_total[5m]))
          > 0.1
        for: 2m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High error rate for service {{ $labels.service }}"
          description: "Error rate for {{ $labels.service }} is {{ $value | humanizePercentage }}"

  - name: database-health
    rules:
      # Database connection alerts

      # Per-database backends as a fraction of the server-wide connection
      # limit. `on (instance) group_left` is required because the left side
      # carries per-database labels (e.g. datname) that the max_connections
      # series does not; plain one-to-one matching would return nothing.
      - alert: DatabaseConnectionsHigh
        expr: >-
          pg_stat_database_numbackends
          / on (instance) group_left
          pg_settings_max_connections
          > 0.8
        for: 5m
        labels:
          severity: warning
          team: database
        annotations:
          summary: "High database connections for {{ $labels.datname }}"
          description: "Database {{ $labels.datname }} is using {{ $value | humanizePercentage }} of max connections"

      # NOTE(review): confirm that a metric named pg_stat_replication_lag_bytes
      # is actually exported (it may come from a custom exporter query) and
      # that it carries a `datname` label; otherwise the summary renders with
      # an empty database name.
      - alert: DatabaseReplicationLag
        expr: pg_stat_replication_lag_bytes > 1073741824  # 1 GiB
        for: 2m
        labels:
          severity: critical
          team: database
        annotations:
          summary: "High replication lag for database {{ $labels.datname }}"
          description: "Replication lag is {{ $value | humanize1024 }}B"

  - name: resource-usage
    rules:
      # Resource usage alerts

      # Memory usage as a fraction of the container's configured limit.
      # Containers without a limit report a limit of 0; dividing by that
      # yields +Inf and would fire permanently, so zero limits are dropped.
      - alert: HighMemoryUsage
        expr: >-
          container_memory_usage_bytes
          /
          (container_spec_memory_limit_bytes > 0)
          > 0.9
        for: 2m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High memory usage for container {{ $labels.name }}"
          description: "Container {{ $labels.name }} is using {{ $value | humanizePercentage }} of memory limit"

      # CPU usage as a percentage of the container's CPU limit.
      # container_spec_cpu_quota is expressed in microseconds per scheduling
      # period, so the limit in cores is quota / period. Usage is summed per
      # container first because the usage series has a `cpu` label that the
      # spec series lack.
      - alert: HighCPUUsage
        expr: >-
          sum by (instance, id, name) (rate(container_cpu_usage_seconds_total[5m]))
          / on (instance, id, name)
          (container_spec_cpu_quota / container_spec_cpu_period)
          * 100 > 80
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High CPU usage for container {{ $labels.name }}"
          description: "Container {{ $labels.name }} is using {{ $value }}% CPU"

  - name: traefik-health
    rules:
      # Traefik specific alerts

      # NOTE(review): verify that the deployed Traefik version still exports
      # traefik_config_reloads_failure_total; if not, this rule never fires.
      - alert: TraefikServiceDiscoveryFailure
        expr: increase(traefik_config_reloads_failure_total[5m]) > 0
        for: 1m
        labels:
          severity: warning
          team: infrastructure
        annotations:
          summary: "Traefik configuration reload failures"
          description: "Traefik has {{ $value }} configuration reload failures in the last 5 minutes"

      # Fires when Traefik reports a backend server as down for a minute.
      - alert: TraefikBackendDown
        expr: traefik_service_server_up == 0
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Traefik backend {{ $labels.service }} is down"
          description: "Backend server for service {{ $labels.service }} is unreachable"