# NOTE(review): the lines below were scrape residue (a commit message and
# duplicated file metadata) fused onto the top of this file; preserved here
# as comments so the YAML document parses.
#
# Commit note: Config files were previously gitignored, causing the CI/CD
# pipeline to fail because Docker would create directories instead of
# mounting the expected files.
#   - Remove config/** from .gitignore
#   - Track all config files (secrets still ignored)
# Service Health Alerts for MotoVaultPro K8s-equivalent monitoring
# These alerts mirror K8s PrometheusRule patterns for service monitoring
---
groups:
  - name: service-health
    rules:
      # Service availability alerts
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "Service {{ $labels.job }} on {{ $labels.instance }} has been down for more than 1 minute."

      - alert: HighResponseTime
        # Traefik exposes request duration as a *histogram*, not a summary:
        # a {quantile="0.95"} selector matches no series and the alert could
        # never fire. Compute the p95 from the _bucket series instead.
        expr: >-
          histogram_quantile(0.95,
            sum by (le, service) (rate(traefik_service_request_duration_seconds_bucket[5m]))) > 2
        for: 2m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High response time for service {{ $labels.service }}"
          description: "95th percentile response time for {{ $labels.service }} is {{ $value }}s"

      - alert: HighErrorRate
        # Error *ratio* (non-2xx / all requests) so the 0.1 threshold means
        # 10% and agrees with the humanizePercentage in the description; the
        # original compared an absolute req/s rate against 0.1.
        expr: >-
          sum by (service) (rate(traefik_service_requests_total{code!~"2.."}[5m]))
          / sum by (service) (rate(traefik_service_requests_total[5m])) > 0.1
        for: 2m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High error rate for service {{ $labels.service }}"
          description: "Error rate for {{ $labels.service }} is {{ $value | humanizePercentage }}"

  - name: database-health
    rules:
      # Database connection alerts
      - alert: DatabaseConnectionsHigh
        expr: pg_stat_database_numbackends / pg_settings_max_connections > 0.8
        for: 5m
        labels:
          severity: warning
          team: database
        annotations:
          summary: "High database connections for {{ $labels.datname }}"
          description: "Database {{ $labels.datname }} is using {{ $value | humanizePercentage }} of max connections"

      - alert: DatabaseReplicationLag
        expr: pg_stat_replication_lag_bytes > 1073741824  # 1 GiB
        for: 2m
        labels:
          severity: critical
          team: database
        annotations:
          summary: "High replication lag for database {{ $labels.datname }}"
          description: "Replication lag is {{ $value | humanize1024 }}B"

  - name: resource-usage
    rules:
      # Resource usage alerts
      - alert: HighMemoryUsage
        # The (limit > 0) filter drops containers with no memory limit, whose
        # limit series is 0 — the bare division would yield +Inf and fire
        # permanently for every unlimited container.
        expr: container_memory_usage_bytes / (container_spec_memory_limit_bytes > 0) > 0.9
        for: 2m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High memory usage for container {{ $labels.name }}"
          description: "Container {{ $labels.name }} is using {{ $value | humanizePercentage }} of memory limit"

      - alert: HighCPUUsage
        # container_spec_cpu_quota is microseconds of CPU per scheduling
        # period, so the core limit is quota / period; dividing by the raw
        # quota (as the original did) underestimates usage by ~100000x and
        # the alert could never fire.
        expr: >-
          rate(container_cpu_usage_seconds_total[5m])
          / (container_spec_cpu_quota / container_spec_cpu_period) * 100 > 80
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High CPU usage for container {{ $labels.name }}"
          description: "Container {{ $labels.name }} is using {{ $value }}% CPU"

  - name: traefik-health
    rules:
      # Traefik specific alerts
      - alert: TraefikServiceDiscoveryFailure
        expr: increase(traefik_config_reloads_failure_total[5m]) > 0
        for: 1m
        labels:
          severity: warning
          team: infrastructure
        annotations:
          summary: "Traefik configuration reload failures"
          description: "Traefik has {{ $value }} configuration reload failures in the last 5 minutes"

      - alert: TraefikBackendDown
        expr: traefik_service_server_up == 0
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Traefik backend {{ $labels.service }} is down"
          description: "Backend server for service {{ $labels.service }} is unreachable"