Track config files for CI/CD deployment

Config files were previously gitignored, causing the CI/CD pipeline to fail because Docker would create directories instead of mounting the expected files.

- Remove config/** from .gitignore
- Track all config files (secrets still ignored)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-18 13:28:27 -06:00
parent a991c01f64
commit 667632f54b
9 changed files with 931 additions and 5 deletions
--- a/config/monitoring/alerts/service-health.yml
+++ b/config/monitoring/alerts/service-health.yml
@@ -0,0 +1,105 @@
+# Service Health Alerts for MotoVaultPro K8s-equivalent monitoring
+# These alerts mirror K8s PrometheusRule patterns for service monitoring
+
+groups:
+  - name: service-health
+    rules:
+      # Service availability alerts
+      # Fires when the `up` series of any scrape target has been 0 for one
+      # minute (Prometheus sets `up` to 0 when a scrape fails).
+      - alert: ServiceDown
+        expr: up == 0
+        for: 1m
+        labels:
+          team: platform
+          severity: critical
+        annotations:
+          summary: 'Service {{ $labels.job }} is down'
+          description: >-
+            Service {{ $labels.job }} on {{ $labels.instance }} has been down
+            for more than 1 minute.
+
+      # Fires when the 95th-percentile request duration of a Traefik service
+      # stays above 2 seconds for two minutes.
+      # Traefik exports request duration as a histogram (`_bucket` series with
+      # an `le` label), not a summary, so there is no `quantile` label to
+      # select on; the percentile is derived with histogram_quantile() and
+      # aggregated per service so that `$labels.service` stays available.
+      - alert: HighResponseTime
+        expr: |-
+          histogram_quantile(
+            0.95,
+            sum by (le, service) (
+              rate(traefik_service_request_duration_seconds_bucket[5m])
+            )
+          ) > 2
+        for: 2m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: 'High response time for service {{ $labels.service }}'
+          description: >-
+            95th percentile response time for {{ $labels.service }} is
+            {{ $value }}s
+
+      # Fires when more than 10% of a service's requests over the last five
+      # minutes ended in a 4xx or 5xx status, sustained for two minutes.
+      # The expression is a ratio (error requests / all requests) so that the
+      # 0.1 threshold and the humanizePercentage formatting below are
+      # meaningful; a bare rate() would be requests per second instead.
+      # Informational (1xx) and redirect (3xx) responses are not counted as
+      # errors. With no traffic the ratio is NaN, which never exceeds 0.1.
+      - alert: HighErrorRate
+        expr: |-
+          sum by (service) (
+            rate(traefik_service_requests_total{code=~"[45].."}[5m])
+          )
+          /
+          sum by (service) (
+            rate(traefik_service_requests_total[5m])
+          )
+          > 0.1
+        for: 2m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: 'High error rate for service {{ $labels.service }}'
+          description: >-
+            Error rate for {{ $labels.service }} is
+            {{ $value | humanizePercentage }}
+
+  - name: database-health
+    rules:
+      # Database connection alerts
+      # Fires when a PostgreSQL server has used more than 80% of its
+      # max_connections for five minutes.
+      # max_connections is a server-wide limit, so backends are summed across
+      # all databases of an instance. Both sides are aggregated to the
+      # `instance` label so the division can match: the per-database series
+      # carry a `datname` label that pg_settings_max_connections does not
+      # have, and an unaggregated division would return no samples.
+      - alert: DatabaseConnectionsHigh
+        expr: |-
+          sum by (instance) (pg_stat_database_numbackends)
+          /
+          max by (instance) (pg_settings_max_connections)
+          > 0.8
+        for: 5m
+        labels:
+          severity: warning
+          team: database
+        annotations:
+          summary: 'High database connections on {{ $labels.instance }}'
+          description: >-
+            PostgreSQL instance {{ $labels.instance }} is using
+            {{ $value | humanizePercentage }} of max connections
+
+      # Fires when pg_stat_replication_lag_bytes stays above 1073741824 bytes
+      # (1 GiB) for two minutes.
+      # NOTE(review): presumably this metric comes from a custom exporter
+      # query; confirm that it is actually exported and that it carries the
+      # `datname` label used in the summary below.
+      - alert: DatabaseReplicationLag
+        expr: pg_stat_replication_lag_bytes > 1073741824  # 1 GiB
+        for: 2m
+        labels:
+          team: database
+          severity: critical
+        annotations:
+          summary: 'High replication lag for database {{ $labels.datname }}'
+          description: 'Replication lag is {{ $value | humanize1024 }}B'
+
+  - name: resource-usage
+    rules:
+      # Resource usage alerts
+      # Fires when a container has used more than 90% of its memory limit
+      # for two minutes.
+      # Containers without a memory limit report a limit of 0; the `> 0`
+      # filter drops those series so the division does not produce +Inf and
+      # fire for every unlimited container.
+      - alert: HighMemoryUsage
+        expr: |-
+          container_memory_usage_bytes
+          /
+          (container_spec_memory_limit_bytes > 0)
+          > 0.9
+        for: 2m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: 'High memory usage for container {{ $labels.name }}'
+          description: >-
+            Container {{ $labels.name }} is using
+            {{ $value | humanizePercentage }} of memory limit
+
+      # Fires when a container has used more than 80% of its CPU quota for
+      # five minutes.
+      # The allowed CPU (in cores) is quota / period, so usage is divided by
+      # that ratio rather than by the raw quota value. Both sides are
+      # aggregated to (instance, name): this folds any per-CPU usage series
+      # into one value per container and gives both operands identical label
+      # sets so the division can match. Series without a container name are
+      # excluded because the annotations rely on `$labels.name`.
+      - alert: HighCPUUsage
+        expr: |-
+          sum by (instance, name) (
+            rate(container_cpu_usage_seconds_total{name!=""}[5m])
+          )
+          /
+          sum by (instance, name) (
+            container_spec_cpu_quota{name!=""}
+            / container_spec_cpu_period{name!=""}
+          )
+          * 100 > 80
+        for: 5m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: 'High CPU usage for container {{ $labels.name }}'
+          description: >-
+            Container {{ $labels.name }} is using
+            {{ $value | printf "%.1f" }}% of its CPU quota
+
+  - name: traefik-health
+    rules:
+      # Traefik specific alerts
+      # Fires when traefik_config_reloads_failure_total has increased within
+      # the last five minutes and that condition holds for one minute.
+      # NOTE(review): availability of this counter may depend on the Traefik
+      # version in use; confirm it is still exported.
+      - alert: TraefikServiceDiscoveryFailure
+        expr: increase(traefik_config_reloads_failure_total[5m]) > 0
+        for: 1m
+        labels:
+          team: infrastructure
+          severity: warning
+        annotations:
+          summary: 'Traefik configuration reload failures'
+          description: >-
+            Traefik has {{ $value }} configuration reload failures in the last
+            5 minutes
+
+      # Fires when traefik_service_server_up has been 0 for one minute.
+      # NOTE(review): presumably this gauge is only populated for services
+      # with a Traefik health check configured; confirm.
+      - alert: TraefikBackendDown
+        expr: traefik_service_server_up == 0
+        for: 1m
+        labels:
+          team: platform
+          severity: critical
+        annotations:
+          summary: 'Traefik backend {{ $labels.service }} is down'
+          description: >-
+            Backend server for service {{ $labels.service }} is unreachable