Track config files for CI/CD deployment

Config files were previously gitignored, causing CI/CD pipeline to fail because Docker would create directories instead of mounting the expected files. - Remove config/** from .gitignore - Track all config files (secrets still ignored) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-18 13:28:27 -06:00
parent a991c01f64
commit 667632f54b
9 changed files with 931 additions and 5 deletions
--- a/config/monitoring/alerts/service-health.yml
+++ b/config/monitoring/alerts/service-health.yml
@@ -0,0 +1,105 @@
+# Service Health Alerts for MotoVaultPro K8s-equivalent monitoring
+# These alerts mirror K8s PrometheusRule patterns for service monitoring
+
+groups:
+  - name: service-health
+    rules:
+      # Service availability alerts
+      # Fires once a scrape target has reported up == 0 for a full minute.
+      - alert: ServiceDown
+        expr: up == 0
+        for: 1m
+        labels:
+          severity: critical
+          team: platform
+        annotations:
+          summary: "Service {{ $labels.job }} is down"
+          description: "Service {{ $labels.job }} on {{ $labels.instance }} has been down for more than 1 minute."
+
+      # p95 latency per Traefik service over the last 5 minutes.
+      # traefik_service_request_duration_seconds is a histogram, so there is
+      # no quantile="0.95" series to select; the percentile is computed from
+      # the per-bucket rates instead.
+      - alert: HighResponseTime
+        expr: |
+          histogram_quantile(
+            0.95,
+            sum by (service, le) (
+              rate(traefik_service_request_duration_seconds_bucket[5m])
+            )
+          ) > 2
+        for: 2m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: "High response time for service {{ $labels.service }}"
+          description: "95th percentile response time for {{ $labels.service }} is {{ $value }}s"
+
+      # Fraction of requests answered with a 5xx status, per Traefik service.
+      # A bare rate() is requests/second rather than a ratio, and matching
+      # code!~"2.." would also count redirects and 304s as errors. Dividing
+      # the 5xx rate by the total rate yields the 0..1 fraction that
+      # humanizePercentage in the description expects. With no traffic the
+      # result is NaN, which does not satisfy "> 0.1", so idle services
+      # stay quiet.
+      - alert: HighErrorRate
+        expr: |
+          sum by (service) (rate(traefik_service_requests_total{code=~"5.."}[5m]))
+            /
+          sum by (service) (rate(traefik_service_requests_total[5m]))
+            > 0.1
+        for: 2m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: "High error rate for service {{ $labels.service }}"
+          description: "Error rate for {{ $labels.service }} is {{ $value | humanizePercentage }}"
+
+  - name: database-health
+    rules:
+      # Database connection alerts
+      # max_connections is a server-wide limit, so backends are summed across
+      # all databases of an instance before dividing. Matching is restricted
+      # to the instance label because pg_stat_database_numbackends carries
+      # per-database labels (datname, datid) that pg_settings_max_connections
+      # does not have; without "on (instance)" the two vectors never match
+      # and the alert can never fire.
+      - alert: DatabaseConnectionsHigh
+        expr: |
+          sum by (instance) (pg_stat_database_numbackends)
+            / on (instance)
+          pg_settings_max_connections
+            > 0.8
+        for: 5m
+        labels:
+          severity: warning
+          team: database
+        annotations:
+          summary: "High database connections on {{ $labels.instance }}"
+          description: "PostgreSQL instance {{ $labels.instance }} is using {{ $value | humanizePercentage }} of max connections"
+
+      # Fires when replication lag stays above 1 GiB for 2 minutes.
+      # NOTE(review): pg_stat_replication_lag_bytes does not look like a
+      # default postgres_exporter metric -- confirm a custom query exports
+      # it and that it carries the datname label used in the summary.
+      - alert: DatabaseReplicationLag
+        expr: pg_stat_replication_lag_bytes > 1073741824  # 1 GiB (2^30 bytes)
+        for: 2m
+        labels:
+          severity: critical
+          team: database
+        annotations:
+          summary: "High replication lag for database {{ $labels.datname }}"
+          description: "Replication lag is {{ $value | humanize1024 }}B"
+
+  - name: resource-usage
+    rules:
+      # Resource usage alerts
+      # Memory usage as a fraction of the container's configured limit.
+      # Containers without a limit report container_spec_memory_limit_bytes
+      # as 0; dividing by that gives +Inf and would keep the alert firing
+      # forever, so the "> 0" filter drops those series from the denominator.
+      - alert: HighMemoryUsage
+        expr: |
+          container_memory_usage_bytes
+            / (container_spec_memory_limit_bytes > 0)
+            > 0.9
+        for: 2m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: "High memory usage for container {{ $labels.name }}"
+          description: "Container {{ $labels.name }} is using {{ $value | humanizePercentage }} of memory limit"
+
+      # CPU usage as a percentage of the container's CPU quota.
+      # - rate(container_cpu_usage_seconds_total) is in cores (CPU-seconds per
+      #   second); the allowance in cores is quota / period, both of which are
+      #   reported in microseconds, so the raw quota alone is not a valid
+      #   denominator.
+      # - The usage metric carries a "cpu" label that the spec metrics lack,
+      #   so usage is aggregated per container and matched explicitly.
+      - alert: HighCPUUsage
+        expr: |
+          sum by (instance, id, name) (rate(container_cpu_usage_seconds_total[5m]))
+            / on (instance, id, name)
+          (container_spec_cpu_quota / container_spec_cpu_period)
+            * 100 > 80
+        for: 5m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: "High CPU usage for container {{ $labels.name }}"
+          description: "Container {{ $labels.name }} is using {{ $value }}% of its CPU quota"
+
+  - name: traefik-health
+    rules:
+      # Alerts on Traefik's own state rather than on the services behind it.
+
+      # Any failed configuration reload counted within the last 5 minutes.
+      # NOTE(review): confirm the deployed Traefik version still exports
+      # traefik_config_reloads_failure_total.
+      - alert: TraefikServiceDiscoveryFailure
+        expr: 'increase(traefik_config_reloads_failure_total[5m]) > 0'
+        for: 1m
+        labels:
+          team: infrastructure
+          severity: warning
+        annotations:
+          summary: 'Traefik configuration reload failures'
+          description: 'Traefik has {{ $value }} configuration reload failures in the last 5 minutes'
+
+      # A backend server that Traefik reports as down for at least a minute.
+      - alert: TraefikBackendDown
+        expr: 'traefik_service_server_up == 0'
+        for: 1m
+        labels:
+          team: platform
+          severity: critical
+        annotations:
+          summary: 'Traefik backend {{ $labels.service }} is down'
+          description: 'Backend server for service {{ $labels.service }} is unreachable'
--- a/config/monitoring/health-checks.yml
+++ b/config/monitoring/health-checks.yml
@@ -0,0 +1,147 @@
+# Health Check Configuration for K8s-Ready Environment
+# This file defines comprehensive health check patterns that mirror
+# Kubernetes readiness, liveness, and startup probes
+
+health_checks:
+  # Application Services
+  admin-backend:
+    # Startup: 30s initial delay, 10s period, 5s timeout, 6 failures allowed.
+    startup_probe:
+      path: '/health/startup'
+      initial_delay: '30s'
+      period: '10s'
+      timeout: '5s'
+      failure_threshold: 6
+    # Readiness: 10s period, 5s timeout, 3 failures allowed.
+    readiness_probe:
+      path: '/health/ready'
+      period: '10s'
+      timeout: '5s'
+      failure_threshold: 3
+    # Liveness: 30s period, 10s timeout, 3 failures allowed.
+    liveness_probe:
+      path: '/health/live'
+      period: '30s'
+      timeout: '10s'
+      failure_threshold: 3
+
+  admin-frontend:
+    # All three probes request the site root; there is no dedicated
+    # health path configured for this service.
+    # Startup: 20s initial delay, 10s period, 5s timeout, 3 failures allowed.
+    startup_probe:
+      path: '/'
+      initial_delay: '20s'
+      period: '10s'
+      timeout: '5s'
+      failure_threshold: 3
+    # Readiness: 15s period, 5s timeout, 2 failures allowed.
+    readiness_probe:
+      path: '/'
+      period: '15s'
+      timeout: '5s'
+      failure_threshold: 2
+    # Liveness: 30s period, 10s timeout, 3 failures allowed.
+    liveness_probe:
+      path: '/'
+      period: '30s'
+      timeout: '10s'
+      failure_threshold: 3
+
+  # Platform Services
+  mvp-platform-vehicles-api:
+    # Startup: 30s initial delay, 10s period, 5s timeout, 6 failures allowed.
+    startup_probe:
+      path: '/health/startup'
+      initial_delay: '30s'
+      period: '10s'
+      timeout: '5s'
+      failure_threshold: 6
+    # Readiness: 10s period, 5s timeout, 3 failures allowed.
+    readiness_probe:
+      path: '/health/ready'
+      period: '10s'
+      timeout: '5s'
+      failure_threshold: 3
+    # Liveness: 30s period, 10s timeout, 3 failures allowed.
+    liveness_probe:
+      path: '/health/live'
+      period: '30s'
+      timeout: '10s'
+      failure_threshold: 3
+
+  mvp-platform-tenants:
+    # Startup: 30s initial delay, 10s period, 5s timeout, 6 failures allowed.
+    startup_probe:
+      path: '/health/startup'
+      initial_delay: '30s'
+      period: '10s'
+      timeout: '5s'
+      failure_threshold: 6
+    # Readiness: 10s period, 5s timeout, 3 failures allowed.
+    readiness_probe:
+      path: '/health/ready'
+      period: '10s'
+      timeout: '5s'
+      failure_threshold: 3
+    # Liveness: 30s period, 10s timeout, 3 failures allowed.
+    liveness_probe:
+      path: '/health/live'
+      period: '30s'
+      timeout: '10s'
+      failure_threshold: 3
+
+  mvp-platform-landing:
+    # All three probes request the site root; there is no dedicated
+    # health path configured for this service.
+    # Startup: 20s initial delay, 10s period, 5s timeout, 3 failures allowed.
+    startup_probe:
+      path: '/'
+      initial_delay: '20s'
+      period: '10s'
+      timeout: '5s'
+      failure_threshold: 3
+    # Readiness: 15s period, 5s timeout, 2 failures allowed.
+    readiness_probe:
+      path: '/'
+      period: '15s'
+      timeout: '5s'
+      failure_threshold: 2
+    # Liveness: 30s period, 10s timeout, 3 failures allowed.
+    liveness_probe:
+      path: '/'
+      period: '30s'
+      timeout: '10s'
+      failure_threshold: 3
+
+# Monitoring Endpoints
+monitoring:
+  metrics:
+    # Traefik's metrics endpoint, given as a full URL.
+    traefik:
+      endpoint: 'http://localhost:8080/metrics'
+      format: 'prometheus'
+    # Application services: metrics path plus the port it is served on.
+    services:
+      admin-backend:
+        endpoint: '/metrics'
+        port: 3001
+      mvp-platform-vehicles-api:
+        endpoint: '/metrics'
+        port: 8000
+      mvp-platform-tenants:
+        endpoint: '/metrics'
+        port: 8000
+
+  # JSON logs at INFO level, written to the console and to a log file.
+  logging:
+    level: 'INFO'
+    format: 'json'
+    destinations:
+      - 'console'
+      - 'file:/var/log/app/application.log'
+
+# Service Discovery Validation
+service_discovery:
+  # Service names expected to be registered, each with the @docker suffix.
+  expected_services:
+    - 'admin-app@docker'
+    - 'admin-api@docker'
+    - 'landing@docker'
+    - 'vehicles-api@docker'
+    - 'tenants-api@docker'
+  # Router rules expected to exist.
+  # NOTE(review): if these strings are compared against live Traefik router
+  # rules, confirm the expected quoting of the host/path arguments matches
+  # what Traefik reports.
+  expected_routes:
+    - 'Host(admin.motovaultpro.com)'
+    - 'Host(motovaultpro.com)'
+    - 'PathPrefix(/api/platform/vehicles)'
+    - 'PathPrefix(/api/platform/tenants)'
+    - 'PathPrefix(/api)'
+
+# Performance Thresholds (K8s-ready SLOs)
+performance:
+  # Latency objective: 500ms target, 2000ms upper bound.
+  response_time:
+    target: '500ms'
+    max: '2000ms'
+  # Availability objective: 99.9% target, 99.0% lower bound.
+  availability:
+    target: '99.9%'
+    min: '99.0%'
+  # Error-rate objective: 0.1% target, 1.0% upper bound.
+  error_rate:
+    target: '0.1%'
+    max: '1.0%'
--- a/config/monitoring/prometheus.yml
+++ b/config/monitoring/prometheus.yml
@@ -0,0 +1,105 @@
+# Prometheus Configuration for MotoVaultPro K8s-equivalent monitoring
+# This configuration mirrors K8s ServiceMonitor and PodMonitor patterns
+
+global:
+  # Rules are evaluated and targets scraped every 15s unless a job
+  # sets its own scrape_interval.
+  evaluation_interval: 15s
+  scrape_interval: 15s
+  # Labels identifying this Prometheus server as the dev environment.
+  external_labels:
+    environment: "development"
+    cluster: "motovaultpro-dev"
+
+# Alerting rule files (K8s PrometheusRule equivalent); the glob picks up
+# every .yml file in the alerts/ directory.
+rule_files:
+  - 'alerts/*.yml'
+
+# Scrape configurations (K8s ServiceMonitor equivalent)
+scrape_configs:
+  # Traefik metrics (infrastructure monitoring), scraped every 15s.
+  - job_name: "traefik"
+    metrics_path: "/metrics"
+    scrape_interval: 15s
+    scrape_timeout: 10s
+    static_configs:
+      - targets:
+          - "traefik:8080"
+
+  # Application backend metrics, scraped every 30s.
+  - job_name: "admin-backend"
+    metrics_path: "/metrics"
+    scrape_interval: 30s
+    scrape_timeout: 10s
+    static_configs:
+      - targets:
+          - "admin-backend:3001"
+
+  # Platform services metrics: both APIs listen on port 8000 and are
+  # scraped every 30s.
+  - job_name: "platform-vehicles-api"
+    metrics_path: "/metrics"
+    scrape_interval: 30s
+    scrape_timeout: 10s
+    static_configs:
+      - targets:
+          - "mvp-platform-vehicles-api:8000"
+
+  - job_name: "platform-tenants-api"
+    metrics_path: "/metrics"
+    scrape_interval: 30s
+    scrape_timeout: 10s
+    static_configs:
+      - targets:
+          - "mvp-platform-tenants:8000"
+
+  # Database monitoring (PostgreSQL exporter equivalent), scraped every 60s.
+  # NOTE(review): port 5432 is PostgreSQL's default client port, and
+  # Prometheus scrapes over HTTP. Confirm that an exporter (postgres_exporter
+  # defaults to 9187) actually answers at these addresses; if they are the
+  # database servers themselves, every scrape fails, `up` stays 0 and the
+  # ServiceDown alert fires for these jobs.
+  - job_name: "postgres-app"
+    metrics_path: "/metrics"
+    scrape_interval: 60s
+    scrape_timeout: 15s
+    static_configs:
+      - targets:
+          - "admin-postgres:5432"
+
+  - job_name: "postgres-platform"
+    metrics_path: "/metrics"
+    scrape_interval: 60s
+    scrape_timeout: 15s
+    static_configs:
+      - targets:
+          - "platform-postgres:5432"
+
+  - job_name: "postgres-vehicles"
+    metrics_path: "/metrics"
+    scrape_interval: 60s
+    scrape_timeout: 15s
+    static_configs:
+      - targets:
+          - "mvp-platform-vehicles-db:5432"
+
+  # Redis monitoring, scraped every 60s.
+  # NOTE(review): port 6379 is Redis's default client port, and Prometheus
+  # scrapes over HTTP. Confirm that an exporter (redis_exporter defaults to
+  # 9121) actually answers at these addresses; if they are the Redis servers
+  # themselves, every scrape fails, `up` stays 0 and the ServiceDown alert
+  # fires for these jobs.
+  - job_name: "redis-app"
+    metrics_path: "/metrics"
+    scrape_interval: 60s
+    scrape_timeout: 15s
+    static_configs:
+      - targets:
+          - "admin-redis:6379"
+
+  - job_name: "redis-platform"
+    metrics_path: "/metrics"
+    scrape_interval: 60s
+    scrape_timeout: 15s
+    static_configs:
+      - targets:
+          - "platform-redis:6379"
+
+  - job_name: "redis-vehicles"
+    metrics_path: "/metrics"
+    scrape_interval: 60s
+    scrape_timeout: 15s
+    static_configs:
+      - targets:
+          - "mvp-platform-vehicles-redis:6379"
+
+  # MinIO monitoring via its cluster metrics path, scraped every 60s.
+  # NOTE(review): no credentials are configured for this job -- confirm the
+  # MinIO metrics endpoint is set up to allow unauthenticated scrapes.
+  - job_name: "minio"
+    metrics_path: "/minio/v2/metrics/cluster"
+    scrape_interval: 60s
+    scrape_timeout: 15s
+    static_configs:
+      - targets:
+          - "admin-minio:9000"
+
+# Alertmanager configuration (K8s Alertmanager equivalent): alerts are
+# sent to a single statically configured Alertmanager.
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ["alertmanager:9093"]