Track config files for CI/CD deployment
Config files were previously gitignored, causing CI/CD pipeline to fail because Docker would create directories instead of mounting the expected files. - Remove config/** from .gitignore - Track all config files (secrets still ignored) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
105
config/monitoring/alerts/service-health.yml
Executable file
105
config/monitoring/alerts/service-health.yml
Executable file
@@ -0,0 +1,105 @@
|
||||
# Service Health Alerts for MotoVaultPro K8s-equivalent monitoring
|
||||
# These alerts mirror K8s PrometheusRule patterns for service monitoring
|
||||
|
||||
groups:
|
||||
- name: service-health
  rules:
  # Service availability: any scrape target whose `up` series has been 0
  # for a full minute.
  - alert: ServiceDown
    expr: up == 0
    for: 1m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "Service {{ $labels.job }} is down"
      description: "Service {{ $labels.job }} on {{ $labels.instance }} has been down for more than 1 minute."

  # p95 latency per Traefik service, above 2 seconds for 2 minutes.
  # Traefik exports request duration as a histogram (`_bucket` series with
  # an `le` label), not as a summary with a `quantile` label, so the
  # percentile has to be derived with histogram_quantile(); selecting
  # {quantile="0.95"} matches no series and the alert can never fire.
  - alert: HighResponseTime
    expr: >-
      histogram_quantile(0.95,
      sum by (le, service) (rate(traefik_service_request_duration_seconds_bucket[5m])))
      > 2
    for: 2m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "High response time for service {{ $labels.service }}"
      description: "95th percentile response time for {{ $labels.service }} is {{ $value }}s"

  # Share of requests per service answered with a 4xx/5xx status, above 10%
  # for 2 minutes. The value is a ratio (errors / all requests) so that the
  # 0.1 threshold and the humanizePercentage formatting in the description
  # agree; a bare rate() would be requests per second. 1xx and 3xx responses
  # are not counted as errors.
  - alert: HighErrorRate
    expr: >-
      sum by (service) (rate(traefik_service_requests_total{code=~"[45].."}[5m]))
      /
      sum by (service) (rate(traefik_service_requests_total[5m]))
      > 0.1
    for: 2m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "High error rate for service {{ $labels.service }}"
      description: "Error rate for {{ $labels.service }} is {{ $value | humanizePercentage }}"
|
||||
|
||||
- name: database-health
  rules:
  # Connections held by one database as a fraction of the server-wide
  # max_connections setting, above 80% for 5 minutes.
  # The numerator carries per-database labels (e.g. `datname`) that the
  # settings metric does not have, so the default one-to-one matching finds
  # no pairs. `on (instance) group_left` matches on the instance only and
  # keeps the left-hand labels, which the annotations rely on.
  # NOTE(review): the limit is per server, so the sum over all databases may
  # be the better signal — confirm the intended semantics.
  - alert: DatabaseConnectionsHigh
    expr: >-
      pg_stat_database_numbackends
      / on (instance) group_left
      pg_settings_max_connections
      > 0.8
    for: 5m
    labels:
      severity: warning
      team: database
    annotations:
      summary: "High database connections for {{ $labels.datname }}"
      description: "Database {{ $labels.datname }} is using {{ $value | humanizePercentage }} of max connections"

  # Replication lag above 1073741824 bytes (1 GiB) for 2 minutes.
  # NOTE(review): confirm the deployed exporter actually exposes
  # pg_stat_replication_lag_bytes and that it carries a `datname` label;
  # neither can be verified from this file.
  - alert: DatabaseReplicationLag
    expr: pg_stat_replication_lag_bytes > 1073741824
    for: 2m
    labels:
      severity: critical
      team: database
    annotations:
      summary: "High replication lag for database {{ $labels.datname }}"
      description: "Replication lag is {{ $value | humanize1024 }}B"
|
||||
|
||||
- name: resource-usage
  rules:
  # Memory in use as a fraction of the container's limit, above 90% for
  # 2 minutes. Containers without a limit report a limit of 0; dividing by
  # that yields +Inf and would fire for every unlimited container, so those
  # series are filtered out of the denominator first.
  - alert: HighMemoryUsage
    expr: >-
      container_memory_usage_bytes
      / (container_spec_memory_limit_bytes > 0)
      > 0.9
    for: 2m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "High memory usage for container {{ $labels.name }}"
      description: "Container {{ $labels.name }} is using {{ $value | humanizePercentage }} of memory limit"

  # CPU in use as a percentage of the container's CPU allowance, above 80%
  # for 5 minutes. The allowance in cores is quota / period (both are in
  # microseconds); dividing by the raw quota understates usage by several
  # orders of magnitude. Both sides are aggregated to (instance, name) so
  # the extra `cpu` label on the usage metric cannot break vector matching.
  - alert: HighCPUUsage
    expr: >-
      sum by (instance, name) (rate(container_cpu_usage_seconds_total[5m]))
      /
      sum by (instance, name) (container_spec_cpu_quota / container_spec_cpu_period)
      * 100 > 80
    for: 5m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "High CPU usage for container {{ $labels.name }}"
      description: "Container {{ $labels.name }} is using {{ $value }}% CPU"
|
||||
|
||||
- name: traefik-health
  rules:
  # At least one failed Traefik configuration reload within the trailing
  # 5-minute window, sustained for 1 minute.
  - alert: TraefikServiceDiscoveryFailure
    expr: increase(traefik_config_reloads_failure_total[5m]) > 0
    for: 1m
    labels:
      team: infrastructure
      severity: warning
    annotations:
      summary: "Traefik configuration reload failures"
      description: "Traefik has {{ $value }} configuration reload failures in the last 5 minutes"

  # A backend server that Traefik has reported as not up (gauge value 0)
  # for 1 minute.
  - alert: TraefikBackendDown
    expr: traefik_service_server_up == 0
    for: 1m
    labels:
      team: platform
      severity: critical
    annotations:
      summary: "Traefik backend {{ $labels.service }} is down"
      description: "Backend server for service {{ $labels.service }} is unreachable"
|
||||
147
config/monitoring/health-checks.yml
Executable file
147
config/monitoring/health-checks.yml
Executable file
@@ -0,0 +1,147 @@
|
||||
# Health Check Configuration for K8s-Ready Environment
|
||||
# This file defines comprehensive health check patterns that mirror
|
||||
# Kubernetes readiness, liveness, and startup probes
|
||||
|
||||
health_checks:
  # --- Application services -------------------------------------------------

  # API backend: dedicated /health/* endpoints, one per probe type.
  admin-backend:
    startup_probe:
      path: /health/startup
      initial_delay: 30s
      period: 10s
      timeout: 5s
      failure_threshold: 6
    readiness_probe:
      path: /health/ready
      period: 10s
      timeout: 5s
      failure_threshold: 3
    liveness_probe:
      path: /health/live
      period: 30s
      timeout: 10s
      failure_threshold: 3

  # Front end: every probe requests the root path.
  admin-frontend:
    startup_probe:
      path: /
      initial_delay: 20s
      period: 10s
      timeout: 5s
      failure_threshold: 3
    readiness_probe:
      path: /
      period: 15s
      timeout: 5s
      failure_threshold: 2
    liveness_probe:
      path: /
      period: 30s
      timeout: 10s
      failure_threshold: 3

  # --- Platform services ----------------------------------------------------

  # Same probe settings as admin-backend.
  mvp-platform-vehicles-api:
    startup_probe:
      path: /health/startup
      initial_delay: 30s
      period: 10s
      timeout: 5s
      failure_threshold: 6
    readiness_probe:
      path: /health/ready
      period: 10s
      timeout: 5s
      failure_threshold: 3
    liveness_probe:
      path: /health/live
      period: 30s
      timeout: 10s
      failure_threshold: 3

  # Same probe settings as admin-backend.
  mvp-platform-tenants:
    startup_probe:
      path: /health/startup
      initial_delay: 30s
      period: 10s
      timeout: 5s
      failure_threshold: 6
    readiness_probe:
      path: /health/ready
      period: 10s
      timeout: 5s
      failure_threshold: 3
    liveness_probe:
      path: /health/live
      period: 30s
      timeout: 10s
      failure_threshold: 3

  # Same probe settings as admin-frontend.
  mvp-platform-landing:
    startup_probe:
      path: /
      initial_delay: 20s
      period: 10s
      timeout: 5s
      failure_threshold: 3
    readiness_probe:
      path: /
      period: 15s
      timeout: 5s
      failure_threshold: 2
    liveness_probe:
      path: /
      period: 30s
      timeout: 10s
      failure_threshold: 3
|
||||
|
||||
# Monitoring Endpoints
|
||||
# Metrics endpoints for Traefik and the application services, followed by
# the logging settings.
monitoring:
|
||||
metrics:
|
||||
# Traefik: full metrics URL and the exposition format.
traefik:
|
||||
endpoint: http://localhost:8080/metrics
|
||||
format: prometheus
|
||||
# Per-service entries: metrics path plus the port it is served on.
services:
|
||||
admin-backend:
|
||||
endpoint: /metrics
|
||||
port: 3001
|
||||
mvp-platform-vehicles-api:
|
||||
endpoint: /metrics
|
||||
port: 8000
|
||||
mvp-platform-tenants:
|
||||
endpoint: /metrics
|
||||
port: 8000
|
||||
|
||||
# Log level, output format and the list of log destinations.
logging:
|
||||
level: INFO
|
||||
format: json
|
||||
# Two sinks: the console and a `file:`-prefixed log path.
destinations:
|
||||
- console
|
||||
- file:/var/log/app/application.log
|
||||
|
||||
# Service Discovery Validation
|
||||
service_discovery:
  # Service names expected to be registered; each carries an `@docker`
  # suffix (presumably Traefik's provider-qualified name — confirm).
  expected_services:
    - 'admin-app@docker'
    - 'admin-api@docker'
    - 'landing@docker'
    - 'vehicles-api@docker'
    - 'tenants-api@docker'

  # Router rules expected to exist.
  # NOTE(review): Traefik's own rule syntax wraps matcher arguments in
  # backticks, e.g. Host(`example.com`); these entries have none. Confirm
  # how the consumer of this list compares them against live rules.
  expected_routes:
    - "Host(admin.motovaultpro.com)"
    - "Host(motovaultpro.com)"
    - "PathPrefix(/api/platform/vehicles)"
    - "PathPrefix(/api/platform/tenants)"
    - "PathPrefix(/api)"
|
||||
|
||||
# Performance Thresholds (K8s-ready SLOs)
|
||||
performance:
  # All values are strings with their unit attached; they are quoted so no
  # parser tries to read them as numbers.

  # Latency objective and hard ceiling.
  response_time:
    target: '500ms'
    max: '2000ms'

  # Availability objective and lowest acceptable level.
  availability:
    target: '99.9%'
    min: '99.0%'

  # Error-rate objective and highest acceptable level.
  error_rate:
    target: '0.1%'
    max: '1.0%'
|
||||
105
config/monitoring/prometheus.yml
Executable file
105
config/monitoring/prometheus.yml
Executable file
@@ -0,0 +1,105 @@
|
||||
# Prometheus Configuration for MotoVaultPro K8s-equivalent monitoring
|
||||
# This configuration mirrors K8s ServiceMonitor and PodMonitor patterns
|
||||
|
||||
global:
  # Default scrape frequency; individual jobs below may override it.
  scrape_interval: 15s
  # How often rule files are evaluated.
  evaluation_interval: 15s
  # Labels identifying this Prometheus instance as the development cluster.
  external_labels:
    environment: 'development'
    cluster: 'motovaultpro-dev'
|
||||
|
||||
# Rule files for alerting (K8s PrometheusRule equivalent)
|
||||
rule_files:
  # Glob: every .yml file in the alerts/ directory is loaded as a rule file.
  - 'alerts/*.yml'
|
||||
|
||||
# Scrape configurations (K8s ServiceMonitor equivalent)
|
||||
scrape_configs:
  # --- Infrastructure -------------------------------------------------------

  # Traefik's own metrics endpoint.
  - job_name: 'traefik'
    metrics_path: '/metrics'
    scrape_interval: 15s
    scrape_timeout: 10s
    static_configs:
      - targets:
          - 'traefik:8080'

  # --- Application and platform services ------------------------------------

  - job_name: 'admin-backend'
    metrics_path: '/metrics'
    scrape_interval: 30s
    scrape_timeout: 10s
    static_configs:
      - targets:
          - 'admin-backend:3001'

  - job_name: 'platform-vehicles-api'
    metrics_path: '/metrics'
    scrape_interval: 30s
    scrape_timeout: 10s
    static_configs:
      - targets:
          - 'mvp-platform-vehicles-api:8000'

  - job_name: 'platform-tenants-api'
    metrics_path: '/metrics'
    scrape_interval: 30s
    scrape_timeout: 10s
    static_configs:
      - targets:
          - 'mvp-platform-tenants:8000'

  # --- PostgreSQL -----------------------------------------------------------
  # NOTE(review): 5432 is the conventional PostgreSQL wire-protocol port, and
  # Prometheus scrapes over HTTP. Unless something on these hosts serves
  # /metrics on 5432, the three jobs below will report up == 0 permanently;
  # they presumably need to target a postgres_exporter instead — confirm.

  - job_name: 'postgres-app'
    metrics_path: '/metrics'
    scrape_interval: 60s
    scrape_timeout: 15s
    static_configs:
      - targets:
          - 'admin-postgres:5432'

  - job_name: 'postgres-platform'
    metrics_path: '/metrics'
    scrape_interval: 60s
    scrape_timeout: 15s
    static_configs:
      - targets:
          - 'platform-postgres:5432'

  - job_name: 'postgres-vehicles'
    metrics_path: '/metrics'
    scrape_interval: 60s
    scrape_timeout: 15s
    static_configs:
      - targets:
          - 'mvp-platform-vehicles-db:5432'

  # --- Redis ----------------------------------------------------------------
  # NOTE(review): 6379 is the conventional Redis protocol port, not an HTTP
  # metrics endpoint; same concern as the PostgreSQL jobs above. These
  # presumably need to target a redis_exporter instead — confirm.

  - job_name: 'redis-app'
    metrics_path: '/metrics'
    scrape_interval: 60s
    scrape_timeout: 15s
    static_configs:
      - targets:
          - 'admin-redis:6379'

  - job_name: 'redis-platform'
    metrics_path: '/metrics'
    scrape_interval: 60s
    scrape_timeout: 15s
    static_configs:
      - targets:
          - 'platform-redis:6379'

  - job_name: 'redis-vehicles'
    metrics_path: '/metrics'
    scrape_interval: 60s
    scrape_timeout: 15s
    static_configs:
      - targets:
          - 'mvp-platform-vehicles-redis:6379'

  # --- MinIO ----------------------------------------------------------------
  # Scrapes MinIO's cluster-level metrics path rather than /metrics.
  # NOTE(review): no credentials are configured for this job; confirm the
  # MinIO server allows unauthenticated metrics access.
  - job_name: 'minio'
    metrics_path: '/minio/v2/metrics/cluster'
    scrape_interval: 60s
    scrape_timeout: 15s
    static_configs:
      - targets:
          - 'admin-minio:9000'
|
||||
|
||||
# Alertmanager configuration (K8s Alertmanager equivalent)
|
||||
alerting:
  # Single, statically configured Alertmanager that receives fired alerts.
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']
|
||||
Reference in New Issue
Block a user