chore: unify health check timers across compose and workflows
Some checks failed
Deploy to Staging / Build Images (push) Successful in 32s
Deploy to Staging / Deploy to Staging (push) Successful in 21s
Deploy to Staging / Verify Staging (push) Failing after 1m18s
Deploy to Staging / Notify Staging Ready (push) Has been skipped
Deploy to Staging / Notify Staging Failure (push) Successful in 7s

Docker Compose health checks (all services):
- interval: 5s (was 10-30s)
- timeout: 5s (unified)
- backend start_period: 60s (was 30-180s)

Gitea workflow health check loops:
- Docker healthcheck: 48 attempts x 5s = 4 min (was 24 x 10s)
- Backend health: 12 attempts x 5s = 60s (was 6 x 10s in staging, 24 x 5s in production)
- External health: 12 attempts x 5s = 60s (was 6 x 10s in staging, 24 attempts with 5-10s sleeps in production)
- Initial waits: 5s (was 10-15s)

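Same total wait times for the Docker healthcheck loop and the staging backend/external loops, with faster detection of success/failure. Note: the production backend/external loops and the initial waits now have a shorter total wait than before.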

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-03 21:10:47 -06:00
parent 88db25019f
commit 26196d34ea
4 changed files with 60 additions and 59 deletions

View File

@@ -174,7 +174,7 @@ jobs:
mvp-ocr mvp-frontend-$TARGET_STACK mvp-backend-$TARGET_STACK mvp-ocr mvp-frontend-$TARGET_STACK mvp-backend-$TARGET_STACK
- name: Wait for stack initialization - name: Wait for stack initialization
run: sleep 10 run: sleep 5
- name: Verify container images - name: Verify container images
run: | run: |
@@ -272,8 +272,8 @@ jobs:
# Check if service has a healthcheck defined # Check if service has a healthcheck defined
has_healthcheck=$(docker inspect --format='{{if .Config.Healthcheck}}true{{else}}false{{end}}' $service 2>/dev/null || echo "false") has_healthcheck=$(docker inspect --format='{{if .Config.Healthcheck}}true{{else}}false{{end}}' $service 2>/dev/null || echo "false")
if [ "$has_healthcheck" = "true" ]; then if [ "$has_healthcheck" = "true" ]; then
# 24 attempts x 10 seconds = 4 minutes max wait (backend with fresh migrations can take ~3 min) # 48 attempts x 5 seconds = 4 minutes max wait (backend with fresh migrations can take ~3 min)
for i in $(seq 1 24); do for i in $(seq 1 48); do
health=$(docker inspect --format='{{.State.Health.Status}}' $service 2>/dev/null || echo "unknown") health=$(docker inspect --format='{{.State.Health.Status}}' $service 2>/dev/null || echo "unknown")
if [ "$health" = "healthy" ]; then if [ "$health" = "healthy" ]; then
echo "OK: $service is healthy" echo "OK: $service is healthy"
@@ -283,13 +283,13 @@ jobs:
docker logs $service --tail 50 2>/dev/null || true docker logs $service --tail 50 2>/dev/null || true
exit 1 exit 1
fi fi
if [ $i -eq 24 ]; then if [ $i -eq 48 ]; then
echo "ERROR: $service health check timed out (status: $health)" echo "ERROR: $service health check timed out (status: $health)"
docker logs $service --tail 50 2>/dev/null || true docker logs $service --tail 50 2>/dev/null || true
exit 1 exit 1
fi fi
echo "Waiting for $service healthcheck... (attempt $i/24, status: $health)" echo "Waiting for $service healthcheck... (attempt $i/48, status: $health)"
sleep 10 sleep 5
done done
else else
echo "SKIP: $service has no healthcheck defined" echo "SKIP: $service has no healthcheck defined"
@@ -298,17 +298,17 @@ jobs:
- name: Wait for backend health - name: Wait for backend health
run: | run: |
for i in $(seq 1 24); do for i in $(seq 1 12); do
if docker exec mvp-backend-$TARGET_STACK curl -sf http://localhost:3001/health > /dev/null 2>&1; then if docker exec mvp-backend-$TARGET_STACK curl -sf http://localhost:3001/health > /dev/null 2>&1; then
echo "OK: Backend health check passed" echo "OK: Backend health check passed"
exit 0 exit 0
fi fi
if [ $i -eq 24 ]; then if [ $i -eq 12 ]; then
echo "ERROR: Backend health check failed after 6 attempts" echo "ERROR: Backend health check failed after 12 attempts"
docker logs mvp-backend-$TARGET_STACK --tail 100 docker logs mvp-backend-$TARGET_STACK --tail 100
exit 1 exit 1
fi fi
echo "Attempt $i/24: Backend not ready, waiting 5s..." echo "Attempt $i/12: Backend not ready, waiting 5s..."
sleep 5 sleep 5
done done
@@ -316,9 +316,9 @@ jobs:
run: | run: |
REQUIRED_FEATURES='["admin","auth","onboarding","vehicles","documents","fuel-logs","stations","maintenance","platform","notifications","user-profile","user-preferences","user-export"]' REQUIRED_FEATURES='["admin","auth","onboarding","vehicles","documents","fuel-logs","stations","maintenance","platform","notifications","user-profile","user-preferences","user-export"]'
for i in $(seq 1 24); do for i in $(seq 1 12); do
RESPONSE=$(curl -sf https://motovaultpro.com/api/health 2>/dev/null) || { RESPONSE=$(curl -sf https://motovaultpro.com/api/health 2>/dev/null) || {
echo "Attempt $i/6: Connection failed, waiting 5s..." echo "Attempt $i/12: Connection failed, waiting 5s..."
sleep 5 sleep 5
continue continue
} }
@@ -326,8 +326,8 @@ jobs:
# Check status is "healthy" # Check status is "healthy"
STATUS=$(echo "$RESPONSE" | jq -r '.status') STATUS=$(echo "$RESPONSE" | jq -r '.status')
if [ "$STATUS" != "healthy" ]; then if [ "$STATUS" != "healthy" ]; then
echo "Attempt $i/6: Status is '$STATUS', not 'healthy'. Waiting 10s..." echo "Attempt $i/12: Status is '$STATUS', not 'healthy'. Waiting 5s..."
sleep 10 sleep 5
continue continue
fi fi
@@ -337,8 +337,8 @@ jobs:
') ')
if [ -n "$MISSING" ]; then if [ -n "$MISSING" ]; then
echo "Attempt $i/6: Missing features: $MISSING. Waiting 10s..." echo "Attempt $i/12: Missing features: $MISSING. Waiting 5s..."
sleep 10 sleep 5
continue continue
fi fi
@@ -347,7 +347,7 @@ jobs:
exit 0 exit 0
done done
echo "ERROR: Production health check failed after 6 attempts" echo "ERROR: Production health check failed after 12 attempts"
echo "Last response: $RESPONSE" echo "Last response: $RESPONSE"
exit 1 exit 1

View File

@@ -173,7 +173,7 @@ jobs:
docker compose -f $COMPOSE_FILE -f $COMPOSE_STAGING up -d docker compose -f $COMPOSE_FILE -f $COMPOSE_STAGING up -d
- name: Wait for services - name: Wait for services
run: sleep 15 run: sleep 5
# ============================================ # ============================================
# VERIFY STAGING - Health checks # VERIFY STAGING - Health checks
@@ -205,8 +205,8 @@ jobs:
# Check if service has a healthcheck defined # Check if service has a healthcheck defined
has_healthcheck=$(docker inspect --format='{{if .Config.Healthcheck}}true{{else}}false{{end}}' $service 2>/dev/null || echo "false") has_healthcheck=$(docker inspect --format='{{if .Config.Healthcheck}}true{{else}}false{{end}}' $service 2>/dev/null || echo "false")
if [ "$has_healthcheck" = "true" ]; then if [ "$has_healthcheck" = "true" ]; then
# 24 attempts x 10 seconds = 4 minutes max wait (backend with fresh migrations can take ~3 min) # 48 attempts x 5 seconds = 4 minutes max wait (backend with fresh migrations can take ~3 min)
for i in $(seq 1 24); do for i in $(seq 1 48); do
health=$(docker inspect --format='{{.State.Health.Status}}' $service 2>/dev/null || echo "unknown") health=$(docker inspect --format='{{.State.Health.Status}}' $service 2>/dev/null || echo "unknown")
if [ "$health" = "healthy" ]; then if [ "$health" = "healthy" ]; then
echo "OK: $service is healthy" echo "OK: $service is healthy"
@@ -216,13 +216,13 @@ jobs:
docker logs $service --tail 50 2>/dev/null || true docker logs $service --tail 50 2>/dev/null || true
exit 1 exit 1
fi fi
if [ $i -eq 24 ]; then if [ $i -eq 48 ]; then
echo "ERROR: $service health check timed out (status: $health)" echo "ERROR: $service health check timed out (status: $health)"
docker logs $service --tail 50 2>/dev/null || true docker logs $service --tail 50 2>/dev/null || true
exit 1 exit 1
fi fi
echo "Waiting for $service healthcheck... (attempt $i/24, status: $health)" echo "Waiting for $service healthcheck... (attempt $i/48, status: $health)"
sleep 10 sleep 5
done done
else else
echo "SKIP: $service has no healthcheck defined" echo "SKIP: $service has no healthcheck defined"
@@ -231,36 +231,36 @@ jobs:
- name: Wait for backend health - name: Wait for backend health
run: | run: |
for i in 1 2 3 4 5 6; do for i in $(seq 1 12); do
if docker exec mvp-backend-staging curl -sf http://localhost:3001/health > /dev/null 2>&1; then if docker exec mvp-backend-staging curl -sf http://localhost:3001/health > /dev/null 2>&1; then
echo "OK: Backend health check passed" echo "OK: Backend health check passed"
exit 0 exit 0
fi fi
if [ $i -eq 6 ]; then if [ $i -eq 12 ]; then
echo "ERROR: Backend health check failed after 6 attempts" echo "ERROR: Backend health check failed after 12 attempts"
docker logs mvp-backend-staging --tail 100 docker logs mvp-backend-staging --tail 100
exit 1 exit 1
fi fi
echo "Attempt $i/6: Backend not ready, waiting 10s..." echo "Attempt $i/12: Backend not ready, waiting 5s..."
sleep 10 sleep 5
done done
- name: Check external endpoint - name: Check external endpoint
run: | run: |
REQUIRED_FEATURES='["admin","auth","onboarding","vehicles","documents","fuel-logs","stations","maintenance","platform","notifications","user-profile","user-preferences","user-export"]' REQUIRED_FEATURES='["admin","auth","onboarding","vehicles","documents","fuel-logs","stations","maintenance","platform","notifications","user-profile","user-preferences","user-export"]'
for i in 1 2 3 4 5 6; do for i in $(seq 1 12); do
RESPONSE=$(curl -sf https://staging.motovaultpro.com/api/health 2>/dev/null) || { RESPONSE=$(curl -sf https://staging.motovaultpro.com/api/health 2>/dev/null) || {
echo "Attempt $i/6: Connection failed, waiting 10s..." echo "Attempt $i/12: Connection failed, waiting 5s..."
sleep 10 sleep 5
continue continue
} }
# Check status is "healthy" # Check status is "healthy"
STATUS=$(echo "$RESPONSE" | jq -r '.status') STATUS=$(echo "$RESPONSE" | jq -r '.status')
if [ "$STATUS" != "healthy" ]; then if [ "$STATUS" != "healthy" ]; then
echo "Attempt $i/6: Status is '$STATUS', not 'healthy'. Waiting 10s..." echo "Attempt $i/12: Status is '$STATUS', not 'healthy'. Waiting 5s..."
sleep 10 sleep 5
continue continue
fi fi
@@ -270,8 +270,8 @@ jobs:
') ')
if [ -n "$MISSING" ]; then if [ -n "$MISSING" ]; then
echo "Attempt $i/6: Missing features: $MISSING. Waiting 10s..." echo "Attempt $i/12: Missing features: $MISSING. Waiting 5s..."
sleep 10 sleep 5
continue continue
fi fi
@@ -280,7 +280,7 @@ jobs:
exit 0 exit 0
done done
echo "ERROR: Staging health check failed after 6 attempts" echo "ERROR: Staging health check failed after 12 attempts"
echo "Last response: $RESPONSE" echo "Last response: $RESPONSE"
exit 1 exit 1

View File

@@ -34,7 +34,7 @@ services:
- mvp-backend-blue - mvp-backend-blue
healthcheck: healthcheck:
test: ["CMD-SHELL", "curl -sf http://localhost:3000 || exit 1"] test: ["CMD-SHELL", "curl -sf http://localhost:3000 || exit 1"]
interval: 10s interval: 5s
timeout: 5s timeout: 5s
retries: 3 retries: 3
start_period: 10s start_period: 10s
@@ -89,10 +89,10 @@ services:
test: test:
- CMD-SHELL - CMD-SHELL
- node -e "require('http').get('http://localhost:3001/health', r => process.exit(r.statusCode===200?0:1)).on('error', () => process.exit(1))" - node -e "require('http').get('http://localhost:3001/health', r => process.exit(r.statusCode===200?0:1)).on('error', () => process.exit(1))"
interval: 10s interval: 5s
timeout: 5s timeout: 5s
retries: 3 retries: 5
start_period: 30s start_period: 60s
deploy: deploy:
resources: resources:
limits: limits:
@@ -125,7 +125,7 @@ services:
- mvp-backend-green - mvp-backend-green
healthcheck: healthcheck:
test: ["CMD-SHELL", "curl -sf http://localhost:3000 || exit 1"] test: ["CMD-SHELL", "curl -sf http://localhost:3000 || exit 1"]
interval: 10s interval: 5s
timeout: 5s timeout: 5s
retries: 3 retries: 3
start_period: 10s start_period: 10s
@@ -180,10 +180,10 @@ services:
test: test:
- CMD-SHELL - CMD-SHELL
- node -e "require('http').get('http://localhost:3001/health', r => process.exit(r.statusCode===200?0:1)).on('error', () => process.exit(1))" - node -e "require('http').get('http://localhost:3001/health', r => process.exit(r.statusCode===200?0:1)).on('error', () => process.exit(1))"
interval: 10s interval: 5s
timeout: 5s timeout: 5s
retries: 3 retries: 5
start_period: 30s start_period: 60s
deploy: deploy:
resources: resources:
limits: limits:

View File

@@ -29,10 +29,10 @@ services:
backend: backend:
healthcheck: healthcheck:
test: ["CMD", "traefik", "healthcheck"] test: ["CMD", "traefik", "healthcheck"]
interval: 30s interval: 5s
timeout: 10s timeout: 5s
retries: 3 retries: 3
start_period: 20s start_period: 10s
labels: labels:
- "traefik.enable=true" - "traefik.enable=true"
- "traefik.http.routers.traefik-dashboard.rule=Host(`traefik.motovaultpro.local`)" - "traefik.http.routers.traefik-dashboard.rule=Host(`traefik.motovaultpro.local`)"
@@ -76,10 +76,10 @@ services:
- mvp-backend - mvp-backend
healthcheck: healthcheck:
test: ["CMD-SHELL", "curl -sf http://localhost:3000 || exit 1"] test: ["CMD-SHELL", "curl -sf http://localhost:3000 || exit 1"]
interval: 30s interval: 5s
timeout: 10s timeout: 5s
retries: 3 retries: 3
start_period: 20s start_period: 10s
labels: labels:
- "traefik.enable=true" - "traefik.enable=true"
- traefik.docker.network=motovaultpro_frontend - traefik.docker.network=motovaultpro_frontend
@@ -148,10 +148,10 @@ services:
test: test:
- CMD-SHELL - CMD-SHELL
- node -e "require('http').get('http://localhost:3001/health', r => process.exit(r.statusCode===200?0:1)).on('error', () => process.exit(1))" - node -e "require('http').get('http://localhost:3001/health', r => process.exit(r.statusCode===200?0:1)).on('error', () => process.exit(1))"
interval: 30s interval: 5s
timeout: 10s timeout: 5s
retries: 5 retries: 5
start_period: 180s start_period: 60s
labels: labels:
- "traefik.enable=true" - "traefik.enable=true"
- "traefik.docker.network=motovaultpro_backend" - "traefik.docker.network=motovaultpro_backend"
@@ -198,10 +198,10 @@ services:
- mvp-redis - mvp-redis
healthcheck: healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"] test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 30s interval: 5s
timeout: 10s timeout: 5s
retries: 3 retries: 3
start_period: 30s start_period: 15s
logging: logging:
driver: json-file driver: json-file
options: options:
@@ -230,10 +230,10 @@ services:
- "5432:5432" # Development access only - "5432:5432" # Development access only
healthcheck: healthcheck:
test: ["CMD-SHELL", "pg_isready -U postgres"] test: ["CMD-SHELL", "pg_isready -U postgres"]
interval: 10s interval: 5s
timeout: 5s timeout: 5s
retries: 5 retries: 5
start_period: 30s start_period: 15s
logging: logging:
driver: json-file driver: json-file
options: options:
@@ -254,9 +254,10 @@ services:
- "6379:6379" # Development access only - "6379:6379" # Development access only
healthcheck: healthcheck:
test: ["CMD", "redis-cli", "ping"] test: ["CMD", "redis-cli", "ping"]
interval: 10s interval: 5s
timeout: 5s timeout: 5s
retries: 5 retries: 5
start_period: 5s
logging: logging:
driver: json-file driver: json-file
options: options: