Root cause: switch-traffic.sh was modifying Traefik config in the CI checkout directory ($GITHUB_WORKSPACE) instead of the deployment directory ($DEPLOY_PATH). Traefik never saw the weight changes, so traffic stayed on old containers.

Changes:
- Add DEPLOY_PATH environment variable support to all CI scripts
- Add --force-recreate flag to ensure containers are recreated with new images
- Add image verification step to confirm containers use expected images
- Add weight verification to confirm Traefik routing was updated
- Add routing validation step to verify traffic switch succeeded

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
156 lines
4.6 KiB
Bash
Executable File
156 lines
4.6 KiB
Bash
Executable File
#!/bin/bash
# Auto-rollback script for blue-green deployment
# Reverts traffic to the previous healthy stack
#
# Usage: ./auto-rollback.sh [reason]
#   reason: Optional description of why rollback is happening
#
# Environment:
#   DEPLOY_PATH: Optional. When set and non-empty it is used as the project
#                root; otherwise the root is two directories above this script.
#
# Exit codes:
#   0 - Rollback successful
#   1 - Rollback failed

set -euo pipefail

# Use DEPLOY_PATH if set (CI environment), otherwise calculate from script location
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="${DEPLOY_PATH:-$(cd "$SCRIPT_DIR/../.." && pwd)}"

echo "Using PROJECT_ROOT: $PROJECT_ROOT"

# First positional argument, with a default for unattended invocations
REASON="${1:-Automatic rollback triggered}"

# The state file lives under the project root; the helper scripts are looked
# up next to this script
STATE_FILE="$PROJECT_ROOT/config/deployment/state.json"
SWITCH_SCRIPT="$SCRIPT_DIR/switch-traffic.sh"
HEALTH_SCRIPT="$SCRIPT_DIR/health-check.sh"
NOTIFY_SCRIPT="$SCRIPT_DIR/notify.sh"

# Announce the rollback: reason and current UTC time in ISO-8601 form
echo "========================================"
echo "AUTO-ROLLBACK INITIATED"
echo "Reason: $REASON"
echo "Time: $(date -u +"%Y-%m-%dT%H:%M:%SZ")"
echo "========================================"

# Determine current and rollback stacks
#
# CURRENT_STACK is read from .active_stack and ROLLBACK_STACK from
# .inactive_stack in the state file; a missing/null field becomes "unknown".
# The jq calls are part of the condition so that a jq failure (for example a
# malformed state file) ends in the ERROR message and exit code 1 below rather
# than aborting under `set -e` with jq's own exit status.
if [[ ! -f "$STATE_FILE" ]] || ! command -v jq &> /dev/null ||
    ! CURRENT_STACK=$(jq -r '.active_stack // "unknown"' "$STATE_FILE") ||
    ! ROLLBACK_STACK=$(jq -r '.inactive_stack // "unknown"' "$STATE_FILE"); then
    echo "ERROR: Cannot determine current stack state"
    echo "State file: $STATE_FILE"
    exit 1
fi

# Reject states that give no usable rollback target: an unknown or empty
# stack name, or an inactive stack equal to the active one
if [[ -z "$CURRENT_STACK" ]] || [[ -z "$ROLLBACK_STACK" ]] ||
    [[ "$CURRENT_STACK" == "unknown" ]] || [[ "$ROLLBACK_STACK" == "unknown" ]] ||
    [[ "$CURRENT_STACK" == "$ROLLBACK_STACK" ]]; then
    echo "ERROR: Invalid stack state"
    echo " Current: $CURRENT_STACK"
    echo " Rollback target: $ROLLBACK_STACK"
    exit 1
fi

echo ""
echo "Stack Status:"
echo " Currently active: $CURRENT_STACK"
echo " Rollback target: $ROLLBACK_STACK"
echo ""

# Verify rollback stack is healthy before switching
echo "Step 1/3: Verifying rollback stack health..."
echo "----------------------------------------"

if [[ -x "$HEALTH_SCRIPT" ]]; then
    # NOTE(review): the second argument (30) is passed through as-is;
    # presumably a timeout for health-check.sh - confirm against that script
    if ! "$HEALTH_SCRIPT" "$ROLLBACK_STACK" 30; then
        echo ""
        echo "CRITICAL: Rollback stack ($ROLLBACK_STACK) is NOT healthy!"
        echo "Manual intervention required."
        echo ""
        echo "Troubleshooting steps:"
        echo " 1. Check container logs: docker logs mvp-backend-$ROLLBACK_STACK"
        echo " 2. Check container status: docker ps -a"
        echo " 3. Consider restarting rollback stack"
        echo ""

        # Send critical notification; `|| true` keeps a failing notifier from
        # changing the exit path below
        if [[ -x "$NOTIFY_SCRIPT" ]]; then
            "$NOTIFY_SCRIPT" "rollback_failed" \
                "Rollback to $ROLLBACK_STACK failed - stack unhealthy. Manual intervention required. Reason: $REASON" \
                || true
        fi

        exit 1
    fi
    echo " OK: Rollback stack is healthy"
else
    # No executable health check script: the rollback continues unverified
    echo " WARNING: Health check script not found, proceeding anyway"
fi

# Switch traffic to rollback stack
echo ""
echo "Step 2/3: Switching traffic to $ROLLBACK_STACK..."
echo "----------------------------------------"

# Collect the failure (if any) first so that every failure path, including a
# missing switch script, sends the same rollback_failed notification
switch_error=""
if [[ ! -x "$SWITCH_SCRIPT" ]]; then
    switch_error="ERROR: Traffic switch script not found: $SWITCH_SCRIPT"
elif ! "$SWITCH_SCRIPT" "$ROLLBACK_STACK" instant; then
    switch_error="ERROR: Traffic switch failed"
fi

if [[ -n "$switch_error" ]]; then
    echo "$switch_error"

    # Best-effort notification; `|| true` keeps a failing notifier from
    # masking the exit code below
    if [[ -x "$NOTIFY_SCRIPT" ]]; then
        "$NOTIFY_SCRIPT" "rollback_failed" \
            "Rollback traffic switch failed. Manual intervention required. Reason: $REASON" \
            || true
    fi

    exit 1
fi

# Update state file with rollback info
echo ""
echo "Step 3/3: Updating deployment state..."
echo "----------------------------------------"

#######################################
# Rewrite the state file so the rollback target is recorded as active.
# The new content is written to "<state file>.tmp" and then moved over the
# state file, so a jq failure never modifies the state file itself.
# Arguments:
#   $1 - path to the state file
#   $2 - stack that is now active
#   $3 - stack that failed (recorded as inactive)
#   $4 - rollback reason
#   $5 - rollback timestamp
# Returns:
#   0 when the state file was replaced; 1 when jq, the redirect or mv failed
#   (the temporary file is removed in that case)
#######################################
write_rollback_state() {
    local state_file=$1 new_active=$2 failed_stack=$3 reason=$4 timestamp=$5
    local tmp_file="${state_file}.tmp"

    # The whole list is tested by `if`: in a bare `jq ... && mv ...` list,
    # `set -e` does not react to a jq failure
    if jq --arg stack "$new_active" \
        --arg reason "$reason" \
        --arg ts "$timestamp" \
        --arg failed "$failed_stack" \
        '.active_stack = $stack |
         .inactive_stack = $failed |
         .last_rollback = $ts |
         .last_rollback_reason = $reason |
         .rollback_available = false' \
        "$state_file" > "$tmp_file" && mv "$tmp_file" "$state_file"; then
        return 0
    fi

    rm -f "$tmp_file"
    return 1
}

if [[ -f "$STATE_FILE" ]] && command -v jq &> /dev/null; then
    TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

    # Traffic has already been switched at this point, so a failed state
    # update is reported as a warning instead of aborting the rollback
    if write_rollback_state "$STATE_FILE" "$ROLLBACK_STACK" "$CURRENT_STACK" "$REASON" "$TIMESTAMP"; then
        echo " State updated"
    else
        echo " WARNING: Failed to update state file: $STATE_FILE"
    fi
fi

# Send notification (only when an executable notify script is present); a
# notifier failure is downgraded to a warning and does not fail the rollback
if [[ -x "$NOTIFY_SCRIPT" ]]; then
    echo ""
    echo "Sending rollback notification..."
    "$NOTIFY_SCRIPT" "rollback" \
        "Rollback executed. Traffic switched from $CURRENT_STACK to $ROLLBACK_STACK. Reason: $REASON" \
        || echo " WARNING: Notification failed"
fi

# Final summary and follow-up hints for the operator
echo ""
echo "========================================"
echo "ROLLBACK COMPLETE"
echo "========================================"
echo ""
echo "Summary:"
echo " Previous stack: $CURRENT_STACK (now inactive)"
echo " Current stack: $ROLLBACK_STACK (now active)"
echo " Reason: $REASON"
echo ""
echo "Next steps:"
echo " 1. Investigate why $CURRENT_STACK failed"
echo " 2. Check logs: docker logs mvp-backend-$CURRENT_STACK"
echo " 3. Fix issues before next deployment"
echo ""

exit 0