# motovaultpro/.gitea/workflows/production.yaml

# MotoVaultPro Production Deployment Workflow
# Manual trigger only - run after verifying staging
# Blue-green deployment with auto-rollback
#
# Optimization: sparse checkout + shallow clone (fetch-depth: 1). deploy-prod
# fetches scripts/, config/, and the compose files and syncs them to
# $DEPLOY_PATH; the rollback and notify jobs fetch scripts/ only.

# Workflow display name, plus a per-run name that embeds the requested image
# tag (falling back to 'latest' when the input is empty).
name: "Deploy to Production"
run-name: "Production Deploy - ${{ inputs.image_tag || 'latest' }}"

# Manual trigger only: workflow_dispatch is the sole event defined here.
on:
  workflow_dispatch:
    inputs:
      # Tag appended to the backend/frontend/ocr image references
      image_tag:
        default: "latest"
        required: false
        description: "Image tag to deploy (defaults to latest)"

# Workflow-wide environment, visible to every job and step below.
env:
  # Registry host used for `docker login` and as the image reference prefix
  REGISTRY: "git.motovaultpro.com"
  # Deployment root on the production host
  DEPLOY_PATH: "/opt/motovaultpro"
  # Compose files, always passed to `docker compose` together in this order
  BASE_COMPOSE_FILE: "docker-compose.yml"
  COMPOSE_BLUE_GREEN: "docker-compose.blue-green.yml"
  COMPOSE_PROD: "docker-compose.prod.yml"
  # Second argument to scripts/ci/health-check.sh
  # (presumably seconds - confirm in the script)
  HEALTH_CHECK_TIMEOUT: "240"
  # Argument to scripts/ci/generate-log-config.sh
  LOG_LEVEL: "INFO"

jobs:
  # ============================================
  # VALIDATE - Determine target stack
  # ============================================
  # Checks host prerequisites, resolves the image references for the requested
  # tag, and picks the inactive stack as the deployment target.
  validate:
    name: Validate Prerequisites
    runs-on: prod
    # Consumed by deploy-prod and verify-prod through needs.validate.outputs
    outputs:
      target_stack: ${{ steps.determine-stack.outputs.target_stack }}
      backend_image: ${{ steps.set-images.outputs.backend_image }}
      frontend_image: ${{ steps.set-images.outputs.frontend_image }}
      ocr_image: ${{ steps.set-images.outputs.ocr_image }}
    steps:
      - name: Check Docker availability
        run: |
          # Brace groups (not subshells) so `exit 1` ends the step directly
          docker info > /dev/null 2>&1 || { echo "ERROR - Docker not accessible"; exit 1; }
          docker compose version > /dev/null 2>&1 || { echo "ERROR - Docker Compose not available"; exit 1; }

      - name: Check deployment path
        run: |
          test -d "$DEPLOY_PATH" || { echo "ERROR - DEPLOY_PATH not found"; exit 1; }

      # Credentials are handed over as environment variables instead of being
      # templated into the script text, so quotes, `$` or backticks in a
      # secret cannot alter the shell command.
      - name: Login to Gitea Container Registry
        env:
          REGISTRY_USER: ${{ secrets.REGISTRY_USER }}
          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
        run: |
          printf '%s\n' "$REGISTRY_PASSWORD" | docker login -u "$REGISTRY_USER" --password-stdin "$REGISTRY"

      # The user-supplied tag reaches the script only through the environment
      # (never via template expansion) and is validated before use.
      - name: Set image tags
        id: set-images
        env:
          IMAGE_TAG: ${{ inputs.image_tag || 'latest' }}
        run: |
          # Docker tag grammar; also rejects newlines that would otherwise
          # inject extra key=value lines into the step output file
          if ! [[ "$IMAGE_TAG" =~ ^[A-Za-z0-9_][A-Za-z0-9_.-]{0,127}$ ]]; then
            echo "ERROR - Invalid image tag: $IMAGE_TAG"
            exit 1
          fi
          TAG="$IMAGE_TAG"
          echo "backend_image=$REGISTRY/egullickson/backend:$TAG" >> "$GITHUB_OUTPUT"
          echo "frontend_image=$REGISTRY/egullickson/frontend:$TAG" >> "$GITHUB_OUTPUT"
          echo "ocr_image=$REGISTRY/egullickson/ocr:$TAG" >> "$GITHUB_OUTPUT"

      - name: Determine target stack
        id: determine-stack
        run: |
          STATE_FILE="$DEPLOY_PATH/config/deployment/state.json"

          # No recorded state: fall back to GREEN
          if [ ! -f "$STATE_FILE" ]; then
            echo "target_stack=green" >> "$GITHUB_OUTPUT"
            echo "No state file found, defaulting to GREEN stack"
            exit 0
          fi

          # A state file exists but cannot be read without jq. Guessing GREEN
          # here could target the stack that is currently live, and the
          # verify-prod job needs jq anyway, so stop before touching anything.
          if ! command -v jq > /dev/null 2>&1; then
            echo "ERROR - jq is required to read $STATE_FILE"
            exit 1
          fi

          # Deploy to whichever stack is NOT active; a missing active_stack
          # field is treated as "blue"
          ACTIVE_STACK=$(jq -r '.active_stack // "blue"' "$STATE_FILE")
          if [ "$ACTIVE_STACK" = "blue" ]; then
            echo "target_stack=green" >> "$GITHUB_OUTPUT"
            echo "Deploying to GREEN stack (BLUE is currently active)"
          else
            echo "target_stack=blue" >> "$GITHUB_OUTPUT"
            echo "Deploying to BLUE stack (GREEN is currently active)"
          fi

  # ============================================
  # DEPLOY PROD - Blue-green deployment
  # ============================================
  # Syncs repo files to $DEPLOY_PATH, pulls the images resolved by `validate`,
  # brings up shared services and the target stack, health-checks it, then
  # switches traffic and records the deployment in state.json.
  deploy-prod:
    name: Deploy to Production
    runs-on: prod
    needs: validate
    env:
      TARGET_STACK: ${{ needs.validate.outputs.target_stack }}
      BACKEND_IMAGE: ${{ needs.validate.outputs.backend_image }}
      FRONTEND_IMAGE: ${{ needs.validate.outputs.frontend_image }}
      OCR_IMAGE: ${{ needs.validate.outputs.ocr_image }}
    steps:
      - name: Checkout scripts, config, and compose files
        uses: actions/checkout@v4
        with:
          sparse-checkout: |
            scripts/
            config/
            docker-compose.yml
            docker-compose.blue-green.yml
            docker-compose.prod.yml
          # Non-cone mode because the pattern list mixes directories and files
          sparse-checkout-cone-mode: false
          fetch-depth: 1

      # NOTE(review): --delete mirrors the checkout over $DEPLOY_PATH/config/,
      # which also holds config/deployment/state.json and
      # config/traefik/dynamic/blue-green.yml that other steps read or modify
      # at runtime - confirm this sync does not reset them.
      - name: Sync config, scripts, and compose files to deploy path
        run: |
          rsync -av --delete "$GITHUB_WORKSPACE/config/" "$DEPLOY_PATH/config/"
          rsync -av --delete "$GITHUB_WORKSPACE/scripts/" "$DEPLOY_PATH/scripts/"
          cp "$GITHUB_WORKSPACE/docker-compose.yml" "$DEPLOY_PATH/"
          cp "$GITHUB_WORKSPACE/docker-compose.blue-green.yml" "$DEPLOY_PATH/"
          cp "$GITHUB_WORKSPACE/docker-compose.prod.yml" "$DEPLOY_PATH/"

      - name: Generate logging configuration
        run: |
          cd "$DEPLOY_PATH"
          chmod +x scripts/ci/generate-log-config.sh
          ./scripts/ci/generate-log-config.sh "$LOG_LEVEL"

      # Credentials are handed over as environment variables instead of being
      # templated into the script text, so quotes, `$` or backticks in a
      # secret cannot alter the shell command.
      - name: Login to registry
        env:
          REGISTRY_USER: ${{ secrets.REGISTRY_USER }}
          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
        run: |
          printf '%s\n' "$REGISTRY_PASSWORD" | docker login -u "$REGISTRY_USER" --password-stdin "$REGISTRY"

      # The secrets below are exposed to inject-secrets.sh via the environment;
      # SECRETS_DIR tells the script where to place them.
      - name: Inject secrets
        run: |
          cd "$DEPLOY_PATH"
          chmod +x scripts/inject-secrets.sh
          SECRETS_DIR="$DEPLOY_PATH/secrets/app" ./scripts/inject-secrets.sh
        env:
          POSTGRES_PASSWORD: ${{ secrets.POSTGRES_PASSWORD }}
          AUTH0_CLIENT_SECRET: ${{ secrets.AUTH0_CLIENT_SECRET }}
          AUTH0_MANAGEMENT_CLIENT_ID: ${{ secrets.AUTH0_MANAGEMENT_CLIENT_ID }}
          AUTH0_MANAGEMENT_CLIENT_SECRET: ${{ secrets.AUTH0_MANAGEMENT_CLIENT_SECRET }}
          GOOGLE_MAPS_API_KEY: ${{ secrets.GOOGLE_MAPS_API_KEY }}
          GOOGLE_MAPS_MAP_ID: ${{ secrets.GOOGLE_MAPS_MAP_ID }}
          CF_DNS_API_TOKEN: ${{ secrets.CF_DNS_API_TOKEN }}
          RESEND_API_KEY: ${{ secrets.RESEND_API_KEY }}
          STRIPE_SECRET_KEY: ${{ secrets.STRIPE_SECRET_KEY }}
          STRIPE_WEBHOOK_SECRET: ${{ secrets.STRIPE_WEBHOOK_SECRET }}

      - name: Initialize data directories
        run: |
          cd "$DEPLOY_PATH"
          sudo mkdir -p data/backups data/documents data/traefik
          sudo chown -R 1001:1001 data/backups data/documents
          sudo chmod 755 data/backups data/documents
          # Traefik acme.json requires 600 permissions
          if [ ! -f data/traefik/acme.json ]; then
            sudo touch data/traefik/acme.json
          fi
          sudo chmod 600 data/traefik/acme.json

      - name: Pull new images
        run: |
          docker pull "$BACKEND_IMAGE"
          docker pull "$FRONTEND_IMAGE"
          docker pull "$OCR_IMAGE"

      - name: Record expected image IDs
        id: expected-images
        run: |
          # Get the image IDs we just pulled - these are what containers should use.
          # The "Verify container images" step compares them against the first
          # 12 hex characters of each running container's image hash.
          FRONTEND_ID=$(docker images --format '{{.ID}}' "$FRONTEND_IMAGE" | head -1)
          BACKEND_ID=$(docker images --format '{{.ID}}' "$BACKEND_IMAGE" | head -1)
          echo "Expected frontend image ID: $FRONTEND_ID"
          echo "Expected backend image ID: $BACKEND_ID"
          echo "frontend_id=$FRONTEND_ID" >> "$GITHUB_OUTPUT"
          echo "backend_id=$BACKEND_ID" >> "$GITHUB_OUTPUT"

      - name: Start shared services
        run: |
          cd "$DEPLOY_PATH"
          # Start shared infrastructure services (database, cache, logging)
          # --no-recreate prevents restarting postgres/redis when config files change
          # These must persist across blue-green deployments to avoid data service disruption
          docker compose -f "$BASE_COMPOSE_FILE" -f "$COMPOSE_BLUE_GREEN" -f "$COMPOSE_PROD" up -d --no-recreate \
            mvp-postgres mvp-redis mvp-loki mvp-alloy mvp-grafana

      # Polls each service up to 24 times, 5 seconds apart (about 2 minutes).
      - name: Wait for shared services health
        run: |
          echo "Waiting for PostgreSQL and Redis to be healthy..."
          for service in mvp-postgres mvp-redis; do
            for i in $(seq 1 24); do
              # "unknown" covers a missing container or one without a healthcheck
              health=$(docker inspect --format='{{.State.Health.Status}}' $service 2>/dev/null || echo "unknown")
              if [ "$health" = "healthy" ]; then
                echo "OK: $service is healthy"
                break
              fi
              if [ $i -eq 24 ]; then
                echo "ERROR: $service health check timed out (status: $health)"
                docker logs $service --tail 50 2>/dev/null || true
                exit 1
              fi
              echo "Waiting for $service... (attempt $i/24, status: $health)"
              sleep 5
            done
          done
          echo "All shared services healthy"

      - name: Start target stack
        run: |
          cd "$DEPLOY_PATH"
          export BACKEND_IMAGE="$BACKEND_IMAGE"
          export FRONTEND_IMAGE="$FRONTEND_IMAGE"
          export OCR_IMAGE="$OCR_IMAGE"
          # --force-recreate ensures containers are recreated even if image tag is same
          # This prevents stale container content when image digest changes
          # Start shared OCR service and target stack
          docker compose -f "$BASE_COMPOSE_FILE" -f "$COMPOSE_BLUE_GREEN" -f "$COMPOSE_PROD" up -d --force-recreate \
            mvp-ocr "mvp-frontend-$TARGET_STACK" "mvp-backend-$TARGET_STACK"

      - name: Wait for stack initialization
        run: sleep 5

      # Only the frontend and backend containers are checked; mvp-ocr is not.
      - name: Verify container images
        run: |
          # Verify containers are running the expected images
          EXPECTED_FRONTEND="${{ steps.expected-images.outputs.frontend_id }}"
          EXPECTED_BACKEND="${{ steps.expected-images.outputs.backend_id }}"

          # Strip the sha256: prefix and truncate to 12 characters so the value
          # is comparable with the IDs recorded after the pull
          RUNNING_FRONTEND=$(docker inspect --format='{{.Image}}' "mvp-frontend-$TARGET_STACK" | sed 's/sha256://' | cut -c1-12)
          RUNNING_BACKEND=$(docker inspect --format='{{.Image}}' "mvp-backend-$TARGET_STACK" | sed 's/sha256://' | cut -c1-12)

          echo "Frontend - Expected: $EXPECTED_FRONTEND, Running: $RUNNING_FRONTEND"
          echo "Backend - Expected: $EXPECTED_BACKEND, Running: $RUNNING_BACKEND"

          if [[ "$RUNNING_FRONTEND" != "$EXPECTED_FRONTEND" ]]; then
            echo "ERROR: Frontend container not using expected image!"
            echo "Container may be stale. Force recreate should have prevented this."
            exit 1
          fi

          if [[ "$RUNNING_BACKEND" != "$EXPECTED_BACKEND" ]]; then
            echo "ERROR: Backend container not using expected image!"
            exit 1
          fi

          echo "OK: All containers using correct images"

      # The new stack must pass its health check before any traffic is moved.
      - name: Run health check
        run: |
          chmod +x "$GITHUB_WORKSPACE/scripts/ci/health-check.sh"
          DEPLOY_PATH="$DEPLOY_PATH" "$GITHUB_WORKSPACE/scripts/ci/health-check.sh" "$TARGET_STACK" "$HEALTH_CHECK_TIMEOUT"

      - name: Start Traefik
        run: |
          cd "$DEPLOY_PATH"
          docker compose -f "$BASE_COMPOSE_FILE" -f "$COMPOSE_BLUE_GREEN" -f "$COMPOSE_PROD" up -d mvp-traefik

      # Polls every 2 seconds for at most 30 seconds; on timeout the container
      # status and recent logs are printed before the step fails.
      - name: Wait for Traefik
        run: |
          echo "Waiting for Traefik to be healthy..."
          timeout 30 bash -c "until docker inspect --format='{{.State.Health.Status}}' mvp-traefik 2>/dev/null | grep -q healthy; do sleep 2; done" || {
            echo "Traefik health check timed out, checking status..."
            docker inspect --format='{{.State.Status}}' mvp-traefik
            docker logs mvp-traefik --tail 20
            exit 1
          }
          echo "Traefik is healthy"

      - name: Switch traffic
        run: |
          chmod +x "$GITHUB_WORKSPACE/scripts/ci/switch-traffic.sh"
          # DEPLOY_PATH ensures script modifies config at /opt/motovaultpro, not checkout dir
          DEPLOY_PATH="$DEPLOY_PATH" "$GITHUB_WORKSPACE/scripts/ci/switch-traffic.sh" "$TARGET_STACK" instant

      # Best effort: skipped silently when the state file or jq is missing.
      # The image tag is passed through the environment rather than templated
      # into the script, so its content cannot change the shell command.
      # NOTE(review): this step does not write .active_stack - presumably
      # switch-traffic.sh does; confirm.
      - name: Update deployment state
        env:
          IMAGE_TAG: ${{ inputs.image_tag || 'latest' }}
        run: |
          cd "$DEPLOY_PATH"
          STATE_FILE="config/deployment/state.json"
          if [ -f "$STATE_FILE" ] && command -v jq &> /dev/null; then
            TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
            # Write to a temp file first so a failed jq run never truncates the state
            jq --arg commit "$IMAGE_TAG" \
               --arg ts "$TIMESTAMP" \
               '.last_deployment = $ts | .last_deployment_commit = $commit | .last_deployment_status = "success" | .rollback_available = true' \
               "$STATE_FILE" > "${STATE_FILE}.tmp" && mv "${STATE_FILE}.tmp" "$STATE_FILE"
          fi

  # ============================================
  # VERIFY PROD - External health check
  # ============================================
  # Post-switch verification: container state, Docker healthchecks, the
  # backend's in-container health endpoint, the public health endpoint, and
  # the Traefik routing weights. Any failing step fails the job.
  verify-prod:
    name: Verify Production
    runs-on: prod
    needs: [validate, deploy-prod]
    env:
      TARGET_STACK: ${{ needs.validate.outputs.target_stack }}
    steps:
      - name: Wait for routing propagation
        run: sleep 5

      - name: Check container status and health
        run: |
          for service in mvp-frontend-$TARGET_STACK mvp-backend-$TARGET_STACK mvp-ocr; do
            status=$(docker inspect --format='{{.State.Status}}' $service 2>/dev/null || echo "not found")
            if [ "$status" != "running" ]; then
              echo "ERROR: $service is not running (status: $status)"
              docker logs $service --tail 50 2>/dev/null || true
              exit 1
            fi
            echo "OK: $service is running"
          done

          # Wait for Docker healthchecks to complete (services with healthcheck defined)
          echo ""
          echo "Waiting for Docker healthchecks..."
          for service in mvp-frontend-$TARGET_STACK mvp-backend-$TARGET_STACK mvp-ocr; do
            # Check if service has a healthcheck defined
            has_healthcheck=$(docker inspect --format='{{if .Config.Healthcheck}}true{{else}}false{{end}}' $service 2>/dev/null || echo "false")
            if [ "$has_healthcheck" = "true" ]; then
              # 48 attempts x 5 seconds = 4 minutes max wait (backend with fresh migrations can take ~3 min)
              for i in $(seq 1 48); do
                health=$(docker inspect --format='{{.State.Health.Status}}' $service 2>/dev/null || echo "unknown")
                if [ "$health" = "healthy" ]; then
                  echo "OK: $service is healthy"
                  break
                fi
                # Don't fail immediately on unhealthy - container may still be starting up
                # and can recover. Let the timeout handle truly broken containers.
                if [ $i -eq 48 ]; then
                  echo "ERROR: $service health check timed out (status: $health)"
                  docker logs $service --tail 100 2>/dev/null || true
                  exit 1
                fi
                echo "Waiting for $service healthcheck... (attempt $i/48, status: $health)"
                sleep 5
              done
            else
              echo "SKIP: $service has no healthcheck defined"
            fi
          done

      # Probes the backend from inside its own container: 12 attempts, 5s apart.
      - name: Wait for backend health
        run: |
          for i in $(seq 1 12); do
            if docker exec mvp-backend-$TARGET_STACK curl -sf http://localhost:3001/health > /dev/null 2>&1; then
              echo "OK: Backend health check passed"
              exit 0
            fi
            if [ $i -eq 12 ]; then
              echo "ERROR: Backend health check failed after 12 attempts"
              docker logs mvp-backend-$TARGET_STACK --tail 100
              exit 1
            fi
            echo "Attempt $i/12: Backend not ready, waiting 5s..."
            sleep 5
          done

      # Hits the public endpoint and requires status "healthy" plus every
      # feature in REQUIRED_FEATURES. Malformed responses count as "not ready"
      # and are retried rather than aborting the step on a jq error.
      - name: External health check
        run: |
          REQUIRED_FEATURES='["admin","auth","onboarding","vehicles","documents","fuel-logs","stations","maintenance","platform","notifications","user-profile","user-preferences","user-export"]'
          RESPONSE=""

          for i in $(seq 1 12); do
            RESPONSE=$(curl -sf https://motovaultpro.com/api/health 2>/dev/null) || {
              echo "Attempt $i/12: Connection failed, waiting 5s..."
              sleep 5
              continue
            }

            # A body that is not a JSON object would make the jq calls below
            # exit non-zero and end the step before the retries are used up
            if ! echo "$RESPONSE" | jq -e 'type == "object"' > /dev/null 2>&1; then
              echo "Attempt $i/12: Response is not a JSON object. Waiting 5s..."
              sleep 5
              continue
            fi

            # Check status is "healthy"
            STATUS=$(echo "$RESPONSE" | jq -r '.status')
            if [ "$STATUS" != "healthy" ]; then
              echo "Attempt $i/12: Status is '$STATUS', not 'healthy'. Waiting 5s..."
              sleep 5
              continue
            fi

            # Check all required features are present. A missing or non-array
            # .features is treated as an empty list, i.e. everything is missing.
            MISSING=$(echo "$RESPONSE" | jq -r --argjson required "$REQUIRED_FEATURES" '
              $required - (.features | if type == "array" then . else [] end)
              | if length > 0 then . else empty end | @json
            ')

            if [ -n "$MISSING" ]; then
              echo "Attempt $i/12: Missing features: $MISSING. Waiting 5s..."
              sleep 5
              continue
            fi

            FEATURE_COUNT=$(echo "$RESPONSE" | jq '.features | length')
            echo "OK: Production health check passed - status: healthy, features: $FEATURE_COUNT"
            exit 0
          done

          echo "ERROR: Production health check failed after 12 attempts"
          echo "Last response: $RESPONSE"
          exit 1

      # Final re-check after the external probe. Unlike the earlier step, this
      # one requires a "healthy" status, so a service without a healthcheck
      # reports "unknown" here and fails.
      - name: Verify container status
        run: |
          for service in mvp-frontend-$TARGET_STACK mvp-backend-$TARGET_STACK; do
            status=$(docker inspect --format='{{.State.Status}}' $service 2>/dev/null || echo "not found")
            health=$(docker inspect --format='{{.State.Health.Status}}' $service 2>/dev/null || echo "unknown")
            if [ "$status" != "running" ] || [ "$health" != "healthy" ]; then
              echo "ERROR: $service is not healthy (status: $status, health: $health)"
              docker logs $service --tail 50 2>/dev/null || true
              exit 1
            fi
            echo "OK: $service is running and healthy"
          done

      - name: Validate Traefik routing weights
        run: |
          # Verify traffic has actually switched to the new stack
          BLUE_GREEN_CONFIG="$DEPLOY_PATH/config/traefik/dynamic/blue-green.yml"
          EXPECTED_TARGET_WEIGHT=100

          if [[ "$TARGET_STACK" == "green" ]]; then
            TARGET_SVC="mvp-frontend-green-svc"
          else
            TARGET_SVC="mvp-frontend-blue-svc"
          fi

          if [ ! -f "$BLUE_GREEN_CONFIG" ]; then
            echo "ERROR: Traefik config not found: $BLUE_GREEN_CONFIG"
            exit 1
          fi

          # Reads the weight from the service's own line or the line after it.
          # `|| true` keeps a non-matching grep from ending the step before the
          # explicit error below is printed.
          ACTUAL_WEIGHT=$(grep -A1 "$TARGET_SVC" "$BLUE_GREEN_CONFIG" | grep weight | grep -oE '[0-9]+' | head -1 || true)

          if [[ "$ACTUAL_WEIGHT" != "$EXPECTED_TARGET_WEIGHT" ]]; then
            echo "ERROR: Traffic not routed to $TARGET_STACK stack!"
            echo "Expected weight for $TARGET_SVC: $EXPECTED_TARGET_WEIGHT, Actual: $ACTUAL_WEIGHT"
            grep -A2 weight "$BLUE_GREEN_CONFIG" || true
            exit 1
          fi

          echo "OK: Traffic correctly routed to $TARGET_STACK (weight: $ACTUAL_WEIGHT)"

  # ============================================
  # ROLLBACK - Auto-rollback on failure
  # ============================================
  # Runs when any job listed in `needs` has failed: invokes the auto-rollback
  # script and marks the recorded deployment as rolled back.
  rollback:
    name: Auto Rollback
    runs-on: prod
    needs:
      - validate
      - deploy-prod
      - verify-prod
    if: failure()
    steps:
      - name: Checkout scripts
        uses: actions/checkout@v4
        with:
          fetch-depth: 1
          sparse-checkout: scripts/
          sparse-checkout-cone-mode: true

      - name: Execute rollback
        run: |
          ROLLBACK_SCRIPT="$GITHUB_WORKSPACE/scripts/ci/auto-rollback.sh"
          chmod +x "$ROLLBACK_SCRIPT"
          # DEPLOY_PATH is passed explicitly so the script acts on the deploy
          # directory rather than the checkout
          DEPLOY_PATH="$DEPLOY_PATH" "$ROLLBACK_SCRIPT" "Production verification failed - automatic rollback"

      - name: Update state
        run: |
          cd "$DEPLOY_PATH"
          STATE_FILE="config/deployment/state.json"
          # Best effort: nothing to record without a state file or without jq
          [ -f "$STATE_FILE" ] || exit 0
          command -v jq &> /dev/null || exit 0
          # Write to a temp file first, then replace the state file
          jq '.last_deployment_status = "rolled_back"' "$STATE_FILE" > "${STATE_FILE}.tmp" && mv "${STATE_FILE}.tmp" "$STATE_FILE"

  # ============================================
  # NOTIFY SUCCESS
  # ============================================
  # Sends the success notification once validate and verify-prod have passed.
  notify-success:
    name: Notify Success
    runs-on: prod
    needs: [validate, verify-prod]
    if: success()
    steps:
      - name: Checkout scripts only
        uses: actions/checkout@v4
        with:
          sparse-checkout: scripts/
          sparse-checkout-cone-mode: true
          fetch-depth: 1

      # The image tag is user input: it is passed through the environment and
      # quoted, so it always arrives as a single argument and cannot inject
      # shell syntax into the command line.
      - name: Send success notification
        run: |
          chmod +x "$GITHUB_WORKSPACE/scripts/ci/notify.sh"
          "$GITHUB_WORKSPACE/scripts/ci/notify.sh" success "Production deployment successful - $IMAGE_TAG is now live" "$IMAGE_TAG"
        env:
          IMAGE_TAG: ${{ inputs.image_tag || 'latest' }}
          DEPLOY_NOTIFY_EMAIL: ${{ vars.DEPLOY_NOTIFY_EMAIL }}
          RESEND_API_KEY: ${{ secrets.RESEND_API_KEY }}

  # ============================================
  # NOTIFY FAILURE
  # ============================================
  # Sends the failure notification when any job listed in `needs` has failed;
  # listing rollback in `needs` makes this job wait for the rollback attempt.
  notify-failure:
    name: Notify Failure
    runs-on: prod
    needs: [validate, deploy-prod, verify-prod, rollback]
    if: failure()
    steps:
      - name: Checkout scripts only
        uses: actions/checkout@v4
        with:
          sparse-checkout: scripts/
          sparse-checkout-cone-mode: true
          fetch-depth: 1

      # The image tag is user input: it is passed through the environment and
      # quoted, so it always arrives as a single argument and cannot inject
      # shell syntax into the command line.
      - name: Send failure notification
        run: |
          chmod +x "$GITHUB_WORKSPACE/scripts/ci/notify.sh"
          "$GITHUB_WORKSPACE/scripts/ci/notify.sh" failure "Production deployment failed for $IMAGE_TAG" "$IMAGE_TAG"
        env:
          IMAGE_TAG: ${{ inputs.image_tag || 'latest' }}
          DEPLOY_NOTIFY_EMAIL: ${{ vars.DEPLOY_NOTIFY_EMAIL }}
          RESEND_API_KEY: ${{ secrets.RESEND_API_KEY }}