From c6b99ab29a80e31c96cfd23a2922cd42d9ff7e02 Mon Sep 17 00:00:00 2001
From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com>
Date: Sun, 8 Feb 2026 20:57:49 -0600
Subject: [PATCH] fix: Postgres Fixes for Prod

---
 .gitea/workflows/production.yaml | 28 +++++++++++++++++++++++++---
 docker-compose.prod.yml          |  2 +-
 docker-compose.yml               |  2 +-
 scripts/ci/health-check.sh       | 11 +++++------
 4 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/.gitea/workflows/production.yaml b/.gitea/workflows/production.yaml
index f8ad6b4..4686fb4 100644
--- a/.gitea/workflows/production.yaml
+++ b/.gitea/workflows/production.yaml
@@ -22,7 +22,7 @@ env:
   BASE_COMPOSE_FILE: docker-compose.yml
   COMPOSE_BLUE_GREEN: docker-compose.blue-green.yml
   COMPOSE_PROD: docker-compose.prod.yml
-  HEALTH_CHECK_TIMEOUT: "60"
+  HEALTH_CHECK_TIMEOUT: "240"
   LOG_LEVEL: INFO
 
 jobs:
@@ -169,10 +169,32 @@ jobs:
         run: |
           cd "$DEPLOY_PATH"
           # Start shared infrastructure services (database, cache, logging)
-          # These persist across blue-green deployments
-          docker compose -f $BASE_COMPOSE_FILE -f $COMPOSE_BLUE_GREEN -f $COMPOSE_PROD up -d \
+          # --no-recreate prevents restarting postgres/redis when config files change
+          # These must persist across blue-green deployments to avoid data service disruption
+          docker compose -f $BASE_COMPOSE_FILE -f $COMPOSE_BLUE_GREEN -f $COMPOSE_PROD up -d --no-recreate \
             mvp-postgres mvp-redis mvp-loki mvp-alloy mvp-grafana
 
+      - name: Wait for shared services health
+        run: |
+          # Poll each data service's Docker health status every 5s, up to max_attempts
+          # times; fail this step (after dumping recent logs) if one never turns healthy.
+          max_attempts=24
+          echo "Waiting for PostgreSQL and Redis to be healthy..."
+          for service in mvp-postgres mvp-redis; do
+            for i in $(seq 1 "$max_attempts"); do
+              health=$(docker inspect --format='{{.State.Health.Status}}' "$service" 2>/dev/null || echo "unknown")
+              if [ "$health" = "healthy" ]; then echo "OK: $service is healthy"; break; fi
+              if [ "$i" -eq "$max_attempts" ]; then
+                echo "ERROR: $service health check timed out (status: $health)"
+                docker logs "$service" --tail 50 2>/dev/null || true
+                exit 1
+              fi
+              echo "Waiting for $service... (attempt $i/$max_attempts, status: $health)"
+              sleep 5
+            done
+          done
+          echo "All shared services healthy"
+
       - name: Start target stack
         run: |
           cd "$DEPLOY_PATH"
diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index fb64c09..40c6ca5 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -61,7 +61,7 @@ services:
       LOG_LEVEL: error
       POSTGRES_LOG_STATEMENT: none
       POSTGRES_LOG_MIN_DURATION_STATEMENT: -1
-      PGDATA: /var/lib/postgresql/data
+      PGDATA: /var/lib/postgresql/data/pgdata
 
   # Redis - Remove dev ports, production log level
   mvp-redis:
diff --git a/docker-compose.yml b/docker-compose.yml
index ae4b8c6..5e9a247 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -233,7 +233,7 @@ services:
       LOG_LEVEL: debug
       POSTGRES_LOG_STATEMENT: all
       POSTGRES_LOG_MIN_DURATION_STATEMENT: 0
-      PGDATA: /var/lib/postgresql/data
+      PGDATA: /var/lib/postgresql/data/pgdata
     volumes:
       - mvp_postgres_data:/var/lib/postgresql/data
       # Secrets (K8s Secrets equivalent)
diff --git a/scripts/ci/health-check.sh b/scripts/ci/health-check.sh
index fa7b388..d3243fb 100755
--- a/scripts/ci/health-check.sh
+++ b/scripts/ci/health-check.sh
@@ -92,19 +92,18 @@ wait_for_health() {
 
     if [[ $status -eq 0 ]]; then
       return 0
-    elif [[ $status -eq 1 ]]; then
-      echo "  ERROR: Container $container is unhealthy"
-      docker logs "$container" --tail 20 2>/dev/null || true
-      return 1
     fi
 
-    # Still starting, wait
+    # Both "starting" and "unhealthy" are treated as transient during the wait period.
+    # Docker marks a container "unhealthy" after a run of failed probes, but flips it back
+    # to "healthy" as soon as a later probe passes. Only the overall timeout should cause failure.
     sleep 2
     elapsed=$((elapsed + 2))
     echo "  Waiting for $container... (${elapsed}s/${TIMEOUT}s)"
   done
 
-  echo "  ERROR: Timeout waiting for $container"
+  echo "  ERROR: Container $container did not become healthy within ${TIMEOUT}s"
+  docker logs "$container" --tail 20 2>/dev/null || true
   return 1
 }