From c88fbcdc4ed5bfe3a313fbe2801de1813bdbe732 Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Fri, 6 Feb 2026 13:50:17 -0600 Subject: [PATCH] fix: Update grafana dashboards --- .gitea/workflows/production.yaml | 8 +++---- .gitea/workflows/staging.yaml | 8 +++---- ansible/deploy-production-runner.yml | 23 +++++++------------ ansible/deploy-staging-runner.yml | 23 +++++++------------ config/grafana/alerting/alert-rules.yml | 8 +++---- .../grafana/dashboards/api-performance.json | 16 ++++++------- .../dashboards/application-overview.json | 20 ++++++++-------- .../dashboards/error-investigation.json | 8 +++---- config/grafana/dashboards/infrastructure.json | 14 +++++------ docs/BUILD-SERVER-SETUP.md | 17 -------------- 10 files changed, 57 insertions(+), 88 deletions(-) diff --git a/.gitea/workflows/production.yaml b/.gitea/workflows/production.yaml index 634506f..f8ad6b4 100644 --- a/.gitea/workflows/production.yaml +++ b/.gitea/workflows/production.yaml @@ -19,7 +19,7 @@ on: env: REGISTRY: git.motovaultpro.com DEPLOY_PATH: /opt/motovaultpro - COMPOSE_FILE: docker-compose.yml + BASE_COMPOSE_FILE: docker-compose.yml COMPOSE_BLUE_GREEN: docker-compose.blue-green.yml COMPOSE_PROD: docker-compose.prod.yml HEALTH_CHECK_TIMEOUT: "60" @@ -170,7 +170,7 @@ jobs: cd "$DEPLOY_PATH" # Start shared infrastructure services (database, cache, logging) # These persist across blue-green deployments - docker compose -f $COMPOSE_FILE -f $COMPOSE_BLUE_GREEN -f $COMPOSE_PROD up -d \ + docker compose -f $BASE_COMPOSE_FILE -f $COMPOSE_BLUE_GREEN -f $COMPOSE_PROD up -d \ mvp-postgres mvp-redis mvp-loki mvp-alloy mvp-grafana - name: Start target stack @@ -182,7 +182,7 @@ jobs: # --force-recreate ensures containers are recreated even if image tag is same # This prevents stale container content when image digest changes # Start shared OCR service and target stack - docker compose -f $COMPOSE_FILE -f $COMPOSE_BLUE_GREEN -f $COMPOSE_PROD up -d --force-recreate \ + docker compose -f $BASE_COMPOSE_FILE -f $COMPOSE_BLUE_GREEN -f $COMPOSE_PROD up -d --force-recreate \ mvp-ocr mvp-frontend-$TARGET_STACK mvp-backend-$TARGET_STACK - name: Wait for stack initialization @@ -221,7 +221,7 @@ jobs: - name: Start Traefik run: | cd "$DEPLOY_PATH" - docker compose -f $COMPOSE_FILE -f $COMPOSE_BLUE_GREEN -f $COMPOSE_PROD up -d mvp-traefik + docker compose -f $BASE_COMPOSE_FILE -f $COMPOSE_BLUE_GREEN -f $COMPOSE_PROD up -d mvp-traefik - name: Wait for Traefik run: | diff --git a/.gitea/workflows/staging.yaml b/.gitea/workflows/staging.yaml index 1886b3b..c403203 100644 --- a/.gitea/workflows/staging.yaml +++ b/.gitea/workflows/staging.yaml @@ -15,8 +15,8 @@ on: env: REGISTRY: git.motovaultpro.com DEPLOY_PATH: /opt/motovaultpro - COMPOSE_FILE: docker-compose.yml - COMPOSE_STAGING: docker-compose.staging.yml + BASE_COMPOSE_FILE: docker-compose.yml + STAGING_COMPOSE_FILE: docker-compose.staging.yml HEALTH_CHECK_TIMEOUT: "60" LOG_LEVEL: DEBUG @@ -170,8 +170,8 @@ jobs: export BACKEND_IMAGE=$BACKEND_IMAGE export FRONTEND_IMAGE=$FRONTEND_IMAGE export OCR_IMAGE=$OCR_IMAGE - docker compose -f $COMPOSE_FILE -f $COMPOSE_STAGING down --timeout 30 || true - docker compose -f $COMPOSE_FILE -f $COMPOSE_STAGING up -d + docker compose -f $BASE_COMPOSE_FILE -f $STAGING_COMPOSE_FILE down --timeout 30 || true + docker compose -f $BASE_COMPOSE_FILE -f $STAGING_COMPOSE_FILE up -d - name: Wait for services run: sleep 5 diff --git a/ansible/deploy-production-runner.yml b/ansible/deploy-production-runner.yml index 484a695..d24eb9b 100644 --- a/ansible/deploy-production-runner.yml +++ b/ansible/deploy-production-runner.yml @@ -269,24 +269,17 @@ when: gitea_registry_token is defined # ============================================ - # Maintenance Scripts + # Remove Legacy Docker Cleanup (was destroying volumes) # ============================================ - - name: Create Docker cleanup script - copy: - dest: /usr/local/bin/docker-cleanup.sh - content: | - #!/bin/bash - # Remove unused Docker resources older than 7 days - docker system prune -af --filter "until=168h" - docker volume prune -f - mode: '0755' - - - name: Schedule Docker cleanup cron job + - name: Remove legacy Docker cleanup cron job cron: name: "Docker cleanup" - minute: "0" - hour: "3" - job: "/usr/local/bin/docker-cleanup.sh >> /var/log/docker-cleanup.log 2>&1" + state: absent + + - name: Remove legacy Docker cleanup script + file: + path: /usr/local/bin/docker-cleanup.sh + state: absent # ============================================ # Production-Specific Security Hardening diff --git a/ansible/deploy-staging-runner.yml b/ansible/deploy-staging-runner.yml index 4345080..74c4d15 100644 --- a/ansible/deploy-staging-runner.yml +++ b/ansible/deploy-staging-runner.yml @@ -300,24 +300,17 @@ when: gitea_registry_token is defined # ============================================ - # Maintenance Scripts + # Remove Legacy Docker Cleanup (was destroying volumes) # ============================================ - - name: Create Docker cleanup script - copy: - dest: /usr/local/bin/docker-cleanup.sh - content: | - #!/bin/bash - # Remove unused Docker resources older than 7 days - docker system prune -af --filter "until=168h" - docker volume prune -f - mode: '0755' - - - name: Schedule Docker cleanup cron job + - name: Remove legacy Docker cleanup cron job cron: name: "Docker cleanup" - minute: "0" - hour: "3" - job: "/usr/local/bin/docker-cleanup.sh >> /var/log/docker-cleanup.log 2>&1" + state: absent + + - name: Remove legacy Docker cleanup script + file: + path: /usr/local/bin/docker-cleanup.sh + state: absent handlers: - name: Restart act_runner diff --git a/config/grafana/alerting/alert-rules.yml b/config/grafana/alerting/alert-rules.yml index d237007..3122c75 100644 --- a/config/grafana/alerting/alert-rules.yml +++ b/config/grafana/alerting/alert-rules.yml @@ -73,7 +73,7 @@ groups: datasourceUid: loki model: refId: A - expr: 'count_over_time({container="mvp-backend"}[5m])' + expr: 'count_over_time({container=~"mvp-backend(-staging)?"}[5m])' queryType: instant - refId: B relativeTimeRange: @@ -110,7 +110,7 @@ groups: datasourceUid: loki model: refId: A - expr: 'count_over_time({container="mvp-postgres"}[5m])' + expr: 'count_over_time({container=~"mvp-postgres(-staging)?"}[5m])' queryType: instant - refId: B relativeTimeRange: @@ -147,7 +147,7 @@ groups: datasourceUid: loki model: refId: A - expr: 'count_over_time({container="mvp-redis"}[5m])' + expr: 'count_over_time({container=~"mvp-redis(-staging)?"}[5m])' queryType: instant - refId: B relativeTimeRange: @@ -184,7 +184,7 @@ groups: datasourceUid: loki model: refId: A - expr: 'sum(count_over_time({container="mvp-backend"} | json | msg=`Request processed` | status >= 500 [5m]))' + expr: 'sum(count_over_time({container=~"mvp-backend(-staging)?"} | json | msg=`Request processed` | status >= 500 [5m]))' queryType: instant - refId: B relativeTimeRange: diff --git a/config/grafana/dashboards/api-performance.json b/config/grafana/dashboards/api-performance.json index f0f9290..7f280e2 100644 --- a/config/grafana/dashboards/api-performance.json +++ b/config/grafana/dashboards/api-performance.json @@ -121,7 +121,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "sum(rate({container=\"mvp-backend\"} | json | msg=\"Request processed\" [1m]))", + "expr": "sum(rate({container=~\"mvp-backend(-staging)?\"} | json | msg=\"Request processed\" [1m]))", "legendFormat": "Requests/sec", "refId": "A" } @@ -218,7 +218,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "quantile_over_time(0.50, {container=\"mvp-backend\"} | json | msg=\"Request processed\" | unwrap duration | __error__=\"\" [5m]) by ()", + "expr": "quantile_over_time(0.50, {container=~\"mvp-backend(-staging)?\"} | json | msg=\"Request processed\" | unwrap duration | __error__=\"\" [5m]) by ()", "legendFormat": "p50", "refId": "A" }, @@ -227,7 +227,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "quantile_over_time(0.95, {container=\"mvp-backend\"} | json | msg=\"Request processed\" | unwrap duration | __error__=\"\" [5m]) by ()", + "expr": "quantile_over_time(0.95, {container=~\"mvp-backend(-staging)?\"} | json | msg=\"Request processed\" | unwrap duration | __error__=\"\" [5m]) by ()", "legendFormat": "p95", "refId": "B" }, @@ -236,7 +236,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "quantile_over_time(0.99, {container=\"mvp-backend\"} | json | msg=\"Request processed\" | unwrap duration | __error__=\"\" [5m]) by ()", + "expr": "quantile_over_time(0.99, {container=~\"mvp-backend(-staging)?\"} | json | msg=\"Request processed\" | unwrap duration | __error__=\"\" [5m]) by ()", "legendFormat": "p99", "refId": "C" } @@ -303,7 +303,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "sum by (status) (count_over_time({container=\"mvp-backend\"} | json | msg=\"Request processed\" [5m]))", + "expr": "sum by (status) (count_over_time({container=~\"mvp-backend(-staging)?\"} | json | msg=\"Request processed\" [5m]))", "legendFormat": "{{status}}", "refId": "A" } @@ -389,7 +389,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "sum by (path) (count_over_time({container=\"mvp-backend\"} | json | msg=\"Request processed\" [5m]))", + "expr": "sum by (path) (count_over_time({container=~\"mvp-backend(-staging)?\"} | json | msg=\"Request processed\" [5m]))", "legendFormat": "{{path}}", "refId": "A" } @@ -481,7 +481,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "topk(10, avg by (path) (avg_over_time({container=\"mvp-backend\"} | json | msg=\"Request processed\" | unwrap duration | __error__=\"\" [5m])))", + "expr": "topk(10, avg by (path) (avg_over_time({container=~\"mvp-backend(-staging)?\"} | json | msg=\"Request processed\" | unwrap duration | __error__=\"\" [5m])))", "legendFormat": "{{path}}", "refId": "A" } @@ -564,7 +564,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "sum by (path, status) (count_over_time({container=\"mvp-backend\"} | json | msg=\"Request processed\" [5m]))", + "expr": "sum by (path, status) (count_over_time({container=~\"mvp-backend(-staging)?\"} | json | msg=\"Request processed\" [5m]))", "legendFormat": "{{path}} - {{status}}", "refId": "A" } diff --git a/config/grafana/dashboards/application-overview.json b/config/grafana/dashboards/application-overview.json index 19bcb7e..14ce43c 100644 --- a/config/grafana/dashboards/application-overview.json +++ b/config/grafana/dashboards/application-overview.json @@ -334,7 +334,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "count_over_time({container=\"mvp-backend\"}[5m])", + "expr": "count_over_time({container=~\"mvp-backend(-staging)?\"}[5m])", "legendFormat": "mvp-backend", "refId": "A" }, @@ -343,7 +343,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "count_over_time({container=\"mvp-frontend\"}[5m])", + "expr": "count_over_time({container=~\"mvp-frontend(-staging)?\"}[5m])", "legendFormat": "mvp-frontend", "refId": "B" }, @@ -352,7 +352,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "count_over_time({container=\"mvp-postgres\"}[5m])", + "expr": "count_over_time({container=~\"mvp-postgres(-staging)?\"}[5m])", "legendFormat": "mvp-postgres", "refId": "C" }, @@ -361,7 +361,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "count_over_time({container=\"mvp-redis\"}[5m])", + "expr": "count_over_time({container=~\"mvp-redis(-staging)?\"}[5m])", "legendFormat": "mvp-redis", "refId": "D" }, @@ -370,7 +370,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "count_over_time({container=\"mvp-traefik\"}[5m])", + "expr": "count_over_time({container=~\"mvp-traefik(-staging)?\"}[5m])", "legendFormat": "mvp-traefik", "refId": "E" }, @@ -379,7 +379,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "count_over_time({container=\"mvp-ocr\"}[5m])", + "expr": "count_over_time({container=~\"mvp-ocr(-staging)?\"}[5m])", "legendFormat": "mvp-ocr", "refId": "F" }, @@ -388,7 +388,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "count_over_time({container=\"mvp-loki\"}[5m])", + "expr": "count_over_time({container=~\"mvp-loki(-staging)?\"}[5m])", "legendFormat": "mvp-loki", "refId": "G" }, @@ -397,7 +397,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "count_over_time({container=\"mvp-alloy\"}[5m])", + "expr": "count_over_time({container=~\"mvp-alloy(-staging)?\"}[5m])", "legendFormat": "mvp-alloy", "refId": "H" }, @@ -406,7 +406,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "count_over_time({container=\"mvp-grafana\"}[5m])", + "expr": "count_over_time({container=~\"mvp-grafana(-staging)?\"}[5m])", "legendFormat": "mvp-grafana", "refId": "I" } @@ -494,7 +494,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "count_over_time({container=\"mvp-backend\"} | json | msg=\"Request processed\" [1m])", + "expr": "count_over_time({container=~\"mvp-backend(-staging)?\"} | json | msg=\"Request processed\" [1m])", "legendFormat": "Backend Requests", "refId": "A" } diff --git a/config/grafana/dashboards/error-investigation.json b/config/grafana/dashboards/error-investigation.json index b90898c..1629872 100644 --- a/config/grafana/dashboards/error-investigation.json +++ b/config/grafana/dashboards/error-investigation.json @@ -337,7 +337,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "sum by (path) (count_over_time({container=\"mvp-backend\"} | json | level=\"error\" [5m]))", + "expr": "sum by (path) (count_over_time({container=~\"mvp-backend(-staging)?\"} | json | level=\"error\" [5m]))", "legendFormat": "{{path}}", "refId": "A" } @@ -377,7 +377,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "{container=\"mvp-backend\"} | json | level=\"error\" | line_format \"{{.error}}\\n{{.stack}}\"", + "expr": "{container=~\"mvp-backend(-staging)?\"} | json | level=\"error\" | line_format \"{{.error}}\\n{{.stack}}\"", "refId": "A" } ], @@ -416,7 +416,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "{container=\"mvp-backend\"} |= \"$requestId\"", + "expr": "{container=~\"mvp-backend(-staging)?\"} |= \"$requestId\"", "refId": "A" } ], @@ -510,7 +510,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "{container=\"mvp-backend\"} | json | msg=\"Request processed\" | status >= 500", + "expr": "{container=~\"mvp-backend(-staging)?\"} | json | msg=\"Request processed\" | status >= 500", "refId": "A" } ], diff --git a/config/grafana/dashboards/infrastructure.json b/config/grafana/dashboards/infrastructure.json index 5aefb24..3ade223 100644 --- a/config/grafana/dashboards/infrastructure.json +++ b/config/grafana/dashboards/infrastructure.json @@ -157,7 +157,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "{container=\"mvp-postgres\"} |~ \"ERROR|WARNING|FATAL\"", + "expr": "{container=~\"mvp-postgres(-staging)?\"} |~ \"ERROR|WARNING|FATAL\"", "refId": "A" } ], @@ -196,7 +196,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "{container=\"mvp-redis\"}", + "expr": "{container=~\"mvp-redis(-staging)?\"}", "refId": "A" } ], @@ -235,7 +235,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "{container=\"mvp-traefik\"}", + "expr": "{container=~\"mvp-traefik(-staging)?\"}", "refId": "A" } ], @@ -274,7 +274,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "{container=\"mvp-traefik\"} |~ \"level=error|err=\"", + "expr": "{container=~\"mvp-traefik(-staging)?\"} |~ \"level=error|err=\"", "refId": "A" } ], @@ -313,7 +313,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "{container=\"mvp-ocr\"}", + "expr": "{container=~\"mvp-ocr(-staging)?\"}", "refId": "A" } ], @@ -352,7 +352,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "{container=\"mvp-ocr\"} |~ \"ERROR|error|Exception|Traceback\"", + "expr": "{container=~\"mvp-ocr(-staging)?\"} |~ \"ERROR|error|Exception|Traceback\"", "refId": "A" } ], @@ -439,7 +439,7 @@ "type": "loki", "uid": "${datasource}" }, - "expr": "sum(rate({container=\"mvp-loki\"}[1m]))", + "expr": "sum(rate({container=~\"mvp-loki(-staging)?\"}[1m]))", "legendFormat": "Loki Lines/min", "refId": "A" } diff --git a/docs/BUILD-SERVER-SETUP.md b/docs/BUILD-SERVER-SETUP.md index 9986180..65c215c 100644 --- a/docs/BUILD-SERVER-SETUP.md +++ b/docs/BUILD-SERVER-SETUP.md @@ -240,23 +240,6 @@ sudo -u act_runner docker push git.motovaultpro.com/egullickson/test:latest ## Maintenance -### Disk Cleanup - -```bash -# Create cleanup script -sudo tee /usr/local/bin/docker-cleanup.sh > /dev/null <<'EOF' -#!/bin/bash -# Remove unused Docker resources older than 7 days -docker system prune -af --filter "until=168h" -docker volume prune -f -EOF - -sudo chmod +x /usr/local/bin/docker-cleanup.sh - -# Add to crontab (run daily at 3 AM) -echo "0 3 * * * /usr/local/bin/docker-cleanup.sh >> /var/log/docker-cleanup.log 2>&1" | sudo crontab - -``` - ### Update Runner ```bash