From 6f1195d90782af0020cab322b2023c6d3e0c76e2 Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Fri, 6 Feb 2026 08:19:28 -0600 Subject: [PATCH 1/8] feat: add Grafana dashboard provisioning infrastructure (refs #106) Add file-based dashboard provisioning config and mount dashboards directory into Grafana container for auto-loading dashboard JSON files. Co-Authored-By: Claude Opus 4.6 --- config/grafana/dashboards/.gitkeep | 0 config/grafana/provisioning/dashboards.yml | 11 +++++++++++ docker-compose.yml | 2 ++ 3 files changed, 13 insertions(+) create mode 100644 config/grafana/dashboards/.gitkeep create mode 100644 config/grafana/provisioning/dashboards.yml diff --git a/config/grafana/dashboards/.gitkeep b/config/grafana/dashboards/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/config/grafana/provisioning/dashboards.yml b/config/grafana/provisioning/dashboards.yml new file mode 100644 index 0000000..13c0e9b --- /dev/null +++ b/config/grafana/provisioning/dashboards.yml @@ -0,0 +1,11 @@ +apiVersion: 1 +providers: + - name: 'MotoVaultPro' + orgId: 1 + folder: 'MotoVaultPro' + type: file + disableDeletion: false + updateIntervalSeconds: 30 + allowUiUpdates: false + options: + path: /var/lib/grafana/dashboards diff --git a/docker-compose.yml b/docker-compose.yml index d5bf4a7..441cb4f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -325,6 +325,8 @@ services: GF_USERS_ALLOW_SIGN_UP: "false" volumes: - ./config/grafana/datasources:/etc/grafana/provisioning/datasources:ro + - ./config/grafana/provisioning:/etc/grafana/provisioning/dashboards:ro + - ./config/grafana/dashboards:/var/lib/grafana/dashboards:ro - mvp_grafana_data:/var/lib/grafana networks: - backend -- 2.49.1 From 33e561e537a1b119d7db143b12c80e902e02d22c Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Fri, 6 Feb 2026 08:24:08 -0600 Subject: [PATCH 2/8] feat: add 
Application Overview Grafana dashboard (refs #107) Adds file-provisioned dashboard with 5 panels: - Container Log Volume Over Time (all 9 containers) - Error Rate Across All Containers (percentage stat) - Log Level Distribution Per Container (stacked bar chart) - Container Health Status (green/red per container) - Total Request Count Over Time (backend requests/min) Co-Authored-By: Claude Opus 4.6 --- config/grafana/dashboards/.gitkeep | 0 .../dashboards/application-overview.json | 545 ++++++++++++++++++ 2 files changed, 545 insertions(+) delete mode 100644 config/grafana/dashboards/.gitkeep create mode 100644 config/grafana/dashboards/application-overview.json diff --git a/config/grafana/dashboards/.gitkeep b/config/grafana/dashboards/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/config/grafana/dashboards/application-overview.json b/config/grafana/dashboards/application-overview.json new file mode 100644 index 0000000..19bcb7e --- /dev/null +++ b/config/grafana/dashboards/application-overview.json @@ -0,0 +1,545 @@ +{ + "__inputs": [], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "12.4.0" + }, + { + "type": "datasource", + "id": "loki", + "name": "Loki", + "version": "1.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": false, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Log Lines / min", + "axisPlacement": "auto", + "barAlignment": 0, + 
"drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "sum by (container) (count_over_time({container=~\"mvp-.*\"}[1m]))", + "legendFormat": "{{container}}", + "refId": "A" + } + ], + "title": "Container Log Volume Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "percent", + "decimals": 2 + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 8 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "textMode": "auto" + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": 
"sum(count_over_time({container=~\"mvp-.*\"} | json | level=\"error\" [5m])) / sum(count_over_time({container=~\"mvp-.*\"}[5m])) * 100", + "refId": "A" + } + ], + "title": "Error Rate Across All Containers", + "type": "stat" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 16, + "x": 8, + "y": 8 + }, + "id": 3, + "options": { + "barRadius": 0, + "barWidth": 0.97, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "normal", + "tooltip": { + "mode": "multi", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "sum by (container, level) (count_over_time({container=~\"mvp-.*\"} | json [5m]))", + "legendFormat": "{{level}}", + "refId": "A" + } + ], + "title": "Log Level Distribution Per Container", + "type": "barchart" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + } + }, 
+ "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 4, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "textMode": "name" + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "count_over_time({container=\"mvp-backend\"}[5m])", + "legendFormat": "mvp-backend", + "refId": "A" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "count_over_time({container=\"mvp-frontend\"}[5m])", + "legendFormat": "mvp-frontend", + "refId": "B" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "count_over_time({container=\"mvp-postgres\"}[5m])", + "legendFormat": "mvp-postgres", + "refId": "C" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "count_over_time({container=\"mvp-redis\"}[5m])", + "legendFormat": "mvp-redis", + "refId": "D" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "count_over_time({container=\"mvp-traefik\"}[5m])", + "legendFormat": "mvp-traefik", + "refId": "E" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "count_over_time({container=\"mvp-ocr\"}[5m])", + "legendFormat": "mvp-ocr", + "refId": "F" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "count_over_time({container=\"mvp-loki\"}[5m])", + "legendFormat": "mvp-loki", + "refId": "G" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "count_over_time({container=\"mvp-alloy\"}[5m])", + "legendFormat": "mvp-alloy", + "refId": "H" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "count_over_time({container=\"mvp-grafana\"}[5m])", + "legendFormat": 
"mvp-grafana", + "refId": "I" + } + ], + "title": "Container Health Status", + "type": "stat" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "fixed", + "fixedColor": "blue" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Requests / min", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "count_over_time({container=\"mvp-backend\"} | json | msg=\"Request processed\" [1m])", + "legendFormat": "Backend Requests", + "refId": "A" + } + ], + "title": "Total Request Count Over Time", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "overview", + "logs", + "containers" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Loki", + "value": "Loki" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "loki", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + 
} + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Application Overview", + "uid": "application-overview", + "version": 1, + "weekStart": "" +} -- 2.49.1 From 9e6f130fa6e826cae9d2cde76412a0ab454f30b8 Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Fri, 6 Feb 2026 09:48:11 -0600 Subject: [PATCH 3/8] feat: add API Performance Grafana dashboard (refs #108) Log-based dashboard with 6 panels: request rate, response time distribution (p50/p95/p99), HTTP status code distribution, request volume by endpoint, slowest endpoints, and status code breakdown. Co-Authored-By: Claude Opus 4.6 --- .../grafana/dashboards/api-performance.json | 615 ++++++++++++++++++ 1 file changed, 615 insertions(+) create mode 100644 config/grafana/dashboards/api-performance.json diff --git a/config/grafana/dashboards/api-performance.json b/config/grafana/dashboards/api-performance.json new file mode 100644 index 0000000..f0f9290 --- /dev/null +++ b/config/grafana/dashboards/api-performance.json @@ -0,0 +1,615 @@ +{ + "__inputs": [], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "12.4.0" + }, + { + "type": "datasource", + "id": "loki", + "name": "Loki", + "version": "1.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": false, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "fixed", + "fixedColor": "blue" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "Requests / sec", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "sum(rate({container=\"mvp-backend\"} | json | msg=\"Request processed\" [1m]))", + "legendFormat": "Requests/sec", + "refId": "A" + } + ], + "title": "Request Rate Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Duration (ms)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 500 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "quantile_over_time(0.50, {container=\"mvp-backend\"} | json | msg=\"Request processed\" | unwrap duration | __error__=\"\" [5m]) by ()", + "legendFormat": "p50", + "refId": "A" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "quantile_over_time(0.95, {container=\"mvp-backend\"} | json | msg=\"Request processed\" | unwrap duration | __error__=\"\" [5m]) by ()", + "legendFormat": "p95", + "refId": "B" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "quantile_over_time(0.99, {container=\"mvp-backend\"} | json | msg=\"Request processed\" | unwrap duration | __error__=\"\" [5m]) by ()", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Response Time Distribution (p50 / p95 / p99)", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 3, + "options": { + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "pieType": "donut", + "reduceOptions": { + "values": false, + "calcs": [ + "sum" + ], + "fields": "" + }, + 
"tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "sum by (status) (count_over_time({container=\"mvp-backend\"} | json | msg=\"Request processed\" [5m]))", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "HTTP Status Code Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 16, + "x": 8, + "y": 16 + }, + "id": 4, + "options": { + "barRadius": 0, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "horizontal", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "sum by (path) (count_over_time({container=\"mvp-backend\"} | json | msg=\"Request processed\" [5m]))", + "legendFormat": "{{path}}", + "refId": "A" + } + ], + "title": "Request Volume by Endpoint", + "type": "barchart" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + 
"color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 200 + }, + { + "color": "red", + "value": 500 + } + ] + }, + "unit": "ms" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "path" + }, + "properties": [ + { + "id": "custom.width", + "value": 300 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 5, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "topk(10, avg by (path) (avg_over_time({container=\"mvp-backend\"} | json | msg=\"Request processed\" | unwrap duration | __error__=\"\" [5m])))", + "legendFormat": "{{path}}", + "refId": "A" + } + ], + "title": "Slowest Endpoints (Avg Duration)", + "type": "table" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "path" + }, + "properties": [ + { + "id": "custom.width", + "value": 300 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 6, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + 
"sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "sum by (path, status) (count_over_time({container=\"mvp-backend\"} | json | msg=\"Request processed\" [5m]))", + "legendFormat": "{{path}} - {{status}}", + "refId": "A" + } + ], + "title": "Status Code Breakdown by Endpoint", + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "api", + "performance", + "backend" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Loki", + "value": "Loki" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "loki", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "API Performance", + "uid": "api-performance", + "version": 1, + "weekStart": "" +} -- 2.49.1 From 0345e3976f4a05d50041d90be02097520f50a783 Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Fri, 6 Feb 2026 09:54:52 -0600 Subject: [PATCH 4/8] feat: add Error Investigation Grafana dashboard (refs #109) Co-Authored-By: Claude Opus 4.6 --- .../dashboards/error-investigation.json | 580 ++++++++++++++++++ 1 file changed, 580 insertions(+) create mode 100644 config/grafana/dashboards/error-investigation.json diff --git a/config/grafana/dashboards/error-investigation.json b/config/grafana/dashboards/error-investigation.json new file mode 100644 index 0000000..b90898c --- /dev/null +++ b/config/grafana/dashboards/error-investigation.json @@ -0,0 +1,580 @@ +{ + "__inputs": [], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "12.4.0" + }, + { + "type": "datasource", + "id": "loki", 
+ "name": "Loki", + "version": "1.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": false, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "dedupStrategy": "none", + "sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "{container=~\"mvp-.*\"} | json | level=\"error\"", + "refId": "A" + } + ], + "title": "Error Log Stream", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "fixed", + "fixedColor": "red" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Errors / min", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": 
"short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "sum", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "sum(count_over_time({container=~\"mvp-.*\"} | json | level=\"error\" [1m]))", + "legendFormat": "Errors/min", + "refId": "A" + } + ], + "title": "Error Rate Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 3, + "options": { + "barRadius": 0, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "orientation": "horizontal", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "sum by (container) (count_over_time({container=~\"mvp-.*\"} | json | level=\"error\" [5m]))", + "legendFormat": "{{container}}", + "refId": "A" + } + ], 
+ "title": "Errors by Container", + "type": "barchart" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "path" + }, + "properties": [ + { + "id": "custom.width", + "value": 300 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 4, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "sum by (path) (count_over_time({container=\"mvp-backend\"} | json | level=\"error\" [5m]))", + "legendFormat": "{{path}}", + "refId": "A" + } + ], + "title": "Errors by Endpoint", + "type": "table" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 5, + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "dedupStrategy": "none", + "sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "{container=\"mvp-backend\"} | json | level=\"error\" | line_format \"{{.error}}\\n{{.stack}}\"", + "refId": "A" + } + ], + "title": "Stack Trace Viewer", + "type": "logs" + }, + { + "datasource": { + 
"type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 6, + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "dedupStrategy": "none", + "sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "{container=\"mvp-backend\"} |= \"$requestId\"", + "refId": "A" + } + ], + "title": "Correlation ID Lookup", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "path" + }, + "properties": [ + { + "id": "custom.width", + "value": 250 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "status" + }, + "properties": [ + { + "id": "custom.width", + "value": 80 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 46 + }, + "id": 7, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Time" + } + ] + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "{container=\"mvp-backend\"} | json | msg=\"Request processed\" | status >= 500", + "refId": "A" + } + ], + "title": "Recent 5xx Responses", + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "errors", + "debugging", + "backend" + ], + 
"templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Loki", + "value": "Loki" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "loki", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "hide": 0, + "label": "Request ID", + "name": "requestId", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": "", + "skipUrlSync": false, + "type": "textbox" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Error Investigation", + "uid": "error-investigation", + "version": 1, + "weekStart": "" +} -- 2.49.1 From c891250946e1839126706f555977f584396f55d9 Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Fri, 6 Feb 2026 10:11:38 -0600 Subject: [PATCH 5/8] feat: add Infrastructure Grafana dashboard (refs #110) Co-Authored-By: Claude Opus 4.6 --- config/grafana/dashboards/infrastructure.json | 490 ++++++++++++++++++ 1 file changed, 490 insertions(+) create mode 100644 config/grafana/dashboards/infrastructure.json diff --git a/config/grafana/dashboards/infrastructure.json b/config/grafana/dashboards/infrastructure.json new file mode 100644 index 0000000..5aefb24 --- /dev/null +++ b/config/grafana/dashboards/infrastructure.json @@ -0,0 +1,490 @@ +{ + "__inputs": [], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "12.4.0" + }, + { + "type": "datasource", + "id": "loki", + "name": "Loki", + "version": "1.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": 
"dashboard" + } + ] + }, + "editable": false, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Log Lines / min", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "sum by (container) (rate({container=~\"mvp-.*\"}[1m]))", + "legendFormat": "{{container}}", + "refId": "A" + } + ], + "title": "Per-Container Log Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 2, + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "dedupStrategy": "none", + "sortOrder": 
"Descending" + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "{container=\"mvp-postgres\"} |~ \"ERROR|WARNING|FATAL\"", + "refId": "A" + } + ], + "title": "PostgreSQL Error/Warning Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 3, + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "dedupStrategy": "none", + "sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "{container=\"mvp-redis\"}", + "refId": "A" + } + ], + "title": "Redis Connection and Command Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 4, + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "dedupStrategy": "none", + "sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "{container=\"mvp-traefik\"}", + "refId": "A" + } + ], + "title": "Traefik Access Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 5, + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "dedupStrategy": "none", + "sortOrder": "Descending" + }, + 
"targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "{container=\"mvp-traefik\"} |~ \"level=error|err=\"", + "refId": "A" + } + ], + "title": "Traefik Error Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 48 + }, + "id": 6, + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "dedupStrategy": "none", + "sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "{container=\"mvp-ocr\"}", + "refId": "A" + } + ], + "title": "OCR Service Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 58 + }, + "id": 7, + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "dedupStrategy": "none", + "sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "{container=\"mvp-ocr\"} |~ \"ERROR|error|Exception|Traceback\"", + "refId": "A" + } + ], + "title": "OCR Processing Errors", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "fixed", + "fixedColor": "purple" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Lines / min", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + 
"lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 68 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${datasource}" + }, + "expr": "sum(rate({container=\"mvp-loki\"}[1m]))", + "legendFormat": "Loki Lines/min", + "refId": "A" + } + ], + "title": "Loki Ingestion Rate", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "infrastructure", + "containers", + "logs" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Loki", + "value": "Loki" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "loki", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Infrastructure", + "uid": "infrastructure", + "version": 1, + "weekStart": "" +} -- 2.49.1 From 4b2b318affd144d0dae0ab5cee83250ba37f21fd Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Fri, 6 Feb 2026 10:19:00 -0600 Subject: [PATCH 6/8] feat: add Grafana alerting rules and documentation (refs #111) Configure Grafana Unified Alerting with file-based provisioned alert rules, contact points, and notification policies. 
Add stable UID to Loki datasource for alert rule references. Update LOGGING.md with dashboard descriptions, alerting rules table, and LogQL query reference. Alert rules: Error Rate Spike (critical), Container Silence for backend/postgres/redis (warning), 5xx Response Spike (critical). Co-Authored-By: Claude Opus 4.6 --- config/grafana/alerting/alert-rules.yml | 210 ++++++++++++++++++ config/grafana/alerting/contact-points.yml | 12 + .../alerting/notification-policies.yml | 11 + config/grafana/datasources/loki.yml | 1 + docker-compose.yml | 1 + docs/LOGGING.md | 78 ++++++- 6 files changed, 311 insertions(+), 2 deletions(-) create mode 100644 config/grafana/alerting/alert-rules.yml create mode 100644 config/grafana/alerting/contact-points.yml create mode 100644 config/grafana/alerting/notification-policies.yml diff --git a/config/grafana/alerting/alert-rules.yml b/config/grafana/alerting/alert-rules.yml new file mode 100644 index 0000000..d237007 --- /dev/null +++ b/config/grafana/alerting/alert-rules.yml @@ -0,0 +1,210 @@ +apiVersion: 1 + +groups: + - orgId: 1 + name: MotoVaultPro Alerts + folder: MotoVaultPro + interval: 1m + rules: + # Error Rate Spike - Alert when error rate exceeds 5% over 5 minutes + - uid: mvp-error-rate-spike + title: Error Rate Spike + condition: D + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: loki + model: + refId: A + expr: 'sum(count_over_time({container=~"mvp-.*"} | json | level=`error` [5m]))' + queryType: instant + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: loki + model: + refId: B + expr: 'sum(count_over_time({container=~"mvp-.*"} [5m]))' + queryType: instant + - refId: C + relativeTimeRange: + from: 0 + to: 0 + datasourceUid: __expr__ + model: + refId: C + type: math + expression: '($A / $B) * 100' + - refId: D + relativeTimeRange: + from: 0 + to: 0 + datasourceUid: __expr__ + model: + refId: D + type: threshold + expression: C + conditions: + - evaluator: + type: gt + 
params: + - 5 + noDataState: OK + execErrState: Error + for: 5m + labels: + severity: critical + annotations: + summary: Error rate exceeds 5% over 5 minutes across all MotoVaultPro containers + description: Check the Error Investigation dashboard for details. + + # Container Silence - mvp-backend + - uid: mvp-silence-backend + title: "Container Silence: mvp-backend" + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: loki + model: + refId: A + expr: 'count_over_time({container="mvp-backend"}[5m])' + queryType: instant + - refId: B + relativeTimeRange: + from: 0 + to: 0 + datasourceUid: __expr__ + model: + refId: B + type: threshold + expression: A + conditions: + - evaluator: + type: lt + params: + - 1 + noDataState: Alerting + execErrState: Error + for: 5m + labels: + severity: warning + annotations: + summary: mvp-backend container has stopped producing logs + description: No logs received from mvp-backend for 5 minutes. The container may be down or stuck. + + # Container Silence - mvp-postgres + - uid: mvp-silence-postgres + title: "Container Silence: mvp-postgres" + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: loki + model: + refId: A + expr: 'count_over_time({container="mvp-postgres"}[5m])' + queryType: instant + - refId: B + relativeTimeRange: + from: 0 + to: 0 + datasourceUid: __expr__ + model: + refId: B + type: threshold + expression: A + conditions: + - evaluator: + type: lt + params: + - 1 + noDataState: Alerting + execErrState: Error + for: 5m + labels: + severity: warning + annotations: + summary: mvp-postgres container has stopped producing logs + description: No logs received from mvp-postgres for 5 minutes. The database container may be down. 
+ + # Container Silence - mvp-redis + - uid: mvp-silence-redis + title: "Container Silence: mvp-redis" + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: loki + model: + refId: A + expr: 'count_over_time({container="mvp-redis"}[5m])' + queryType: instant + - refId: B + relativeTimeRange: + from: 0 + to: 0 + datasourceUid: __expr__ + model: + refId: B + type: threshold + expression: A + conditions: + - evaluator: + type: lt + params: + - 1 + noDataState: Alerting + execErrState: Error + for: 5m + labels: + severity: warning + annotations: + summary: mvp-redis container has stopped producing logs + description: No logs received from mvp-redis for 5 minutes. The cache container may be down. + + # 5xx Spike - Alert when 5xx responses exceed threshold + - uid: mvp-5xx-spike + title: 5xx Response Spike + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: loki + model: + refId: A + expr: 'sum(count_over_time({container="mvp-backend"} | json | msg=`Request processed` | status >= 500 [5m]))' + queryType: instant + - refId: B + relativeTimeRange: + from: 0 + to: 0 + datasourceUid: __expr__ + model: + refId: B + type: threshold + expression: A + conditions: + - evaluator: + type: gt + params: + - 10 + noDataState: OK + execErrState: Error + for: 5m + labels: + severity: critical + annotations: + summary: High rate of 5xx responses from mvp-backend + description: More than 10 HTTP 5xx responses in 5 minutes. Check the API Performance and Error Investigation dashboards. 
diff --git a/config/grafana/alerting/contact-points.yml b/config/grafana/alerting/contact-points.yml new file mode 100644 index 0000000..a151276 --- /dev/null +++ b/config/grafana/alerting/contact-points.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +contactPoints: + - orgId: 1 + name: mvp-default + receivers: + - uid: mvp-webhook-placeholder + type: webhook + settings: + url: "https://example.com/mvp-webhook-placeholder" + httpMethod: POST + disableResolveMessage: false diff --git a/config/grafana/alerting/notification-policies.yml b/config/grafana/alerting/notification-policies.yml new file mode 100644 index 0000000..8e6fef7 --- /dev/null +++ b/config/grafana/alerting/notification-policies.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +policies: + - orgId: 1 + receiver: mvp-default + group_by: + - alertname + - severity + group_wait: 30s + group_interval: 5m + repeat_interval: 4h diff --git a/config/grafana/datasources/loki.yml b/config/grafana/datasources/loki.yml index 7154896..7fa4d24 100644 --- a/config/grafana/datasources/loki.yml +++ b/config/grafana/datasources/loki.yml @@ -2,6 +2,7 @@ apiVersion: 1 datasources: - name: Loki + uid: loki type: loki access: proxy url: http://mvp-loki:3100 diff --git a/docker-compose.yml b/docker-compose.yml index 441cb4f..0bef4b2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -326,6 +326,7 @@ services: volumes: - ./config/grafana/datasources:/etc/grafana/provisioning/datasources:ro - ./config/grafana/provisioning:/etc/grafana/provisioning/dashboards:ro + - ./config/grafana/alerting:/etc/grafana/provisioning/alerting:ro - ./config/grafana/dashboards:/var/lib/grafana/dashboards:ro - mvp_grafana_data:/var/lib/grafana networks: diff --git a/docs/LOGGING.md b/docs/LOGGING.md index 19d818d..59eacd4 100644 --- a/docs/LOGGING.md +++ b/docs/LOGGING.md @@ -52,7 +52,39 @@ All logs include a `requestId` field (UUID v4) for tracing requests: - URL: https://logs.motovaultpro.com - Default credentials: admin/admin (change on first login) -### 
Example LogQL Queries +## Dashboards + +Four provisioned dashboards are available in the MotoVaultPro folder: + +| Dashboard | Purpose | Key Panels | +|-----------|---------|------------| +| Application Overview | System-wide health at a glance | Container log volume, error rate gauge, log level distribution, container health status, request count | +| API Performance | Backend latency and throughput analysis | Request rate, response time percentiles (p50/p95/p99), status code distribution, slowest endpoints | +| Error Investigation | Debugging and root cause analysis | Error log stream, errors by container/endpoint, stack trace viewer, correlation ID lookup, recent 5xx responses | +| Infrastructure | Container-level logs and platform monitoring | Per-container throughput, PostgreSQL/Redis/Traefik/OCR logs, Loki ingestion rate | + +All dashboards refresh every 30 seconds and default to a 1-hour time window. Dashboard JSON files are in `config/grafana/dashboards/` and provisioned via `config/grafana/provisioning/dashboards.yml`. + +## Alerting Rules + +Grafana Unified Alerting is configured with file-based provisioned rules. Alert rules are evaluated every 1 minute, and a rule's condition must hold continuously for 5 minutes before the alert fires.
+ +| Alert | Severity | Condition | Description | +|-------|----------|-----------|-------------| +| Error Rate Spike | critical | Error rate > 5% over 5m | Fires when the percentage of error-level logs across all mvp-* containers exceeds 5% | +| Container Silence: mvp-backend | warning | No logs for 5m | Fires when the backend container stops producing logs | +| Container Silence: mvp-postgres | warning | No logs for 5m | Fires when the database container stops producing logs | +| Container Silence: mvp-redis | warning | No logs for 5m | Fires when the cache container stops producing logs | +| 5xx Response Spike | critical | > 10 5xx responses in 5m | Fires when the backend produces more than 10 HTTP 5xx responses | + +Alert configuration files are in `config/grafana/alerting/`: +- `alert-rules.yml` - Alert rule definitions with LogQL queries +- `contact-points.yml` - Notification endpoints (webhook placeholder for future email/Slack) +- `notification-policies.yml` - Routing rules that group alerts by name and severity + +## LogQL Query Reference + +### Common Debugging Queries Query by requestId: ``` @@ -66,7 +98,49 @@ Query all errors: Query slow requests (>500ms): ``` -{container="mvp-backend"} | json | duration > 500 +{container="mvp-backend"} | json | msg="Request processed" | duration > 500 +``` + +### Error Analysis + +Count errors per container over time: +``` +sum by (container) (count_over_time({container=~"mvp-.*"} | json | level="error" [5m])) +``` + +Error rate as percentage: +``` +sum(count_over_time({container=~"mvp-.*"} | json | level="error" [5m])) + / sum(count_over_time({container=~"mvp-.*"} [5m])) * 100 +``` + +### HTTP Status Analysis + +All 5xx responses: +``` +{container="mvp-backend"} | json | msg="Request processed" | status >= 500 +``` + +Request count by status code: +``` +sum by (status) (count_over_time({container="mvp-backend"} | json | msg="Request processed" [5m])) +``` + +### Container-Specific Queries + +PostgreSQL errors: +``` 
+{container="mvp-postgres"} |~ "ERROR|FATAL|PANIC" +``` + +Traefik access logs: +``` +{container="mvp-traefik"} | json +``` + +OCR processing errors: +``` +{container="mvp-ocr"} |~ "ERROR|Exception|Traceback" ``` ## Configuration -- 2.49.1 From 842b0eb94568783ddcefaa0c85a6c82d8883be4a Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Fri, 6 Feb 2026 10:32:58 -0600 Subject: [PATCH 7/8] docs: update config/CLAUDE.md with Grafana subdirectories (refs #111) Co-Authored-By: Claude Opus 4.6 --- config/CLAUDE.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/config/CLAUDE.md b/config/CLAUDE.md index ca8e127..e2e4c23 100644 --- a/config/CLAUDE.md +++ b/config/CLAUDE.md @@ -8,6 +8,8 @@ | `alloy/` | Grafana Alloy log collector config | Log collection pipeline | | `deployment/` | Deployment environment configs | Deploy scripts, environment configs | | `grafana/` | Grafana dashboards and datasources | Log visualization setup | +| `grafana/dashboards/` | Provisioned Grafana dashboard JSON files | Dashboard modifications | +| `grafana/provisioning/` | Grafana dashboard provider config (`dashboards.yml`) | Provisioning setup | | `loki/` | Loki log storage config | Log storage, retention | | `monitoring/` | Monitoring and alert rules | Alerting rules, health checks | | `shared/` | Shared cross-service configuration | Cross-service settings | -- 2.49.1 From 462d306783a81cf39d1731526250da9b41eda321 Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Fri, 6 Feb 2026 10:51:00 -0600 Subject: [PATCH 8/8] fix: resolve staging deployment issues with Traefik, Loki, and Alloy (refs #105) - Exclude blue-green.yml from staging Traefik by mounting dynamic-staging/ directory (only grafana.yml + middleware.yml) instead of dynamic/ which contains production-only blue-green routing config - Disable Loki healthcheck: distroless image has no /bin/sh so CMD-SHELL healthchecks cannot execute;
Alloy and Grafana verify Loki connectivity - Fix Alloy healthcheck: replace wget (not in image) with bash /dev/tcp - Add Grafana staging domain override (logs.staging.motovaultpro.com) Co-Authored-By: Claude Opus 4.6 --- config/traefik/dynamic-staging/grafana.yml | 8 + config/traefik/dynamic-staging/middleware.yml | 173 ++++++++++++++++++ docker-compose.staging.yml | 16 ++ docker-compose.yml | 9 +- 4 files changed, 201 insertions(+), 5 deletions(-) create mode 100644 config/traefik/dynamic-staging/grafana.yml create mode 100755 config/traefik/dynamic-staging/middleware.yml diff --git a/config/traefik/dynamic-staging/grafana.yml b/config/traefik/dynamic-staging/grafana.yml new file mode 100644 index 0000000..3b73ad3 --- /dev/null +++ b/config/traefik/dynamic-staging/grafana.yml @@ -0,0 +1,8 @@ +http: + middlewares: + grafana-ipwhitelist: + ipAllowList: + sourceRange: + - "10.0.0.0/8" + - "172.16.0.0/12" + - "192.168.0.0/16" diff --git a/config/traefik/dynamic-staging/middleware.yml b/config/traefik/dynamic-staging/middleware.yml new file mode 100755 index 0000000..31a61e8 --- /dev/null +++ b/config/traefik/dynamic-staging/middleware.yml @@ -0,0 +1,173 @@ +http: + middlewares: + # Security headers middleware + secure-headers: + headers: + accessControlAllowMethods: + - GET + - OPTIONS + - PUT + - POST + - DELETE + accessControlAllowOriginList: + - "https://admin.motovaultpro.com" + - "https://motovaultpro.com" + accessControlMaxAge: 100 + addVaryHeader: true + browserXssFilter: true + contentTypeNosniff: true + forceSTSHeader: true + frameDeny: true + stsIncludeSubdomains: true + stsPreload: true + stsSeconds: 31536000 + customRequestHeaders: + X-Forwarded-Proto: https + + # CORS middleware for API endpoints + cors: + headers: + accessControlAllowCredentials: true + accessControlAllowHeaders: + - "Authorization" + - "Content-Type" + - "X-Requested-With" + - "X-Tenant-ID" + - "X-Request-Id" + accessControlAllowMethods: + - "GET" + - "POST" + - "PUT" + - "DELETE" + - 
"OPTIONS" + accessControlAllowOriginList: + - "https://admin.motovaultpro.com" + - "https://motovaultpro.com" + accessControlMaxAge: 100 + + # API authentication middleware + api-auth: + forwardAuth: + address: "http://admin-backend:3001/auth/verify" + authResponseHeaders: + - "X-Auth-User" + - "X-Auth-Roles" + - "X-Tenant-ID" + authRequestHeaders: + - "Authorization" + - "X-Tenant-ID" + trustForwardHeader: true + + # Platform API authentication middleware + platform-auth: + forwardAuth: + address: "http://admin-backend:3001/auth/verify-platform" + authResponseHeaders: + - "X-Service-Name" + - "X-Auth-Scope" + authRequestHeaders: + - "X-API-Key" + - "Authorization" + trustForwardHeader: true + + # Rate limiting middleware + rate-limit: + rateLimit: + burst: 100 + average: 50 + period: 1m + + # Request/response size limits + size-limit: + buffering: + maxRequestBodyBytes: 26214400 # 25MB + maxResponseBodyBytes: 26214400 # 25MB + + # IP whitelist for development (optional) + local-ips: + ipAllowList: + sourceRange: + - "127.0.0.1/32" + - "10.0.0.0/8" + - "172.16.0.0/12" + - "192.168.0.0/16" + + # Advanced security headers for production + security-headers-strict: + headers: + accessControlAllowCredentials: false + accessControlAllowMethods: + - GET + - POST + - OPTIONS + accessControlAllowOriginList: + - "https://admin.motovaultpro.com" + - "https://motovaultpro.com" + browserXssFilter: true + contentTypeNosniff: true + customRequestHeaders: + X-Forwarded-Proto: https + customResponseHeaders: + X-Frame-Options: DENY + X-Content-Type-Options: nosniff + Referrer-Policy: strict-origin-when-cross-origin + Permissions-Policy: "geolocation=(), microphone=(), camera=()" + forceSTSHeader: true + frameDeny: true + stsIncludeSubdomains: true + stsPreload: true + stsSeconds: 31536000 + + # Circuit breaker for reliability + circuit-breaker: + circuitBreaker: + expression: "NetworkErrorRatio() > 0.3 || ResponseCodeRatio(500, 600, 0, 600) > 0.3" + checkPeriod: 30s + 
fallbackDuration: 10s + recoveryDuration: 30s + + # Request retry for resilience + retry-policy: + retry: + attempts: 3 + initialInterval: 100ms + + # Compress responses for performance + compression: + compress: {} + + # Health check middleware chain + health-check-chain: + chain: + middlewares: + - compression + - secure-headers + + # API middleware chain + api-chain: + chain: + middlewares: + - compression + - security-headers-strict + - cors + - rate-limit + - api-auth + - retry-policy + + # Platform API middleware chain + platform-chain: + chain: + middlewares: + - compression + - security-headers-strict + - rate-limit + - platform-auth + - circuit-breaker + - retry-policy + + # Public frontend middleware chain + frontend-chain: + chain: + middlewares: + - compression + - secure-headers \ No newline at end of file diff --git a/docker-compose.staging.yml b/docker-compose.staging.yml index df667b3..5702e00 100644 --- a/docker-compose.staging.yml +++ b/docker-compose.staging.yml @@ -15,6 +15,8 @@ services: mvp-traefik: image: ${REGISTRY_MIRRORS:-git.motovaultpro.com/egullickson/mirrors}/traefik:v3.6 container_name: mvp-traefik-staging + volumes: + - ./config/traefik/dynamic-staging:/etc/traefik/dynamic:ro labels: - "traefik.http.routers.traefik-dashboard.rule=Host(`traefik.staging.motovaultpro.com`)" @@ -79,6 +81,20 @@ services: volumes: - mvp_redis_staging_data:/data + # ======================================== + # Grafana (Staging domain override) + # ======================================== + mvp-grafana: + labels: + - "traefik.enable=true" + - "traefik.docker.network=motovaultpro_frontend" + - "traefik.http.routers.grafana.rule=Host(`logs.staging.motovaultpro.com`)" + - "traefik.http.routers.grafana.entrypoints=websecure" + - "traefik.http.routers.grafana.tls=true" + - "traefik.http.routers.grafana.tls.certresolver=letsencrypt" + - "traefik.http.routers.grafana.middlewares=grafana-ipwhitelist@file" + - 
"traefik.http.services.grafana.loadbalancer.server.port=3000" + # Staging-specific volumes (separate from production) volumes: mvp_postgres_staging_data: diff --git a/docker-compose.yml b/docker-compose.yml index 0bef4b2..0d83e30 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -276,10 +276,9 @@ services: networks: - backend healthcheck: - test: ["CMD-SHELL", "wget -q --spider http://localhost:3100/ready || exit 1"] - interval: 30s - timeout: 10s - retries: 3 + # Loki 3.x uses a distroless image with no shell or HTTP client. + # Disable in-container healthcheck; Alloy and Grafana verify connectivity. + disable: true logging: driver: json-file options: @@ -305,7 +304,7 @@ services: depends_on: - mvp-loki healthcheck: - test: ["CMD-SHELL", "wget -q --spider http://localhost:12345/ready || exit 1"] + test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/12345'"] interval: 30s timeout: 10s retries: 3 -- 2.49.1