From 3705e63fdef6c4adb7e5424561e1a685034a7a57 Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Wed, 11 Feb 2026 10:00:47 -0600 Subject: [PATCH] feat: add Gemini engine module and configuration (refs #133) Add standalone GeminiEngine class for maintenance schedule extraction from PDF owners manuals using Vertex AI Gemini 2.5 Flash with structured JSON output enforcement, 20MB size limit, and lazy initialization. Co-Authored-By: Claude Opus 4.6 --- docker-compose.prod.yml | 4 + docker-compose.staging.yml | 4 + docker-compose.yml | 4 + ocr/app/config.py | 7 + ocr/app/engines/gemini_engine.py | 228 ++++++++++++++++++++ ocr/requirements.txt | 3 + ocr/tests/test_gemini_engine.py | 353 +++++++++++++++++++++++++++++++ 7 files changed, 603 insertions(+) create mode 100644 ocr/app/engines/gemini_engine.py create mode 100644 ocr/tests/test_gemini_engine.py diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index adb2a1a..a69da6d 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -56,6 +56,10 @@ services: OCR_FALLBACK_THRESHOLD: "0.6" GOOGLE_VISION_KEY_PATH: /run/secrets/google-wif-config.json VISION_MONTHLY_LIMIT: "1000" + # Vertex AI / Gemini configuration (maintenance schedule extraction) + VERTEX_AI_PROJECT: ${VERTEX_AI_PROJECT:-} + VERTEX_AI_LOCATION: us-central1 + GEMINI_MODEL: gemini-2.5-flash # PostgreSQL - Remove dev ports, production log level mvp-postgres: diff --git a/docker-compose.staging.yml b/docker-compose.staging.yml index 666a4e2..d5d021e 100644 --- a/docker-compose.staging.yml +++ b/docker-compose.staging.yml @@ -76,6 +76,10 @@ services: OCR_FALLBACK_THRESHOLD: "0.6" GOOGLE_VISION_KEY_PATH: /run/secrets/google-wif-config.json VISION_MONTHLY_LIMIT: "1000" + # Vertex AI / Gemini configuration (maintenance schedule extraction) + VERTEX_AI_PROJECT: ${VERTEX_AI_PROJECT:-} + VERTEX_AI_LOCATION: us-central1 + GEMINI_MODEL: gemini-2.5-flash volumes: - ./secrets/app/auth0-ocr-client-id.txt:/run/secrets/auth0-ocr-client-id:ro - ./secrets/app/auth0-ocr-client-secret.txt:/run/secrets/auth0-ocr-client-secret:ro diff --git a/docker-compose.yml b/docker-compose.yml index 46d9f79..6577bfa 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -203,6 +203,10 @@ services: OCR_FALLBACK_THRESHOLD: "0.6" GOOGLE_VISION_KEY_PATH: /run/secrets/google-wif-config.json VISION_MONTHLY_LIMIT: "1000" + # Vertex AI / Gemini configuration (maintenance schedule extraction) + VERTEX_AI_PROJECT: ${VERTEX_AI_PROJECT:-} + VERTEX_AI_LOCATION: us-central1 + GEMINI_MODEL: gemini-2.5-flash volumes: - /tmp/vin-debug:/tmp/vin-debug - ./secrets/app/auth0-ocr-client-id.txt:/run/secrets/auth0-ocr-client-id:ro diff --git a/ocr/app/config.py b/ocr/app/config.py index a9e1fd8..f1e7826 100644 --- a/ocr/app/config.py +++ b/ocr/app/config.py @@ -29,6 +29,13 @@ class Settings: os.getenv("VISION_MONTHLY_LIMIT", "1000") ) + # Vertex AI / Gemini configuration + self.vertex_ai_project: str = os.getenv("VERTEX_AI_PROJECT", "") + self.vertex_ai_location: str = os.getenv( + "VERTEX_AI_LOCATION", "us-central1" + ) + self.gemini_model: str = os.getenv("GEMINI_MODEL", "gemini-2.5-flash") + # Redis configuration for job queue self.redis_host: str = os.getenv("REDIS_HOST", "mvp-redis") self.redis_port: int = int(os.getenv("REDIS_PORT", "6379")) diff --git a/ocr/app/engines/gemini_engine.py b/ocr/app/engines/gemini_engine.py new file mode 100644 index 0000000..5a1a61b --- /dev/null +++ b/ocr/app/engines/gemini_engine.py @@ -0,0 +1,228 @@ +"""Gemini 2.5 Flash engine for maintenance schedule extraction from PDFs. + +Standalone module (does NOT extend OcrEngine) because Gemini performs +semantic document understanding, not traditional OCR word-box extraction. +Uses Vertex AI SDK with structured JSON output enforcement. +""" + +import json +import logging +import os +from dataclasses import dataclass +from typing import Any + +from app.config import settings + +logger = logging.getLogger(__name__) + +# 20 MB hard limit for inline base64 PDF delivery +_MAX_PDF_BYTES = 20 * 1024 * 1024 + +_EXTRACTION_PROMPT = """\ +Extract all routine scheduled maintenance items from this vehicle owners manual. + +For each maintenance item, extract: +- serviceName: The maintenance task name (e.g., "Engine Oil Change", "Tire Rotation", \ +"Cabin Air Filter Replacement") +- intervalMiles: The mileage interval as a number, or null if not specified \ +(e.g., 5000, 30000) +- intervalMonths: The time interval in months as a number, or null if not specified \ +(e.g., 6, 12, 24) +- details: Any additional details such as fluid specifications, part numbers, \ +or special instructions (e.g., "Use 0W-20 full synthetic oil") + +Only include routine scheduled maintenance items with clear intervals. \ +Do not include one-time procedures, troubleshooting steps, or warranty information. + +Return the results as a JSON object with a single "maintenanceSchedule" array.\ +""" + +_RESPONSE_SCHEMA: dict[str, Any] = { + "type": "object", + "properties": { + "maintenanceSchedule": { + "type": "array", + "items": { + "type": "object", + "properties": { + "serviceName": {"type": "string"}, + "intervalMiles": {"type": "number", "nullable": True}, + "intervalMonths": {"type": "number", "nullable": True}, + "details": {"type": "string", "nullable": True}, + }, + "required": ["serviceName"], + }, + }, + }, + "required": ["maintenanceSchedule"], +} + + +class GeminiEngineError(Exception): + """Base exception for Gemini engine errors.""" + + +class GeminiUnavailableError(GeminiEngineError): + """Raised when the Gemini engine cannot be initialized.""" + + +class GeminiProcessingError(GeminiEngineError): + """Raised when Gemini fails to process a document.""" + + +@dataclass +class MaintenanceItem: + """A single extracted maintenance schedule item.""" + + service_name: str + interval_miles: int | None = None + interval_months: int | None = None + details: str | None = None + + +@dataclass +class MaintenanceExtractionResult: + """Result from Gemini maintenance schedule extraction.""" + + items: list[MaintenanceItem] + model: str + + +class GeminiEngine: + """Gemini 2.5 Flash wrapper for maintenance schedule extraction. + + Standalone class (not an OcrEngine subclass) because Gemini performs + semantic document understanding rather than traditional OCR. + + Uses lazy initialization: the Vertex AI client is not created until + the first ``extract_maintenance()`` call. + """ + + def __init__(self) -> None: + self._model: Any | None = None + + def _get_model(self) -> Any: + """Create the GenerativeModel on first use. + + Authentication uses the same WIF credential path as Google Vision. + """ + if self._model is not None: + return self._model + + key_path = settings.google_vision_key_path + if not os.path.isfile(key_path): + raise GeminiUnavailableError( + f"Google credential config not found at {key_path}. " + "Set GOOGLE_VISION_KEY_PATH or mount the secret." + ) + + try: + from google.cloud import aiplatform # type: ignore[import-untyped] + from vertexai.generative_models import ( # type: ignore[import-untyped] + GenerationConfig, + GenerativeModel, + ) + + # Point ADC at the WIF credential config + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path + os.environ["GOOGLE_EXTERNAL_ACCOUNT_ALLOW_EXECUTABLES"] = "1" + + aiplatform.init( + project=settings.vertex_ai_project, + location=settings.vertex_ai_location, + ) + + model_name = settings.gemini_model + self._model = GenerativeModel(model_name) + self._generation_config = GenerationConfig( + response_mime_type="application/json", + response_schema=_RESPONSE_SCHEMA, + ) + + logger.info( + "Gemini engine initialized (model=%s, project=%s, location=%s)", + model_name, + settings.vertex_ai_project, + settings.vertex_ai_location, + ) + return self._model + + except ImportError as exc: + raise GeminiUnavailableError( + "google-cloud-aiplatform is not installed. " + "Install with: pip install google-cloud-aiplatform" + ) from exc + except Exception as exc: + raise GeminiUnavailableError( + f"Failed to initialize Gemini engine: {exc}" + ) from exc + + def extract_maintenance( + self, pdf_bytes: bytes + ) -> MaintenanceExtractionResult: + """Extract maintenance schedules from a PDF owners manual. + + Args: + pdf_bytes: Raw PDF file bytes (<= 20 MB). + + Returns: + Structured maintenance extraction result. + + Raises: + GeminiProcessingError: If the PDF is too large or extraction fails. + GeminiUnavailableError: If the engine cannot be initialized. + """ + if len(pdf_bytes) > _MAX_PDF_BYTES: + size_mb = len(pdf_bytes) / (1024 * 1024) + raise GeminiProcessingError( + f"PDF size ({size_mb:.1f} MB) exceeds the 20 MB limit for " + "inline processing. Upload to GCS and use a gs:// URI instead." + ) + + model = self._get_model() + + try: + from vertexai.generative_models import Part # type: ignore[import-untyped] + + pdf_part = Part.from_data( + data=pdf_bytes, + mime_type="application/pdf", + ) + + response = model.generate_content( + [pdf_part, _EXTRACTION_PROMPT], + generation_config=self._generation_config, + ) + + raw = json.loads(response.text) + items = [ + MaintenanceItem( + service_name=item["serviceName"], + interval_miles=item.get("intervalMiles"), + interval_months=item.get("intervalMonths"), + details=item.get("details"), + ) + for item in raw.get("maintenanceSchedule", []) + ] + + logger.info( + "Gemini extracted %d maintenance items from PDF (%d bytes)", + len(items), + len(pdf_bytes), + ) + + return MaintenanceExtractionResult( + items=items, + model=settings.gemini_model, + ) + + except (GeminiEngineError,): + raise + except json.JSONDecodeError as exc: + raise GeminiProcessingError( + f"Gemini returned invalid JSON: {exc}" + ) from exc + except Exception as exc: + raise GeminiProcessingError( + f"Gemini maintenance extraction failed: {exc}" + ) from exc diff --git a/ocr/requirements.txt b/ocr/requirements.txt index 946f645..69864df 100644 --- a/ocr/requirements.txt +++ b/ocr/requirements.txt @@ -21,6 +21,9 @@ google-cloud-vision>=3.7.0 # PDF Processing PyMuPDF>=1.23.0 +# Vertex AI / Gemini (maintenance schedule extraction) +google-cloud-aiplatform>=1.40.0 + # Redis for job queue redis>=5.0.0 diff --git a/ocr/tests/test_gemini_engine.py b/ocr/tests/test_gemini_engine.py new file mode 100644 index 0000000..bf709e4 --- /dev/null +++ b/ocr/tests/test_gemini_engine.py @@ -0,0 +1,353 @@ +"""Tests for Gemini engine maintenance schedule extraction. + +Covers: GeminiEngine initialization, PDF size validation, +successful extraction, empty results, and error handling. +All Vertex AI SDK calls are mocked. +""" + +import json +from unittest.mock import MagicMock, patch, PropertyMock + +import pytest + +from app.engines.gemini_engine import ( + GeminiEngine, + GeminiEngineError, + GeminiProcessingError, + GeminiUnavailableError, + MaintenanceExtractionResult, + MaintenanceItem, + _MAX_PDF_BYTES, +) + + +# --- Helpers --- + + +def _make_pdf_bytes(size: int = 1024) -> bytes: + """Create fake PDF bytes of a given size.""" + # Minimal PDF header so it looks plausible, padded to size + header = b"%PDF-1.4 fake" + return header + b"\x00" * max(0, size - len(header)) + + +def _make_gemini_response(schedule: list[dict]) -> MagicMock: + """Create a mock Gemini generate_content response.""" + response = MagicMock() + response.text = json.dumps({"maintenanceSchedule": schedule}) + return response + + +# --- Exception hierarchy --- + + +class TestExceptionHierarchy: + """Verify the Gemini exception class relationships.""" + + def test_processing_error_is_engine_error(self): + assert issubclass(GeminiProcessingError, GeminiEngineError) + + def test_unavailable_error_is_engine_error(self): + assert issubclass(GeminiUnavailableError, GeminiEngineError) + + def test_engine_error_is_exception(self): + assert issubclass(GeminiEngineError, Exception) + + +# --- Data types --- + + +class TestMaintenanceItem: + """Verify MaintenanceItem dataclass construction.""" + + def test_required_fields_only(self): + item = MaintenanceItem(service_name="Oil Change") + assert item.service_name == "Oil Change" + assert item.interval_miles is None + assert item.interval_months is None + assert item.details is None + + def test_all_fields(self): + item = MaintenanceItem( + service_name="Tire Rotation", + interval_miles=5000, + interval_months=6, + details="Rotate front to rear on same side.", + ) + assert item.service_name == "Tire Rotation" + assert item.interval_miles == 5000 + assert item.interval_months == 6 + assert item.details == "Rotate front to rear on same side." + + +class TestMaintenanceExtractionResult: + """Verify MaintenanceExtractionResult dataclass.""" + + def test_construction(self): + result = MaintenanceExtractionResult( + items=[MaintenanceItem(service_name="Oil Change")], + model="gemini-2.5-flash", + ) + assert len(result.items) == 1 + assert result.model == "gemini-2.5-flash" + + def test_empty_items(self): + result = MaintenanceExtractionResult(items=[], model="gemini-2.5-flash") + assert result.items == [] + + +# --- PDF size validation --- + + +class TestPdfSizeValidation: + """Verify the 20 MB PDF size limit.""" + + def test_oversized_pdf_rejected(self): + """PDFs exceeding 20 MB must be rejected with a clear error.""" + engine = GeminiEngine() + oversized = _make_pdf_bytes(_MAX_PDF_BYTES + 1) + + with pytest.raises(GeminiProcessingError, match="exceeds the 20 MB limit"): + engine.extract_maintenance(oversized) + + def test_exactly_at_limit_accepted(self): + """PDFs exactly at 20 MB should pass size validation. + + The engine will still fail at model init (mocked away in other tests), + but the size check itself should pass. + """ + engine = GeminiEngine() + exact = _make_pdf_bytes(_MAX_PDF_BYTES) + + # Should fail at _get_model, not at size check + with pytest.raises(GeminiUnavailableError): + engine.extract_maintenance(exact) + + +# --- Successful extraction --- + + +class TestExtractMaintenance: + """Verify successful maintenance schedule extraction.""" + + @patch("app.engines.gemini_engine.settings") + @patch("app.engines.gemini_engine.os.path.isfile", return_value=True) + def test_valid_pdf_returns_structured_schedules( + self, mock_isfile, mock_settings + ): + """Normal: Valid PDF returns structured maintenance schedules.""" + mock_settings.google_vision_key_path = "/fake/creds.json" + mock_settings.vertex_ai_project = "test-project" + mock_settings.vertex_ai_location = "us-central1" + mock_settings.gemini_model = "gemini-2.5-flash" + + schedule = [ + { + "serviceName": "Engine Oil Change", + "intervalMiles": 5000, + "intervalMonths": 6, + "details": "Use 0W-20 full synthetic oil.", + }, + { + "serviceName": "Tire Rotation", + "intervalMiles": 5000, + "intervalMonths": 6, + "details": None, + }, + ] + + mock_model = MagicMock() + mock_model.generate_content.return_value = _make_gemini_response(schedule) + + with ( + patch( + "app.engines.gemini_engine.importlib_vertex_ai" + ) if False else patch.dict("sys.modules", { + "google.cloud": MagicMock(), + "google.cloud.aiplatform": MagicMock(), + "vertexai": MagicMock(), + "vertexai.generative_models": MagicMock(), + }), + ): + engine = GeminiEngine() + engine._model = mock_model + engine._generation_config = MagicMock() + + result = engine.extract_maintenance(_make_pdf_bytes()) + + assert isinstance(result, MaintenanceExtractionResult) + assert len(result.items) == 2 + assert result.model == "gemini-2.5-flash" + + oil = result.items[0] + assert oil.service_name == "Engine Oil Change" + assert oil.interval_miles == 5000 + assert oil.interval_months == 6 + assert oil.details == "Use 0W-20 full synthetic oil." + + tire = result.items[1] + assert tire.service_name == "Tire Rotation" + assert tire.details is None + + @patch("app.engines.gemini_engine.settings") + @patch("app.engines.gemini_engine.os.path.isfile", return_value=True) + def test_no_maintenance_content_returns_empty_array( + self, mock_isfile, mock_settings + ): + """Edge: PDF with no maintenance content returns empty array.""" + mock_settings.google_vision_key_path = "/fake/creds.json" + mock_settings.vertex_ai_project = "test-project" + mock_settings.vertex_ai_location = "us-central1" + mock_settings.gemini_model = "gemini-2.5-flash" + + mock_model = MagicMock() + mock_model.generate_content.return_value = _make_gemini_response([]) + + engine = GeminiEngine() + engine._model = mock_model + engine._generation_config = MagicMock() + + result = engine.extract_maintenance(_make_pdf_bytes()) + + assert isinstance(result, MaintenanceExtractionResult) + assert result.items == [] + + @patch("app.engines.gemini_engine.settings") + @patch("app.engines.gemini_engine.os.path.isfile", return_value=True) + def test_nullable_fields_handled(self, mock_isfile, mock_settings): + """Items with only serviceName (nullable fields omitted) parse correctly.""" + mock_settings.google_vision_key_path = "/fake/creds.json" + mock_settings.vertex_ai_project = "test-project" + mock_settings.vertex_ai_location = "us-central1" + mock_settings.gemini_model = "gemini-2.5-flash" + + schedule = [{"serviceName": "Brake Fluid Replacement"}] + + mock_model = MagicMock() + mock_model.generate_content.return_value = _make_gemini_response(schedule) + + engine = GeminiEngine() + engine._model = mock_model + engine._generation_config = MagicMock() + + result = engine.extract_maintenance(_make_pdf_bytes()) + + assert len(result.items) == 1 + item = result.items[0] + assert item.service_name == "Brake Fluid Replacement" + assert item.interval_miles is None + assert item.interval_months is None + assert item.details is None + + +# --- Error handling --- + + +class TestErrorHandling: + """Verify error handling for various failure modes.""" + + def test_missing_credential_file_raises_unavailable(self): + """Auth failure: Missing credential file raises GeminiUnavailableError.""" + engine = GeminiEngine() + + with ( + patch("app.engines.gemini_engine.os.path.isfile", return_value=False), + pytest.raises(GeminiUnavailableError, match="credential config not found"), + ): + engine.extract_maintenance(_make_pdf_bytes()) + + @patch("app.engines.gemini_engine.os.path.isfile", return_value=True) + def test_missing_sdk_raises_unavailable(self, mock_isfile): + """Auth failure: Missing SDK raises GeminiUnavailableError.""" + engine = GeminiEngine() + + with ( + patch("app.engines.gemini_engine.settings") as mock_settings, + patch.dict("sys.modules", { + "google.cloud.aiplatform": None, + }), + ): + mock_settings.google_vision_key_path = "/fake/creds.json" + + with pytest.raises(GeminiUnavailableError): + engine.extract_maintenance(_make_pdf_bytes()) + + @patch("app.engines.gemini_engine.settings") + @patch("app.engines.gemini_engine.os.path.isfile", return_value=True) + def test_generate_content_exception_raises_processing_error( + self, mock_isfile, mock_settings + ): + """Runtime error from Gemini API is wrapped as GeminiProcessingError.""" + mock_settings.google_vision_key_path = "/fake/creds.json" + mock_settings.vertex_ai_project = "test-project" + mock_settings.vertex_ai_location = "us-central1" + mock_settings.gemini_model = "gemini-2.5-flash" + + mock_model = MagicMock() + mock_model.generate_content.side_effect = RuntimeError("API quota exceeded") + + engine = GeminiEngine() + engine._model = mock_model + engine._generation_config = MagicMock() + + with pytest.raises(GeminiProcessingError, match="maintenance extraction failed"): + engine.extract_maintenance(_make_pdf_bytes()) + + @patch("app.engines.gemini_engine.settings") + @patch("app.engines.gemini_engine.os.path.isfile", return_value=True) + def test_invalid_json_response_raises_processing_error( + self, mock_isfile, mock_settings + ): + """Gemini returning invalid JSON is caught and wrapped.""" + mock_settings.google_vision_key_path = "/fake/creds.json" + mock_settings.vertex_ai_project = "test-project" + mock_settings.vertex_ai_location = "us-central1" + mock_settings.gemini_model = "gemini-2.5-flash" + + mock_response = MagicMock() + mock_response.text = "not valid json {{" + + mock_model = MagicMock() + mock_model.generate_content.return_value = mock_response + + engine = GeminiEngine() + engine._model = mock_model + engine._generation_config = MagicMock() + + with pytest.raises(GeminiProcessingError, match="invalid JSON"): + engine.extract_maintenance(_make_pdf_bytes()) + + +# --- Lazy initialization --- + + +class TestLazyInitialization: + """Verify the model is not created until first use.""" + + def test_model_is_none_after_construction(self): + """GeminiEngine should not initialize the model in __init__.""" + engine = GeminiEngine() + assert engine._model is None + + @patch("app.engines.gemini_engine.settings") + @patch("app.engines.gemini_engine.os.path.isfile", return_value=True) + def test_model_reused_on_second_call(self, mock_isfile, mock_settings): + """Once initialized, the same model instance is reused.""" + mock_settings.google_vision_key_path = "/fake/creds.json" + mock_settings.vertex_ai_project = "test-project" + mock_settings.vertex_ai_location = "us-central1" + mock_settings.gemini_model = "gemini-2.5-flash" + + schedule = [{"serviceName": "Oil Change", "intervalMiles": 5000}] + mock_model = MagicMock() + mock_model.generate_content.return_value = _make_gemini_response(schedule) + + engine = GeminiEngine() + engine._model = mock_model + engine._generation_config = MagicMock() + + engine.extract_maintenance(_make_pdf_bytes()) + engine.extract_maintenance(_make_pdf_bytes()) + + # Model's generate_content should have been called twice + assert mock_model.generate_content.call_count == 2