From b7f472b3e85cc2dca8eae701c7cc9a55fd0645ce Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Sat, 28 Feb 2026 11:16:18 -0600 Subject: [PATCH] feat: migrate GeminiEngine to google-genai SDK with Google Search grounding (refs #233) Replace vertexai.generative_models with google.genai client pattern. Add Google Search grounding tool to VIN decode for improved accuracy. Convert response schema types to uppercase per Vertex AI Schema spec. Co-Authored-By: Claude Opus 4.6 --- ocr/app/config.py | 2 +- ocr/app/engines/gemini_engine.py | 120 +++++++++++++++---------------- 2 files changed, 59 insertions(+), 63 deletions(-) diff --git a/ocr/app/config.py b/ocr/app/config.py index c1f1041..5784753 100644 --- a/ocr/app/config.py +++ b/ocr/app/config.py @@ -29,7 +29,7 @@ class Settings: os.getenv("VISION_MONTHLY_LIMIT", "1000") ) - # Vertex AI / Gemini configuration + # Google GenAI / Gemini configuration self.vertex_ai_project: str = os.getenv("VERTEX_AI_PROJECT", "") self.vertex_ai_location: str = os.getenv( "VERTEX_AI_LOCATION", "global" diff --git a/ocr/app/engines/gemini_engine.py b/ocr/app/engines/gemini_engine.py index 6f2c556..e6f4cd5 100644 --- a/ocr/app/engines/gemini_engine.py +++ b/ocr/app/engines/gemini_engine.py @@ -2,7 +2,7 @@ Standalone module (does NOT extend OcrEngine) because Gemini performs semantic document understanding, not traditional OCR word-box extraction. -Uses Vertex AI SDK with structured JSON output enforcement. +Uses google-genai SDK with structured JSON output enforcement. """ import json @@ -117,34 +117,34 @@ Return the vehicle's make, model, trim level, body type, drive type, fuel type, """ _VIN_DECODE_SCHEMA: dict[str, Any] = { - "type": "object", + "type": "OBJECT", "properties": { - "year": {"type": "integer", "nullable": True}, - "make": {"type": "string", "nullable": True}, - "model": {"type": "string", "nullable": True}, - "trimLevel": {"type": "string", "nullable": True}, - "bodyType": {"type": "string", "nullable": True}, - "driveType": {"type": "string", "nullable": True}, - "fuelType": {"type": "string", "nullable": True}, - "engine": {"type": "string", "nullable": True}, - "transmission": {"type": "string", "nullable": True}, - "confidence": {"type": "number"}, + "year": {"type": "INTEGER", "nullable": True}, + "make": {"type": "STRING", "nullable": True}, + "model": {"type": "STRING", "nullable": True}, + "trimLevel": {"type": "STRING", "nullable": True}, + "bodyType": {"type": "STRING", "nullable": True}, + "driveType": {"type": "STRING", "nullable": True}, + "fuelType": {"type": "STRING", "nullable": True}, + "engine": {"type": "STRING", "nullable": True}, + "transmission": {"type": "STRING", "nullable": True}, + "confidence": {"type": "NUMBER"}, }, "required": ["confidence"], } _RESPONSE_SCHEMA: dict[str, Any] = { - "type": "object", + "type": "OBJECT", "properties": { "maintenanceSchedule": { - "type": "array", + "type": "ARRAY", "items": { - "type": "object", + "type": "OBJECT", "properties": { - "serviceName": {"type": "string"}, - "intervalMiles": {"type": "number", "nullable": True}, - "intervalMonths": {"type": "number", "nullable": True}, - "details": {"type": "string", "nullable": True}, + "serviceName": {"type": "STRING"}, + "intervalMiles": {"type": "NUMBER", "nullable": True}, + "intervalMonths": {"type": "NUMBER", "nullable": True}, + "details": {"type": "STRING", "nullable": True}, }, "required": ["serviceName"], }, @@ -206,20 +206,21 @@ class GeminiEngine: Standalone class (not an OcrEngine subclass) because Gemini performs semantic document understanding rather than traditional OCR. - Uses lazy initialization: the Vertex AI client is not created until + Uses lazy initialization: the Gemini client is not created until the first call to ``extract_maintenance()`` or ``decode_vin()``. """ def __init__(self) -> None: - self._model: Any | None = None + self._client: Any | None = None + self._model_name: str = "" - def _get_model(self) -> Any: - """Create the GenerativeModel on first use. + def _get_client(self) -> Any: + """Create the genai.Client on first use. Authentication uses the same WIF credential path as Google Vision. """ - if self._model is not None: - return self._model + if self._client is not None: + return self._client key_path = settings.google_vision_key_path if not os.path.isfile(key_path): @@ -229,46 +230,37 @@ class GeminiEngine: ) try: - from google.cloud import aiplatform # type: ignore[import-untyped] - from vertexai.generative_models import ( # type: ignore[import-untyped] - GenerationConfig, - GenerativeModel, - ) + from google import genai # type: ignore[import-untyped] - # Point ADC at the WIF credential config + # Point ADC at the WIF credential config (must be set BEFORE Client construction) os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path os.environ["GOOGLE_EXTERNAL_ACCOUNT_ALLOW_EXECUTABLES"] = "1" - aiplatform.init( + self._client = genai.Client( + vertexai=True, project=settings.vertex_ai_project, location=settings.vertex_ai_location, ) - - model_name = settings.gemini_model - self._model = GenerativeModel(model_name) - self._generation_config = GenerationConfig( - response_mime_type="application/json", - response_schema=_RESPONSE_SCHEMA, - ) + self._model_name = settings.gemini_model logger.info( "Gemini engine initialized (model=%s, project=%s, location=%s)", - model_name, + self._model_name, settings.vertex_ai_project, settings.vertex_ai_location, ) - return self._model + return self._client except ImportError as exc: - logger.exception("Vertex AI SDK import failed") + logger.exception("google-genai SDK import failed") raise GeminiUnavailableError( - "google-cloud-aiplatform is not installed. " - "Install with: pip install google-cloud-aiplatform" + "google-genai is not installed. " + "Install with: pip install google-genai" ) from exc except Exception as exc: - logger.exception("Vertex AI authentication failed") + logger.exception("Gemini authentication failed: %s", type(exc).__name__) raise GeminiUnavailableError( - f"Vertex AI authentication failed: {exc}" + f"Gemini authentication failed: {exc}" ) from exc def extract_maintenance( @@ -293,19 +285,23 @@ class GeminiEngine: "inline processing. Upload to GCS and use a gs:// URI instead." ) - model = self._get_model() + client = self._get_client() try: - from vertexai.generative_models import Part # type: ignore[import-untyped] + from google.genai import types # type: ignore[import-untyped] - pdf_part = Part.from_data( + pdf_part = types.Part.from_bytes( data=pdf_bytes, mime_type="application/pdf", ) - response = model.generate_content( - [pdf_part, _EXTRACTION_PROMPT], - generation_config=self._generation_config, + response = client.models.generate_content( + model=self._model_name, + contents=[pdf_part, _EXTRACTION_PROMPT], + config=types.GenerateContentConfig( + response_mime_type="application/json", + response_schema=_RESPONSE_SCHEMA, + ), ) raw = json.loads(response.text) @@ -358,7 +354,7 @@ class GeminiEngine: GeminiProcessingError: If Gemini fails to decode the VIN. GeminiUnavailableError: If the engine cannot be initialized. """ - model = self._get_model() + client = self._get_client() # Resolve year deterministically from VIN structure resolved_year = resolve_vin_year(vin) @@ -371,21 +367,21 @@ class GeminiEngine: ) try: - from vertexai.generative_models import GenerationConfig # type: ignore[import-untyped] - - vin_config = GenerationConfig( - response_mime_type="application/json", - response_schema=_VIN_DECODE_SCHEMA, - ) + from google.genai import types # type: ignore[import-untyped] prompt = _VIN_DECODE_PROMPT.format( vin=vin, year=resolved_year or "unknown", year_code=year_code, ) - response = model.generate_content( - [prompt], - generation_config=vin_config, + response = client.models.generate_content( + model=self._model_name, + contents=[prompt], + config=types.GenerateContentConfig( + response_mime_type="application/json", + response_schema=_VIN_DECODE_SCHEMA, + tools=[types.Tool(google_search=types.GoogleSearch())], + ), ) raw = json.loads(response.text)