feat: Expand OCR with fuel receipt scanning and maintenance extraction (#129) #147

Merged
egullickson merged 26 commits from issue-129-expand-ocr-fuel-receipt-maintenance into main 2026-02-13 02:25:55 +00:00
7 changed files with 603 additions and 0 deletions
Showing only changes of commit 3705e63fde - Show all commits

View File

@@ -56,6 +56,10 @@ services:
OCR_FALLBACK_THRESHOLD: "0.6"
GOOGLE_VISION_KEY_PATH: /run/secrets/google-wif-config.json
VISION_MONTHLY_LIMIT: "1000"
# Vertex AI / Gemini configuration (maintenance schedule extraction)
VERTEX_AI_PROJECT: ${VERTEX_AI_PROJECT:-}
VERTEX_AI_LOCATION: us-central1
GEMINI_MODEL: gemini-2.5-flash
# PostgreSQL - Remove dev ports, production log level
mvp-postgres:

View File

@@ -76,6 +76,10 @@ services:
OCR_FALLBACK_THRESHOLD: "0.6"
GOOGLE_VISION_KEY_PATH: /run/secrets/google-wif-config.json
VISION_MONTHLY_LIMIT: "1000"
# Vertex AI / Gemini configuration (maintenance schedule extraction)
VERTEX_AI_PROJECT: ${VERTEX_AI_PROJECT:-}
VERTEX_AI_LOCATION: us-central1
GEMINI_MODEL: gemini-2.5-flash
volumes:
- ./secrets/app/auth0-ocr-client-id.txt:/run/secrets/auth0-ocr-client-id:ro
- ./secrets/app/auth0-ocr-client-secret.txt:/run/secrets/auth0-ocr-client-secret:ro

View File

@@ -203,6 +203,10 @@ services:
OCR_FALLBACK_THRESHOLD: "0.6"
GOOGLE_VISION_KEY_PATH: /run/secrets/google-wif-config.json
VISION_MONTHLY_LIMIT: "1000"
# Vertex AI / Gemini configuration (maintenance schedule extraction)
VERTEX_AI_PROJECT: ${VERTEX_AI_PROJECT:-}
VERTEX_AI_LOCATION: us-central1
GEMINI_MODEL: gemini-2.5-flash
volumes:
- /tmp/vin-debug:/tmp/vin-debug
- ./secrets/app/auth0-ocr-client-id.txt:/run/secrets/auth0-ocr-client-id:ro

View File

@@ -29,6 +29,13 @@ class Settings:
os.getenv("VISION_MONTHLY_LIMIT", "1000")
)
# Vertex AI / Gemini configuration
self.vertex_ai_project: str = os.getenv("VERTEX_AI_PROJECT", "")
self.vertex_ai_location: str = os.getenv(
"VERTEX_AI_LOCATION", "us-central1"
)
self.gemini_model: str = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")
# Redis configuration for job queue
self.redis_host: str = os.getenv("REDIS_HOST", "mvp-redis")
self.redis_port: int = int(os.getenv("REDIS_PORT", "6379"))

View File

@@ -0,0 +1,228 @@
"""Gemini 2.5 Flash engine for maintenance schedule extraction from PDFs.
Standalone module (does NOT extend OcrEngine) because Gemini performs
semantic document understanding, not traditional OCR word-box extraction.
Uses Vertex AI SDK with structured JSON output enforcement.
"""
import json
import logging
import os
from dataclasses import dataclass
from typing import Any
from app.config import settings
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# 20 MB hard limit for inline base64 PDF delivery
_MAX_PDF_BYTES = 20 * 1024 * 1024
# Instruction text sent to the model together with the PDF part (see
# GeminiEngine.extract_maintenance). The field names listed here mirror the
# property names declared in _RESPONSE_SCHEMA below.
_EXTRACTION_PROMPT = """\
Extract all routine scheduled maintenance items from this vehicle owners manual.
For each maintenance item, extract:
- serviceName: The maintenance task name (e.g., "Engine Oil Change", "Tire Rotation", \
"Cabin Air Filter Replacement")
- intervalMiles: The mileage interval as a number, or null if not specified \
(e.g., 5000, 30000)
- intervalMonths: The time interval in months as a number, or null if not specified \
(e.g., 6, 12, 24)
- details: Any additional details such as fluid specifications, part numbers, \
or special instructions (e.g., "Use 0W-20 full synthetic oil")
Only include routine scheduled maintenance items with clear intervals. \
Do not include one-time procedures, troubleshooting steps, or warranty information.
Return the results as a JSON object with a single "maintenanceSchedule" array.\
"""
# Structured-output schema handed to GenerationConfig(response_schema=...).
# The top-level object must contain "maintenanceSchedule"; each array entry
# must contain "serviceName", while the remaining properties are nullable.
# NOTE(review): the interval properties are typed "number", whereas
# MaintenanceItem annotates them as int -- confirm whether fractional values
# can actually be returned.
_RESPONSE_SCHEMA: dict[str, Any] = {
    "type": "object",
    "properties": {
        "maintenanceSchedule": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "serviceName": {"type": "string"},
                    "intervalMiles": {"type": "number", "nullable": True},
                    "intervalMonths": {"type": "number", "nullable": True},
                    "details": {"type": "string", "nullable": True},
                },
                "required": ["serviceName"],
            },
        },
    },
    "required": ["maintenanceSchedule"],
}
class GeminiEngineError(Exception):
    """Root of the Gemini engine exception hierarchy."""


class GeminiUnavailableError(GeminiEngineError):
    """Signals that the Gemini engine could not be initialized."""


class GeminiProcessingError(GeminiEngineError):
    """Signals that Gemini failed while processing a document."""
@dataclass
class MaintenanceItem:
"""A single extracted maintenance schedule item."""
service_name: str
interval_miles: int | None = None
interval_months: int | None = None
details: str | None = None
@dataclass
class MaintenanceExtractionResult:
    """Outcome of a Gemini maintenance schedule extraction."""

    # Extracted maintenance entries; may be empty.
    items: list[MaintenanceItem]
    # Name of the model reported for this extraction.
    model: str
class GeminiEngine:
"""Gemini 2.5 Flash wrapper for maintenance schedule extraction.
Standalone class (not an OcrEngine subclass) because Gemini performs
semantic document understanding rather than traditional OCR.
Uses lazy initialization: the Vertex AI client is not created until
the first ``extract_maintenance()`` call.
"""
def __init__(self) -> None:
self._model: Any | None = None
def _get_model(self) -> Any:
"""Create the GenerativeModel on first use.
Authentication uses the same WIF credential path as Google Vision.
"""
if self._model is not None:
return self._model
key_path = settings.google_vision_key_path
if not os.path.isfile(key_path):
raise GeminiUnavailableError(
f"Google credential config not found at {key_path}. "
"Set GOOGLE_VISION_KEY_PATH or mount the secret."
)
try:
from google.cloud import aiplatform # type: ignore[import-untyped]
from vertexai.generative_models import ( # type: ignore[import-untyped]
GenerationConfig,
GenerativeModel,
)
# Point ADC at the WIF credential config
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
os.environ["GOOGLE_EXTERNAL_ACCOUNT_ALLOW_EXECUTABLES"] = "1"
aiplatform.init(
project=settings.vertex_ai_project,
location=settings.vertex_ai_location,
)
model_name = settings.gemini_model
self._model = GenerativeModel(model_name)
self._generation_config = GenerationConfig(
response_mime_type="application/json",
response_schema=_RESPONSE_SCHEMA,
)
logger.info(
"Gemini engine initialized (model=%s, project=%s, location=%s)",
model_name,
settings.vertex_ai_project,
settings.vertex_ai_location,
)
return self._model
except ImportError as exc:
raise GeminiUnavailableError(
"google-cloud-aiplatform is not installed. "
"Install with: pip install google-cloud-aiplatform"
) from exc
except Exception as exc:
raise GeminiUnavailableError(
f"Failed to initialize Gemini engine: {exc}"
) from exc
def extract_maintenance(
self, pdf_bytes: bytes
) -> MaintenanceExtractionResult:
"""Extract maintenance schedules from a PDF owners manual.
Args:
pdf_bytes: Raw PDF file bytes (<= 20 MB).
Returns:
Structured maintenance extraction result.
Raises:
GeminiProcessingError: If the PDF is too large or extraction fails.
GeminiUnavailableError: If the engine cannot be initialized.
"""
if len(pdf_bytes) > _MAX_PDF_BYTES:
size_mb = len(pdf_bytes) / (1024 * 1024)
raise GeminiProcessingError(
f"PDF size ({size_mb:.1f} MB) exceeds the 20 MB limit for "
"inline processing. Upload to GCS and use a gs:// URI instead."
)
model = self._get_model()
try:
from vertexai.generative_models import Part # type: ignore[import-untyped]
pdf_part = Part.from_data(
data=pdf_bytes,
mime_type="application/pdf",
)
response = model.generate_content(
[pdf_part, _EXTRACTION_PROMPT],
generation_config=self._generation_config,
)
raw = json.loads(response.text)
items = [
MaintenanceItem(
service_name=item["serviceName"],
interval_miles=item.get("intervalMiles"),
interval_months=item.get("intervalMonths"),
details=item.get("details"),
)
for item in raw.get("maintenanceSchedule", [])
]
logger.info(
"Gemini extracted %d maintenance items from PDF (%d bytes)",
len(items),
len(pdf_bytes),
)
return MaintenanceExtractionResult(
items=items,
model=settings.gemini_model,
)
except (GeminiEngineError,):
raise
except json.JSONDecodeError as exc:
raise GeminiProcessingError(
f"Gemini returned invalid JSON: {exc}"
) from exc
except Exception as exc:
raise GeminiProcessingError(
f"Gemini maintenance extraction failed: {exc}"
) from exc

View File

@@ -21,6 +21,9 @@ google-cloud-vision>=3.7.0
# PDF Processing
PyMuPDF>=1.23.0
# Vertex AI / Gemini (maintenance schedule extraction)
google-cloud-aiplatform>=1.40.0
# Redis for job queue
redis>=5.0.0

View File

@@ -0,0 +1,353 @@
"""Tests for Gemini engine maintenance schedule extraction.
Covers: GeminiEngine initialization, PDF size validation,
successful extraction, empty results, and error handling.
All Vertex AI SDK calls are mocked.
"""
import json
from unittest.mock import MagicMock, patch, PropertyMock
import pytest
from app.engines.gemini_engine import (
GeminiEngine,
GeminiEngineError,
GeminiProcessingError,
GeminiUnavailableError,
MaintenanceExtractionResult,
MaintenanceItem,
_MAX_PDF_BYTES,
)
# --- Helpers ---
def _make_pdf_bytes(size: int = 1024) -> bytes:
    """Build a fake PDF payload padded with NUL bytes up to ``size`` bytes.

    The payload always begins with a minimal ``%PDF-1.4`` header so it looks
    plausible. When ``size`` is smaller than the header, the header is
    returned unpadded.
    """
    pdf_header = b"%PDF-1.4 fake"
    # ljust never truncates, so short (or negative) sizes yield the header.
    return pdf_header.ljust(size, b"\x00")
def _make_gemini_response(schedule: list[dict]) -> MagicMock:
    """Build a stand-in for a Gemini ``generate_content`` response.

    The returned mock exposes ``text`` holding ``schedule`` serialized as
    JSON under the ``maintenanceSchedule`` key.
    """
    payload = {"maintenanceSchedule": schedule}
    return MagicMock(text=json.dumps(payload))
# --- Exception hierarchy ---
class TestExceptionHierarchy:
    """Check how the Gemini exception classes derive from one another."""

    def test_processing_error_is_engine_error(self):
        """Processing failures are catchable as the engine base error."""
        derived, base = GeminiProcessingError, GeminiEngineError
        assert issubclass(derived, base)

    def test_unavailable_error_is_engine_error(self):
        """Availability failures are catchable as the engine base error."""
        derived, base = GeminiUnavailableError, GeminiEngineError
        assert issubclass(derived, base)

    def test_engine_error_is_exception(self):
        """The engine base error is a regular ``Exception``."""
        derived, base = GeminiEngineError, Exception
        assert issubclass(derived, base)
# --- Data types ---
class TestMaintenanceItem:
    """Exercise construction of the MaintenanceItem dataclass."""

    def test_required_fields_only(self):
        """Optional fields fall back to ``None`` when only the name is given."""
        oil_change = MaintenanceItem(service_name="Oil Change")

        assert oil_change.service_name == "Oil Change"
        for optional in ("interval_miles", "interval_months", "details"):
            assert getattr(oil_change, optional) is None

    def test_all_fields(self):
        """Every field supplied to the constructor is stored unchanged."""
        expected = {
            "service_name": "Tire Rotation",
            "interval_miles": 5000,
            "interval_months": 6,
            "details": "Rotate front to rear on same side.",
        }
        rotation = MaintenanceItem(**expected)

        for field_name, value in expected.items():
            assert getattr(rotation, field_name) == value
class TestMaintenanceExtractionResult:
    """Exercise construction of the MaintenanceExtractionResult dataclass."""

    def test_construction(self):
        """Items and model name are stored as given."""
        only_item = MaintenanceItem(service_name="Oil Change")
        result = MaintenanceExtractionResult(
            items=[only_item], model="gemini-2.5-flash"
        )

        assert len(result.items) == 1
        assert result.model == "gemini-2.5-flash"

    def test_empty_items(self):
        """An empty item list is a valid result."""
        empty_result = MaintenanceExtractionResult(
            items=[], model="gemini-2.5-flash"
        )

        assert empty_result.items == []
# --- PDF size validation ---
class TestPdfSizeValidation:
    """Verify the 20 MB PDF size limit."""

    def test_oversized_pdf_rejected(self):
        """PDFs exceeding 20 MB must be rejected with a clear error."""
        engine = GeminiEngine()
        oversized = _make_pdf_bytes(_MAX_PDF_BYTES + 1)

        with pytest.raises(GeminiProcessingError, match="exceeds the 20 MB limit"):
            engine.extract_maintenance(oversized)

    def test_exactly_at_limit_accepted(self):
        """PDFs exactly at 20 MB should pass size validation.

        The credential lookup is forced to fail so the call stops at model
        initialization. Reaching ``GeminiUnavailableError`` (rather than
        ``GeminiProcessingError``) proves the size check itself passed.
        """
        engine = GeminiEngine()
        exact = _make_pdf_bytes(_MAX_PDF_BYTES)

        # Patch the credential check so the outcome does not depend on
        # whether a real credential file happens to exist on this machine
        # (which would otherwise trigger a real SDK initialization).
        with (
            patch("app.engines.gemini_engine.os.path.isfile", return_value=False),
            pytest.raises(GeminiUnavailableError),
        ):
            engine.extract_maintenance(exact)
# --- Successful extraction ---
class TestExtractMaintenance:
    """Verify successful maintenance schedule extraction."""

    @staticmethod
    def _configure_settings(mock_settings: MagicMock) -> None:
        """Populate the patched settings object with test values."""
        mock_settings.google_vision_key_path = "/fake/creds.json"
        mock_settings.vertex_ai_project = "test-project"
        mock_settings.vertex_ai_location = "us-central1"
        mock_settings.gemini_model = "gemini-2.5-flash"

    @staticmethod
    def _stub_vertex_sdk():
        """Return a context manager replacing the Vertex AI SDK modules.

        ``extract_maintenance`` imports ``Part`` from
        ``vertexai.generative_models`` at call time; stubbing the modules
        keeps every test independent of the real SDK being installed.
        """
        return patch.dict(
            "sys.modules",
            {
                "google.cloud": MagicMock(),
                "google.cloud.aiplatform": MagicMock(),
                "vertexai": MagicMock(),
                "vertexai.generative_models": MagicMock(),
            },
        )

    @staticmethod
    def _engine_with_model(mock_model: MagicMock) -> GeminiEngine:
        """Create an engine whose model is pre-injected (no lazy init)."""
        engine = GeminiEngine()
        engine._model = mock_model
        engine._generation_config = MagicMock()
        return engine

    @patch("app.engines.gemini_engine.settings")
    @patch("app.engines.gemini_engine.os.path.isfile", return_value=True)
    def test_valid_pdf_returns_structured_schedules(
        self, mock_isfile, mock_settings
    ):
        """Normal: Valid PDF returns structured maintenance schedules."""
        self._configure_settings(mock_settings)
        schedule = [
            {
                "serviceName": "Engine Oil Change",
                "intervalMiles": 5000,
                "intervalMonths": 6,
                "details": "Use 0W-20 full synthetic oil.",
            },
            {
                "serviceName": "Tire Rotation",
                "intervalMiles": 5000,
                "intervalMonths": 6,
                "details": None,
            },
        ]
        mock_model = MagicMock()
        mock_model.generate_content.return_value = _make_gemini_response(schedule)

        with self._stub_vertex_sdk():
            engine = self._engine_with_model(mock_model)
            result = engine.extract_maintenance(_make_pdf_bytes())

        assert isinstance(result, MaintenanceExtractionResult)
        assert len(result.items) == 2
        assert result.model == "gemini-2.5-flash"

        oil = result.items[0]
        assert oil.service_name == "Engine Oil Change"
        assert oil.interval_miles == 5000
        assert oil.interval_months == 6
        assert oil.details == "Use 0W-20 full synthetic oil."

        tire = result.items[1]
        assert tire.service_name == "Tire Rotation"
        assert tire.details is None

    @patch("app.engines.gemini_engine.settings")
    @patch("app.engines.gemini_engine.os.path.isfile", return_value=True)
    def test_no_maintenance_content_returns_empty_array(
        self, mock_isfile, mock_settings
    ):
        """Edge: PDF with no maintenance content returns empty array."""
        self._configure_settings(mock_settings)
        mock_model = MagicMock()
        mock_model.generate_content.return_value = _make_gemini_response([])

        with self._stub_vertex_sdk():
            engine = self._engine_with_model(mock_model)
            result = engine.extract_maintenance(_make_pdf_bytes())

        assert isinstance(result, MaintenanceExtractionResult)
        assert result.items == []

    @patch("app.engines.gemini_engine.settings")
    @patch("app.engines.gemini_engine.os.path.isfile", return_value=True)
    def test_nullable_fields_handled(self, mock_isfile, mock_settings):
        """Items with only serviceName (nullable fields omitted) parse correctly."""
        self._configure_settings(mock_settings)
        schedule = [{"serviceName": "Brake Fluid Replacement"}]
        mock_model = MagicMock()
        mock_model.generate_content.return_value = _make_gemini_response(schedule)

        with self._stub_vertex_sdk():
            engine = self._engine_with_model(mock_model)
            result = engine.extract_maintenance(_make_pdf_bytes())

        assert len(result.items) == 1
        item = result.items[0]
        assert item.service_name == "Brake Fluid Replacement"
        assert item.interval_miles is None
        assert item.interval_months is None
        assert item.details is None
# --- Error handling ---
class TestErrorHandling:
    """Exercise the failure paths of ``GeminiEngine.extract_maintenance``."""

    @staticmethod
    def _fill_settings(fake_settings):
        """Give the patched settings object the values the engine reads."""
        fake_settings.google_vision_key_path = "/fake/creds.json"
        fake_settings.vertex_ai_project = "test-project"
        fake_settings.vertex_ai_location = "us-central1"
        fake_settings.gemini_model = "gemini-2.5-flash"

    @staticmethod
    def _engine_around(fake_model):
        """Return an engine with ``fake_model`` injected in place of lazy init."""
        engine = GeminiEngine()
        engine._model = fake_model
        engine._generation_config = MagicMock()
        return engine

    def test_missing_credential_file_raises_unavailable(self):
        """Auth failure: Missing credential file raises GeminiUnavailableError."""
        engine = GeminiEngine()
        no_credentials = patch(
            "app.engines.gemini_engine.os.path.isfile", return_value=False
        )
        with no_credentials:
            with pytest.raises(
                GeminiUnavailableError, match="credential config not found"
            ):
                engine.extract_maintenance(_make_pdf_bytes())

    @patch("app.engines.gemini_engine.os.path.isfile", return_value=True)
    def test_missing_sdk_raises_unavailable(self, mock_isfile):
        """Auth failure: Missing SDK raises GeminiUnavailableError."""
        engine = GeminiEngine()
        # A ``None`` entry in sys.modules makes importing that module fail.
        hidden_sdk = {"google.cloud.aiplatform": None}
        with patch("app.engines.gemini_engine.settings") as fake_settings:
            with patch.dict("sys.modules", hidden_sdk):
                fake_settings.google_vision_key_path = "/fake/creds.json"
                with pytest.raises(GeminiUnavailableError):
                    engine.extract_maintenance(_make_pdf_bytes())

    @patch("app.engines.gemini_engine.settings")
    @patch("app.engines.gemini_engine.os.path.isfile", return_value=True)
    def test_generate_content_exception_raises_processing_error(
        self, mock_isfile, mock_settings
    ):
        """Runtime error from Gemini API is wrapped as GeminiProcessingError."""
        self._fill_settings(mock_settings)
        failing_model = MagicMock()
        failing_model.generate_content.side_effect = RuntimeError(
            "API quota exceeded"
        )
        engine = self._engine_around(failing_model)

        with pytest.raises(
            GeminiProcessingError, match="maintenance extraction failed"
        ):
            engine.extract_maintenance(_make_pdf_bytes())

    @patch("app.engines.gemini_engine.settings")
    @patch("app.engines.gemini_engine.os.path.isfile", return_value=True)
    def test_invalid_json_response_raises_processing_error(
        self, mock_isfile, mock_settings
    ):
        """Gemini returning invalid JSON is caught and wrapped."""
        self._fill_settings(mock_settings)
        garbled_response = MagicMock()
        garbled_response.text = "not valid json {{"
        talking_model = MagicMock()
        talking_model.generate_content.return_value = garbled_response
        engine = self._engine_around(talking_model)

        with pytest.raises(GeminiProcessingError, match="invalid JSON"):
            engine.extract_maintenance(_make_pdf_bytes())
# --- Lazy initialization ---
class TestLazyInitialization:
    """Verify the model is not created until first use."""

    def test_model_is_none_after_construction(self):
        """GeminiEngine should not initialize the model in __init__."""
        engine = GeminiEngine()
        assert engine._model is None

    @patch("app.engines.gemini_engine.settings")
    @patch("app.engines.gemini_engine.os.path.isfile", return_value=True)
    def test_model_reused_on_second_call(self, mock_isfile, mock_settings):
        """Once initialized, the same model instance is reused."""
        mock_settings.google_vision_key_path = "/fake/creds.json"
        mock_settings.vertex_ai_project = "test-project"
        mock_settings.vertex_ai_location = "us-central1"
        mock_settings.gemini_model = "gemini-2.5-flash"
        schedule = [{"serviceName": "Oil Change", "intervalMiles": 5000}]
        mock_model = MagicMock()
        mock_model.generate_content.return_value = _make_gemini_response(schedule)

        engine = GeminiEngine()
        engine._model = mock_model
        engine._generation_config = MagicMock()

        engine.extract_maintenance(_make_pdf_bytes())
        engine.extract_maintenance(_make_pdf_bytes())

        # Both calls must have gone through the one injected model...
        assert mock_model.generate_content.call_count == 2
        # ...and the engine must still hold that very instance, i.e. it was
        # never replaced by a fresh initialization between calls.
        assert engine._model is mock_model