motovaultpro/ocr/app/engines/gemini_engine.py

"""Gemini 2.5 Flash engine for document understanding and VIN decode.

Standalone module (does NOT extend OcrEngine) because Gemini performs
semantic document understanding, not traditional OCR word-box extraction.
Uses google-genai SDK with structured JSON output enforcement.
"""

import json
import logging
import os
from dataclasses import dataclass
from datetime import datetime
from typing import Any

from app.config import settings

logger = logging.getLogger(__name__)

# 20 MB hard limit for inline base64 PDF delivery
# extract_maintenance() checks this before it creates or uses the Gemini
# client, so oversized uploads are rejected without any remote call.
_MAX_PDF_BYTES = 20 * 1024 * 1024

# Instruction text sent together with the PDF part by extract_maintenance().
# The field names mentioned here (serviceName, intervalMiles, intervalMonths,
# details, maintenanceSchedule) are the same keys declared in _RESPONSE_SCHEMA
# and read back when MaintenanceItem objects are built, so keep all three in
# sync when renaming anything.
_EXTRACTION_PROMPT = """\
Extract all routine scheduled maintenance items from this vehicle owners manual.

For each maintenance item, extract:
- serviceName: The maintenance task name (e.g., "Engine Oil Change", "Tire Rotation", \
"Cabin Air Filter Replacement")
- intervalMiles: The mileage interval as a number, or null if not specified \
(e.g., 5000, 30000)
- intervalMonths: The time interval in months as a number, or null if not specified \
(e.g., 6, 12, 24)
- details: Any additional details such as fluid specifications, part numbers, \
or special instructions (e.g., "Use 0W-20 full synthetic oil")

Only include routine scheduled maintenance items with clear intervals. \
Do not include one-time procedures, troubleshooting steps, or warranty information.

Return the results as a JSON object with a single "maintenanceSchedule" array.\
"""

# VIN year code lookup: position 10 character -> base year (first cycle, 1980-2009).
# The 30-year cycle repeats: +30 for 2010-2039, +60 for 2040-2069.
# Disambiguation uses position 7: alphabetic -> 2010+ cycle, numeric -> 1980s cycle.
# Per NHTSA FMVSS No. 115: MY2010+ vehicles must use alphabetic position 7.
# For the 2040+ cycle (when position 7 is numeric again), we pick the most
# recent plausible year (not more than 2 years in the future).
_VIN_YEAR_CODES: dict[str, int] = {
    "A": 1980, "B": 1981, "C": 1982, "D": 1983, "E": 1984,
    "F": 1985, "G": 1986, "H": 1987, "J": 1988, "K": 1989,
    "L": 1990, "M": 1991, "N": 1992, "P": 1993, "R": 1994,
    "S": 1995, "T": 1996, "V": 1997, "W": 1998, "X": 1999,
    "Y": 2000,
    "1": 2001, "2": 2002, "3": 2003, "4": 2004, "5": 2005,
    "6": 2006, "7": 2007, "8": 2008, "9": 2009,
}


def resolve_vin_year(vin: str) -> int | None:
    """Deterministically resolve model year from VIN positions 7 and 10.

    VIN year codes repeat on a 30-year cycle. Position 7 disambiguates:
      - Alphabetic position 7 -> 2010-2039 cycle (NHTSA MY2010+ requirement)
      - Numeric position 7 -> 1980-2009 or 2040-2069 cycle

    For the numeric case with two possible cycles, picks the most recent
    year that is not more than 2 years in the future.

    Returns None if the VIN is too short or position 10 is not a valid year code.
    """
    if len(vin) < 17:
        return None

    code = vin[9].upper()   # position 10 (0-indexed)
    pos7 = vin[6].upper()   # position 7 (0-indexed)

    base_year = _VIN_YEAR_CODES.get(code)
    if base_year is None:
        return None

    if pos7.isalpha():
        # Alphabetic position 7 -> second cycle (2010-2039)
        return base_year + 30

    # Numeric position 7 -> first cycle (1980-2009) or third cycle (2040-2069)
    # Pick the most recent plausible year
    max_plausible = datetime.now().year + 2

    third_cycle = base_year + 60  # 2040-2069
    if third_cycle <= max_plausible:
        return third_cycle

    return base_year


# Prompt template for GeminiEngine.decode_vin(); filled in with str.format():
#   {vin}       - the VIN exactly as passed by the caller
#   {year}      - the result of resolve_vin_year(), or the literal text
#                 "unknown" when that function returns None
#   {year_code} - upper-cased position-10 character, or "?" when the VIN has
#                 fewer than 10 characters
# The position-10 table is reference material for the model only: decode_vin()
# returns the year from resolve_vin_year() whenever that lookup succeeds.
_VIN_DECODE_PROMPT = """\
Decode the following VIN (Vehicle Identification Number) using standard VIN structure rules.

VIN: {vin}
Model year: {year} (determined from position 10 code '{year_code}')

The model year has already been resolved deterministically. Use {year} as the year.

VIN position reference:
- Positions 1-3 (WMI): World Manufacturer Identifier (country + manufacturer)
- Positions 4-8 (VDS): Vehicle attributes (model, body, engine, etc.)
- Position 9: Check digit
- Position 10: Model year code (30-year cycle, extended through 2050):
  A=1980/2010/2040 B=1981/2011/2041 C=1982/2012/2042 D=1983/2013/2043 E=1984/2014/2044
  F=1985/2015/2045 G=1986/2016/2046 H=1987/2017/2047 J=1988/2018/2048 K=1989/2019/2049
  L=1990/2020/2050 M=1991/2021       N=1992/2022       P=1993/2023       R=1994/2024
  S=1995/2025       T=1996/2026       V=1997/2027       W=1998/2028       X=1999/2029
  Y=2000/2030       1=2001/2031       2=2002/2032       3=2003/2033       4=2004/2034
  5=2005/2035       6=2006/2036       7=2007/2037       8=2008/2038       9=2009/2039
- Position 11: Assembly plant
- Positions 12-17: Sequential production number

Return the vehicle's make, model, trim level, body type, drive type, fuel type, engine description, and transmission type. If a field cannot be determined from the VIN, return null for that field. Return a confidence score (0.0-1.0) indicating overall decode reliability.\
"""

# Structured-output schema passed as ``response_schema`` by decode_vin().
# Property names are camelCase; decode_vin() maps them onto the snake_case
# fields of VinDecodeResult. Only "confidence" is required -- every other
# property is nullable so the model can report "cannot be determined".
_VIN_DECODE_SCHEMA: dict[str, Any] = {
    "type": "OBJECT",
    "properties": {
        "year": {"type": "INTEGER", "nullable": True},
        "make": {"type": "STRING", "nullable": True},
        "model": {"type": "STRING", "nullable": True},
        "trimLevel": {"type": "STRING", "nullable": True},
        "bodyType": {"type": "STRING", "nullable": True},
        "driveType": {"type": "STRING", "nullable": True},
        "fuelType": {"type": "STRING", "nullable": True},
        "engine": {"type": "STRING", "nullable": True},
        "transmission": {"type": "STRING", "nullable": True},
        "confidence": {"type": "NUMBER"},
    },
    "required": ["confidence"],
}

# Structured-output schema passed as ``response_schema`` by
# extract_maintenance(): one required "maintenanceSchedule" array whose
# elements map onto MaintenanceItem. Only "serviceName" is required per item.
# Note that intervalMiles / intervalMonths are declared NUMBER (not INTEGER).
_RESPONSE_SCHEMA: dict[str, Any] = {
    "type": "OBJECT",
    "properties": {
        "maintenanceSchedule": {
            "type": "ARRAY",
            "items": {
                "type": "OBJECT",
                "properties": {
                    "serviceName": {"type": "STRING"},
                    "intervalMiles": {"type": "NUMBER", "nullable": True},
                    "intervalMonths": {"type": "NUMBER", "nullable": True},
                    "details": {"type": "STRING", "nullable": True},
                },
                "required": ["serviceName"],
            },
        },
    },
    "required": ["maintenanceSchedule"],
}


class GeminiEngineError(Exception):
    """Base exception for Gemini engine errors.

    Parent of ``GeminiUnavailableError`` and ``GeminiProcessingError``; catch
    this type to handle any failure raised by ``GeminiEngine``.
    """


class GeminiUnavailableError(GeminiEngineError):
    """Raised when the Gemini engine cannot be initialized.

    Raised during client creation when the credential config file is missing,
    the google-genai SDK cannot be imported, or constructing the client fails.
    """


class GeminiProcessingError(GeminiEngineError):
    """Raised when Gemini fails to process a document.

    Covers an oversized PDF, a response that is not valid JSON, and any other
    error raised while calling Gemini or interpreting its reply, for both
    maintenance extraction and VIN decode.
    """


@dataclass
class VinDecodeResult:
    """Result from Gemini VIN decode.

    Built by ``GeminiEngine.decode_vin``. Every descriptive field is ``None``
    when the corresponding key is absent or null in Gemini's JSON reply.
    """

    # Model year. decode_vin() uses the value from resolve_vin_year() when that
    # lookup succeeds and only falls back to Gemini's "year" otherwise.
    year: int | None = None
    # The remaining descriptive fields are copied from Gemini's reply
    # (JSON keys: make, model, trimLevel, bodyType, driveType, fuelType,
    # engine, transmission).
    make: str | None = None
    model: str | None = None
    trim_level: str | None = None
    body_type: str | None = None
    drive_type: str | None = None
    fuel_type: str | None = None
    engine: str | None = None
    transmission: str | None = None
    # Gemini's self-reported decode reliability; the prompt asks for a value
    # in the range 0.0-1.0. Defaults to 0.0 when not reported.
    confidence: float = 0.0


@dataclass
class MaintenanceItem:
    """A single extracted maintenance schedule item.

    One instance is created per element of the "maintenanceSchedule" array in
    Gemini's reply.
    """

    # Maintenance task name (JSON key "serviceName"); the only required field.
    service_name: str
    # Mileage interval (JSON key "intervalMiles"), or None if not specified.
    interval_miles: int | None = None
    # Time interval in months (JSON key "intervalMonths"), or None if not specified.
    interval_months: int | None = None
    # Free-text extras such as fluid specifications or part numbers
    # (JSON key "details"), or None.
    details: str | None = None


@dataclass
class MaintenanceExtractionResult:
    """Result from Gemini maintenance schedule extraction.

    Returned by ``GeminiEngine.extract_maintenance``.
    """

    # Extracted schedule entries, in the order Gemini returned them.
    items: list[MaintenanceItem]
    # Name of the Gemini model configured for the engine.
    model: str


class GeminiEngine:
    """Gemini 2.5 Flash wrapper for maintenance schedule extraction and VIN decode.

    Standalone class (not an OcrEngine subclass) because Gemini performs
    semantic document understanding rather than traditional OCR.

    Uses lazy initialization: the Gemini client is not created until
    the first call to ``extract_maintenance()`` or ``decode_vin()``.
    """

    def __init__(self) -> None:
        # Both attributes are populated together by _get_client() on first use.
        self._client: Any | None = None
        self._model_name: str = ""

    def _get_client(self) -> Any:
        """Return the cached ``genai.Client``, creating it on first use.

        Authentication uses the same WIF credential path as Google Vision
        (``settings.google_vision_key_path``). As a side effect this sets
        ``GOOGLE_APPLICATION_CREDENTIALS`` and
        ``GOOGLE_EXTERNAL_ACCOUNT_ALLOW_EXECUTABLES`` in ``os.environ``.

        Raises:
            GeminiUnavailableError: If the credential config file is missing,
                the google-genai SDK cannot be imported, or constructing the
                client fails.
        """
        if self._client is not None:
            return self._client

        key_path = settings.google_vision_key_path
        if not os.path.isfile(key_path):
            raise GeminiUnavailableError(
                f"Google credential config not found at {key_path}. "
                "Set GOOGLE_VISION_KEY_PATH or mount the secret."
            )

        try:
            from google import genai  # type: ignore[import-untyped]

            # Point ADC at the WIF credential config (must be set BEFORE Client construction)
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
            os.environ["GOOGLE_EXTERNAL_ACCOUNT_ALLOW_EXECUTABLES"] = "1"

            client = genai.Client(
                vertexai=True,
                project=settings.vertex_ai_project,
                location=settings.vertex_ai_location,
            )
            model_name = settings.gemini_model

        except ImportError as exc:
            logger.exception("google-genai SDK import failed")
            raise GeminiUnavailableError(
                "google-genai is not installed. "
                "Install with: pip install google-genai"
            ) from exc
        except Exception as exc:
            logger.exception("Gemini authentication failed: %s", type(exc).__name__)
            raise GeminiUnavailableError(
                f"Gemini authentication failed: {exc}"
            ) from exc

        # Publish client and model name together, only after both were
        # obtained, so a failure can never leave a cached client paired with
        # an empty model name.
        self._client = client
        self._model_name = model_name

        logger.info(
            "Gemini engine initialized (model=%s, project=%s, location=%s)",
            model_name,
            settings.vertex_ai_project,
            settings.vertex_ai_location,
        )
        return client

    @staticmethod
    def _load_json_object(response: Any) -> dict[str, Any]:
        """Parse ``response.text`` and return it as a JSON object (dict).

        Raises:
            GeminiProcessingError: If the response has no text or the parsed
                JSON is not an object.
            json.JSONDecodeError: If the text is not valid JSON (callers
                translate this into ``GeminiProcessingError``).
        """
        text = response.text
        if not text:
            # json.loads(None) would raise an opaque TypeError; report the
            # real problem instead.
            raise GeminiProcessingError("Gemini returned an empty response")

        payload = json.loads(text)
        if not isinstance(payload, dict):
            raise GeminiProcessingError(
                f"Gemini returned a JSON {type(payload).__name__}, expected an object"
            )
        return payload

    @staticmethod
    def _to_int(value: Any) -> int | None:
        """Coerce a JSON number to ``int``; return None for anything else.

        The response schema declares intervals as NUMBER, so values may parse
        as floats (e.g. ``5000.0``) while ``MaintenanceItem`` stores ints.
        """
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return None
        try:
            return int(value)
        except (ValueError, OverflowError):
            # NaN / infinity cannot be represented as an interval.
            return None

    def extract_maintenance(
        self, pdf_bytes: bytes
    ) -> MaintenanceExtractionResult:
        """Extract maintenance schedules from a PDF owners manual.

        Args:
            pdf_bytes: Raw PDF file bytes (<= 20 MB).

        Returns:
            Structured maintenance extraction result.

        Raises:
            GeminiProcessingError: If the PDF is too large, Gemini returns an
                empty or malformed reply, or extraction otherwise fails.
            GeminiUnavailableError: If the engine cannot be initialized.
        """
        # Size check comes first so oversized uploads never trigger client
        # initialization or a remote call.
        if len(pdf_bytes) > _MAX_PDF_BYTES:
            size_mb = len(pdf_bytes) / (1024 * 1024)
            raise GeminiProcessingError(
                f"PDF size ({size_mb:.1f} MB) exceeds the 20 MB limit for "
                "inline processing. Upload to GCS and use a gs:// URI instead."
            )

        client = self._get_client()

        try:
            from google.genai import types  # type: ignore[import-untyped]

            pdf_part = types.Part.from_bytes(
                data=pdf_bytes,
                mime_type="application/pdf",
            )

            response = client.models.generate_content(
                model=self._model_name,
                contents=[pdf_part, _EXTRACTION_PROMPT],
                config=types.GenerateContentConfig(
                    response_mime_type="application/json",
                    response_schema=_RESPONSE_SCHEMA,
                ),
            )

            raw = self._load_json_object(response)
            items = [
                MaintenanceItem(
                    service_name=item["serviceName"],
                    interval_miles=self._to_int(item.get("intervalMiles")),
                    interval_months=self._to_int(item.get("intervalMonths")),
                    details=item.get("details"),
                )
                # ``or []`` also covers an explicit JSON null for the array.
                for item in raw.get("maintenanceSchedule") or []
            ]

            logger.info(
                "Gemini extracted %d maintenance items from PDF (%d bytes)",
                len(items),
                len(pdf_bytes),
            )

            return MaintenanceExtractionResult(
                items=items,
                # Report the model name that was actually sent with the request.
                model=self._model_name,
            )

        except GeminiEngineError:
            raise
        except json.JSONDecodeError as exc:
            raise GeminiProcessingError(
                f"Gemini returned invalid JSON: {exc}"
            ) from exc
        except Exception as exc:
            raise GeminiProcessingError(
                f"Gemini maintenance extraction failed: {exc}"
            ) from exc

    def decode_vin(self, vin: str) -> VinDecodeResult:
        """Decode a VIN string into structured vehicle data via Gemini.

        The model year is resolved deterministically from VIN positions 7
        and 10 -- never delegated to the LLM. Gemini handles make, model,
        trim, and other fields that require manufacturer knowledge. Only
        when the deterministic lookup yields nothing is Gemini's own year
        used.

        Args:
            vin: A 17-character Vehicle Identification Number.

        Returns:
            Structured vehicle specification result.

        Raises:
            GeminiProcessingError: If Gemini fails to decode the VIN or
                returns an empty or malformed reply.
            GeminiUnavailableError: If the engine cannot be initialized.
        """
        client = self._get_client()

        # Resolve year deterministically from VIN structure
        resolved_year = resolve_vin_year(vin)
        year_code = vin[9].upper() if len(vin) >= 10 else "?"
        logger.info(
            "VIN year resolved: code=%s pos7=%s -> year=%s",
            year_code,
            vin[6] if len(vin) >= 7 else "?",
            resolved_year,
        )

        try:
            from google.genai import types  # type: ignore[import-untyped]

            prompt = _VIN_DECODE_PROMPT.format(
                vin=vin,
                year=resolved_year or "unknown",
                year_code=year_code,
            )
            # NOTE(review): the google-genai SDK reference lists this field as
            # ``maximum_remote_calls``; confirm that ``max_remote_calls`` is
            # accepted by the installed SDK version.
            response = client.models.generate_content(
                model=self._model_name,
                contents=[prompt],
                config=types.GenerateContentConfig(
                    response_mime_type="application/json",
                    response_schema=_VIN_DECODE_SCHEMA,
                    tools=[types.Tool(google_search=types.GoogleSearch())],
                    automatic_function_calling=types.AutomaticFunctionCallingConfig(
                        max_remote_calls=3,
                    ),
                ),
            )

            raw = self._load_json_object(response)

            # Override year with deterministic value -- never trust the LLM
            # for a mechanical lookup. Only warn when Gemini actually
            # reported a conflicting year, not when it reported none.
            gemini_year = raw.get("year")
            if (
                resolved_year
                and gemini_year is not None
                and gemini_year != resolved_year
            ):
                logger.warning(
                    "Gemini returned year %s but resolved year is %s for VIN %s -- overriding",
                    gemini_year,
                    resolved_year,
                    vin,
                )

            # Normalize confidence so a null or non-numeric value neither
            # breaks the %.2f log format nor violates the float field type.
            confidence = raw.get("confidence")
            if isinstance(confidence, bool) or not isinstance(confidence, (int, float)):
                confidence = 0.0
            confidence = float(confidence)

            logger.info("Gemini decoded VIN %s (confidence=%.2f)", vin, confidence)

            return VinDecodeResult(
                year=resolved_year if resolved_year else gemini_year,
                make=raw.get("make"),
                model=raw.get("model"),
                trim_level=raw.get("trimLevel"),
                body_type=raw.get("bodyType"),
                drive_type=raw.get("driveType"),
                fuel_type=raw.get("fuelType"),
                engine=raw.get("engine"),
                transmission=raw.get("transmission"),
                confidence=confidence,
            )

        except GeminiEngineError:
            raise
        except json.JSONDecodeError as exc:
            raise GeminiProcessingError(
                f"Gemini returned invalid JSON for VIN decode: {exc}"
            ) from exc
        except Exception as exc:
            raise GeminiProcessingError(
                f"Gemini VIN decode failed: {exc}"
            ) from exc