feat: add maintenance receipt extraction pipeline with Gemini + regex (refs #150)

- New MaintenanceReceiptExtractor: Gemini-primary extraction with regex cross-validation for dates, amounts, and odometer readings
- New maintenance_receipt_validation.py: cross-validation patterns for structured field confidence adjustment
- New POST /extract/maintenance-receipt endpoint reusing the ReceiptExtractionResponse model
- Per-field confidence scores (0.0-1.0) with Gemini base 0.85, boosted/reduced by regex agreement

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-12 21:14:13 -06:00
parent 0e97128a31
commit 90401dc1ba
5 changed files with 713 additions and 0 deletions
--- a/ocr/app/extractors/init.py
+++ b/ocr/app/extractors/init.py
@@ -8,6 +8,10 @@ from app.extractors.receipt_extractor import (
    ExtractedField,
 )
 from app.extractors.fuel_receipt import FuelReceiptExtractor, fuel_receipt_extractor
 from app.extractors.maintenance_receipt_extractor import (
    MaintenanceReceiptExtractor,
    maintenance_receipt_extractor,
 )
 from app.extractors.manual_extractor import (
    ManualExtractor,
    manual_extractor,
@@ -27,6 +31,8 @@ __all__ = [
    "ExtractedField",
    "FuelReceiptExtractor",
    "fuel_receipt_extractor",
    "MaintenanceReceiptExtractor",
    "maintenance_receipt_extractor",
    "ManualExtractor",
    "manual_extractor",
    "ManualExtractionResult",
--- a/ocr/app/extractors/maintenance_receipt_extractor.py
+++ b/ocr/app/extractors/maintenance_receipt_extractor.py
@@ -0,0 +1,312 @@
 """Maintenance receipt extraction with Gemini-primary and regex cross-validation.
 Flow:
 1. Preprocess image and OCR via receipt_extractor (PaddleOCR)
 2. Send OCR text to Gemini text API for semantic field extraction
 3. Cross-validate structured fields (date, cost, odometer) with regex
 4. Return ReceiptExtractionResult with per-field confidence scores
 """
 import json
 import logging
 import os
 import time
 from typing import Any, Optional
 from app.config import settings
 from app.extractors.receipt_extractor import (
    ExtractedField,
    ReceiptExtractionResult,
    receipt_extractor,
 )
 from app.patterns.maintenance_receipt_validation import (
    MaintenanceReceiptValidation,
    maintenance_receipt_validator,
 )
 logger = logging.getLogger(__name__)
 # Base confidence given to every field Gemini returns. The per-field
 # multiplier from regex cross-validation is applied on top of this value.
 DEFAULT_GEMINI_CONFIDENCE = 0.85
 # Prompt template for maintenance receipt field extraction.
 # `{ocr_text}` is the only str.format placeholder; it is filled with the raw
 # OCR text right before the request is sent. The field names listed in the
 # prompt mirror the keys of the response schema.
 _RECEIPT_EXTRACTION_PROMPT = """\
 Extract maintenance service receipt fields from the following OCR text.
 For each field, extract the value if present. Return null for fields not found.
 Fields to extract:
 - serviceName: The maintenance service performed (e.g., "Oil Change", "Brake Pad Replacement", "Tire Rotation")
 - serviceDate: Date of service in YYYY-MM-DD format
 - totalCost: Total cost as a number (e.g., 89.95)
 - shopName: Name of the shop or business
 - laborCost: Labor cost as a number, or null if not broken out
 - partsCost: Parts cost as a number, or null if not broken out
 - odometerReading: Odometer/mileage reading as a number, or null if not present
 - vehicleInfo: Vehicle description if present (e.g., "2022 Toyota Camry"), or null
 Return a JSON object with these field names and their extracted values.
 OCR Text:
 ---
 {ocr_text}
 ---\
 """
 # Structured-output schema handed to GenerationConfig(response_schema=...).
 # Every property is both nullable and listed under "required", so a field
 # that is absent from the receipt is presumably returned as an explicit null
 # rather than omitted -- consumers must therefore expect None values.
 # Text fields are strings; costs and the odometer reading are JSON numbers.
 _RECEIPT_RESPONSE_SCHEMA: dict[str, Any] = {
    "type": "object",
    "properties": {
        "serviceName": {"type": "string", "nullable": True},
        "serviceDate": {"type": "string", "nullable": True},
        "totalCost": {"type": "number", "nullable": True},
        "shopName": {"type": "string", "nullable": True},
        "laborCost": {"type": "number", "nullable": True},
        "partsCost": {"type": "number", "nullable": True},
        "odometerReading": {"type": "number", "nullable": True},
        "vehicleInfo": {"type": "string", "nullable": True},
    },
    "required": [
        "serviceName",
        "serviceDate",
        "totalCost",
        "shopName",
        "laborCost",
        "partsCost",
        "odometerReading",
        "vehicleInfo",
    ],
 }
 class MaintenanceReceiptExtractor:
    """Maintenance receipt extractor using Gemini for semantic extraction.

    Wraps receipt_extractor for OCR preprocessing, then sends the raw OCR text
    to Gemini for field extraction. Structured fields (dates, amounts,
    odometer) are cross-validated against regex patterns and their confidence
    is scaled by the validator's per-field multiplier.
    """

    def __init__(self) -> None:
        # Both attributes are populated together by _get_model() on first use.
        self._model: Any | None = None
        self._generation_config: Any | None = None

    def extract(
        self,
        image_bytes: bytes,
        content_type: Optional[str] = None,
    ) -> ReceiptExtractionResult:
        """Extract maintenance receipt fields from an image.

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG).
            content_type: MIME type (auto-detected if not provided).

        Returns:
            ReceiptExtractionResult with maintenance-specific fields. A failed
            OCR result is returned unchanged; when no text or no usable field
            is found, a result with success=False and an error is returned.
        """
        start_time = time.time()

        # Step 1: OCR the image via receipt_extractor
        ocr_result = receipt_extractor.extract(
            image_bytes=image_bytes,
            content_type=content_type,
        )
        if not ocr_result.success:
            return ocr_result

        # A missing raw_text is treated like empty text instead of raising.
        raw_text = ocr_result.raw_text or ""
        if not raw_text.strip():
            return ReceiptExtractionResult(
                success=False,
                error="No text found in image",
                processing_time_ms=int((time.time() - start_time) * 1000),
            )

        # Step 2: Extract fields with Gemini. Deliberately best-effort: any
        # failure (credentials, network, malformed JSON) is logged and leaves
        # an empty field set, which is reported as a failed extraction below.
        try:
            gemini_fields = self._extract_with_gemini(raw_text)
        except Exception as e:
            logger.warning(
                "Gemini extraction failed, falling back to OCR-only: %s", e
            )
            gemini_fields = {}

        # Step 3: Build extracted fields with base confidence
        extracted_fields = self._build_fields(gemini_fields)
        if not extracted_fields:
            return ReceiptExtractionResult(
                success=False,
                receipt_type="maintenance",
                error="No maintenance receipt fields could be extracted",
                raw_text=raw_text,
                processing_time_ms=int((time.time() - start_time) * 1000),
            )

        # Step 4: Cross-validate structured fields with regex. Only the fields
        # that survived _build_fields are validated, using their normalized
        # values: the response schema makes every key present (null when not
        # found), and validating those nulls would raise on a null date and
        # log spurious "not confirmed" issues for fields that are not returned.
        validation = maintenance_receipt_validator.validate(
            {name: extracted.value for name, extracted in extracted_fields.items()},
            raw_text,
        )
        if validation.issues:
            logger.info(
                "Maintenance receipt validation issues: %s", validation.issues
            )

        # Step 5: Adjust confidences based on cross-validation
        adjusted_fields = self._adjust_confidences(extracted_fields, validation)
        processing_time_ms = int((time.time() - start_time) * 1000)
        logger.info(
            "Maintenance receipt extraction: fields=%d, validated=%s, time=%dms",
            len(adjusted_fields),
            validation.is_valid,
            processing_time_ms,
        )
        return ReceiptExtractionResult(
            success=True,
            receipt_type="maintenance",
            extracted_fields=adjusted_fields,
            raw_text=raw_text,
            processing_time_ms=processing_time_ms,
        )

    def _get_model(self) -> Any:
        """Lazy-initialize the Vertex AI Gemini model.

        Uses the same authentication pattern as GeminiEngine.

        Returns:
            The cached GenerativeModel instance.

        Raises:
            RuntimeError: If the credential file configured in settings does
                not exist.
        """
        if self._model is not None:
            return self._model

        key_path = settings.google_vision_key_path
        if not os.path.isfile(key_path):
            raise RuntimeError(
                f"Google credential config not found at {key_path}. "
                "Set GOOGLE_VISION_KEY_PATH or mount the secret."
            )

        # Imported lazily so the module can be loaded without the Google SDKs.
        from google.cloud import aiplatform  # type: ignore[import-untyped]
        from vertexai.generative_models import (  # type: ignore[import-untyped]
            GenerationConfig,
            GenerativeModel,
        )

        # Process-wide settings read by the Google auth libraries.
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
        os.environ["GOOGLE_EXTERNAL_ACCOUNT_ALLOW_EXECUTABLES"] = "1"
        aiplatform.init(
            project=settings.vertex_ai_project,
            location=settings.vertex_ai_location,
        )

        model_name = settings.gemini_model
        model = GenerativeModel(model_name)
        generation_config = GenerationConfig(
            response_mime_type="application/json",
            response_schema=_RECEIPT_RESPONSE_SCHEMA,
        )
        # Publish both together: if building the config raised after the model
        # had been cached, later calls would run with a None generation config.
        self._generation_config = generation_config
        self._model = model
        logger.info(
            "Maintenance receipt Gemini model initialized (model=%s)",
            model_name,
        )
        return self._model

    def _extract_with_gemini(self, ocr_text: str) -> dict:
        """Send OCR text to Gemini for semantic field extraction.

        Args:
            ocr_text: Raw OCR text from receipt image.

        Returns:
            Dictionary of field_name -> extracted_value from Gemini.

        Raises:
            ValueError: If the response is not valid JSON or is not a JSON
                object.
            RuntimeError: If the Gemini model cannot be initialized.
        """
        model = self._get_model()
        prompt = _RECEIPT_EXTRACTION_PROMPT.format(ocr_text=ocr_text)
        response = model.generate_content(
            [prompt],
            generation_config=self._generation_config,
        )
        raw = json.loads(response.text)
        if not isinstance(raw, dict):
            # Fail with a clear message instead of an AttributeError on .items().
            raise ValueError(
                f"Gemini returned {type(raw).__name__}, expected a JSON object"
            )
        logger.info(
            "Gemini extracted maintenance fields: %s",
            [k for k, v in raw.items() if v is not None],
        )
        return raw

    def _build_fields(self, gemini_fields: dict) -> dict[str, ExtractedField]:
        """Convert Gemini response to ExtractedField dict with base confidence.

        Null values, blank strings, and numeric fields that cannot be parsed
        into a finite number are dropped.

        Args:
            gemini_fields: Raw Gemini response dict.

        Returns:
            Dictionary of field_name -> ExtractedField.
        """
        import math  # local import: only needed for the finiteness check

        fields: dict[str, ExtractedField] = {}
        for field_name, value in gemini_fields.items():
            if value is None:
                continue
            if field_name in ("totalCost", "laborCost", "partsCost"):
                # Costs are normalized to two decimal places.
                try:
                    amount = float(value)
                except (ValueError, TypeError):
                    continue
                # json.loads accepts NaN/Infinity; neither is a usable cost.
                if not math.isfinite(amount):
                    continue
                value = round(amount, 2)
            elif field_name == "odometerReading":
                # Odometer is a whole number; int(float("inf")) raises
                # OverflowError, which must not escape this loop.
                try:
                    value = int(float(value))
                except (ValueError, TypeError, OverflowError):
                    continue
            elif isinstance(value, str) and not value.strip():
                continue
            fields[field_name] = ExtractedField(
                value=value,
                confidence=DEFAULT_GEMINI_CONFIDENCE,
            )
        return fields

    def _adjust_confidences(
        self,
        fields: dict[str, ExtractedField],
        validation: MaintenanceReceiptValidation,
    ) -> dict[str, ExtractedField]:
        """Adjust field confidences based on cross-validation results.

        Args:
            fields: Extracted fields with base confidence.
            validation: Cross-validation results.

        Returns:
            New dict of fields with adjusted confidences, capped at 1.0 and
            rounded to three decimals. The input fields are not modified.
        """
        adjusted: dict[str, ExtractedField] = {}
        for name, extracted_field in fields.items():
            field_validation = validation.field_validations.get(name)
            if field_validation is None:
                # Semantic fields (no regex validation) keep base confidence
                new_confidence = extracted_field.confidence
            else:
                new_confidence = min(
                    1.0,
                    extracted_field.confidence
                    * field_validation.confidence_adjustment,
                )
            adjusted[name] = ExtractedField(
                value=extracted_field.value,
                confidence=round(new_confidence, 3),
            )
        return adjusted
 # Module-level shared instance; this is the object the package __init__ and
 # the /maintenance-receipt route import. Construction is cheap: the Gemini
 # model is only created on the first extraction call.
 maintenance_receipt_extractor = MaintenanceReceiptExtractor()
--- a/ocr/app/patterns/init.py
+++ b/ocr/app/patterns/init.py
@@ -4,6 +4,10 @@ from app.patterns.currency_patterns import CurrencyPatternMatcher, currency_matc
 from app.patterns.fuel_patterns import FuelPatternMatcher, fuel_matcher
 from app.patterns.maintenance_patterns import MaintenancePatternMatcher, maintenance_matcher
 from app.patterns.service_mapping import ServiceMapper, service_mapper
 from app.patterns.maintenance_receipt_validation import (
    MaintenanceReceiptValidator,
    maintenance_receipt_validator,
 )
 __all__ = [
    "DatePatternMatcher",
@@ -16,4 +20,6 @@ __all__ = [
    "maintenance_matcher",
    "ServiceMapper",
    "service_mapper",
    "MaintenanceReceiptValidator",
    "maintenance_receipt_validator",
 ]
--- a/ocr/app/patterns/maintenance_receipt_validation.py
+++ b/ocr/app/patterns/maintenance_receipt_validation.py
@@ -0,0 +1,299 @@
 """Cross-validation patterns for maintenance receipt field extraction.
 Validates structured fields (dates, amounts, odometer) extracted by Gemini
 against regex patterns found in the OCR raw text. Boosts or reduces confidence
 based on regex agreement.
 """
 import re
 from dataclasses import dataclass, field
 from typing import Optional
 from app.patterns.currency_patterns import currency_matcher
 from app.patterns.date_patterns import date_matcher
@dataclass
 class FieldValidation:
    """Validation result for a single extracted field.

    Produced by the MaintenanceReceiptValidator `_validate_*` helpers, one per
    Gemini field that has a regex counterpart.
    """
    # Gemini field key this result belongs to, e.g. "serviceDate".
    field_name: str
    # True only when a regex-found value agreed with the Gemini value.
    regex_confirmed: bool
    # Multiplier applied to the field's confidence: >1.0 boosts, <1.0 reduces.
    # Values used in this module: 1.10 (match), 0.95 (regex found nothing to
    # compare against), 0.80 (mismatch), 0.70 (Gemini value unparsable).
    confidence_adjustment: float  # Multiplier: >1.0 boosts, <1.0 reduces
    # String form of the regex value (the matching one, or the first/best
    # candidate on mismatch); None when regex found nothing.
    regex_value: Optional[str] = None  # Value found by regex, if any
@dataclass
 class MaintenanceReceiptValidation:
    """Aggregated validation result for a maintenance receipt.

    Returned by MaintenanceReceiptValidator.validate().
    """
    # True when no validated field produced an issue.
    is_valid: bool
    # Human-readable messages, one per field that regex did not confirm.
    issues: list[str]
    # Per-field results keyed by Gemini field name; fields that were not
    # validated have no entry.
    field_validations: dict[str, FieldValidation] = field(default_factory=dict)
    # Product of the per-field penalties applied by validate(), starting at 1.0.
    overall_confidence: float = 1.0
 # Odometer patterns, each a (regex, pattern_name, confidence) tuple.
 # The keyword patterns capture 4-7 digits with an optional "," or "."
 # separator; the standalone pattern captures a bare 5-6 digit number.
 # Keywords are upper-case because extract_odometer() matches against
 # text.upper().
 # NOTE(review): the keyword alternations have no word boundaries, so a short
 # keyword such as "MI" can also match the end/start of a longer word --
 # confirm this looseness is intended.
 ODOMETER_PATTERNS = [
    # "Odometer: 45,231" or "Mileage: 45231"
    (
        r"(?:ODOMETER|MILEAGE|MILES|ODO|MI)\s*[:\s]\s*(\d{1,3}[,.]?\d{3,4})",
        "labeled_odometer",
        0.95,
    ),
    # "45,231 mi" or "45231 miles"
    (
        r"(\d{1,3}[,.]?\d{3,4})\s*(?:MI|MILES|KM)",
        "unit_odometer",
        0.90,
    ),
    # Standalone 5-6 digit number (lower confidence)
    (
        r"(?<!\d)(\d{5,6})(?!\d)",
        "standalone_odometer",
        0.60,
    ),
 ]
@dataclass
 class OdometerMatch:
    """Result of odometer pattern matching.

    Built by MaintenanceReceiptValidator.extract_odometer() for the best
    candidate found in the OCR text.
    """
    # Parsed reading with "," and "." separators removed.
    value: int
    # Full regex match (keyword/unit included), stripped of outer whitespace.
    raw_match: str
    # Confidence assigned to the pattern that produced this match.
    confidence: float
    # Name of the ODOMETER_PATTERNS entry that matched.
    pattern_name: str
 class MaintenanceReceiptValidator:
    """Cross-validates Gemini-extracted maintenance receipt fields against regex patterns."""

    def validate(
        self,
        gemini_fields: dict,
        raw_text: str,
    ) -> MaintenanceReceiptValidation:
        """Validate Gemini-extracted fields against regex patterns in raw OCR text.

        Fields that are missing or whose value is None are skipped: a null
        means "not found on the receipt", so there is nothing to confirm and
        it must not count as a validation issue.

        Args:
            gemini_fields: Fields extracted by Gemini (field_name -> value).
            raw_text: Raw OCR text for regex cross-validation.

        Returns:
            MaintenanceReceiptValidation with per-field results.
        """
        issues: list[str] = []
        field_validations: dict[str, FieldValidation] = {}
        overall_confidence = 1.0

        def amount_check(name: str):
            # Bind the field name so every check takes just the value.
            return lambda value: self._validate_amount(name, value, raw_text)

        # (field key, label used in the issue text, check, overall penalty
        # applied when regex does not confirm the value).
        # NOTE(review): partsCost is not cross-validated although it is an
        # amount like laborCost -- confirm whether that is intentional.
        checks = (
            (
                "serviceDate",
                "Service date",
                lambda value: self._validate_date(value, raw_text),
                0.85,
            ),
            ("totalCost", "Total cost", amount_check("totalCost"), 0.85),
            ("laborCost", "Labor cost", amount_check("laborCost"), 0.90),
            (
                "odometerReading",
                "Odometer",
                lambda value: self._validate_odometer(value, raw_text),
                0.90,
            ),
        )
        for field_name, label, check, penalty in checks:
            value = gemini_fields.get(field_name)
            if value is None:
                continue
            result = check(value)
            field_validations[field_name] = result
            if not result.regex_confirmed:
                issues.append(f"{label} '{value}' not confirmed by regex")
                overall_confidence *= penalty

        return MaintenanceReceiptValidation(
            is_valid=not issues,
            issues=issues,
            field_validations=field_validations,
            overall_confidence=overall_confidence,
        )

    def extract_odometer(self, text: str) -> Optional[OdometerMatch]:
        """Extract odometer reading from text using regex patterns.

        Every ODOMETER_PATTERNS entry is tried against the upper-cased text.
        Among candidates in the accepted range the highest-confidence one
        wins; on a tie the earliest candidate is kept.

        Args:
            text: OCR text to search.

        Returns:
            Best OdometerMatch or None.
        """
        text_upper = text.upper()
        best_match: Optional[OdometerMatch] = None
        for pattern, name, confidence in ODOMETER_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                parsed = self._parse_odometer(match.group(1))
                if parsed is None or not self._is_reasonable_odometer(parsed):
                    continue
                if best_match is None or confidence > best_match.confidence:
                    best_match = OdometerMatch(
                        value=parsed,
                        raw_match=match.group(0).strip(),
                        confidence=confidence,
                        pattern_name=name,
                    )
        return best_match

    def _validate_date(self, gemini_date: str, raw_text: str) -> FieldValidation:
        """Check if Gemini-extracted date matches a regex-found date.

        Args:
            gemini_date: Date as returned by Gemini.
            raw_text: Raw OCR text to search for dates.

        Returns:
            FieldValidation with multiplier 1.10 on a match, 0.95 when regex
            found no dates at all, and 0.80 when regex dates exist but none
            equals the Gemini date.
        """
        regex_dates = date_matcher.extract_dates(raw_text)
        if not regex_dates:
            # No dates found by regex -- cannot confirm or deny
            return FieldValidation(
                field_name="serviceDate",
                regex_confirmed=False,
                confidence_adjustment=0.95,
            )

        # Normalize Gemini date for comparison; str() keeps an unexpected
        # non-string value from raising here.
        gemini_normalized = str(gemini_date).strip().replace("/", "-")
        for regex_date in regex_dates:
            if regex_date.value == gemini_normalized:
                return FieldValidation(
                    field_name="serviceDate",
                    regex_confirmed=True,
                    confidence_adjustment=1.10,
                    regex_value=regex_date.value,
                )

        # Gemini found a date but it doesn't match regex dates
        return FieldValidation(
            field_name="serviceDate",
            regex_confirmed=False,
            confidence_adjustment=0.80,
            regex_value=regex_dates[0].value,
        )

    def _validate_amount(
        self, field_name: str, gemini_amount: str | float, raw_text: str
    ) -> FieldValidation:
        """Check if Gemini-extracted amount matches a regex-found amount.

        Args:
            field_name: Field key recorded in the result.
            gemini_amount: Amount from Gemini; "$" and "," are stripped from
                string input before parsing.
            raw_text: Raw OCR text to search for currency amounts.

        Returns:
            FieldValidation with multiplier 1.10 when a regex amount lies
            within 5% of the Gemini amount, 0.95 when regex found no amounts,
            0.80 on mismatch, and 0.70 when the Gemini amount is unparsable.
        """
        try:
            gemini_value = float(str(gemini_amount).replace("$", "").replace(",", ""))
        except (ValueError, TypeError):
            return FieldValidation(
                field_name=field_name,
                regex_confirmed=False,
                confidence_adjustment=0.70,
            )

        regex_amounts = currency_matcher.extract_all_amounts(raw_text)
        if not regex_amounts:
            return FieldValidation(
                field_name=field_name,
                regex_confirmed=False,
                confidence_adjustment=0.95,
            )

        # Check if any regex amount matches within 5% tolerance. Non-positive
        # amounts are never confirmed (the relative error is undefined at 0).
        if gemini_value > 0:
            for regex_amount in regex_amounts:
                if abs(regex_amount.value - gemini_value) / gemini_value < 0.05:
                    return FieldValidation(
                        field_name=field_name,
                        regex_confirmed=True,
                        confidence_adjustment=1.10,
                        regex_value=str(regex_amount.value),
                    )

        return FieldValidation(
            field_name=field_name,
            regex_confirmed=False,
            confidence_adjustment=0.80,
            regex_value=str(regex_amounts[0].value),
        )

    def _validate_odometer(
        self, gemini_odometer: str | int, raw_text: str
    ) -> FieldValidation:
        """Check if Gemini-extracted odometer matches a regex-found reading.

        Args:
            gemini_odometer: Reading from Gemini (number or numeric string).
            raw_text: Raw OCR text to search for an odometer reading.

        Returns:
            FieldValidation with multiplier 1.10 when the best regex reading
            lies within 1% of the Gemini reading, 0.95 when regex found none,
            0.80 on mismatch, and 0.70 when the Gemini value is unparsable.
        """
        # _parse_odometer handles JSON numbers directly. Stringifying a float
        # such as 45231.0 and stripping "." would yield 452310.
        gemini_value = self._parse_odometer(gemini_odometer)
        if gemini_value is None:
            return FieldValidation(
                field_name="odometerReading",
                regex_confirmed=False,
                confidence_adjustment=0.70,
            )

        regex_match = self.extract_odometer(raw_text)
        if not regex_match:
            return FieldValidation(
                field_name="odometerReading",
                regex_confirmed=False,
                confidence_adjustment=0.95,
            )

        # Check if values match within 1% tolerance (OCR might misread a digit)
        if gemini_value > 0 and abs(regex_match.value - gemini_value) / gemini_value < 0.01:
            return FieldValidation(
                field_name="odometerReading",
                regex_confirmed=True,
                confidence_adjustment=1.10,
                regex_value=str(regex_match.value),
            )

        return FieldValidation(
            field_name="odometerReading",
            regex_confirmed=False,
            confidence_adjustment=0.80,
            regex_value=str(regex_match.value),
        )

    def _parse_odometer(self, raw: str | int | float) -> Optional[int]:
        """Parse an odometer value to an integer.

        Args:
            raw: Numeric value, or a string in which "," and "." are treated
                as thousands separators (e.g. "45,231" or "45.231").

        Returns:
            The reading as an int (floats are truncated), or None if the
            value cannot be interpreted as a number.
        """
        if isinstance(raw, bool):
            # bool is an int subclass but is never a meaningful reading.
            return None
        if isinstance(raw, (int, float)):
            try:
                return int(raw)
            except (ValueError, OverflowError):
                # NaN / infinity
                return None
        cleaned = str(raw).replace(",", "").replace(".", "").strip()
        try:
            return int(cleaned)
        except ValueError:
            return None

    def _is_reasonable_odometer(self, value: int) -> bool:
        """Check if odometer reading is in a reasonable range (100 to 999,999)."""
        return 100 <= value <= 999_999
 # Module-level shared instance; imported by the maintenance receipt
 # extractor and re-exported from the patterns package. The class defines no
 # instance attributes, so the instance holds no per-request state.
 maintenance_receipt_validator = MaintenanceReceiptValidator()
--- a/ocr/app/routers/extract.py
+++ b/ocr/app/routers/extract.py
@@ -6,6 +6,7 @@ from fastapi import APIRouter, BackgroundTasks, File, Form, HTTPException, Query
 from app.extractors.vin_extractor import vin_extractor
 from app.extractors.receipt_extractor import receipt_extractor
 from app.extractors.maintenance_receipt_extractor import maintenance_receipt_extractor
 from app.extractors.manual_extractor import manual_extractor
 from app.models import (
    BoundingBox,
@@ -267,6 +268,95 @@ async def extract_receipt(
    )
@router.post("/maintenance-receipt", response_model=ReceiptExtractionResponse)
async def extract_maintenance_receipt(
    file: UploadFile = File(..., description="Maintenance receipt image file"),
) -> ReceiptExtractionResponse:
    """
    Extract data from a maintenance receipt image using OCR + Gemini.
    Gemini-primary extraction with regex cross-validation:
    - OCR preprocessing (HEIC conversion, contrast, thresholding)
    - PaddleOCR text extraction
    - Gemini semantic field extraction from OCR text
    - Regex cross-validation for dates, amounts, odometer
    Supports HEIC, JPEG, PNG formats.
    - **file**: Maintenance receipt image file (max 10MB)
    Returns:
    - **receiptType**: "maintenance"
    - **extractedFields**: Dictionary of extracted fields with confidence scores
      - serviceName: Service performed (e.g., "Oil Change")
      - serviceDate: Date in YYYY-MM-DD format
      - totalCost: Total cost
      - shopName: Shop or business name
      - laborCost: Labor cost (if broken out)
      - partsCost: Parts cost (if broken out)
      - odometerReading: Odometer reading (if present)
      - vehicleInfo: Vehicle description (if present)
    - **rawText**: Full OCR text
    - **processingTimeMs**: Processing time in milliseconds
    """
    # An upload that arrives without a filename is rejected as "no file".
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    # The whole upload is read into memory before it is measured.
    upload_bytes = await file.read()
    file_size = len(upload_bytes)

    # Size guards: oversized -> 413, zero bytes -> 400.
    if file_size > MAX_SYNC_SIZE:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Max: {MAX_SYNC_SIZE // (1024*1024)}MB",
        )
    if file_size == 0:
        raise HTTPException(status_code=400, detail="Empty file provided")

    logger.info(
        f"Maintenance receipt extraction: {file.filename}, "
        f"size: {file_size} bytes, "
        f"content_type: {file.content_type}"
    )

    # NOTE(review): extract() is a synchronous method called directly inside
    # this coroutine -- confirm that running it on the event loop is intended.
    result = maintenance_receipt_extractor.extract(
        image_bytes=upload_bytes,
        content_type=file.content_type,
    )

    # Any extraction failure is surfaced to the client as a 422.
    if not result.success:
        logger.warning(
            f"Maintenance receipt extraction failed for {file.filename}: {result.error}"
        )
        raise HTTPException(
            status_code=422,
            detail=result.error or "Failed to extract data from maintenance receipt",
        )

    # Re-wrap each internal field in the API response model.
    api_fields = {}
    for field_name, extracted in result.extracted_fields.items():
        api_fields[field_name] = ReceiptExtractedField(
            value=extracted.value,
            confidence=extracted.confidence,
        )

    return ReceiptExtractionResponse(
        success=result.success,
        receiptType=result.receipt_type,
        extractedFields=api_fields,
        rawText=result.raw_text,
        processingTimeMs=result.processing_time_ms,
        error=result.error,
    )
@router.post("/manual", response_model=ManualJobResponse)
 async def extract_manual(
    background_tasks: BackgroundTasks,