From 90401dc1ba20caf81d2d7b5eca4c3848225b5ea2 Mon Sep 17 00:00:00 2001
From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com>
Date: Thu, 12 Feb 2026 21:14:13 -0600
Subject: [PATCH] feat: add maintenance receipt extraction pipeline with Gemini
 + regex (refs #150)

- New MaintenanceReceiptExtractor: Gemini-primary extraction with regex
  cross-validation for dates, amounts, and odometer readings
- New maintenance_receipt_validation.py: cross-validation patterns for
  structured field confidence adjustment
- New POST /extract/maintenance-receipt endpoint reusing
  ReceiptExtractionResponse model
- Per-field confidence scores (0.0-1.0) with Gemini base 0.85,
  boosted/reduced by regex agreement

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ocr/app/extractors/__init__.py                |   6 +
 .../maintenance_receipt_extractor.py          | 312 ++++++++++++++++++
 ocr/app/patterns/__init__.py                  |   6 +
 .../maintenance_receipt_validation.py         | 299 +++++++++++++++++
 ocr/app/routers/extract.py                    |  90 +++++
 5 files changed, 713 insertions(+)
 create mode 100644 ocr/app/extractors/maintenance_receipt_extractor.py
 create mode 100644 ocr/app/patterns/maintenance_receipt_validation.py

diff --git a/ocr/app/extractors/__init__.py b/ocr/app/extractors/__init__.py
index c0026a7..97d7480 100644
--- a/ocr/app/extractors/__init__.py
+++ b/ocr/app/extractors/__init__.py
@@ -8,6 +8,10 @@ from app.extractors.receipt_extractor import (
     ExtractedField,
 )
 from app.extractors.fuel_receipt import FuelReceiptExtractor, fuel_receipt_extractor
+from app.extractors.maintenance_receipt_extractor import (
+    MaintenanceReceiptExtractor,
+    maintenance_receipt_extractor,
+)
 from app.extractors.manual_extractor import (
     ManualExtractor,
     manual_extractor,
@@ -27,6 +31,8 @@ __all__ = [
     "ExtractedField",
     "FuelReceiptExtractor",
     "fuel_receipt_extractor",
+    "MaintenanceReceiptExtractor",
+    "maintenance_receipt_extractor",
     "ManualExtractor",
     "manual_extractor",
     "ManualExtractionResult",
diff --git a/ocr/app/extractors/maintenance_receipt_extractor.py b/ocr/app/extractors/maintenance_receipt_extractor.py
new file mode 100644
index 0000000..93285ba
--- /dev/null
+++ b/ocr/app/extractors/maintenance_receipt_extractor.py
@@ -0,0 +1,312 @@
+"""Maintenance receipt extraction with Gemini-primary and regex cross-validation.
+
+Flow:
+1. Preprocess image and OCR via receipt_extractor (PaddleOCR)
+2. Send OCR text to Gemini text API for semantic field extraction
+3. Cross-validate structured fields (date, cost, odometer) with regex
+4. Return ReceiptExtractionResult with per-field confidence scores
+"""
+
+import json
+import logging
+import os
+import time
+from typing import Any, Optional
+
+from app.config import settings
+from app.extractors.receipt_extractor import (
+    ExtractedField,
+    ReceiptExtractionResult,
+    receipt_extractor,
+)
+from app.patterns.maintenance_receipt_validation import (
+    MaintenanceReceiptValidation,
+    maintenance_receipt_validator,
+)
+
+logger = logging.getLogger(__name__)
+
+# Base confidence given to every field Gemini returns; regex cross-validation later scales it
+DEFAULT_GEMINI_CONFIDENCE = 0.85
+
+# Prompt template sent to Gemini; {ocr_text} is filled in with str.format() at call time
+_RECEIPT_EXTRACTION_PROMPT = """\
+Extract maintenance service receipt fields from the following OCR text.
+
+For each field, extract the value if present. Return null for fields not found.
+
+Fields to extract:
+- serviceName: The maintenance service performed (e.g., "Oil Change", "Brake Pad Replacement", "Tire Rotation")
+- serviceDate: Date of service in YYYY-MM-DD format
+- totalCost: Total cost as a number (e.g., 89.95)
+- shopName: Name of the shop or business
+- laborCost: Labor cost as a number, or null if not broken out
+- partsCost: Parts cost as a number, or null if not broken out
+- odometerReading: Odometer/mileage reading as a number, or null if not present
+- vehicleInfo: Vehicle description if present (e.g., "2022 Toyota Camry"), or null
+
+Return a JSON object with these field names and their extracted values.
+
+OCR Text:
+---
+{ocr_text}
+---\
+"""
+
+# Structured-output schema handed to GenerationConfig(response_schema=...).
+_RECEIPT_RESPONSE_SCHEMA: dict[str, Any] = {
+    "type": "object",
+    # Every property is nullable so a field that was not found can be reported as null.
+    "properties": {
+        field_name: {"type": json_type, "nullable": True}
+        for field_name, json_type in (
+            ("serviceName", "string"),
+            ("serviceDate", "string"),
+            ("totalCost", "number"),
+            ("shopName", "string"),
+            ("laborCost", "number"),
+            ("partsCost", "number"),
+            ("odometerReading", "number"),
+            ("vehicleInfo", "string"),
+        )
+    },
+    # All keys are required, in the same order as the properties above.
+    "required": [
+        "serviceName", "serviceDate", "totalCost", "shopName",
+        "laborCost", "partsCost", "odometerReading", "vehicleInfo",
+    ],
+}
+
+
+class MaintenanceReceiptExtractor:
+    """Maintenance receipt extractor using Gemini for semantic extraction.
+
+    Wraps receipt_extractor for OCR preprocessing, then sends raw text to
+    Gemini for field extraction. Structured fields (dates, amounts, odometer)
+    are cross-validated against regex patterns for confidence adjustment.
+    """
+
+    def __init__(self) -> None:
+        self._model: Any | None = None
+        self._generation_config: Any | None = None
+
+    def extract(
+        self,
+        image_bytes: bytes,
+        content_type: Optional[str] = None,
+    ) -> ReceiptExtractionResult:
+        """Extract maintenance receipt fields from an image.
+
+        Args:
+            image_bytes: Raw image bytes (HEIC, JPEG, PNG).
+            content_type: MIME type (auto-detected if not provided).
+
+        Returns:
+            ReceiptExtractionResult; success is False with error set when nothing was extracted.
+        """
+        start_time = time.time()
+
+        # Step 1: OCR the image via receipt_extractor
+        ocr_result = receipt_extractor.extract(
+            image_bytes=image_bytes,
+            content_type=content_type,
+        )
+
+        if not ocr_result.success:
+            return ocr_result
+
+        # Treat a missing OCR text like an empty one so .strip() cannot fail
+        raw_text = ocr_result.raw_text or ""
+        if not raw_text.strip():
+            return ReceiptExtractionResult(
+                success=False,
+                error="No text found in image",
+                processing_time_ms=int((time.time() - start_time) * 1000),
+            )
+
+        # Step 2: Extract fields with Gemini (best effort: any failure yields no fields)
+        try:
+            gemini_fields = self._extract_with_gemini(raw_text)
+        except Exception as e:
+            logger.warning("Gemini extraction failed, falling back to OCR-only: %s", e)
+            gemini_fields = {}
+
+        # Step 3: Build extracted fields with base confidence
+        extracted_fields = self._build_fields(gemini_fields)
+
+        if not extracted_fields:
+            return ReceiptExtractionResult(
+                success=False,
+                receipt_type="maintenance",
+                error="No maintenance receipt fields could be extracted",
+                raw_text=raw_text,
+                processing_time_ms=int((time.time() - start_time) * 1000),
+            )
+
+        # Step 4: Cross-validate structured fields with regex. The response schema
+        # returns every key (null when absent), so nulls are dropped first: a null
+        # has nothing to confirm and the regex checks expect real values.
+        present = {k: v for k, v in gemini_fields.items() if v is not None}
+        validation = maintenance_receipt_validator.validate(present, raw_text)
+
+        if validation.issues:
+            logger.info("Maintenance receipt validation issues: %s", validation.issues)
+
+        # Step 5: Adjust confidences based on cross-validation
+        adjusted_fields = self._adjust_confidences(extracted_fields, validation)
+        processing_time_ms = int((time.time() - start_time) * 1000)
+        logger.info(
+            "Maintenance receipt extraction: fields=%d, validated=%s, time=%dms",
+            len(adjusted_fields),
+            validation.is_valid,
+            processing_time_ms,
+        )
+
+        return ReceiptExtractionResult(
+            success=True,
+            receipt_type="maintenance",
+            extracted_fields=adjusted_fields,
+            raw_text=raw_text,
+            processing_time_ms=processing_time_ms,
+        )
+
+    def _get_model(self) -> Any:
+        """Lazy-initialize Vertex AI Gemini model.
+
+        Uses the same authentication pattern as GeminiEngine.
+
+        Raises:
+            RuntimeError: If the credential file is not found at the configured path.
+        """
+        if self._model is not None:
+            return self._model
+
+        key_path = settings.google_vision_key_path
+        if not os.path.isfile(key_path):
+            raise RuntimeError(
+                f"Google credential config not found at {key_path}. "
+                "Set GOOGLE_VISION_KEY_PATH or mount the secret."
+            )
+
+        from google.cloud import aiplatform  # type: ignore[import-untyped]
+        from vertexai.generative_models import (  # type: ignore[import-untyped]
+            GenerationConfig,
+            GenerativeModel,
+        )
+
+        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
+        os.environ["GOOGLE_EXTERNAL_ACCOUNT_ALLOW_EXECUTABLES"] = "1"
+
+        aiplatform.init(
+            project=settings.vertex_ai_project,
+            location=settings.vertex_ai_location,
+        )
+
+        model_name = settings.gemini_model
+        model = GenerativeModel(model_name)
+        # self._model doubles as the "already initialized" flag, so it is set last:
+        # a failure while building the config must not leave a cached model behind.
+        self._generation_config = GenerationConfig(
+            response_mime_type="application/json",
+            response_schema=_RECEIPT_RESPONSE_SCHEMA,
+        )
+        self._model = model
+
+        logger.info("Maintenance receipt Gemini model initialized (model=%s)", model_name)
+        return self._model
+
+    def _extract_with_gemini(self, ocr_text: str) -> dict:
+        """Send OCR text to Gemini for semantic field extraction.
+
+        Args:
+            ocr_text: Raw OCR text from receipt image.
+
+        Returns:
+            Dictionary of field_name -> extracted_value from Gemini.
+
+        Raises:
+            ValueError: If the response text is not valid JSON or not a JSON object.
+        """
+        model = self._get_model()
+        prompt = _RECEIPT_EXTRACTION_PROMPT.format(ocr_text=ocr_text)
+        response = model.generate_content([prompt], generation_config=self._generation_config)
+
+        raw = json.loads(response.text)
+        if not isinstance(raw, dict):
+            raise ValueError(f"Gemini returned {type(raw).__name__}, expected a JSON object")
+
+        logger.info(
+            "Gemini extracted maintenance fields: %s",
+            [k for k, v in raw.items() if v is not None],
+        )
+
+        return raw
+
+    def _build_fields(self, gemini_fields: dict) -> dict[str, ExtractedField]:
+        """Convert Gemini response to ExtractedField dict with base confidence.
+
+        Args:
+            gemini_fields: Raw Gemini response dict.
+
+        Returns:
+            Dictionary of field_name -> ExtractedField; unusable values are left out.
+        """
+        fields: dict[str, ExtractedField] = {}
+
+        for field_name, value in gemini_fields.items():
+            if value is None:
+                continue
+
+            # Convert numeric values to appropriate types
+            if field_name in ("totalCost", "laborCost", "partsCost", "odometerReading"):
+                try:
+                    number = float(value)
+                except (ValueError, TypeError, OverflowError):
+                    continue
+                # False for NaN and +/-inf, which json.loads accepts but which
+                # cannot become an int and are not valid JSON in the response.
+                if not float("-inf") < number < float("inf"):
+                    continue
+                value = int(number) if field_name == "odometerReading" else round(number, 2)
+            elif isinstance(value, str) and not value.strip():
+                continue
+
+            fields[field_name] = ExtractedField(
+                value=value,
+                confidence=DEFAULT_GEMINI_CONFIDENCE,
+            )
+
+        return fields
+
+    def _adjust_confidences(
+        self,
+        fields: dict[str, ExtractedField],
+        validation: MaintenanceReceiptValidation,
+    ) -> dict[str, ExtractedField]:
+        """Adjust field confidences based on cross-validation results.
+
+        Args:
+            fields: Extracted fields with base confidence.
+            validation: Cross-validation results.
+
+        Returns:
+            New fields whose confidence is scaled by the validator and kept in [0, 1].
+        """
+        adjusted: dict[str, ExtractedField] = {}
+
+        for name, extracted_field in fields.items():
+            fv = validation.field_validations.get(name)
+            # Semantic fields (no regex validation) keep base confidence
+            multiplier = 1.0 if fv is None else fv.confidence_adjustment
+            new_confidence = max(0.0, min(1.0, extracted_field.confidence * multiplier))
+
+            adjusted[name] = ExtractedField(
+                value=extracted_field.value,
+                confidence=round(new_confidence, 3),
+            )
+
+        return adjusted
+
+
+# Module-level singleton; re-exported by app.extractors and used by the extract router
+maintenance_receipt_extractor = MaintenanceReceiptExtractor()
diff --git a/ocr/app/patterns/__init__.py b/ocr/app/patterns/__init__.py
index f9a8bc4..969c36a 100644
--- a/ocr/app/patterns/__init__.py
+++ b/ocr/app/patterns/__init__.py
@@ -4,6 +4,10 @@ from app.patterns.currency_patterns import CurrencyPatternMatcher, currency_matc
 from app.patterns.fuel_patterns import FuelPatternMatcher, fuel_matcher
 from app.patterns.maintenance_patterns import MaintenancePatternMatcher, maintenance_matcher
 from app.patterns.service_mapping import ServiceMapper, service_mapper
+from app.patterns.maintenance_receipt_validation import (
+    MaintenanceReceiptValidator,
+    maintenance_receipt_validator,
+)
 
 __all__ = [
     "DatePatternMatcher",
@@ -16,4 +20,6 @@ __all__ = [
     "maintenance_matcher",
     "ServiceMapper",
     "service_mapper",
+    "MaintenanceReceiptValidator",
+    "maintenance_receipt_validator",
 ]
diff --git a/ocr/app/patterns/maintenance_receipt_validation.py b/ocr/app/patterns/maintenance_receipt_validation.py
new file mode 100644
index 0000000..ef2abcc
--- /dev/null
+++ b/ocr/app/patterns/maintenance_receipt_validation.py
@@ -0,0 +1,299 @@
+"""Cross-validation patterns for maintenance receipt field extraction.
+
+Validates structured fields (dates, amounts, odometer) extracted by Gemini
+against regex patterns found in the OCR raw text. Boosts or reduces confidence
+based on regex agreement.
+"""
+
+import re
+from dataclasses import dataclass, field
+from typing import Optional
+
+from app.patterns.currency_patterns import currency_matcher
+from app.patterns.date_patterns import date_matcher
+
+
+@dataclass
+class FieldValidation:
+    """Outcome of checking one Gemini-extracted field against the OCR text."""
+
+    field_name: str  # Name of the field that was checked
+    regex_confirmed: bool  # True when a regex match agreed with the value
+    confidence_adjustment: float  # Factor applied to the confidence (>1 raises, <1 lowers)
+    regex_value: str | None = None  # What the regex found, when it found anything
+
+
+@dataclass
+class MaintenanceReceiptValidation:
+    """Combined outcome of cross-validating the checked fields of one receipt."""
+
+    is_valid: bool  # True when no issue was recorded
+    issues: list[str]  # One human-readable note per field the regex did not confirm
+    field_validations: dict[str, FieldValidation] = field(default_factory=dict)
+    overall_confidence: float = 1.0  # Starts at 1.0; reduced for each unconfirmed field
+
+
+# Odometer patterns applied to upper-cased text: (regex, pattern name, confidence).
+ODOMETER_PATTERNS = [
+    # "Odometer: 45,231" or "Mileage: 45231"; \b keeps "MI" from matching in "MIAMI 33101"
+    (
+        r"\b(?:ODOMETER|MILEAGE|MILES|ODO|MI)\s*[:\s]\s*(\d{1,3}[,.]?\d{3,4})(?!\d)",
+        "labeled_odometer",
+        0.95,
+    ),
+    # "45,231 mi" or "45231 miles"; \b keeps the unit from matching "2022 MITSUBISHI"
+    (
+        r"(?<!\d)(\d{1,3}[,.]?\d{3,4})\s*(?:MILES?|MI|KMS?)\b",
+        "unit_odometer",
+        0.90,
+    ),
+    # Standalone 5-6 digit number (lower confidence)
+    (
+        r"(?<!\d)(\d{5,6})(?!\d)",
+        "standalone_odometer",
+        0.60,
+    ),
+]
+
+
+@dataclass
+class OdometerMatch:
+    """One odometer reading located in the OCR text by a regex pattern."""
+
+    value: int  # Reading parsed to an integer
+    raw_match: str  # Text the pattern matched, stripped of surrounding whitespace
+    confidence: float  # Confidence attached to the pattern that matched
+    pattern_name: str  # Name of the ODOMETER_PATTERNS entry that matched
+
+
+class MaintenanceReceiptValidator:
+    """Cross-validates Gemini-extracted maintenance receipt fields against regex patterns."""
+
+    def validate(
+        self,
+        gemini_fields: dict,
+        raw_text: str,
+    ) -> MaintenanceReceiptValidation:
+        """Validate Gemini-extracted fields against regex patterns in raw OCR text.
+
+        Args:
+            gemini_fields: Fields extracted by Gemini (field_name -> value). Keys that
+                are absent or mapped to None are skipped; there is nothing to confirm.
+            raw_text: Raw OCR text for regex cross-validation.
+
+        Returns:
+            MaintenanceReceiptValidation with per-field results.
+        """
+        issues: list[str] = []
+        field_validations: dict[str, FieldValidation] = {}
+        overall_confidence = 1.0
+
+        # Validate date field (a null value means Gemini found no date)
+        if gemini_fields.get("serviceDate") is not None:
+            date_validation = self._validate_date(
+                gemini_fields["serviceDate"], raw_text
+            )
+            field_validations["serviceDate"] = date_validation
+            if not date_validation.regex_confirmed:
+                issues.append(
+                    f"Service date '{gemini_fields['serviceDate']}' not confirmed by regex"
+                )
+                overall_confidence *= 0.85
+
+        # Validate total cost
+        if gemini_fields.get("totalCost") is not None:
+            cost_validation = self._validate_amount(
+                "totalCost", gemini_fields["totalCost"], raw_text
+            )
+            field_validations["totalCost"] = cost_validation
+            if not cost_validation.regex_confirmed:
+                issues.append(
+                    f"Total cost '{gemini_fields['totalCost']}' not confirmed by regex"
+                )
+                overall_confidence *= 0.85
+
+        # Validate labor cost
+        if gemini_fields.get("laborCost") is not None:
+            labor_validation = self._validate_amount(
+                "laborCost", gemini_fields["laborCost"], raw_text
+            )
+            field_validations["laborCost"] = labor_validation
+            if not labor_validation.regex_confirmed:
+                issues.append(
+                    f"Labor cost '{gemini_fields['laborCost']}' not confirmed by regex"
+                )
+                overall_confidence *= 0.90
+
+        # Validate odometer
+        if gemini_fields.get("odometerReading") is not None:
+            odo_validation = self._validate_odometer(
+                gemini_fields["odometerReading"], raw_text
+            )
+            field_validations["odometerReading"] = odo_validation
+            if not odo_validation.regex_confirmed:
+                issues.append(
+                    f"Odometer '{gemini_fields['odometerReading']}' not confirmed by regex"
+                )
+                overall_confidence *= 0.90
+
+        return MaintenanceReceiptValidation(
+            is_valid=not issues,
+            issues=issues,
+            field_validations=field_validations,
+            overall_confidence=overall_confidence,
+        )
+
+    def extract_odometer(self, text: str) -> Optional[OdometerMatch]:
+        """Extract odometer reading from text using regex patterns.
+
+        Args:
+            text: OCR text to search.
+
+        Returns:
+            Highest-confidence OdometerMatch (the first one found wins ties), or None.
+        """
+        text_upper = text.upper()
+        best_match: Optional[OdometerMatch] = None
+
+        for pattern, name, confidence in ODOMETER_PATTERNS:
+            for match in re.finditer(pattern, text_upper):
+                parsed = self._parse_odometer(match.group(1))
+                if parsed is not None and self._is_reasonable_odometer(parsed):
+                    candidate = OdometerMatch(
+                        value=parsed,
+                        raw_match=match.group(0).strip(),
+                        confidence=confidence,
+                        pattern_name=name,
+                    )
+                    if best_match is None or candidate.confidence > best_match.confidence:
+                        best_match = candidate
+
+        return best_match
+
+    def _validate_date(self, gemini_date: str, raw_text: str) -> FieldValidation:
+        """Check if Gemini-extracted date matches a regex-found date."""
+        regex_dates = date_matcher.extract_dates(raw_text)
+
+        if not regex_dates:
+            # No dates found by regex -- cannot confirm or deny
+            return FieldValidation(
+                field_name="serviceDate",
+                regex_confirmed=False,
+                confidence_adjustment=0.95,
+            )
+
+        # Normalize Gemini date for comparison; str() keeps a non-string value from raising
+        gemini_normalized = str(gemini_date).strip().replace("/", "-")
+
+        for regex_date in regex_dates:
+            if regex_date.value == gemini_normalized:
+                return FieldValidation(
+                    field_name="serviceDate",
+                    regex_confirmed=True,
+                    confidence_adjustment=1.10,
+                    regex_value=regex_date.value,
+                )
+
+        # Gemini found a date but it doesn't match regex dates
+        return FieldValidation(
+            field_name="serviceDate",
+            regex_confirmed=False,
+            confidence_adjustment=0.80,
+            regex_value=regex_dates[0].value,
+        )
+
+    def _validate_amount(
+        self, field_name: str, gemini_amount: str | float, raw_text: str
+    ) -> FieldValidation:
+        """Check if Gemini-extracted amount matches a regex-found amount."""
+        try:
+            gemini_value = float(str(gemini_amount).replace("$", "").replace(",", ""))
+        except (ValueError, TypeError):
+            return FieldValidation(
+                field_name=field_name,
+                regex_confirmed=False,
+                confidence_adjustment=0.70,
+            )
+
+        regex_amounts = currency_matcher.extract_all_amounts(raw_text)
+        if not regex_amounts:
+            return FieldValidation(
+                field_name=field_name,
+                regex_confirmed=False,
+                confidence_adjustment=0.95,
+            )
+
+        # Check if any regex amount matches within 5% tolerance
+        for regex_amount in regex_amounts:
+            if gemini_value > 0 and abs(regex_amount.value - gemini_value) / gemini_value < 0.05:
+                return FieldValidation(
+                    field_name=field_name,
+                    regex_confirmed=True,
+                    confidence_adjustment=1.10,
+                    regex_value=str(regex_amount.value),
+                )
+
+        return FieldValidation(
+            field_name=field_name,
+            regex_confirmed=False,
+            confidence_adjustment=0.80,
+            regex_value=str(regex_amounts[0].value),
+        )
+
+    def _validate_odometer(
+        self, gemini_odometer: str | int | float, raw_text: str
+    ) -> FieldValidation:
+        """Check if Gemini-extracted odometer matches a regex-found reading."""
+        try:
+            # A JSON number may arrive as the float 45231.0; stripping "." from its
+            # text form would read 452310, so numeric values are converted directly.
+            if isinstance(gemini_odometer, (int, float)):
+                gemini_value = int(gemini_odometer)
+            else:
+                gemini_value = int(str(gemini_odometer).replace(",", "").replace(".", ""))
+        except (ValueError, TypeError, OverflowError):
+            return FieldValidation(
+                field_name="odometerReading",
+                regex_confirmed=False,
+                confidence_adjustment=0.70,
+            )
+
+        regex_match = self.extract_odometer(raw_text)
+        if not regex_match:
+            return FieldValidation(
+                field_name="odometerReading",
+                regex_confirmed=False,
+                confidence_adjustment=0.95,
+            )
+
+        # Check if values match within 1% tolerance (OCR might misread a digit)
+        if gemini_value > 0 and abs(regex_match.value - gemini_value) / gemini_value < 0.01:
+            return FieldValidation(
+                field_name="odometerReading",
+                regex_confirmed=True,
+                confidence_adjustment=1.10,
+                regex_value=str(regex_match.value),
+            )
+
+        return FieldValidation(
+            field_name="odometerReading",
+            regex_confirmed=False,
+            confidence_adjustment=0.80,
+            regex_value=str(regex_match.value),
+        )
+
+    def _parse_odometer(self, raw: str) -> Optional[int]:
+        """Parse odometer string to integer, ignoring "," and "." separators."""
+        cleaned = raw.replace(",", "").replace(".", "").strip()
+        try:
+            return int(cleaned)
+        except ValueError:
+            return None
+
+    def _is_reasonable_odometer(self, value: int) -> bool:
+        """Check if odometer reading is in a reasonable range."""
+        return 100 <= value <= 999_999
+
+
+# Module-level singleton; re-exported by app.patterns and used by the receipt extractor
+maintenance_receipt_validator = MaintenanceReceiptValidator()
diff --git a/ocr/app/routers/extract.py b/ocr/app/routers/extract.py
index e9aa6e3..3c1d02f 100644
--- a/ocr/app/routers/extract.py
+++ b/ocr/app/routers/extract.py
@@ -6,6 +6,7 @@ from fastapi import APIRouter, BackgroundTasks, File, Form, HTTPException, Query
 
 from app.extractors.vin_extractor import vin_extractor
 from app.extractors.receipt_extractor import receipt_extractor
+from app.extractors.maintenance_receipt_extractor import maintenance_receipt_extractor
 from app.extractors.manual_extractor import manual_extractor
 from app.models import (
     BoundingBox,
@@ -267,6 +268,95 @@ async def extract_receipt(
     )
 
 
+@router.post("/maintenance-receipt", response_model=ReceiptExtractionResponse)
+async def extract_maintenance_receipt(
+    file: UploadFile = File(..., description="Maintenance receipt image file"),
+) -> ReceiptExtractionResponse:
+    """
+    Extract data from a maintenance receipt image using OCR + Gemini.
+
+    Gemini-primary extraction with regex cross-validation:
+    - OCR preprocessing (HEIC conversion, contrast, thresholding)
+    - PaddleOCR text extraction
+    - Gemini semantic field extraction from OCR text
+    - Regex cross-validation for dates, amounts, odometer
+
+    Supports HEIC, JPEG, PNG formats.
+
+    - **file**: Maintenance receipt image file (max 10MB)
+
+    Returns:
+    - **receiptType**: "maintenance"
+    - **extractedFields**: Dictionary of extracted fields with confidence scores
+      - serviceName: Service performed (e.g., "Oil Change")
+      - serviceDate: Date in YYYY-MM-DD format
+      - totalCost: Total cost
+      - shopName: Shop or business name
+      - laborCost: Labor cost (if broken out)
+      - partsCost: Parts cost (if broken out)
+      - odometerReading: Odometer reading (if present)
+      - vehicleInfo: Vehicle description (if present)
+    - **rawText**: Full OCR text
+    - **processingTimeMs**: Processing time in milliseconds
+    """
+    # Reject uploads that carry no filename
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="No file provided")
+
+    # Load the whole upload into memory so its size can be checked
+    payload = await file.read()
+    file_size = len(payload)
+
+    # Enforce the upper size limit before doing any OCR work
+    if file_size > MAX_SYNC_SIZE:
+        raise HTTPException(
+            status_code=413,
+            detail=f"File too large. Max: {MAX_SYNC_SIZE // (1024*1024)}MB",
+        )
+
+    # A zero-byte upload cannot contain a receipt
+    if file_size == 0:
+        raise HTTPException(status_code=400, detail="Empty file provided")
+
+    # Record what is about to be processed
+    logger.info(
+        f"Maintenance receipt extraction: {file.filename}, "
+        f"size: {file_size} bytes, "
+        f"content_type: {file.content_type}"
+    )
+
+    # Run the OCR + Gemini pipeline on the uploaded bytes
+    result = maintenance_receipt_extractor.extract(
+        image_bytes=payload,
+        content_type=file.content_type,
+    )
+
+    # Surface extractor failures to the client as 422
+    if not result.success:
+        logger.warning(
+            f"Maintenance receipt extraction failed for {file.filename}: {result.error}"
+        )
+        failure_detail = result.error or "Failed to extract data from maintenance receipt"
+        raise HTTPException(status_code=422, detail=failure_detail)
+
+    # Re-wrap the internal fields in the API response model
+    api_fields = {}
+    for name, item in result.extracted_fields.items():
+        api_fields[name] = ReceiptExtractedField(
+            value=item.value,
+            confidence=item.confidence,
+        )
+
+    return ReceiptExtractionResponse(
+        success=result.success,
+        receiptType=result.receipt_type,
+        extractedFields=api_fields,
+        rawText=result.raw_text,
+        processingTimeMs=result.processing_time_ms,
+        error=result.error,
+    )
+
+
 @router.post("/manual", response_model=ManualJobResponse)
 async def extract_manual(
     background_tasks: BackgroundTasks,