feat: add maintenance receipt extraction pipeline with Gemini + regex (refs #150)

- New MaintenanceReceiptExtractor: Gemini-primary extraction with regex
  cross-validation for dates, amounts, and odometer readings
- New maintenance_receipt_validation.py: cross-validation patterns for
  structured field confidence adjustment
- New POST /extract/maintenance-receipt endpoint reusing
  ReceiptExtractionResponse model
- Per-field confidence scores (0.0-1.0) with Gemini base 0.85,
  boosted/reduced by regex agreement

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-12 21:14:13 -06:00
parent 0e97128a31
commit 90401dc1ba
5 changed files with 713 additions and 0 deletions

View File

@@ -8,6 +8,10 @@ from app.extractors.receipt_extractor import (
ExtractedField, ExtractedField,
) )
from app.extractors.fuel_receipt import FuelReceiptExtractor, fuel_receipt_extractor from app.extractors.fuel_receipt import FuelReceiptExtractor, fuel_receipt_extractor
from app.extractors.maintenance_receipt_extractor import (
MaintenanceReceiptExtractor,
maintenance_receipt_extractor,
)
from app.extractors.manual_extractor import ( from app.extractors.manual_extractor import (
ManualExtractor, ManualExtractor,
manual_extractor, manual_extractor,
@@ -27,6 +31,8 @@ __all__ = [
"ExtractedField", "ExtractedField",
"FuelReceiptExtractor", "FuelReceiptExtractor",
"fuel_receipt_extractor", "fuel_receipt_extractor",
"MaintenanceReceiptExtractor",
"maintenance_receipt_extractor",
"ManualExtractor", "ManualExtractor",
"manual_extractor", "manual_extractor",
"ManualExtractionResult", "ManualExtractionResult",

View File

@@ -0,0 +1,312 @@
"""Maintenance receipt extraction with Gemini-primary and regex cross-validation.
Flow:
1. Preprocess image and OCR via receipt_extractor (PaddleOCR)
2. Send OCR text to Gemini text API for semantic field extraction
3. Cross-validate structured fields (date, cost, odometer) with regex
4. Return ReceiptExtractionResult with per-field confidence scores
"""
import json
import logging
import os
import time
from typing import Any, Optional
from app.config import settings
from app.extractors.receipt_extractor import (
ExtractedField,
ReceiptExtractionResult,
receipt_extractor,
)
from app.patterns.maintenance_receipt_validation import (
MaintenanceReceiptValidation,
maintenance_receipt_validator,
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Default confidence for Gemini-extracted fields before cross-validation
DEFAULT_GEMINI_CONFIDENCE = 0.85
# Gemini prompt for maintenance receipt field extraction.
# This is a str.format() template whose only placeholder is {ocr_text};
# any other literal brace added to the text below would need to be doubled.
# The field names listed here mirror the keys of the response schema.
_RECEIPT_EXTRACTION_PROMPT = """\
Extract maintenance service receipt fields from the following OCR text.
For each field, extract the value if present. Return null for fields not found.
Fields to extract:
- serviceName: The maintenance service performed (e.g., "Oil Change", "Brake Pad Replacement", "Tire Rotation")
- serviceDate: Date of service in YYYY-MM-DD format
- totalCost: Total cost as a number (e.g., 89.95)
- shopName: Name of the shop or business
- laborCost: Labor cost as a number, or null if not broken out
- partsCost: Parts cost as a number, or null if not broken out
- odometerReading: Odometer/mileage reading as a number, or null if not present
- vehicleInfo: Vehicle description if present (e.g., "2022 Toyota Camry"), or null
Return a JSON object with these field names and their extracted values.
OCR Text:
---
{ocr_text}
---\
"""
# Structured-output schema handed to Gemini. Every field is nullable (the
# model answers null when a value is absent) yet listed as required, so the
# response always carries all eight keys.
_RECEIPT_RESPONSE_SCHEMA: dict[str, Any] = {
    "type": "object",
    "properties": {
        field_name: {"type": json_type, "nullable": True}
        for field_name, json_type in (
            ("serviceName", "string"),
            ("serviceDate", "string"),
            ("totalCost", "number"),
            ("shopName", "string"),
            ("laborCost", "number"),
            ("partsCost", "number"),
            ("odometerReading", "number"),
            ("vehicleInfo", "string"),
        )
    },
    "required": [
        "serviceName",
        "serviceDate",
        "totalCost",
        "shopName",
        "laborCost",
        "partsCost",
        "odometerReading",
        "vehicleInfo",
    ],
}
class MaintenanceReceiptExtractor:
    """Maintenance receipt extractor using Gemini for semantic extraction.

    Wraps receipt_extractor for OCR preprocessing, then sends raw text to
    Gemini for field extraction. Structured fields (dates, amounts, odometer)
    are cross-validated against regex patterns for confidence adjustment.
    """

    def __init__(self) -> None:
        # Both attributes are populated lazily by _get_model() on first use.
        self._model: Any | None = None
        self._generation_config: Any | None = None

    def extract(
        self,
        image_bytes: bytes,
        content_type: Optional[str] = None,
    ) -> ReceiptExtractionResult:
        """Extract maintenance receipt fields from an image.

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG).
            content_type: MIME type (auto-detected if not provided).

        Returns:
            ReceiptExtractionResult with maintenance-specific fields. When
            the OCR step itself fails, its result is returned unchanged.
        """
        start_time = time.time()

        # Step 1: OCR the image via receipt_extractor
        ocr_result = receipt_extractor.extract(
            image_bytes=image_bytes,
            content_type=content_type,
        )
        if not ocr_result.success:
            return ocr_result

        raw_text = ocr_result.raw_text
        if not raw_text or not raw_text.strip():
            return ReceiptExtractionResult(
                success=False,
                error="No text found in image",
                processing_time_ms=int((time.time() - start_time) * 1000),
            )

        # Step 2: Extract fields with Gemini.
        # NOTE: there is no regex-only extraction path here; an empty dict
        # leads to the "no fields" failure result below.
        try:
            gemini_fields = self._extract_with_gemini(raw_text)
        except Exception as e:
            logger.warning(
                "Gemini extraction failed, falling back to OCR-only: %s", e
            )
            gemini_fields = {}

        # Step 3: Build extracted fields with base confidence
        extracted_fields = self._build_fields(gemini_fields)
        if not extracted_fields:
            return ReceiptExtractionResult(
                success=False,
                receipt_type="maintenance",
                error="No maintenance receipt fields could be extracted",
                raw_text=raw_text,
                processing_time_ms=int((time.time() - start_time) * 1000),
            )

        # Step 4: Cross-validate structured fields with regex.
        # Validate the normalised values, not the raw Gemini payload: the
        # response schema makes every key required-but-nullable, so the raw
        # dict routinely carries None (and floats such as 45231.0 for the
        # odometer) that the validators are not meant to receive.
        validation_input = {
            name: item.value for name, item in extracted_fields.items()
        }
        validation = maintenance_receipt_validator.validate(
            validation_input, raw_text
        )
        if validation.issues:
            logger.info(
                "Maintenance receipt validation issues: %s", validation.issues
            )

        # Step 5: Adjust confidences based on cross-validation
        adjusted_fields = self._adjust_confidences(extracted_fields, validation)

        processing_time_ms = int((time.time() - start_time) * 1000)
        logger.info(
            "Maintenance receipt extraction: fields=%s, validated=%s, time=%sms",
            len(adjusted_fields),
            validation.is_valid,
            processing_time_ms,
        )
        return ReceiptExtractionResult(
            success=True,
            receipt_type="maintenance",
            extracted_fields=adjusted_fields,
            raw_text=raw_text,
            processing_time_ms=processing_time_ms,
        )

    def _get_model(self) -> Any:
        """Lazy-initialize Vertex AI Gemini model.

        Uses the same authentication pattern as GeminiEngine.

        Returns:
            The cached GenerativeModel instance.

        Raises:
            RuntimeError: If the credential file configured in settings
                does not exist.
        """
        if self._model is not None:
            return self._model

        key_path = settings.google_vision_key_path
        if not os.path.isfile(key_path):
            raise RuntimeError(
                f"Google credential config not found at {key_path}. "
                "Set GOOGLE_VISION_KEY_PATH or mount the secret."
            )

        # Imported lazily so the module can be loaded without the SDK.
        from google.cloud import aiplatform  # type: ignore[import-untyped]
        from vertexai.generative_models import (  # type: ignore[import-untyped]
            GenerationConfig,
            GenerativeModel,
        )

        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
        os.environ["GOOGLE_EXTERNAL_ACCOUNT_ALLOW_EXECUTABLES"] = "1"
        aiplatform.init(
            project=settings.vertex_ai_project,
            location=settings.vertex_ai_location,
        )

        model_name = settings.gemini_model
        # Build both objects before caching either one. If the model were
        # cached first and GenerationConfig then raised, later calls would
        # reuse the model with generation_config=None and lose the schema.
        model = GenerativeModel(model_name)
        generation_config = GenerationConfig(
            response_mime_type="application/json",
            response_schema=_RECEIPT_RESPONSE_SCHEMA,
        )
        self._generation_config = generation_config
        self._model = model

        logger.info(
            "Maintenance receipt Gemini model initialized (model=%s)",
            model_name,
        )
        return self._model

    def _extract_with_gemini(self, ocr_text: str) -> dict:
        """Send OCR text to Gemini for semantic field extraction.

        Args:
            ocr_text: Raw OCR text from receipt image.

        Returns:
            Dictionary of field_name -> extracted_value from Gemini.

        Raises:
            ValueError: If the response is not valid JSON or is not a JSON
                object.
        """
        model = self._get_model()
        prompt = _RECEIPT_EXTRACTION_PROMPT.format(ocr_text=ocr_text)
        response = model.generate_content(
            [prompt],
            generation_config=self._generation_config,
        )
        raw = json.loads(response.text)
        if not isinstance(raw, dict):
            # Fail with a clear message instead of an AttributeError on
            # .items(); the caller treats any exception as "no fields".
            raise ValueError(
                f"Expected a JSON object from Gemini, got {type(raw).__name__}"
            )
        logger.info(
            "Gemini extracted maintenance fields: %s",
            [k for k, v in raw.items() if v is not None],
        )
        return raw

    def _build_fields(self, gemini_fields: dict) -> dict[str, ExtractedField]:
        """Convert Gemini response to ExtractedField dict with base confidence.

        Null values, blank strings and numeric fields that cannot be parsed
        are dropped. Costs are rounded to two decimals and the odometer is
        truncated to an integer.

        Args:
            gemini_fields: Raw Gemini response dict.

        Returns:
            Dictionary of field_name -> ExtractedField.
        """
        fields: dict[str, ExtractedField] = {}
        for field_name, value in gemini_fields.items():
            if value is None:
                continue
            # Convert numeric values to appropriate types
            if field_name in ("totalCost", "laborCost", "partsCost"):
                try:
                    value = round(float(value), 2)
                except (ValueError, TypeError):
                    continue
            elif field_name == "odometerReading":
                try:
                    value = int(float(value))
                except (ValueError, TypeError, OverflowError):
                    # OverflowError: int() of an infinite float.
                    continue
            elif isinstance(value, str) and not value.strip():
                continue
            fields[field_name] = ExtractedField(
                value=value,
                confidence=DEFAULT_GEMINI_CONFIDENCE,
            )
        return fields

    def _adjust_confidences(
        self,
        fields: dict[str, ExtractedField],
        validation: MaintenanceReceiptValidation,
    ) -> dict[str, ExtractedField]:
        """Adjust field confidences based on cross-validation results.

        Args:
            fields: Extracted fields with base confidence.
            validation: Cross-validation results.

        Returns:
            New ExtractedField objects whose confidence is multiplied by the
            field's adjustment (capped at 1.0) and rounded to three decimals.
        """
        adjusted: dict[str, ExtractedField] = {}
        for name, extracted_field in fields.items():
            if name in validation.field_validations:
                fv = validation.field_validations[name]
                new_confidence = min(
                    1.0, extracted_field.confidence * fv.confidence_adjustment
                )
            else:
                # Semantic fields (no regex validation) keep base confidence
                new_confidence = extracted_field.confidence
            adjusted[name] = ExtractedField(
                value=extracted_field.value,
                confidence=round(new_confidence, 3),
            )
        return adjusted


# Singleton instance
maintenance_receipt_extractor = MaintenanceReceiptExtractor()

View File

@@ -4,6 +4,10 @@ from app.patterns.currency_patterns import CurrencyPatternMatcher, currency_matc
from app.patterns.fuel_patterns import FuelPatternMatcher, fuel_matcher from app.patterns.fuel_patterns import FuelPatternMatcher, fuel_matcher
from app.patterns.maintenance_patterns import MaintenancePatternMatcher, maintenance_matcher from app.patterns.maintenance_patterns import MaintenancePatternMatcher, maintenance_matcher
from app.patterns.service_mapping import ServiceMapper, service_mapper from app.patterns.service_mapping import ServiceMapper, service_mapper
from app.patterns.maintenance_receipt_validation import (
MaintenanceReceiptValidator,
maintenance_receipt_validator,
)
__all__ = [ __all__ = [
"DatePatternMatcher", "DatePatternMatcher",
@@ -16,4 +20,6 @@ __all__ = [
"maintenance_matcher", "maintenance_matcher",
"ServiceMapper", "ServiceMapper",
"service_mapper", "service_mapper",
"MaintenanceReceiptValidator",
"maintenance_receipt_validator",
] ]

View File

@@ -0,0 +1,299 @@
"""Cross-validation patterns for maintenance receipt field extraction.
Validates structured fields (dates, amounts, odometer) extracted by Gemini
against regex patterns found in the OCR raw text. Boosts or reduces confidence
based on regex agreement.
"""
import re
from dataclasses import dataclass, field
from typing import Optional
from app.patterns.currency_patterns import currency_matcher
from app.patterns.date_patterns import date_matcher
@dataclass
class FieldValidation:
"""Validation result for a single extracted field."""
field_name: str
regex_confirmed: bool
confidence_adjustment: float # Multiplier: >1.0 boosts, <1.0 reduces
regex_value: Optional[str] = None # Value found by regex, if any
@dataclass
class MaintenanceReceiptValidation:
"""Aggregated validation result for a maintenance receipt."""
is_valid: bool
issues: list[str]
field_validations: dict[str, FieldValidation] = field(default_factory=dict)
overall_confidence: float = 1.0
# Odometer patterns: 5-7 digit numbers near odometer keywords
ODOMETER_PATTERNS = [
# "Odometer: 45,231" or "Mileage: 45231"
(
r"(?:ODOMETER|MILEAGE|MILES|ODO|MI)\s*[:\s]\s*(\d{1,3}[,.]?\d{3,4})",
"labeled_odometer",
0.95,
),
# "45,231 mi" or "45231 miles"
(
r"(\d{1,3}[,.]?\d{3,4})\s*(?:MI|MILES|KM)",
"unit_odometer",
0.90,
),
# Standalone 5-6 digit number (lower confidence)
(
r"(?<!\d)(\d{5,6})(?!\d)",
"standalone_odometer",
0.60,
),
]
@dataclass
class OdometerMatch:
"""Result of odometer pattern matching."""
value: int
raw_match: str
confidence: float
pattern_name: str
class MaintenanceReceiptValidator:
"""Cross-validates Gemini-extracted maintenance receipt fields against regex patterns."""
def validate(
self,
gemini_fields: dict,
raw_text: str,
) -> MaintenanceReceiptValidation:
"""Validate Gemini-extracted fields against regex patterns in raw OCR text.
Args:
gemini_fields: Fields extracted by Gemini (field_name -> value).
raw_text: Raw OCR text for regex cross-validation.
Returns:
MaintenanceReceiptValidation with per-field results.
"""
issues: list[str] = []
field_validations: dict[str, FieldValidation] = {}
overall_confidence = 1.0
# Validate date field
if "serviceDate" in gemini_fields:
date_validation = self._validate_date(
gemini_fields["serviceDate"], raw_text
)
field_validations["serviceDate"] = date_validation
if not date_validation.regex_confirmed:
issues.append(
f"Service date '{gemini_fields['serviceDate']}' not confirmed by regex"
)
overall_confidence *= 0.85
# Validate total cost
if "totalCost" in gemini_fields:
cost_validation = self._validate_amount(
"totalCost", gemini_fields["totalCost"], raw_text
)
field_validations["totalCost"] = cost_validation
if not cost_validation.regex_confirmed:
issues.append(
f"Total cost '{gemini_fields['totalCost']}' not confirmed by regex"
)
overall_confidence *= 0.85
# Validate labor cost
if "laborCost" in gemini_fields:
labor_validation = self._validate_amount(
"laborCost", gemini_fields["laborCost"], raw_text
)
field_validations["laborCost"] = labor_validation
if not labor_validation.regex_confirmed:
issues.append(
f"Labor cost '{gemini_fields['laborCost']}' not confirmed by regex"
)
overall_confidence *= 0.90
# Validate odometer
if "odometerReading" in gemini_fields:
odo_validation = self._validate_odometer(
gemini_fields["odometerReading"], raw_text
)
field_validations["odometerReading"] = odo_validation
if not odo_validation.regex_confirmed:
issues.append(
f"Odometer '{gemini_fields['odometerReading']}' not confirmed by regex"
)
overall_confidence *= 0.90
is_valid = len(issues) == 0
return MaintenanceReceiptValidation(
is_valid=is_valid,
issues=issues,
field_validations=field_validations,
overall_confidence=overall_confidence,
)
def extract_odometer(self, text: str) -> Optional[OdometerMatch]:
"""Extract odometer reading from text using regex patterns.
Args:
text: OCR text to search.
Returns:
Best OdometerMatch or None.
"""
text_upper = text.upper()
best_match: Optional[OdometerMatch] = None
for pattern, name, confidence in ODOMETER_PATTERNS:
for match in re.finditer(pattern, text_upper):
raw_value = match.group(1)
parsed = self._parse_odometer(raw_value)
if parsed is not None and self._is_reasonable_odometer(parsed):
candidate = OdometerMatch(
value=parsed,
raw_match=match.group(0).strip(),
confidence=confidence,
pattern_name=name,
)
if best_match is None or candidate.confidence > best_match.confidence:
best_match = candidate
return best_match
def _validate_date(self, gemini_date: str, raw_text: str) -> FieldValidation:
"""Check if Gemini-extracted date matches a regex-found date."""
regex_dates = date_matcher.extract_dates(raw_text)
if not regex_dates:
# No dates found by regex -- cannot confirm or deny
return FieldValidation(
field_name="serviceDate",
regex_confirmed=False,
confidence_adjustment=0.95,
)
# Normalize Gemini date for comparison
gemini_normalized = gemini_date.strip().replace("/", "-")
for regex_date in regex_dates:
if regex_date.value == gemini_normalized:
return FieldValidation(
field_name="serviceDate",
regex_confirmed=True,
confidence_adjustment=1.10,
regex_value=regex_date.value,
)
# Gemini found a date but it doesn't match regex dates
return FieldValidation(
field_name="serviceDate",
regex_confirmed=False,
confidence_adjustment=0.80,
regex_value=regex_dates[0].value if regex_dates else None,
)
def _validate_amount(
self, field_name: str, gemini_amount: str | float, raw_text: str
) -> FieldValidation:
"""Check if Gemini-extracted amount matches a regex-found amount."""
try:
gemini_value = float(str(gemini_amount).replace("$", "").replace(",", ""))
except (ValueError, TypeError):
return FieldValidation(
field_name=field_name,
regex_confirmed=False,
confidence_adjustment=0.70,
)
regex_amounts = currency_matcher.extract_all_amounts(raw_text)
if not regex_amounts:
return FieldValidation(
field_name=field_name,
regex_confirmed=False,
confidence_adjustment=0.95,
)
# Check if any regex amount matches within 5% tolerance
for regex_amount in regex_amounts:
if gemini_value > 0 and abs(regex_amount.value - gemini_value) / gemini_value < 0.05:
return FieldValidation(
field_name=field_name,
regex_confirmed=True,
confidence_adjustment=1.10,
regex_value=str(regex_amount.value),
)
return FieldValidation(
field_name=field_name,
regex_confirmed=False,
confidence_adjustment=0.80,
regex_value=str(regex_amounts[0].value) if regex_amounts else None,
)
def _validate_odometer(
self, gemini_odometer: str | int, raw_text: str
) -> FieldValidation:
"""Check if Gemini-extracted odometer matches a regex-found reading."""
try:
gemini_value = int(
str(gemini_odometer).replace(",", "").replace(".", "").strip()
)
except (ValueError, TypeError):
return FieldValidation(
field_name="odometerReading",
regex_confirmed=False,
confidence_adjustment=0.70,
)
regex_match = self.extract_odometer(raw_text)
if not regex_match:
return FieldValidation(
field_name="odometerReading",
regex_confirmed=False,
confidence_adjustment=0.95,
)
# Check if values match within 1% tolerance (OCR might misread a digit)
if gemini_value > 0 and abs(regex_match.value - gemini_value) / gemini_value < 0.01:
return FieldValidation(
field_name="odometerReading",
regex_confirmed=True,
confidence_adjustment=1.10,
regex_value=str(regex_match.value),
)
return FieldValidation(
field_name="odometerReading",
regex_confirmed=False,
confidence_adjustment=0.80,
regex_value=str(regex_match.value),
)
def _parse_odometer(self, raw: str) -> Optional[int]:
"""Parse odometer string to integer."""
cleaned = raw.replace(",", "").replace(".", "").strip()
try:
return int(cleaned)
except ValueError:
return None
def _is_reasonable_odometer(self, value: int) -> bool:
"""Check if odometer reading is in a reasonable range."""
return 100 <= value <= 999_999
# Singleton instance
maintenance_receipt_validator = MaintenanceReceiptValidator()

View File

@@ -6,6 +6,7 @@ from fastapi import APIRouter, BackgroundTasks, File, Form, HTTPException, Query
from app.extractors.vin_extractor import vin_extractor from app.extractors.vin_extractor import vin_extractor
from app.extractors.receipt_extractor import receipt_extractor from app.extractors.receipt_extractor import receipt_extractor
from app.extractors.maintenance_receipt_extractor import maintenance_receipt_extractor
from app.extractors.manual_extractor import manual_extractor from app.extractors.manual_extractor import manual_extractor
from app.models import ( from app.models import (
BoundingBox, BoundingBox,
@@ -267,6 +268,95 @@ async def extract_receipt(
) )
@router.post("/maintenance-receipt", response_model=ReceiptExtractionResponse)
async def extract_maintenance_receipt(
    file: UploadFile = File(..., description="Maintenance receipt image file"),
) -> ReceiptExtractionResponse:
    """
    Extract data from a maintenance receipt image using OCR + Gemini.

    Gemini-primary extraction with regex cross-validation:
    - OCR preprocessing (HEIC conversion, contrast, thresholding)
    - PaddleOCR text extraction
    - Gemini semantic field extraction from OCR text
    - Regex cross-validation for dates, amounts, odometer

    Supports HEIC, JPEG, PNG formats.

    - **file**: Maintenance receipt image file (max 10MB)

    Returns:
    - **receiptType**: "maintenance"
    - **extractedFields**: Dictionary of extracted fields with confidence scores
      - serviceName: Service performed (e.g., "Oil Change")
      - serviceDate: Date in YYYY-MM-DD format
      - totalCost: Total cost
      - shopName: Shop or business name
      - laborCost: Labor cost (if broken out)
      - partsCost: Parts cost (if broken out)
      - odometerReading: Odometer reading (if present)
      - vehicleInfo: Vehicle description (if present)
    - **rawText**: Full OCR text
    - **processingTimeMs**: Processing time in milliseconds
    """
    # An upload without a filename is treated as "no file".
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    # The whole upload is read into memory before the size checks run.
    payload = await file.read()
    payload_size = len(payload)

    if payload_size > MAX_SYNC_SIZE:
        limit_mb = MAX_SYNC_SIZE // (1024*1024)
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Max: {limit_mb}MB",
        )
    if payload_size == 0:
        raise HTTPException(status_code=400, detail="Empty file provided")

    logger.info(
        f"Maintenance receipt extraction: {file.filename}, "
        f"size: {payload_size} bytes, "
        f"content_type: {file.content_type}"
    )

    # Run the OCR + Gemini pipeline (synchronous call).
    result = maintenance_receipt_extractor.extract(
        image_bytes=payload,
        content_type=file.content_type,
    )

    # Extraction failures surface to the client as 422 with the extractor's
    # own error text when it supplied one.
    if not result.success:
        logger.warning(
            f"Maintenance receipt extraction failed for {file.filename}: {result.error}"
        )
        failure_detail = (
            result.error or "Failed to extract data from maintenance receipt"
        )
        raise HTTPException(status_code=422, detail=failure_detail)

    # Map internal extracted fields onto the API response model.
    api_fields = {}
    for field_name, extracted in result.extracted_fields.items():
        api_fields[field_name] = ReceiptExtractedField(
            value=extracted.value,
            confidence=extracted.confidence,
        )

    return ReceiptExtractionResponse(
        success=result.success,
        receiptType=result.receipt_type,
        extractedFields=api_fields,
        rawText=result.raw_text,
        processingTimeMs=result.processing_time_ms,
        error=result.error,
    )
@router.post("/manual", response_model=ManualJobResponse) @router.post("/manual", response_model=ManualJobResponse)
async def extract_manual( async def extract_manual(
background_tasks: BackgroundTasks, background_tasks: BackgroundTasks,