Files
motovaultpro/ocr/app/patterns/maintenance_receipt_validation.py
Eric Gullickson 90401dc1ba feat: add maintenance receipt extraction pipeline with Gemini + regex (refs #150)
- New MaintenanceReceiptExtractor: Gemini-primary extraction with regex
  cross-validation for dates, amounts, and odometer readings
- New maintenance_receipt_validation.py: cross-validation patterns for
  structured field confidence adjustment
- New POST /extract/maintenance-receipt endpoint reusing
  ReceiptExtractionResponse model
- Per-field confidence scores (0.0-1.0) with Gemini base 0.85,
  boosted/reduced by regex agreement

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-12 21:14:13 -06:00

300 lines
10 KiB
Python

"""Cross-validation patterns for maintenance receipt field extraction.
Validates structured fields (dates, amounts, odometer) extracted by Gemini
against regex patterns found in the OCR raw text. Boosts or reduces confidence
based on regex agreement.
"""
import re
from dataclasses import dataclass, field
from typing import Optional
from app.patterns.currency_patterns import currency_matcher
from app.patterns.date_patterns import date_matcher
@dataclass
class FieldValidation:
    """Validation result for a single extracted field.

    Produced by comparing one Gemini-extracted value with the candidates
    that the regex matchers found in the raw OCR text.
    """

    # Key of the validated field, e.g. "serviceDate" or "totalCost".
    field_name: str
    # True only when a regex-found value agreed with the Gemini value.
    regex_confirmed: bool
    # Confidence multiplier: >1.0 boosts, <1.0 reduces.
    confidence_adjustment: float
    # Value found by regex, if any; None when regex found nothing or the
    # Gemini value could not be parsed.
    regex_value: Optional[str] = None
@dataclass
class MaintenanceReceiptValidation:
    """Aggregated validation result for a maintenance receipt."""

    # True when no field produced an issue (i.e. `issues` is empty).
    is_valid: bool
    # One human-readable message per field that regex could not confirm.
    issues: list[str]
    # Per-field results keyed by field name (e.g. "serviceDate").
    field_validations: dict[str, FieldValidation] = field(default_factory=dict)
    # Receipt-level confidence; starts at 1.0 and is multiplied by a penalty
    # for every unconfirmed field.
    overall_confidence: float = 1.0
# Odometer patterns, ordered from most to least specific. Each entry is
# (regex, pattern_name, base_confidence). The patterns are written in upper
# case and are meant to be applied to upper-cased text. Group 1 captures the
# reading: 4-7 digits with an optional "," or "." thousands separator for the
# labeled/unit patterns, 5-6 plain digits for the standalone pattern.
ODOMETER_PATTERNS = [
    # "Odometer: 45,231" or "Mileage: 45231".
    # - \b keeps labels from matching inside longer words ("SMILES 12345").
    # - A bare "MI" only counts as a label when a colon follows, so a US
    #   state code in front of a ZIP ("ANN ARBOR, MI 48104") is not read as
    #   a labeled odometer.
    # - (?!\d) rejects readings that are only the prefix of a longer number.
    (
        r"\b(?:ODOMETER|MILEAGE|MILES|ODO|MI(?=\s*:))\s*[:\s]\s*"
        r"(\d{1,3}[,.]?\d{3,4})(?!\d)",
        "labeled_odometer",
        0.95,
    ),
    # "45,231 mi" or "45231 miles".
    # - The trailing \b requires the unit to be a whole word, so
    #   "30000 MILE SERVICE" and "12345 MINUTES" are not odometer readings.
    # - The lookbehind stops the capture from starting in the middle of a
    #   longer number such as "1,234,567 MI".
    (
        r"(?<![\d,.])(\d{1,3}[,.]?\d{3,4})\s*(?:MI|MILES|KM)\b",
        "unit_odometer",
        0.90,
    ),
    # Standalone 5-6 digit number (lower confidence)
    (
        r"(?<!\d)(\d{5,6})(?!\d)",
        "standalone_odometer",
        0.60,
    ),
]
@dataclass
class OdometerMatch:
    """Result of odometer pattern matching."""

    # Reading parsed to an integer (thousands separators removed).
    value: int
    # Full text matched by the pattern, stripped of surrounding whitespace.
    raw_match: str
    # Base confidence of the pattern that produced the match.
    confidence: float
    # Name of the matching ODOMETER_PATTERNS entry, e.g. "labeled_odometer".
    pattern_name: str
class MaintenanceReceiptValidator:
"""Cross-validates Gemini-extracted maintenance receipt fields against regex patterns."""
def validate(
self,
gemini_fields: dict,
raw_text: str,
) -> MaintenanceReceiptValidation:
"""Validate Gemini-extracted fields against regex patterns in raw OCR text.
Args:
gemini_fields: Fields extracted by Gemini (field_name -> value).
raw_text: Raw OCR text for regex cross-validation.
Returns:
MaintenanceReceiptValidation with per-field results.
"""
issues: list[str] = []
field_validations: dict[str, FieldValidation] = {}
overall_confidence = 1.0
# Validate date field
if "serviceDate" in gemini_fields:
date_validation = self._validate_date(
gemini_fields["serviceDate"], raw_text
)
field_validations["serviceDate"] = date_validation
if not date_validation.regex_confirmed:
issues.append(
f"Service date '{gemini_fields['serviceDate']}' not confirmed by regex"
)
overall_confidence *= 0.85
# Validate total cost
if "totalCost" in gemini_fields:
cost_validation = self._validate_amount(
"totalCost", gemini_fields["totalCost"], raw_text
)
field_validations["totalCost"] = cost_validation
if not cost_validation.regex_confirmed:
issues.append(
f"Total cost '{gemini_fields['totalCost']}' not confirmed by regex"
)
overall_confidence *= 0.85
# Validate labor cost
if "laborCost" in gemini_fields:
labor_validation = self._validate_amount(
"laborCost", gemini_fields["laborCost"], raw_text
)
field_validations["laborCost"] = labor_validation
if not labor_validation.regex_confirmed:
issues.append(
f"Labor cost '{gemini_fields['laborCost']}' not confirmed by regex"
)
overall_confidence *= 0.90
# Validate odometer
if "odometerReading" in gemini_fields:
odo_validation = self._validate_odometer(
gemini_fields["odometerReading"], raw_text
)
field_validations["odometerReading"] = odo_validation
if not odo_validation.regex_confirmed:
issues.append(
f"Odometer '{gemini_fields['odometerReading']}' not confirmed by regex"
)
overall_confidence *= 0.90
is_valid = len(issues) == 0
return MaintenanceReceiptValidation(
is_valid=is_valid,
issues=issues,
field_validations=field_validations,
overall_confidence=overall_confidence,
)
def extract_odometer(self, text: str) -> Optional[OdometerMatch]:
"""Extract odometer reading from text using regex patterns.
Args:
text: OCR text to search.
Returns:
Best OdometerMatch or None.
"""
text_upper = text.upper()
best_match: Optional[OdometerMatch] = None
for pattern, name, confidence in ODOMETER_PATTERNS:
for match in re.finditer(pattern, text_upper):
raw_value = match.group(1)
parsed = self._parse_odometer(raw_value)
if parsed is not None and self._is_reasonable_odometer(parsed):
candidate = OdometerMatch(
value=parsed,
raw_match=match.group(0).strip(),
confidence=confidence,
pattern_name=name,
)
if best_match is None or candidate.confidence > best_match.confidence:
best_match = candidate
return best_match
def _validate_date(self, gemini_date: str, raw_text: str) -> FieldValidation:
"""Check if Gemini-extracted date matches a regex-found date."""
regex_dates = date_matcher.extract_dates(raw_text)
if not regex_dates:
# No dates found by regex -- cannot confirm or deny
return FieldValidation(
field_name="serviceDate",
regex_confirmed=False,
confidence_adjustment=0.95,
)
# Normalize Gemini date for comparison
gemini_normalized = gemini_date.strip().replace("/", "-")
for regex_date in regex_dates:
if regex_date.value == gemini_normalized:
return FieldValidation(
field_name="serviceDate",
regex_confirmed=True,
confidence_adjustment=1.10,
regex_value=regex_date.value,
)
# Gemini found a date but it doesn't match regex dates
return FieldValidation(
field_name="serviceDate",
regex_confirmed=False,
confidence_adjustment=0.80,
regex_value=regex_dates[0].value if regex_dates else None,
)
def _validate_amount(
self, field_name: str, gemini_amount: str | float, raw_text: str
) -> FieldValidation:
"""Check if Gemini-extracted amount matches a regex-found amount."""
try:
gemini_value = float(str(gemini_amount).replace("$", "").replace(",", ""))
except (ValueError, TypeError):
return FieldValidation(
field_name=field_name,
regex_confirmed=False,
confidence_adjustment=0.70,
)
regex_amounts = currency_matcher.extract_all_amounts(raw_text)
if not regex_amounts:
return FieldValidation(
field_name=field_name,
regex_confirmed=False,
confidence_adjustment=0.95,
)
# Check if any regex amount matches within 5% tolerance
for regex_amount in regex_amounts:
if gemini_value > 0 and abs(regex_amount.value - gemini_value) / gemini_value < 0.05:
return FieldValidation(
field_name=field_name,
regex_confirmed=True,
confidence_adjustment=1.10,
regex_value=str(regex_amount.value),
)
return FieldValidation(
field_name=field_name,
regex_confirmed=False,
confidence_adjustment=0.80,
regex_value=str(regex_amounts[0].value) if regex_amounts else None,
)
def _validate_odometer(
self, gemini_odometer: str | int, raw_text: str
) -> FieldValidation:
"""Check if Gemini-extracted odometer matches a regex-found reading."""
try:
gemini_value = int(
str(gemini_odometer).replace(",", "").replace(".", "").strip()
)
except (ValueError, TypeError):
return FieldValidation(
field_name="odometerReading",
regex_confirmed=False,
confidence_adjustment=0.70,
)
regex_match = self.extract_odometer(raw_text)
if not regex_match:
return FieldValidation(
field_name="odometerReading",
regex_confirmed=False,
confidence_adjustment=0.95,
)
# Check if values match within 1% tolerance (OCR might misread a digit)
if gemini_value > 0 and abs(regex_match.value - gemini_value) / gemini_value < 0.01:
return FieldValidation(
field_name="odometerReading",
regex_confirmed=True,
confidence_adjustment=1.10,
regex_value=str(regex_match.value),
)
return FieldValidation(
field_name="odometerReading",
regex_confirmed=False,
confidence_adjustment=0.80,
regex_value=str(regex_match.value),
)
def _parse_odometer(self, raw: str) -> Optional[int]:
"""Parse odometer string to integer."""
cleaned = raw.replace(",", "").replace(".", "").strip()
try:
return int(cleaned)
except ValueError:
return None
def _is_reasonable_odometer(self, value: int) -> bool:
"""Check if odometer reading is in a reasonable range."""
return 100 <= value <= 999_999
# Singleton instance: the validator defines no instance state, so one
# module-level object can be imported and reused by all callers.
maintenance_receipt_validator = MaintenanceReceiptValidator()