- New MaintenanceReceiptExtractor: Gemini-primary extraction with regex cross-validation for dates, amounts, and odometer readings - New maintenance_receipt_validation.py: cross-validation patterns for structured field confidence adjustment - New POST /extract/maintenance-receipt endpoint reusing ReceiptExtractionResponse model - Per-field confidence scores (0.0-1.0) with Gemini base 0.85, boosted/reduced by regex agreement Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
300 lines
10 KiB
Python
300 lines
10 KiB
Python
"""Cross-validation patterns for maintenance receipt field extraction.

Validates structured fields (dates, amounts, odometer) extracted by Gemini
against regex patterns found in the OCR raw text. Boosts or reduces confidence
based on regex agreement.
"""
|
|
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from typing import Optional
|
|
|
|
from app.patterns.currency_patterns import currency_matcher
|
|
from app.patterns.date_patterns import date_matcher
|
|
|
|
|
|
@dataclass
class FieldValidation:
    """Outcome of cross-checking one extracted field against the regex matches.

    Attributes:
        field_name: Key of the field this result belongs to.
        regex_confirmed: Whether a regex match agreed with the extracted value.
        confidence_adjustment: Multiplier for the field's confidence;
            values above 1.0 boost it, values below 1.0 reduce it.
        regex_value: The value found by regex, if any.
    """

    field_name: str
    regex_confirmed: bool
    confidence_adjustment: float
    regex_value: Optional[str] = None
|
|
|
|
|
|
@dataclass
class MaintenanceReceiptValidation:
    """Combined cross-validation result for one maintenance receipt.

    Attributes:
        is_valid: Overall validity flag for the receipt.
        issues: Human-readable descriptions of the problems found.
        field_validations: Per-field validation results keyed by field name;
            defaults to a fresh empty dict for every instance.
        overall_confidence: Aggregate confidence value; defaults to 1.0.
    """

    is_valid: bool
    issues: list[str]
    field_validations: dict[str, FieldValidation] = field(default_factory=dict)
    overall_confidence: float = 1.0
|
|
|
|
|
|
# Odometer patterns, as (regex, pattern name, confidence) tuples listed from
# most to least trustworthy. The labelled and unit patterns accept 4-7 digits
# with an optional "," or "." thousands separator; the standalone pattern
# accepts a bare 5-6 digit number. Patterns are written in upper case and are
# meant to be applied to upper-cased text.
#
# Word boundaries keep the short alternatives ("MI", "ODO", "KM") from
# matching inside longer words such as "SEMI", "MINUTES" or "MICHIGAN", and
# digit lookarounds stop a match from starting or ending in the middle of a
# longer digit run (the same guard the standalone pattern uses).
#
# NOTE(review): "MI" directly followed by a number is still accepted as a
# label, so "DETROIT, MI 48201" matches the labelled pattern. Consider
# requiring a colon after the two-letter label if that shows up in practice.
ODOMETER_PATTERNS = [
    # "Odometer: 45,231" or "Mileage: 45231"
    (
        r"\b(?:ODOMETER|MILEAGE|MILES|ODO|MI)\s*[:\s]\s*(\d{1,3}[,.]?\d{3,4})(?!\d)",
        "labeled_odometer",
        0.95,
    ),
    # "45,231 mi" or "45231 miles"
    (
        r"(?<!\d)(\d{1,3}[,.]?\d{3,4})\s*(?:MILES?|MI|KMS?)\b",
        "unit_odometer",
        0.90,
    ),
    # Standalone 5-6 digit number (lower confidence)
    (
        r"(?<!\d)(\d{5,6})(?!\d)",
        "standalone_odometer",
        0.60,
    ),
]
|
|
|
|
|
|
@dataclass
class OdometerMatch:
    """A single odometer reading found by one of the regex patterns.

    Attributes:
        value: The reading parsed to an integer.
        raw_match: The matched text the reading was taken from.
        confidence: Confidence associated with the pattern that matched.
        pattern_name: Name of the pattern that produced the match.
    """

    value: int
    raw_match: str
    confidence: float
    pattern_name: str
|
|
|
|
|
|
class MaintenanceReceiptValidator:
    """Cross-validates Gemini-extracted maintenance receipt fields against regex patterns.

    Each supported field (service date, total cost, labor cost, odometer) is
    compared with values found by regex in the raw OCR text. The per-field
    result carries a confidence multiplier, and every unconfirmed field lowers
    the receipt's overall confidence.
    """

    # (field key, label used in issue messages, overall-confidence multiplier
    # applied when the field is not confirmed by regex). Order matters only
    # for the order of the reported issues.
    _FIELD_RULES = (
        ("serviceDate", "Service date", 0.85),
        ("totalCost", "Total cost", 0.85),
        ("laborCost", "Labor cost", 0.90),
        ("odometerReading", "Odometer", 0.90),
    )

    def validate(
        self,
        gemini_fields: dict,
        raw_text: str,
    ) -> MaintenanceReceiptValidation:
        """Validate Gemini-extracted fields against regex patterns in raw OCR text.

        Only the keys listed in ``_FIELD_RULES`` are examined, and only when
        present in ``gemini_fields``. A field that regex does not confirm adds
        an entry to ``issues`` and multiplies the overall confidence by that
        field's penalty.

        Args:
            gemini_fields: Fields extracted by Gemini (field_name -> value).
            raw_text: Raw OCR text for regex cross-validation.

        Returns:
            MaintenanceReceiptValidation with per-field results; ``is_valid``
            is True only when no issue was recorded.
        """
        issues: list[str] = []
        field_validations: dict[str, FieldValidation] = {}
        overall_confidence = 1.0

        for key, label, penalty in self._FIELD_RULES:
            if key not in gemini_fields:
                continue
            value = gemini_fields[key]
            result = self._validate_field(key, value, raw_text)
            field_validations[key] = result
            if not result.regex_confirmed:
                issues.append(f"{label} '{value}' not confirmed by regex")
                overall_confidence *= penalty

        return MaintenanceReceiptValidation(
            is_valid=not issues,
            issues=issues,
            field_validations=field_validations,
            overall_confidence=overall_confidence,
        )

    def _validate_field(self, key: str, value, raw_text: str) -> FieldValidation:
        """Route one field to the validator that handles its kind of value."""
        if key == "serviceDate":
            return self._validate_date(value, raw_text)
        if key == "odometerReading":
            return self._validate_odometer(value, raw_text)
        # Remaining supported keys ("totalCost", "laborCost") are money amounts.
        return self._validate_amount(key, value, raw_text)

    def extract_odometer(self, text: str) -> Optional[OdometerMatch]:
        """Extract odometer reading from text using regex patterns.

        Every pattern in ``ODOMETER_PATTERNS`` is applied to the upper-cased
        text. Matches that fail to parse or fall outside the reasonable range
        are skipped. Among the rest the highest-confidence match wins; on a
        tie the earliest match found is kept.

        Args:
            text: OCR text to search.

        Returns:
            Best OdometerMatch, or None when the text is empty or nothing
            usable was found.
        """
        if not text:
            return None

        text_upper = text.upper()
        best_match: Optional[OdometerMatch] = None

        for pattern, name, confidence in ODOMETER_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                parsed = self._parse_odometer(match.group(1))
                if parsed is None or not self._is_reasonable_odometer(parsed):
                    continue
                # Strict comparison: an equally confident later match does
                # not displace an earlier one.
                if best_match is None or confidence > best_match.confidence:
                    best_match = OdometerMatch(
                        value=parsed,
                        raw_match=match.group(0).strip(),
                        confidence=confidence,
                        pattern_name=name,
                    )

        return best_match

    def _validate_date(self, gemini_date: str, raw_text: str) -> FieldValidation:
        """Check if Gemini-extracted date matches a regex-found date.

        Returns an unconfirmed result with adjustment 0.70 when the value is
        not a string (e.g. a JSON null), 0.95 when regex found no dates at
        all, 0.80 when regex found dates but none equals the Gemini value,
        and a confirmed result with 1.10 on an exact match.
        """
        if not isinstance(gemini_date, str):
            # Unusable value: same penalty the amount/odometer validators
            # apply to values they cannot parse, instead of raising.
            return FieldValidation(
                field_name="serviceDate",
                regex_confirmed=False,
                confidence_adjustment=0.70,
            )

        regex_dates = date_matcher.extract_dates(raw_text)

        if not regex_dates:
            # No dates found by regex -- cannot confirm or deny
            return FieldValidation(
                field_name="serviceDate",
                regex_confirmed=False,
                confidence_adjustment=0.95,
            )

        # Normalize Gemini date for comparison.
        # NOTE(review): assumes date_matcher yields dash-separated date
        # strings in the same field order Gemini uses -- confirm.
        gemini_normalized = gemini_date.strip().replace("/", "-")

        for regex_date in regex_dates:
            if regex_date.value == gemini_normalized:
                return FieldValidation(
                    field_name="serviceDate",
                    regex_confirmed=True,
                    confidence_adjustment=1.10,
                    regex_value=regex_date.value,
                )

        # Gemini found a date but it doesn't match regex dates; report the
        # first regex date as the competing value.
        return FieldValidation(
            field_name="serviceDate",
            regex_confirmed=False,
            confidence_adjustment=0.80,
            regex_value=regex_dates[0].value,
        )

    def _validate_amount(
        self, field_name: str, gemini_amount: str | float, raw_text: str
    ) -> FieldValidation:
        """Check if Gemini-extracted amount matches a regex-found amount.

        Returns an unconfirmed result with adjustment 0.70 when the amount
        cannot be parsed as a number, 0.95 when regex found no amounts, 0.80
        when no regex amount is within tolerance, and a confirmed result with
        1.10 when one is. A Gemini amount of zero or below is never confirmed.
        """
        try:
            gemini_value = float(str(gemini_amount).replace("$", "").replace(",", ""))
        except (ValueError, TypeError):
            return FieldValidation(
                field_name=field_name,
                regex_confirmed=False,
                confidence_adjustment=0.70,
            )

        regex_amounts = currency_matcher.extract_all_amounts(raw_text)

        if not regex_amounts:
            return FieldValidation(
                field_name=field_name,
                regex_confirmed=False,
                confidence_adjustment=0.95,
            )

        # Check if any regex amount matches within 5% tolerance
        for regex_amount in regex_amounts:
            if gemini_value > 0 and abs(regex_amount.value - gemini_value) / gemini_value < 0.05:
                return FieldValidation(
                    field_name=field_name,
                    regex_confirmed=True,
                    confidence_adjustment=1.10,
                    regex_value=str(regex_amount.value),
                )

        return FieldValidation(
            field_name=field_name,
            regex_confirmed=False,
            confidence_adjustment=0.80,
            regex_value=str(regex_amounts[0].value),
        )

    def _validate_odometer(
        self, gemini_odometer: str | int, raw_text: str
    ) -> FieldValidation:
        """Check if Gemini-extracted odometer matches a regex-found reading.

        Returns an unconfirmed result with adjustment 0.70 when the reading
        cannot be turned into an integer, 0.95 when regex found no reading,
        0.80 when the best regex reading differs by 1% or more, and a
        confirmed result with 1.10 otherwise. Only the single best regex
        match is compared.
        """
        gemini_value = self._coerce_odometer(gemini_odometer)
        if gemini_value is None:
            return FieldValidation(
                field_name="odometerReading",
                regex_confirmed=False,
                confidence_adjustment=0.70,
            )

        regex_match = self.extract_odometer(raw_text)

        if regex_match is None:
            return FieldValidation(
                field_name="odometerReading",
                regex_confirmed=False,
                confidence_adjustment=0.95,
            )

        # Check if values match within 1% tolerance (OCR might misread a digit)
        if gemini_value > 0 and abs(regex_match.value - gemini_value) / gemini_value < 0.01:
            return FieldValidation(
                field_name="odometerReading",
                regex_confirmed=True,
                confidence_adjustment=1.10,
                regex_value=str(regex_match.value),
            )

        return FieldValidation(
            field_name="odometerReading",
            regex_confirmed=False,
            confidence_adjustment=0.80,
            regex_value=str(regex_match.value),
        )

    @staticmethod
    def _coerce_odometer(raw: object) -> Optional[int]:
        """Turn a Gemini odometer value into an integer, or None if unusable.

        Floats are truncated rather than stringified, and a trailing one- or
        two-digit fraction in a string ("45231.0", "45,231.50") is dropped
        before separators are removed. Without this, stripping "." would turn
        45231.0 into 452310. A separator followed by three digits ("45.231")
        is still treated as a thousands separator.
        """
        if isinstance(raw, float):
            try:
                return int(raw)
            except (ValueError, OverflowError):  # NaN / infinity
                return None

        text = str(raw).strip()
        fraction = re.fullmatch(r"(.*\d)[.,]\d{1,2}", text)
        if fraction:
            text = fraction.group(1)
        try:
            return int(text.replace(",", "").replace(".", ""))
        except (ValueError, TypeError):
            return None

    def _parse_odometer(self, raw: str) -> Optional[int]:
        """Parse a regex-captured odometer string to an integer.

        "," and "." are removed as thousands separators; returns None when
        what remains is not an integer.
        """
        cleaned = raw.replace(",", "").replace(".", "").strip()
        try:
            return int(cleaned)
        except ValueError:
            return None

    def _is_reasonable_odometer(self, value: int) -> bool:
        """Check if odometer reading is in a reasonable range (100 to 999,999 inclusive)."""
        return 100 <= value <= 999_999
|
|
|
|
|
|
# Singleton instance: module-level validator created once at import time so
# importers can use it directly instead of constructing their own.
maintenance_receipt_validator = MaintenanceReceiptValidator()
|