feat: add maintenance receipt extraction pipeline with Gemini + regex (refs #150)
- New MaintenanceReceiptExtractor: Gemini-primary extraction with regex cross-validation for dates, amounts, and odometer readings
- New maintenance_receipt_validation.py: cross-validation patterns for structured field confidence adjustment
- New POST /extract/maintenance-receipt endpoint reusing ReceiptExtractionResponse model
- Per-field confidence scores (0.0-1.0) with Gemini base 0.85, boosted/reduced by regex agreement

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
299
ocr/app/patterns/maintenance_receipt_validation.py
Normal file
299
ocr/app/patterns/maintenance_receipt_validation.py
Normal file
@@ -0,0 +1,299 @@
|
||||
"""Cross-validation patterns for maintenance receipt field extraction.
|
||||
|
||||
Validates structured fields (dates, amounts, odometer) extracted by Gemini
|
||||
against regex patterns found in the OCR raw text. Boosts or reduces confidence
|
||||
based on regex agreement.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from app.patterns.currency_patterns import currency_matcher
|
||||
from app.patterns.date_patterns import date_matcher
|
||||
|
||||
|
||||
@dataclass
class FieldValidation:
    """Outcome of cross-checking one extracted field against regex matches.

    Attributes:
        field_name: Key of the field that was checked (e.g. ``"totalCost"``).
        regex_confirmed: True when a regex match in the raw text agreed with
            the extracted value.
        confidence_adjustment: Multiplier for the field's confidence; values
            above 1.0 boost it, values below 1.0 reduce it.
        regex_value: Value found by regex, if any.
    """

    field_name: str
    regex_confirmed: bool
    confidence_adjustment: float  # Multiplier: >1.0 boosts, <1.0 reduces
    regex_value: Optional[str] = None  # Value found by regex, if any
|
||||
|
||||
|
||||
@dataclass
class MaintenanceReceiptValidation:
    """Aggregated validation result for a maintenance receipt.

    Attributes:
        is_valid: True when no validation issues were recorded.
        issues: Human-readable descriptions of fields that were not confirmed.
        field_validations: Per-field results keyed by field name.
        overall_confidence: Combined confidence multiplier for the receipt;
            defaults to 1.0 (no reduction).
    """

    is_valid: bool
    issues: list[str]
    # default_factory gives every instance its own dict instead of a shared one.
    field_validations: dict[str, FieldValidation] = field(default_factory=dict)
    overall_confidence: float = 1.0
|
||||
|
||||
|
||||
# Odometer patterns, ordered from most to least specific. Each entry is
# (regex, pattern name, base confidence). The patterns are written in upper
# case, so they are meant to be applied to upper-cased text. Labeled and unit
# patterns accept 4-7 digits with an optional single "," or "." thousands
# separator; the capture group is always the numeric reading.
ODOMETER_PATTERNS = [
    # "Odometer: 45,231" or "Mileage: 45231".
    # The leading \b keeps short labels such as "MI" from matching the tail of
    # an unrelated word (e.g. "SEMI 12345").
    (
        r"\b(?:ODOMETER|MILEAGE|MILES|ODO|MI)\s*[:\s]\s*(\d{1,3}[,.]?\d{3,4})",
        "labeled_odometer",
        0.95,
    ),
    # "45,231 mi", "45231 miles" or "120500 km".
    # The trailing \b keeps the unit from matching the start of a longer word
    # (e.g. "5000 MIRROR", "12345 MISC").
    (
        r"(\d{1,3}[,.]?\d{3,4})\s*(?:MILEAGE|MILES?|MI|KMS?)\b",
        "unit_odometer",
        0.90,
    ),
    # Standalone 5-6 digit number (lower confidence)
    (
        r"(?<!\d)(\d{5,6})(?!\d)",
        "standalone_odometer",
        0.60,
    ),
]
|
||||
|
||||
|
||||
@dataclass
class OdometerMatch:
    """Result of odometer pattern matching.

    Attributes:
        value: Parsed odometer reading as an integer.
        raw_match: The full text span the pattern matched.
        confidence: Base confidence of the pattern that matched.
        pattern_name: Name of the pattern that matched.
    """

    value: int
    raw_match: str
    confidence: float
    pattern_name: str
|
||||
|
||||
|
||||
class MaintenanceReceiptValidator:
    """Cross-validates Gemini-extracted maintenance receipt fields against regex patterns.

    The class defines no ``__init__`` and its methods store nothing on
    ``self``; every call works only on its arguments.
    """

    # (field key, label used in issue messages, multiplier applied to the
    # overall confidence when regex does not confirm the field). Fields are
    # validated in this order.
    _FIELD_CHECKS = (
        ("serviceDate", "Service date", 0.85),
        ("totalCost", "Total cost", 0.85),
        ("laborCost", "Labor cost", 0.90),
        ("odometerReading", "Odometer", 0.90),
    )

    def validate(
        self,
        gemini_fields: dict,
        raw_text: str,
    ) -> MaintenanceReceiptValidation:
        """Validate Gemini-extracted fields against regex patterns in raw OCR text.

        Only the keys ``serviceDate``, ``totalCost``, ``laborCost`` and
        ``odometerReading`` are checked, and only when present in
        ``gemini_fields``. Each unconfirmed field adds an issue and multiplies
        the overall confidence by that field's penalty.

        Args:
            gemini_fields: Fields extracted by Gemini (field_name -> value).
            raw_text: Raw OCR text for regex cross-validation.

        Returns:
            MaintenanceReceiptValidation with per-field results.
        """
        issues: list[str] = []
        field_validations: dict[str, FieldValidation] = {}
        overall_confidence = 1.0

        for name, label, penalty in self._FIELD_CHECKS:
            if name not in gemini_fields:
                continue
            value = gemini_fields[name]
            result = self._validate_field(name, value, raw_text)
            field_validations[name] = result
            if not result.regex_confirmed:
                issues.append(f"{label} '{value}' not confirmed by regex")
                overall_confidence *= penalty

        return MaintenanceReceiptValidation(
            is_valid=not issues,
            issues=issues,
            field_validations=field_validations,
            overall_confidence=overall_confidence,
        )

    def _validate_field(self, name: str, value, raw_text: str) -> FieldValidation:
        """Dispatch one field to the validator that handles its kind."""
        if name == "serviceDate":
            return self._validate_date(value, raw_text)
        if name == "odometerReading":
            return self._validate_odometer(value, raw_text)
        return self._validate_amount(name, value, raw_text)

    def extract_odometer(self, text: str) -> Optional[OdometerMatch]:
        """Extract odometer reading from text using regex patterns.

        The text is upper-cased before matching. Every pattern in
        ``ODOMETER_PATTERNS`` is tried and the parsable, in-range match with
        the highest confidence wins; on equal confidence the earliest match
        is kept.

        Args:
            text: OCR text to search.

        Returns:
            Best OdometerMatch or None.
        """
        text_upper = text.upper()
        best_match: Optional[OdometerMatch] = None

        for pattern, name, confidence in ODOMETER_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                parsed = self._parse_odometer(match.group(1))
                if parsed is None or not self._is_reasonable_odometer(parsed):
                    continue
                # Strict ">" keeps the first match among equal confidences.
                if best_match is None or confidence > best_match.confidence:
                    best_match = OdometerMatch(
                        value=parsed,
                        raw_match=match.group(0).strip(),
                        confidence=confidence,
                        pattern_name=name,
                    )

        return best_match

    def _validate_date(self, gemini_date: str, raw_text: str) -> FieldValidation:
        """Check if Gemini-extracted date matches a regex-found date.

        Adjustments: 1.10 on agreement, 0.80 on disagreement, 0.95 when regex
        finds no dates at all, 0.70 when the Gemini value is not a string.
        """
        if not isinstance(gemini_date, str):
            # Same treatment the amount/odometer validators give to values
            # they cannot parse, instead of failing on ``.strip()``.
            return FieldValidation(
                field_name="serviceDate",
                regex_confirmed=False,
                confidence_adjustment=0.70,
            )

        regex_dates = date_matcher.extract_dates(raw_text)

        if not regex_dates:
            # No dates found by regex -- cannot confirm or deny
            return FieldValidation(
                field_name="serviceDate",
                regex_confirmed=False,
                confidence_adjustment=0.95,
            )

        # Only "/" -> "-" is normalized here.
        # NOTE(review): presumably date_matcher returns values in the same
        # normalized layout Gemini emits (e.g. ISO) -- confirm.
        gemini_normalized = gemini_date.strip().replace("/", "-")

        for regex_date in regex_dates:
            if regex_date.value == gemini_normalized:
                return FieldValidation(
                    field_name="serviceDate",
                    regex_confirmed=True,
                    confidence_adjustment=1.10,
                    regex_value=regex_date.value,
                )

        # Gemini found a date but it doesn't match regex dates
        return FieldValidation(
            field_name="serviceDate",
            regex_confirmed=False,
            confidence_adjustment=0.80,
            regex_value=regex_dates[0].value,
        )

    def _validate_amount(
        self, field_name: str, gemini_amount: str | float, raw_text: str
    ) -> FieldValidation:
        """Check if Gemini-extracted amount matches a regex-found amount.

        The regex amount closest to the Gemini value is compared with a 5%
        relative tolerance. A Gemini value of zero is confirmed only by an
        exact zero amount in the text.

        Adjustments: 1.10 on agreement, 0.80 on disagreement, 0.95 when regex
        finds no amounts, 0.70 when the Gemini value cannot be parsed.
        """
        try:
            gemini_value = float(str(gemini_amount).replace("$", "").replace(",", ""))
        except (ValueError, TypeError):
            return FieldValidation(
                field_name=field_name,
                regex_confirmed=False,
                confidence_adjustment=0.70,
            )

        regex_amounts = currency_matcher.extract_all_amounts(raw_text)

        if not regex_amounts:
            return FieldValidation(
                field_name=field_name,
                regex_confirmed=False,
                confidence_adjustment=0.95,
            )

        # Compare against the closest amount so the reported regex_value is
        # the best candidate, not merely the first one inside the tolerance.
        closest = min(regex_amounts, key=lambda amount: abs(amount.value - gemini_value))
        deviation = abs(closest.value - gemini_value)
        if gemini_value == 0:
            # A relative tolerance is undefined at zero; require an exact match.
            confirmed = deviation == 0
        else:
            confirmed = deviation / abs(gemini_value) < 0.05

        if confirmed:
            return FieldValidation(
                field_name=field_name,
                regex_confirmed=True,
                confidence_adjustment=1.10,
                regex_value=str(closest.value),
            )

        return FieldValidation(
            field_name=field_name,
            regex_confirmed=False,
            confidence_adjustment=0.80,
            regex_value=str(regex_amounts[0].value),
        )

    def _validate_odometer(
        self, gemini_odometer: str | int | float, raw_text: str
    ) -> FieldValidation:
        """Check if Gemini-extracted odometer matches a regex-found reading.

        Adjustments: 1.10 on agreement (within 1%), 0.80 on disagreement,
        0.95 when regex finds no reading, 0.70 when the Gemini value cannot
        be parsed.
        """
        try:
            gemini_value = self._coerce_odometer(gemini_odometer)
        except (ValueError, TypeError, OverflowError):
            return FieldValidation(
                field_name="odometerReading",
                regex_confirmed=False,
                confidence_adjustment=0.70,
            )

        regex_match = self.extract_odometer(raw_text)

        if not regex_match:
            return FieldValidation(
                field_name="odometerReading",
                regex_confirmed=False,
                confidence_adjustment=0.95,
            )

        # Check if values match within 1% tolerance (OCR might misread a digit)
        if gemini_value > 0 and abs(regex_match.value - gemini_value) / gemini_value < 0.01:
            return FieldValidation(
                field_name="odometerReading",
                regex_confirmed=True,
                confidence_adjustment=1.10,
                regex_value=str(regex_match.value),
            )

        return FieldValidation(
            field_name="odometerReading",
            regex_confirmed=False,
            confidence_adjustment=0.80,
            regex_value=str(regex_match.value),
        )

    @staticmethod
    def _coerce_odometer(value) -> int:
        """Convert a Gemini odometer value to an integer reading.

        Floats are truncated directly: stripping "." from ``str(45231.0)``
        would otherwise yield 452310. For strings, a trailing fraction of one
        or two digits (".0", ".50") is dropped first, because a thousands
        group always has exactly three digits; remaining "," and "." are
        treated as thousands separators.

        Raises:
            ValueError: The value is not numeric (including NaN).
            OverflowError: The value is an infinite float.
        """
        if isinstance(value, float):
            return int(value)
        text = re.sub(r"\.\d{1,2}$", "", str(value).strip())
        return int(text.replace(",", "").replace(".", ""))

    def _parse_odometer(self, raw: str) -> Optional[int]:
        """Parse a regex-captured odometer string to an integer, or None.

        Both "," and "." are removed as thousands separators.
        """
        cleaned = raw.replace(",", "").replace(".", "").strip()
        try:
            return int(cleaned)
        except ValueError:
            return None

    def _is_reasonable_odometer(self, value: int) -> bool:
        """Check if odometer reading is in a reasonable range (100 to 999,999)."""
        return 100 <= value <= 999_999
|
||||
|
||||
|
||||
# Singleton instance
|
||||
# Module-level instance created at import time. MaintenanceReceiptValidator
# defines no __init__ and its methods store nothing on self.
maintenance_receipt_validator = MaintenanceReceiptValidator()
|
||||
Reference in New Issue
Block a user