"""Cross-validation patterns for maintenance receipt field extraction. Validates structured fields (dates, amounts, odometer) extracted by Gemini against regex patterns found in the OCR raw text. Boosts or reduces confidence based on regex agreement. """ import re from dataclasses import dataclass, field from typing import Optional from app.patterns.currency_patterns import currency_matcher from app.patterns.date_patterns import date_matcher @dataclass class FieldValidation: """Validation result for a single extracted field.""" field_name: str regex_confirmed: bool confidence_adjustment: float # Multiplier: >1.0 boosts, <1.0 reduces regex_value: Optional[str] = None # Value found by regex, if any @dataclass class MaintenanceReceiptValidation: """Aggregated validation result for a maintenance receipt.""" is_valid: bool issues: list[str] field_validations: dict[str, FieldValidation] = field(default_factory=dict) overall_confidence: float = 1.0 # Odometer patterns: 5-7 digit numbers near odometer keywords ODOMETER_PATTERNS = [ # "Odometer: 45,231" or "Mileage: 45231" ( r"(?:ODOMETER|MILEAGE|MILES|ODO|MI)\s*[:\s]\s*(\d{1,3}[,.]?\d{3,4})", "labeled_odometer", 0.95, ), # "45,231 mi" or "45231 miles" ( r"(\d{1,3}[,.]?\d{3,4})\s*(?:MI|MILES|KM)", "unit_odometer", 0.90, ), # Standalone 5-6 digit number (lower confidence) ( r"(? MaintenanceReceiptValidation: """Validate Gemini-extracted fields against regex patterns in raw OCR text. Args: gemini_fields: Fields extracted by Gemini (field_name -> value). raw_text: Raw OCR text for regex cross-validation. Returns: MaintenanceReceiptValidation with per-field results. """ issues: list[str] = [] field_validations: dict[str, FieldValidation] = {} overall_confidence = 1.0 # Validate date field if "serviceDate" in gemini_fields: date_validation = self._validate_date( gemini_fields["serviceDate"], raw_text ) field_validations["serviceDate"] = date_validation if not date_validation.regex_confirmed: issues.append( f"Service date '{gemini_fields['serviceDate']}' not confirmed by regex" ) overall_confidence *= 0.85 # Validate total cost if "totalCost" in gemini_fields: cost_validation = self._validate_amount( "totalCost", gemini_fields["totalCost"], raw_text ) field_validations["totalCost"] = cost_validation if not cost_validation.regex_confirmed: issues.append( f"Total cost '{gemini_fields['totalCost']}' not confirmed by regex" ) overall_confidence *= 0.85 # Validate labor cost if "laborCost" in gemini_fields: labor_validation = self._validate_amount( "laborCost", gemini_fields["laborCost"], raw_text ) field_validations["laborCost"] = labor_validation if not labor_validation.regex_confirmed: issues.append( f"Labor cost '{gemini_fields['laborCost']}' not confirmed by regex" ) overall_confidence *= 0.90 # Validate odometer if "odometerReading" in gemini_fields: odo_validation = self._validate_odometer( gemini_fields["odometerReading"], raw_text ) field_validations["odometerReading"] = odo_validation if not odo_validation.regex_confirmed: issues.append( f"Odometer '{gemini_fields['odometerReading']}' not confirmed by regex" ) overall_confidence *= 0.90 is_valid = len(issues) == 0 return MaintenanceReceiptValidation( is_valid=is_valid, issues=issues, field_validations=field_validations, overall_confidence=overall_confidence, ) def extract_odometer(self, text: str) -> Optional[OdometerMatch]: """Extract odometer reading from text using regex patterns. Args: text: OCR text to search. Returns: Best OdometerMatch or None. """ text_upper = text.upper() best_match: Optional[OdometerMatch] = None for pattern, name, confidence in ODOMETER_PATTERNS: for match in re.finditer(pattern, text_upper): raw_value = match.group(1) parsed = self._parse_odometer(raw_value) if parsed is not None and self._is_reasonable_odometer(parsed): candidate = OdometerMatch( value=parsed, raw_match=match.group(0).strip(), confidence=confidence, pattern_name=name, ) if best_match is None or candidate.confidence > best_match.confidence: best_match = candidate return best_match def _validate_date(self, gemini_date: str, raw_text: str) -> FieldValidation: """Check if Gemini-extracted date matches a regex-found date.""" regex_dates = date_matcher.extract_dates(raw_text) if not regex_dates: # No dates found by regex -- cannot confirm or deny return FieldValidation( field_name="serviceDate", regex_confirmed=False, confidence_adjustment=0.95, ) # Normalize Gemini date for comparison gemini_normalized = gemini_date.strip().replace("/", "-") for regex_date in regex_dates: if regex_date.value == gemini_normalized: return FieldValidation( field_name="serviceDate", regex_confirmed=True, confidence_adjustment=1.10, regex_value=regex_date.value, ) # Gemini found a date but it doesn't match regex dates return FieldValidation( field_name="serviceDate", regex_confirmed=False, confidence_adjustment=0.80, regex_value=regex_dates[0].value if regex_dates else None, ) def _validate_amount( self, field_name: str, gemini_amount: str | float, raw_text: str ) -> FieldValidation: """Check if Gemini-extracted amount matches a regex-found amount.""" try: gemini_value = float(str(gemini_amount).replace("$", "").replace(",", "")) except (ValueError, TypeError): return FieldValidation( field_name=field_name, regex_confirmed=False, confidence_adjustment=0.70, ) regex_amounts = currency_matcher.extract_all_amounts(raw_text) if not regex_amounts: return FieldValidation( field_name=field_name, regex_confirmed=False, confidence_adjustment=0.95, ) # Check if any regex amount matches within 5% tolerance for regex_amount in regex_amounts: if gemini_value > 0 and abs(regex_amount.value - gemini_value) / gemini_value < 0.05: return FieldValidation( field_name=field_name, regex_confirmed=True, confidence_adjustment=1.10, regex_value=str(regex_amount.value), ) return FieldValidation( field_name=field_name, regex_confirmed=False, confidence_adjustment=0.80, regex_value=str(regex_amounts[0].value) if regex_amounts else None, ) def _validate_odometer( self, gemini_odometer: str | int, raw_text: str ) -> FieldValidation: """Check if Gemini-extracted odometer matches a regex-found reading.""" try: gemini_value = int( str(gemini_odometer).replace(",", "").replace(".", "").strip() ) except (ValueError, TypeError): return FieldValidation( field_name="odometerReading", regex_confirmed=False, confidence_adjustment=0.70, ) regex_match = self.extract_odometer(raw_text) if not regex_match: return FieldValidation( field_name="odometerReading", regex_confirmed=False, confidence_adjustment=0.95, ) # Check if values match within 1% tolerance (OCR might misread a digit) if gemini_value > 0 and abs(regex_match.value - gemini_value) / gemini_value < 0.01: return FieldValidation( field_name="odometerReading", regex_confirmed=True, confidence_adjustment=1.10, regex_value=str(regex_match.value), ) return FieldValidation( field_name="odometerReading", regex_confirmed=False, confidence_adjustment=0.80, regex_value=str(regex_match.value), ) def _parse_odometer(self, raw: str) -> Optional[int]: """Parse odometer string to integer.""" cleaned = raw.replace(",", "").replace(".", "").strip() try: return int(cleaned) except ValueError: return None def _is_reasonable_odometer(self, value: int) -> bool: """Check if odometer reading is in a reasonable range.""" return 100 <= value <= 999_999 # Singleton instance maintenance_receipt_validator = MaintenanceReceiptValidator()