"""VIN validation with check digit verification and OCR error correction.""" import re from dataclasses import dataclass from typing import Optional @dataclass class VinValidationResult: """Result of VIN validation.""" is_valid: bool vin: str confidence_adjustment: float error: Optional[str] = None class VinValidator: """Validates and corrects VIN strings.""" # VIN character set (excludes I, O, Q) VALID_CHARS = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789") # Common OCR misreads and their corrections TRANSLITERATION = { "I": "1", "O": "0", "Q": "0", "i": "1", "o": "0", "q": "0", "l": "1", "L": "1", "B": "8", # Sometimes confused "S": "5", # Sometimes confused } # Weights for check digit calculation (positions 1-17) CHECK_WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2] # Character to value mapping for check digit CHAR_VALUES = { "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8, "J": 1, "K": 2, "L": 3, "M": 4, "N": 5, "P": 7, "R": 9, "S": 2, "T": 3, "U": 4, "V": 5, "W": 6, "X": 7, "Y": 8, "Z": 9, "0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, } # Modern VIN pattern (1981+): exactly 17 alphanumeric, no I/O/Q MODERN_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{17}$") # Pre-1981 VIN pattern: 11-17 characters LEGACY_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{11,17}$") def correct_ocr_errors(self, vin: str) -> str: """ Apply common OCR error corrections to a VIN string. Args: vin: Raw VIN string from OCR Returns: Corrected VIN string """ corrected = vin.upper().strip() # Remove any spaces or dashes (common in formatted VINs) corrected = corrected.replace(" ", "").replace("-", "") # Apply transliteration for common OCR errors result = [] for char in corrected: if char in self.TRANSLITERATION: result.append(self.TRANSLITERATION[char]) else: result.append(char) return "".join(result) def calculate_check_digit(self, vin: str) -> Optional[str]: """ Calculate the check digit (position 9) for a VIN. Args: vin: 17-character VIN string Returns: Expected check digit character, or None if calculation fails """ if len(vin) != 17: return None try: total = 0 for i, char in enumerate(vin.upper()): if i == 8: # Skip check digit position continue value = self.CHAR_VALUES.get(char) if value is None: return None total += value * self.CHECK_WEIGHTS[i] remainder = total % 11 if remainder == 10: return "X" return str(remainder) except (KeyError, ValueError): return None def validate_check_digit(self, vin: str) -> bool: """ Validate the check digit of a VIN. Args: vin: 17-character VIN string Returns: True if check digit is valid """ if len(vin) != 17: return False expected = self.calculate_check_digit(vin) if expected is None: return False return vin[8].upper() == expected def validate( self, vin: str, correct_errors: bool = True, allow_legacy: bool = False ) -> VinValidationResult: """ Validate a VIN string and optionally correct OCR errors. Args: vin: VIN string to validate correct_errors: Whether to apply OCR error corrections allow_legacy: Whether to allow pre-1981 VINs (11-17 chars) Returns: VinValidationResult with validation status and corrected VIN """ if not vin: return VinValidationResult( is_valid=False, vin="", confidence_adjustment=-1.0, error="Empty VIN" ) # Apply error corrections if enabled corrected_vin = self.correct_ocr_errors(vin) if correct_errors else vin.upper() # Check length if len(corrected_vin) != 17: if allow_legacy and 11 <= len(corrected_vin) <= 17: # Legacy VIN - reduced confidence if self.LEGACY_VIN_PATTERN.match(corrected_vin): return VinValidationResult( is_valid=True, vin=corrected_vin, confidence_adjustment=-0.2, ) return VinValidationResult( is_valid=False, vin=corrected_vin, confidence_adjustment=-0.5, error=f"Invalid length: {len(corrected_vin)} (expected 17)", ) # Check character set if not self.MODERN_VIN_PATTERN.match(corrected_vin): invalid_chars = [c for c in corrected_vin if c not in self.VALID_CHARS] return VinValidationResult( is_valid=False, vin=corrected_vin, confidence_adjustment=-0.3, error=f"Invalid characters: {invalid_chars}", ) # Validate check digit if self.validate_check_digit(corrected_vin): # Valid check digit - boost confidence return VinValidationResult( is_valid=True, vin=corrected_vin, confidence_adjustment=0.1 ) else: # Invalid check digit - could be OCR error or old VIN return VinValidationResult( is_valid=True, # Still return as valid but with reduced confidence vin=corrected_vin, confidence_adjustment=-0.15, error="Check digit validation failed", ) def extract_candidates( self, text: str, max_candidates: int = 5 ) -> list[tuple[str, int, int]]: """ Extract VIN candidates from raw OCR text. Args: text: Raw OCR text max_candidates: Maximum number of candidates to return Returns: List of (vin, start_pos, end_pos) tuples """ # Pattern to find potential VIN sequences # Allow some flexibility for OCR errors (include I, O, Q for correction later) potential_vin_pattern = re.compile(r"[A-Z0-9IOQ]{11,17}", re.IGNORECASE) candidates = [] for match in potential_vin_pattern.finditer(text.upper()): candidate = match.group() corrected = self.correct_ocr_errors(candidate) # Only include if it could be a valid VIN after correction if len(corrected) == 17 and self.MODERN_VIN_PATTERN.match(corrected): candidates.append((corrected, match.start(), match.end())) # Sort by likelihood of being valid (check digit validation) def score_candidate(c: tuple[str, int, int]) -> int: vin = c[0] if self.validate_check_digit(vin): return 0 # Best score return 1 candidates.sort(key=score_candidate) return candidates[:max_candidates] # Singleton instance vin_validator = VinValidator()