"""VIN validation with check digit verification and OCR error correction.""" import re from dataclasses import dataclass from typing import Optional @dataclass class VinValidationResult: """Result of VIN validation.""" is_valid: bool vin: str confidence_adjustment: float error: Optional[str] = None class VinValidator: """Validates and corrects VIN strings.""" # VIN character set (excludes I, O, Q) VALID_CHARS = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789") # Common OCR misreads and their corrections. # Only map characters that are INVALID in VINs to their likely correct values. # B and S are valid VIN characters and must NOT be transliterated. TRANSLITERATION = { "I": "1", "O": "0", "Q": "0", "i": "1", "o": "0", "q": "0", "l": "1", } # Weights for check digit calculation (positions 1-17) CHECK_WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2] # Character to value mapping for check digit CHAR_VALUES = { "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8, "J": 1, "K": 2, "L": 3, "M": 4, "N": 5, "P": 7, "R": 9, "S": 2, "T": 3, "U": 4, "V": 5, "W": 6, "X": 7, "Y": 8, "Z": 9, "0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, } # Modern VIN pattern (1981+): exactly 17 alphanumeric, no I/O/Q MODERN_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{17}$") # Pre-1981 VIN pattern: 11-17 characters LEGACY_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{11,17}$") def correct_ocr_errors(self, vin: str) -> str: """ Apply common OCR error corrections to a VIN string. Args: vin: Raw VIN string from OCR Returns: Corrected VIN string """ corrected = vin.upper().strip() # Remove any spaces or dashes (common in formatted VINs) corrected = corrected.replace(" ", "").replace("-", "") # Apply transliteration for common OCR errors result = [] for char in corrected: if char in self.TRANSLITERATION: result.append(self.TRANSLITERATION[char]) else: result.append(char) return "".join(result) def calculate_check_digit(self, vin: str) -> Optional[str]: """ Calculate the check digit (position 9) for a VIN. Args: vin: 17-character VIN string Returns: Expected check digit character, or None if calculation fails """ if len(vin) != 17: return None try: total = 0 for i, char in enumerate(vin.upper()): if i == 8: # Skip check digit position continue value = self.CHAR_VALUES.get(char) if value is None: return None total += value * self.CHECK_WEIGHTS[i] remainder = total % 11 if remainder == 10: return "X" return str(remainder) except (KeyError, ValueError): return None def validate_check_digit(self, vin: str) -> bool: """ Validate the check digit of a VIN. Args: vin: 17-character VIN string Returns: True if check digit is valid """ if len(vin) != 17: return False expected = self.calculate_check_digit(vin) if expected is None: return False return vin[8].upper() == expected def validate( self, vin: str, correct_errors: bool = True, allow_legacy: bool = False ) -> VinValidationResult: """ Validate a VIN string and optionally correct OCR errors. Args: vin: VIN string to validate correct_errors: Whether to apply OCR error corrections allow_legacy: Whether to allow pre-1981 VINs (11-17 chars) Returns: VinValidationResult with validation status and corrected VIN """ if not vin: return VinValidationResult( is_valid=False, vin="", confidence_adjustment=-1.0, error="Empty VIN" ) # Apply error corrections if enabled corrected_vin = self.correct_ocr_errors(vin) if correct_errors else vin.upper() # Check length if len(corrected_vin) != 17: if allow_legacy and 11 <= len(corrected_vin) <= 17: # Legacy VIN - reduced confidence if self.LEGACY_VIN_PATTERN.match(corrected_vin): return VinValidationResult( is_valid=True, vin=corrected_vin, confidence_adjustment=-0.2, ) return VinValidationResult( is_valid=False, vin=corrected_vin, confidence_adjustment=-0.5, error=f"Invalid length: {len(corrected_vin)} (expected 17)", ) # Check character set if not self.MODERN_VIN_PATTERN.match(corrected_vin): invalid_chars = [c for c in corrected_vin if c not in self.VALID_CHARS] return VinValidationResult( is_valid=False, vin=corrected_vin, confidence_adjustment=-0.3, error=f"Invalid characters: {invalid_chars}", ) # Validate check digit if self.validate_check_digit(corrected_vin): # Valid check digit - boost confidence return VinValidationResult( is_valid=True, vin=corrected_vin, confidence_adjustment=0.1 ) else: # Invalid check digit - could be OCR error or old VIN return VinValidationResult( is_valid=True, # Still return as valid but with reduced confidence vin=corrected_vin, confidence_adjustment=-0.15, error="Check digit validation failed", ) def extract_candidates( self, text: str, max_candidates: int = 5 ) -> list[tuple[str, int, int]]: """ Extract VIN candidates from raw OCR text. Uses two strategies: 1. Find continuous 11-20 char alphanumeric runs (handles intact VINs) 2. Concatenate adjacent short fragments separated by spaces/dashes (handles OCR fragmenting VINs into multiple words) Args: text: Raw OCR text max_candidates: Maximum number of candidates to return Returns: List of (vin, start_pos, end_pos) tuples """ candidates = [] seen_vins: set[str] = set() upper_text = text.upper() # Strategy 1: Find continuous runs of VIN-like characters continuous_pattern = re.compile(r"[A-Z0-9IOQ]{11,20}", re.IGNORECASE) for match in continuous_pattern.finditer(upper_text): self._try_add_candidate( match.group(), match.start(), match.end(), candidates, seen_vins ) # Strategy 2: Concatenate adjacent alphanumeric fragments # This handles OCR fragmentation like "1HGBH 41JXMN 109186" # Only consider fragments >= 3 chars (filters out noise/short words) fragment_pattern = re.compile(r"[A-Z0-9IOQ]{3,}", re.IGNORECASE) fragments = [ (m.group(), m.start(), m.end()) for m in fragment_pattern.finditer(upper_text) ] # Try sliding windows of 2-4 adjacent fragments for window_size in range(2, min(5, len(fragments) + 1)): for i in range(len(fragments) - window_size + 1): window = fragments[i : i + window_size] combined = "".join(f[0] for f in window) # Combined length must be close to 17 (allow +/- 2 for OCR noise) # Must contain at least 2 digit characters (VINs always have digits; # pure-alphabetic text is almost certainly not a VIN) if 15 <= len(combined) <= 19 and sum(c.isdigit() for c in combined) >= 2: self._try_add_candidate( combined, window[0][1], window[-1][2], candidates, seen_vins ) # Sort by likelihood of being valid (check digit first, then position) def score_candidate(c: tuple[str, int, int]) -> int: vin = c[0] if self.validate_check_digit(vin): return 0 return 1 candidates.sort(key=score_candidate) return candidates[:max_candidates] def _try_add_candidate( self, raw: str, start: int, end: int, candidates: list[tuple[str, int, int]], seen_vins: set[str], ) -> None: """Try to add a corrected VIN candidate if it passes validation.""" corrected = self.correct_ocr_errors(raw) if len(corrected) == 17: self._add_if_valid(corrected, start, end, candidates, seen_vins) return if len(corrected) > 17: # Strategy A: try every 17-char sliding window for i in range(len(corrected) - 16): window = corrected[i : i + 17] self._add_if_valid(window, start, end, candidates, seen_vins) # Strategy B: for 18-19 char strings, try deleting each # character one at a time. OCR often inserts a spurious # character (e.g. sticker border read as 'C') that breaks # the VIN. Check-digit validation filters out false hits. if len(corrected) <= 19: for i in range(len(corrected)): reduced = corrected[:i] + corrected[i + 1 :] if len(reduced) == 17: self._add_if_valid( reduced, start, end, candidates, seen_vins ) elif len(reduced) == 18: # Two deletions needed — try removing one more for j in range(len(reduced)): reduced2 = reduced[:j] + reduced[j + 1 :] self._add_if_valid( reduced2, start, end, candidates, seen_vins ) def _add_if_valid( self, vin: str, start: int, end: int, candidates: list[tuple[str, int, int]], seen_vins: set[str], ) -> None: """Add a 17-char VIN to candidates if it matches the pattern.""" if len(vin) == 17 and self.MODERN_VIN_PATTERN.match(vin): if vin not in seen_vins: seen_vins.add(vin) candidates.append((vin, start, end)) # Singleton instance vin_validator = VinValidator()