Files
motovaultpro/ocr/app/validators/vin_validator.py
Eric Gullickson e4336ce9da
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 37s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 51s
Deploy to Staging / Verify Staging (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
fix: extract VIN from noisy OCR via sliding window + char deletion (refs #113)
When OCR reads extra characters (e.g. sticker border as 'C', spurious
'Z' insertion), the raw text exceeds 17 chars and the old first-17
trim produced wrong VINs. New strategy tries all 17-char sliding
windows and single/double character deletions, validating each via
check digit. For 'CWVGGNPE2Z4NP069500', this finds the correct VIN
'WVGGNPE24NP069500' (valid check digit) instead of 'CWVGGNPE2Z4NP0695'
(invalid).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 22:00:07 -06:00

339 lines
11 KiB
Python

"""VIN validation with check digit verification and OCR error correction."""
import re
from dataclasses import dataclass
from typing import Optional
@dataclass
class VinValidationResult:
    """Outcome of one VIN validation attempt.

    Attributes:
        is_valid: True when the VIN was accepted. An accepted VIN may still
            carry an ``error`` message (for example when the check digit
            does not match), so callers should inspect both fields.
        vin: The VIN the verdict refers to, after upper-casing and any OCR
            correction that was applied; empty when no input was supplied.
        confidence_adjustment: Signed delta reported alongside the verdict
            (positive for a verified check digit, negative otherwise).
            Presumably combined with the OCR confidence by the caller --
            confirm against the call site.
        error: Human-readable reason for a rejection or warning, or ``None``
            when there is nothing to report.
    """

    is_valid: bool
    vin: str
    confidence_adjustment: float
    error: Optional[str] = None
class VinValidator:
    """Validate VIN strings and recover VINs from noisy OCR text.

    The class keeps no per-instance state: every method works only on its
    arguments and the class-level lookup tables below.
    """

    # VIN character set (excludes I, O, Q)
    VALID_CHARS = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789")

    # Common OCR misreads and their corrections.
    # Only map characters that are INVALID in VINs to their likely correct values.
    # B and S are valid VIN characters and must NOT be transliterated.
    # NOTE: correct_ocr_errors() upper-cases its input before consulting this
    # table, so the lower-case keys are never matched there; in particular a
    # lower-case "l" is turned into "L", not "1".
    TRANSLITERATION = {
        "I": "1",
        "O": "0",
        "Q": "0",
        "i": "1",
        "o": "0",
        "q": "0",
        "l": "1",
    }

    # Weights for check digit calculation (positions 1-17). Position 9 is the
    # check digit itself and therefore carries weight 0.
    CHECK_WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]

    # Character to value mapping for check digit
    CHAR_VALUES = {
        "A": 1,
        "B": 2,
        "C": 3,
        "D": 4,
        "E": 5,
        "F": 6,
        "G": 7,
        "H": 8,
        "J": 1,
        "K": 2,
        "L": 3,
        "M": 4,
        "N": 5,
        "P": 7,
        "R": 9,
        "S": 2,
        "T": 3,
        "U": 4,
        "V": 5,
        "W": 6,
        "X": 7,
        "Y": 8,
        "Z": 9,
        "0": 0,
        "1": 1,
        "2": 2,
        "3": 3,
        "4": 4,
        "5": 5,
        "6": 6,
        "7": 7,
        "8": 8,
        "9": 9,
    }

    # Modern VIN pattern (1981+): exactly 17 alphanumeric, no I/O/Q
    MODERN_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{17}$")

    # Pre-1981 VIN pattern: 11-17 characters
    LEGACY_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{11,17}$")

    # Unbroken run of VIN-like characters. I/O/Q are accepted here because
    # they are transliterated before the strict pattern is applied.
    _CONTINUOUS_RUN_PATTERN = re.compile(r"[A-Z0-9IOQ]{11,20}", re.IGNORECASE)

    # Word-sized fragment that may be one piece of a VIN split by the OCR.
    _FRAGMENT_PATTERN = re.compile(r"[A-Z0-9IOQ]{3,}", re.IGNORECASE)

    def correct_ocr_errors(self, vin: str) -> str:
        """Normalise a raw OCR string and repair common misreads.

        The input is upper-cased, stripped of surrounding whitespace, has
        spaces and dashes removed, and then has every character listed in
        ``TRANSLITERATION`` replaced by its mapped value.

        Args:
            vin: Raw VIN string from OCR.

        Returns:
            Corrected VIN string.
        """
        cleaned = vin.upper().strip().replace(" ", "").replace("-", "")
        return "".join(self.TRANSLITERATION.get(char, char) for char in cleaned)

    def calculate_check_digit(self, vin: str) -> Optional[str]:
        """Compute the expected check digit (position 9) of a VIN.

        Args:
            vin: 17-character VIN string.

        Returns:
            The expected check digit ("0"-"9" or "X"), or None when the
            input is not 17 characters long or contains a character that
            has no entry in ``CHAR_VALUES``.
        """
        upper_vin = vin.upper()
        # Upper-casing can lengthen a string (e.g. "ß" -> "SS"); checking
        # both lengths keeps the index into CHECK_WEIGHTS in range.
        if len(vin) != 17 or len(upper_vin) != 17:
            return None

        total = 0
        for position, char in enumerate(upper_vin):
            if position == 8:  # Skip check digit position
                continue
            value = self.CHAR_VALUES.get(char)
            if value is None:
                return None
            total += value * self.CHECK_WEIGHTS[position]

        remainder = total % 11
        return "X" if remainder == 10 else str(remainder)

    def validate_check_digit(self, vin: str) -> bool:
        """Report whether position 9 of a VIN equals its computed check digit.

        Args:
            vin: 17-character VIN string.

        Returns:
            True if the check digit is valid; False when it does not match
            or cannot be computed.
        """
        if len(vin) != 17:
            return False
        expected = self.calculate_check_digit(vin)
        return expected is not None and vin[8].upper() == expected

    def validate(
        self, vin: str, correct_errors: bool = True, allow_legacy: bool = False
    ) -> "VinValidationResult":
        """Validate a VIN string and optionally correct OCR errors.

        Args:
            vin: VIN string to validate.
            correct_errors: Whether to apply OCR error corrections; when
                False the input is only upper-cased.
            allow_legacy: Whether to accept pre-1981 VINs (11-16 chars
                matching ``LEGACY_VIN_PATTERN``) with reduced confidence.

        Returns:
            VinValidationResult with the validation status and the
            corrected VIN. A 17-character VIN whose check digit does not
            match is still reported as valid, with a negative confidence
            adjustment and an explanatory ``error``.
        """
        if not vin:
            return VinValidationResult(
                is_valid=False, vin="", confidence_adjustment=-1.0, error="Empty VIN"
            )

        corrected_vin = self.correct_ocr_errors(vin) if correct_errors else vin.upper()
        length = len(corrected_vin)

        if length != 17:
            # Legacy VIN - accepted with reduced confidence
            if (
                allow_legacy
                and 11 <= length <= 17
                and self.LEGACY_VIN_PATTERN.match(corrected_vin)
            ):
                return VinValidationResult(
                    is_valid=True,
                    vin=corrected_vin,
                    confidence_adjustment=-0.2,
                )
            return VinValidationResult(
                is_valid=False,
                vin=corrected_vin,
                confidence_adjustment=-0.5,
                error=f"Invalid length: {length} (expected 17)",
            )

        if not self.MODERN_VIN_PATTERN.match(corrected_vin):
            invalid_chars = [c for c in corrected_vin if c not in self.VALID_CHARS]
            return VinValidationResult(
                is_valid=False,
                vin=corrected_vin,
                confidence_adjustment=-0.3,
                error=f"Invalid characters: {invalid_chars}",
            )

        if self.validate_check_digit(corrected_vin):
            # Valid check digit - boost confidence
            return VinValidationResult(
                is_valid=True, vin=corrected_vin, confidence_adjustment=0.1
            )

        # Invalid check digit - could be OCR error or old VIN
        return VinValidationResult(
            is_valid=True,  # Still return as valid but with reduced confidence
            vin=corrected_vin,
            confidence_adjustment=-0.15,
            error="Check digit validation failed",
        )

    def extract_candidates(
        self, text: str, max_candidates: int = 5
    ) -> list[tuple[str, int, int]]:
        """Extract VIN candidates from raw OCR text.

        Uses two strategies:
        1. Find continuous 11-20 char alphanumeric runs (handles intact VINs)
        2. Concatenate adjacent short fragments separated by spaces/dashes
           (handles Tesseract fragmenting VINs into multiple words)

        Args:
            text: Raw OCR text.
            max_candidates: Maximum number of candidates to return.

        Returns:
            List of (vin, start_pos, end_pos) tuples. Positions are offsets
            into the upper-cased text of the run (or first/last fragment)
            the candidate was derived from. Candidates whose check digit
            validates come first; within each group the order of discovery
            is kept.
        """
        candidates: list[tuple[str, int, int]] = []
        seen_vins: set[str] = set()
        upper_text = text.upper()

        # Strategy 1: continuous runs of VIN-like characters
        for match in self._CONTINUOUS_RUN_PATTERN.finditer(upper_text):
            self._try_add_candidate(
                match.group(), match.start(), match.end(), candidates, seen_vins
            )

        # Strategy 2: concatenate adjacent alphanumeric fragments.
        # This handles OCR fragmentation like "1HGBH 41JXMN 109186".
        # Only fragments >= 3 chars are considered (filters out noise/short words).
        fragments = [
            (m.group(), m.start(), m.end())
            for m in self._FRAGMENT_PATTERN.finditer(upper_text)
        ]

        # Try sliding windows of 2-4 adjacent fragments
        for window_size in range(2, min(5, len(fragments) + 1)):
            for first in range(len(fragments) - window_size + 1):
                window = fragments[first : first + window_size]
                combined = "".join(piece for piece, _, _ in window)
                # Combined length must be close to 17 (allow +/- 2 for OCR noise)
                if not 15 <= len(combined) <= 19:
                    continue
                # VINs always have digits; pure-alphabetic text is almost
                # certainly not a VIN, so require at least two.
                if sum(ch.isdigit() for ch in combined) < 2:
                    continue
                self._try_add_candidate(
                    combined, window[0][1], window[-1][2], candidates, seen_vins
                )

        # Rank candidates with a valid check digit ahead of the rest.
        # list.sort is stable, so ties keep their discovery order.
        candidates.sort(key=lambda c: 0 if self.validate_check_digit(c[0]) else 1)
        return candidates[:max_candidates]

    def _try_add_candidate(
        self,
        raw: str,
        start: int,
        end: int,
        candidates: list[tuple[str, int, int]],
        seen_vins: set[str],
    ) -> None:
        """Derive 17-char VIN candidates from one raw run and record them.

        A corrected run of exactly 17 characters is recorded as is. Longer
        runs are expanded in two ways:

        * every contiguous 17-character window is recorded when it matches
          the modern VIN pattern;
        * for 18- and 19-character runs, every string obtained by deleting
          one (respectively two) characters is recorded, but only when its
          check digit validates, because these are speculative edits.

        Runs shorter than 17 characters yield nothing.

        Args:
            raw: Raw alphanumeric run (or concatenated fragments).
            start: Start offset reported for every derived candidate.
            end: End offset reported for every derived candidate.
            candidates: Output list, appended to in place.
            seen_vins: VINs already recorded, updated in place.
        """
        corrected = self.correct_ocr_errors(raw)
        length = len(corrected)

        if length == 17:
            self._add_if_valid(corrected, start, end, candidates, seen_vins)
            return
        if length < 17:
            return

        # Strategy A: try every 17-char sliding window
        for offset in range(length - 16):
            self._add_if_valid(
                corrected[offset : offset + 17], start, end, candidates, seen_vins
            )

        # Strategy B: for 18-19 char strings, try deleting characters. OCR
        # often inserts a spurious character (e.g. sticker border read as
        # 'C') that breaks the VIN. Requiring a valid check digit filters
        # out the many false hits this brute-force search produces.
        if length > 19:
            return
        for i in range(length):
            reduced = corrected[:i] + corrected[i + 1 :]
            if length == 18:
                self._add_if_valid(
                    reduced,
                    start,
                    end,
                    candidates,
                    seen_vins,
                    require_check_digit=True,
                )
                continue
            # Two deletions needed. Starting the second index at i visits
            # each pair of deleted positions exactly once, in the same
            # first-seen order as trying every second index would.
            for j in range(i, len(reduced)):
                self._add_if_valid(
                    reduced[:j] + reduced[j + 1 :],
                    start,
                    end,
                    candidates,
                    seen_vins,
                    require_check_digit=True,
                )

    def _add_if_valid(
        self,
        vin: str,
        start: int,
        end: int,
        candidates: list[tuple[str, int, int]],
        seen_vins: set[str],
        *,
        require_check_digit: bool = False,
    ) -> None:
        """Record a 17-char VIN once if it matches the modern pattern.

        Args:
            vin: Candidate VIN string.
            start: Start offset stored with the candidate.
            end: End offset stored with the candidate.
            candidates: Output list, appended to in place.
            seen_vins: VINs already recorded, updated in place.
            require_check_digit: When True, additionally require the check
                digit to validate. Rejected strings are not marked as seen,
                so another extraction path may still record them.
        """
        if vin in seen_vins or len(vin) != 17:
            return
        if not self.MODERN_VIN_PATTERN.match(vin):
            return
        if require_check_digit and not self.validate_check_digit(vin):
            return
        seen_vins.add(vin)
        candidates.append((vin, start, end))
# Singleton instance: a ready-made validator created at import time.
# VinValidator defines no __init__ and its methods never assign to self,
# so this one object can be reused for every call.
vin_validator = VinValidator()