Some checks failed
Deploy to Staging / Build Images (pull_request) Failing after 4m14s
Deploy to Staging / Deploy to Staging (pull_request) Has been skipped
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 8s
339 lines
11 KiB
Python
339 lines
11 KiB
Python
"""VIN validation with check digit verification and OCR error correction."""
|
|
import re
|
|
from dataclasses import dataclass
|
|
from typing import Optional
|
|
|
|
|
|
@dataclass
class VinValidationResult:
    """Outcome of a single VIN validation attempt.

    Attributes:
        is_valid: Whether the VIN was accepted. ``VinValidator.validate``
            still reports ``True`` when only the check digit fails, so
            callers that need strict validation must also inspect ``error``.
        vin: The VIN that was evaluated (after upper-casing and, when
            enabled, OCR correction); empty string for empty input.
        confidence_adjustment: Signed delta to apply to an OCR confidence
            score. ``VinValidator.validate`` uses values between -1.0
            (empty input) and +0.1 (check digit verified).
        error: Human-readable description of the problem, or ``None`` when
            nothing was flagged.
    """

    is_valid: bool
    vin: str
    confidence_adjustment: float
    error: Optional[str] = None
|
|
|
|
|
|
class VinValidator:
    """Validates and corrects VIN strings, and extracts VINs from OCR text.

    The class holds only class-level lookup tables and compiled patterns;
    no per-instance state is created, so one instance can be shared.
    """

    # VIN character set (excludes I, O, Q)
    VALID_CHARS = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789")

    # Common OCR misreads and their corrections.
    # Only map characters that are INVALID in VINs to their likely correct values.
    # B and S are valid VIN characters and must NOT be transliterated.
    # NOTE: correct_ocr_errors() upper-cases its input before consulting this
    # table, so the lower-case keys are never hit from there (a lower-case
    # "l" becomes "L", which is a valid VIN character and is left alone).
    # The entries are kept so the table is unchanged for any other reader.
    TRANSLITERATION = {
        "I": "1",
        "O": "0",
        "Q": "0",
        "i": "1",
        "o": "0",
        "q": "0",
        "l": "1",
    }

    # Weights for check digit calculation (positions 1-17).
    # Position 9 (index 8) is the check digit itself, hence weight 0.
    CHECK_WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]

    # Character to value mapping for check digit
    CHAR_VALUES = {
        "A": 1,
        "B": 2,
        "C": 3,
        "D": 4,
        "E": 5,
        "F": 6,
        "G": 7,
        "H": 8,
        "J": 1,
        "K": 2,
        "L": 3,
        "M": 4,
        "N": 5,
        "P": 7,
        "R": 9,
        "S": 2,
        "T": 3,
        "U": 4,
        "V": 5,
        "W": 6,
        "X": 7,
        "Y": 8,
        "Z": 9,
        "0": 0,
        "1": 1,
        "2": 2,
        "3": 3,
        "4": 4,
        "5": 5,
        "6": 6,
        "7": 7,
        "8": 8,
        "9": 9,
    }

    # Modern VIN pattern (1981+): exactly 17 alphanumeric, no I/O/Q
    MODERN_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{17}$")

    # Pre-1981 VIN pattern: 11-17 characters
    LEGACY_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{11,17}$")

    # Patterns used by extract_candidates(); compiled once instead of per call.
    # I/O/Q are deliberately accepted here because OCR correction runs later.
    _CONTINUOUS_RUN_PATTERN = re.compile(r"[A-Z0-9IOQ]{11,20}", re.IGNORECASE)
    _FRAGMENT_PATTERN = re.compile(r"[A-Z0-9IOQ]{3,}", re.IGNORECASE)

    def correct_ocr_errors(self, vin: str) -> str:
        """
        Apply common OCR error corrections to a VIN string.

        The input is upper-cased, stripped of surrounding whitespace, has
        every space and dash removed, and then each character found in
        ``TRANSLITERATION`` is replaced (I -> 1, O -> 0, Q -> 0).

        Args:
            vin: Raw VIN string from OCR

        Returns:
            Corrected VIN string
        """
        corrected = vin.upper().strip()

        # Remove any spaces or dashes (common in formatted VINs)
        corrected = corrected.replace(" ", "").replace("-", "")

        # Apply transliteration for common OCR errors
        return "".join(self.TRANSLITERATION.get(char, char) for char in corrected)

    def calculate_check_digit(self, vin: str) -> Optional[str]:
        """
        Calculate the check digit (position 9) for a VIN.

        Each character except position 9 is mapped through ``CHAR_VALUES``,
        multiplied by its ``CHECK_WEIGHTS`` entry, and the sum is reduced
        modulo 11; a remainder of 10 is written as "X".

        Args:
            vin: 17-character VIN string

        Returns:
            Expected check digit character, or None if calculation fails
            (wrong length, or a character with no entry in ``CHAR_VALUES``)
        """
        if len(vin) != 17:
            return None

        # Upper-casing can change the length of non-ASCII text (e.g. "ß"
        # becomes "SS"), which would otherwise index past CHECK_WEIGHTS.
        normalized = vin.upper()
        if len(normalized) != 17:
            return None

        total = 0
        for i, char in enumerate(normalized):
            if i == 8:  # Skip check digit position
                continue
            value = self.CHAR_VALUES.get(char)
            if value is None:
                return None
            total += value * self.CHECK_WEIGHTS[i]

        remainder = total % 11
        return "X" if remainder == 10 else str(remainder)

    def validate_check_digit(self, vin: str) -> bool:
        """
        Validate the check digit of a VIN.

        Args:
            vin: 17-character VIN string

        Returns:
            True if the character at position 9 equals the calculated
            check digit; False for any other input, including wrong
            length or characters that cannot be scored
        """
        if len(vin) != 17:
            return False

        expected = self.calculate_check_digit(vin)
        if expected is None:
            return False

        return vin[8].upper() == expected

    def validate(
        self, vin: str, correct_errors: bool = True, allow_legacy: bool = False
    ) -> "VinValidationResult":
        """
        Validate a VIN string and optionally correct OCR errors.

        Checks run in order: empty input, length, character set, check
        digit. A check-digit mismatch is still reported as valid, but with
        a negative confidence adjustment and ``error`` set.

        Args:
            vin: VIN string to validate
            correct_errors: Whether to apply OCR error corrections
                (when False the input is only upper-cased)
            allow_legacy: Whether to allow pre-1981 VINs (11-17 chars)

        Returns:
            VinValidationResult with validation status and corrected VIN
        """
        if not vin:
            return VinValidationResult(
                is_valid=False, vin="", confidence_adjustment=-1.0, error="Empty VIN"
            )

        # Apply error corrections if enabled
        corrected_vin = self.correct_ocr_errors(vin) if correct_errors else vin.upper()

        # Check length
        if len(corrected_vin) != 17:
            if allow_legacy and 11 <= len(corrected_vin) <= 17:
                # Legacy VIN - reduced confidence.
                # fullmatch(): "$" alone would also accept a trailing newline.
                if self.LEGACY_VIN_PATTERN.fullmatch(corrected_vin):
                    return VinValidationResult(
                        is_valid=True,
                        vin=corrected_vin,
                        confidence_adjustment=-0.2,
                    )
            return VinValidationResult(
                is_valid=False,
                vin=corrected_vin,
                confidence_adjustment=-0.5,
                error=f"Invalid length: {len(corrected_vin)} (expected 17)",
            )

        # Check character set
        if not self.MODERN_VIN_PATTERN.fullmatch(corrected_vin):
            invalid_chars = [c for c in corrected_vin if c not in self.VALID_CHARS]
            return VinValidationResult(
                is_valid=False,
                vin=corrected_vin,
                confidence_adjustment=-0.3,
                error=f"Invalid characters: {invalid_chars}",
            )

        # Validate check digit
        if self.validate_check_digit(corrected_vin):
            # Valid check digit - boost confidence
            return VinValidationResult(
                is_valid=True, vin=corrected_vin, confidence_adjustment=0.1
            )

        # Invalid check digit - could be OCR error or old VIN
        return VinValidationResult(
            is_valid=True,  # Still return as valid but with reduced confidence
            vin=corrected_vin,
            confidence_adjustment=-0.15,
            error="Check digit validation failed",
        )

    def extract_candidates(
        self, text: str, max_candidates: int = 5
    ) -> list[tuple[str, int, int]]:
        """
        Extract VIN candidates from raw OCR text.

        Uses two strategies:
        1. Find continuous 11-20 char alphanumeric runs (handles intact VINs)
        2. Concatenate adjacent short fragments separated by spaces/dashes
           (handles OCR fragmenting VINs into multiple words)

        Candidates whose check digit verifies are returned first; within
        each group the discovery order is preserved (the sort is stable).

        Args:
            text: Raw OCR text
            max_candidates: Maximum number of candidates to return;
                zero or a negative value yields an empty list

        Returns:
            List of (vin, start_pos, end_pos) tuples. Positions are offsets
            into the upper-cased text and span the whole matched run or
            fragment window, not just the 17 characters of the candidate.
        """
        # A negative limit would otherwise slice from the end of the list.
        if not text or max_candidates <= 0:
            return []

        candidates: list[tuple[str, int, int]] = []
        seen_vins: set[str] = set()

        upper_text = text.upper()

        # Strategy 1: Find continuous runs of VIN-like characters
        for match in self._CONTINUOUS_RUN_PATTERN.finditer(upper_text):
            self._try_add_candidate(
                match.group(), match.start(), match.end(), candidates, seen_vins
            )

        # Strategy 2: Concatenate adjacent alphanumeric fragments
        # This handles OCR fragmentation like "1HGBH 41JXMN 109186"
        # Only consider fragments >= 3 chars (filters out noise/short words)
        fragments = [
            (m.group(), m.start(), m.end())
            for m in self._FRAGMENT_PATTERN.finditer(upper_text)
        ]

        # Try sliding windows of 2-4 adjacent fragments
        for window_size in range(2, min(5, len(fragments) + 1)):
            for i in range(len(fragments) - window_size + 1):
                window = fragments[i : i + window_size]
                combined = "".join(f[0] for f in window)
                # Combined length must be close to 17 (allow +/- 2 for OCR noise)
                # Must contain at least 2 digit characters (VINs always have digits;
                # pure-alphabetic text is almost certainly not a VIN)
                if 15 <= len(combined) <= 19 and sum(c.isdigit() for c in combined) >= 2:
                    self._try_add_candidate(
                        combined, window[0][1], window[-1][2], candidates, seen_vins
                    )

        # Rank candidates with a verified check digit first. list.sort() is
        # stable, so ties keep the order in which they were discovered.
        candidates.sort(key=lambda c: 0 if self.validate_check_digit(c[0]) else 1)
        return candidates[:max_candidates]

    def _try_add_candidate(
        self,
        raw: str,
        start: int,
        end: int,
        candidates: list[tuple[str, int, int]],
        seen_vins: set[str],
    ) -> None:
        """Derive 17-char VIN candidates from ``raw`` and record the valid ones.

        ``raw`` is OCR-corrected first. A 17-char result is tried as-is, a
        shorter one is ignored, and a longer one is expanded into sliding
        windows plus (for 18-19 chars) every one- or two-character deletion.
        Every candidate is recorded with the same ``start``/``end``.

        Args:
            raw: Candidate text taken from the OCR output
            start: Start offset to record for every derived candidate
            end: End offset to record for every derived candidate
            candidates: Output list, appended to in place
            seen_vins: VINs already recorded, updated in place
        """
        corrected = self.correct_ocr_errors(raw)
        length = len(corrected)

        if length == 17:
            self._add_if_valid(corrected, start, end, candidates, seen_vins)
            return

        if length < 17:
            return

        # Strategy A: try every 17-char sliding window
        for i in range(length - 16):
            self._add_if_valid(corrected[i : i + 17], start, end, candidates, seen_vins)

        # Strategy B: for 18-19 char strings, try deleting spurious
        # characters. OCR often inserts an extra character (e.g. sticker
        # border read as 'C') that breaks the VIN. Only the character
        # pattern is checked here; extract_candidates() later ranks
        # check-digit-valid results ahead of the rest.
        if length == 18:
            for i in range(length):
                self._add_if_valid(
                    corrected[:i] + corrected[i + 1 :],
                    start,
                    end,
                    candidates,
                    seen_vins,
                )
        elif length == 19:
            # Two deletions needed. Enumerate each index pair once (i < j);
            # this yields the same strings, in the same first-seen order,
            # as deleting one character and then another.
            for i in range(length):
                for j in range(i + 1, length):
                    self._add_if_valid(
                        corrected[:i] + corrected[i + 1 : j] + corrected[j + 1 :],
                        start,
                        end,
                        candidates,
                        seen_vins,
                    )

    def _add_if_valid(
        self,
        vin: str,
        start: int,
        end: int,
        candidates: list[tuple[str, int, int]],
        seen_vins: set[str],
    ) -> None:
        """Add a 17-char VIN to candidates if it matches the pattern.

        Duplicates (already present in ``seen_vins``) are skipped. The
        check digit is not examined here.

        Args:
            vin: Candidate VIN string
            start: Start offset to record with the candidate
            end: End offset to record with the candidate
            candidates: Output list, appended to in place
            seen_vins: VINs already recorded, updated in place
        """
        # fullmatch() enforces the exact 17-character length by itself.
        if vin in seen_vins or not self.MODERN_VIN_PATTERN.fullmatch(vin):
            return
        seen_vins.add(vin)
        candidates.append((vin, start, end))
|
|
|
|
|
|
# Singleton instance: shared module-level validator. VinValidator defines no
# per-instance state (only class-level tables and patterns), so one object
# can be reused by every caller.
vin_validator = VinValidator()
|