All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 31s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement VIN-specific OCR extraction with optimized preprocessing:
- Add POST /extract/vin endpoint for VIN extraction
- VIN preprocessor: CLAHE, deskew, denoise, adaptive threshold
- VIN validator: check digit validation, OCR error correction (I->1, O->0)
- VIN extractor: PSM modes 6/7/8, character whitelist, alternatives
- Response includes confidence, bounding box, and alternatives
- Unit tests for validator and preprocessor
- Integration tests for VIN extraction endpoint

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
260 lines
7.5 KiB
Python
260 lines
7.5 KiB
Python
"""VIN validation with check digit verification and OCR error correction."""
|
|
import re
|
|
from dataclasses import dataclass
|
|
from typing import Optional
|
|
|
|
|
|
@dataclass
class VinValidationResult:
    """Outcome of validating one VIN string.

    Attributes:
        is_valid: Whether the VIN was accepted.
        vin: The VIN that was evaluated (after any correction), or "" when
            the input was empty.
        confidence_adjustment: Signed delta the caller applies to its OCR
            confidence; negative values are penalties.
        error: Human-readable reason for a failure or a penalty, or None.
            May be set even when ``is_valid`` is True (check digit mismatch).
    """

    is_valid: bool
    vin: str
    confidence_adjustment: float
    error: Optional[str] = None
|
|
|
|
|
|
class VinValidator:
    """Validate VIN strings and repair unambiguous OCR misreads.

    The class keeps no per-instance state; every method works from the
    class-level tables below.
    """

    # VIN character set (excludes I, O, Q)
    VALID_CHARS = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789")

    # Unambiguous OCR misreads and their corrections.
    #
    # Only characters that are NOT valid VIN characters may be listed here,
    # because the mapping is applied unconditionally. B, S and uppercase L
    # are legal VIN characters (see VALID_CHARS / CHAR_VALUES), so rewriting
    # them to 8 / 5 / 1 would corrupt every genuine VIN that contains them.
    # Lowercase "l" is kept because it is looked up before upper-casing and
    # is therefore distinguishable from a real "L".
    TRANSLITERATION = {
        "I": "1",
        "O": "0",
        "Q": "0",
        "i": "1",
        "o": "0",
        "q": "0",
        "l": "1",
    }

    # Weights for check digit calculation (positions 1-17)
    CHECK_WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]

    # Character to value mapping for check digit
    CHAR_VALUES = {
        "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8,
        "J": 1, "K": 2, "L": 3, "M": 4, "N": 5, "P": 7, "R": 9,
        "S": 2, "T": 3, "U": 4, "V": 5, "W": 6, "X": 7, "Y": 8, "Z": 9,
        "0": 0, "1": 1, "2": 2, "3": 3, "4": 4,
        "5": 5, "6": 6, "7": 7, "8": 8, "9": 9,
    }

    # Modern VIN pattern (1981+): exactly 17 alphanumeric, no I/O/Q
    MODERN_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{17}$")

    # Pre-1981 VIN pattern: 11-17 characters
    LEGACY_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{11,17}$")

    # Runs of ASCII alphanumerics that might be a VIN. I, O and Q (either
    # case) are deliberately allowed so they can be corrected afterwards.
    _CANDIDATE_PATTERN = re.compile(r"[A-Za-z0-9]{11,17}")

    # Zero-based index of the check digit (VIN position 9).
    _CHECK_DIGIT_INDEX = 8

    def correct_ocr_errors(self, vin: str) -> str:
        """Apply unambiguous OCR error corrections to a VIN string.

        Surrounding whitespace is stripped, spaces and dashes are removed,
        the TRANSLITERATION mapping is applied, and the result is
        upper-cased.

        Args:
            vin: Raw VIN string from OCR

        Returns:
            Corrected, upper-cased VIN string
        """
        # Built per call so a subclass overriding TRANSLITERATION is honoured.
        table = str.maketrans({**self.TRANSLITERATION, " ": None, "-": None})
        # Translate BEFORE upper-casing: the "l" -> "1" rule is case-sensitive
        # and must not fire for a genuine uppercase "L".
        return vin.strip().translate(table).upper()

    def calculate_check_digit(self, vin: str) -> Optional[str]:
        """Calculate the check digit (position 9) for a VIN.

        Args:
            vin: 17-character VIN string

        Returns:
            Expected check digit character ("0"-"9" or "X"), or None if the
            input is not 17 characters or contains a character with no
            entry in CHAR_VALUES (position 9 itself is not inspected).
        """
        normalized = vin.upper()
        # str.upper() can change the length of non-ASCII input (e.g. "ß"),
        # so both lengths are checked before indexing by position.
        if len(vin) != 17 or len(normalized) != 17:
            return None

        total = 0
        for index, (char, weight) in enumerate(zip(normalized, self.CHECK_WEIGHTS)):
            if index == self._CHECK_DIGIT_INDEX:
                continue  # the check digit does not contribute to its own sum
            value = self.CHAR_VALUES.get(char)
            if value is None:
                return None
            total += value * weight

        remainder = total % 11
        return "X" if remainder == 10 else str(remainder)

    def validate_check_digit(self, vin: str) -> bool:
        """Validate the check digit of a VIN.

        Args:
            vin: 17-character VIN string

        Returns:
            True if position 9 equals the calculated check digit; False if
            it differs or the check digit cannot be calculated.
        """
        expected = self.calculate_check_digit(vin)
        if expected is None:
            return False
        # calculate_check_digit only succeeds for 17-character input, so the
        # index below is always in range.
        return vin[self._CHECK_DIGIT_INDEX].upper() == expected

    def validate(
        self, vin: str, correct_errors: bool = True, allow_legacy: bool = False
    ) -> "VinValidationResult":
        """Validate a VIN string and optionally correct OCR errors.

        Args:
            vin: VIN string to validate
            correct_errors: Whether to apply OCR error corrections
            allow_legacy: Whether to allow pre-1981 VINs (11-17 chars)

        Returns:
            VinValidationResult with validation status and corrected VIN.
            A 17-character VIN whose check digit does not match is still
            reported as valid, but with a negative confidence adjustment
            and an error message.
        """
        if not vin:
            return VinValidationResult(
                is_valid=False, vin="", confidence_adjustment=-1.0, error="Empty VIN"
            )

        # Surrounding whitespace is never part of a VIN, so it is stripped
        # even when OCR corrections are disabled.
        if correct_errors:
            corrected_vin = self.correct_ocr_errors(vin)
        else:
            corrected_vin = vin.strip().upper()

        # Check length
        if len(corrected_vin) != 17:
            # fullmatch: "$" with match() would tolerate a trailing newline.
            if (
                allow_legacy
                and 11 <= len(corrected_vin) <= 17
                and self.LEGACY_VIN_PATTERN.fullmatch(corrected_vin)
            ):
                # Legacy VIN - reduced confidence
                return VinValidationResult(
                    is_valid=True,
                    vin=corrected_vin,
                    confidence_adjustment=-0.2,
                )
            return VinValidationResult(
                is_valid=False,
                vin=corrected_vin,
                confidence_adjustment=-0.5,
                error=f"Invalid length: {len(corrected_vin)} (expected 17)",
            )

        # Check character set
        if not self.MODERN_VIN_PATTERN.fullmatch(corrected_vin):
            invalid_chars = [c for c in corrected_vin if c not in self.VALID_CHARS]
            return VinValidationResult(
                is_valid=False,
                vin=corrected_vin,
                confidence_adjustment=-0.3,
                error=f"Invalid characters: {invalid_chars}",
            )

        # Validate check digit
        if self.validate_check_digit(corrected_vin):
            # Valid check digit - boost confidence
            return VinValidationResult(
                is_valid=True, vin=corrected_vin, confidence_adjustment=0.1
            )

        # Invalid check digit - could be OCR error or old VIN
        return VinValidationResult(
            is_valid=True,  # Still return as valid but with reduced confidence
            vin=corrected_vin,
            confidence_adjustment=-0.15,
            error="Check digit validation failed",
        )

    def extract_candidates(
        self, text: str, max_candidates: int = 5
    ) -> list[tuple[str, int, int]]:
        """Extract VIN candidates from raw OCR text.

        Args:
            text: Raw OCR text
            max_candidates: Maximum number of candidates to return

        Returns:
            List of (vin, start_pos, end_pos) tuples. ``vin`` is the
            corrected candidate; the positions are offsets into ``text`` as
            passed in. Candidates with a valid check digit come first;
            otherwise order of appearance is preserved.
        """
        candidates = []
        # Search the original text (not text.upper()): upper-casing can change
        # the string length and would shift the reported offsets.
        for match in self._CANDIDATE_PATTERN.finditer(text):
            corrected = self.correct_ocr_errors(match.group())
            # Only keep runs that form a 17-character modern VIN after correction.
            if self.MODERN_VIN_PATTERN.fullmatch(corrected):
                candidates.append((corrected, match.start(), match.end()))

        # False sorts before True and list.sort is stable, so candidates with
        # a valid check digit move to the front without other reordering.
        candidates.sort(key=lambda c: not self.validate_check_digit(c[0]))
        return candidates[: max(max_candidates, 0)]
|
|
|
|
|
|
# Singleton instance
|
|
# One shared instance is enough: VinValidator defines no __init__ and its
# methods read only class-level constants.
vin_validator = VinValidator()
|