feat: add VIN photo OCR pipeline (refs #67)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 31s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped

Implement VIN-specific OCR extraction with optimized preprocessing:

- Add POST /extract/vin endpoint for VIN extraction
- VIN preprocessor: CLAHE, deskew, denoise, adaptive threshold
- VIN validator: check digit validation, OCR error correction (I->1, O->0)
- VIN extractor: PSM modes 6/7/8, character whitelist, alternatives
- Response includes confidence, bounding box, and alternatives
- Unit tests for validator and preprocessor
- Integration tests for VIN extraction endpoint

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-01 19:31:36 -06:00
parent 004940b013
commit 54cbd49171
14 changed files with 1694 additions and 1 deletions

View File

@@ -0,0 +1,259 @@
"""VIN validation with check digit verification and OCR error correction."""
import re
from dataclasses import dataclass
from typing import Optional
@dataclass
class VinValidationResult:
    """Outcome of validating one VIN string.

    Attributes:
        is_valid: Whether the VIN was accepted.
        vin: The VIN that was evaluated.
        confidence_adjustment: Signed delta reported alongside the verdict
            (presumably applied to the OCR confidence by the caller --
            TODO confirm against the extractor).
        error: Human-readable reason for a failure or warning, or ``None``
            when there is nothing to report.
    """

    # Verdict of the validation.
    is_valid: bool
    # VIN string the verdict applies to.
    vin: str
    # Positive values reward a verified VIN, negative values penalise doubt.
    confidence_adjustment: float
    # Optional explanation; defaults to "no error".
    error: Optional[str] = None
class VinValidator:
    """Validate VIN strings and repair unambiguous OCR misreads.

    Every lookup table is a class attribute and the methods keep no
    per-instance state.
    """

    # Characters allowed in a VIN: digits and capital letters except I, O, Q.
    VALID_CHARS = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789")

    # OCR misreads that are always safe to rewrite. I, O and Q never occur in
    # a VIN, and VINs are upper-case, so a lower-case "l" is taken as the
    # digit 1. Look-alike pairs such as B/8, S/5 and L/1 are deliberately NOT
    # listed: B, S and L are legitimate VIN characters (e.g. a "WBA..."
    # prefix), so rewriting them unconditionally corrupts correctly read VINs.
    TRANSLITERATION = {
        "I": "1",
        "O": "0",
        "Q": "0",
        "i": "1",
        "o": "0",
        "q": "0",
        "l": "1",
    }

    # Per-position weights for the check digit (positions 1-17). Position 9
    # holds the check digit itself and therefore carries weight 0.
    CHECK_WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]

    # Numeric value of each VIN character for the check digit calculation.
    CHAR_VALUES = {
        "A": 1,
        "B": 2,
        "C": 3,
        "D": 4,
        "E": 5,
        "F": 6,
        "G": 7,
        "H": 8,
        "J": 1,
        "K": 2,
        "L": 3,
        "M": 4,
        "N": 5,
        "P": 7,
        "R": 9,
        "S": 2,
        "T": 3,
        "U": 4,
        "V": 5,
        "W": 6,
        "X": 7,
        "Y": 8,
        "Z": 9,
        "0": 0,
        "1": 1,
        "2": 2,
        "3": 3,
        "4": 4,
        "5": 5,
        "6": 6,
        "7": 7,
        "8": 8,
        "9": 9,
    }

    # Modern VIN pattern (1981+): exactly 17 alphanumeric, no I/O/Q
    MODERN_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{17}$")

    # Pre-1981 VIN pattern: 11-17 characters
    LEGACY_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{11,17}$")

    # Runs of ASCII letters/digits long enough to contain a 17-character VIN.
    # I, O and Q are included on purpose; they are corrected afterwards.
    _CANDIDATE_RUN_PATTERN = re.compile(r"[A-Z0-9]{17,}", re.IGNORECASE | re.ASCII)

    def correct_ocr_errors(self, vin: str) -> str:
        """
        Apply unambiguous OCR error corrections to a VIN string.

        Surrounding whitespace, inner spaces and dashes are removed, the
        characters listed in ``TRANSLITERATION`` are replaced, and the result
        is upper-cased.

        Args:
            vin: Raw VIN string from OCR

        Returns:
            Corrected, upper-cased VIN string
        """
        compact = vin.strip().replace(" ", "").replace("-", "")
        # Substitute BEFORE upper-casing: afterwards a misread lower-case "l"
        # could no longer be told apart from a genuine capital "L".
        substituted = "".join(self.TRANSLITERATION.get(ch, ch) for ch in compact)
        return substituted.upper()

    def calculate_check_digit(self, vin: str) -> Optional[str]:
        """
        Calculate the check digit (position 9) for a VIN.

        Args:
            vin: 17-character VIN string (case-insensitive)

        Returns:
            Expected check digit character ("0"-"9" or "X"), or None if the
            VIN is not 17 characters long or contains a character that has no
            check digit value
        """
        if len(vin) != 17:
            return None

        total = 0
        for position, char in enumerate(vin):
            if position == 8:  # The check digit does not contribute to the sum
                continue
            # Upper-case per character: upper-casing the whole string can
            # change its length (e.g. "ß" -> "SS") and shift every position.
            value = self.CHAR_VALUES.get(char.upper())
            if value is None:
                return None
            total += value * self.CHECK_WEIGHTS[position]

        remainder = total % 11
        return "X" if remainder == 10 else str(remainder)

    def validate_check_digit(self, vin: str) -> bool:
        """
        Validate the check digit of a VIN.

        Args:
            vin: 17-character VIN string

        Returns:
            True if position 9 holds the expected check digit
        """
        if len(vin) != 17:
            return False
        expected = self.calculate_check_digit(vin)
        if expected is None:
            return False
        return vin[8].upper() == expected

    def _invalid_characters_result(self, vin: str) -> "VinValidationResult":
        """Build the failure result that lists the characters not allowed in a VIN."""
        invalid_chars = [c for c in vin if c not in self.VALID_CHARS]
        return VinValidationResult(
            is_valid=False,
            vin=vin,
            confidence_adjustment=-0.3,
            error=f"Invalid characters: {invalid_chars}",
        )

    def validate(
        self, vin: str, correct_errors: bool = True, allow_legacy: bool = False
    ) -> "VinValidationResult":
        """
        Validate a VIN string and optionally correct OCR errors.

        Args:
            vin: VIN string to validate
            correct_errors: Whether to apply OCR error corrections
            allow_legacy: Whether to allow pre-1981 VINs (11-17 chars)

        Returns:
            VinValidationResult with validation status and corrected VIN.
            A 17-character VIN whose check digit does not match is still
            reported as valid, with a negative confidence adjustment and an
            error message.
        """
        if not vin:
            return VinValidationResult(
                is_valid=False, vin="", confidence_adjustment=-1.0, error="Empty VIN"
            )

        # Apply error corrections if enabled
        corrected_vin = self.correct_ocr_errors(vin) if correct_errors else vin.upper()
        length = len(corrected_vin)

        if length != 17:
            if allow_legacy and 11 <= length < 17:
                # fullmatch: "$" with match() would accept a trailing newline
                if self.LEGACY_VIN_PATTERN.fullmatch(corrected_vin):
                    # Legacy VIN - reduced confidence
                    return VinValidationResult(
                        is_valid=True,
                        vin=corrected_vin,
                        confidence_adjustment=-0.2,
                    )
                # The length is acceptable for a legacy VIN, so the real
                # problem is the character set; report that instead.
                return self._invalid_characters_result(corrected_vin)
            return VinValidationResult(
                is_valid=False,
                vin=corrected_vin,
                confidence_adjustment=-0.5,
                error=f"Invalid length: {length} (expected 17)",
            )

        # Check character set
        if not self.MODERN_VIN_PATTERN.fullmatch(corrected_vin):
            return self._invalid_characters_result(corrected_vin)

        # Validate check digit
        if self.validate_check_digit(corrected_vin):
            # Valid check digit - boost confidence
            return VinValidationResult(
                is_valid=True, vin=corrected_vin, confidence_adjustment=0.1
            )

        # Invalid check digit - could be an OCR error or an old VIN, so the
        # VIN is still returned as valid but with reduced confidence.
        return VinValidationResult(
            is_valid=True,
            vin=corrected_vin,
            confidence_adjustment=-0.15,
            error="Check digit validation failed",
        )

    def extract_candidates(
        self, text: str, max_candidates: int = 5
    ) -> list[tuple[str, int, int]]:
        """
        Extract VIN candidates from raw OCR text.

        Every 17-character window of each alphanumeric run is considered, so
        a VIN glued to neighbouring characters (e.g. a missing space after a
        label) is still found. Candidates whose check digit validates come
        first; within each group the order of appearance is kept.

        Args:
            text: Raw OCR text
            max_candidates: Maximum number of candidates to return

        Returns:
            List of (vin, start_pos, end_pos) tuples; positions index into
            ``text`` as passed in
        """
        if not text or max_candidates <= 0:
            return []

        candidates = []
        # Match on the original text (not an upper-cased copy) so positions
        # stay aligned and lower-case misreads remain recognisable.
        for run in self._CANDIDATE_RUN_PATTERN.finditer(text):
            run_text = run.group()
            for offset in range(len(run_text) - 16):
                corrected = self.correct_ocr_errors(run_text[offset:offset + 17])
                # Only include if it could be a valid VIN after correction
                if len(corrected) == 17 and self.MODERN_VIN_PATTERN.fullmatch(corrected):
                    start = run.start() + offset
                    candidates.append((corrected, start, start + 17))

        # False sorts before True, so check-digit-valid candidates lead; the
        # sort is stable, which preserves text order inside each group.
        candidates.sort(key=lambda c: not self.validate_check_digit(c[0]))
        return candidates[:max_candidates]
# Module-level shared instance, created once at import time so callers can
# import `vin_validator` instead of constructing their own VinValidator.
vin_validator = VinValidator()