feat: add VIN photo OCR pipeline (refs #67)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 31s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 31s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement VIN-specific OCR extraction with optimized preprocessing:
- Add POST /extract/vin endpoint for VIN extraction
- VIN preprocessor: CLAHE, deskew, denoise, adaptive threshold
- VIN validator: check digit validation, OCR error correction (I->1, O->0)
- VIN extractor: PSM modes 6/7/8, character whitelist, alternatives
- Response includes confidence, bounding box, and alternatives
- Unit tests for validator and preprocessor
- Integration tests for VIN extraction endpoint

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
4
ocr/app/validators/__init__.py
Normal file
4
ocr/app/validators/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
"""Validators package for OCR data validation."""
|
||||
from app.validators.vin_validator import VinValidator, vin_validator
|
||||
|
||||
# Names exposed as this package's public API: the validator class and the
# ready-made instance imported from app.validators.vin_validator.
__all__ = ["VinValidator", "vin_validator"]
|
||||
259
ocr/app/validators/vin_validator.py
Normal file
259
ocr/app/validators/vin_validator.py
Normal file
@@ -0,0 +1,259 @@
|
||||
"""VIN validation with check digit verification and OCR error correction."""
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class VinValidationResult:
    """Outcome of validating a single VIN string."""

    # True when the VIN was accepted (possibly with a reduced confidence).
    is_valid: bool
    # The VIN as evaluated, i.e. after upper-casing / OCR correction.
    vin: str
    # Delta to apply to the OCR confidence: positive boosts, negative penalises.
    confidence_adjustment: float
    # Human-readable reason for a rejection or warning; None when there is none.
    error: Optional[str] = None
|
||||
|
||||
|
||||
class VinValidator:
    """Validate VIN strings and repair unambiguous OCR misreads.

    The class holds only class-level lookup tables, so a single instance can
    be shared freely.
    """

    # VIN character set (excludes I, O, Q)
    VALID_CHARS = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789")

    # OCR corrections that are always safe to apply: I, O and Q can never
    # occur in a VIN, so they must be misread digits. Keys are upper-case
    # because the input is upper-cased before this table is consulted.
    #
    # Look-alike pairs such as B/8, S/5 and L/1 are deliberately NOT listed:
    # B, S and L are legal VIN characters, and rewriting them unconditionally
    # corrupts every genuine VIN that contains them.
    TRANSLITERATION = {
        "I": "1",
        "O": "0",
        "Q": "0",
    }

    # Weights for check digit calculation (positions 1-17); position 9 is the
    # check digit itself and therefore carries weight 0.
    CHECK_WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]

    # Character to value mapping for check digit
    CHAR_VALUES = {
        "A": 1,
        "B": 2,
        "C": 3,
        "D": 4,
        "E": 5,
        "F": 6,
        "G": 7,
        "H": 8,
        "J": 1,
        "K": 2,
        "L": 3,
        "M": 4,
        "N": 5,
        "P": 7,
        "R": 9,
        "S": 2,
        "T": 3,
        "U": 4,
        "V": 5,
        "W": 6,
        "X": 7,
        "Y": 8,
        "Z": 9,
        "0": 0,
        "1": 1,
        "2": 2,
        "3": 3,
        "4": 4,
        "5": 5,
        "6": 6,
        "7": 7,
        "8": 8,
        "9": 9,
    }

    # Modern VIN pattern (1981+): exactly 17 alphanumeric, no I/O/Q
    MODERN_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{17}$")

    # Pre-1981 VIN pattern: 11-17 characters
    LEGACY_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{11,17}$")

    # Loose pattern used to locate VIN-like runs in free text. I, O and Q are
    # accepted here (they are part of A-Z) so they can be corrected afterwards.
    # Compiled once instead of on every extract_candidates() call.
    _CANDIDATE_PATTERN = re.compile(r"[A-Z0-9]{11,17}", re.IGNORECASE)

    def correct_ocr_errors(self, vin: str) -> str:
        """
        Apply unambiguous OCR error corrections to a VIN string.

        The input is upper-cased, stripped, has spaces and dashes removed, and
        then every character listed in TRANSLITERATION is replaced.

        Args:
            vin: Raw VIN string from OCR

        Returns:
            Corrected VIN string
        """
        # Remove spaces and dashes (common in formatted VINs).
        cleaned = vin.upper().strip().replace(" ", "").replace("-", "")
        return "".join(self.TRANSLITERATION.get(char, char) for char in cleaned)

    def calculate_check_digit(self, vin: str) -> Optional[str]:
        """
        Calculate the check digit (position 9) for a VIN.

        Args:
            vin: 17-character VIN string (case-insensitive)

        Returns:
            Expected check digit character ("0"-"9" or "X"), or None if the
            VIN is not 17 characters long or contains a character that has
            no check-digit value.
        """
        # Upper-case BEFORE measuring: str.upper() can change the length of a
        # string (e.g. "ß" -> "SS"), which would otherwise run past the end
        # of CHECK_WEIGHTS.
        normalized = vin.upper()
        if len(normalized) != len(self.CHECK_WEIGHTS):
            return None

        total = 0
        for position, (char, weight) in enumerate(
            zip(normalized, self.CHECK_WEIGHTS)
        ):
            if position == 8:  # Skip check digit position
                continue
            value = self.CHAR_VALUES.get(char)
            if value is None:
                return None
            total += value * weight

        remainder = total % 11
        return "X" if remainder == 10 else str(remainder)

    def validate_check_digit(self, vin: str) -> bool:
        """
        Validate the check digit of a VIN.

        Args:
            vin: 17-character VIN string (case-insensitive)

        Returns:
            True if the character at position 9 equals the computed check
            digit; False otherwise, including for wrong-length input.
        """
        normalized = vin.upper()
        if len(normalized) != 17:
            return False

        expected = self.calculate_check_digit(normalized)
        return expected is not None and normalized[8] == expected

    def validate(
        self, vin: str, correct_errors: bool = True, allow_legacy: bool = False
    ) -> "VinValidationResult":
        """
        Validate a VIN string and optionally correct OCR errors.

        Args:
            vin: VIN string to validate
            correct_errors: Whether to apply OCR error corrections
            allow_legacy: Whether to allow pre-1981 VINs (11-17 chars)

        Returns:
            VinValidationResult with validation status and corrected VIN.
            A 17-character VIN whose check digit does not match is still
            reported as valid, but with a negative confidence adjustment and
            an explanatory ``error`` message.
        """
        if not vin:
            return VinValidationResult(
                is_valid=False, vin="", confidence_adjustment=-1.0, error="Empty VIN"
            )

        # Apply error corrections if enabled
        corrected_vin = self.correct_ocr_errors(vin) if correct_errors else vin.upper()
        length = len(corrected_vin)

        # Check length
        if length != 17:
            if (
                allow_legacy
                and 11 <= length <= 17
                and self.LEGACY_VIN_PATTERN.match(corrected_vin)
            ):
                # Legacy VIN - reduced confidence
                return VinValidationResult(
                    is_valid=True,
                    vin=corrected_vin,
                    confidence_adjustment=-0.2,
                )
            return VinValidationResult(
                is_valid=False,
                vin=corrected_vin,
                confidence_adjustment=-0.5,
                error=f"Invalid length: {length} (expected 17)",
            )

        # Check character set
        if not self.MODERN_VIN_PATTERN.match(corrected_vin):
            invalid_chars = [c for c in corrected_vin if c not in self.VALID_CHARS]
            return VinValidationResult(
                is_valid=False,
                vin=corrected_vin,
                confidence_adjustment=-0.3,
                error=f"Invalid characters: {invalid_chars}",
            )

        # Validate check digit
        if self.validate_check_digit(corrected_vin):
            # Valid check digit - boost confidence
            return VinValidationResult(
                is_valid=True, vin=corrected_vin, confidence_adjustment=0.1
            )

        # Invalid check digit - could be OCR error or old VIN
        return VinValidationResult(
            is_valid=True,  # Still return as valid but with reduced confidence
            vin=corrected_vin,
            confidence_adjustment=-0.15,
            error="Check digit validation failed",
        )

    def extract_candidates(
        self, text: str, max_candidates: int = 5
    ) -> list[tuple[str, int, int]]:
        """
        Extract VIN candidates from raw OCR text.

        Args:
            text: Raw OCR text
            max_candidates: Maximum number of candidates to return; values
                below zero are treated as zero.

        Returns:
            List of (vin, start_pos, end_pos) tuples, where the positions are
            offsets into ``text``. Candidates with a valid check digit come
            first; ties keep their order of appearance.
        """
        candidates = []
        # Scan the original text (case-insensitively) rather than an
        # upper-cased copy so the reported offsets always index into ``text``.
        for match in self._CANDIDATE_PATTERN.finditer(text):
            corrected = self.correct_ocr_errors(match.group())
            # Only include if it could be a valid VIN after correction; the
            # modern pattern already enforces the 17-character length.
            if self.MODERN_VIN_PATTERN.match(corrected):
                candidates.append((corrected, match.start(), match.end()))

        # Stable sort: valid check digit (key False) before invalid (key True).
        candidates.sort(key=lambda item: not self.validate_check_digit(item[0]))
        return candidates[: max(max_candidates, 0)]
|
||||
|
||||
|
||||
# Singleton instance
|
||||
# Shared module-level instance, created once at import time; the class
# defines only class-level constants in this file, so it can be reused.
vin_validator = VinValidator()
|
||||
Reference in New Issue
Block a user