Files
motovaultpro/ocr/app/extractors/vin_extractor.py
Eric Gullickson 6a4c2137f7
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 35s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 51s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m31s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
fix: resolve VIN OCR scanning failures on all images (refs #113)
Root cause: Tesseract fragments VINs into multiple words but candidate
extraction required continuous 17-char sequences, rejecting all results.

Changes:
- Fix candidate extraction to concatenate adjacent OCR fragments
- Disable Tesseract dictionaries (VINs are not dictionary words)
- Set OEM 1 (LSTM engine) for better accuracy
- Add PSM 11 (sparse text) and PSM 13 (raw line) fallback modes
- Add Otsu's thresholding as alternative preprocessing pipeline
- Upscale small images to meet Tesseract's 300 DPI requirement
- Remove incorrect B->8 and S->5 transliterations (valid VIN chars)
- Fix pre-existing test bug in check digit expected value

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 15:57:14 -06:00

288 lines
9.4 KiB
Python

"""VIN-specific OCR extractor with preprocessing and validation."""
import io
import logging
import time
from dataclasses import dataclass, field
from typing import Optional
import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener
from app.config import settings
from app.extractors.base import BaseExtractor
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
from app.validators.vin_validator import vin_validator
# Register the HEIF/HEIC opener from pillow_heif at import time so Pillow can
# open HEIF/HEIC images.
register_heif_opener()
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
@dataclass
class VinAlternative:
    """A runner-up VIN reading paired with the confidence assigned to it."""

    # Candidate VIN text.
    vin: str
    # Confidence score for this candidate.
    confidence: float
@dataclass
class VinExtractionResult:
    """Outcome of one VIN extraction attempt.

    Only ``success`` is required; every other field has a default so that
    failure results can be built from just an error message and timing.
    """

    # Whether a VIN candidate was produced.
    success: bool

    # Primary reading and its score.
    vin: Optional[str] = None
    confidence: float = 0.0

    # Bounding box reported by the preprocessing step, when available.
    bounding_box: Optional[BoundingBox] = None

    # Other candidate readings; a fresh list is created per instance.
    alternatives: list[VinAlternative] = field(default_factory=list)

    # Diagnostics: elapsed time, failure message, and the OCR text.
    processing_time_ms: int = 0
    error: Optional[str] = None
    raw_text: Optional[str] = None
class VinExtractor(BaseExtractor):
    """VIN-specific OCR extractor optimized for VIN plates and stickers.

    The extraction pipeline is:

    1. Validate (or sniff) the MIME type.
    2. Preprocess the image and OCR it with PSM 6, then with the fallback
       page segmentation modes, stopping at the first pass that yields a
       VIN candidate.
    3. If nothing was found, repeat step 2 on an Otsu-thresholded image.
    4. Validate and score every candidate, pick a primary VIN and report
       the remaining distinct candidates as alternatives.
    """

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
    }

    # VIN character whitelist for Tesseract (the letters I, O and Q are
    # deliberately absent from this set).
    VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"

    # Page segmentation modes tried, in order, after the default PSM 6 fails.
    FALLBACK_PSM_MODES = (7, 8, 11, 13)

    # Maximum number of alternative candidates reported next to the primary.
    MAX_ALTERNATIVES = 4

    def __init__(self) -> None:
        """Initialize VIN extractor.

        Points pytesseract at the Tesseract binary configured in settings.
        Note that this assigns a pytesseract module-level attribute.
        """
        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd

    @staticmethod
    def _elapsed_ms(start_time: float) -> int:
        """Return whole milliseconds elapsed since ``start_time``.

        Args:
            start_time: A value previously obtained from time.perf_counter()
        """
        return int((time.perf_counter() - start_time) * 1000)

    def extract(
        self, image_bytes: bytes, content_type: Optional[str] = None
    ) -> VinExtractionResult:
        """
        Extract VIN from an image using optimized preprocessing and OCR.

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            content_type: MIME type (auto-detected if not provided)

        Returns:
            VinExtractionResult with extracted VIN and metadata. Errors
            raised during preprocessing, OCR or scoring are logged and
            reported as a failed result rather than propagated.
        """
        # perf_counter is monotonic, so the reported duration cannot be
        # skewed by wall-clock adjustments.
        start_time = time.perf_counter()

        # Detect content type if not provided
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)

        # Validate content type
        if content_type not in self.SUPPORTED_TYPES:
            return VinExtractionResult(
                success=False,
                error=f"Unsupported file type: {content_type}",
                processing_time_ms=self._elapsed_ms(start_time),
            )

        try:
            # Apply VIN-optimized preprocessing
            preprocessing_result = vin_preprocessor.preprocess(image_bytes)

            # OCR with the default mode, then the fallback PSM modes
            candidates, raw_text, word_confidences = self._ocr_with_fallbacks(
                preprocessing_result.image_bytes
            )

            if not candidates:
                # Try alternative preprocessing (Otsu's thresholding)
                otsu_result = vin_preprocessor.preprocess_otsu(image_bytes)
                candidates, raw_text, word_confidences = (
                    self._ocr_with_fallbacks(otsu_result.image_bytes)
                )

            if not candidates:
                return VinExtractionResult(
                    success=False,
                    error="No VIN pattern found in image",
                    raw_text=raw_text,
                    processing_time_ms=self._elapsed_ms(start_time),
                )

            # Validate and score candidates, best score first
            scored_candidates = self._score_candidates(
                candidates, word_confidences
            )

            # Primary result is the highest confidence valid candidate; if
            # no candidate is valid, fall back to the highest confidence one.
            primary_index = next(
                (
                    index
                    for index, (_, _, is_valid) in enumerate(scored_candidates)
                    if is_valid
                ),
                0,
            )
            primary_vin, primary_confidence, _ = scored_candidates[primary_index]

            alternatives = self._build_alternatives(
                scored_candidates, primary_vin
            )

            processing_time_ms = self._elapsed_ms(start_time)
            logger.info(
                f"VIN extraction: {primary_vin}, confidence={primary_confidence:.2%}, "
                f"time={processing_time_ms}ms"
            )

            # NOTE(review): the bounding box always comes from the primary
            # preprocessing pass, even when the Otsu pass produced the VIN.
            return VinExtractionResult(
                success=True,
                vin=primary_vin,
                confidence=primary_confidence,
                bounding_box=preprocessing_result.bounding_box,
                alternatives=alternatives,
                processing_time_ms=processing_time_ms,
                raw_text=raw_text,
            )
        except Exception as e:
            # Top-level boundary: log with traceback and report a failure.
            logger.exception("VIN extraction failed: %s", e)
            return VinExtractionResult(
                success=False,
                error=str(e),
                processing_time_ms=self._elapsed_ms(start_time),
            )

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic.

        Returns:
            The detected MIME type, or "application/octet-stream" when
            detection yields an empty result.
        """
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _perform_ocr(
        self, image_bytes: bytes, psm: int = 6
    ) -> tuple[str, list[float]]:
        """
        Perform OCR with VIN-optimized settings.

        Args:
            image_bytes: Preprocessed image bytes
            psm: Tesseract page segmentation mode
                6 = Uniform block of text
                7 = Single text line
                8 = Single word

        Returns:
            Tuple of (raw_text, word_confidences). raw_text is the
            space-joined recognized words; word_confidences holds one value
            in the 0-1 range per kept word. Words that are blank or have a
            non-positive confidence are dropped.
        """
        # Configure Tesseract for VIN extraction
        # OEM 1 = LSTM neural network engine (best accuracy)
        # Disable dictionaries since VINs are not dictionary words
        config = (
            f"--psm {psm} "
            f"--oem 1 "
            f"-c tessedit_char_whitelist={self.VIN_WHITELIST} "
            f"-c load_system_dawg=false "
            f"-c load_freq_dawg=false"
        )

        # Get detailed OCR data; the context manager releases the image
        # once Tesseract has consumed it.
        with Image.open(io.BytesIO(image_bytes)) as image:
            ocr_data = pytesseract.image_to_data(
                image, config=config, output_type=pytesseract.Output.DICT
            )

        # Extract words and confidences
        words = []
        confidences = []
        for text, raw_conf in zip(ocr_data["text"], ocr_data["conf"]):
            # Confidence may arrive as an int, a float, or a string such as
            # "96.48"; going through float() accepts all of them while
            # keeping the original whole-percent truncation.
            conf = int(float(raw_conf))
            word = text.strip()
            if word and conf > 0:
                words.append(word)
                confidences.append(conf / 100.0)

        return " ".join(words), confidences

    def _scan_alternate_modes(
        self, image_bytes: bytes
    ) -> tuple[list[tuple[str, int, int]], str, list[float]]:
        """Run the fallback PSM modes until one yields VIN candidates.

        Returns:
            Tuple of (candidates, raw_text, word_confidences) taken from the
            first pass that produced candidates, or ([], "", []) when no
            fallback mode found any.
        """
        for psm in self.FALLBACK_PSM_MODES:
            raw_text, confidences = self._perform_ocr(image_bytes, psm=psm)
            candidates = vin_validator.extract_candidates(raw_text)
            if candidates:
                return candidates, raw_text, confidences
        return [], "", []

    def _ocr_with_fallbacks(
        self, image_bytes: bytes
    ) -> tuple[list[tuple[str, int, int]], str, list[float]]:
        """OCR an image with PSM 6 first, then with the fallback modes.

        The text and confidences returned always belong to the pass that
        produced the candidates, so scoring reflects the reading actually
        used. When no pass finds a candidate, the PSM 6 output is returned
        for diagnostics.

        Returns:
            Tuple of (candidates, raw_text, word_confidences)
        """
        default_text, default_confidences = self._perform_ocr(image_bytes)
        candidates = vin_validator.extract_candidates(default_text)
        if candidates:
            return candidates, default_text, default_confidences

        # No VIN candidates found - try with different PSM modes
        candidates, alt_text, alt_confidences = self._scan_alternate_modes(
            image_bytes
        )
        if candidates:
            return candidates, alt_text, alt_confidences
        return [], default_text, default_confidences

    def _try_alternate_ocr(self, image_bytes: bytes) -> list[tuple[str, int, int]]:
        """
        Try alternate OCR configurations when initial extraction fails.

        PSM modes tried in order:
            7 - Single text line
            8 - Single word
            11 - Sparse text (finds text in any order, good for angled photos)
            13 - Raw line (no Tesseract heuristics, good for clean VIN plates)

        Returns:
            List of VIN candidates
        """
        candidates, _, _ = self._scan_alternate_modes(image_bytes)
        return candidates

    def _score_candidates(
        self,
        candidates: list[tuple[str, int, int]],
        word_confidences: list[float],
    ) -> list[tuple[str, float, bool]]:
        """Validate candidates and attach a clamped confidence to each.

        Args:
            candidates: (vin, start, end) tuples from the VIN validator
            word_confidences: Per-word OCR confidences of the source pass

        Returns:
            (vin, confidence, is_valid) tuples sorted by confidence,
            highest first. The sort is stable, so ties keep input order.
        """
        # The OCR-derived base is the same for every candidate of a pass.
        base_confidence = self._calculate_base_confidence(word_confidences)

        scored = []
        for vin, _start_pos, _end_pos in candidates:
            validation = vin_validator.validate(vin)
            adjusted_confidence = min(
                1.0, max(0.0, base_confidence + validation.confidence_adjustment)
            )
            scored.append(
                (validation.vin, adjusted_confidence, validation.is_valid)
            )

        scored.sort(key=lambda item: item[1], reverse=True)
        return scored

    def _build_alternatives(
        self,
        scored_candidates: list[tuple[str, float, bool]],
        primary_vin: Optional[str],
    ) -> list[VinAlternative]:
        """Collect up to MAX_ALTERNATIVES distinct non-primary candidates.

        Candidates are taken in score order. The primary VIN and repeated
        VINs are skipped so the primary never appears among its own
        alternatives, wherever it ranked.
        """
        seen = {primary_vin}
        alternatives: list[VinAlternative] = []
        for vin, confidence, _ in scored_candidates:
            if vin in seen:
                continue
            seen.add(vin)
            alternatives.append(VinAlternative(vin=vin, confidence=confidence))
            if len(alternatives) == self.MAX_ALTERNATIVES:
                break
        return alternatives

    def _calculate_base_confidence(self, word_confidences: list[float]) -> float:
        """Calculate base confidence from word confidences.

        Returns:
            0.5 when no word confidences are available; otherwise a blend
            of 70% of the average and 30% of the minimum confidence.
        """
        if not word_confidences:
            return 0.5

        # Use average confidence, weighted slightly toward minimum
        avg_conf = sum(word_confidences) / len(word_confidences)
        min_conf = min(word_confidences)

        # Blend: 70% average, 30% minimum
        return 0.7 * avg_conf + 0.3 * min_conf

    def validate(self, data: str) -> bool:
        """
        Validate a VIN string.

        Args:
            data: VIN string to validate

        Returns:
            True if VIN is valid
        """
        result = vin_validator.validate(data)
        return result.is_valid
# Singleton instance, created at import time. Constructing it runs
# VinExtractor.__init__, which sets pytesseract's tesseract_cmd from settings.
vin_extractor = VinExtractor()