Files
motovaultpro/ocr/app/extractors/vin_extractor.py
Eric Gullickson d5696320f1
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 3m25s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 51s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m36s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 9s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
fix: align VIN OCR logging with unified logging design (refs #113)
Replace filesystem-based debug system (VIN_DEBUG_DIR) with standard
logger.debug() calls that flow through Loki when LOG_LEVEL=DEBUG.
Use .env.logging variable for OCR LOG_LEVEL. Increase image capture
quality to 0.95 for better OCR accuracy.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 19:36:35 -06:00

320 lines
11 KiB
Python

"""VIN-specific OCR extractor with preprocessing and validation."""
import io
import logging
import time
from dataclasses import dataclass, field
from typing import Optional
import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener
from app.config import settings
from app.extractors.base import BaseExtractor
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
from app.validators.vin_validator import vin_validator
# Register HEIF/HEIC opener
# (executed once at import time, before this module opens any image)
register_heif_opener()
# Module-level logger, named after this module's import path
logger = logging.getLogger(__name__)
@dataclass
class VinAlternative:
    """A runner-up VIN reading paired with its confidence score."""

    # The candidate VIN string
    vin: str
    # Score assigned to this candidate
    confidence: float
@dataclass
class VinExtractionResult:
    """Outcome of a single VIN extraction attempt.

    Only ``success`` is required; every other field has a default so that
    failure results can be built from just an error message and timing.
    """

    # True when a VIN was extracted
    success: bool
    # The selected VIN, if any
    vin: Optional[str] = None
    # Score of the selected VIN (defaults to 0.0 when nothing was selected)
    confidence: float = 0.0
    # Bounding box reported by preprocessing, if any
    bounding_box: Optional[BoundingBox] = None
    # Other candidates; a fresh list per instance (never shared)
    alternatives: list[VinAlternative] = field(default_factory=list)
    # Time spent on the extraction, in milliseconds
    processing_time_ms: int = 0
    # Human-readable failure reason, if any
    error: Optional[str] = None
    # Raw OCR text, when available
    raw_text: Optional[str] = None
class VinExtractor(BaseExtractor):
    """VIN-specific OCR extractor optimized for VIN plates and stickers.

    Pipeline used by :meth:`extract`:

    1. Check (or detect) the MIME type of the input.
    2. Run VIN preprocessing, then OCR with PSM 6 followed by the alternate
       PSM modes until one pass yields VIN candidates.
    3. If nothing was found, repeat step 2 on the Otsu-preprocessed image.
    4. Validate and score the candidates, pick a primary VIN and report the
       remaining ones as alternatives.
    """

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
    }

    # VIN character whitelist for Tesseract (the letters I, O and Q are absent)
    VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"

    # Fallback page segmentation modes, tried in order after the default PSM 6
    _ALTERNATE_PSM_MODES = (7, 8, 11, 13)

    # Maximum number of alternatives reported next to the primary VIN
    _MAX_ALTERNATIVES = 4

    def __init__(self) -> None:
        """Initialize VIN extractor.

        Assigns ``settings.tesseract_cmd`` to the module-level
        ``pytesseract.pytesseract.tesseract_cmd`` attribute.
        """
        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd

    def extract(
        self, image_bytes: bytes, content_type: Optional[str] = None
    ) -> VinExtractionResult:
        """
        Extract VIN from an image using optimized preprocessing and OCR.

        Errors raised during preprocessing, OCR or scoring are logged and
        reported through ``VinExtractionResult.error`` rather than raised.
        MIME detection runs before that guard, so an error raised by
        ``_detect_mime_type`` propagates to the caller.

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            content_type: MIME type (auto-detected if not provided)

        Returns:
            VinExtractionResult with extracted VIN and metadata. On success,
            ``raw_text`` and ``confidence`` come from the OCR pass that
            produced the VIN candidates.
        """
        # perf_counter is monotonic, so the elapsed time cannot become
        # negative or jump when the wall clock is adjusted mid-request.
        start_time = time.perf_counter()

        # Detect content type if not provided
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)

        # Validate content type
        if content_type not in self.SUPPORTED_TYPES:
            return VinExtractionResult(
                success=False,
                error=f"Unsupported file type: {content_type}",
                processing_time_ms=self._elapsed_ms(start_time),
            )

        try:
            logger.debug(
                "VIN extraction input: %d bytes, content_type=%s",
                len(image_bytes), content_type,
            )

            # Apply VIN-optimized preprocessing
            preprocessing_result = vin_preprocessor.preprocess(image_bytes)
            logger.debug(
                "Preprocessing steps: %s",
                preprocessing_result.preprocessing_applied,
            )

            # OCR the preprocessed image (PSM 6, then alternate PSM modes)
            candidates, raw_text, word_confidences = self._find_candidates(
                preprocessing_result.image_bytes
            )

            if not candidates:
                # Try alternative preprocessing (Otsu's thresholding)
                otsu_result = vin_preprocessor.preprocess_otsu(image_bytes)
                logger.debug(
                    "Otsu preprocessing steps: %s",
                    otsu_result.preprocessing_applied,
                )
                candidates, raw_text, word_confidences = self._find_candidates(
                    otsu_result.image_bytes, prefix="Otsu"
                )

            if not candidates:
                logger.debug("No VIN pattern found in any OCR attempt")
                return VinExtractionResult(
                    success=False,
                    error="No VIN pattern found in image",
                    raw_text=raw_text,
                    processing_time_ms=self._elapsed_ms(start_time),
                )

            # Validate and score candidates, best score first
            scored_candidates = self._score_candidates(
                candidates, word_confidences
            )
            primary_vin, primary_confidence = self._select_primary(
                scored_candidates
            )
            alternatives = self._build_alternatives(
                scored_candidates, primary_vin
            )

            processing_time_ms = self._elapsed_ms(start_time)
            logger.info(
                "VIN extraction: %s, confidence=%.2f%%, time=%dms",
                primary_vin, primary_confidence * 100, processing_time_ms,
            )
            logger.debug(
                "VIN alternatives: %s",
                [(a.vin, a.confidence) for a in alternatives],
            )

            return VinExtractionResult(
                success=True,
                vin=primary_vin,
                confidence=primary_confidence,
                bounding_box=preprocessing_result.bounding_box,
                alternatives=alternatives,
                processing_time_ms=processing_time_ms,
                raw_text=raw_text,
            )
        except Exception as e:
            # Boundary handler: report the failure in the result object
            # instead of raising, and log it together with the traceback.
            logger.exception("VIN extraction failed: %s", e)
            return VinExtractionResult(
                success=False,
                error=str(e),
                processing_time_ms=self._elapsed_ms(start_time),
            )

    @staticmethod
    def _elapsed_ms(start_time: float) -> int:
        """Return whole milliseconds elapsed since a perf_counter reading."""
        return int((time.perf_counter() - start_time) * 1000)

    def _find_candidates(
        self, image_bytes: bytes, prefix: str = ""
    ) -> tuple[list[tuple[str, int, int]], str, list[float]]:
        """
        Run PSM 6 OCR, then the alternate PSM modes, until candidates appear.

        Args:
            image_bytes: Preprocessed image bytes
            prefix: Optional label prepended to debug log messages

        Returns:
            Tuple of (candidates, raw_text, word_confidences) taken from the
            pass that produced the candidates. When no pass finds any, the
            candidates list is empty and the text/confidences are those of
            the PSM 6 pass.
        """
        tag = f"{prefix} " if prefix else ""

        raw_text, word_confidences = self._perform_ocr(image_bytes)
        logger.debug("%sPSM 6 raw text: '%s'", tag, raw_text)
        logger.debug("%sPSM 6 word confidences: %s", tag, word_confidences)
        candidates = vin_validator.extract_candidates(raw_text)
        logger.debug("%sPSM 6 candidates: %s", tag, candidates)
        if candidates:
            return candidates, raw_text, word_confidences

        # No VIN candidates found - try with different PSM modes
        alt_candidates, alt_text, alt_confidences = (
            self._try_alternate_ocr_detailed(image_bytes, prefix=prefix)
        )
        if alt_candidates:
            return alt_candidates, alt_text, alt_confidences
        return [], raw_text, word_confidences

    def _score_candidates(
        self,
        candidates: list[tuple[str, int, int]],
        word_confidences: list[float],
    ) -> list[tuple[str, float, bool]]:
        """
        Validate each candidate and attach a confidence score.

        Args:
            candidates: (vin, start, end) tuples from the validator
            word_confidences: Word confidences of the OCR pass that produced
                the candidates

        Returns:
            List of (vin, confidence, is_valid) sorted by confidence,
            highest first. Confidence is clamped to the range [0.0, 1.0].
        """
        # The OCR-derived base is the same for every candidate of one pass;
        # only the validator's adjustment differs between candidates.
        base_confidence = self._calculate_base_confidence(word_confidences)

        scored = []
        for vin, _start, _end in candidates:
            validation = vin_validator.validate(vin)
            adjusted = min(
                1.0,
                max(0.0, base_confidence + validation.confidence_adjustment),
            )
            scored.append((validation.vin, adjusted, validation.is_valid))

        scored.sort(key=lambda item: item[1], reverse=True)
        return scored

    @staticmethod
    def _select_primary(
        scored_candidates: list[tuple[str, float, bool]],
    ) -> tuple[str, float]:
        """
        Pick the primary VIN from a non-empty, confidence-sorted list.

        Returns:
            (vin, confidence) of the highest-confidence valid candidate, or
            of the highest-confidence candidate overall when none is valid.
        """
        for vin, confidence, is_valid in scored_candidates:
            if is_valid:
                return vin, confidence
        top_vin, top_confidence, _ = scored_candidates[0]
        return top_vin, top_confidence

    def _build_alternatives(
        self,
        scored_candidates: list[tuple[str, float, bool]],
        primary_vin: str,
    ) -> list[VinAlternative]:
        """
        Build the alternatives list (excluding primary).

        The primary is excluded by value rather than by list position: the
        primary is the first *valid* candidate and is therefore not always
        the first entry of the sorted list. Repeated VINs are reported once.

        Returns:
            At most ``_MAX_ALTERNATIVES`` alternatives, highest score first.
        """
        alternatives: list[VinAlternative] = []
        seen = {primary_vin}
        for vin, confidence, _ in scored_candidates:
            if vin in seen:
                continue
            seen.add(vin)
            alternatives.append(VinAlternative(vin=vin, confidence=confidence))
            if len(alternatives) >= self._MAX_ALTERNATIVES:
                break
        return alternatives

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic.

        Returns:
            The detected MIME type, or "application/octet-stream" when the
            detector returns an empty value.
        """
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _perform_ocr(
        self, image_bytes: bytes, psm: int = 6
    ) -> tuple[str, list[float]]:
        """
        Perform OCR with VIN-optimized settings.

        Args:
            image_bytes: Preprocessed image bytes
            psm: Tesseract page segmentation mode
                6 = Uniform block of text
                7 = Single text line
                8 = Single word

        Returns:
            Tuple of (raw_text, word_confidences). ``raw_text`` is the
            space-joined list of recognized words; confidences are scaled to
            the range 0-1 and only cover words with a confidence above 0.
        """
        # Configure Tesseract for VIN extraction
        # OEM 1 = LSTM neural network engine (best accuracy)
        # Disable dictionaries since VINs are not dictionary words
        config = (
            f"--psm {psm} "
            f"--oem 1 "
            f"-c tessedit_char_whitelist={self.VIN_WHITELIST} "
            f"-c load_system_dawg=false "
            f"-c load_freq_dawg=false"
        )

        # Get detailed OCR data; the context manager releases the image
        # as soon as Tesseract is done with it.
        with Image.open(io.BytesIO(image_bytes)) as image:
            ocr_data = pytesseract.image_to_data(
                image, config=config, output_type=pytesseract.Output.DICT
            )

        # Extract words and confidences
        words = []
        confidences = []
        for text, raw_conf in zip(ocr_data["text"], ocr_data["conf"]):
            word = str(text).strip()
            if not word:
                continue
            try:
                # The confidence may be an int, a float or a string such as
                # "96.0"; going through float() accepts all of these.
                conf = int(float(raw_conf))
            except (TypeError, ValueError, OverflowError):
                # Unparseable confidence: treat the word as unusable
                continue
            if conf > 0:
                words.append(word)
                confidences.append(conf / 100.0)

        raw_text = " ".join(words)
        return raw_text, confidences

    def _try_alternate_ocr_detailed(
        self,
        image_bytes: bytes,
        prefix: str = "",
    ) -> tuple[list[tuple[str, int, int]], str, list[float]]:
        """
        Try the alternate PSM modes and keep the details of the winning pass.

        Args:
            image_bytes: Preprocessed image bytes
            prefix: Optional label prepended to debug log messages

        Returns:
            Tuple of (candidates, raw_text, word_confidences) from the first
            mode that yields candidates, or ([], "", []) when none does.
        """
        tag = f"{prefix} " if prefix else ""
        for psm in self._ALTERNATE_PSM_MODES:
            raw_text, word_confidences = self._perform_ocr(image_bytes, psm=psm)
            logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text)
            candidates = vin_validator.extract_candidates(raw_text)
            logger.debug("%sPSM %d candidates: %s", tag, psm, candidates)
            if candidates:
                return candidates, raw_text, word_confidences
        return [], "", []

    def _try_alternate_ocr(
        self,
        image_bytes: bytes,
        prefix: str = "",
    ) -> list[tuple[str, int, int]]:
        """
        Try alternate OCR configurations when initial extraction fails.

        PSM modes tried in order:
            7 - Single text line
            8 - Single word
            11 - Sparse text (finds text in any order, good for angled photos)
            13 - Raw line (no Tesseract heuristics, good for clean VIN plates)

        Returns:
            List of VIN candidates
        """
        candidates, _, _ = self._try_alternate_ocr_detailed(
            image_bytes, prefix=prefix
        )
        return candidates

    def _calculate_base_confidence(self, word_confidences: list[float]) -> float:
        """Calculate base confidence from word confidences.

        Returns:
            0.5 when no word confidences are available; otherwise a blend of
            70% average and 30% minimum, so one poorly recognized word pulls
            the score down more than a plain average would.
        """
        if not word_confidences:
            return 0.5

        # Use average confidence, weighted slightly toward minimum
        avg_conf = sum(word_confidences) / len(word_confidences)
        min_conf = min(word_confidences)

        # Blend: 70% average, 30% minimum
        return 0.7 * avg_conf + 0.3 * min_conf

    def validate(self, data: str) -> bool:
        """
        Validate a VIN string.

        Args:
            data: VIN string to validate

        Returns:
            True if VIN is valid
        """
        result = vin_validator.validate(data)
        return result.is_valid
# Singleton instance
# Created at import time; VinExtractor.__init__ assigns
# pytesseract.pytesseract.tesseract_cmd from settings as a side effect.
vin_extractor = VinExtractor()