Files
motovaultpro/ocr/app/extractors/vin_extractor.py
Eric Gullickson ff3858f750
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 36s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 21s
Deploy to Staging / Verify Staging (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
fix: add debug image saving gated on LOG_LEVEL=debug (refs #113)
Save original, adaptive, and Otsu preprocessed images to
/tmp/vin-debug/{timestamp}/ when LOG_LEVEL is set to debug.
No images saved at info level. Volume mount added for access.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 20:26:06 -06:00

357 lines
12 KiB
Python

"""VIN-specific OCR extractor with preprocessing and validation."""
import io
import logging
import os
import time
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener
from app.config import settings
from app.extractors.base import BaseExtractor
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
from app.validators.vin_validator import vin_validator
# Register the HEIF/HEIC opener at import time; "image/heic" and "image/heif"
# are listed in VinExtractor.SUPPORTED_TYPES and are opened through PIL below.
register_heif_opener()
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
@dataclass
class VinAlternative:
    """A secondary VIN candidate reported alongside the primary result."""

    # Candidate VIN string as returned by the validator.
    vin: str
    # Score for this candidate; the extractor clamps it to the range 0.0-1.0.
    confidence: float
@dataclass
class VinExtractionResult:
    """Outcome of one VIN extraction attempt, successful or not."""

    # True when a VIN was selected; False for unsupported input, no match,
    # or an exception during processing.
    success: bool
    # The selected VIN, or None when extraction failed.
    vin: Optional[str] = None
    # Score of the selected VIN (0.0 when nothing was selected).
    confidence: float = 0.0
    # Bounding box reported by the preprocessing step, if any.
    bounding_box: Optional[BoundingBox] = None
    # Other scored candidates; a fresh list is created per instance.
    alternatives: list[VinAlternative] = field(default_factory=list)
    # Elapsed processing time in whole milliseconds.
    processing_time_ms: int = 0
    # Human-readable failure reason; None on success.
    error: Optional[str] = None
    # OCR text the candidates were searched in, when available.
    raw_text: Optional[str] = None
class VinExtractor(BaseExtractor):
    """VIN-specific OCR extractor optimized for VIN plates and stickers.

    ``extract`` runs up to four OCR stages and stops at the first one that
    yields VIN candidates:

    1. default preprocessing + Tesseract PSM 6
    2. default preprocessing + alternate PSM modes (7, 8, 11, 13)
    3. Otsu preprocessing + Tesseract PSM 6
    4. Otsu preprocessing + alternate PSM modes

    When the configured log level is ``debug`` the original and preprocessed
    images are additionally written to a timestamped directory below
    ``DEBUG_DIR``. Saving debug images is best-effort and never fails an
    extraction.
    """

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
    }

    # VIN character whitelist for Tesseract (the letters I, O and Q are omitted)
    VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"

    # Fixed debug output directory (inside container)
    DEBUG_DIR = "/tmp/vin-debug"

    # Page segmentation modes tried, in order, after the default PSM 6 pass
    # produced no candidates.
    _ALTERNATE_PSM_MODES = (7, 8, 11, 13)

    # Upper bound on the number of non-primary candidates returned.
    _MAX_ALTERNATIVES = 4

    def __init__(self) -> None:
        """Initialize VIN extractor.

        Points pytesseract at the configured Tesseract binary and caches
        whether debug image saving is enabled (``LOG_LEVEL=debug``).
        """
        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
        self._debug = settings.log_level.upper() == "DEBUG"

    @staticmethod
    def _elapsed_ms(start: float) -> int:
        """Return whole milliseconds elapsed since ``start``.

        ``start`` must come from ``time.perf_counter()``; a monotonic clock is
        used so the duration cannot go negative when the wall clock is adjusted.
        """
        return int((time.perf_counter() - start) * 1000)

    def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
        """Save image bytes to the debug session directory when LOG_LEVEL=debug.

        Args:
            session_dir: Directory returned by ``_create_debug_session``
            name: File name to create inside ``session_dir``
            data: Raw bytes to write unmodified

        I/O errors are logged as warnings and swallowed: a failing debug
        side-channel must not turn a working extraction into an error.
        """
        if not self._debug:
            return
        path = os.path.join(session_dir, name)
        try:
            with open(path, "wb") as f:
                f.write(data)
        except OSError as exc:
            logger.warning("Could not save debug image %s: %s", path, exc)
            return
        logger.debug("Saved debug image: %s (%d bytes)", name, len(data))

    def _create_debug_session(self) -> Optional[str]:
        """Create a timestamped debug directory.

        Returns:
            Path of the created directory, or None when debug saving is
            disabled or the directory could not be created.
        """
        if not self._debug:
            return None
        ts = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        session_dir = os.path.join(self.DEBUG_DIR, ts)
        try:
            os.makedirs(session_dir, exist_ok=True)
        except OSError as exc:
            # Best-effort: continue the extraction without debug artifacts.
            logger.warning(
                "Could not create debug directory %s: %s", session_dir, exc
            )
            return None
        return session_dir

    def extract(
        self, image_bytes: bytes, content_type: Optional[str] = None
    ) -> VinExtractionResult:
        """
        Extract VIN from an image using optimized preprocessing and OCR.

        The reported ``confidence`` and ``raw_text`` always belong to the OCR
        pass that produced the candidates. When no pass finds a candidate,
        ``raw_text`` is the text of the Otsu PSM 6 pass.

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            content_type: MIME type (auto-detected if not provided)

        Returns:
            VinExtractionResult with extracted VIN and metadata. Errors raised
            during preprocessing, OCR or scoring are caught and reported via
            ``success=False`` and ``error``; errors from MIME detection
            propagate to the caller.
        """
        start_time = time.perf_counter()

        # Detect content type if not provided
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)

        # Validate content type
        if content_type not in self.SUPPORTED_TYPES:
            return VinExtractionResult(
                success=False,
                error=f"Unsupported file type: {content_type}",
                processing_time_ms=self._elapsed_ms(start_time),
            )

        try:
            debug_session = self._create_debug_session()
            logger.debug(
                "VIN extraction input: %d bytes, content_type=%s",
                len(image_bytes), content_type,
            )
            if debug_session:
                # NOTE: the name always ends in .jpg, even for PNG/HEIC input;
                # the bytes are written exactly as received.
                self._save_debug_image(debug_session, "01_original.jpg", image_bytes)

            # Apply VIN-optimized preprocessing
            preprocessing_result = vin_preprocessor.preprocess(image_bytes)
            preprocessed_bytes = preprocessing_result.image_bytes
            logger.debug(
                "Preprocessing steps: %s", preprocessing_result.preprocessing_applied
            )
            if debug_session:
                self._save_debug_image(
                    debug_session, "02_preprocessed_adaptive.png", preprocessed_bytes
                )

            # Stage 1: OCR with the default PSM 6
            raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
            logger.debug("PSM 6 raw text: '%s'", raw_text)
            logger.debug("PSM 6 word confidences: %s", word_confidences)
            candidates = vin_validator.extract_candidates(raw_text)
            logger.debug("PSM 6 candidates: %s", candidates)

            if not candidates:
                # Stage 2: same image, different PSM modes
                candidates, alt_text, alt_confidences = self._scan_alternate_psm(
                    preprocessed_bytes
                )
                if candidates:
                    # Score against the pass that actually found the VIN.
                    raw_text, word_confidences = alt_text, alt_confidences

            if not candidates:
                # Stage 3: alternative preprocessing (Otsu's thresholding)
                otsu_result = vin_preprocessor.preprocess_otsu(image_bytes)
                otsu_bytes = otsu_result.image_bytes
                logger.debug(
                    "Otsu preprocessing steps: %s",
                    otsu_result.preprocessing_applied,
                )
                if debug_session:
                    self._save_debug_image(
                        debug_session, "03_preprocessed_otsu.png", otsu_bytes
                    )
                raw_text, word_confidences = self._perform_ocr(otsu_bytes)
                logger.debug("Otsu PSM 6 raw text: '%s'", raw_text)
                candidates = vin_validator.extract_candidates(raw_text)
                logger.debug("Otsu PSM 6 candidates: %s", candidates)

                if not candidates:
                    # Stage 4: Otsu image, different PSM modes
                    candidates, alt_text, alt_confidences = self._scan_alternate_psm(
                        otsu_bytes, prefix="Otsu"
                    )
                    if candidates:
                        raw_text, word_confidences = alt_text, alt_confidences

            if not candidates:
                logger.debug("No VIN pattern found in any OCR attempt")
                return VinExtractionResult(
                    success=False,
                    error="No VIN pattern found in image",
                    raw_text=raw_text,
                    processing_time_ms=self._elapsed_ms(start_time),
                )

            # Validate and score candidates (sorted by confidence, best first)
            scored_candidates = self._score_candidates(candidates, word_confidences)

            # Primary result is the highest confidence valid candidate; if no
            # candidate is valid, fall back to the highest confidence one.
            primary_index = next(
                (
                    index
                    for index, (_, _, is_valid) in enumerate(scored_candidates)
                    if is_valid
                ),
                0,
            )
            primary_vin, primary_confidence, _ = scored_candidates[primary_index]

            # Build alternatives list: every candidate except the primary one.
            # The primary is skipped by position because it is not necessarily
            # the first entry of the sorted list.
            alternatives = [
                VinAlternative(vin=vin, confidence=conf)
                for index, (vin, conf, _) in enumerate(scored_candidates)
                if index != primary_index
            ][: self._MAX_ALTERNATIVES]

            processing_time_ms = self._elapsed_ms(start_time)
            logger.info(
                "VIN extraction: %s, confidence=%.2f%%, time=%dms",
                primary_vin, primary_confidence * 100, processing_time_ms,
            )
            logger.debug(
                "VIN alternatives: %s",
                [(a.vin, a.confidence) for a in alternatives],
            )
            return VinExtractionResult(
                success=True,
                vin=primary_vin,
                confidence=primary_confidence,
                # The box always comes from the first preprocessing pass, even
                # when the candidates were found on the Otsu image.
                bounding_box=preprocessing_result.bounding_box,
                alternatives=alternatives,
                processing_time_ms=processing_time_ms,
                raw_text=raw_text,
            )
        except Exception as e:
            # Top-level boundary: report the failure in the result instead of
            # raising, with the traceback in the log.
            logger.error("VIN extraction failed: %s", e, exc_info=True)
            return VinExtractionResult(
                success=False,
                error=str(e),
                processing_time_ms=self._elapsed_ms(start_time),
            )

    def _score_candidates(
        self,
        candidates: list[tuple[str, int, int]],
        word_confidences: list[float],
    ) -> list[tuple[str, float, bool]]:
        """Validate candidates and score them.

        Args:
            candidates: ``(vin, start, end)`` tuples from the validator; only
                the VIN string is used here
            word_confidences: Word confidences of the OCR pass that produced
                the candidates

        Returns:
            ``(vin, confidence, is_valid)`` tuples sorted by confidence,
            highest first. Confidence is the shared base confidence plus the
            validator's adjustment, clamped to [0.0, 1.0].
        """
        # The base confidence depends only on the OCR pass, not the candidate.
        base_confidence = self._calculate_base_confidence(word_confidences)
        scored = []
        for vin, _start, _end in candidates:
            validation = vin_validator.validate(vin)
            adjusted_confidence = min(
                1.0, max(0.0, base_confidence + validation.confidence_adjustment)
            )
            scored.append((validation.vin, adjusted_confidence, validation.is_valid))
        scored.sort(key=lambda item: item[1], reverse=True)
        return scored

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic.

        Returns:
            The detected MIME type, or "application/octet-stream" when
            detection yields an empty result.
        """
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _perform_ocr(
        self, image_bytes: bytes, psm: int = 6
    ) -> tuple[str, list[float]]:
        """
        Perform OCR with VIN-optimized settings.

        Args:
            image_bytes: Preprocessed image bytes
            psm: Tesseract page segmentation mode
                6 = Uniform block of text
                7 = Single text line
                8 = Single word

        Returns:
            Tuple of (raw_text, word_confidences). ``raw_text`` is the
            recognized words joined by single spaces; ``word_confidences``
            holds one value in (0.0, 1.0] per word. Entries with empty text,
            a non-positive confidence or an unparsable confidence are skipped.
        """
        # Configure Tesseract for VIN extraction
        # OEM 1 = LSTM neural network engine (best accuracy)
        # Disable dictionaries since VINs are not dictionary words
        config = (
            f"--psm {psm} "
            f"--oem 1 "
            f"-c tessedit_char_whitelist={self.VIN_WHITELIST} "
            f"-c load_system_dawg=false "
            f"-c load_freq_dawg=false"
        )

        # Get detailed OCR data; the context manager releases the image once
        # Tesseract is done with it.
        with Image.open(io.BytesIO(image_bytes)) as image:
            ocr_data = pytesseract.image_to_data(
                image, config=config, output_type=pytesseract.Output.DICT
            )

        # Extract words and confidences
        words = []
        confidences = []
        # .get() guards against an empty result dict (no rows in the output).
        for text, raw_conf in zip(ocr_data.get("text", []), ocr_data.get("conf", [])):
            word = text.strip()
            if not word:
                continue
            try:
                # The confidence may arrive as int, float or a numeric string
                # (possibly fractional), so go through float() first.
                conf = int(float(raw_conf))
            except (TypeError, ValueError):
                continue
            if conf > 0:
                words.append(word)
                confidences.append(conf / 100.0)

        raw_text = " ".join(words)
        return raw_text, confidences

    def _scan_alternate_psm(
        self,
        image_bytes: bytes,
        prefix: str = "",
    ) -> tuple[list[tuple[str, int, int]], str, list[float]]:
        """Run OCR with the alternate PSM modes until one yields candidates.

        Args:
            image_bytes: Preprocessed image bytes
            prefix: Optional label prepended to the debug log lines

        Returns:
            Tuple of (candidates, raw_text, word_confidences) for the first
            PSM mode that produced candidates, or ``([], "", [])`` when none
            did.
        """
        tag = f"{prefix} " if prefix else ""
        for psm in self._ALTERNATE_PSM_MODES:
            raw_text, word_confidences = self._perform_ocr(image_bytes, psm=psm)
            logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text)
            candidates = vin_validator.extract_candidates(raw_text)
            logger.debug("%sPSM %d candidates: %s", tag, psm, candidates)
            if candidates:
                return candidates, raw_text, word_confidences
        return [], "", []

    def _try_alternate_ocr(
        self,
        image_bytes: bytes,
        prefix: str = "",
    ) -> list[tuple[str, int, int]]:
        """
        Try alternate OCR configurations when initial extraction fails.

        PSM modes tried in order:
            7 - Single text line
            8 - Single word
            11 - Sparse text (finds text in any order, good for angled photos)
            13 - Raw line (no Tesseract heuristics, good for clean VIN plates)

        Returns:
            List of VIN candidates (empty when no mode found any)
        """
        candidates, _raw_text, _confidences = self._scan_alternate_psm(
            image_bytes, prefix=prefix
        )
        return candidates

    def _calculate_base_confidence(self, word_confidences: list[float]) -> float:
        """Calculate base confidence from word confidences.

        Returns:
            0.5 when no word confidences are available, otherwise a blend of
            70% average and 30% minimum confidence.
        """
        if not word_confidences:
            return 0.5
        # Use average confidence, weighted slightly toward minimum
        avg_conf = sum(word_confidences) / len(word_confidences)
        min_conf = min(word_confidences)
        # Blend: 70% average, 30% minimum
        return 0.7 * avg_conf + 0.3 * min_conf

    def validate(self, data: str) -> bool:
        """
        Validate a VIN string.

        Args:
            data: VIN string to validate

        Returns:
            True if VIN is valid
        """
        result = vin_validator.validate(data)
        return result.is_valid
# Singleton instance, created when this module is imported. Construction runs
# VinExtractor.__init__, which sets pytesseract's tesseract_cmd from settings
# and reads settings.log_level once to decide whether debug images are saved.
vin_extractor = VinExtractor()