"""VIN-specific OCR extractor with preprocessing and validation."""

import logging
import os
import time
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional

import magic
from pillow_heif import register_heif_opener

from app.config import settings
from app.engines import OcrConfig, create_engine
from app.extractors.base import BaseExtractor
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
from app.validators.vin_validator import vin_validator

# Register HEIF/HEIC opener
register_heif_opener()

logger = logging.getLogger(__name__)


@dataclass
class VinAlternative:
    """Alternative VIN candidate with confidence."""

    vin: str
    confidence: float


@dataclass
class VinExtractionResult:
    """Result of VIN extraction."""

    success: bool
    vin: Optional[str] = None
    confidence: float = 0.0
    bounding_box: Optional[BoundingBox] = None
    alternatives: list[VinAlternative] = field(default_factory=list)
    processing_time_ms: int = 0
    error: Optional[str] = None
    raw_text: Optional[str] = None


class VinExtractor(BaseExtractor):
    """VIN-specific OCR extractor optimized for VIN plates and stickers."""

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
    }

    # VIN character whitelist (passed to engine for post-OCR filtering).
    # The letters I, O and Q are intentionally absent from this string.
    VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"

    # Fixed debug output directory (inside container)
    DEBUG_DIR = "/tmp/vin-debug"

    # Maximum number of non-primary candidates reported per extraction
    MAX_ALTERNATIVES = 4

    def __init__(self) -> None:
        """Initialize VIN extractor with engine from factory."""
        self._engine = create_engine()
        self._debug = settings.log_level.upper() == "DEBUG"

    @staticmethod
    def _elapsed_ms(start: float) -> int:
        """Return whole milliseconds elapsed since ``start``.

        Args:
            start: A value previously obtained from ``time.perf_counter()``.
        """
        # perf_counter is monotonic, so the result cannot go negative or jump
        # when the system clock is adjusted mid-request.
        return int((time.perf_counter() - start) * 1000)

    def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
        """Save image bytes to the debug session directory when LOG_LEVEL=debug.

        Writing is best-effort: an I/O failure is logged and swallowed so that
        a debugging aid can never turn a working extraction into a failed one.

        Args:
            session_dir: Directory returned by ``_create_debug_session``.
            name: File name to create inside ``session_dir``.
            data: Raw bytes to write.
        """
        if not self._debug:
            return
        path = os.path.join(session_dir, name)
        try:
            with open(path, "wb") as f:
                f.write(data)
        except OSError as exc:
            logger.warning("Could not save debug image %s: %s", path, exc)
            return
        logger.debug("Saved debug image: %s (%d bytes)", name, len(data))

    def _create_debug_session(self) -> Optional[str]:
        """Create a timestamped debug directory. Returns path or None.

        None is returned when debug mode is off or when the directory cannot
        be created (the failure is logged, not raised).
        """
        if not self._debug:
            return None
        ts = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        session_dir = os.path.join(self.DEBUG_DIR, ts)
        try:
            os.makedirs(session_dir, exist_ok=True)
        except OSError as exc:
            logger.warning(
                "Could not create debug directory %s: %s", session_dir, exc
            )
            return None
        return session_dir

    def extract(
        self, image_bytes: bytes, content_type: Optional[str] = None
    ) -> VinExtractionResult:
        """
        Extract VIN from an image using optimized preprocessing and OCR.

        OCR is attempted on up to three preprocessed variants of the image, in
        order: the default preprocessing, grayscale without thresholding, and
        Otsu thresholding. For each variant the default OCR mode is tried
        first, then the alternate modes. The first attempt that yields any VIN
        candidates wins, and the reported raw text and confidence come from
        that same attempt.

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            content_type: MIME type (auto-detected if not provided)

        Returns:
            VinExtractionResult with extracted VIN and metadata. Failures
            raised during preprocessing, OCR or validation are reported via
            ``success=False`` and ``error`` rather than propagated.
        """
        start_time = time.perf_counter()

        # Detect content type if not provided
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)

        # Validate content type
        if content_type not in self.SUPPORTED_TYPES:
            return VinExtractionResult(
                success=False,
                error=f"Unsupported file type: {content_type}",
                processing_time_ms=self._elapsed_ms(start_time),
            )

        try:
            debug_session = self._create_debug_session()

            logger.debug(
                "VIN extraction input: %d bytes, content_type=%s",
                len(image_bytes),
                content_type,
            )
            if debug_session:
                self._save_debug_image(debug_session, "01_original.jpg", image_bytes)

            # Apply VIN-optimized preprocessing
            preprocessing_result = vin_preprocessor.preprocess(image_bytes)
            preprocessed_bytes = preprocessing_result.image_bytes
            logger.debug(
                "Preprocessing steps: %s", preprocessing_result.preprocessing_applied
            )
            if debug_session:
                self._save_debug_image(
                    debug_session, "02_preprocessed_adaptive.png", preprocessed_bytes
                )

            # Perform OCR with VIN-optimized settings
            raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
            logger.debug("Primary OCR raw text: '%s'", raw_text)
            logger.debug("Primary OCR word confidences: %s", word_confidences)

            # Extract VIN candidates from raw text
            candidates = vin_validator.extract_candidates(raw_text)
            logger.debug("Primary OCR candidates: %s", candidates)

            if not candidates:
                # No VIN candidates found - try alternate OCR configurations.
                # On success, adopt the text/confidences of the pass that
                # actually produced the candidates.
                alt = self._run_alternate_ocr(preprocessed_bytes)
                if alt[0]:
                    candidates, raw_text, word_confidences = alt

            if not candidates:
                # Try grayscale-only (no thresholding) — OCR engines often
                # perform better on non-binarized input because they do
                # their own internal preprocessing.
                gray_result = vin_preprocessor.preprocess(
                    image_bytes, apply_threshold=False
                )
                logger.debug(
                    "Grayscale preprocessing steps: %s",
                    gray_result.preprocessing_applied,
                )
                if debug_session:
                    self._save_debug_image(
                        debug_session,
                        "04_preprocessed_gray.png",
                        gray_result.image_bytes,
                    )

                raw_text, word_confidences = self._perform_ocr(
                    gray_result.image_bytes
                )
                logger.debug("Gray primary raw text: '%s'", raw_text)
                candidates = vin_validator.extract_candidates(raw_text)
                logger.debug("Gray primary candidates: %s", candidates)
                if not candidates:
                    alt = self._run_alternate_ocr(
                        gray_result.image_bytes, prefix="Gray"
                    )
                    if alt[0]:
                        candidates, raw_text, word_confidences = alt

            if not candidates:
                # Try alternative preprocessing (Otsu's thresholding)
                otsu_result = vin_preprocessor.preprocess_otsu(image_bytes)
                logger.debug(
                    "Otsu preprocessing steps: %s",
                    otsu_result.preprocessing_applied,
                )
                if debug_session:
                    self._save_debug_image(
                        debug_session,
                        "03_preprocessed_otsu.png",
                        otsu_result.image_bytes,
                    )

                raw_text, word_confidences = self._perform_ocr(
                    otsu_result.image_bytes
                )
                logger.debug("Otsu primary raw text: '%s'", raw_text)
                candidates = vin_validator.extract_candidates(raw_text)
                logger.debug("Otsu primary candidates: %s", candidates)
                if not candidates:
                    alt = self._run_alternate_ocr(
                        otsu_result.image_bytes, prefix="Otsu"
                    )
                    if alt[0]:
                        candidates, raw_text, word_confidences = alt

            if not candidates:
                logger.debug("No VIN pattern found in any OCR attempt")
                return VinExtractionResult(
                    success=False,
                    error="No VIN pattern found in image",
                    raw_text=raw_text,
                    processing_time_ms=self._elapsed_ms(start_time),
                )

            # Validate and score candidates. The base confidence depends only
            # on the OCR pass, so compute it once rather than per candidate.
            base_confidence = self._calculate_base_confidence(word_confidences)
            scored_candidates = []
            for vin, _start_pos, _end_pos in candidates:
                validation = vin_validator.validate(vin)
                adjusted_confidence = min(
                    1.0,
                    max(0.0, base_confidence + validation.confidence_adjustment),
                )
                scored_candidates.append(
                    (validation.vin, adjusted_confidence, validation.is_valid)
                )

            # Sort by confidence (stable, so ties keep extraction order)
            scored_candidates.sort(key=lambda x: x[1], reverse=True)

            # Primary result is the highest confidence valid candidate; if no
            # candidate is valid, fall back to the highest confidence one.
            primary_index = next(
                (
                    index
                    for index, (_, _, is_valid) in enumerate(scored_candidates)
                    if is_valid
                ),
                0,
            )
            primary_vin, primary_confidence, _ = scored_candidates[primary_index]

            # Build alternatives list (excluding primary). Exclusion is by
            # position because the primary is not necessarily the first entry:
            # an invalid candidate may outrank the first valid one.
            alternatives = [
                VinAlternative(vin=vin, confidence=conf)
                for index, (vin, conf, _) in enumerate(scored_candidates)
                if index != primary_index
            ][: self.MAX_ALTERNATIVES]

            processing_time_ms = self._elapsed_ms(start_time)

            logger.info(
                "VIN extraction: %s, confidence=%.2f%%, time=%dms",
                primary_vin,
                primary_confidence * 100,
                processing_time_ms,
            )
            logger.debug(
                "VIN alternatives: %s",
                [(a.vin, a.confidence) for a in alternatives],
            )

            return VinExtractionResult(
                success=True,
                vin=primary_vin,
                confidence=primary_confidence,
                bounding_box=preprocessing_result.bounding_box,
                alternatives=alternatives,
                processing_time_ms=processing_time_ms,
                raw_text=raw_text,
            )

        except Exception as e:
            # Top-level boundary: report the failure in the result object
            # instead of raising to the caller.
            logger.error("VIN extraction failed: %s", e, exc_info=True)
            return VinExtractionResult(
                success=False,
                error=str(e),
                processing_time_ms=self._elapsed_ms(start_time),
            )

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic.

        Returns "application/octet-stream" when detection yields nothing.
        """
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _perform_ocr(
        self,
        image_bytes: bytes,
        single_line: bool = False,
        single_word: bool = False,
    ) -> tuple[str, list[float]]:
        """
        Perform OCR with VIN-optimized settings via engine abstraction.

        Args:
            image_bytes: Preprocessed image bytes
            single_line: Treat image as a single text line
            single_word: Treat image as a single word

        Returns:
            Tuple of (raw_text, word_confidences)
        """
        config = OcrConfig(
            char_whitelist=self.VIN_WHITELIST,
            single_line=single_line,
            single_word=single_word,
            use_angle_cls=True,
        )

        result = self._engine.recognize(image_bytes, config)
        word_confidences = [wb.confidence for wb in result.word_boxes]
        return result.text, word_confidences

    def _run_alternate_ocr(
        self,
        image_bytes: bytes,
        prefix: str = "",
    ) -> tuple[list[tuple[str, int, int]], str, list[float]]:
        """
        Run the alternate OCR modes and report which pass produced candidates.

        Modes tried, in order:
            single-line - Treat as a single text line
            single-word - Treat as a single word

        Args:
            image_bytes: Preprocessed image bytes
            prefix: Optional label prepended to debug log messages

        Returns:
            Tuple of (candidates, raw_text, word_confidences) from the first
            mode that yields candidates, or ([], "", []) if none does.
        """
        tag = f"{prefix} " if prefix else ""
        for mode_name, kwargs in (
            ("single-line", {"single_line": True}),
            ("single-word", {"single_word": True}),
        ):
            raw_text, word_confidences = self._perform_ocr(image_bytes, **kwargs)
            logger.debug("%s%s raw text: '%s'", tag, mode_name, raw_text)
            candidates = vin_validator.extract_candidates(raw_text)
            logger.debug("%s%s candidates: %s", tag, mode_name, candidates)
            if candidates:
                return candidates, raw_text, word_confidences

        return [], "", []

    def _try_alternate_ocr(
        self,
        image_bytes: bytes,
        prefix: str = "",
    ) -> list[tuple[str, int, int]]:
        """
        Try alternate OCR configurations when initial extraction fails.

        Modes tried:
            single-line - Treat as a single text line
            single-word - Treat as a single word

        For PaddleOCR, angle classification handles rotated/angled text
        inherently, replacing the need for Tesseract PSM mode fallbacks.

        Returns:
            List of VIN candidates
        """
        candidates, _raw_text, _word_confidences = self._run_alternate_ocr(
            image_bytes, prefix=prefix
        )
        return candidates

    def _calculate_base_confidence(self, word_confidences: list[float]) -> float:
        """Calculate base confidence from word confidences.

        Returns 0.5 when no word confidences are available.
        """
        if not word_confidences:
            return 0.5

        # Use average confidence, weighted slightly toward minimum
        avg_conf = sum(word_confidences) / len(word_confidences)
        min_conf = min(word_confidences)

        # Blend: 70% average, 30% minimum
        return 0.7 * avg_conf + 0.3 * min_conf

    def validate(self, data: str) -> bool:
        """
        Validate a VIN string.

        Args:
            data: VIN string to validate

        Returns:
            True if VIN is valid
        """
        result = vin_validator.validate(data)
        return result.is_valid


# Singleton instance
vin_extractor = VinExtractor()