"""VIN-specific OCR extractor with preprocessing and validation."""

import io
import logging
import os
import time
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional

import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener

from app.config import settings
from app.extractors.base import BaseExtractor
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
from app.validators.vin_validator import vin_validator

# Register HEIF/HEIC opener
register_heif_opener()

logger = logging.getLogger(__name__)


def _elapsed_ms(start: float) -> int:
    """Return whole milliseconds elapsed since *start* (a perf_counter value)."""
    # perf_counter is monotonic, so the result cannot go negative when the
    # wall clock is adjusted while an extraction is running.
    return int((time.perf_counter() - start) * 1000)


@dataclass
class VinAlternative:
    """Alternative VIN candidate with confidence."""

    vin: str
    confidence: float


@dataclass
class VinExtractionResult:
    """Result of VIN extraction."""

    # True when a primary VIN was selected; False on unsupported input,
    # no candidates, or an exception during extraction.
    success: bool
    vin: Optional[str] = None
    # Clamped to the 0.0-1.0 range by the extractor.
    confidence: float = 0.0
    bounding_box: Optional[BoundingBox] = None
    alternatives: list[VinAlternative] = field(default_factory=list)
    processing_time_ms: int = 0
    error: Optional[str] = None
    raw_text: Optional[str] = None


class VinExtractor(BaseExtractor):
    """VIN-specific OCR extractor optimized for VIN plates and stickers."""

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
    }

    # VIN character whitelist for Tesseract (the letters I, O and Q are absent)
    VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"

    # Fixed debug output directory (inside container)
    DEBUG_DIR = "/tmp/vin-debug"

    # Maximum number of non-primary candidates reported in a result
    _MAX_ALTERNATIVES = 4

    def __init__(self) -> None:
        """Initialize VIN extractor."""
        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
        self._debug = settings.log_level.upper() == "DEBUG"

    def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
        """Save image bytes to the debug session directory when LOG_LEVEL=debug."""
        if not self._debug:
            return
        path = os.path.join(session_dir, name)
        with open(path, "wb") as f:
            f.write(data)
        logger.debug("Saved debug image: %s (%d bytes)", name, len(data))

    def _create_debug_session(self) -> Optional[str]:
        """Create a timestamped debug directory. Returns path or None."""
        if not self._debug:
            return None
        ts = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        session_dir = os.path.join(self.DEBUG_DIR, ts)
        os.makedirs(session_dir, exist_ok=True)
        return session_dir

    def extract(
        self, image_bytes: bytes, content_type: Optional[str] = None
    ) -> VinExtractionResult:
        """
        Extract VIN from an image using optimized preprocessing and OCR.

        OCR is attempted in this order, stopping at the first attempt that
        yields VIN candidates: default preprocessing with PSM 6, default
        preprocessing with the alternate PSM modes, Otsu preprocessing with
        PSM 6, Otsu preprocessing with the alternate PSM modes.

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            content_type: MIME type (auto-detected if not provided)

        Returns:
            VinExtractionResult with extracted VIN and metadata. Failures
            (unsupported type, no candidates, or any exception raised during
            processing) are reported through ``success=False`` and ``error``
            rather than raised.
        """
        start_time = time.perf_counter()

        # Detect content type if not provided
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)

        # Validate content type
        if content_type not in self.SUPPORTED_TYPES:
            return VinExtractionResult(
                success=False,
                error=f"Unsupported file type: {content_type}",
                processing_time_ms=_elapsed_ms(start_time),
            )

        try:
            debug_session = self._create_debug_session()

            logger.debug(
                "VIN extraction input: %d bytes, content_type=%s",
                len(image_bytes),
                content_type,
            )
            if debug_session:
                self._save_debug_image(debug_session, "01_original.jpg", image_bytes)

            # Apply VIN-optimized preprocessing
            preprocessing_result = vin_preprocessor.preprocess(image_bytes)
            preprocessed_bytes = preprocessing_result.image_bytes
            logger.debug(
                "Preprocessing steps: %s", preprocessing_result.preprocessing_applied
            )
            if debug_session:
                self._save_debug_image(
                    debug_session, "02_preprocessed_adaptive.png", preprocessed_bytes
                )

            # Perform OCR with VIN-optimized settings
            raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
            logger.debug("PSM 6 raw text: '%s'", raw_text)
            logger.debug("PSM 6 word confidences: %s", word_confidences)

            # Extract VIN candidates from raw text
            candidates = vin_validator.extract_candidates(raw_text)
            logger.debug("PSM 6 candidates: %s", candidates)

            # NOTE(review): _try_alternate_ocr returns only candidates, so when
            # it is the step that finds them, raw_text and word_confidences
            # below still hold the values from the preceding PSM 6 pass.
            if not candidates:
                # No VIN candidates found - try with different PSM modes
                candidates = self._try_alternate_ocr(preprocessed_bytes)

            if not candidates:
                # Try alternative preprocessing (Otsu's thresholding)
                otsu_result = vin_preprocessor.preprocess_otsu(image_bytes)
                logger.debug(
                    "Otsu preprocessing steps: %s",
                    otsu_result.preprocessing_applied,
                )
                if debug_session:
                    self._save_debug_image(
                        debug_session,
                        "03_preprocessed_otsu.png",
                        otsu_result.image_bytes,
                    )

                raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
                logger.debug("Otsu PSM 6 raw text: '%s'", raw_text)
                candidates = vin_validator.extract_candidates(raw_text)
                logger.debug("Otsu PSM 6 candidates: %s", candidates)
                if not candidates:
                    candidates = self._try_alternate_ocr(
                        otsu_result.image_bytes, prefix="Otsu"
                    )

            if not candidates:
                logger.debug("No VIN pattern found in any OCR attempt")
                return VinExtractionResult(
                    success=False,
                    error="No VIN pattern found in image",
                    raw_text=raw_text,
                    processing_time_ms=_elapsed_ms(start_time),
                )

            # Validate and score candidates, highest confidence first
            scored_candidates = self._score_candidates(candidates, word_confidences)

            # Primary result is the highest confidence valid candidate; if no
            # candidate is valid, fall back to the highest confidence one.
            primary_index = next(
                (
                    index
                    for index, (_, _, is_valid) in enumerate(scored_candidates)
                    if is_valid
                ),
                0,
            )
            primary_vin, primary_confidence, _ = scored_candidates[primary_index]

            # Build alternatives list (excluding primary). The primary is
            # skipped by position rather than by assuming it is the first
            # entry: an invalid candidate can outrank the first valid one.
            alternatives = [
                VinAlternative(vin=vin, confidence=conf)
                for index, (vin, conf, _) in enumerate(scored_candidates)
                if index != primary_index
            ][: self._MAX_ALTERNATIVES]

            processing_time_ms = _elapsed_ms(start_time)

            logger.info(
                "VIN extraction: %s, confidence=%.2f%%, time=%dms",
                primary_vin,
                primary_confidence * 100,
                processing_time_ms,
            )
            logger.debug(
                "VIN alternatives: %s",
                [(a.vin, a.confidence) for a in alternatives],
            )

            return VinExtractionResult(
                success=True,
                vin=primary_vin,
                confidence=primary_confidence,
                bounding_box=preprocessing_result.bounding_box,
                alternatives=alternatives,
                processing_time_ms=processing_time_ms,
                raw_text=raw_text,
            )

        except Exception as e:
            # Top-level boundary: report the failure in the result instead of
            # propagating it to the caller.
            logger.exception("VIN extraction failed: %s", e)
            return VinExtractionResult(
                success=False,
                error=str(e),
                processing_time_ms=_elapsed_ms(start_time),
            )

    def _score_candidates(
        self,
        candidates: list[tuple[str, int, int]],
        word_confidences: list[float],
    ) -> list[tuple[str, float, bool]]:
        """
        Validate candidates and attach a clamped confidence to each.

        Args:
            candidates: (vin, start, end) tuples from the validator; only the
                VIN text is used here
            word_confidences: Per-word OCR confidences in the 0.0-1.0 range

        Returns:
            List of (vin, confidence, is_valid) sorted by confidence,
            highest first
        """
        # The OCR-derived base is identical for every candidate, so compute it
        # once; only the validator's adjustment differs per candidate.
        base_confidence = self._calculate_base_confidence(word_confidences)

        scored = []
        for vin, _start_pos, _end_pos in candidates:
            validation = vin_validator.validate(vin)
            adjusted_confidence = min(
                1.0, max(0.0, base_confidence + validation.confidence_adjustment)
            )
            # The VIN reported is the one carried by the validation result,
            # not the raw candidate text.
            scored.append((validation.vin, adjusted_confidence, validation.is_valid))

        # Sort by confidence (stable, so ties keep candidate order)
        scored.sort(key=lambda item: item[1], reverse=True)
        return scored

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic."""
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _perform_ocr(
        self, image_bytes: bytes, psm: int = 6
    ) -> tuple[str, list[float]]:
        """
        Perform OCR with VIN-optimized settings.

        Args:
            image_bytes: Preprocessed image bytes
            psm: Tesseract page segmentation mode
                 6 = Uniform block of text
                 7 = Single text line
                 8 = Single word

        Returns:
            Tuple of (raw_text, word_confidences)
        """
        # Configure Tesseract for VIN extraction
        # OEM 1 = LSTM neural network engine (best accuracy)
        # Disable dictionaries since VINs are not dictionary words
        config = (
            f"--psm {psm} "
            f"--oem 1 "
            f"-c tessedit_char_whitelist={self.VIN_WHITELIST} "
            f"-c load_system_dawg=false "
            f"-c load_freq_dawg=false"
        )

        # Get detailed OCR data; the context manager releases the decoded
        # image as soon as Tesseract has consumed it.
        with Image.open(io.BytesIO(image_bytes)) as image:
            ocr_data = pytesseract.image_to_data(
                image, config=config, output_type=pytesseract.Output.DICT
            )

        # Extract words and confidences, skipping blank entries and entries
        # whose confidence is not positive
        words = []
        confidences = []
        for i, text in enumerate(ocr_data["text"]):
            conf = int(ocr_data["conf"][i])
            if text.strip() and conf > 0:
                words.append(text.strip())
                confidences.append(conf / 100.0)

        raw_text = " ".join(words)
        return raw_text, confidences

    def _try_alternate_ocr(
        self,
        image_bytes: bytes,
        prefix: str = "",
    ) -> list[tuple[str, int, int]]:
        """
        Try alternate OCR configurations when initial extraction fails.

        PSM modes tried in order:
          7  - Single text line
          8  - Single word
          11 - Sparse text (finds text in any order, good for angled photos)
          13 - Raw line (no Tesseract heuristics, good for clean VIN plates)

        Args:
            image_bytes: Preprocessed image bytes
            prefix: Optional label prepended to the debug log messages

        Returns:
            List of VIN candidates from the first mode that produces any,
            or an empty list if none does
        """
        tag = f"{prefix} " if prefix else ""
        for psm in (7, 8, 11, 13):
            raw_text, _ = self._perform_ocr(image_bytes, psm=psm)
            logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text)
            candidates = vin_validator.extract_candidates(raw_text)
            logger.debug("%sPSM %d candidates: %s", tag, psm, candidates)
            if candidates:
                return candidates

        return []

    def _calculate_base_confidence(self, word_confidences: list[float]) -> float:
        """Calculate base confidence from word confidences."""
        if not word_confidences:
            # No per-word data available: fall back to a neutral midpoint
            return 0.5

        # Use average confidence, weighted slightly toward minimum
        avg_conf = sum(word_confidences) / len(word_confidences)
        min_conf = min(word_confidences)

        # Blend: 70% average, 30% minimum
        return 0.7 * avg_conf + 0.3 * min_conf

    def validate(self, data: str) -> bool:
        """
        Validate a VIN string.

        Args:
            data: VIN string to validate

        Returns:
            True if VIN is valid
        """
        result = vin_validator.validate(data)
        return result.is_valid


# Singleton instance
vin_extractor = VinExtractor()