motovaultpro/ocr/app/extractors/vin_extractor.py

"""VIN-specific OCR extractor with preprocessing and validation."""
import logging
import os
import time
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional

import magic
from pillow_heif import register_heif_opener

from app.config import settings
from app.engines import OcrConfig, create_engine
from app.extractors.base import BaseExtractor
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
from app.validators.vin_validator import vin_validator

# Register HEIF/HEIC opener.
# This runs once, as an import-time side effect of loading this module.
# NOTE(review): presumably this is what lets the "image/heic" / "image/heif"
# inputs accepted by VinExtractor be decoded by the preprocessing step; the
# decoding itself is not visible in this module — confirm.
register_heif_opener()

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


@dataclass
class VinAlternative:
    """Alternative VIN candidate with confidence.

    Instances are created by ``VinExtractor.extract`` and returned in
    ``VinExtractionResult.alternatives``.
    """

    # Candidate VIN string, as reported by the validator.
    vin: str
    # Score for this candidate; ``VinExtractor.extract`` clamps it to 0.0-1.0.
    confidence: float


@dataclass
class VinExtractionResult:
    """Result of VIN extraction.

    Returned by ``VinExtractor.extract`` for both successful and failed
    attempts; only ``success`` is required, every other field has a default.
    """

    # True when a VIN was selected; False for unsupported input, no VIN
    # pattern found, or an exception during processing.
    success: bool
    # Selected VIN, or None when nothing was extracted.
    vin: Optional[str] = None
    # Score of the selected VIN; extract() clamps it to the 0.0-1.0 range.
    confidence: float = 0.0
    # Bounding box reported by the preprocessing step, when available.
    bounding_box: Optional[BoundingBox] = None
    # Other scored candidates (default_factory gives each instance its own list).
    alternatives: list[VinAlternative] = field(default_factory=list)
    # Elapsed time of the extract() call, in whole milliseconds.
    processing_time_ms: int = 0
    # Human-readable failure reason; left as None on success.
    error: Optional[str] = None
    # Raw OCR text associated with the attempt, when available.
    raw_text: Optional[str] = None


class VinExtractor(BaseExtractor):
    """VIN-specific OCR extractor optimized for VIN plates and stickers.

    ``extract`` is the main entry point: it preprocesses the image, runs OCR
    through the configured engine, validates the VIN candidates found in the
    text and returns a ``VinExtractionResult``.
    """

    # Supported MIME types. extract() rejects any content type that is not
    # a member of this set.
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
    }

    # VIN character whitelist (passed to engine for post-OCR filtering).
    # The letters I, O and Q are absent from this string.
    # NOTE(review): presumably because VINs never contain them (they are
    # easily confused with 1 and 0) — confirm against the validator.
    VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"

    # Fixed debug output directory (inside container). When the configured
    # log level is DEBUG, _create_debug_session() creates one timestamped
    # sub-directory per extraction under this path.
    DEBUG_DIR = "/tmp/vin-debug"

    def __init__(self) -> None:
        """Set up the OCR engine and the debug flag.

        The engine comes from the ``create_engine`` factory. Debug mode is
        on when the configured log level equals ``DEBUG``, compared
        case-insensitively.
        """
        self._engine = create_engine()
        configured_level = settings.log_level.upper()
        self._debug = configured_level == "DEBUG"

    def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
        """Save image bytes to the debug session directory when LOG_LEVEL=debug."""
        if not self._debug:
            return
        path = os.path.join(session_dir, name)
        with open(path, "wb") as f:
            f.write(data)
        logger.debug("Saved debug image: %s (%d bytes)", name, len(data))

    def _create_debug_session(self) -> Optional[str]:
        """Create a timestamped debug directory. Returns path or None."""
        if not self._debug:
            return None
        ts = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        session_dir = os.path.join(self.DEBUG_DIR, ts)
        os.makedirs(session_dir, exist_ok=True)
        return session_dir

    def extract(
        self, image_bytes: bytes, content_type: Optional[str] = None
    ) -> VinExtractionResult:
        """
        Extract VIN from an image using optimized preprocessing and OCR.

        Up to three preprocessing variants are tried in order: the default
        preprocessing, grayscale without thresholding, then Otsu
        thresholding. Each variant is run through the default OCR mode and
        then the single-line and single-word modes. The first OCR pass that
        yields VIN candidates wins, and the reported ``confidence`` and
        ``raw_text`` describe that same pass.

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            content_type: MIME type (auto-detected if not provided)

        Returns:
            VinExtractionResult with extracted VIN and metadata. An
            unsupported type, a missing VIN pattern, or an exception raised
            inside the pipeline is reported via ``success=False`` and
            ``error`` instead of being raised.
        """
        # perf_counter is monotonic, so the elapsed time cannot be skewed
        # by wall-clock adjustments.
        start_time = time.perf_counter()

        # Detect content type if not provided
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)

        # Validate content type
        if content_type not in self.SUPPORTED_TYPES:
            return VinExtractionResult(
                success=False,
                error=f"Unsupported file type: {content_type}",
                processing_time_ms=self._elapsed_ms(start_time),
            )

        try:
            debug_session = self._create_debug_session()

            logger.debug(
                "VIN extraction input: %d bytes, content_type=%s",
                len(image_bytes), content_type,
            )
            if debug_session:
                # NOTE: the dump is named .jpg whatever the real input format.
                self._save_debug_image(debug_session, "01_original.jpg", image_bytes)

            # Apply VIN-optimized preprocessing
            preprocessing_result = vin_preprocessor.preprocess(image_bytes)
            preprocessed_bytes = preprocessing_result.image_bytes
            logger.debug(
                "Preprocessing steps: %s", preprocessing_result.preprocessing_applied
            )
            if debug_session:
                self._save_debug_image(
                    debug_session, "02_preprocessed_adaptive.png", preprocessed_bytes
                )

            # Perform OCR with VIN-optimized settings (default mode first,
            # then the single-line / single-word fallbacks).
            candidates, raw_text, word_confidences = self._ocr_candidates(
                preprocessed_bytes, "Primary OCR"
            )

            if not candidates:
                # Try grayscale-only (no thresholding) — OCR engines often
                # perform better on non-binarized input because they do
                # their own internal preprocessing.
                gray_result = vin_preprocessor.preprocess(
                    image_bytes, apply_threshold=False
                )
                logger.debug(
                    "Grayscale preprocessing steps: %s",
                    gray_result.preprocessing_applied,
                )
                if debug_session:
                    self._save_debug_image(
                        debug_session, "04_preprocessed_gray.png",
                        gray_result.image_bytes,
                    )
                candidates, raw_text, word_confidences = self._ocr_candidates(
                    gray_result.image_bytes, "Gray primary", prefix="Gray"
                )

            if not candidates:
                # Try alternative preprocessing (Otsu's thresholding)
                otsu_result = vin_preprocessor.preprocess_otsu(image_bytes)
                logger.debug(
                    "Otsu preprocessing steps: %s",
                    otsu_result.preprocessing_applied,
                )
                if debug_session:
                    self._save_debug_image(
                        debug_session, "03_preprocessed_otsu.png",
                        otsu_result.image_bytes,
                    )
                candidates, raw_text, word_confidences = self._ocr_candidates(
                    otsu_result.image_bytes, "Otsu primary", prefix="Otsu"
                )

            if not candidates:
                logger.debug("No VIN pattern found in any OCR attempt")
                return VinExtractionResult(
                    success=False,
                    error="No VIN pattern found in image",
                    raw_text=raw_text,
                    processing_time_ms=self._elapsed_ms(start_time),
                )

            # Validate, score and rank the candidates
            primary_vin, primary_confidence, alternatives = self._rank_candidates(
                candidates, word_confidences
            )

            processing_time_ms = self._elapsed_ms(start_time)

            logger.info(
                "VIN extraction: %s, confidence=%.2f%%, time=%dms",
                primary_vin, primary_confidence * 100, processing_time_ms,
            )
            logger.debug(
                "VIN alternatives: %s",
                [(a.vin, a.confidence) for a in alternatives],
            )

            return VinExtractionResult(
                success=True,
                vin=primary_vin,
                confidence=primary_confidence,
                bounding_box=preprocessing_result.bounding_box,
                alternatives=alternatives,
                processing_time_ms=processing_time_ms,
                raw_text=raw_text,
            )

        except Exception as e:
            # Boundary handler: report the failure in the result object.
            logger.error("VIN extraction failed: %s", e, exc_info=True)
            return VinExtractionResult(
                success=False,
                error=str(e),
                processing_time_ms=self._elapsed_ms(start_time),
            )

    @staticmethod
    def _elapsed_ms(start_time: float) -> int:
        """Return whole milliseconds elapsed since a ``time.perf_counter()`` reading."""
        return int((time.perf_counter() - start_time) * 1000)

    def _ocr_candidates(
        self,
        image_bytes: bytes,
        primary_label: str,
        prefix: str = "",
    ) -> tuple[list[tuple[str, int, int]], str, list[float]]:
        """Run default-mode OCR, then single-line and single-word fallbacks.

        Args:
            image_bytes: Preprocessed image bytes to recognise.
            primary_label: Label used in debug logs for the default pass.
            prefix: Optional label prepended to the fallback-mode debug logs.

        Returns:
            ``(candidates, raw_text, word_confidences)`` taken from the
            first pass that produced candidates, so the text and the
            confidences always belong to the pass the candidates came from.
            When no pass finds a candidate, ``candidates`` is empty and the
            text and confidences are those of the default pass.
        """
        raw_text, word_confidences = self._perform_ocr(image_bytes)
        logger.debug("%s raw text: '%s'", primary_label, raw_text)
        logger.debug("%s word confidences: %s", primary_label, word_confidences)

        # Extract VIN candidates from raw text
        candidates = vin_validator.extract_candidates(raw_text)
        logger.debug("%s candidates: %s", primary_label, candidates)
        if candidates:
            return candidates, raw_text, word_confidences

        tag = f"{prefix} " if prefix else ""
        for mode_name, kwargs in (
            ("single-line", {"single_line": True}),
            ("single-word", {"single_word": True}),
        ):
            alt_text, alt_confidences = self._perform_ocr(image_bytes, **kwargs)
            logger.debug("%s%s raw text: '%s'", tag, mode_name, alt_text)
            alt_candidates = vin_validator.extract_candidates(alt_text)
            logger.debug("%s%s candidates: %s", tag, mode_name, alt_candidates)
            if alt_candidates:
                return alt_candidates, alt_text, alt_confidences

        return [], raw_text, word_confidences

    def _rank_candidates(
        self,
        candidates: list[tuple[str, int, int]],
        word_confidences: list[float],
    ) -> "tuple[Optional[str], float, list[VinAlternative]]":
        """Validate and score candidates, then pick the primary and alternatives.

        Args:
            candidates: Non-empty list of ``(vin, start_pos, end_pos)`` tuples.
            word_confidences: Confidences from the OCR pass that produced
                ``candidates``.

        Returns:
            ``(primary_vin, primary_confidence, alternatives)``. The primary
            is the highest-scoring valid candidate, or the highest-scoring
            candidate overall when none validates. ``alternatives`` holds up
            to four of the remaining entries, best first.
        """
        # The OCR-level confidence is the same for every candidate of a pass.
        base_confidence = self._calculate_base_confidence(word_confidences)

        scored_candidates = []
        for vin, _start_pos, _end_pos in candidates:
            validation = vin_validator.validate(vin)
            adjusted_confidence = min(
                1.0, max(0.0, base_confidence + validation.confidence_adjustment)
            )
            scored_candidates.append(
                (validation.vin, adjusted_confidence, validation.is_valid)
            )

        # Sort by confidence (stable, so ties keep their original order)
        scored_candidates.sort(key=lambda item: item[1], reverse=True)

        # Index of the first valid entry; fall back to the top-scored one.
        primary_index = next(
            (
                index
                for index, (_, _, is_valid) in enumerate(scored_candidates)
                if is_valid
            ),
            0,
        )
        primary_vin, primary_confidence, _ = scored_candidates[primary_index]

        # Exclude the primary by position: it is not necessarily entry 0,
        # so slicing from index 1 could list it again as its own alternative.
        alternatives = [
            VinAlternative(vin=vin, confidence=conf)
            for index, (vin, conf, _) in enumerate(scored_candidates)
            if index != primary_index
        ][:4]  # Max 4 alternatives

        return primary_vin, primary_confidence, alternatives

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Sniff the MIME type of a byte buffer with python-magic.

        Args:
            file_bytes: Raw file content to inspect.

        Returns:
            The detected MIME type, or ``application/octet-stream`` when
            detection yields an empty result.
        """
        sniffed = magic.Magic(mime=True).from_buffer(file_bytes)
        if sniffed:
            return sniffed
        return "application/octet-stream"

    def _perform_ocr(
        self,
        image_bytes: bytes,
        single_line: bool = False,
        single_word: bool = False,
    ) -> tuple[str, list[float]]:
        """
        Run one OCR pass through the engine with VIN-specific settings.

        The engine is always given the VIN character whitelist and has
        angle classification enabled.

        Args:
            image_bytes: Preprocessed image bytes
            single_line: Treat image as a single text line
            single_word: Treat image as a single word

        Returns:
            Tuple of (raw_text, word_confidences), with one confidence per
            word box reported by the engine
        """
        ocr_config = OcrConfig(
            char_whitelist=self.VIN_WHITELIST,
            use_angle_cls=True,
            single_line=single_line,
            single_word=single_word,
        )
        recognition = self._engine.recognize(image_bytes, ocr_config)

        confidences = []
        for box in recognition.word_boxes:
            confidences.append(box.confidence)
        return recognition.text, confidences

    def _try_alternate_ocr(
        self,
        image_bytes: bytes,
        prefix: str = "",
    ) -> list[tuple[str, int, int]]:
        """
        Re-run OCR in fallback modes and return the first VIN candidates found.

        Modes tried, in order:
            single-line - Treat as a single text line
            single-word - Treat as a single word

        For PaddleOCR, angle classification handles rotated/angled text
        inherently, replacing the need for Tesseract PSM mode fallbacks.

        Only the recognised text of each pass is used; the word
        confidences are discarded.

        Args:
            image_bytes: Preprocessed image bytes to recognise
            prefix: Optional label prepended to the debug log lines

        Returns:
            List of VIN candidates from the first mode that produced any,
            or an empty list when neither mode did
        """
        label = f"{prefix} " if prefix else ""
        fallback_modes = (
            ("single-line", {"single_line": True}),
            ("single-word", {"single_word": True}),
        )

        for mode, ocr_kwargs in fallback_modes:
            text, _confidences = self._perform_ocr(image_bytes, **ocr_kwargs)
            logger.debug("%s%s raw text: '%s'", label, mode, text)
            found = vin_validator.extract_candidates(text)
            logger.debug("%s%s candidates: %s", label, mode, found)
            if not found:
                continue
            return found

        return []

    def _calculate_base_confidence(self, word_confidences: list[float]) -> float:
        """Calculate base confidence from word confidences."""
        if not word_confidences:
            return 0.5

        # Use average confidence, weighted slightly toward minimum
        avg_conf = sum(word_confidences) / len(word_confidences)
        min_conf = min(word_confidences)

        # Blend: 70% average, 30% minimum
        return 0.7 * avg_conf + 0.3 * min_conf

    def validate(self, data: str) -> bool:
        """
        Check a VIN string with the shared VIN validator.

        Args:
            data: VIN string to validate

        Returns:
            The validator's ``is_valid`` verdict for ``data``
        """
        return vin_validator.validate(data).is_valid


# Singleton instance shared by importers of this module.
# It is constructed at import time, so VinExtractor.__init__ (including its
# create_engine() call) runs as soon as this module is first imported.
vin_extractor = VinExtractor()