# motovaultpro/ocr/app/extractors/vin_extractor.py

"""VIN-specific OCR extractor with preprocessing and validation."""
import io
import logging
import time
from dataclasses import dataclass, field
from typing import Optional

import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener

from app.config import settings
from app.extractors.base import BaseExtractor
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
from app.validators.vin_validator import vin_validator

# Register the pillow-heif opener with PIL at import time so Image.open can
# read HEIC/HEIF input (both MIME types are listed in SUPPORTED_TYPES below).
register_heif_opener()

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


@dataclass
class VinAlternative:
    """Alternative VIN candidate with confidence.

    A scored VIN candidate as listed in ``VinExtractionResult.alternatives``.
    """

    # VIN string as returned by the validator for this candidate.
    vin: str
    # Score for this candidate; VinExtractor.extract clamps it to 0.0-1.0.
    confidence: float


@dataclass
class VinExtractionResult:
    """Result of VIN extraction.

    Returned by ``VinExtractor.extract`` for both successful and failed
    attempts. Note that ``success=True`` means a candidate was selected, not
    that it passed validation: when no candidate validates, the extractor
    still reports the highest-scored one.
    """

    # True when a VIN candidate was selected; False for an unsupported file
    # type, when no VIN pattern was found, or when processing raised.
    success: bool
    # Selected (primary) VIN; None on failure.
    vin: Optional[str] = None
    # Score of the primary VIN in the range 0.0-1.0; 0.0 on failure.
    confidence: float = 0.0
    # Bounding box reported by the VIN preprocessor, if any.
    bounding_box: Optional[BoundingBox] = None
    # Other scored candidates (the extractor reports at most four).
    alternatives: list[VinAlternative] = field(default_factory=list)
    # Elapsed time of the extract() call in whole milliseconds.
    processing_time_ms: int = 0
    # Human-readable failure reason; None on success.
    error: Optional[str] = None
    # Space-joined words from an OCR pass; not set for unsupported-type or
    # exception failures.
    raw_text: Optional[str] = None


class VinExtractor(BaseExtractor):
    """VIN-specific OCR extractor optimized for VIN plates and stickers."""

    # Supported MIME types; extract() rejects any content type not in this set.
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
    }

    # VIN character whitelist for Tesseract: digits plus uppercase letters,
    # deliberately without I, O and Q (they are absent from the string below).
    VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"

    def __init__(self) -> None:
        """Point pytesseract at the Tesseract binary configured in settings.

        The assignment targets pytesseract's module-level ``tesseract_cmd``,
        so it applies to every pytesseract call in the process, not only to
        this instance.
        """
        configured_cmd = settings.tesseract_cmd
        pytesseract.pytesseract.tesseract_cmd = configured_cmd

    def extract(
        self, image_bytes: bytes, content_type: Optional[str] = None
    ) -> VinExtractionResult:
        """
        Extract VIN from an image using optimized preprocessing and OCR.

        OCR runs on the standard preprocessed image first and, if that yields
        no VIN candidate, on the Otsu-thresholded variant. For each image
        PSM 6 is tried first, then PSM 7, 8, 11 and 13. Scoring and the
        reported ``raw_text`` use the text and word confidences of the OCR
        pass that actually produced the candidates.

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            content_type: MIME type (auto-detected if not provided)

        Returns:
            VinExtractionResult with extracted VIN and metadata. ``success``
            is True whenever a candidate was selected; if no candidate passes
            validation the highest-scored one is still reported. Unsupported
            types, "no VIN found" and exceptions raised during preprocessing,
            OCR or scoring are reported through ``success=False`` and
            ``error`` rather than raised.
        """
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # or jump if the wall clock is adjusted mid-request.
        started = time.perf_counter()

        def elapsed_ms() -> int:
            return int((time.perf_counter() - started) * 1000)

        # Detect content type if not provided
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)

        # Validate content type
        if content_type not in self.SUPPORTED_TYPES:
            return VinExtractionResult(
                success=False,
                error=f"Unsupported file type: {content_type}",
                processing_time_ms=elapsed_ms(),
            )

        try:
            logger.debug(
                "VIN extraction input: %d bytes, content_type=%s",
                len(image_bytes), content_type,
            )

            # Apply VIN-optimized preprocessing
            preprocessing_result = vin_preprocessor.preprocess(image_bytes)
            logger.debug(
                "Preprocessing steps: %s", preprocessing_result.preprocessing_applied
            )

            candidates, raw_text, word_confidences = self._ocr_until_candidates(
                preprocessing_result.image_bytes
            )

            if not candidates:
                # Try alternative preprocessing (Otsu's thresholding)
                otsu_result = vin_preprocessor.preprocess_otsu(image_bytes)
                logger.debug(
                    "Otsu preprocessing steps: %s",
                    otsu_result.preprocessing_applied,
                )
                candidates, raw_text, word_confidences = self._ocr_until_candidates(
                    otsu_result.image_bytes, prefix="Otsu"
                )

            if not candidates:
                logger.debug("No VIN pattern found in any OCR attempt")
                return VinExtractionResult(
                    success=False,
                    error="No VIN pattern found in image",
                    raw_text=raw_text,
                    processing_time_ms=elapsed_ms(),
                )

            # The OCR-derived base confidence is the same for every candidate
            # of a pass, so compute it once instead of once per candidate.
            base_confidence = self._calculate_base_confidence(word_confidences)

            # Validate and score candidates
            scored_candidates = []
            for candidate_vin, _start_pos, _end_pos in candidates:
                validation = vin_validator.validate(candidate_vin)
                adjusted_confidence = min(
                    1.0, max(0.0, base_confidence + validation.confidence_adjustment)
                )
                scored_candidates.append(
                    (validation.vin, adjusted_confidence, validation.is_valid)
                )

            # Sort by confidence (stable, so ties keep extraction order)
            scored_candidates.sort(key=lambda item: item[1], reverse=True)

            # Primary result is the highest-confidence valid candidate; if
            # none is valid, fall back to the highest-confidence one (index 0).
            primary_index = next(
                (
                    index
                    for index, (_, _, is_valid) in enumerate(scored_candidates)
                    if is_valid
                ),
                0,
            )
            primary_vin, primary_confidence, _ = scored_candidates[primary_index]

            # Alternatives are every other candidate, capped at four. Exclude
            # by index rather than assuming the primary is first: an invalid
            # candidate can outrank the first valid one.
            alternatives = [
                VinAlternative(vin=vin, confidence=conf)
                for index, (vin, conf, _) in enumerate(scored_candidates)
                if index != primary_index
            ][:4]

            processing_time_ms = elapsed_ms()

            logger.info(
                "VIN extraction: %s, confidence=%.2f%%, time=%dms",
                primary_vin, primary_confidence * 100, processing_time_ms,
            )
            logger.debug(
                "VIN alternatives: %s",
                [(a.vin, a.confidence) for a in alternatives],
            )

            return VinExtractionResult(
                success=True,
                vin=primary_vin,
                confidence=primary_confidence,
                bounding_box=preprocessing_result.bounding_box,
                alternatives=alternatives,
                processing_time_ms=processing_time_ms,
                raw_text=raw_text,
            )

        except Exception as e:
            # Boundary handler: report the failure through the result object
            # instead of propagating it to the caller.
            logger.error("VIN extraction failed: %s", e, exc_info=True)
            return VinExtractionResult(
                success=False,
                error=str(e),
                processing_time_ms=elapsed_ms(),
            )

    def _ocr_until_candidates(
        self, image_bytes: bytes, prefix: str = ""
    ) -> tuple[list[tuple[str, int, int]], Optional[str], list[float]]:
        """
        Run OCR with successive PSM modes until one yields VIN candidates.

        Args:
            image_bytes: Preprocessed image bytes
            prefix: Optional label prepended to the debug log lines

        Returns:
            Tuple of (candidates, raw_text, word_confidences) taken from the
            first pass that produced candidates. If no pass does, candidates
            is empty and the text/confidences of the first (PSM 6) pass are
            returned so the caller can still report what was read.
        """
        tag = f"{prefix} " if prefix else ""
        first_pass: Optional[tuple[str, list[float]]] = None

        for psm in (6, 7, 8, 11, 13):
            raw_text, word_confidences = self._perform_ocr(image_bytes, psm=psm)
            logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text)
            logger.debug(
                "%sPSM %d word confidences: %s", tag, psm, word_confidences
            )

            candidates = vin_validator.extract_candidates(raw_text)
            logger.debug("%sPSM %d candidates: %s", tag, psm, candidates)
            if candidates:
                return candidates, raw_text, word_confidences

            if first_pass is None:
                first_pass = (raw_text, word_confidences)

        fallback_text, fallback_confidences = first_pass or (None, [])
        return [], fallback_text, fallback_confidences

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Sniff the MIME type of *file_bytes* with python-magic.

        Returns the detected type, or "application/octet-stream" when the
        detector yields an empty result.
        """
        sniffed = magic.Magic(mime=True).from_buffer(file_bytes)
        if not sniffed:
            return "application/octet-stream"
        return sniffed

    def _perform_ocr(
        self, image_bytes: bytes, psm: int = 6
    ) -> tuple[str, list[float]]:
        """
        Perform OCR with VIN-optimized settings.

        Args:
            image_bytes: Preprocessed image bytes
            psm: Tesseract page segmentation mode
                 6 = Uniform block of text
                 7 = Single text line
                 8 = Single word

        Returns:
            Tuple of (raw_text, word_confidences). ``raw_text`` is the
            recognized words joined by single spaces; ``word_confidences``
            holds one value per kept word, scaled from Tesseract's 0-100
            range to 0.0-1.0. Blank entries and entries whose confidence is
            not positive are dropped from both.
        """
        # Configure Tesseract for VIN extraction
        # OEM 1 = LSTM neural network engine (best accuracy)
        # Disable dictionaries since VINs are not dictionary words
        config = (
            f"--psm {psm} "
            f"--oem 1 "
            f"-c tessedit_char_whitelist={self.VIN_WHITELIST} "
            f"-c load_system_dawg=false "
            f"-c load_freq_dawg=false"
        )

        # Use the image as a context manager so its resources are released
        # as soon as Tesseract has consumed it.
        with Image.open(io.BytesIO(image_bytes)) as image:
            # Get detailed OCR data
            ocr_data = pytesseract.image_to_data(
                image, config=config, output_type=pytesseract.Output.DICT
            )

        # Extract words and confidences
        words = []
        confidences = []

        for text, raw_conf in zip(ocr_data["text"], ocr_data["conf"]):
            # NOTE(review): depending on the pytesseract/Tesseract versions,
            # conf may arrive as an int or as a decimal string such as
            # "96.06"; going through float() accepts both, whereas a bare
            # int() raises ValueError on the decimal-string form.
            conf = int(float(raw_conf))
            word = text.strip()
            if word and conf > 0:
                words.append(word)
                confidences.append(conf / 100.0)

        raw_text = " ".join(words)
        return raw_text, confidences

    def _try_alternate_ocr(
        self,
        image_bytes: bytes,
        prefix: str = "",
    ) -> list[tuple[str, int, int]]:
        """
        Re-run OCR with other page segmentation modes until a VIN shows up.

        Modes are attempted in this order and the search stops at the first
        one that yields candidates:
            7  - Single text line
            8  - Single word
            11 - Sparse text
            13 - Raw line

        Args:
            image_bytes: Preprocessed image bytes
            prefix: Optional label prepended to the debug log lines

        Returns:
            Candidates from the first mode that produced any, else an empty
            list. The word confidences of these passes are discarded.
        """
        fallback_modes = (7, 8, 11, 13)
        log_tag = f"{prefix} " if prefix else ""

        for mode in fallback_modes:
            text, _unused_confidences = self._perform_ocr(image_bytes, psm=mode)
            logger.debug("%sPSM %d raw text: '%s'", log_tag, mode, text)

            found = vin_validator.extract_candidates(text)
            logger.debug("%sPSM %d candidates: %s", log_tag, mode, found)

            if not found:
                continue
            return found

        return []

    def _calculate_base_confidence(self, word_confidences: list[float]) -> float:
        """Calculate base confidence from word confidences."""
        if not word_confidences:
            return 0.5

        # Use average confidence, weighted slightly toward minimum
        avg_conf = sum(word_confidences) / len(word_confidences)
        min_conf = min(word_confidences)

        # Blend: 70% average, 30% minimum
        return 0.7 * avg_conf + 0.3 * min_conf

    def validate(self, data: str) -> bool:
        """
        Check a VIN string with the shared VIN validator.

        Args:
            data: VIN string to validate

        Returns:
            The ``is_valid`` flag of the validator's result
        """
        return vin_validator.validate(data).is_valid


# Singleton instance shared by importers of this module. Constructing it runs
# VinExtractor.__init__, which sets pytesseract's tesseract_cmd from settings
# as an import-time side effect.
vin_extractor = VinExtractor()