motovaultpro/ocr/app/preprocessors/vin_preprocessor.py

"""VIN-optimized image preprocessing pipeline."""
import io
import logging
from dataclasses import dataclass
from typing import Optional

import cv2
import numpy as np
from PIL import Image
from pillow_heif import register_heif_opener

# Register HEIF/HEIC opener
register_heif_opener()

logger = logging.getLogger(__name__)


@dataclass
class BoundingBox:
    """Represents a region in an image."""

    x: int
    y: int
    width: int
    height: int


@dataclass
class PreprocessingResult:
    """Result of VIN preprocessing."""

    image_bytes: bytes
    bounding_box: Optional[BoundingBox] = None
    preprocessing_applied: list[str] = None

    def __post_init__(self) -> None:
        if self.preprocessing_applied is None:
            self.preprocessing_applied = []


class VinPreprocessor:
    """VIN-optimized image preprocessing for improved OCR accuracy."""

    def preprocess(
        self,
        image_bytes: bytes,
        apply_clahe: bool = True,
        apply_deskew: bool = True,
        apply_denoise: bool = True,
        apply_threshold: bool = True,
    ) -> PreprocessingResult:
        """
        Apply VIN-optimized preprocessing pipeline.

        Pipeline:
        1. HEIC conversion (if needed)
        2. Grayscale conversion
        3. Deskew (correct rotation/tilt)
        4. Contrast enhancement (CLAHE)
        5. Noise reduction (fastNlMeansDenoising)
        6. Adaptive thresholding

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            apply_clahe: Apply CLAHE contrast enhancement
            apply_deskew: Apply deskew correction
            apply_denoise: Apply noise reduction
            apply_threshold: Apply adaptive thresholding

        Returns:
            PreprocessingResult with processed image bytes
        """
        steps_applied = []

        # Load image with PIL (handles HEIC via pillow-heif)
        pil_image = Image.open(io.BytesIO(image_bytes))
        steps_applied.append("loaded")

        # Convert to RGB if needed
        if pil_image.mode not in ("RGB", "L"):
            pil_image = pil_image.convert("RGB")
            steps_applied.append("convert_rgb")

        # Convert to OpenCV format
        cv_image = np.array(pil_image)
        if len(cv_image.shape) == 3:
            cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)

        # Convert to grayscale using best-contrast channel selection
        if len(cv_image.shape) == 3:
            gray = self._best_contrast_channel(cv_image)
        else:
            gray = cv_image
        steps_applied.append("grayscale")

        # Upscale small images for better OCR (Tesseract needs ~300 DPI)
        gray = self._ensure_minimum_resolution(gray)
        steps_applied.append("resolution_check")

        # Apply deskew
        if apply_deskew:
            gray = self._deskew(gray)
            steps_applied.append("deskew")

        # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
        if apply_clahe:
            gray = self._apply_clahe(gray)
            steps_applied.append("clahe")

        # Apply denoising
        if apply_denoise:
            gray = self._denoise(gray)
            steps_applied.append("denoise")

        # Apply adaptive thresholding
        if apply_threshold:
            gray = self._adaptive_threshold(gray)
            gray = self._morphological_cleanup(gray)
            steps_applied.append("threshold")

        # Convert back to PNG bytes
        result_image = Image.fromarray(gray)
        buffer = io.BytesIO()
        result_image.save(buffer, format="PNG")

        return PreprocessingResult(
            image_bytes=buffer.getvalue(),
            preprocessing_applied=steps_applied,
        )

    # Minimum width in pixels for reliable VIN OCR.
    # A 17-char VIN needs ~30px per character for Tesseract accuracy.
    MIN_WIDTH_FOR_VIN = 600

    def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray:
        """
        Upscale image if too small for reliable OCR.

        Tesseract works best at ~300 DPI. Mobile photos of VINs may have
        the text occupy only a small portion of the frame, resulting in
        low effective resolution for the VIN characters.
        """
        height, width = image.shape[:2]
        if width < self.MIN_WIDTH_FOR_VIN:
            scale = self.MIN_WIDTH_FOR_VIN / width
            new_width = int(width * scale)
            new_height = int(height * scale)
            image = cv2.resize(
                image, (new_width, new_height), interpolation=cv2.INTER_CUBIC
            )
            logger.debug(f"Upscaled image from {width}x{height} to {new_width}x{new_height}")
        return image

    def _best_contrast_channel(self, bgr_image: np.ndarray) -> np.ndarray:
        """
        Compute a grayscale image that maximizes text-to-background contrast.

        Uses per-pixel minimum across B, G, R channels.  White text has
        min(255,255,255) = 255 regardless of channel, while any colored
        background has a low value in at least one channel (e.g. green
        sticker: min(130,230,150) = 130).  This gives ~125 units of
        contrast vs ~60 from standard grayscale.

        Falls back to standard grayscale when the min-channel doesn't
        improve contrast (i.e. for already-neutral/gray images).
        """
        b_channel, g_channel, r_channel = cv2.split(bgr_image)

        min_channel = np.minimum(np.minimum(b_channel, g_channel), r_channel)

        min_std = float(np.std(min_channel))
        gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
        gray_std = float(np.std(gray))

        # Use min-channel when it provides meaningfully more contrast
        if min_std > gray_std * 1.1:
            logger.debug(
                "Using min-channel (std=%.1f) over grayscale (std=%.1f)",
                min_std, gray_std,
            )
            return min_channel

        return gray

    def _apply_clahe(self, image: np.ndarray) -> np.ndarray:
        """
        Apply CLAHE (Contrast Limited Adaptive Histogram Equalization).

        CLAHE improves contrast in images with varying illumination,
        which is common in VIN photos taken in different lighting conditions.
        """
        try:
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            return clahe.apply(image)
        except cv2.error as e:
            logger.warning(f"CLAHE failed: {e}")
            return image

    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """
        Correct image rotation using Hough transform line detection.

        VIN plates/stickers are often photographed at slight angles.
        """
        try:
            # Detect edges
            edges = cv2.Canny(image, 50, 150, apertureSize=3)

            # Detect lines
            lines = cv2.HoughLinesP(
                edges,
                rho=1,
                theta=np.pi / 180,
                threshold=100,
                minLineLength=100,
                maxLineGap=10,
            )

            if lines is None:
                return image

            # Calculate angles of detected lines
            angles = []
            for line in lines:
                x1, y1, x2, y2 = line[0]
                if x2 - x1 != 0:
                    angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
                    # Only consider nearly horizontal lines
                    if -45 < angle < 45:
                        angles.append(angle)

            if not angles:
                return image

            # Use median angle to avoid outliers
            median_angle = np.median(angles)

            # Only correct if skew is significant but not extreme
            if abs(median_angle) < 0.5 or abs(median_angle) > 20:
                return image

            # Rotate to correct skew
            height, width = image.shape[:2]
            center = (width // 2, height // 2)
            rotation_matrix = cv2.getRotationMatrix2D(center, median_angle, 1.0)

            # Calculate new bounds
            cos_val = abs(rotation_matrix[0, 0])
            sin_val = abs(rotation_matrix[0, 1])
            new_width = int(height * sin_val + width * cos_val)
            new_height = int(height * cos_val + width * sin_val)

            rotation_matrix[0, 2] += (new_width - width) / 2
            rotation_matrix[1, 2] += (new_height - height) / 2

            rotated = cv2.warpAffine(
                image,
                rotation_matrix,
                (new_width, new_height),
                borderMode=cv2.BORDER_REPLICATE,
            )

            logger.debug(f"Deskewed by {median_angle:.2f} degrees")
            return rotated

        except Exception as e:
            logger.warning(f"Deskew failed: {e}")
            return image

    def _denoise(self, image: np.ndarray) -> np.ndarray:
        """
        Apply non-local means denoising.

        This helps remove noise while preserving VIN character edges.
        """
        try:
            return cv2.fastNlMeansDenoising(
                image, h=10, templateWindowSize=7, searchWindowSize=21
            )
        except cv2.error as e:
            logger.warning(f"Denoising failed: {e}")
            return image

    def _adaptive_threshold(self, image: np.ndarray) -> np.ndarray:
        """
        Apply adaptive thresholding for binarization.

        Adaptive thresholding handles varying illumination across the image,
        which is common in VIN photos.
        """
        try:
            return cv2.adaptiveThreshold(
                image,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                blockSize=11,
                C=2,
            )
        except cv2.error as e:
            logger.warning(f"Adaptive threshold failed: {e}")
            return image

    def _morphological_cleanup(self, image: np.ndarray) -> np.ndarray:
        """
        Remove small noise artifacts from a thresholded binary image.

        Morphological opening (erosion then dilation) removes isolated
        pixels and thin noise lines while preserving larger text characters.
        """
        try:
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
            return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)
        except cv2.error as e:
            logger.warning(f"Morphological cleanup failed: {e}")
            return image

    def _otsu_threshold(self, image: np.ndarray) -> np.ndarray:
        """
        Apply Otsu's thresholding for binarization.

        Otsu's method auto-calculates the optimal threshold value,
        which can work better than adaptive thresholding on evenly-lit images.
        """
        try:
            _, result = cv2.threshold(
                image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )
            return result
        except cv2.error as e:
            logger.warning(f"Otsu threshold failed: {e}")
            return image

    def preprocess_otsu(self, image_bytes: bytes) -> PreprocessingResult:
        """
        Alternative preprocessing pipeline using Otsu's thresholding.

        Used as a fallback when adaptive thresholding doesn't produce
        good OCR results.
        """
        steps_applied = []

        pil_image = Image.open(io.BytesIO(image_bytes))
        steps_applied.append("loaded")

        if pil_image.mode not in ("RGB", "L"):
            pil_image = pil_image.convert("RGB")
            steps_applied.append("convert_rgb")

        cv_image = np.array(pil_image)
        if len(cv_image.shape) == 3:
            cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)

        if len(cv_image.shape) == 3:
            gray = self._best_contrast_channel(cv_image)
        else:
            gray = cv_image
        steps_applied.append("grayscale")

        gray = self._ensure_minimum_resolution(gray)
        steps_applied.append("resolution_check")

        gray = self._apply_clahe(gray)
        steps_applied.append("clahe")

        gray = self._denoise(gray)
        steps_applied.append("denoise")

        gray = self._otsu_threshold(gray)
        gray = self._morphological_cleanup(gray)
        steps_applied.append("otsu_threshold")

        result_image = Image.fromarray(gray)
        buffer = io.BytesIO()
        result_image.save(buffer, format="PNG")

        return PreprocessingResult(
            image_bytes=buffer.getvalue(),
            preprocessing_applied=steps_applied,
        )

    def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]:
        """
        Attempt to detect the VIN region in an image.

        Uses contour detection to find rectangular regions that might contain VINs.

        Args:
            image_bytes: Raw image bytes

        Returns:
            BoundingBox of detected VIN region, or None if not found
        """
        try:
            pil_image = Image.open(io.BytesIO(image_bytes))
            if pil_image.mode != "L":
                pil_image = pil_image.convert("L")

            cv_image = np.array(pil_image)

            # Apply preprocessing for better contour detection
            blurred = cv2.GaussianBlur(cv_image, (5, 5), 0)
            edges = cv2.Canny(blurred, 50, 150)

            # Find contours
            contours, _ = cv2.findContours(
                edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
            )

            if not contours:
                return None

            # Find rectangular contours with appropriate aspect ratio for VIN
            # VIN is typically 17 characters, roughly 5:1 to 10:1 aspect ratio
            vin_candidates = []

            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                if h == 0:
                    continue

                aspect_ratio = w / h
                area = w * h

                # VIN regions typically have:
                # - Aspect ratio between 4:1 and 12:1
                # - Minimum area (to filter out noise)
                if 4 <= aspect_ratio <= 12 and area > 1000:
                    vin_candidates.append((x, y, w, h, area))

            if not vin_candidates:
                return None

            # Return the largest candidate
            vin_candidates.sort(key=lambda c: c[4], reverse=True)
            x, y, w, h, _ = vin_candidates[0]

            return BoundingBox(x=x, y=y, width=w, height=h)

        except Exception as e:
            logger.warning(f"VIN region detection failed: {e}")
            return None


# Singleton instance
vin_preprocessor = VinPreprocessor()