Files
motovaultpro/ocr/app/preprocessors/vin_preprocessor.py
Eric Gullickson 0de34983bb
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 36s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 1m7s
Deploy to Staging / Verify Staging (pull_request) Successful in 10s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 9s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
fix: use best-contrast color channel for VIN preprocessing (refs #113)
White text on green VIN stickers has only ~12% contrast in standard
grayscale conversion because the green channel dominates luminance.
The new _best_contrast_channel method evaluates each RGB channel's
standard deviation and selects the one with highest contrast, giving
~2x improvement for green-tinted VIN stickers. Falls back to standard
grayscale for neutral-colored images.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 21:14:56 -06:00

440 lines
14 KiB
Python

"""VIN-optimized image preprocessing pipeline."""
import io
import logging
from dataclasses import dataclass
from typing import Optional
import cv2
import numpy as np
from PIL import Image
from pillow_heif import register_heif_opener
# Register HEIF/HEIC opener
register_heif_opener()
logger = logging.getLogger(__name__)
@dataclass
class BoundingBox:
    """Represents a rectangular region in an image.

    Coordinates are in pixels, with (x, y) the top-left corner of the
    region, matching the convention of cv2.boundingRect (which produces
    these values in detect_vin_region).
    """

    x: int  # left edge, pixels
    y: int  # top edge, pixels
    width: int  # region width, pixels
    height: int  # region height, pixels
@dataclass
class PreprocessingResult:
    """Result of VIN preprocessing.

    Attributes:
        image_bytes: Bytes of the processed image (PNG in this module's
            pipelines).
        bounding_box: Detected VIN region, if any.
        preprocessing_applied: Ordered names of the pipeline steps that ran.
    """

    image_bytes: bytes
    bounding_box: Optional[BoundingBox] = None
    # None default avoids the shared-mutable-default pitfall; normalized
    # to a fresh list in __post_init__.
    preprocessing_applied: Optional[list[str]] = None

    def __post_init__(self) -> None:
        # Normalize None to an empty list so callers can always append/iterate.
        if self.preprocessing_applied is None:
            self.preprocessing_applied = []
class VinPreprocessor:
    """VIN-optimized image preprocessing for improved OCR accuracy.

    Stateless: no instance attributes; every method operates only on the
    image data passed in, so a single shared instance is safe to reuse.
    """
def preprocess(
self,
image_bytes: bytes,
apply_clahe: bool = True,
apply_deskew: bool = True,
apply_denoise: bool = True,
apply_threshold: bool = True,
) -> PreprocessingResult:
"""
Apply VIN-optimized preprocessing pipeline.
Pipeline:
1. HEIC conversion (if needed)
2. Grayscale conversion
3. Deskew (correct rotation/tilt)
4. Contrast enhancement (CLAHE)
5. Noise reduction (fastNlMeansDenoising)
6. Adaptive thresholding
Args:
image_bytes: Raw image bytes (HEIC, JPEG, PNG)
apply_clahe: Apply CLAHE contrast enhancement
apply_deskew: Apply deskew correction
apply_denoise: Apply noise reduction
apply_threshold: Apply adaptive thresholding
Returns:
PreprocessingResult with processed image bytes
"""
steps_applied = []
# Load image with PIL (handles HEIC via pillow-heif)
pil_image = Image.open(io.BytesIO(image_bytes))
steps_applied.append("loaded")
# Convert to RGB if needed
if pil_image.mode not in ("RGB", "L"):
pil_image = pil_image.convert("RGB")
steps_applied.append("convert_rgb")
# Convert to OpenCV format
cv_image = np.array(pil_image)
if len(cv_image.shape) == 3:
cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
# Convert to grayscale using best-contrast channel selection
if len(cv_image.shape) == 3:
gray = self._best_contrast_channel(cv_image)
else:
gray = cv_image
steps_applied.append("grayscale")
# Upscale small images for better OCR (Tesseract needs ~300 DPI)
gray = self._ensure_minimum_resolution(gray)
steps_applied.append("resolution_check")
# Apply deskew
if apply_deskew:
gray = self._deskew(gray)
steps_applied.append("deskew")
# Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
if apply_clahe:
gray = self._apply_clahe(gray)
steps_applied.append("clahe")
# Apply denoising
if apply_denoise:
gray = self._denoise(gray)
steps_applied.append("denoise")
# Apply adaptive thresholding
if apply_threshold:
gray = self._adaptive_threshold(gray)
steps_applied.append("threshold")
# Convert back to PNG bytes
result_image = Image.fromarray(gray)
buffer = io.BytesIO()
result_image.save(buffer, format="PNG")
return PreprocessingResult(
image_bytes=buffer.getvalue(),
preprocessing_applied=steps_applied,
)
# Minimum width in pixels for reliable VIN OCR.
# A 17-char VIN needs ~30px per character for Tesseract accuracy.
MIN_WIDTH_FOR_VIN = 600
def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray:
"""
Upscale image if too small for reliable OCR.
Tesseract works best at ~300 DPI. Mobile photos of VINs may have
the text occupy only a small portion of the frame, resulting in
low effective resolution for the VIN characters.
"""
height, width = image.shape[:2]
if width < self.MIN_WIDTH_FOR_VIN:
scale = self.MIN_WIDTH_FOR_VIN / width
new_width = int(width * scale)
new_height = int(height * scale)
image = cv2.resize(
image, (new_width, new_height), interpolation=cv2.INTER_CUBIC
)
logger.debug(f"Upscaled image from {width}x{height} to {new_width}x{new_height}")
return image
def _best_contrast_channel(self, bgr_image: np.ndarray) -> np.ndarray:
"""
Select the single color channel with the highest contrast.
Standard grayscale conversion (0.299R + 0.587G + 0.114B) averages
channels, which destroys contrast when text and background differ
primarily in one channel. For example, white text on a green VIN
sticker has almost identical luminance, but the blue and red channels
show strong contrast.
This method evaluates each BGR channel by its standard deviation
(a proxy for contrast) and returns the one with the highest value.
Falls back to standard grayscale when all channels are similar.
"""
b_channel, g_channel, r_channel = cv2.split(bgr_image)
stds = [
float(np.std(b_channel)),
float(np.std(g_channel)),
float(np.std(r_channel)),
]
channels = [b_channel, g_channel, r_channel]
channel_names = ["blue", "green", "red"]
best_idx = int(np.argmax(stds))
max_std = stds[best_idx]
min_std = min(stds)
# Only use single-channel extraction when one channel is notably
# better (>20% higher std than the weakest). Otherwise, standard
# grayscale is fine and more robust for neutral-colored images.
if max_std > 0 and (max_std - min_std) / max_std > 0.20:
logger.debug(
"Using %s channel (std=%.1f) over grayscale (stds: B=%.1f G=%.1f R=%.1f)",
channel_names[best_idx], max_std, stds[0], stds[1], stds[2],
)
return channels[best_idx]
return cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
def _apply_clahe(self, image: np.ndarray) -> np.ndarray:
"""
Apply CLAHE (Contrast Limited Adaptive Histogram Equalization).
CLAHE improves contrast in images with varying illumination,
which is common in VIN photos taken in different lighting conditions.
"""
try:
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
return clahe.apply(image)
except cv2.error as e:
logger.warning(f"CLAHE failed: {e}")
return image
    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """
        Correct image rotation using Hough transform line detection.

        VIN plates/stickers are often photographed at slight angles.
        Detects near-horizontal line segments, takes their median angle,
        and rotates the image to cancel it. Returns the input unchanged
        when no usable lines are found, the skew is negligible or extreme,
        or any OpenCV step fails.
        """
        try:
            # Detect edges
            edges = cv2.Canny(image, 50, 150, apertureSize=3)
            # Detect lines (probabilistic Hough: returns segment endpoints)
            lines = cv2.HoughLinesP(
                edges,
                rho=1,
                theta=np.pi / 180,
                threshold=100,
                minLineLength=100,
                maxLineGap=10,
            )
            if lines is None:
                return image
            # Calculate angles of detected lines
            angles = []
            for line in lines:
                x1, y1, x2, y2 = line[0]
                # Skip vertical segments to avoid division-by-zero-style
                # degenerate angles.
                if x2 - x1 != 0:
                    angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
                    # Only consider nearly horizontal lines — VIN text
                    # baselines, not vertical plate edges.
                    if -45 < angle < 45:
                        angles.append(angle)
            if not angles:
                return image
            # Use median angle to avoid outliers
            median_angle = np.median(angles)
            # Only correct if skew is significant but not extreme:
            # < 0.5 deg isn't worth the resampling blur; > 20 deg is more
            # likely a misdetection than real camera tilt.
            if abs(median_angle) < 0.5 or abs(median_angle) > 20:
                return image
            # Rotate to correct skew
            height, width = image.shape[:2]
            center = (width // 2, height // 2)
            rotation_matrix = cv2.getRotationMatrix2D(center, median_angle, 1.0)
            # Calculate new bounds so the rotated image isn't cropped:
            # expand the canvas to the rotated bounding box and shift the
            # translation terms to re-center the content.
            cos_val = abs(rotation_matrix[0, 0])
            sin_val = abs(rotation_matrix[0, 1])
            new_width = int(height * sin_val + width * cos_val)
            new_height = int(height * cos_val + width * sin_val)
            rotation_matrix[0, 2] += (new_width - width) / 2
            rotation_matrix[1, 2] += (new_height - height) / 2
            # BORDER_REPLICATE avoids hard black corners that could
            # confuse later thresholding/OCR.
            rotated = cv2.warpAffine(
                image,
                rotation_matrix,
                (new_width, new_height),
                borderMode=cv2.BORDER_REPLICATE,
            )
            logger.debug(f"Deskewed by {median_angle:.2f} degrees")
            return rotated
        except Exception as e:
            logger.warning(f"Deskew failed: {e}")
            return image
def _denoise(self, image: np.ndarray) -> np.ndarray:
"""
Apply non-local means denoising.
This helps remove noise while preserving VIN character edges.
"""
try:
return cv2.fastNlMeansDenoising(
image, h=10, templateWindowSize=7, searchWindowSize=21
)
except cv2.error as e:
logger.warning(f"Denoising failed: {e}")
return image
def _adaptive_threshold(self, image: np.ndarray) -> np.ndarray:
"""
Apply adaptive thresholding for binarization.
Adaptive thresholding handles varying illumination across the image,
which is common in VIN photos.
"""
try:
return cv2.adaptiveThreshold(
image,
255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY,
blockSize=11,
C=2,
)
except cv2.error as e:
logger.warning(f"Adaptive threshold failed: {e}")
return image
def _otsu_threshold(self, image: np.ndarray) -> np.ndarray:
"""
Apply Otsu's thresholding for binarization.
Otsu's method auto-calculates the optimal threshold value,
which can work better than adaptive thresholding on evenly-lit images.
"""
try:
_, result = cv2.threshold(
image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
)
return result
except cv2.error as e:
logger.warning(f"Otsu threshold failed: {e}")
return image
def preprocess_otsu(self, image_bytes: bytes) -> PreprocessingResult:
"""
Alternative preprocessing pipeline using Otsu's thresholding.
Used as a fallback when adaptive thresholding doesn't produce
good OCR results.
"""
steps_applied = []
pil_image = Image.open(io.BytesIO(image_bytes))
steps_applied.append("loaded")
if pil_image.mode not in ("RGB", "L"):
pil_image = pil_image.convert("RGB")
steps_applied.append("convert_rgb")
cv_image = np.array(pil_image)
if len(cv_image.shape) == 3:
cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
if len(cv_image.shape) == 3:
gray = self._best_contrast_channel(cv_image)
else:
gray = cv_image
steps_applied.append("grayscale")
gray = self._ensure_minimum_resolution(gray)
steps_applied.append("resolution_check")
gray = self._apply_clahe(gray)
steps_applied.append("clahe")
gray = self._denoise(gray)
steps_applied.append("denoise")
gray = self._otsu_threshold(gray)
steps_applied.append("otsu_threshold")
result_image = Image.fromarray(gray)
buffer = io.BytesIO()
result_image.save(buffer, format="PNG")
return PreprocessingResult(
image_bytes=buffer.getvalue(),
preprocessing_applied=steps_applied,
)
def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]:
"""
Attempt to detect the VIN region in an image.
Uses contour detection to find rectangular regions that might contain VINs.
Args:
image_bytes: Raw image bytes
Returns:
BoundingBox of detected VIN region, or None if not found
"""
try:
pil_image = Image.open(io.BytesIO(image_bytes))
if pil_image.mode != "L":
pil_image = pil_image.convert("L")
cv_image = np.array(pil_image)
# Apply preprocessing for better contour detection
blurred = cv2.GaussianBlur(cv_image, (5, 5), 0)
edges = cv2.Canny(blurred, 50, 150)
# Find contours
contours, _ = cv2.findContours(
edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
)
if not contours:
return None
# Find rectangular contours with appropriate aspect ratio for VIN
# VIN is typically 17 characters, roughly 5:1 to 10:1 aspect ratio
vin_candidates = []
for contour in contours:
x, y, w, h = cv2.boundingRect(contour)
if h == 0:
continue
aspect_ratio = w / h
area = w * h
# VIN regions typically have:
# - Aspect ratio between 4:1 and 12:1
# - Minimum area (to filter out noise)
if 4 <= aspect_ratio <= 12 and area > 1000:
vin_candidates.append((x, y, w, h, area))
if not vin_candidates:
return None
# Return the largest candidate
vin_candidates.sort(key=lambda c: c[4], reverse=True)
x, y, w, h, _ = vin_candidates[0]
return BoundingBox(x=x, y=y, width=w, height=h)
except Exception as e:
logger.warning(f"VIN region detection failed: {e}")
return None
# Module-level singleton: importers share this one (stateless) instance
# instead of constructing their own.
vin_preprocessor = VinPreprocessor()