"""VIN-optimized image preprocessing pipeline.""" import io import logging from dataclasses import dataclass from typing import Optional import cv2 import numpy as np from PIL import Image from pillow_heif import register_heif_opener # Register HEIF/HEIC opener register_heif_opener() logger = logging.getLogger(__name__) @dataclass class BoundingBox: """Represents a region in an image.""" x: int y: int width: int height: int @dataclass class PreprocessingResult: """Result of VIN preprocessing.""" image_bytes: bytes bounding_box: Optional[BoundingBox] = None preprocessing_applied: list[str] = None def __post_init__(self) -> None: if self.preprocessing_applied is None: self.preprocessing_applied = [] class VinPreprocessor: """VIN-optimized image preprocessing for improved OCR accuracy.""" def preprocess( self, image_bytes: bytes, apply_clahe: bool = True, apply_deskew: bool = True, apply_denoise: bool = True, apply_threshold: bool = True, ) -> PreprocessingResult: """ Apply VIN-optimized preprocessing pipeline. Pipeline: 1. HEIC conversion (if needed) 2. Grayscale conversion 3. Deskew (correct rotation/tilt) 4. Contrast enhancement (CLAHE) 5. Noise reduction (fastNlMeansDenoising) 6. Adaptive thresholding Args: image_bytes: Raw image bytes (HEIC, JPEG, PNG) apply_clahe: Apply CLAHE contrast enhancement apply_deskew: Apply deskew correction apply_denoise: Apply noise reduction apply_threshold: Apply adaptive thresholding Returns: PreprocessingResult with processed image bytes """ steps_applied = [] # Load image with PIL (handles HEIC via pillow-heif) pil_image = Image.open(io.BytesIO(image_bytes)) steps_applied.append("loaded") # Convert to RGB if needed if pil_image.mode not in ("RGB", "L"): pil_image = pil_image.convert("RGB") steps_applied.append("convert_rgb") # Convert to OpenCV format cv_image = np.array(pil_image) if len(cv_image.shape) == 3: cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR) # Convert to grayscale using best-contrast channel selection if len(cv_image.shape) == 3: gray = self._best_contrast_channel(cv_image) else: gray = cv_image steps_applied.append("grayscale") # Upscale small images for better OCR (~300 DPI recommended) gray = self._ensure_minimum_resolution(gray) steps_applied.append("resolution_check") # Apply deskew if apply_deskew: gray = self._deskew(gray) steps_applied.append("deskew") # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) if apply_clahe: gray = self._apply_clahe(gray) steps_applied.append("clahe") # Apply denoising if apply_denoise: gray = self._denoise(gray) steps_applied.append("denoise") # Apply adaptive thresholding if apply_threshold: gray = self._adaptive_threshold(gray) gray = self._morphological_cleanup(gray) steps_applied.append("threshold") # Convert back to PNG bytes result_image = Image.fromarray(gray) buffer = io.BytesIO() result_image.save(buffer, format="PNG") return PreprocessingResult( image_bytes=buffer.getvalue(), preprocessing_applied=steps_applied, ) # Minimum width in pixels for reliable VIN OCR. # A 17-char VIN needs ~30px per character for reliable OCR accuracy. MIN_WIDTH_FOR_VIN = 600 def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray: """ Upscale image if too small for reliable OCR. OCR works best at ~300 DPI. Mobile photos of VINs may have the text occupy only a small portion of the frame, resulting in low effective resolution for the VIN characters. """ height, width = image.shape[:2] if width < self.MIN_WIDTH_FOR_VIN: scale = self.MIN_WIDTH_FOR_VIN / width new_width = int(width * scale) new_height = int(height * scale) image = cv2.resize( image, (new_width, new_height), interpolation=cv2.INTER_CUBIC ) logger.debug(f"Upscaled image from {width}x{height} to {new_width}x{new_height}") return image def _best_contrast_channel(self, bgr_image: np.ndarray) -> np.ndarray: """ Compute a grayscale image with dark text on light background. Uses inverted per-pixel minimum across B, G, R channels. White text has min(255,255,255) = 255 → inverted to 0 (black). Colored backgrounds have a low min value (e.g. green sticker: min(130,230,150) = 130) → inverted to 125 (medium gray). The inversion ensures the OCR engine always receives dark-text-on- light-background, which is the polarity it expects. """ b_channel, g_channel, r_channel = cv2.split(bgr_image) min_channel = np.minimum(np.minimum(b_channel, g_channel), r_channel) # Invert so white text (min=255) becomes black (0) and colored # backgrounds (min~130) become lighter gray (~125). OCR engines # expect dark text on light background. inverted = cv2.bitwise_not(min_channel) gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY) logger.debug( "Channel contrast: inverted-min std=%.1f, grayscale std=%.1f", float(np.std(inverted)), float(np.std(gray)), ) return inverted def _apply_clahe(self, image: np.ndarray) -> np.ndarray: """ Apply CLAHE (Contrast Limited Adaptive Histogram Equalization). CLAHE improves contrast in images with varying illumination, which is common in VIN photos taken in different lighting conditions. """ try: clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) return clahe.apply(image) except cv2.error as e: logger.warning(f"CLAHE failed: {e}") return image def _deskew(self, image: np.ndarray) -> np.ndarray: """ Correct image rotation using Hough transform line detection. VIN plates/stickers are often photographed at slight angles. """ try: # Detect edges edges = cv2.Canny(image, 50, 150, apertureSize=3) # Detect lines lines = cv2.HoughLinesP( edges, rho=1, theta=np.pi / 180, threshold=100, minLineLength=100, maxLineGap=10, ) if lines is None: return image # Calculate angles of detected lines angles = [] for line in lines: x1, y1, x2, y2 = line[0] if x2 - x1 != 0: angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi # Only consider nearly horizontal lines if -45 < angle < 45: angles.append(angle) if not angles: return image # Use median angle to avoid outliers median_angle = np.median(angles) # Only correct if skew is significant but not extreme if abs(median_angle) < 0.5 or abs(median_angle) > 20: return image # Rotate to correct skew height, width = image.shape[:2] center = (width // 2, height // 2) rotation_matrix = cv2.getRotationMatrix2D(center, median_angle, 1.0) # Calculate new bounds cos_val = abs(rotation_matrix[0, 0]) sin_val = abs(rotation_matrix[0, 1]) new_width = int(height * sin_val + width * cos_val) new_height = int(height * cos_val + width * sin_val) rotation_matrix[0, 2] += (new_width - width) / 2 rotation_matrix[1, 2] += (new_height - height) / 2 rotated = cv2.warpAffine( image, rotation_matrix, (new_width, new_height), borderMode=cv2.BORDER_REPLICATE, ) logger.debug(f"Deskewed by {median_angle:.2f} degrees") return rotated except Exception as e: logger.warning(f"Deskew failed: {e}") return image def _denoise(self, image: np.ndarray) -> np.ndarray: """ Apply non-local means denoising. This helps remove noise while preserving VIN character edges. """ try: return cv2.fastNlMeansDenoising( image, h=10, templateWindowSize=7, searchWindowSize=21 ) except cv2.error as e: logger.warning(f"Denoising failed: {e}") return image def _adaptive_threshold(self, image: np.ndarray) -> np.ndarray: """ Apply adaptive thresholding for binarization. Adaptive thresholding handles varying illumination across the image, which is common in VIN photos. """ try: return cv2.adaptiveThreshold( image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blockSize=11, C=2, ) except cv2.error as e: logger.warning(f"Adaptive threshold failed: {e}") return image def _morphological_cleanup(self, image: np.ndarray) -> np.ndarray: """ Remove small noise artifacts from a thresholded binary image. Morphological opening (erosion then dilation) removes isolated pixels and thin noise lines while preserving larger text characters. """ try: kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)) return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel) except cv2.error as e: logger.warning(f"Morphological cleanup failed: {e}") return image def _otsu_threshold(self, image: np.ndarray) -> np.ndarray: """ Apply Otsu's thresholding for binarization. Otsu's method auto-calculates the optimal threshold value, which can work better than adaptive thresholding on evenly-lit images. """ try: _, result = cv2.threshold( image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU ) return result except cv2.error as e: logger.warning(f"Otsu threshold failed: {e}") return image def preprocess_otsu(self, image_bytes: bytes) -> PreprocessingResult: """ Alternative preprocessing pipeline using Otsu's thresholding. Used as a fallback when adaptive thresholding doesn't produce good OCR results. """ steps_applied = [] pil_image = Image.open(io.BytesIO(image_bytes)) steps_applied.append("loaded") if pil_image.mode not in ("RGB", "L"): pil_image = pil_image.convert("RGB") steps_applied.append("convert_rgb") cv_image = np.array(pil_image) if len(cv_image.shape) == 3: cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR) if len(cv_image.shape) == 3: gray = self._best_contrast_channel(cv_image) else: gray = cv_image steps_applied.append("grayscale") gray = self._ensure_minimum_resolution(gray) steps_applied.append("resolution_check") gray = self._apply_clahe(gray) steps_applied.append("clahe") gray = self._denoise(gray) steps_applied.append("denoise") gray = self._otsu_threshold(gray) gray = self._morphological_cleanup(gray) steps_applied.append("otsu_threshold") result_image = Image.fromarray(gray) buffer = io.BytesIO() result_image.save(buffer, format="PNG") return PreprocessingResult( image_bytes=buffer.getvalue(), preprocessing_applied=steps_applied, ) def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]: """ Attempt to detect the VIN region in an image. Uses contour detection to find rectangular regions that might contain VINs. Args: image_bytes: Raw image bytes Returns: BoundingBox of detected VIN region, or None if not found """ try: pil_image = Image.open(io.BytesIO(image_bytes)) if pil_image.mode != "L": pil_image = pil_image.convert("L") cv_image = np.array(pil_image) # Apply preprocessing for better contour detection blurred = cv2.GaussianBlur(cv_image, (5, 5), 0) edges = cv2.Canny(blurred, 50, 150) # Find contours contours, _ = cv2.findContours( edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE ) if not contours: return None # Find rectangular contours with appropriate aspect ratio for VIN # VIN is typically 17 characters, roughly 5:1 to 10:1 aspect ratio vin_candidates = [] for contour in contours: x, y, w, h = cv2.boundingRect(contour) if h == 0: continue aspect_ratio = w / h area = w * h # VIN regions typically have: # - Aspect ratio between 4:1 and 12:1 # - Minimum area (to filter out noise) if 4 <= aspect_ratio <= 12 and area > 1000: vin_candidates.append((x, y, w, h, area)) if not vin_candidates: return None # Return the largest candidate vin_candidates.sort(key=lambda c: c[4], reverse=True) x, y, w, h, _ = vin_candidates[0] return BoundingBox(x=x, y=y, width=w, height=h) except Exception as e: logger.warning(f"VIN region detection failed: {e}") return None # Singleton instance vin_preprocessor = VinPreprocessor()