"""VIN-optimized image preprocessing pipeline.""" import io import logging from dataclasses import dataclass from typing import Optional import cv2 import numpy as np from PIL import Image from pillow_heif import register_heif_opener # Register HEIF/HEIC opener register_heif_opener() logger = logging.getLogger(__name__) @dataclass class BoundingBox: """Represents a region in an image.""" x: int y: int width: int height: int @dataclass class PreprocessingResult: """Result of VIN preprocessing.""" image_bytes: bytes bounding_box: Optional[BoundingBox] = None preprocessing_applied: list[str] = None def __post_init__(self) -> None: if self.preprocessing_applied is None: self.preprocessing_applied = [] class VinPreprocessor: """VIN-optimized image preprocessing for improved OCR accuracy.""" def preprocess( self, image_bytes: bytes, apply_clahe: bool = True, apply_deskew: bool = True, apply_denoise: bool = True, apply_threshold: bool = True, ) -> PreprocessingResult: """ Apply VIN-optimized preprocessing pipeline. Pipeline: 1. HEIC conversion (if needed) 2. Grayscale conversion 3. Deskew (correct rotation/tilt) 4. Contrast enhancement (CLAHE) 5. Noise reduction (fastNlMeansDenoising) 6. Adaptive thresholding Args: image_bytes: Raw image bytes (HEIC, JPEG, PNG) apply_clahe: Apply CLAHE contrast enhancement apply_deskew: Apply deskew correction apply_denoise: Apply noise reduction apply_threshold: Apply adaptive thresholding Returns: PreprocessingResult with processed image bytes """ steps_applied = [] # Load image with PIL (handles HEIC via pillow-heif) pil_image = Image.open(io.BytesIO(image_bytes)) steps_applied.append("loaded") # Convert to RGB if needed if pil_image.mode not in ("RGB", "L"): pil_image = pil_image.convert("RGB") steps_applied.append("convert_rgb") # Convert to OpenCV format cv_image = np.array(pil_image) if len(cv_image.shape) == 3: cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR) # Convert to grayscale using best-contrast channel selection if len(cv_image.shape) == 3: gray = self._best_contrast_channel(cv_image) else: gray = cv_image steps_applied.append("grayscale") # Upscale small images for better OCR (Tesseract needs ~300 DPI) gray = self._ensure_minimum_resolution(gray) steps_applied.append("resolution_check") # Apply deskew if apply_deskew: gray = self._deskew(gray) steps_applied.append("deskew") # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) if apply_clahe: gray = self._apply_clahe(gray) steps_applied.append("clahe") # Apply denoising if apply_denoise: gray = self._denoise(gray) steps_applied.append("denoise") # Apply adaptive thresholding if apply_threshold: gray = self._adaptive_threshold(gray) steps_applied.append("threshold") # Convert back to PNG bytes result_image = Image.fromarray(gray) buffer = io.BytesIO() result_image.save(buffer, format="PNG") return PreprocessingResult( image_bytes=buffer.getvalue(), preprocessing_applied=steps_applied, ) # Minimum width in pixels for reliable VIN OCR. # A 17-char VIN needs ~30px per character for Tesseract accuracy. MIN_WIDTH_FOR_VIN = 600 def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray: """ Upscale image if too small for reliable OCR. Tesseract works best at ~300 DPI. Mobile photos of VINs may have the text occupy only a small portion of the frame, resulting in low effective resolution for the VIN characters. """ height, width = image.shape[:2] if width < self.MIN_WIDTH_FOR_VIN: scale = self.MIN_WIDTH_FOR_VIN / width new_width = int(width * scale) new_height = int(height * scale) image = cv2.resize( image, (new_width, new_height), interpolation=cv2.INTER_CUBIC ) logger.debug(f"Upscaled image from {width}x{height} to {new_width}x{new_height}") return image def _best_contrast_channel(self, bgr_image: np.ndarray) -> np.ndarray: """ Select the single color channel with the highest contrast. Standard grayscale conversion (0.299R + 0.587G + 0.114B) averages channels, which destroys contrast when text and background differ primarily in one channel. For example, white text on a green VIN sticker has almost identical luminance, but the blue and red channels show strong contrast. This method evaluates each BGR channel by its standard deviation (a proxy for contrast) and returns the one with the highest value. Falls back to standard grayscale when all channels are similar. """ b_channel, g_channel, r_channel = cv2.split(bgr_image) stds = [ float(np.std(b_channel)), float(np.std(g_channel)), float(np.std(r_channel)), ] channels = [b_channel, g_channel, r_channel] channel_names = ["blue", "green", "red"] best_idx = int(np.argmax(stds)) max_std = stds[best_idx] min_std = min(stds) # Only use single-channel extraction when one channel is notably # better (>20% higher std than the weakest). Otherwise, standard # grayscale is fine and more robust for neutral-colored images. if max_std > 0 and (max_std - min_std) / max_std > 0.20: logger.debug( "Using %s channel (std=%.1f) over grayscale (stds: B=%.1f G=%.1f R=%.1f)", channel_names[best_idx], max_std, stds[0], stds[1], stds[2], ) return channels[best_idx] return cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY) def _apply_clahe(self, image: np.ndarray) -> np.ndarray: """ Apply CLAHE (Contrast Limited Adaptive Histogram Equalization). CLAHE improves contrast in images with varying illumination, which is common in VIN photos taken in different lighting conditions. """ try: clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) return clahe.apply(image) except cv2.error as e: logger.warning(f"CLAHE failed: {e}") return image def _deskew(self, image: np.ndarray) -> np.ndarray: """ Correct image rotation using Hough transform line detection. VIN plates/stickers are often photographed at slight angles. """ try: # Detect edges edges = cv2.Canny(image, 50, 150, apertureSize=3) # Detect lines lines = cv2.HoughLinesP( edges, rho=1, theta=np.pi / 180, threshold=100, minLineLength=100, maxLineGap=10, ) if lines is None: return image # Calculate angles of detected lines angles = [] for line in lines: x1, y1, x2, y2 = line[0] if x2 - x1 != 0: angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi # Only consider nearly horizontal lines if -45 < angle < 45: angles.append(angle) if not angles: return image # Use median angle to avoid outliers median_angle = np.median(angles) # Only correct if skew is significant but not extreme if abs(median_angle) < 0.5 or abs(median_angle) > 20: return image # Rotate to correct skew height, width = image.shape[:2] center = (width // 2, height // 2) rotation_matrix = cv2.getRotationMatrix2D(center, median_angle, 1.0) # Calculate new bounds cos_val = abs(rotation_matrix[0, 0]) sin_val = abs(rotation_matrix[0, 1]) new_width = int(height * sin_val + width * cos_val) new_height = int(height * cos_val + width * sin_val) rotation_matrix[0, 2] += (new_width - width) / 2 rotation_matrix[1, 2] += (new_height - height) / 2 rotated = cv2.warpAffine( image, rotation_matrix, (new_width, new_height), borderMode=cv2.BORDER_REPLICATE, ) logger.debug(f"Deskewed by {median_angle:.2f} degrees") return rotated except Exception as e: logger.warning(f"Deskew failed: {e}") return image def _denoise(self, image: np.ndarray) -> np.ndarray: """ Apply non-local means denoising. This helps remove noise while preserving VIN character edges. """ try: return cv2.fastNlMeansDenoising( image, h=10, templateWindowSize=7, searchWindowSize=21 ) except cv2.error as e: logger.warning(f"Denoising failed: {e}") return image def _adaptive_threshold(self, image: np.ndarray) -> np.ndarray: """ Apply adaptive thresholding for binarization. Adaptive thresholding handles varying illumination across the image, which is common in VIN photos. """ try: return cv2.adaptiveThreshold( image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blockSize=11, C=2, ) except cv2.error as e: logger.warning(f"Adaptive threshold failed: {e}") return image def _otsu_threshold(self, image: np.ndarray) -> np.ndarray: """ Apply Otsu's thresholding for binarization. Otsu's method auto-calculates the optimal threshold value, which can work better than adaptive thresholding on evenly-lit images. """ try: _, result = cv2.threshold( image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU ) return result except cv2.error as e: logger.warning(f"Otsu threshold failed: {e}") return image def preprocess_otsu(self, image_bytes: bytes) -> PreprocessingResult: """ Alternative preprocessing pipeline using Otsu's thresholding. Used as a fallback when adaptive thresholding doesn't produce good OCR results. """ steps_applied = [] pil_image = Image.open(io.BytesIO(image_bytes)) steps_applied.append("loaded") if pil_image.mode not in ("RGB", "L"): pil_image = pil_image.convert("RGB") steps_applied.append("convert_rgb") cv_image = np.array(pil_image) if len(cv_image.shape) == 3: cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR) if len(cv_image.shape) == 3: gray = self._best_contrast_channel(cv_image) else: gray = cv_image steps_applied.append("grayscale") gray = self._ensure_minimum_resolution(gray) steps_applied.append("resolution_check") gray = self._apply_clahe(gray) steps_applied.append("clahe") gray = self._denoise(gray) steps_applied.append("denoise") gray = self._otsu_threshold(gray) steps_applied.append("otsu_threshold") result_image = Image.fromarray(gray) buffer = io.BytesIO() result_image.save(buffer, format="PNG") return PreprocessingResult( image_bytes=buffer.getvalue(), preprocessing_applied=steps_applied, ) def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]: """ Attempt to detect the VIN region in an image. Uses contour detection to find rectangular regions that might contain VINs. Args: image_bytes: Raw image bytes Returns: BoundingBox of detected VIN region, or None if not found """ try: pil_image = Image.open(io.BytesIO(image_bytes)) if pil_image.mode != "L": pil_image = pil_image.convert("L") cv_image = np.array(pil_image) # Apply preprocessing for better contour detection blurred = cv2.GaussianBlur(cv_image, (5, 5), 0) edges = cv2.Canny(blurred, 50, 150) # Find contours contours, _ = cv2.findContours( edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE ) if not contours: return None # Find rectangular contours with appropriate aspect ratio for VIN # VIN is typically 17 characters, roughly 5:1 to 10:1 aspect ratio vin_candidates = [] for contour in contours: x, y, w, h = cv2.boundingRect(contour) if h == 0: continue aspect_ratio = w / h area = w * h # VIN regions typically have: # - Aspect ratio between 4:1 and 12:1 # - Minimum area (to filter out noise) if 4 <= aspect_ratio <= 12 and area > 1000: vin_candidates.append((x, y, w, h, area)) if not vin_candidates: return None # Return the largest candidate vin_candidates.sort(key=lambda c: c[4], reverse=True) x, y, w, h, _ = vin_candidates[0] return BoundingBox(x=x, y=y, width=w, height=h) except Exception as e: logger.warning(f"VIN region detection failed: {e}") return None # Singleton instance vin_preprocessor = VinPreprocessor()