feat: add VIN photo OCR pipeline (refs #67)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 31s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 31s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement VIN-specific OCR extraction with optimized preprocessing:

- Add POST /extract/vin endpoint for VIN extraction
- VIN preprocessor: CLAHE, deskew, denoise, adaptive threshold
- VIN validator: check digit validation, OCR error correction (I->1, O->0)
- VIN extractor: PSM modes 6/7/8, character whitelist, alternatives
- Response includes confidence, bounding box, and alternatives
- Unit tests for validator and preprocessor
- Integration tests for VIN extraction endpoint

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
10
ocr/app/preprocessors/__init__.py
Normal file
10
ocr/app/preprocessors/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
"""Image preprocessors for OCR optimization."""
|
||||
from app.services.preprocessor import ImagePreprocessor, preprocessor
|
||||
from app.preprocessors.vin_preprocessor import VinPreprocessor, vin_preprocessor
|
||||
|
||||
__all__ = [
|
||||
"ImagePreprocessor",
|
||||
"preprocessor",
|
||||
"VinPreprocessor",
|
||||
"vin_preprocessor",
|
||||
]
|
||||
309
ocr/app/preprocessors/vin_preprocessor.py
Normal file
309
ocr/app/preprocessors/vin_preprocessor.py
Normal file
@@ -0,0 +1,309 @@
|
||||
"""VIN-optimized image preprocessing pipeline."""
|
||||
import io
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from pillow_heif import register_heif_opener
|
||||
|
||||
# Register HEIF/HEIC opener
|
||||
register_heif_opener()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class BoundingBox:
    """Represents a region in an image."""

    # Top-left corner of the region, in pixel coordinates.
    x: int
    y: int
    # Region size in pixels.
    width: int
    height: int
|
||||
|
||||
|
||||
@dataclass
class PreprocessingResult:
    """Result of VIN preprocessing.

    Attributes:
        image_bytes: PNG-encoded bytes of the processed image.
        bounding_box: Detected VIN region, or None if none was detected.
        preprocessing_applied: Names of the pipeline steps that ran, in order.
    """

    image_bytes: bytes
    bounding_box: Optional[BoundingBox] = None
    # default_factory gives each instance its own list (the original
    # `list[str] = None` mis-annotated the field and relied solely on
    # __post_init__ to repair the default).
    preprocessing_applied: Optional[list[str]] = field(default_factory=list)

    def __post_init__(self) -> None:
        # Kept for backward compatibility: callers that explicitly pass
        # None still get an empty list, as before.
        if self.preprocessing_applied is None:
            self.preprocessing_applied = []
|
||||
|
||||
|
||||
class VinPreprocessor:
    """VIN-optimized image preprocessing for improved OCR accuracy."""

    def preprocess(
        self,
        image_bytes: bytes,
        apply_clahe: bool = True,
        apply_deskew: bool = True,
        apply_denoise: bool = True,
        apply_threshold: bool = True,
    ) -> PreprocessingResult:
        """
        Apply VIN-optimized preprocessing pipeline.

        Pipeline:
        1. HEIC conversion (if needed)
        2. Grayscale conversion
        3. Deskew (correct rotation/tilt)
        4. Contrast enhancement (CLAHE)
        5. Noise reduction (fastNlMeansDenoising)
        6. Adaptive thresholding

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            apply_clahe: Apply CLAHE contrast enhancement
            apply_deskew: Apply deskew correction
            apply_denoise: Apply noise reduction
            apply_threshold: Apply adaptive thresholding

        Returns:
            PreprocessingResult with processed image bytes
        """
        steps_applied = []

        # Load image with PIL (handles HEIC via pillow-heif)
        pil_image = Image.open(io.BytesIO(image_bytes))
        steps_applied.append("loaded")

        # Convert to RGB if needed (e.g. RGBA, palette, CMYK modes)
        if pil_image.mode not in ("RGB", "L"):
            pil_image = pil_image.convert("RGB")
            steps_applied.append("convert_rgb")

        # Convert to grayscale. Going RGB -> GRAY directly avoids the
        # redundant RGB -> BGR round-trip the pipeline previously did;
        # OpenCV uses the same luma coefficients either way, so the
        # output is identical with one less full-image copy.
        cv_image = np.array(pil_image)
        if len(cv_image.shape) == 3:
            gray = cv2.cvtColor(cv_image, cv2.COLOR_RGB2GRAY)
        else:
            gray = cv_image
        steps_applied.append("grayscale")

        # Apply deskew
        if apply_deskew:
            gray = self._deskew(gray)
            steps_applied.append("deskew")

        # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
        if apply_clahe:
            gray = self._apply_clahe(gray)
            steps_applied.append("clahe")

        # Apply denoising
        if apply_denoise:
            gray = self._denoise(gray)
            steps_applied.append("denoise")

        # Apply adaptive thresholding
        if apply_threshold:
            gray = self._adaptive_threshold(gray)
            steps_applied.append("threshold")

        # Convert back to PNG bytes (lossless, preserves binarized edges)
        result_image = Image.fromarray(gray)
        buffer = io.BytesIO()
        result_image.save(buffer, format="PNG")

        return PreprocessingResult(
            image_bytes=buffer.getvalue(),
            preprocessing_applied=steps_applied,
        )

    def _apply_clahe(self, image: np.ndarray) -> np.ndarray:
        """
        Apply CLAHE (Contrast Limited Adaptive Histogram Equalization).

        CLAHE improves contrast in images with varying illumination,
        which is common in VIN photos taken in different lighting conditions.
        Returns the input unchanged if OpenCV rejects it.
        """
        try:
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            return clahe.apply(image)
        except cv2.error as e:
            logger.warning(f"CLAHE failed: {e}")
            return image

    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """
        Correct image rotation using Hough transform line detection.

        VIN plates/stickers are often photographed at slight angles.
        Falls back to the unmodified image whenever no usable lines are
        found or the rotation would be too small/large to be worthwhile.
        """
        try:
            # Detect edges
            edges = cv2.Canny(image, 50, 150, apertureSize=3)

            # Detect lines
            lines = cv2.HoughLinesP(
                edges,
                rho=1,
                theta=np.pi / 180,
                threshold=100,
                minLineLength=100,
                maxLineGap=10,
            )

            if lines is None:
                return image

            # Calculate angles of detected lines
            angles = []
            for line in lines:
                x1, y1, x2, y2 = line[0]
                if x2 - x1 != 0:
                    angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
                    # Only consider nearly horizontal lines
                    if -45 < angle < 45:
                        angles.append(angle)

            if not angles:
                return image

            # Use median angle to avoid outliers
            median_angle = np.median(angles)

            # Only correct if skew is significant but not extreme
            # (<0.5 deg: not worth the interpolation blur; >20 deg:
            # likely a mis-detection rather than genuine skew)
            if abs(median_angle) < 0.5 or abs(median_angle) > 20:
                return image

            # Rotate to correct skew
            height, width = image.shape[:2]
            center = (width // 2, height // 2)
            rotation_matrix = cv2.getRotationMatrix2D(center, median_angle, 1.0)

            # Calculate new bounds so the rotated image is not cropped
            cos_val = abs(rotation_matrix[0, 0])
            sin_val = abs(rotation_matrix[0, 1])
            new_width = int(height * sin_val + width * cos_val)
            new_height = int(height * cos_val + width * sin_val)

            # Shift the rotation so the image stays centered in new bounds
            rotation_matrix[0, 2] += (new_width - width) / 2
            rotation_matrix[1, 2] += (new_height - height) / 2

            rotated = cv2.warpAffine(
                image,
                rotation_matrix,
                (new_width, new_height),
                borderMode=cv2.BORDER_REPLICATE,
            )

            logger.debug(f"Deskewed by {median_angle:.2f} degrees")
            return rotated

        except Exception as e:
            # Deskew is best-effort: any failure degrades gracefully to
            # the original image rather than aborting the pipeline.
            logger.warning(f"Deskew failed: {e}")
            return image

    def _denoise(self, image: np.ndarray) -> np.ndarray:
        """
        Apply non-local means denoising.

        This helps remove noise while preserving VIN character edges.
        Returns the input unchanged if OpenCV rejects it.
        """
        try:
            return cv2.fastNlMeansDenoising(
                image, h=10, templateWindowSize=7, searchWindowSize=21
            )
        except cv2.error as e:
            logger.warning(f"Denoising failed: {e}")
            return image

    def _adaptive_threshold(self, image: np.ndarray) -> np.ndarray:
        """
        Apply adaptive thresholding for binarization.

        Adaptive thresholding handles varying illumination across the image,
        which is common in VIN photos. Returns the input unchanged if
        OpenCV rejects it.
        """
        try:
            return cv2.adaptiveThreshold(
                image,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                blockSize=11,
                C=2,
            )
        except cv2.error as e:
            logger.warning(f"Adaptive threshold failed: {e}")
            return image

    def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]:
        """
        Attempt to detect the VIN region in an image.

        Uses contour detection to find rectangular regions that might contain VINs.

        Args:
            image_bytes: Raw image bytes

        Returns:
            BoundingBox of detected VIN region, or None if not found
        """
        try:
            pil_image = Image.open(io.BytesIO(image_bytes))
            if pil_image.mode != "L":
                pil_image = pil_image.convert("L")

            cv_image = np.array(pil_image)

            # Apply preprocessing for better contour detection
            blurred = cv2.GaussianBlur(cv_image, (5, 5), 0)
            edges = cv2.Canny(blurred, 50, 150)

            # Find contours
            contours, _ = cv2.findContours(
                edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
            )

            if not contours:
                return None

            # Find rectangular contours with appropriate aspect ratio for VIN
            # VIN is typically 17 characters, roughly 5:1 to 10:1 aspect ratio
            vin_candidates = []

            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                if h == 0:
                    continue

                aspect_ratio = w / h
                area = w * h

                # VIN regions typically have:
                # - Aspect ratio between 4:1 and 12:1
                # - Minimum area (to filter out noise)
                if 4 <= aspect_ratio <= 12 and area > 1000:
                    vin_candidates.append((x, y, w, h, area))

            if not vin_candidates:
                return None

            # Return the largest candidate
            vin_candidates.sort(key=lambda c: c[4], reverse=True)
            x, y, w, h, _ = vin_candidates[0]

            return BoundingBox(x=x, y=y, width=w, height=h)

        except Exception as e:
            # Detection is optional metadata; never let it break extraction.
            logger.warning(f"VIN region detection failed: {e}")
            return None
|
||||
|
||||
|
||||
# Singleton instance. VinPreprocessor keeps no per-call state (no
# instance attributes are ever assigned), so one shared module-level
# instance is safe to reuse across requests.
vin_preprocessor = VinPreprocessor()
|
||||
Reference in New Issue
Block a user