Files
motovaultpro/ocr/app/preprocessors/vin_preprocessor.py
Eric Gullickson b9fe222f12
Some checks failed
Deploy to Staging / Build Images (pull_request) Failing after 4m14s
Deploy to Staging / Deploy to Staging (pull_request) Has been skipped
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 8s
fix: Build errors and tesseract removal
2026-02-07 12:12:04 -06:00

445 lines
14 KiB
Python

"""VIN-optimized image preprocessing pipeline."""
import io
import logging
from dataclasses import dataclass, field
from typing import Optional

import cv2
import numpy as np
from PIL import Image
from pillow_heif import register_heif_opener
# Register HEIF/HEIC opener
register_heif_opener()
logger = logging.getLogger(__name__)
@dataclass
class BoundingBox:
    """Represents a region in an image.

    Coordinates follow the OpenCV image convention: (x, y) is the
    top-left corner in pixels, with width/height extending right/down.
    """
    # Left edge of the region, in pixels.
    x: int
    # Top edge of the region, in pixels.
    y: int
    # Region width, in pixels.
    width: int
    # Region height, in pixels.
    height: int
@dataclass
class PreprocessingResult:
    """Result of VIN preprocessing.

    Carries the processed image plus metadata about what was done to it.
    """
    # Processed image, PNG-encoded.
    image_bytes: bytes
    # Detected VIN region, if region detection was performed and succeeded.
    bounding_box: Optional[BoundingBox] = None
    # Names of pipeline steps applied, in order. Uses default_factory so
    # each instance gets its own list — the previous `= None` default
    # misdeclared the field's type and relied on __post_init__ to patch it.
    preprocessing_applied: list[str] = field(default_factory=list)

    def __post_init__(self) -> None:
        # Backward compatibility: callers that explicitly pass None still
        # end up with an empty list rather than None.
        if self.preprocessing_applied is None:
            self.preprocessing_applied = []
class VinPreprocessor:
    """VIN-optimized image preprocessing for improved OCR accuracy.

    All public entry points take raw image bytes (HEIC, JPEG, PNG) and
    return PNG-encoded output, so callers never handle intermediate
    OpenCV arrays. The class keeps no state and is safe to share.
    """

    # Minimum width in pixels for reliable VIN OCR.
    # A 17-char VIN needs ~30px per character for reliable OCR accuracy.
    MIN_WIDTH_FOR_VIN = 600

    def preprocess(
        self,
        image_bytes: bytes,
        apply_clahe: bool = True,
        apply_deskew: bool = True,
        apply_denoise: bool = True,
        apply_threshold: bool = True,
    ) -> PreprocessingResult:
        """
        Apply VIN-optimized preprocessing pipeline.

        Pipeline:
            1. HEIC conversion (if needed)
            2. Grayscale conversion
            3. Deskew (correct rotation/tilt)
            4. Contrast enhancement (CLAHE)
            5. Noise reduction (fastNlMeansDenoising)
            6. Adaptive thresholding

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            apply_clahe: Apply CLAHE contrast enhancement
            apply_deskew: Apply deskew correction
            apply_denoise: Apply noise reduction
            apply_threshold: Apply adaptive thresholding

        Returns:
            PreprocessingResult with processed image bytes
        """
        # Shared decode/grayscale/upscale front half (also used by
        # preprocess_otsu).
        gray, steps_applied = self._load_grayscale(image_bytes)
        if apply_deskew:
            gray = self._deskew(gray)
            steps_applied.append("deskew")
        if apply_clahe:
            gray = self._apply_clahe(gray)
            steps_applied.append("clahe")
        if apply_denoise:
            gray = self._denoise(gray)
            steps_applied.append("denoise")
        if apply_threshold:
            # Morphological cleanup only makes sense on a binarized image,
            # so it is tied to the thresholding step.
            gray = self._adaptive_threshold(gray)
            gray = self._morphological_cleanup(gray)
            steps_applied.append("threshold")
        return PreprocessingResult(
            image_bytes=self._to_png_bytes(gray),
            preprocessing_applied=steps_applied,
        )

    def preprocess_otsu(self, image_bytes: bytes) -> PreprocessingResult:
        """
        Alternative preprocessing pipeline using Otsu's thresholding.

        Used as a fallback when adaptive thresholding doesn't produce
        good OCR results. Skips deskew; otherwise mirrors preprocess()
        with Otsu's global threshold in place of the adaptive one.
        """
        gray, steps_applied = self._load_grayscale(image_bytes)
        gray = self._apply_clahe(gray)
        steps_applied.append("clahe")
        gray = self._denoise(gray)
        steps_applied.append("denoise")
        gray = self._otsu_threshold(gray)
        gray = self._morphological_cleanup(gray)
        steps_applied.append("otsu_threshold")
        return PreprocessingResult(
            image_bytes=self._to_png_bytes(gray),
            preprocessing_applied=steps_applied,
        )

    def _load_grayscale(self, image_bytes: bytes) -> tuple[np.ndarray, list[str]]:
        """
        Decode raw bytes into an OCR-ready grayscale array.

        Shared front half of both pipelines: HEIC-aware decode (the
        pillow-heif opener is registered at module import), RGB
        normalization, best-contrast grayscale conversion, and the
        minimum-resolution upscale.

        Returns:
            Tuple of (grayscale image, list of step names applied so far).
        """
        steps_applied: list[str] = []
        # PIL handles HEIC via the registered pillow-heif opener.
        pil_image = Image.open(io.BytesIO(image_bytes))
        steps_applied.append("loaded")
        # Normalize exotic modes (RGBA, P, CMYK, ...) to RGB.
        if pil_image.mode not in ("RGB", "L"):
            pil_image = pil_image.convert("RGB")
            steps_applied.append("convert_rgb")
        cv_image = np.array(pil_image)
        if len(cv_image.shape) == 3:
            # PIL yields RGB channel order; OpenCV expects BGR.
            cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
            gray = self._best_contrast_channel(cv_image)
        else:
            gray = cv_image
        steps_applied.append("grayscale")
        # Upscale small images for better OCR (~300 DPI recommended).
        gray = self._ensure_minimum_resolution(gray)
        steps_applied.append("resolution_check")
        return gray, steps_applied

    @staticmethod
    def _to_png_bytes(image: np.ndarray) -> bytes:
        """Encode a grayscale/binary array as PNG bytes."""
        buffer = io.BytesIO()
        Image.fromarray(image).save(buffer, format="PNG")
        return buffer.getvalue()

    def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray:
        """
        Upscale image if too small for reliable OCR.

        OCR works best at ~300 DPI. Mobile photos of VINs may have
        the text occupy only a small portion of the frame, resulting in
        low effective resolution for the VIN characters.
        """
        height, width = image.shape[:2]
        if width < self.MIN_WIDTH_FOR_VIN:
            scale = self.MIN_WIDTH_FOR_VIN / width
            new_width = int(width * scale)
            new_height = int(height * scale)
            # INTER_CUBIC preserves character edges better than linear
            # interpolation when enlarging.
            image = cv2.resize(
                image, (new_width, new_height), interpolation=cv2.INTER_CUBIC
            )
            logger.debug(
                "Upscaled image from %dx%d to %dx%d",
                width, height, new_width, new_height,
            )
        return image

    def _best_contrast_channel(self, bgr_image: np.ndarray) -> np.ndarray:
        """
        Compute a grayscale image with dark text on light background.

        Uses inverted per-pixel minimum across B, G, R channels.
        White text has min(255,255,255) = 255 → inverted to 0 (black).
        Colored backgrounds have a low min value (e.g. green sticker:
        min(130,230,150) = 130) → inverted to 125 (medium gray).
        The inversion ensures the OCR engine always receives dark-text-on-
        light-background, which is the polarity it expects.
        """
        b_channel, g_channel, r_channel = cv2.split(bgr_image)
        min_channel = np.minimum(np.minimum(b_channel, g_channel), r_channel)
        # Invert so white text (min=255) becomes black (0) and colored
        # backgrounds (min~130) become lighter gray (~125). OCR engines
        # expect dark text on light background.
        inverted = cv2.bitwise_not(min_channel)
        # Plain grayscale is computed only for the diagnostic comparison
        # below; the inverted-min image is always what gets returned.
        gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
        logger.debug(
            "Channel contrast: inverted-min std=%.1f, grayscale std=%.1f",
            float(np.std(inverted)), float(np.std(gray)),
        )
        return inverted

    def _apply_clahe(self, image: np.ndarray) -> np.ndarray:
        """
        Apply CLAHE (Contrast Limited Adaptive Histogram Equalization).

        CLAHE improves contrast in images with varying illumination,
        which is common in VIN photos taken in different lighting conditions.
        Returns the input unchanged if OpenCV raises.
        """
        try:
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            return clahe.apply(image)
        except cv2.error as e:
            logger.warning("CLAHE failed: %s", e)
            return image

    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """
        Correct image rotation using Hough transform line detection.

        VIN plates/stickers are often photographed at slight angles.
        Falls back to the unmodified image whenever no usable lines are
        found or the estimated skew is negligible/extreme.
        """
        try:
            # Detect edges, then probabilistic Hough lines on them.
            edges = cv2.Canny(image, 50, 150, apertureSize=3)
            lines = cv2.HoughLinesP(
                edges,
                rho=1,
                theta=np.pi / 180,
                threshold=100,
                minLineLength=100,
                maxLineGap=10,
            )
            if lines is None:
                return image
            # Calculate angles of detected lines, keeping only the
            # nearly-horizontal ones (text baselines).
            angles = []
            for line in lines:
                x1, y1, x2, y2 = line[0]
                if x2 - x1 != 0:
                    angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
                    if -45 < angle < 45:
                        angles.append(angle)
            if not angles:
                return image
            # Use median angle to avoid outliers.
            median_angle = np.median(angles)
            # Only correct if skew is significant (>0.5°) but not
            # extreme (>20° suggests misdetection, not skew).
            if abs(median_angle) < 0.5 or abs(median_angle) > 20:
                return image
            # Rotate about the center, expanding the canvas so no pixels
            # are clipped by the rotation.
            height, width = image.shape[:2]
            center = (width // 2, height // 2)
            rotation_matrix = cv2.getRotationMatrix2D(center, median_angle, 1.0)
            cos_val = abs(rotation_matrix[0, 0])
            sin_val = abs(rotation_matrix[0, 1])
            new_width = int(height * sin_val + width * cos_val)
            new_height = int(height * cos_val + width * sin_val)
            # Shift the transform so the rotated image stays centered in
            # the enlarged canvas.
            rotation_matrix[0, 2] += (new_width - width) / 2
            rotation_matrix[1, 2] += (new_height - height) / 2
            rotated = cv2.warpAffine(
                image,
                rotation_matrix,
                (new_width, new_height),
                borderMode=cv2.BORDER_REPLICATE,
            )
            logger.debug("Deskewed by %.2f degrees", median_angle)
            return rotated
        except Exception as e:
            # Deskew is best-effort; never let it break the pipeline.
            logger.warning("Deskew failed: %s", e)
            return image

    def _denoise(self, image: np.ndarray) -> np.ndarray:
        """
        Apply non-local means denoising.

        This helps remove noise while preserving VIN character edges.
        Returns the input unchanged if OpenCV raises.
        """
        try:
            return cv2.fastNlMeansDenoising(
                image, h=10, templateWindowSize=7, searchWindowSize=21
            )
        except cv2.error as e:
            logger.warning("Denoising failed: %s", e)
            return image

    def _adaptive_threshold(self, image: np.ndarray) -> np.ndarray:
        """
        Apply adaptive thresholding for binarization.

        Adaptive thresholding handles varying illumination across the image,
        which is common in VIN photos. Returns the input unchanged if
        OpenCV raises.
        """
        try:
            return cv2.adaptiveThreshold(
                image,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                blockSize=11,
                C=2,
            )
        except cv2.error as e:
            logger.warning("Adaptive threshold failed: %s", e)
            return image

    def _morphological_cleanup(self, image: np.ndarray) -> np.ndarray:
        """
        Remove small noise artifacts from a thresholded binary image.

        Morphological opening (erosion then dilation) removes isolated
        pixels and thin noise lines while preserving larger text characters.
        Returns the input unchanged if OpenCV raises.
        """
        try:
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
            return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)
        except cv2.error as e:
            logger.warning("Morphological cleanup failed: %s", e)
            return image

    def _otsu_threshold(self, image: np.ndarray) -> np.ndarray:
        """
        Apply Otsu's thresholding for binarization.

        Otsu's method auto-calculates the optimal threshold value,
        which can work better than adaptive thresholding on evenly-lit
        images. Returns the input unchanged if OpenCV raises.
        """
        try:
            _, result = cv2.threshold(
                image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )
            return result
        except cv2.error as e:
            logger.warning("Otsu threshold failed: %s", e)
            return image

    def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]:
        """
        Attempt to detect the VIN region in an image.

        Uses contour detection to find rectangular regions that might
        contain VINs.

        Args:
            image_bytes: Raw image bytes

        Returns:
            BoundingBox of detected VIN region, or None if not found
        """
        try:
            pil_image = Image.open(io.BytesIO(image_bytes))
            if pil_image.mode != "L":
                pil_image = pil_image.convert("L")
            cv_image = np.array(pil_image)
            # Blur + Canny gives cleaner edges for contour detection.
            blurred = cv2.GaussianBlur(cv_image, (5, 5), 0)
            edges = cv2.Canny(blurred, 50, 150)
            contours, _ = cv2.findContours(
                edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
            )
            if not contours:
                return None
            # Find rectangular contours with an aspect ratio plausible for
            # a 17-character VIN strip.
            vin_candidates = []
            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                if h == 0:
                    continue
                aspect_ratio = w / h
                area = w * h
                # VIN regions typically have:
                # - Aspect ratio between 4:1 and 12:1
                # - Minimum area (to filter out noise)
                if 4 <= aspect_ratio <= 12 and area > 1000:
                    vin_candidates.append((x, y, w, h, area))
            if not vin_candidates:
                return None
            # Return the largest candidate by area.
            vin_candidates.sort(key=lambda c: c[4], reverse=True)
            x, y, w, h, _ = vin_candidates[0]
            return BoundingBox(x=x, y=y, width=w, height=h)
        except Exception as e:
            # Detection is best-effort; callers treat None as "not found".
            logger.warning("VIN region detection failed: %s", e)
            return None
# Singleton instance
# Module-level shared instance; VinPreprocessor holds no mutable state,
# so one instance can safely serve all callers.
vin_preprocessor = VinPreprocessor()