Files
motovaultpro/ocr/app/preprocessors/vin_preprocessor.py
Eric Gullickson 0de34983bb
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 36s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 1m7s
Deploy to Staging / Verify Staging (pull_request) Successful in 10s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 9s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
fix: use best-contrast color channel for VIN preprocessing (refs #113)
White text on green VIN stickers has only ~12% contrast in standard
grayscale conversion because the green channel dominates luminance.
The new _best_contrast_channel method evaluates each RGB channel's
standard deviation and selects the one with highest contrast, giving
~2x improvement for green-tinted VIN stickers. Falls back to standard
grayscale for neutral-colored images.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 21:14:56 -06:00

440 lines
14 KiB
Python

"""VIN-optimized image preprocessing pipeline."""
import io
import logging
from dataclasses import dataclass
from typing import Optional
import cv2
import numpy as np
from PIL import Image
from pillow_heif import register_heif_opener
# Register HEIF/HEIC opener
register_heif_opener()
logger = logging.getLogger(__name__)
@dataclass
class BoundingBox:
    """Represents a rectangular region in an image.

    Coordinates are in pixels, with (x, y) the top-left corner of the
    region, matching the convention of cv2.boundingRect (which produces
    these values in detect_vin_region).
    """

    x: int  # left edge, pixels
    y: int  # top edge, pixels
    width: int  # region width, pixels
    height: int  # region height, pixels
@dataclass
class PreprocessingResult:
    """Result of VIN preprocessing.

    Attributes:
        image_bytes: Bytes of the processed image (PNG in this module's
            pipelines).
        bounding_box: Detected VIN region, if any.
        preprocessing_applied: Ordered names of the pipeline steps that ran.
    """

    image_bytes: bytes
    bounding_box: Optional[BoundingBox] = None
    # None default avoids the shared-mutable-default pitfall; normalized
    # to a fresh list in __post_init__.
    preprocessing_applied: Optional[list[str]] = None

    def __post_init__(self) -> None:
        # Normalize None to an empty list so callers can always append/iterate.
        if self.preprocessing_applied is None:
            self.preprocessing_applied = []
class VinPreprocessor:
    """VIN-optimized image preprocessing for improved OCR accuracy.

    Stateless: no instance attributes; every method operates only on the
    image data passed in, so a single shared instance is safe to reuse.
    """
def preprocess(
self,
image_bytes: bytes,
apply_clahe: bool = True,
apply_deskew: bool = True,
apply_denoise: bool = True,
apply_threshold: bool = True,
) -> PreprocessingResult:
"""
Apply VIN-optimized preprocessing pipeline.
Pipeline:
1. HEIC conversion (if needed)
2. Grayscale conversion
3. Deskew (correct rotation/tilt)
4. Contrast enhancement (CLAHE)
5. Noise reduction (fastNlMeansDenoising)
6. Adaptive thresholding
Args:
image_bytes: Raw image bytes (HEIC, JPEG, PNG)
apply_clahe: Apply CLAHE contrast enhancement
apply_deskew: Apply deskew correction
apply_denoise: Apply noise reduction
apply_threshold: Apply adaptive thresholding
Returns:
PreprocessingResult with processed image bytes
"""
steps_applied = []
# Load image with PIL (handles HEIC via pillow-heif)
pil_image = Image.open(io.BytesIO(image_bytes))
steps_applied.append("loaded")
# Convert to RGB if needed
if pil_image.mode not in ("RGB", "L"):
pil_image = pil_image.convert("RGB")
steps_applied.append("convert_rgb")
# Convert to OpenCV format
cv_image = np.array(pil_image)
if len(cv_image.shape) == 3:
cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
# Convert to grayscale using best-contrast channel selection
if len(cv_image.shape) == 3:
gray = self._best_contrast_channel(cv_image)
else:
gray = cv_image
steps_applied.append("grayscale")
# Upscale small images for better OCR (Tesseract needs ~300 DPI)
gray = self._ensure_minimum_resolution(gray)
steps_applied.append("resolution_check")
# Apply deskew
if apply_deskew:
gray = self._deskew(gray)
steps_applied.append("deskew")
# Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
if apply_clahe:
gray = self._apply_clahe(gray)
steps_applied.append("clahe")
# Apply denoising
if apply_denoise:
gray = self._denoise(gray)
steps_applied.append("denoise")
# Apply adaptive thresholding
if apply_threshold:
gray = self._adaptive_threshold(gray)
steps_applied.append("threshold")
# Convert back to PNG bytes
result_image = Image.fromarray(gray)
buffer = io.BytesIO()
result_image.save(buffer, format="PNG")
return PreprocessingResult(
image_bytes=buffer.getvalue(),
preprocessing_applied=steps_applied,
)
# Minimum width in pixels for reliable VIN OCR.
# A 17-char VIN needs ~30px per character for Tesseract accuracy.
MIN_WIDTH_FOR_VIN = 600
def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray:
"""
Upscale image if too small for reliable OCR.
Tesseract works best at ~300 DPI. Mobile photos of VINs may have
the text occupy only a small portion of the frame, resulting in
low effective resolution for the VIN characters.
"""
height, width = image.shape[:2]
if width < self.MIN_WIDTH_FOR_VIN:
scale = self.MIN_WIDTH_FOR_VIN / width
new_width = int(width * scale)
new_height = int(height * scale)
image = cv2.resize(
image, (new_width, new_height), interpolation=cv2.INTER_CUBIC
)
logger.debug(f"Upscaled image from {width}x{height} to {new_width}x{new_height}")
return image
def _best_contrast_channel(self, bgr_image: np.ndarray) -> np.ndarray:
"""
Select the single color channel with the highest contrast.
Standard grayscale conversion (0.299R + 0.587G + 0.114B) averages
channels, which destroys contrast when text and background differ
primarily in one channel. For example, white text on a green VIN
sticker has almost identical luminance, but the blue and red channels
show strong contrast.
This method evaluates each BGR channel by its standard deviation
(a proxy for contrast) and returns the one with the highest value.
Falls back to standard grayscale when all channels are similar.
"""
b_channel, g_channel, r_channel = cv2.split(bgr_image)
stds = [
float(np.std(b_channel)),
float(np.std(g_channel)),
float(np.std(r_channel)),
]
channels = [b_channel, g_channel, r_channel]
channel_names = ["blue", "green", "red"]
best_idx = int(np.argmax(stds))
max_std = stds[best_idx]
min_std = min(stds)
# Only use single-channel extraction when one channel is notably
# better (>20% higher std than the weakest). Otherwise, standard
# grayscale is fine and more robust for neutral-colored images.
if max_std > 0 and (max_std - min_std) / max_std > 0.20:
logger.debug(
"Using %s channel (std=%.1f) over grayscale (stds: B=%.1f G=%.1f R=%.1f)",
channel_names[best_idx], max_std, stds[0], stds[1], stds[2],
)
return channels[best_idx]
return cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
def _apply_clahe(self, image: np.ndarray) -> np.ndarray:
"""
Apply CLAHE (Contrast Limited Adaptive Histogram Equalization).
CLAHE improves contrast in images with varying illumination,
which is common in VIN photos taken in different lighting conditions.
"""
try:
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
return clahe.apply(image)
except cv2.error as e:
logger.warning(f"CLAHE failed: {e}")
return image
    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """
        Correct image rotation using Hough transform line detection.

        VIN plates/stickers are often photographed at slight angles.
        Detects near-horizontal line segments, takes their median angle,
        and rotates the image to cancel it. Returns the input unchanged
        when no usable lines are found, the skew is negligible or extreme,
        or any OpenCV step fails.
        """
        try:
            # Detect edges
            edges = cv2.Canny(image, 50, 150, apertureSize=3)
            # Detect lines (probabilistic Hough: returns segment endpoints)
            lines = cv2.HoughLinesP(
                edges,
                rho=1,
                theta=np.pi / 180,
                threshold=100,
                minLineLength=100,
                maxLineGap=10,
            )
            if lines is None:
                return image
            # Calculate angles of detected lines
            angles = []
            for line in lines:
                x1, y1, x2, y2 = line[0]
                # Skip vertical segments to avoid division-by-zero-style
                # degenerate angles.
                if x2 - x1 != 0:
                    angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
                    # Only consider nearly horizontal lines — VIN text
                    # baselines, not vertical plate edges.
                    if -45 < angle < 45:
                        angles.append(angle)
            if not angles:
                return image
            # Use median angle to avoid outliers
            median_angle = np.median(angles)
            # Only correct if skew is significant but not extreme:
            # < 0.5 deg isn't worth the resampling blur; > 20 deg is more
            # likely a misdetection than real camera tilt.
            if abs(median_angle) < 0.5 or abs(median_angle) > 20:
                return image
            # Rotate to correct skew
            height, width = image.shape[:2]
            center = (width // 2, height // 2)
            rotation_matrix = cv2.getRotationMatrix2D(center, median_angle, 1.0)
            # Calculate new bounds so the rotated image isn't cropped:
            # expand the canvas to the rotated bounding box and shift the
            # translation terms to re-center the content.
            cos_val = abs(rotation_matrix[0, 0])
            sin_val = abs(rotation_matrix[0, 1])
            new_width = int(height * sin_val + width * cos_val)
            new_height = int(height * cos_val + width * sin_val)
            rotation_matrix[0, 2] += (new_width - width) / 2
            rotation_matrix[1, 2] += (new_height - height) / 2
            # BORDER_REPLICATE avoids hard black corners that could
            # confuse later thresholding/OCR.
            rotated = cv2.warpAffine(
                image,
                rotation_matrix,
                (new_width, new_height),
                borderMode=cv2.BORDER_REPLICATE,
            )
            logger.debug(f"Deskewed by {median_angle:.2f} degrees")
            return rotated
        except Exception as e:
            logger.warning(f"Deskew failed: {e}")
            return image
def _denoise(self, image: np.ndarray) -> np.ndarray:
"""
Apply non-local means denoising.
This helps remove noise while preserving VIN character edges.
"""
try:
return cv2.fastNlMeansDenoising(
image, h=10, templateWindowSize=7, searchWindowSize=21
)
except cv2.error as e:
logger.warning(f"Denoising failed: {e}")
return image
def _adaptive_threshold(self, image: np.ndarray) -> np.ndarray:
"""
Apply adaptive thresholding for binarization.
Adaptive thresholding handles varying illumination across the image,
which is common in VIN photos.
"""
try:
return cv2.adaptiveThreshold(
image,
255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY,
blockSize=11,
C=2,
)
except cv2.error as e:
logger.warning(f"Adaptive threshold failed: {e}")
return image
def _otsu_threshold(self, image: np.ndarray) -> np.ndarray:
"""
Apply Otsu's thresholding for binarization.
Otsu's method auto-calculates the optimal threshold value,
which can work better than adaptive thresholding on evenly-lit images.
"""
try:
_, result = cv2.threshold(
image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
)
return result
except cv2.error as e:
logger.warning(f"Otsu threshold failed: {e}")
return image
def preprocess_otsu(self, image_bytes: bytes) -> PreprocessingResult:
"""
Alternative preprocessing pipeline using Otsu's thresholding.
Used as a fallback when adaptive thresholding doesn't produce
good OCR results.
"""
steps_applied = []
pil_image = Image.open(io.BytesIO(image_bytes))
steps_applied.append("loaded")
if pil_image.mode not in ("RGB", "L"):
pil_image = pil_image.convert("RGB")
steps_applied.append("convert_rgb")
cv_image = np.array(pil_image)
if len(cv_image.shape) == 3:
cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
if len(cv_image.shape) == 3:
gray = self._best_contrast_channel(cv_image)
else:
gray = cv_image
steps_applied.append("grayscale")
gray = self._ensure_minimum_resolution(gray)
steps_applied.append("resolution_check")
gray = self._apply_clahe(gray)
steps_applied.append("clahe")
gray = self._denoise(gray)
steps_applied.append("denoise")
gray = self._otsu_threshold(gray)
steps_applied.append("otsu_threshold")
result_image = Image.fromarray(gray)
buffer = io.BytesIO()
result_image.save(buffer, format="PNG")
return PreprocessingResult(
image_bytes=buffer.getvalue(),
preprocessing_applied=steps_applied,
)
def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]:
"""
Attempt to detect the VIN region in an image.
Uses contour detection to find rectangular regions that might contain VINs.
Args:
image_bytes: Raw image bytes
Returns:
BoundingBox of detected VIN region, or None if not found
"""
try:
pil_image = Image.open(io.BytesIO(image_bytes))
if pil_image.mode != "L":
pil_image = pil_image.convert("L")
cv_image = np.array(pil_image)
# Apply preprocessing for better contour detection
blurred = cv2.GaussianBlur(cv_image, (5, 5), 0)
edges = cv2.Canny(blurred, 50, 150)
# Find contours
contours, _ = cv2.findContours(
edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
)
if not contours:
return None
# Find rectangular contours with appropriate aspect ratio for VIN
# VIN is typically 17 characters, roughly 5:1 to 10:1 aspect ratio
vin_candidates = []
for contour in contours:
x, y, w, h = cv2.boundingRect(contour)
if h == 0:
continue
aspect_ratio = w / h
area = w * h
# VIN regions typically have:
# - Aspect ratio between 4:1 and 12:1
# - Minimum area (to filter out noise)
if 4 <= aspect_ratio <= 12 and area > 1000:
vin_candidates.append((x, y, w, h, area))
if not vin_candidates:
return None
# Return the largest candidate
vin_candidates.sort(key=lambda c: c[4], reverse=True)
x, y, w, h, _ = vin_candidates[0]
return BoundingBox(x=x, y=y, width=w, height=h)
except Exception as e:
logger.warning(f"VIN region detection failed: {e}")
return None
# Module-level singleton: importers share this one (stateless) instance
# instead of constructing their own.
vin_preprocessor = VinPreprocessor()