All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 35s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 51s
Deploy to Staging / Verify Staging (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Replace std-based channel selection (which incorrectly picked green for green-tinted VIN stickers) with per-pixel min(B,G,R). White text stays 255 in all channels while colored backgrounds drop to their weakest channel value, giving 2x contrast improvement. Add morphological opening after thresholding to remove noise speckles from car body surface that were confusing Tesseract's page segmentation. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
447 lines
14 KiB
Python
447 lines
14 KiB
Python
"""VIN-optimized image preprocessing pipeline."""
|
|
import io
import logging
from dataclasses import dataclass
from typing import Optional

import cv2
import numpy as np
from PIL import Image, ImageOps
from pillow_heif import register_heif_opener
|
|
|
|
# Register HEIF/HEIC opener
|
|
register_heif_opener()
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class BoundingBox:
    """Represents a rectangular region in an image.

    Coordinates follow image conventions: (x, y) is the top-left corner,
    with y increasing downward. All values are in pixels.
    """

    # Top-left corner of the region.
    x: int
    y: int
    # Extent of the region; the box spans [x, x+width) x [y, y+height).
    width: int
    height: int
|
|
|
|
|
|
@dataclass
class PreprocessingResult:
    """Result of VIN preprocessing.

    Attributes:
        image_bytes: PNG-encoded processed image bytes.
        bounding_box: Detected VIN region, if any.
        preprocessing_applied: Ordered names of the pipeline steps that ran.
    """

    image_bytes: bytes
    bounding_box: Optional[BoundingBox] = None
    # Optional[...] because None is the sentinel for "create a fresh list";
    # a mutable default ([]) would be shared across all instances.
    preprocessing_applied: Optional[list[str]] = None

    def __post_init__(self) -> None:
        # Normalize the None sentinel to an independent empty list so
        # callers can always append without a None check.
        if self.preprocessing_applied is None:
            self.preprocessing_applied = []
|
|
|
|
|
|
class VinPreprocessor:
    """VIN-optimized image preprocessing for improved OCR accuracy.

    Stateless: methods keep no instance state, so one instance can be
    shared safely. Each public entry point takes raw image bytes and
    returns a PreprocessingResult holding a PNG-encoded processed image.
    """
|
|
|
|
def preprocess(
|
|
self,
|
|
image_bytes: bytes,
|
|
apply_clahe: bool = True,
|
|
apply_deskew: bool = True,
|
|
apply_denoise: bool = True,
|
|
apply_threshold: bool = True,
|
|
) -> PreprocessingResult:
|
|
"""
|
|
Apply VIN-optimized preprocessing pipeline.
|
|
|
|
Pipeline:
|
|
1. HEIC conversion (if needed)
|
|
2. Grayscale conversion
|
|
3. Deskew (correct rotation/tilt)
|
|
4. Contrast enhancement (CLAHE)
|
|
5. Noise reduction (fastNlMeansDenoising)
|
|
6. Adaptive thresholding
|
|
|
|
Args:
|
|
image_bytes: Raw image bytes (HEIC, JPEG, PNG)
|
|
apply_clahe: Apply CLAHE contrast enhancement
|
|
apply_deskew: Apply deskew correction
|
|
apply_denoise: Apply noise reduction
|
|
apply_threshold: Apply adaptive thresholding
|
|
|
|
Returns:
|
|
PreprocessingResult with processed image bytes
|
|
"""
|
|
steps_applied = []
|
|
|
|
# Load image with PIL (handles HEIC via pillow-heif)
|
|
pil_image = Image.open(io.BytesIO(image_bytes))
|
|
steps_applied.append("loaded")
|
|
|
|
# Convert to RGB if needed
|
|
if pil_image.mode not in ("RGB", "L"):
|
|
pil_image = pil_image.convert("RGB")
|
|
steps_applied.append("convert_rgb")
|
|
|
|
# Convert to OpenCV format
|
|
cv_image = np.array(pil_image)
|
|
if len(cv_image.shape) == 3:
|
|
cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
|
|
|
|
# Convert to grayscale using best-contrast channel selection
|
|
if len(cv_image.shape) == 3:
|
|
gray = self._best_contrast_channel(cv_image)
|
|
else:
|
|
gray = cv_image
|
|
steps_applied.append("grayscale")
|
|
|
|
# Upscale small images for better OCR (Tesseract needs ~300 DPI)
|
|
gray = self._ensure_minimum_resolution(gray)
|
|
steps_applied.append("resolution_check")
|
|
|
|
# Apply deskew
|
|
if apply_deskew:
|
|
gray = self._deskew(gray)
|
|
steps_applied.append("deskew")
|
|
|
|
# Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
|
|
if apply_clahe:
|
|
gray = self._apply_clahe(gray)
|
|
steps_applied.append("clahe")
|
|
|
|
# Apply denoising
|
|
if apply_denoise:
|
|
gray = self._denoise(gray)
|
|
steps_applied.append("denoise")
|
|
|
|
# Apply adaptive thresholding
|
|
if apply_threshold:
|
|
gray = self._adaptive_threshold(gray)
|
|
gray = self._morphological_cleanup(gray)
|
|
steps_applied.append("threshold")
|
|
|
|
# Convert back to PNG bytes
|
|
result_image = Image.fromarray(gray)
|
|
buffer = io.BytesIO()
|
|
result_image.save(buffer, format="PNG")
|
|
|
|
return PreprocessingResult(
|
|
image_bytes=buffer.getvalue(),
|
|
preprocessing_applied=steps_applied,
|
|
)
|
|
|
|
# Minimum width in pixels for reliable VIN OCR.
|
|
# A 17-char VIN needs ~30px per character for Tesseract accuracy.
|
|
MIN_WIDTH_FOR_VIN = 600
|
|
|
|
def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray:
|
|
"""
|
|
Upscale image if too small for reliable OCR.
|
|
|
|
Tesseract works best at ~300 DPI. Mobile photos of VINs may have
|
|
the text occupy only a small portion of the frame, resulting in
|
|
low effective resolution for the VIN characters.
|
|
"""
|
|
height, width = image.shape[:2]
|
|
if width < self.MIN_WIDTH_FOR_VIN:
|
|
scale = self.MIN_WIDTH_FOR_VIN / width
|
|
new_width = int(width * scale)
|
|
new_height = int(height * scale)
|
|
image = cv2.resize(
|
|
image, (new_width, new_height), interpolation=cv2.INTER_CUBIC
|
|
)
|
|
logger.debug(f"Upscaled image from {width}x{height} to {new_width}x{new_height}")
|
|
return image
|
|
|
|
def _best_contrast_channel(self, bgr_image: np.ndarray) -> np.ndarray:
|
|
"""
|
|
Compute a grayscale image that maximizes text-to-background contrast.
|
|
|
|
Uses per-pixel minimum across B, G, R channels. White text has
|
|
min(255,255,255) = 255 regardless of channel, while any colored
|
|
background has a low value in at least one channel (e.g. green
|
|
sticker: min(130,230,150) = 130). This gives ~125 units of
|
|
contrast vs ~60 from standard grayscale.
|
|
|
|
Falls back to standard grayscale when the min-channel doesn't
|
|
improve contrast (i.e. for already-neutral/gray images).
|
|
"""
|
|
b_channel, g_channel, r_channel = cv2.split(bgr_image)
|
|
|
|
min_channel = np.minimum(np.minimum(b_channel, g_channel), r_channel)
|
|
|
|
min_std = float(np.std(min_channel))
|
|
gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
|
|
gray_std = float(np.std(gray))
|
|
|
|
# Use min-channel when it provides meaningfully more contrast
|
|
if min_std > gray_std * 1.1:
|
|
logger.debug(
|
|
"Using min-channel (std=%.1f) over grayscale (std=%.1f)",
|
|
min_std, gray_std,
|
|
)
|
|
return min_channel
|
|
|
|
return gray
|
|
|
|
def _apply_clahe(self, image: np.ndarray) -> np.ndarray:
|
|
"""
|
|
Apply CLAHE (Contrast Limited Adaptive Histogram Equalization).
|
|
|
|
CLAHE improves contrast in images with varying illumination,
|
|
which is common in VIN photos taken in different lighting conditions.
|
|
"""
|
|
try:
|
|
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
|
return clahe.apply(image)
|
|
except cv2.error as e:
|
|
logger.warning(f"CLAHE failed: {e}")
|
|
return image
|
|
|
|
    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """
        Correct image rotation using Hough transform line detection.

        VIN plates/stickers are often photographed at slight angles.

        Args:
            image: Grayscale image as a 2-D array.

        Returns:
            A rotated copy when a skew between 0.5 and 20 degrees is
            detected; otherwise the input image unchanged.
        """
        try:
            # Edge map feeds the line detector.
            edges = cv2.Canny(image, 50, 150, apertureSize=3)

            # Probabilistic Hough: returns line segments, or None when
            # nothing clears the vote threshold.
            lines = cv2.HoughLinesP(
                edges,
                rho=1,
                theta=np.pi / 180,
                threshold=100,
                minLineLength=100,
                maxLineGap=10,
            )

            if lines is None:
                return image

            # Calculate angles of detected lines.
            angles = []
            for line in lines:
                x1, y1, x2, y2 = line[0]
                # Skip vertical segments (undefined slope direction for
                # the horizontal-text assumption below).
                if x2 - x1 != 0:
                    angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
                    # Only consider nearly horizontal lines — text rows,
                    # sticker edges — not vertical structure.
                    if -45 < angle < 45:
                        angles.append(angle)

            if not angles:
                return image

            # Median is robust against outlier segments (e.g. a stray
            # diagonal edge on the car body).
            median_angle = np.median(angles)

            # Only correct if skew is significant but not extreme:
            # under 0.5 degrees is within OCR tolerance, and over 20
            # degrees is more likely misdetected lines than real tilt.
            if abs(median_angle) < 0.5 or abs(median_angle) > 20:
                return image

            # Rotate about the image center to correct the skew.
            height, width = image.shape[:2]
            center = (width // 2, height // 2)
            rotation_matrix = cv2.getRotationMatrix2D(center, median_angle, 1.0)

            # Expand the output canvas so the rotated image is not
            # cropped at the corners.
            cos_val = abs(rotation_matrix[0, 0])
            sin_val = abs(rotation_matrix[0, 1])
            new_width = int(height * sin_val + width * cos_val)
            new_height = int(height * cos_val + width * sin_val)

            # Translate so the rotated content is centered in the
            # enlarged canvas.
            rotation_matrix[0, 2] += (new_width - width) / 2
            rotation_matrix[1, 2] += (new_height - height) / 2

            rotated = cv2.warpAffine(
                image,
                rotation_matrix,
                (new_width, new_height),
                # Replicate edge pixels rather than fill with black,
                # which would introduce false edges for later stages.
                borderMode=cv2.BORDER_REPLICATE,
            )

            logger.debug(f"Deskewed by {median_angle:.2f} degrees")
            return rotated

        except Exception as e:
            # Deskew is best-effort: any failure falls back to the
            # unrotated image rather than aborting the pipeline.
            logger.warning(f"Deskew failed: {e}")
            return image
|
|
|
|
def _denoise(self, image: np.ndarray) -> np.ndarray:
|
|
"""
|
|
Apply non-local means denoising.
|
|
|
|
This helps remove noise while preserving VIN character edges.
|
|
"""
|
|
try:
|
|
return cv2.fastNlMeansDenoising(
|
|
image, h=10, templateWindowSize=7, searchWindowSize=21
|
|
)
|
|
except cv2.error as e:
|
|
logger.warning(f"Denoising failed: {e}")
|
|
return image
|
|
|
|
def _adaptive_threshold(self, image: np.ndarray) -> np.ndarray:
|
|
"""
|
|
Apply adaptive thresholding for binarization.
|
|
|
|
Adaptive thresholding handles varying illumination across the image,
|
|
which is common in VIN photos.
|
|
"""
|
|
try:
|
|
return cv2.adaptiveThreshold(
|
|
image,
|
|
255,
|
|
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
|
cv2.THRESH_BINARY,
|
|
blockSize=11,
|
|
C=2,
|
|
)
|
|
except cv2.error as e:
|
|
logger.warning(f"Adaptive threshold failed: {e}")
|
|
return image
|
|
|
|
def _morphological_cleanup(self, image: np.ndarray) -> np.ndarray:
|
|
"""
|
|
Remove small noise artifacts from a thresholded binary image.
|
|
|
|
Morphological opening (erosion then dilation) removes isolated
|
|
pixels and thin noise lines while preserving larger text characters.
|
|
"""
|
|
try:
|
|
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
|
|
return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)
|
|
except cv2.error as e:
|
|
logger.warning(f"Morphological cleanup failed: {e}")
|
|
return image
|
|
|
|
def _otsu_threshold(self, image: np.ndarray) -> np.ndarray:
|
|
"""
|
|
Apply Otsu's thresholding for binarization.
|
|
|
|
Otsu's method auto-calculates the optimal threshold value,
|
|
which can work better than adaptive thresholding on evenly-lit images.
|
|
"""
|
|
try:
|
|
_, result = cv2.threshold(
|
|
image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
|
|
)
|
|
return result
|
|
except cv2.error as e:
|
|
logger.warning(f"Otsu threshold failed: {e}")
|
|
return image
|
|
|
|
def preprocess_otsu(self, image_bytes: bytes) -> PreprocessingResult:
|
|
"""
|
|
Alternative preprocessing pipeline using Otsu's thresholding.
|
|
|
|
Used as a fallback when adaptive thresholding doesn't produce
|
|
good OCR results.
|
|
"""
|
|
steps_applied = []
|
|
|
|
pil_image = Image.open(io.BytesIO(image_bytes))
|
|
steps_applied.append("loaded")
|
|
|
|
if pil_image.mode not in ("RGB", "L"):
|
|
pil_image = pil_image.convert("RGB")
|
|
steps_applied.append("convert_rgb")
|
|
|
|
cv_image = np.array(pil_image)
|
|
if len(cv_image.shape) == 3:
|
|
cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
|
|
|
|
if len(cv_image.shape) == 3:
|
|
gray = self._best_contrast_channel(cv_image)
|
|
else:
|
|
gray = cv_image
|
|
steps_applied.append("grayscale")
|
|
|
|
gray = self._ensure_minimum_resolution(gray)
|
|
steps_applied.append("resolution_check")
|
|
|
|
gray = self._apply_clahe(gray)
|
|
steps_applied.append("clahe")
|
|
|
|
gray = self._denoise(gray)
|
|
steps_applied.append("denoise")
|
|
|
|
gray = self._otsu_threshold(gray)
|
|
gray = self._morphological_cleanup(gray)
|
|
steps_applied.append("otsu_threshold")
|
|
|
|
result_image = Image.fromarray(gray)
|
|
buffer = io.BytesIO()
|
|
result_image.save(buffer, format="PNG")
|
|
|
|
return PreprocessingResult(
|
|
image_bytes=buffer.getvalue(),
|
|
preprocessing_applied=steps_applied,
|
|
)
|
|
|
|
def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]:
|
|
"""
|
|
Attempt to detect the VIN region in an image.
|
|
|
|
Uses contour detection to find rectangular regions that might contain VINs.
|
|
|
|
Args:
|
|
image_bytes: Raw image bytes
|
|
|
|
Returns:
|
|
BoundingBox of detected VIN region, or None if not found
|
|
"""
|
|
try:
|
|
pil_image = Image.open(io.BytesIO(image_bytes))
|
|
if pil_image.mode != "L":
|
|
pil_image = pil_image.convert("L")
|
|
|
|
cv_image = np.array(pil_image)
|
|
|
|
# Apply preprocessing for better contour detection
|
|
blurred = cv2.GaussianBlur(cv_image, (5, 5), 0)
|
|
edges = cv2.Canny(blurred, 50, 150)
|
|
|
|
# Find contours
|
|
contours, _ = cv2.findContours(
|
|
edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
|
|
)
|
|
|
|
if not contours:
|
|
return None
|
|
|
|
# Find rectangular contours with appropriate aspect ratio for VIN
|
|
# VIN is typically 17 characters, roughly 5:1 to 10:1 aspect ratio
|
|
vin_candidates = []
|
|
|
|
for contour in contours:
|
|
x, y, w, h = cv2.boundingRect(contour)
|
|
if h == 0:
|
|
continue
|
|
|
|
aspect_ratio = w / h
|
|
area = w * h
|
|
|
|
# VIN regions typically have:
|
|
# - Aspect ratio between 4:1 and 12:1
|
|
# - Minimum area (to filter out noise)
|
|
if 4 <= aspect_ratio <= 12 and area > 1000:
|
|
vin_candidates.append((x, y, w, h, area))
|
|
|
|
if not vin_candidates:
|
|
return None
|
|
|
|
# Return the largest candidate
|
|
vin_candidates.sort(key=lambda c: c[4], reverse=True)
|
|
x, y, w, h, _ = vin_candidates[0]
|
|
|
|
return BoundingBox(x=x, y=y, width=w, height=h)
|
|
|
|
except Exception as e:
|
|
logger.warning(f"VIN region detection failed: {e}")
|
|
return None
|
|
|
|
|
|
# Singleton instance: VinPreprocessor keeps no instance state, so one
# shared module-level object is safe to import from anywhere.
vin_preprocessor = VinPreprocessor()
|