Files
motovaultpro/ocr/app/preprocessors/vin_preprocessor.py
Eric Gullickson 6a4c2137f7
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 35s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 51s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m31s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
fix: resolve VIN OCR scanning failures on all images (refs #113)
Root cause: Tesseract fragments VINs into multiple words but candidate
extraction required continuous 17-char sequences, rejecting all results.

Changes:
- Fix candidate extraction to concatenate adjacent OCR fragments
- Disable Tesseract dictionaries (VINs are not dictionary words)
- Set OEM 1 (LSTM engine) for better accuracy
- Add PSM 11 (sparse text) and PSM 13 (raw line) fallback modes
- Add Otsu's thresholding as alternative preprocessing pipeline
- Upscale small images to meet Tesseract's 300 DPI requirement
- Remove incorrect B->8 and S->5 transliterations (valid VIN chars)
- Fix pre-existing test bug in check digit expected value

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 15:57:14 -06:00

400 lines
12 KiB
Python

"""VIN-optimized image preprocessing pipeline."""
import io
import logging
from dataclasses import dataclass
from typing import Optional
import cv2
import numpy as np
from PIL import Image
from pillow_heif import register_heif_opener
# Register HEIF/HEIC opener
register_heif_opener()
logger = logging.getLogger(__name__)
@dataclass
class BoundingBox:
"""Represents a region in an image."""
x: int
y: int
width: int
height: int
@dataclass
class PreprocessingResult:
"""Result of VIN preprocessing."""
image_bytes: bytes
bounding_box: Optional[BoundingBox] = None
preprocessing_applied: list[str] = None
def __post_init__(self) -> None:
if self.preprocessing_applied is None:
self.preprocessing_applied = []
class VinPreprocessor:
"""VIN-optimized image preprocessing for improved OCR accuracy."""
def preprocess(
self,
image_bytes: bytes,
apply_clahe: bool = True,
apply_deskew: bool = True,
apply_denoise: bool = True,
apply_threshold: bool = True,
) -> PreprocessingResult:
"""
Apply VIN-optimized preprocessing pipeline.
Pipeline:
1. HEIC conversion (if needed)
2. Grayscale conversion
3. Deskew (correct rotation/tilt)
4. Contrast enhancement (CLAHE)
5. Noise reduction (fastNlMeansDenoising)
6. Adaptive thresholding
Args:
image_bytes: Raw image bytes (HEIC, JPEG, PNG)
apply_clahe: Apply CLAHE contrast enhancement
apply_deskew: Apply deskew correction
apply_denoise: Apply noise reduction
apply_threshold: Apply adaptive thresholding
Returns:
PreprocessingResult with processed image bytes
"""
steps_applied = []
# Load image with PIL (handles HEIC via pillow-heif)
pil_image = Image.open(io.BytesIO(image_bytes))
steps_applied.append("loaded")
# Convert to RGB if needed
if pil_image.mode not in ("RGB", "L"):
pil_image = pil_image.convert("RGB")
steps_applied.append("convert_rgb")
# Convert to OpenCV format
cv_image = np.array(pil_image)
if len(cv_image.shape) == 3:
cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
# Convert to grayscale
if len(cv_image.shape) == 3:
gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
else:
gray = cv_image
steps_applied.append("grayscale")
# Upscale small images for better OCR (Tesseract needs ~300 DPI)
gray = self._ensure_minimum_resolution(gray)
steps_applied.append("resolution_check")
# Apply deskew
if apply_deskew:
gray = self._deskew(gray)
steps_applied.append("deskew")
# Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
if apply_clahe:
gray = self._apply_clahe(gray)
steps_applied.append("clahe")
# Apply denoising
if apply_denoise:
gray = self._denoise(gray)
steps_applied.append("denoise")
# Apply adaptive thresholding
if apply_threshold:
gray = self._adaptive_threshold(gray)
steps_applied.append("threshold")
# Convert back to PNG bytes
result_image = Image.fromarray(gray)
buffer = io.BytesIO()
result_image.save(buffer, format="PNG")
return PreprocessingResult(
image_bytes=buffer.getvalue(),
preprocessing_applied=steps_applied,
)
# Minimum width in pixels for reliable VIN OCR.
# A 17-char VIN needs ~30px per character for Tesseract accuracy.
MIN_WIDTH_FOR_VIN = 600
def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray:
"""
Upscale image if too small for reliable OCR.
Tesseract works best at ~300 DPI. Mobile photos of VINs may have
the text occupy only a small portion of the frame, resulting in
low effective resolution for the VIN characters.
"""
height, width = image.shape[:2]
if width < self.MIN_WIDTH_FOR_VIN:
scale = self.MIN_WIDTH_FOR_VIN / width
new_width = int(width * scale)
new_height = int(height * scale)
image = cv2.resize(
image, (new_width, new_height), interpolation=cv2.INTER_CUBIC
)
logger.debug(f"Upscaled image from {width}x{height} to {new_width}x{new_height}")
return image
def _apply_clahe(self, image: np.ndarray) -> np.ndarray:
"""
Apply CLAHE (Contrast Limited Adaptive Histogram Equalization).
CLAHE improves contrast in images with varying illumination,
which is common in VIN photos taken in different lighting conditions.
"""
try:
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
return clahe.apply(image)
except cv2.error as e:
logger.warning(f"CLAHE failed: {e}")
return image
def _deskew(self, image: np.ndarray) -> np.ndarray:
"""
Correct image rotation using Hough transform line detection.
VIN plates/stickers are often photographed at slight angles.
"""
try:
# Detect edges
edges = cv2.Canny(image, 50, 150, apertureSize=3)
# Detect lines
lines = cv2.HoughLinesP(
edges,
rho=1,
theta=np.pi / 180,
threshold=100,
minLineLength=100,
maxLineGap=10,
)
if lines is None:
return image
# Calculate angles of detected lines
angles = []
for line in lines:
x1, y1, x2, y2 = line[0]
if x2 - x1 != 0:
angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
# Only consider nearly horizontal lines
if -45 < angle < 45:
angles.append(angle)
if not angles:
return image
# Use median angle to avoid outliers
median_angle = np.median(angles)
# Only correct if skew is significant but not extreme
if abs(median_angle) < 0.5 or abs(median_angle) > 20:
return image
# Rotate to correct skew
height, width = image.shape[:2]
center = (width // 2, height // 2)
rotation_matrix = cv2.getRotationMatrix2D(center, median_angle, 1.0)
# Calculate new bounds
cos_val = abs(rotation_matrix[0, 0])
sin_val = abs(rotation_matrix[0, 1])
new_width = int(height * sin_val + width * cos_val)
new_height = int(height * cos_val + width * sin_val)
rotation_matrix[0, 2] += (new_width - width) / 2
rotation_matrix[1, 2] += (new_height - height) / 2
rotated = cv2.warpAffine(
image,
rotation_matrix,
(new_width, new_height),
borderMode=cv2.BORDER_REPLICATE,
)
logger.debug(f"Deskewed by {median_angle:.2f} degrees")
return rotated
except Exception as e:
logger.warning(f"Deskew failed: {e}")
return image
def _denoise(self, image: np.ndarray) -> np.ndarray:
"""
Apply non-local means denoising.
This helps remove noise while preserving VIN character edges.
"""
try:
return cv2.fastNlMeansDenoising(
image, h=10, templateWindowSize=7, searchWindowSize=21
)
except cv2.error as e:
logger.warning(f"Denoising failed: {e}")
return image
def _adaptive_threshold(self, image: np.ndarray) -> np.ndarray:
"""
Apply adaptive thresholding for binarization.
Adaptive thresholding handles varying illumination across the image,
which is common in VIN photos.
"""
try:
return cv2.adaptiveThreshold(
image,
255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY,
blockSize=11,
C=2,
)
except cv2.error as e:
logger.warning(f"Adaptive threshold failed: {e}")
return image
def _otsu_threshold(self, image: np.ndarray) -> np.ndarray:
"""
Apply Otsu's thresholding for binarization.
Otsu's method auto-calculates the optimal threshold value,
which can work better than adaptive thresholding on evenly-lit images.
"""
try:
_, result = cv2.threshold(
image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
)
return result
except cv2.error as e:
logger.warning(f"Otsu threshold failed: {e}")
return image
def preprocess_otsu(self, image_bytes: bytes) -> PreprocessingResult:
"""
Alternative preprocessing pipeline using Otsu's thresholding.
Used as a fallback when adaptive thresholding doesn't produce
good OCR results.
"""
steps_applied = []
pil_image = Image.open(io.BytesIO(image_bytes))
steps_applied.append("loaded")
if pil_image.mode not in ("RGB", "L"):
pil_image = pil_image.convert("RGB")
steps_applied.append("convert_rgb")
cv_image = np.array(pil_image)
if len(cv_image.shape) == 3:
cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
if len(cv_image.shape) == 3:
gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
else:
gray = cv_image
steps_applied.append("grayscale")
gray = self._ensure_minimum_resolution(gray)
steps_applied.append("resolution_check")
gray = self._apply_clahe(gray)
steps_applied.append("clahe")
gray = self._denoise(gray)
steps_applied.append("denoise")
gray = self._otsu_threshold(gray)
steps_applied.append("otsu_threshold")
result_image = Image.fromarray(gray)
buffer = io.BytesIO()
result_image.save(buffer, format="PNG")
return PreprocessingResult(
image_bytes=buffer.getvalue(),
preprocessing_applied=steps_applied,
)
def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]:
"""
Attempt to detect the VIN region in an image.
Uses contour detection to find rectangular regions that might contain VINs.
Args:
image_bytes: Raw image bytes
Returns:
BoundingBox of detected VIN region, or None if not found
"""
try:
pil_image = Image.open(io.BytesIO(image_bytes))
if pil_image.mode != "L":
pil_image = pil_image.convert("L")
cv_image = np.array(pil_image)
# Apply preprocessing for better contour detection
blurred = cv2.GaussianBlur(cv_image, (5, 5), 0)
edges = cv2.Canny(blurred, 50, 150)
# Find contours
contours, _ = cv2.findContours(
edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
)
if not contours:
return None
# Find rectangular contours with appropriate aspect ratio for VIN
# VIN is typically 17 characters, roughly 5:1 to 10:1 aspect ratio
vin_candidates = []
for contour in contours:
x, y, w, h = cv2.boundingRect(contour)
if h == 0:
continue
aspect_ratio = w / h
area = w * h
# VIN regions typically have:
# - Aspect ratio between 4:1 and 12:1
# - Minimum area (to filter out noise)
if 4 <= aspect_ratio <= 12 and area > 1000:
vin_candidates.append((x, y, w, h, area))
if not vin_candidates:
return None
# Return the largest candidate
vin_candidates.sort(key=lambda c: c[4], reverse=True)
x, y, w, h, _ = vin_candidates[0]
return BoundingBox(x=x, y=y, width=w, height=h)
except Exception as e:
logger.warning(f"VIN region detection failed: {e}")
return None
# Singleton instance
vin_preprocessor = VinPreprocessor()