feat: add VIN photo OCR pipeline (refs #67)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 31s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 31s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement VIN-specific OCR extraction with optimized preprocessing:

- Add POST /extract/vin endpoint for VIN extraction
- VIN preprocessor: CLAHE, deskew, denoise, adaptive threshold
- VIN validator: check digit validation, OCR error correction (I->1, O->0)
- VIN extractor: PSM modes 6/7/8, character whitelist, alternatives
- Response includes confidence, bounding box, and alternatives
- Unit tests for validator and preprocessor
- Integration tests for VIN extraction endpoint

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
10
ocr/app/preprocessors/__init__.py
Normal file
10
ocr/app/preprocessors/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
"""Image preprocessors for OCR optimization."""
|
||||
from app.services.preprocessor import ImagePreprocessor, preprocessor
|
||||
from app.preprocessors.vin_preprocessor import VinPreprocessor, vin_preprocessor
|
||||
|
||||
__all__ = [
|
||||
"ImagePreprocessor",
|
||||
"preprocessor",
|
||||
"VinPreprocessor",
|
||||
"vin_preprocessor",
|
||||
]
|
||||
309
ocr/app/preprocessors/vin_preprocessor.py
Normal file
309
ocr/app/preprocessors/vin_preprocessor.py
Normal file
@@ -0,0 +1,309 @@
|
||||
"""VIN-optimized image preprocessing pipeline."""
|
||||
import io
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from pillow_heif import register_heif_opener
|
||||
|
||||
# Register HEIF/HEIC opener
|
||||
register_heif_opener()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class BoundingBox:
    """Represents a region in an image."""

    # Top-left corner of the region, in pixel coordinates.
    x: int
    y: int
    # Region size in pixels.
    width: int
    height: int
|
||||
|
||||
|
||||
@dataclass
class PreprocessingResult:
    """Result of VIN preprocessing.

    Attributes:
        image_bytes: PNG-encoded bytes of the processed image.
        bounding_box: Detected VIN region, or None if none was detected.
        preprocessing_applied: Names of the pipeline steps that ran, in order.
    """

    image_bytes: bytes
    bounding_box: Optional[BoundingBox] = None
    # default_factory gives each instance its own list (the original
    # `list[str] = None` mis-annotated the field and relied solely on
    # __post_init__ to repair the default).
    preprocessing_applied: Optional[list[str]] = field(default_factory=list)

    def __post_init__(self) -> None:
        # Kept for backward compatibility: callers that explicitly pass
        # None still get an empty list, as before.
        if self.preprocessing_applied is None:
            self.preprocessing_applied = []
|
||||
|
||||
|
||||
class VinPreprocessor:
    """VIN-optimized image preprocessing for improved OCR accuracy."""

    def preprocess(
        self,
        image_bytes: bytes,
        apply_clahe: bool = True,
        apply_deskew: bool = True,
        apply_denoise: bool = True,
        apply_threshold: bool = True,
    ) -> PreprocessingResult:
        """
        Apply VIN-optimized preprocessing pipeline.

        Pipeline:
        1. HEIC conversion (if needed)
        2. Grayscale conversion
        3. Deskew (correct rotation/tilt)
        4. Contrast enhancement (CLAHE)
        5. Noise reduction (fastNlMeansDenoising)
        6. Adaptive thresholding

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            apply_clahe: Apply CLAHE contrast enhancement
            apply_deskew: Apply deskew correction
            apply_denoise: Apply noise reduction
            apply_threshold: Apply adaptive thresholding

        Returns:
            PreprocessingResult with processed image bytes
        """
        steps_applied = []

        # Load image with PIL (handles HEIC via pillow-heif)
        pil_image = Image.open(io.BytesIO(image_bytes))
        steps_applied.append("loaded")

        # Convert to RGB if needed (e.g. RGBA, palette, CMYK modes)
        if pil_image.mode not in ("RGB", "L"):
            pil_image = pil_image.convert("RGB")
            steps_applied.append("convert_rgb")

        # Convert to grayscale. Going RGB -> GRAY directly avoids the
        # redundant RGB -> BGR round-trip the pipeline previously did;
        # OpenCV uses the same luma coefficients either way, so the
        # output is identical with one less full-image copy.
        cv_image = np.array(pil_image)
        if len(cv_image.shape) == 3:
            gray = cv2.cvtColor(cv_image, cv2.COLOR_RGB2GRAY)
        else:
            gray = cv_image
        steps_applied.append("grayscale")

        # Apply deskew
        if apply_deskew:
            gray = self._deskew(gray)
            steps_applied.append("deskew")

        # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
        if apply_clahe:
            gray = self._apply_clahe(gray)
            steps_applied.append("clahe")

        # Apply denoising
        if apply_denoise:
            gray = self._denoise(gray)
            steps_applied.append("denoise")

        # Apply adaptive thresholding
        if apply_threshold:
            gray = self._adaptive_threshold(gray)
            steps_applied.append("threshold")

        # Convert back to PNG bytes (lossless, preserves binarized edges)
        result_image = Image.fromarray(gray)
        buffer = io.BytesIO()
        result_image.save(buffer, format="PNG")

        return PreprocessingResult(
            image_bytes=buffer.getvalue(),
            preprocessing_applied=steps_applied,
        )

    def _apply_clahe(self, image: np.ndarray) -> np.ndarray:
        """
        Apply CLAHE (Contrast Limited Adaptive Histogram Equalization).

        CLAHE improves contrast in images with varying illumination,
        which is common in VIN photos taken in different lighting conditions.
        Returns the input unchanged if OpenCV rejects it.
        """
        try:
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            return clahe.apply(image)
        except cv2.error as e:
            logger.warning(f"CLAHE failed: {e}")
            return image

    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """
        Correct image rotation using Hough transform line detection.

        VIN plates/stickers are often photographed at slight angles.
        Falls back to the unmodified image whenever no usable lines are
        found or the rotation would be too small/large to be worthwhile.
        """
        try:
            # Detect edges
            edges = cv2.Canny(image, 50, 150, apertureSize=3)

            # Detect lines
            lines = cv2.HoughLinesP(
                edges,
                rho=1,
                theta=np.pi / 180,
                threshold=100,
                minLineLength=100,
                maxLineGap=10,
            )

            if lines is None:
                return image

            # Calculate angles of detected lines
            angles = []
            for line in lines:
                x1, y1, x2, y2 = line[0]
                if x2 - x1 != 0:
                    angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
                    # Only consider nearly horizontal lines
                    if -45 < angle < 45:
                        angles.append(angle)

            if not angles:
                return image

            # Use median angle to avoid outliers
            median_angle = np.median(angles)

            # Only correct if skew is significant but not extreme
            # (<0.5 deg: not worth the interpolation blur; >20 deg:
            # likely a mis-detection rather than genuine skew)
            if abs(median_angle) < 0.5 or abs(median_angle) > 20:
                return image

            # Rotate to correct skew
            height, width = image.shape[:2]
            center = (width // 2, height // 2)
            rotation_matrix = cv2.getRotationMatrix2D(center, median_angle, 1.0)

            # Calculate new bounds so the rotated image is not cropped
            cos_val = abs(rotation_matrix[0, 0])
            sin_val = abs(rotation_matrix[0, 1])
            new_width = int(height * sin_val + width * cos_val)
            new_height = int(height * cos_val + width * sin_val)

            # Shift the rotation so the image stays centered in new bounds
            rotation_matrix[0, 2] += (new_width - width) / 2
            rotation_matrix[1, 2] += (new_height - height) / 2

            rotated = cv2.warpAffine(
                image,
                rotation_matrix,
                (new_width, new_height),
                borderMode=cv2.BORDER_REPLICATE,
            )

            logger.debug(f"Deskewed by {median_angle:.2f} degrees")
            return rotated

        except Exception as e:
            # Deskew is best-effort: any failure degrades gracefully to
            # the original image rather than aborting the pipeline.
            logger.warning(f"Deskew failed: {e}")
            return image

    def _denoise(self, image: np.ndarray) -> np.ndarray:
        """
        Apply non-local means denoising.

        This helps remove noise while preserving VIN character edges.
        Returns the input unchanged if OpenCV rejects it.
        """
        try:
            return cv2.fastNlMeansDenoising(
                image, h=10, templateWindowSize=7, searchWindowSize=21
            )
        except cv2.error as e:
            logger.warning(f"Denoising failed: {e}")
            return image

    def _adaptive_threshold(self, image: np.ndarray) -> np.ndarray:
        """
        Apply adaptive thresholding for binarization.

        Adaptive thresholding handles varying illumination across the image,
        which is common in VIN photos. Returns the input unchanged if
        OpenCV rejects it.
        """
        try:
            return cv2.adaptiveThreshold(
                image,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                blockSize=11,
                C=2,
            )
        except cv2.error as e:
            logger.warning(f"Adaptive threshold failed: {e}")
            return image

    def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]:
        """
        Attempt to detect the VIN region in an image.

        Uses contour detection to find rectangular regions that might contain VINs.

        Args:
            image_bytes: Raw image bytes

        Returns:
            BoundingBox of detected VIN region, or None if not found
        """
        try:
            pil_image = Image.open(io.BytesIO(image_bytes))
            if pil_image.mode != "L":
                pil_image = pil_image.convert("L")

            cv_image = np.array(pil_image)

            # Apply preprocessing for better contour detection
            blurred = cv2.GaussianBlur(cv_image, (5, 5), 0)
            edges = cv2.Canny(blurred, 50, 150)

            # Find contours
            contours, _ = cv2.findContours(
                edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
            )

            if not contours:
                return None

            # Find rectangular contours with appropriate aspect ratio for VIN
            # VIN is typically 17 characters, roughly 5:1 to 10:1 aspect ratio
            vin_candidates = []

            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                if h == 0:
                    continue

                aspect_ratio = w / h
                area = w * h

                # VIN regions typically have:
                # - Aspect ratio between 4:1 and 12:1
                # - Minimum area (to filter out noise)
                if 4 <= aspect_ratio <= 12 and area > 1000:
                    vin_candidates.append((x, y, w, h, area))

            if not vin_candidates:
                return None

            # Return the largest candidate
            vin_candidates.sort(key=lambda c: c[4], reverse=True)
            x, y, w, h, _ = vin_candidates[0]

            return BoundingBox(x=x, y=y, width=w, height=h)

        except Exception as e:
            # Detection is optional metadata; never let it break extraction.
            logger.warning(f"VIN region detection failed: {e}")
            return None
|
||||
|
||||
|
||||
# Singleton instance. VinPreprocessor keeps no per-call state (no
# instance attributes are ever assigned), so one shared module-level
# instance is safe to reuse across requests.
vin_preprocessor = VinPreprocessor()
|
||||
Reference in New Issue
Block a user