fix: resolve VIN OCR scanning failures on all images (refs #113)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 35s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 51s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m31s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 35s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 51s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m31s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Root cause: Tesseract fragments VINs into multiple words, but candidate extraction required continuous 17-char sequences, so all results were rejected.

Changes:
- Fix candidate extraction to concatenate adjacent OCR fragments
- Disable Tesseract dictionaries (VINs are not dictionary words)
- Set OEM 1 (LSTM engine) for better accuracy
- Add PSM 11 (sparse text) and PSM 13 (raw line) fallback modes
- Add Otsu's thresholding as an alternative preprocessing pipeline
- Upscale small images to meet Tesseract's 300 DPI requirement
- Remove incorrect B->8 and S->5 transliterations (B and S are valid VIN chars)
- Fix pre-existing test bug in check digit expected value

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -93,6 +93,10 @@ class VinPreprocessor:
|
||||
gray = cv_image
|
||||
steps_applied.append("grayscale")
|
||||
|
||||
# Upscale small images for better OCR (Tesseract needs ~300 DPI)
|
||||
gray = self._ensure_minimum_resolution(gray)
|
||||
steps_applied.append("resolution_check")
|
||||
|
||||
# Apply deskew
|
||||
if apply_deskew:
|
||||
gray = self._deskew(gray)
|
||||
@@ -123,6 +127,29 @@ class VinPreprocessor:
|
||||
preprocessing_applied=steps_applied,
|
||||
)
|
||||
|
||||
# Minimum width in pixels for reliable VIN OCR.
|
||||
# A 17-char VIN needs ~30px per character (~510px in total) for Tesseract
# accuracy; the 600px floor below leaves some headroom above that.
|
||||
MIN_WIDTH_FOR_VIN = 600
|
||||
|
||||
def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray:
    """
    Upscale image if too small for reliable OCR.

    Tesseract works best at ~300 DPI. Mobile photos of VINs may have
    the text occupy only a small portion of the frame, resulting in
    low effective resolution for the VIN characters.

    Args:
        image: Image array whose first two dimensions are (height, width).

    Returns:
        The image resized with bicubic interpolation so that its width
        equals ``MIN_WIDTH_FOR_VIN`` (aspect ratio preserved), or the
        input unchanged when it is already wide enough or has no pixels.
    """
    height, width = image.shape[:2]
    target_width = int(self.MIN_WIDTH_FOR_VIN)

    # Wide-enough images need no work. An empty image cannot be scaled
    # (and a zero width would divide by zero below), so pass it through.
    if width >= target_width or width <= 0 or height <= 0:
        return image

    # Use the target width directly instead of int(width * scale): float
    # round-off in that product can truncate to one pixel below the minimum.
    scale = target_width / width
    new_width = target_width
    # Round (not truncate) the height and never let it collapse to zero.
    new_height = max(1, round(height * scale))

    image = cv2.resize(
        image, (new_width, new_height), interpolation=cv2.INTER_CUBIC
    )
    logger.debug(
        "Upscaled image from %dx%d to %dx%d",
        width, height, new_width, new_height,
    )
    return image
|
||||
|
||||
def _apply_clahe(self, image: np.ndarray) -> np.ndarray:
|
||||
"""
|
||||
Apply CLAHE (Contrast Limited Adaptive Histogram Equalization).
|
||||
@@ -242,6 +269,69 @@ class VinPreprocessor:
|
||||
logger.warning(f"Adaptive threshold failed: {e}")
|
||||
return image
|
||||
|
||||
def _otsu_threshold(self, image: np.ndarray) -> np.ndarray:
    """
    Binarize an image with Otsu's thresholding.

    Otsu's method picks the threshold value automatically, which can
    outperform adaptive thresholding on evenly-lit images.

    Args:
        image: Image array handed to ``cv2.threshold``.

    Returns:
        The thresholded image, or the input image unchanged if OpenCV
        raises an error (a warning is logged in that case).
    """
    # THRESH_OTSU makes OpenCV compute the threshold itself, so the
    # explicit threshold argument (0) is only a placeholder.
    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    try:
        binarized = cv2.threshold(image, 0, 255, otsu_flags)[1]
    except cv2.error as e:
        logger.warning(f"Otsu threshold failed: {e}")
        return image
    return binarized
|
||||
|
||||
def preprocess_otsu(self, image_bytes: bytes) -> PreprocessingResult:
    """
    Run the Otsu-based preprocessing pipeline on an encoded image.

    Intended as the fallback for cases where the adaptive-threshold
    pipeline does not produce good OCR results.

    Args:
        image_bytes: Encoded image data that PIL can open.

    Returns:
        PreprocessingResult carrying the processed image as PNG bytes
        and the ordered list of step names that were applied.
    """
    source = Image.open(io.BytesIO(image_bytes))
    steps_applied = ["loaded"]

    # Any mode other than RGB or single-channel "L" is normalized to RGB.
    if source.mode != "RGB" and source.mode != "L":
        source = source.convert("RGB")
        steps_applied.append("convert_rgb")

    pixels = np.array(source)
    if pixels.ndim == 3:
        # Colour input: switch to OpenCV's BGR order, then collapse to gray.
        bgr = cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    else:
        # Already single-channel.
        gray = pixels
    steps_applied.append("grayscale")

    # Remaining stages run in this fixed order; each one is recorded
    # under its step name once it has been applied.
    pipeline = (
        ("resolution_check", self._ensure_minimum_resolution),
        ("clahe", self._apply_clahe),
        ("denoise", self._denoise),
        ("otsu_threshold", self._otsu_threshold),
    )
    for step_name, transform in pipeline:
        gray = transform(gray)
        steps_applied.append(step_name)

    # Re-encode the processed array as PNG for the caller.
    png_buffer = io.BytesIO()
    Image.fromarray(gray).save(png_buffer, format="PNG")

    return PreprocessingResult(
        image_bytes=png_buffer.getvalue(),
        preprocessing_applied=steps_applied,
    )
|
||||
|
||||
def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]:
|
||||
"""
|
||||
Attempt to detect the VIN region in an image.
|
||||
|
||||
Reference in New Issue
Block a user