fix: resolve VIN OCR scanning failures on all images (refs #113)

Root cause: Tesseract fragments VINs into multiple words, but candidate extraction required continuous 17-char sequences, rejecting all results.

Changes:
- Fix candidate extraction to concatenate adjacent OCR fragments
- Disable Tesseract dictionaries (VINs are not dictionary words)
- Set OEM 1 (LSTM engine) for better accuracy
- Add PSM 11 (sparse text) and PSM 13 (raw line) fallback modes
- Add Otsu's thresholding as alternative preprocessing pipeline
- Upscale small images to meet Tesseract's 300 DPI requirement
- Remove incorrect B->8 and S->5 transliterations (valid VIN chars)
- Fix pre-existing test bug in check digit expected value

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 15:57:14 -06:00
parent 45aaeab973
commit 6a4c2137f7
5 changed files with 248 additions and 31 deletions
--- a/ocr/app/preprocessors/vin_preprocessor.py
+++ b/ocr/app/preprocessors/vin_preprocessor.py
@@ -93,6 +93,10 @@ class VinPreprocessor:
            gray = cv_image
        steps_applied.append("grayscale")

+        # Upscale small images for better OCR (Tesseract needs ~300 DPI)
+        gray = self._ensure_minimum_resolution(gray)
+        steps_applied.append("resolution_check")
+
        # Apply deskew
        if apply_deskew:
            gray = self._deskew(gray)
@@ -123,6 +127,29 @@ class VinPreprocessor:
            preprocessing_applied=steps_applied,
        )

+    # Minimum width in pixels for reliable VIN OCR; narrower images are upscaled.
+    # A 17-char VIN needs ~30px per character for Tesseract accuracy.
+    MIN_WIDTH_FOR_VIN = 600
+
+    def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray:
+        """
+        Upscale image to MIN_WIDTH_FOR_VIN pixels wide if it is narrower.
+
+        Aspect ratio is kept. Images that are already wide enough, and empty
+        images (zero width or height), are returned unchanged. Tesseract
+        works best at ~300 DPI, which small VIN crops fall well below.
+        """
+        height, width = image.shape[:2]
+        if height > 0 and 0 < width < self.MIN_WIDTH_FOR_VIN:
+            # Integer math only: int(width * scale) could truncate to 599.
+            new_width = self.MIN_WIDTH_FOR_VIN
+            new_height = max(1, height * new_width // width)
+            image = cv2.resize(
+                image, (new_width, new_height), interpolation=cv2.INTER_CUBIC
+            )
+            logger.debug(f"Upscaled image from {width}x{height} to {new_width}x{new_height}")
+        return image
+
    def _apply_clahe(self, image: np.ndarray) -> np.ndarray:
        """
        Apply CLAHE (Contrast Limited Adaptive Histogram Equalization).
@@ -242,6 +269,69 @@ class VinPreprocessor:
            logger.warning(f"Adaptive threshold failed: {e}")
            return image

+    def _otsu_threshold(self, image: np.ndarray) -> np.ndarray:
+        """
+        Binarize an image using Otsu's thresholding.
+
+        Otsu's method picks the threshold value automatically, which can
+        work better than adaptive thresholding on evenly-lit images.
+        If OpenCV raises, a warning is logged and the input is returned.
+        """
+        otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
+        try:
+            binarized = cv2.threshold(image, 0, 255, otsu_flags)[1]
+        except cv2.error as e:
+            logger.warning(f"Otsu threshold failed: {e}")
+            return image
+        return binarized
+
+    def preprocess_otsu(self, image_bytes: bytes) -> PreprocessingResult:
+        """
+        Run the Otsu-based preprocessing pipeline on encoded image bytes.
+
+        Intended as a fallback when the adaptive-threshold pipeline does
+        not give good OCR results. The image is converted to grayscale,
+        upscaled if narrow, contrast-enhanced, denoised and binarized.
+
+        Returns a PreprocessingResult holding the PNG-encoded output and
+        the ordered names of the steps that were applied.
+        """
+        source = Image.open(io.BytesIO(image_bytes))
+        steps_applied = ["loaded"]
+
+        # Only "RGB" and single-channel "L" images pass through untouched.
+        if source.mode not in ("RGB", "L"):
+            source = source.convert("RGB")
+            steps_applied.append("convert_rgb")
+
+        pixels = np.array(source)
+        if pixels.ndim == 3:
+            # Colour input goes RGB -> BGR -> gray.
+            bgr = cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR)
+            gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = pixels
+        steps_applied.append("grayscale")
+
+        # Stages run in order; each takes and returns the working image.
+        pipeline = (
+            ("resolution_check", self._ensure_minimum_resolution),
+            ("clahe", self._apply_clahe),
+            ("denoise", self._denoise),
+            ("otsu_threshold", self._otsu_threshold),
+        )
+        for step_name, stage in pipeline:
+            gray = stage(gray)
+            steps_applied.append(step_name)
+
+        png_buffer = io.BytesIO()
+        Image.fromarray(gray).save(png_buffer, format="PNG")
+
+        return PreprocessingResult(
+            image_bytes=png_buffer.getvalue(),
+            preprocessing_applied=steps_applied,
+        )
+
    def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]:
        """
        Attempt to detect the VIN region in an image.