From 6a4c2137f7e7edf6a345abc55de5ec9be3df0bca Mon Sep 17 00:00:00 2001
From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com>
Date: Fri, 6 Feb 2026 15:57:14 -0600
Subject: [PATCH] fix: resolve VIN OCR scanning failures on all images (refs
 #113)

Root cause: Tesseract fragments VINs into multiple words but candidate
extraction required continuous 17-char sequences, rejecting all results.

Changes:
- Fix candidate extraction to concatenate adjacent OCR fragments
- Disable Tesseract dictionaries (VINs are not dictionary words)
- Set OEM 1 (LSTM engine) for better accuracy
- Add PSM 11 (sparse text) and PSM 13 (raw line) fallback modes
- Add Otsu's thresholding as alternative preprocessing pipeline
- Upscale small images (<600 px wide) toward Tesseract's ~300 DPI optimum
- Remove incorrect L->1, B->8 and S->5 transliterations (valid VIN chars)
- Fix pre-existing test bug in check digit expected value

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ocr/app/extractors/vin_extractor.py       | 38 ++++++----
 ocr/app/preprocessors/vin_preprocessor.py | 90 +++++++++++++++++++++++
 ocr/app/validators/vin_validator.py       | 77 +++++++++++++++----
 ocr/tests/test_vin_preprocessor.py        | 49 ++++++++++++
 ocr/tests/test_vin_validator.py           | 25 ++++++-
 5 files changed, 248 insertions(+), 31 deletions(-)

diff --git a/ocr/app/extractors/vin_extractor.py b/ocr/app/extractors/vin_extractor.py
index 37fdad1..ef1cb67 100644
--- a/ocr/app/extractors/vin_extractor.py
+++ b/ocr/app/extractors/vin_extractor.py
@@ -103,6 +103,14 @@ class VinExtractor(BaseExtractor):
                 # No VIN candidates found - try with different PSM modes
                 candidates = self._try_alternate_ocr(preprocessed_bytes)
 
+            if not candidates:
+                # Try alternative preprocessing (Otsu's thresholding)
+                otsu_result = vin_preprocessor.preprocess_otsu(image_bytes)
+                raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
+                candidates = vin_validator.extract_candidates(raw_text)
+                if not candidates:
+                    candidates = self._try_alternate_ocr(otsu_result.image_bytes)
+
             if not candidates:
                 return VinExtractionResult(
                     success=False,
@@ -200,10 +208,14 @@ class VinExtractor(BaseExtractor):
         image = Image.open(io.BytesIO(image_bytes))
 
         # Configure Tesseract for VIN extraction
-        # Use character whitelist to exclude I, O, Q
+        # OEM 1 = LSTM neural network engine (best accuracy)
+        # Disable dictionaries since VINs are not dictionary words
         config = (
             f"--psm {psm} "
-            f"-c tessedit_char_whitelist={self.VIN_WHITELIST}"
+            f"--oem 1 "
+            f"-c tessedit_char_whitelist={self.VIN_WHITELIST} "
+            f"-c load_system_dawg=false "
+            f"-c load_freq_dawg=false"
         )
 
         # Get detailed OCR data
@@ -228,20 +240,20 @@ class VinExtractor(BaseExtractor):
         """
         Try alternate OCR configurations when initial extraction fails.
 
+        PSM modes tried in order:
+            7  - Single text line
+            8  - Single word
+            11 - Sparse text (finds text in any order, good for angled photos)
+            13 - Raw line (no Tesseract heuristics, good for clean VIN plates)
+
         Returns:
             List of VIN candidates
         """
-        # Try PSM 7 (single text line)
-        raw_text, _ = self._perform_ocr(image_bytes, psm=7)
-        candidates = vin_validator.extract_candidates(raw_text)
-        if candidates:
-            return candidates
-
-        # Try PSM 8 (single word)
-        raw_text, _ = self._perform_ocr(image_bytes, psm=8)
-        candidates = vin_validator.extract_candidates(raw_text)
-        if candidates:
-            return candidates
+        for psm in (7, 8, 11, 13):
+            raw_text, _ = self._perform_ocr(image_bytes, psm=psm)
+            candidates = vin_validator.extract_candidates(raw_text)
+            if candidates:
+                return candidates
 
         return []
 
diff --git a/ocr/app/preprocessors/vin_preprocessor.py b/ocr/app/preprocessors/vin_preprocessor.py
index e0ffbba..95ba4bc 100644
--- a/ocr/app/preprocessors/vin_preprocessor.py
+++ b/ocr/app/preprocessors/vin_preprocessor.py
@@ -93,6 +93,10 @@ class VinPreprocessor:
             gray = cv_image
         steps_applied.append("grayscale")
 
+        # Upscale small images for better OCR (Tesseract needs ~300 DPI)
+        gray = self._ensure_minimum_resolution(gray)
+        steps_applied.append("resolution_check")
+
         # Apply deskew
         if apply_deskew:
             gray = self._deskew(gray)
@@ -123,6 +127,29 @@ class VinPreprocessor:
             preprocessing_applied=steps_applied,
         )
 
+    # Minimum width in pixels for reliable VIN OCR.
+    # A 17-char VIN needs ~30px per character for Tesseract accuracy.
+    MIN_WIDTH_FOR_VIN = 600
+
+    def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray:
+        """
+        Upscale image if too small for reliable OCR.
+
+        Tesseract works best at ~300 DPI. Mobile photos of VINs may have
+        the text occupy only a small portion of the frame, resulting in
+        low effective resolution for the VIN characters.
+        """
+        height, width = image.shape[:2]
+        if width < self.MIN_WIDTH_FOR_VIN:
+            scale = self.MIN_WIDTH_FOR_VIN / width
+            new_width = int(width * scale)
+            new_height = int(height * scale)
+            image = cv2.resize(
+                image, (new_width, new_height), interpolation=cv2.INTER_CUBIC
+            )
+            logger.debug(f"Upscaled image from {width}x{height} to {new_width}x{new_height}")
+        return image
+
     def _apply_clahe(self, image: np.ndarray) -> np.ndarray:
         """
         Apply CLAHE (Contrast Limited Adaptive Histogram Equalization).
@@ -242,6 +269,69 @@ class VinPreprocessor:
             logger.warning(f"Adaptive threshold failed: {e}")
             return image
 
+    def _otsu_threshold(self, image: np.ndarray) -> np.ndarray:
+        """
+        Apply Otsu's thresholding for binarization.
+
+        Otsu's method auto-calculates the optimal threshold value,
+        which can work better than adaptive thresholding on evenly-lit images.
+        """
+        try:
+            _, result = cv2.threshold(
+                image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
+            )
+            return result
+        except cv2.error as e:
+            logger.warning(f"Otsu threshold failed: {e}")
+            return image
+
+    def preprocess_otsu(self, image_bytes: bytes) -> PreprocessingResult:
+        """
+        Alternative preprocessing pipeline using Otsu's thresholding.
+
+        Used as a fallback when adaptive thresholding doesn't produce
+        good OCR results.
+        """
+        steps_applied = []
+
+        pil_image = Image.open(io.BytesIO(image_bytes))
+        steps_applied.append("loaded")
+
+        if pil_image.mode not in ("RGB", "L"):
+            pil_image = pil_image.convert("RGB")
+            steps_applied.append("convert_rgb")
+
+        cv_image = np.array(pil_image)
+        if len(cv_image.shape) == 3:
+            cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
+
+        if len(cv_image.shape) == 3:
+            gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = cv_image
+        steps_applied.append("grayscale")
+
+        gray = self._ensure_minimum_resolution(gray)
+        steps_applied.append("resolution_check")
+
+        gray = self._apply_clahe(gray)
+        steps_applied.append("clahe")
+
+        gray = self._denoise(gray)
+        steps_applied.append("denoise")
+
+        gray = self._otsu_threshold(gray)
+        steps_applied.append("otsu_threshold")
+
+        result_image = Image.fromarray(gray)
+        buffer = io.BytesIO()
+        result_image.save(buffer, format="PNG")
+
+        return PreprocessingResult(
+            image_bytes=buffer.getvalue(),
+            preprocessing_applied=steps_applied,
+        )
+
     def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]:
         """
         Attempt to detect the VIN region in an image.
diff --git a/ocr/app/validators/vin_validator.py b/ocr/app/validators/vin_validator.py
index 6a4b264..7c74ae9 100644
--- a/ocr/app/validators/vin_validator.py
+++ b/ocr/app/validators/vin_validator.py
@@ -20,7 +20,9 @@ class VinValidator:
     # VIN character set (excludes I, O, Q)
     VALID_CHARS = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789")
 
-    # Common OCR misreads and their corrections
+    # Common OCR misreads and their corrections.
+    # Only map characters that are INVALID in VINs to their likely correct values.
+    # B, L and S are valid VIN characters and must NOT be transliterated.
     TRANSLITERATION = {
         "I": "1",
         "O": "0",
@@ -29,9 +31,6 @@ class VinValidator:
         "o": "0",
         "q": "0",
         "l": "1",
-        "L": "1",
-        "B": "8",  # Sometimes confused
-        "S": "5",  # Sometimes confused
     }
 
     # Weights for check digit calculation (positions 1-17)
@@ -224,6 +223,11 @@ class VinValidator:
         """
         Extract VIN candidates from raw OCR text.
 
+        Uses two strategies:
+        1. Find continuous 11-20 char alphanumeric runs (handles intact VINs)
+        2. Concatenate adjacent short fragments separated by spaces/dashes
+           (handles Tesseract fragmenting VINs into multiple words)
+
         Args:
             text: Raw OCR text
             max_candidates: Maximum number of candidates to return
@@ -231,29 +235,70 @@ class VinValidator:
         Returns:
             List of (vin, start_pos, end_pos) tuples
         """
-        # Pattern to find potential VIN sequences
-        # Allow some flexibility for OCR errors (include I, O, Q for correction later)
-        potential_vin_pattern = re.compile(r"[A-Z0-9IOQ]{11,17}", re.IGNORECASE)
-
         candidates = []
-        for match in potential_vin_pattern.finditer(text.upper()):
-            candidate = match.group()
-            corrected = self.correct_ocr_errors(candidate)
+        seen_vins: set[str] = set()
 
-            # Only include if it could be a valid VIN after correction
-            if len(corrected) == 17 and self.MODERN_VIN_PATTERN.match(corrected):
-                candidates.append((corrected, match.start(), match.end()))
+        upper_text = text.upper()
 
-        # Sort by likelihood of being valid (check digit validation)
+        # Strategy 1: Find continuous runs of VIN-like characters
+        continuous_pattern = re.compile(r"[A-Z0-9IOQ]{11,20}", re.IGNORECASE)
+        for match in continuous_pattern.finditer(upper_text):
+            self._try_add_candidate(
+                match.group(), match.start(), match.end(), candidates, seen_vins
+            )
+
+        # Strategy 2: Concatenate adjacent alphanumeric fragments
+        # This handles OCR fragmentation like "1HGBH 41JXMN 109186"
+        # Only consider fragments >= 3 chars (filters out noise/short words)
+        fragment_pattern = re.compile(r"[A-Z0-9IOQ]{3,}", re.IGNORECASE)
+        fragments = [
+            (m.group(), m.start(), m.end())
+            for m in fragment_pattern.finditer(upper_text)
+        ]
+
+        # Try sliding windows of 2-4 adjacent fragments
+        for window_size in range(2, min(5, len(fragments) + 1)):
+            for i in range(len(fragments) - window_size + 1):
+                window = fragments[i : i + window_size]
+                combined = "".join(f[0] for f in window)
+                # Combined length must be close to 17 (allow +/- 2 for OCR noise)
+                # Must contain at least 2 digit characters (VINs always have digits;
+                # pure-alphabetic text is almost certainly not a VIN)
+                if 15 <= len(combined) <= 19 and sum(c.isdigit() for c in combined) >= 2:
+                    self._try_add_candidate(
+                        combined, window[0][1], window[-1][2], candidates, seen_vins
+                    )
+
+        # Sort valid check digits first; the stable sort keeps discovery order within ties
         def score_candidate(c: tuple[str, int, int]) -> int:
             vin = c[0]
             if self.validate_check_digit(vin):
-                return 0  # Best score
+                return 0
             return 1
 
         candidates.sort(key=score_candidate)
         return candidates[:max_candidates]
 
+    def _try_add_candidate(
+        self,
+        raw: str,
+        start: int,
+        end: int,
+        candidates: list[tuple[str, int, int]],
+        seen_vins: set[str],
+    ) -> None:
+        """Try to add a corrected VIN candidate if it passes validation."""
+        corrected = self.correct_ocr_errors(raw)
+
+        # Trim to 17 chars if OCR captured extra characters
+        if len(corrected) > 17:
+            corrected = corrected[:17]
+
+        if len(corrected) == 17 and self.MODERN_VIN_PATTERN.match(corrected):
+            if corrected not in seen_vins:
+                seen_vins.add(corrected)
+                candidates.append((corrected, start, end))
+
 
 # Singleton instance
 vin_validator = VinValidator()
diff --git a/ocr/tests/test_vin_preprocessor.py b/ocr/tests/test_vin_preprocessor.py
index 8076294..2d81a7b 100644
--- a/ocr/tests/test_vin_preprocessor.py
+++ b/ocr/tests/test_vin_preprocessor.py
@@ -53,6 +53,7 @@ class TestVinPreprocessor:
         )
 
         assert "grayscale" in result.preprocessing_applied
+        assert "resolution_check" in result.preprocessing_applied
         assert "clahe" in result.preprocessing_applied
         assert "deskew" in result.preprocessing_applied
         assert "denoise" in result.preprocessing_applied
@@ -185,6 +186,54 @@ class TestVinPreprocessorThreshold:
         assert len(unique_values) <= 2
 
 
+class TestVinPreprocessorOtsu:
+    """Tests for Otsu's thresholding preprocessing."""
+
+    def test_otsu_threshold_creates_binary_image(self) -> None:
+        """Test Otsu's thresholding creates binary output."""
+        preprocessor = VinPreprocessor()
+        image = np.full((100, 400), 128, dtype=np.uint8)
+
+        result = preprocessor._otsu_threshold(image)
+
+        unique_values = np.unique(result)
+        assert len(unique_values) <= 2
+
+    def test_preprocess_otsu_returns_result(self) -> None:
+        """Test Otsu preprocessing pipeline returns valid result."""
+        preprocessor = VinPreprocessor()
+        image_bytes = create_test_image()
+
+        result = preprocessor.preprocess_otsu(image_bytes)
+
+        assert result.image_bytes is not None
+        assert len(result.image_bytes) > 0
+        assert "otsu_threshold" in result.preprocessing_applied
+        assert "grayscale" in result.preprocessing_applied
+
+
+class TestVinPreprocessorResolution:
+    """Tests for resolution upscaling."""
+
+    def test_upscale_small_image(self) -> None:
+        """Test small images are upscaled."""
+        preprocessor = VinPreprocessor()
+        small_image = np.full((50, 200), 128, dtype=np.uint8)
+
+        result = preprocessor._ensure_minimum_resolution(small_image)
+
+        assert result.shape[1] >= preprocessor.MIN_WIDTH_FOR_VIN
+
+    def test_no_upscale_large_image(self) -> None:
+        """Test large images are not upscaled."""
+        preprocessor = VinPreprocessor()
+        large_image = np.full((200, 800), 128, dtype=np.uint8)
+
+        result = preprocessor._ensure_minimum_resolution(large_image)
+
+        assert result.shape == large_image.shape
+
+
 class TestVinRegionDetection:
     """Tests for VIN region detection."""
 
diff --git a/ocr/tests/test_vin_validator.py b/ocr/tests/test_vin_validator.py
index 26f170b..241eabd 100644
--- a/ocr/tests/test_vin_validator.py
+++ b/ocr/tests/test_vin_validator.py
@@ -43,9 +43,9 @@ class TestVinValidator:
         result = validator.calculate_check_digit("1HGBH41JXMN109186")
         assert result == "X"
 
-        # 5YJSA1E28HF123456 has check digit 2 at position 9
+        # 5YJSA1E28HF123456 carries "8" at position 9, but its computed check digit is 5
         result = validator.calculate_check_digit("5YJSA1E28HF123456")
-        assert result == "8"  # Verify this is correct for this VIN
+        assert result == "5"
 
     def test_validate_check_digit_valid(self) -> None:
         """Test check digit validation with valid VIN."""
@@ -161,6 +161,27 @@ class TestVinValidator:
         assert len(candidates) >= 1
         assert candidates[0][0] == "1HGBH41JXMN109186"
 
+    def test_extract_candidates_fragmented_vin(self) -> None:
+        """Test candidate extraction handles space-fragmented VINs from OCR."""
+        validator = VinValidator()
+
+        # Tesseract often fragments VINs into multiple words
+        text = "1HGBH 41JXMN 109186"
+        candidates = validator.extract_candidates(text)
+
+        assert len(candidates) >= 1
+        assert candidates[0][0] == "1HGBH41JXMN109186"
+
+    def test_extract_candidates_dash_fragmented_vin(self) -> None:
+        """Test candidate extraction handles dash-separated VINs."""
+        validator = VinValidator()
+
+        text = "1HGBH41J-XMN109186"
+        candidates = validator.extract_candidates(text)
+
+        assert len(candidates) >= 1
+        assert candidates[0][0] == "1HGBH41JXMN109186"
+
     def test_extract_candidates_no_vin(self) -> None:
         """Test candidate extraction with no VIN."""
         validator = VinValidator()