fix: resolve VIN OCR scanning failures on all images (refs #113)

Root cause: Tesseract fragments VINs into multiple words but candidate extraction required continuous 17-char sequences, rejecting all results. Changes: - Fix candidate extraction to concatenate adjacent OCR fragments - Disable Tesseract dictionaries (VINs are not dictionary words) - Set OEM 1 (LSTM engine) for better accuracy - Add PSM 11 (sparse text) and PSM 13 (raw line) fallback modes - Add Otsu's thresholding as alternative preprocessing pipeline - Upscale small images to meet Tesseract's 300 DPI requirement - Remove incorrect L->1, B->8 and S->5 transliterations (valid VIN chars) - Fix pre-existing test bug in check digit expected value Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 15:57:14 -06:00
parent 45aaeab973
commit 6a4c2137f7
5 changed files with 248 additions and 31 deletions
--- a/ocr/app/validators/vin_validator.py
+++ b/ocr/app/validators/vin_validator.py
@@ -20,7 +20,9 @@ class VinValidator:
    # VIN character set (excludes I, O, Q)
    VALID_CHARS = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789")

-    # Common OCR misreads and their corrections
+    # Common OCR misreads and their corrections.
+    # Only map characters that are INVALID in VINs to their likely correct values.
+    # L, B, and S are valid VIN characters and must NOT be transliterated.
    TRANSLITERATION = {
        "I": "1",
        "O": "0",
@@ -29,9 +31,6 @@ class VinValidator:
        "o": "0",
        "q": "0",
        "l": "1",
-        "L": "1",
-        "B": "8",  # Sometimes confused
-        "S": "5",  # Sometimes confused
    }

    # Weights for check digit calculation (positions 1-17)
@@ -224,6 +223,11 @@ class VinValidator:
        """
        Extract VIN candidates from raw OCR text.

+        Uses two strategies:
+        1. Find continuous 11-20 char alphanumeric runs (handles intact VINs)
+        2. Concatenate adjacent fragments (3+ chars) split by spaces, dashes, or
+           other separators (handles Tesseract fragmenting VINs into multiple words)
+
        Args:
            text: Raw OCR text
            max_candidates: Maximum number of candidates to return
@@ -231,29 +235,70 @@ class VinValidator:
        Returns:
            List of (vin, start_pos, end_pos) tuples
        """
-        # Pattern to find potential VIN sequences
-        # Allow some flexibility for OCR errors (include I, O, Q for correction later)
-        potential_vin_pattern = re.compile(r"[A-Z0-9IOQ]{11,17}", re.IGNORECASE)
-
        candidates = []
-        for match in potential_vin_pattern.finditer(text.upper()):
-            candidate = match.group()
-            corrected = self.correct_ocr_errors(candidate)
+        seen_vins: set[str] = set()

-            # Only include if it could be a valid VIN after correction
-            if len(corrected) == 17 and self.MODERN_VIN_PATTERN.match(corrected):
-                candidates.append((corrected, match.start(), match.end()))
+        upper_text = text.upper()

-        # Sort by likelihood of being valid (check digit validation)
+        # Strategy 1: Find continuous runs of VIN-like characters
+        continuous_pattern = re.compile(r"[A-Z0-9IOQ]{11,20}", re.IGNORECASE)
+        for match in continuous_pattern.finditer(upper_text):
+            self._try_add_candidate(
+                match.group(), match.start(), match.end(), candidates, seen_vins
+            )
+
+        # Strategy 2: Concatenate adjacent alphanumeric fragments
+        # This handles OCR fragmentation like "1HGBH 41JXMN 109186"
+        # Only consider fragments >= 3 chars (filters out noise/short words)
+        fragment_pattern = re.compile(r"[A-Z0-9IOQ]{3,}", re.IGNORECASE)
+        fragments = [
+            (m.group(), m.start(), m.end())
+            for m in fragment_pattern.finditer(upper_text)
+        ]
+
+        # Try sliding windows of 2-4 adjacent fragments
+        for window_size in range(2, min(5, len(fragments) + 1)):
+            for i in range(len(fragments) - window_size + 1):
+                window = fragments[i : i + window_size]
+                combined = "".join(f[0] for f in window)
+                # Combined length must be close to 17 (allow +/- 2 for OCR noise)
+                # Must contain at least 2 digit characters (VINs always have digits;
+                # pure-alphabetic text is almost certainly not a VIN)
+                if 15 <= len(combined) <= 19 and sum(c.isdigit() for c in combined) >= 2:
+                    self._try_add_candidate(
+                        combined, window[0][1], window[-1][2], candidates, seen_vins
+                    )
+
+        # Sort check-digit-valid candidates first; the stable sort keeps discovery order
        def score_candidate(c: tuple[str, int, int]) -> int:
            vin = c[0]
            if self.validate_check_digit(vin):
-                return 0  # Best score
+                return 0
            return 1

        candidates.sort(key=score_candidate)
        return candidates[:max_candidates]

+    def _try_add_candidate(
+        self,
+        raw: str,
+        start: int,
+        end: int,
+        candidates: list[tuple[str, int, int]],
+        seen_vins: set[str],
+    ) -> None:
+        """Try to add a corrected VIN candidate if it passes validation."""
+        corrected = self.correct_ocr_errors(raw)
+
+        # Trim to 17 chars if OCR captured extra characters
+        if len(corrected) > 17:
+            corrected = corrected[:17]
+
+        if len(corrected) == 17 and self.MODERN_VIN_PATTERN.match(corrected):
+            if corrected not in seen_vins:
+                seen_vins.add(corrected)
+                candidates.append((corrected, start, end))
+

 # Singleton instance
 vin_validator = VinValidator()