fix: resolve VIN OCR scanning failures on all images (refs #113)

Root cause: Tesseract fragments VINs into multiple words, but candidate extraction required continuous 17-char sequences, rejecting all results.

Changes:
- Fix candidate extraction to concatenate adjacent OCR fragments
- Disable Tesseract dictionaries (VINs are not dictionary words)
- Set OEM 1 (LSTM engine) for better accuracy
- Add PSM 11 (sparse text) and PSM 13 (raw line) fallback modes
- Add Otsu's thresholding as alternative preprocessing pipeline
- Upscale small images to meet Tesseract's 300 DPI requirement
- Remove incorrect B->8 and S->5 transliterations (valid VIN chars)
- Fix pre-existing test bug in check digit expected value

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 15:57:14 -06:00
parent 45aaeab973
commit 6a4c2137f7
5 changed files with 248 additions and 31 deletions
--- a/ocr/app/extractors/vin_extractor.py
+++ b/ocr/app/extractors/vin_extractor.py
@@ -103,6 +103,14 @@ class VinExtractor(BaseExtractor):
                # No VIN candidates found - try with different PSM modes
                candidates = self._try_alternate_ocr(preprocessed_bytes)

+            if not candidates:
+                # Try alternative preprocessing (Otsu's thresholding)
+                otsu_result = vin_preprocessor.preprocess_otsu(image_bytes)
+                raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
+                candidates = vin_validator.extract_candidates(raw_text)
+                if not candidates:
+                    candidates = self._try_alternate_ocr(otsu_result.image_bytes)
+
            if not candidates:
                return VinExtractionResult(
                    success=False,
@@ -200,10 +208,14 @@ class VinExtractor(BaseExtractor):
        image = Image.open(io.BytesIO(image_bytes))

        # Configure Tesseract for VIN extraction
-        # Use character whitelist to exclude I, O, Q
+        # OEM 1 = LSTM neural network engine (best accuracy)
+        # Disable dictionaries since VINs are not dictionary words
        config = (
            f"--psm {psm} "
-            f"-c tessedit_char_whitelist={self.VIN_WHITELIST}"
+            f"--oem 1 "
+            f"-c tessedit_char_whitelist={self.VIN_WHITELIST} "
+            f"-c load_system_dawg=false "
+            f"-c load_freq_dawg=false"
        )

        # Get detailed OCR data
@@ -228,20 +240,20 @@ class VinExtractor(BaseExtractor):
        """
        Try alternate OCR configurations when initial extraction fails.

+        PSM modes tried in order:
+            7  - Single text line
+            8  - Single word
+            11 - Sparse text (finds text in any order, good for angled photos)
+            13 - Raw line (no Tesseract heuristics, good for clean VIN plates)
+
        Returns:
            List of VIN candidates
        """
-        # Try PSM 7 (single text line)
-        raw_text, _ = self._perform_ocr(image_bytes, psm=7)
-        candidates = vin_validator.extract_candidates(raw_text)
-        if candidates:
-            return candidates
-
-        # Try PSM 8 (single word)
-        raw_text, _ = self._perform_ocr(image_bytes, psm=8)
-        candidates = vin_validator.extract_candidates(raw_text)
-        if candidates:
-            return candidates
+        for psm in (7, 8, 11, 13):
+            raw_text, _ = self._perform_ocr(image_bytes, psm=psm)
+            candidates = vin_validator.extract_candidates(raw_text)
+            if candidates:
+                return candidates

        return []