fix: resolve VIN OCR scanning failures on all images (refs #113)

Root cause: Tesseract fragments VINs into multiple words, but candidate extraction required a continuous 17-character sequence, so every OCR result was rejected.

Changes:
- Fix candidate extraction to concatenate adjacent OCR fragments
- Disable Tesseract dictionaries (VINs are not dictionary words)
- Set OEM 1 (LSTM engine) for better accuracy
- Add PSM 11 (sparse text) and PSM 13 (raw line) fallback modes
- Add Otsu's thresholding as an alternative preprocessing pipeline
- Upscale small images to meet Tesseract's 300 DPI requirement
- Remove incorrect B->8 and S->5 transliterations (B and S are valid VIN characters)
- Fix pre-existing test bug in check digit expected value

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 15:57:14 -06:00
parent 45aaeab973
commit 6a4c2137f7
5 changed files with 248 additions and 31 deletions
--- a/ocr/tests/test_vin_preprocessor.py
+++ b/ocr/tests/test_vin_preprocessor.py
@@ -53,6 +53,7 @@ class TestVinPreprocessor:
        )

        assert "grayscale" in result.preprocessing_applied
+        assert "resolution_check" in result.preprocessing_applied
        assert "clahe" in result.preprocessing_applied
        assert "deskew" in result.preprocessing_applied
        assert "denoise" in result.preprocessing_applied
@@ -185,6 +186,54 @@ class TestVinPreprocessorThreshold:
        assert len(unique_values) <= 2


+class TestVinPreprocessorOtsu:
+    """Tests for Otsu's thresholding preprocessing."""
+
+    def test_otsu_threshold_creates_binary_image(self) -> None:
+        """Test Otsu's thresholding creates binary output."""
+        preprocessor = VinPreprocessor()
+        image = np.full((100, 400), 128, dtype=np.uint8)
+
+        result = preprocessor._otsu_threshold(image)
+
+        unique_values = np.unique(result)
+        assert len(unique_values) <= 2
+
+    def test_preprocess_otsu_returns_result(self) -> None:
+        """Test Otsu preprocessing pipeline returns valid result."""
+        preprocessor = VinPreprocessor()
+        image_bytes = create_test_image()
+
+        result = preprocessor.preprocess_otsu(image_bytes)
+
+        assert result.image_bytes is not None
+        assert len(result.image_bytes) > 0
+        assert "otsu_threshold" in result.preprocessing_applied
+        assert "grayscale" in result.preprocessing_applied
+
+
+class TestVinPreprocessorResolution:
+    """Tests for resolution upscaling."""
+
+    def test_upscale_small_image(self) -> None:
+        """Test small images are upscaled."""
+        preprocessor = VinPreprocessor()
+        small_image = np.full((50, 200), 128, dtype=np.uint8)
+
+        result = preprocessor._ensure_minimum_resolution(small_image)
+
+        assert result.shape[1] >= preprocessor.MIN_WIDTH_FOR_VIN
+
+    def test_no_upscale_large_image(self) -> None:
+        """Test large images are not upscaled."""
+        preprocessor = VinPreprocessor()
+        large_image = np.full((200, 800), 128, dtype=np.uint8)
+
+        result = preprocessor._ensure_minimum_resolution(large_image)
+
+        assert result.shape == large_image.shape
+
+
 class TestVinRegionDetection:
    """Tests for VIN region detection."""

--- a/ocr/tests/test_vin_validator.py
+++ b/ocr/tests/test_vin_validator.py
@@ -43,9 +43,9 @@ class TestVinValidator:
        result = validator.calculate_check_digit("1HGBH41JXMN109186")
        assert result == "X"

-        # 5YJSA1E28HF123456 has check digit 2 at position 9
+        # Computed check digit for 5YJSA1E28HF123456 is 5 (weighted sum 346 mod 11); the 8 at position 9 of this sample is not a valid check digit
        result = validator.calculate_check_digit("5YJSA1E28HF123456")
-        assert result == "8"  # Verify this is correct for this VIN
+        assert result == "5"

    def test_validate_check_digit_valid(self) -> None:
        """Test check digit validation with valid VIN."""
@@ -161,6 +161,27 @@ class TestVinValidator:
        assert len(candidates) >= 1
        assert candidates[0][0] == "1HGBH41JXMN109186"

+    def test_extract_candidates_fragmented_vin(self) -> None:
+        """Test candidate extraction handles space-fragmented VINs from OCR."""
+        validator = VinValidator()
+
+        # Tesseract often fragments VINs into multiple words
+        text = "1HGBH 41JXMN 109186"
+        candidates = validator.extract_candidates(text)
+
+        assert len(candidates) >= 1
+        assert candidates[0][0] == "1HGBH41JXMN109186"
+
+    def test_extract_candidates_dash_fragmented_vin(self) -> None:
+        """Test candidate extraction handles dash-separated VINs."""
+        validator = VinValidator()
+
+        text = "1HGBH41J-XMN109186"
+        candidates = validator.extract_candidates(text)
+
+        assert len(candidates) >= 1
+        assert candidates[0][0] == "1HGBH41JXMN109186"
+
    def test_extract_candidates_no_vin(self) -> None:
        """Test candidate extraction with no VIN."""
        validator = VinValidator()