From 6a4c2137f7e7edf6a345abc55de5ec9be3df0bca Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Fri, 6 Feb 2026 15:57:14 -0600 Subject: [PATCH] fix: resolve VIN OCR scanning failures on all images (refs #113) Root cause: Tesseract fragments VINs into multiple words but candidate extraction required continuous 17-char sequences, rejecting all results. Changes: - Fix candidate extraction to concatenate adjacent OCR fragments - Disable Tesseract dictionaries (VINs are not dictionary words) - Set OEM 1 (LSTM engine) for better accuracy - Add PSM 11 (sparse text) and PSM 13 (raw line) fallback modes - Add Otsu's thresholding as alternative preprocessing pipeline - Upscale small images to meet Tesseract's 300 DPI requirement - Remove incorrect B->8, S->5 and L->1 transliterations (B, S and L are valid VIN chars) - Fix pre-existing test bug in check digit expected value Co-Authored-By: Claude Opus 4.6 --- ocr/app/extractors/vin_extractor.py | 38 ++++++---- ocr/app/preprocessors/vin_preprocessor.py | 90 +++++++++++++++++++++++ ocr/app/validators/vin_validator.py | 77 +++++++++++++++---- ocr/tests/test_vin_preprocessor.py | 49 ++++++++++++ ocr/tests/test_vin_validator.py | 25 ++++++- 5 files changed, 248 insertions(+), 31 deletions(-) diff --git a/ocr/app/extractors/vin_extractor.py b/ocr/app/extractors/vin_extractor.py index 37fdad1..ef1cb67 100644 --- a/ocr/app/extractors/vin_extractor.py +++ b/ocr/app/extractors/vin_extractor.py @@ -103,6 +103,14 @@ class VinExtractor(BaseExtractor): # No VIN candidates found - try with different PSM modes candidates = self._try_alternate_ocr(preprocessed_bytes) + if not candidates: + # Try alternative preprocessing (Otsu's thresholding) + otsu_result = vin_preprocessor.preprocess_otsu(image_bytes) + raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes) + candidates = vin_validator.extract_candidates(raw_text) + if not candidates: + candidates = 
self._try_alternate_ocr(otsu_result.image_bytes) + if not candidates: return VinExtractionResult( success=False, @@ -200,10 +208,14 @@ class VinExtractor(BaseExtractor): image = Image.open(io.BytesIO(image_bytes)) # Configure Tesseract for VIN extraction - # Use character whitelist to exclude I, O, Q + # OEM 1 = LSTM neural network engine (best accuracy) + # Disable dictionaries since VINs are not dictionary words config = ( f"--psm {psm} " - f"-c tessedit_char_whitelist={self.VIN_WHITELIST}" + f"--oem 1 " + f"-c tessedit_char_whitelist={self.VIN_WHITELIST} " + f"-c load_system_dawg=false " + f"-c load_freq_dawg=false" ) # Get detailed OCR data @@ -228,20 +240,20 @@ class VinExtractor(BaseExtractor): """ Try alternate OCR configurations when initial extraction fails. + PSM modes tried in order: + 7 - Single text line + 8 - Single word + 11 - Sparse text (finds text in any order, good for angled photos) + 13 - Raw line (no Tesseract heuristics, good for clean VIN plates) + Returns: List of VIN candidates """ - # Try PSM 7 (single text line) - raw_text, _ = self._perform_ocr(image_bytes, psm=7) - candidates = vin_validator.extract_candidates(raw_text) - if candidates: - return candidates - - # Try PSM 8 (single word) - raw_text, _ = self._perform_ocr(image_bytes, psm=8) - candidates = vin_validator.extract_candidates(raw_text) - if candidates: - return candidates + for psm in (7, 8, 11, 13): + raw_text, _ = self._perform_ocr(image_bytes, psm=psm) + candidates = vin_validator.extract_candidates(raw_text) + if candidates: + return candidates return [] diff --git a/ocr/app/preprocessors/vin_preprocessor.py b/ocr/app/preprocessors/vin_preprocessor.py index e0ffbba..95ba4bc 100644 --- a/ocr/app/preprocessors/vin_preprocessor.py +++ b/ocr/app/preprocessors/vin_preprocessor.py @@ -93,6 +93,10 @@ class VinPreprocessor: gray = cv_image steps_applied.append("grayscale") + # Upscale small images for better OCR (Tesseract needs ~300 DPI) + gray = 
self._ensure_minimum_resolution(gray) + steps_applied.append("resolution_check") + # Apply deskew if apply_deskew: gray = self._deskew(gray) @@ -123,6 +127,29 @@ class VinPreprocessor: preprocessing_applied=steps_applied, ) + # Minimum width in pixels for reliable VIN OCR. + # A 17-char VIN needs ~30px per character for Tesseract accuracy. + MIN_WIDTH_FOR_VIN = 600 + + def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray: + """ + Upscale image if too small for reliable OCR. + + Tesseract works best at ~300 DPI. Mobile photos of VINs may have + the text occupy only a small portion of the frame, resulting in + low effective resolution for the VIN characters. + """ + height, width = image.shape[:2] + if width < self.MIN_WIDTH_FOR_VIN: + scale = self.MIN_WIDTH_FOR_VIN / width + new_width = int(width * scale) + new_height = int(height * scale) + image = cv2.resize( + image, (new_width, new_height), interpolation=cv2.INTER_CUBIC + ) + logger.debug(f"Upscaled image from {width}x{height} to {new_width}x{new_height}") + return image + def _apply_clahe(self, image: np.ndarray) -> np.ndarray: """ Apply CLAHE (Contrast Limited Adaptive Histogram Equalization). @@ -242,6 +269,69 @@ class VinPreprocessor: logger.warning(f"Adaptive threshold failed: {e}") return image + def _otsu_threshold(self, image: np.ndarray) -> np.ndarray: + """ + Apply Otsu's thresholding for binarization. + + Otsu's method auto-calculates the optimal threshold value, + which can work better than adaptive thresholding on evenly-lit images. + """ + try: + _, result = cv2.threshold( + image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU + ) + return result + except cv2.error as e: + logger.warning(f"Otsu threshold failed: {e}") + return image + + def preprocess_otsu(self, image_bytes: bytes) -> PreprocessingResult: + """ + Alternative preprocessing pipeline using Otsu's thresholding. + + Used as a fallback when adaptive thresholding doesn't produce + good OCR results. 
+ """ + steps_applied = [] + + pil_image = Image.open(io.BytesIO(image_bytes)) + steps_applied.append("loaded") + + if pil_image.mode not in ("RGB", "L"): + pil_image = pil_image.convert("RGB") + steps_applied.append("convert_rgb") + + cv_image = np.array(pil_image) + if len(cv_image.shape) == 3: + cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR) + + if len(cv_image.shape) == 3: + gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY) + else: + gray = cv_image + steps_applied.append("grayscale") + + gray = self._ensure_minimum_resolution(gray) + steps_applied.append("resolution_check") + + gray = self._apply_clahe(gray) + steps_applied.append("clahe") + + gray = self._denoise(gray) + steps_applied.append("denoise") + + gray = self._otsu_threshold(gray) + steps_applied.append("otsu_threshold") + + result_image = Image.fromarray(gray) + buffer = io.BytesIO() + result_image.save(buffer, format="PNG") + + return PreprocessingResult( + image_bytes=buffer.getvalue(), + preprocessing_applied=steps_applied, + ) + def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]: """ Attempt to detect the VIN region in an image. diff --git a/ocr/app/validators/vin_validator.py b/ocr/app/validators/vin_validator.py index 6a4b264..7c74ae9 100644 --- a/ocr/app/validators/vin_validator.py +++ b/ocr/app/validators/vin_validator.py @@ -20,7 +20,9 @@ class VinValidator: # VIN character set (excludes I, O, Q) VALID_CHARS = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789") - # Common OCR misreads and their corrections + # Common OCR misreads and their corrections. + # Only map characters that are INVALID in VINs to their likely correct values. + # B and S are valid VIN characters and must NOT be transliterated. 
TRANSLITERATION = { "I": "1", "O": "0", @@ -29,9 +31,6 @@ class VinValidator: "o": "0", "q": "0", "l": "1", - "L": "1", - "B": "8", # Sometimes confused - "S": "5", # Sometimes confused } # Weights for check digit calculation (positions 1-17) @@ -224,6 +223,11 @@ class VinValidator: """ Extract VIN candidates from raw OCR text. + Uses two strategies: + 1. Find continuous 11-20 char alphanumeric runs (handles intact VINs) + 2. Concatenate adjacent short fragments separated by spaces/dashes + (handles Tesseract fragmenting VINs into multiple words) + Args: text: Raw OCR text max_candidates: Maximum number of candidates to return @@ -231,29 +235,70 @@ class VinValidator: Returns: List of (vin, start_pos, end_pos) tuples """ - # Pattern to find potential VIN sequences - # Allow some flexibility for OCR errors (include I, O, Q for correction later) - potential_vin_pattern = re.compile(r"[A-Z0-9IOQ]{11,17}", re.IGNORECASE) - candidates = [] - for match in potential_vin_pattern.finditer(text.upper()): - candidate = match.group() - corrected = self.correct_ocr_errors(candidate) + seen_vins: set[str] = set() - # Only include if it could be a valid VIN after correction - if len(corrected) == 17 and self.MODERN_VIN_PATTERN.match(corrected): - candidates.append((corrected, match.start(), match.end())) + upper_text = text.upper() - # Sort by likelihood of being valid (check digit validation) + # Strategy 1: Find continuous runs of VIN-like characters + continuous_pattern = re.compile(r"[A-Z0-9IOQ]{11,20}", re.IGNORECASE) + for match in continuous_pattern.finditer(upper_text): + self._try_add_candidate( + match.group(), match.start(), match.end(), candidates, seen_vins + ) + + # Strategy 2: Concatenate adjacent alphanumeric fragments + # This handles OCR fragmentation like "1HGBH 41JXMN 109186" + # Only consider fragments >= 3 chars (filters out noise/short words) + fragment_pattern = re.compile(r"[A-Z0-9IOQ]{3,}", re.IGNORECASE) + fragments = [ + (m.group(), m.start(), m.end()) 
+ for m in fragment_pattern.finditer(upper_text) + ] + + # Try sliding windows of 2-4 adjacent fragments + for window_size in range(2, min(5, len(fragments) + 1)): + for i in range(len(fragments) - window_size + 1): + window = fragments[i : i + window_size] + combined = "".join(f[0] for f in window) + # Combined length must be close to 17 (allow +/- 2 for OCR noise) + # Must contain at least 2 digit characters (VINs always have digits; + # pure-alphabetic text is almost certainly not a VIN) + if 15 <= len(combined) <= 19 and sum(c.isdigit() for c in combined) >= 2: + self._try_add_candidate( + combined, window[0][1], window[-1][2], candidates, seen_vins + ) + + # Sort by likelihood of being valid (check digit first, then position) def score_candidate(c: tuple[str, int, int]) -> int: vin = c[0] if self.validate_check_digit(vin): - return 0 # Best score + return 0 return 1 candidates.sort(key=score_candidate) return candidates[:max_candidates] + def _try_add_candidate( + self, + raw: str, + start: int, + end: int, + candidates: list[tuple[str, int, int]], + seen_vins: set[str], + ) -> None: + """Try to add a corrected VIN candidate if it passes validation.""" + corrected = self.correct_ocr_errors(raw) + + # Trim to 17 chars if OCR captured extra characters + if len(corrected) > 17: + corrected = corrected[:17] + + if len(corrected) == 17 and self.MODERN_VIN_PATTERN.match(corrected): + if corrected not in seen_vins: + seen_vins.add(corrected) + candidates.append((corrected, start, end)) + # Singleton instance vin_validator = VinValidator() diff --git a/ocr/tests/test_vin_preprocessor.py b/ocr/tests/test_vin_preprocessor.py index 8076294..2d81a7b 100644 --- a/ocr/tests/test_vin_preprocessor.py +++ b/ocr/tests/test_vin_preprocessor.py @@ -53,6 +53,7 @@ class TestVinPreprocessor: ) assert "grayscale" in result.preprocessing_applied + assert "resolution_check" in result.preprocessing_applied assert "clahe" in result.preprocessing_applied assert "deskew" in 
result.preprocessing_applied assert "denoise" in result.preprocessing_applied @@ -185,6 +186,54 @@ class TestVinPreprocessorThreshold: assert len(unique_values) <= 2 +class TestVinPreprocessorOtsu: + """Tests for Otsu's thresholding preprocessing.""" + + def test_otsu_threshold_creates_binary_image(self) -> None: + """Test Otsu's thresholding creates binary output.""" + preprocessor = VinPreprocessor() + image = np.full((100, 400), 128, dtype=np.uint8) + + result = preprocessor._otsu_threshold(image) + + unique_values = np.unique(result) + assert len(unique_values) <= 2 + + def test_preprocess_otsu_returns_result(self) -> None: + """Test Otsu preprocessing pipeline returns valid result.""" + preprocessor = VinPreprocessor() + image_bytes = create_test_image() + + result = preprocessor.preprocess_otsu(image_bytes) + + assert result.image_bytes is not None + assert len(result.image_bytes) > 0 + assert "otsu_threshold" in result.preprocessing_applied + assert "grayscale" in result.preprocessing_applied + + +class TestVinPreprocessorResolution: + """Tests for resolution upscaling.""" + + def test_upscale_small_image(self) -> None: + """Test small images are upscaled.""" + preprocessor = VinPreprocessor() + small_image = np.full((50, 200), 128, dtype=np.uint8) + + result = preprocessor._ensure_minimum_resolution(small_image) + + assert result.shape[1] >= preprocessor.MIN_WIDTH_FOR_VIN + + def test_no_upscale_large_image(self) -> None: + """Test large images are not upscaled.""" + preprocessor = VinPreprocessor() + large_image = np.full((200, 800), 128, dtype=np.uint8) + + result = preprocessor._ensure_minimum_resolution(large_image) + + assert result.shape == large_image.shape + + class TestVinRegionDetection: """Tests for VIN region detection.""" diff --git a/ocr/tests/test_vin_validator.py b/ocr/tests/test_vin_validator.py index 26f170b..241eabd 100644 --- a/ocr/tests/test_vin_validator.py +++ b/ocr/tests/test_vin_validator.py @@ -43,9 +43,9 @@ class TestVinValidator: 
result = validator.calculate_check_digit("1HGBH41JXMN109186") assert result == "X" - # 5YJSA1E28HF123456 has check digit 2 at position 9 + # 5YJSA1E28HF123456 has check digit at position 9 result = validator.calculate_check_digit("5YJSA1E28HF123456") - assert result == "8" # Verify this is correct for this VIN + assert result == "5" def test_validate_check_digit_valid(self) -> None: """Test check digit validation with valid VIN.""" @@ -161,6 +161,27 @@ class TestVinValidator: assert len(candidates) >= 1 assert candidates[0][0] == "1HGBH41JXMN109186" + def test_extract_candidates_fragmented_vin(self) -> None: + """Test candidate extraction handles space-fragmented VINs from OCR.""" + validator = VinValidator() + + # Tesseract often fragments VINs into multiple words + text = "1HGBH 41JXMN 109186" + candidates = validator.extract_candidates(text) + + assert len(candidates) >= 1 + assert candidates[0][0] == "1HGBH41JXMN109186" + + def test_extract_candidates_dash_fragmented_vin(self) -> None: + """Test candidate extraction handles dash-separated VINs.""" + validator = VinValidator() + + text = "1HGBH41J-XMN109186" + candidates = validator.extract_candidates(text) + + assert len(candidates) >= 1 + assert candidates[0][0] == "1HGBH41JXMN109186" + def test_extract_candidates_no_vin(self) -> None: """Test candidate extraction with no VIN.""" validator = VinValidator()