From e4336ce9da80b6c0bbf200d41f0fe9e240228a36 Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Fri, 6 Feb 2026 22:00:07 -0600 Subject: [PATCH] fix: extract VIN from noisy OCR via sliding window + char deletion (refs #113) When OCR reads extra characters (e.g. sticker border as 'C', spurious 'Z' insertion), the raw text exceeds 17 chars and the old first-17 trim produced wrong VINs. New strategy tries all 17-char sliding windows and single/double character deletions, validating each via check digit. For 'CWVGGNPE2Z4NP069500', this finds the correct VIN 'WVGGNPE24NP069500' (valid check digit) instead of 'CWVGGNPE2Z4NP0695' (invalid). Co-Authored-By: Claude Opus 4.6 --- ocr/app/validators/vin_validator.py | 48 ++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/ocr/app/validators/vin_validator.py b/ocr/app/validators/vin_validator.py index 7c74ae9..c9c60ef 100644 --- a/ocr/app/validators/vin_validator.py +++ b/ocr/app/validators/vin_validator.py @@ -290,14 +290,48 @@ class VinValidator: """Try to add a corrected VIN candidate if it passes validation.""" corrected = self.correct_ocr_errors(raw) - # Trim to 17 chars if OCR captured extra characters - if len(corrected) > 17: - corrected = corrected[:17] + if len(corrected) == 17: + self._add_if_valid(corrected, start, end, candidates, seen_vins) + return - if len(corrected) == 17 and self.MODERN_VIN_PATTERN.match(corrected): - if corrected not in seen_vins: - seen_vins.add(corrected) - candidates.append((corrected, start, end)) + if len(corrected) > 17: + # Strategy A: try every 17-char sliding window + for i in range(len(corrected) - 16): + window = corrected[i : i + 17] + self._add_if_valid(window, start, end, candidates, seen_vins) + + # Strategy B: for 18-19 char strings, try deleting each + # character one at a time. OCR often inserts a spurious + # character (e.g. sticker border read as 'C') that breaks + # the VIN. Check-digit validation filters out false hits. + if len(corrected) <= 19: + for i in range(len(corrected)): + reduced = corrected[:i] + corrected[i + 1 :] + if len(reduced) == 17: + self._add_if_valid( + reduced, start, end, candidates, seen_vins + ) + elif len(reduced) == 18: + # Two deletions needed — try removing one more + for j in range(len(reduced)): + reduced2 = reduced[:j] + reduced[j + 1 :] + self._add_if_valid( + reduced2, start, end, candidates, seen_vins + ) + + def _add_if_valid( + self, + vin: str, + start: int, + end: int, + candidates: list[tuple[str, int, int]], + seen_vins: set[str], + ) -> None: + """Add a 17-char VIN to candidates if it matches the pattern.""" + if len(vin) == 17 and self.MODERN_VIN_PATTERN.match(vin): + if vin not in seen_vins: + seen_vins.add(vin) + candidates.append((vin, start, end)) # Singleton instance