From 432b3bda36e544f61586b5a792c349dd7d540e50 Mon Sep 17 00:00:00 2001
From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com>
Date: Fri, 6 Feb 2026 21:52:08 -0600
Subject: [PATCH] fix: remove char whitelist incompatible with Tesseract LSTM (refs #113)

tessedit_char_whitelist does not work with OEM 1 (LSTM engine) and causes
empty/erratic output. This was the root cause of Tesseract returning empty
text despite clear, well-preprocessed images.

Character filtering is already handled post-OCR by the VIN validator's
correct_ocr_errors() method (I->1, O->0, Q->0, etc).

Co-Authored-By: Claude Opus 4.6
---
 ocr/app/extractors/vin_extractor.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ocr/app/extractors/vin_extractor.py b/ocr/app/extractors/vin_extractor.py
index 66a694b..1edca3f 100644
--- a/ocr/app/extractors/vin_extractor.py
+++ b/ocr/app/extractors/vin_extractor.py
@@ -299,11 +299,12 @@ class VinExtractor(BaseExtractor):
 
         # Configure Tesseract for VIN extraction
         # OEM 1 = LSTM neural network engine (best accuracy)
-        # Disable dictionaries since VINs are not dictionary words
+        # NOTE: tessedit_char_whitelist does NOT work with OEM 1 (LSTM).
+        # Using it causes empty/erratic output. Character filtering is
+        # handled post-OCR by vin_validator.correct_ocr_errors() instead.
         config = (
             f"--psm {psm} "
             f"--oem 1 "
-            f"-c tessedit_char_whitelist={self.VIN_WHITELIST} "
             f"-c load_system_dawg=false "
             f"-c load_freq_dawg=false"
         )