From 432b3bda36e544f61586b5a792c349dd7d540e50 Mon Sep 17 00:00:00 2001
From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com>
Date: Fri, 6 Feb 2026 21:52:08 -0600
Subject: [PATCH] fix: remove char whitelist incompatible with Tesseract LSTM
 (refs #113)

tessedit_char_whitelist does not work with OEM 1 (LSTM engine) and
causes empty/erratic output. This was the root cause of Tesseract
returning empty text despite clear, well-preprocessed images.
Character filtering is already handled post-OCR by the VIN validator's
correct_ocr_errors() method (I->1, O->0, Q->0, etc.).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ocr/app/extractors/vin_extractor.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ocr/app/extractors/vin_extractor.py b/ocr/app/extractors/vin_extractor.py
index 66a694b..1edca3f 100644
--- a/ocr/app/extractors/vin_extractor.py
+++ b/ocr/app/extractors/vin_extractor.py
@@ -299,11 +299,12 @@ class VinExtractor(BaseExtractor):
 
         # Configure Tesseract for VIN extraction
         # OEM 1 = LSTM neural network engine (best accuracy)
-        # Disable dictionaries since VINs are not dictionary words
+        # Dictionaries are disabled: VINs are not dictionary words.
+        # No tessedit_char_whitelist: it yields empty/erratic output with OEM 1
+        # (LSTM); vin_validator.correct_ocr_errors() filters characters post-OCR.
         config = (
             f"--psm {psm} "
             f"--oem 1 "
-            f"-c tessedit_char_whitelist={self.VIN_WHITELIST} "
             f"-c load_system_dawg=false "
             f"-c load_freq_dawg=false"
         )