fix: resolve VIN OCR scanning failures on all images (refs #113)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 35s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 51s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m31s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped

Root cause: Tesseract fragments VINs into multiple words but candidate
extraction required continuous 17-char sequences, rejecting all results.

Changes:
- Fix candidate extraction to concatenate adjacent OCR fragments
- Disable Tesseract dictionaries (VINs are not dictionary words)
- Set OEM 1 (LSTM engine) for better accuracy
- Add PSM 11 (sparse text) and PSM 13 (raw line) fallback modes
- Add Otsu's thresholding as alternative preprocessing pipeline
- Upscale small images to approach Tesseract's recommended minimum of ~300 DPI
- Remove incorrect B->8 and S->5 transliterations (valid VIN chars)
- Fix pre-existing test bug in check digit expected value

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-06 15:57:14 -06:00
parent 45aaeab973
commit 6a4c2137f7
5 changed files with 248 additions and 31 deletions

View File

@@ -20,7 +20,9 @@ class VinValidator:
# VIN character set (excludes I, O, Q)
VALID_CHARS = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789")
# Common OCR misreads and their corrections
# Common OCR misreads and their corrections.
# Only map characters that are INVALID in VINs to their likely correct values.
# B and S are valid VIN characters and must NOT be transliterated.
TRANSLITERATION = {
"I": "1",
"O": "0",
@@ -29,9 +31,6 @@ class VinValidator:
"o": "0",
"q": "0",
"l": "1",
"L": "1",
"B": "8", # Sometimes confused
"S": "5", # Sometimes confused
}
# Weights for check digit calculation (positions 1-17)
@@ -224,6 +223,11 @@ class VinValidator:
"""
Extract VIN candidates from raw OCR text.
Uses two strategies:
1. Find continuous 11-20 char alphanumeric runs (handles intact VINs)
2. Concatenate adjacent short fragments separated by spaces/dashes
(handles Tesseract fragmenting VINs into multiple words)
Args:
text: Raw OCR text
max_candidates: Maximum number of candidates to return
@@ -231,29 +235,70 @@ class VinValidator:
Returns:
List of (vin, start_pos, end_pos) tuples
"""
# Pattern to find potential VIN sequences
# Allow some flexibility for OCR errors (include I, O, Q for correction later)
potential_vin_pattern = re.compile(r"[A-Z0-9IOQ]{11,17}", re.IGNORECASE)
candidates = []
for match in potential_vin_pattern.finditer(text.upper()):
candidate = match.group()
corrected = self.correct_ocr_errors(candidate)
seen_vins: set[str] = set()
# Only include if it could be a valid VIN after correction
if len(corrected) == 17 and self.MODERN_VIN_PATTERN.match(corrected):
candidates.append((corrected, match.start(), match.end()))
upper_text = text.upper()
# Sort by likelihood of being valid (check digit validation)
# Strategy 1: Find continuous runs of VIN-like characters
continuous_pattern = re.compile(r"[A-Z0-9IOQ]{11,20}", re.IGNORECASE)
for match in continuous_pattern.finditer(upper_text):
self._try_add_candidate(
match.group(), match.start(), match.end(), candidates, seen_vins
)
# Strategy 2: Concatenate adjacent alphanumeric fragments
# This handles OCR fragmentation like "1HGBH 41JXMN 109186"
# Only consider fragments >= 3 chars (filters out noise/short words)
fragment_pattern = re.compile(r"[A-Z0-9IOQ]{3,}", re.IGNORECASE)
fragments = [
(m.group(), m.start(), m.end())
for m in fragment_pattern.finditer(upper_text)
]
# Try sliding windows of 2-4 adjacent fragments
for window_size in range(2, min(5, len(fragments) + 1)):
for i in range(len(fragments) - window_size + 1):
window = fragments[i : i + window_size]
combined = "".join(f[0] for f in window)
# Combined length must be close to 17 (allow +/- 2 for OCR noise)
# Must contain at least 2 digit characters (VINs always have digits;
# pure-alphabetic text is almost certainly not a VIN)
if 15 <= len(combined) <= 19 and sum(c.isdigit() for c in combined) >= 2:
self._try_add_candidate(
combined, window[0][1], window[-1][2], candidates, seen_vins
)
# Sort by likelihood of being valid (check digit first, then position)
def score_candidate(c: tuple[str, int, int]) -> int:
vin = c[0]
if self.validate_check_digit(vin):
return 0 # Best score
return 0
return 1
candidates.sort(key=score_candidate)
return candidates[:max_candidates]
def _try_add_candidate(
    self,
    raw: str,
    start: int,
    end: int,
    candidates: list[tuple[str, int, int]],
    seen_vins: set[str],
) -> None:
    """Add the VIN candidate(s) contained in one raw OCR run.

    The run is first passed through ``self.correct_ocr_errors``. If the
    corrected text is shorter than 17 characters nothing is added. The
    leading 17 characters are always considered, so the result is a
    superset of what plain truncation would produce. When the corrected
    text is longer than 17 characters the surplus may be *leading* noise
    (a stray glyph, a label fused to the VIN), so every later 17-character
    window is considered as well, but only if it passes the check digit.
    This keeps arbitrary shifted substrings from flooding the list.

    Args:
        raw: Uncorrected text of the run (a single match or several
            concatenated fragments).
        start: Start offset of the run in the OCR text.
        end: End offset of the run in the OCR text.
        candidates: Output list; ``(vin, start, end)`` tuples are appended
            in place.
        seen_vins: VIN strings already emitted; updated in place and used
            to suppress duplicates.

    Note:
        ``start``/``end`` always describe the whole run, even when the
        emitted VIN is a sub-window of it. Fragment-concatenated runs have
        no one-to-one mapping back to text offsets.
    """
    corrected = self.correct_ocr_errors(raw)
    if len(corrected) < 17:
        return

    # Leading window first: identical to the historical truncation.
    windows = [corrected[:17]]

    # Later windows are only trusted when the check digit confirms them.
    # The pattern is tested before the check digit so validate_check_digit
    # only ever sees well-formed input, as it does in the caller's sort.
    for offset in range(1, len(corrected) - 16):
        window = corrected[offset : offset + 17]
        if self.MODERN_VIN_PATTERN.match(window) and self.validate_check_digit(
            window
        ):
            windows.append(window)

    for vin in windows:
        if vin in seen_vins or not self.MODERN_VIN_PATTERN.match(vin):
            continue
        seen_vins.add(vin)
        candidates.append((vin, start, end))
# Singleton instance: one VinValidator, constructed with no arguments and
# bound to the name `vin_validator`.
vin_validator = VinValidator()