fix: resolve VIN OCR scanning failures on all images (refs #113)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 35s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 51s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m31s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 35s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 51s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m31s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Root cause: Tesseract fragments VINs into multiple words, but candidate extraction required continuous 17-char sequences, rejecting all results.

Changes:
- Fix candidate extraction to concatenate adjacent OCR fragments
- Disable Tesseract dictionaries (VINs are not dictionary words)
- Set OEM 1 (LSTM engine) for better accuracy
- Add PSM 11 (sparse text) and PSM 13 (raw line) fallback modes
- Add Otsu's thresholding as alternative preprocessing pipeline
- Upscale small images to meet Tesseract's 300 DPI requirement
- Remove incorrect B->8 and S->5 transliterations (B and S are valid VIN chars)
- Fix pre-existing test bug in check digit expected value

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -20,7 +20,9 @@ class VinValidator:
|
||||
# VIN character set (excludes I, O, Q)
|
||||
VALID_CHARS = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789")
|
||||
|
||||
# Common OCR misreads and their corrections
|
||||
# Common OCR misreads and their corrections.
|
||||
# Only map characters that are INVALID in VINs to their likely correct values.
|
||||
# B and S are valid VIN characters and must NOT be transliterated.
|
||||
TRANSLITERATION = {
|
||||
"I": "1",
|
||||
"O": "0",
|
||||
@@ -29,9 +31,6 @@ class VinValidator:
|
||||
"o": "0",
|
||||
"q": "0",
|
||||
"l": "1",
|
||||
"L": "1",
|
||||
"B": "8", # Sometimes confused
|
||||
"S": "5", # Sometimes confused
|
||||
}
|
||||
|
||||
# Weights for check digit calculation (positions 1-17)
|
||||
@@ -224,6 +223,11 @@ class VinValidator:
|
||||
"""
|
||||
Extract VIN candidates from raw OCR text.
|
||||
|
||||
Uses two strategies:
|
||||
1. Find continuous 11-20 char alphanumeric runs (handles intact VINs)
|
||||
2. Concatenate adjacent short fragments separated by spaces/dashes
|
||||
(handles Tesseract fragmenting VINs into multiple words)
|
||||
|
||||
Args:
|
||||
text: Raw OCR text
|
||||
max_candidates: Maximum number of candidates to return
|
||||
@@ -231,29 +235,70 @@ class VinValidator:
|
||||
Returns:
|
||||
List of (vin, start_pos, end_pos) tuples
|
||||
"""
|
||||
# Pattern to find potential VIN sequences
|
||||
# Allow some flexibility for OCR errors (include I, O, Q for correction later)
|
||||
potential_vin_pattern = re.compile(r"[A-Z0-9IOQ]{11,17}", re.IGNORECASE)
|
||||
|
||||
candidates = []
|
||||
for match in potential_vin_pattern.finditer(text.upper()):
|
||||
candidate = match.group()
|
||||
corrected = self.correct_ocr_errors(candidate)
|
||||
seen_vins: set[str] = set()
|
||||
|
||||
# Only include if it could be a valid VIN after correction
|
||||
if len(corrected) == 17 and self.MODERN_VIN_PATTERN.match(corrected):
|
||||
candidates.append((corrected, match.start(), match.end()))
|
||||
upper_text = text.upper()
|
||||
|
||||
# Sort by likelihood of being valid (check digit validation)
|
||||
# Strategy 1: Find continuous runs of VIN-like characters
|
||||
continuous_pattern = re.compile(r"[A-Z0-9IOQ]{11,20}", re.IGNORECASE)
|
||||
for match in continuous_pattern.finditer(upper_text):
|
||||
self._try_add_candidate(
|
||||
match.group(), match.start(), match.end(), candidates, seen_vins
|
||||
)
|
||||
|
||||
# Strategy 2: Concatenate adjacent alphanumeric fragments
|
||||
# This handles OCR fragmentation like "1HGBH 41JXMN 109186"
|
||||
# Only consider fragments >= 3 chars (filters out noise/short words)
|
||||
fragment_pattern = re.compile(r"[A-Z0-9IOQ]{3,}", re.IGNORECASE)
|
||||
fragments = [
|
||||
(m.group(), m.start(), m.end())
|
||||
for m in fragment_pattern.finditer(upper_text)
|
||||
]
|
||||
|
||||
# Try sliding windows of 2-4 adjacent fragments
|
||||
for window_size in range(2, min(5, len(fragments) + 1)):
|
||||
for i in range(len(fragments) - window_size + 1):
|
||||
window = fragments[i : i + window_size]
|
||||
combined = "".join(f[0] for f in window)
|
||||
# Combined length must be close to 17 (allow +/- 2 for OCR noise)
|
||||
# Must contain at least 2 digit characters (VINs always have digits;
|
||||
# pure-alphabetic text is almost certainly not a VIN)
|
||||
if 15 <= len(combined) <= 19 and sum(c.isdigit() for c in combined) >= 2:
|
||||
self._try_add_candidate(
|
||||
combined, window[0][1], window[-1][2], candidates, seen_vins
|
||||
)
|
||||
|
||||
# Sort by likelihood of being valid (check digit first, then position)
|
||||
def score_candidate(c: tuple[str, int, int]) -> int:
|
||||
vin = c[0]
|
||||
if self.validate_check_digit(vin):
|
||||
return 0 # Best score
|
||||
return 0
|
||||
return 1
|
||||
|
||||
candidates.sort(key=score_candidate)
|
||||
return candidates[:max_candidates]
|
||||
|
||||
def _try_add_candidate(
    self,
    raw: str,
    start: int,
    end: int,
    candidates: list[tuple[str, int, int]],
    seen_vins: set[str],
) -> None:
    """Append ``raw`` to ``candidates`` if its corrected form is a new VIN.

    The text is run through ``correct_ocr_errors`` and cut down to its first
    17 characters. It is recorded only when the result is exactly 17
    characters long, matches ``MODERN_VIN_PATTERN`` and has not been recorded
    before.

    Args:
        raw: Text lifted from the OCR output (one run or joined fragments).
        start: Start offset of ``raw`` in the OCR text; stored unchanged.
        end: End offset of ``raw`` in the OCR text; stored unchanged, even
            when the corrected text was shortened to 17 characters.
        candidates: Output list, mutated in place with
            ``(vin, start, end)`` tuples.
        seen_vins: Corrected VINs already recorded, mutated in place; used
            to suppress duplicates.
    """
    # Slicing is a no-op for strings of 17 characters or fewer, so this only
    # drops surplus trailing characters that OCR may have captured.
    vin = self.correct_ocr_errors(raw)[:17]

    if len(vin) != 17:
        return
    if not self.MODERN_VIN_PATTERN.match(vin):
        return
    if vin in seen_vins:
        return

    seen_vins.add(vin)
    candidates.append((vin, start, end))
|
||||
|
||||
|
||||
# Singleton instance
|
||||
vin_validator = VinValidator()
|
||||
|
||||
Reference in New Issue
Block a user