feat: migrate VIN/receipt extractors and OCR service to engine abstraction (refs #117)

Replace direct pytesseract calls with OcrEngine interface in vin_extractor.py,
receipt_extractor.py, and ocr_service.py. PSM mode fallbacks replaced with
engine-agnostic single-line/single-word configs. Dead _process_ocr_data removed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-07 10:56:27 -06:00
parent ebc633fb36
commit 013fb0c67a
3 changed files with 60 additions and 113 deletions

View File

@@ -1,5 +1,4 @@
"""VIN-specific OCR extractor with preprocessing and validation."""
import io
import logging
import os
import time
@@ -8,11 +7,10 @@ from datetime import datetime
from typing import Optional
import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener
from app.config import settings
from app.engines import OcrConfig, create_engine
from app.extractors.base import BaseExtractor
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
from app.validators.vin_validator import vin_validator
@@ -56,15 +54,15 @@ class VinExtractor(BaseExtractor):
"image/heif",
}
# VIN character whitelist for Tesseract
# VIN character whitelist (passed to engine for post-OCR filtering)
VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
# Fixed debug output directory (inside container)
DEBUG_DIR = "/tmp/vin-debug"
def __init__(self) -> None:
"""Initialize VIN extractor."""
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
"""Initialize VIN extractor with engine from factory."""
self._engine = create_engine()
self._debug = settings.log_level.upper() == "DEBUG"
def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
@@ -135,21 +133,21 @@ class VinExtractor(BaseExtractor):
# Perform OCR with VIN-optimized settings
raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
logger.debug("PSM 6 raw text: '%s'", raw_text)
logger.debug("PSM 6 word confidences: %s", word_confidences)
logger.debug("Primary OCR raw text: '%s'", raw_text)
logger.debug("Primary OCR word confidences: %s", word_confidences)
# Extract VIN candidates from raw text
candidates = vin_validator.extract_candidates(raw_text)
logger.debug("PSM 6 candidates: %s", candidates)
logger.debug("Primary OCR candidates: %s", candidates)
if not candidates:
# No VIN candidates found - try with different PSM modes
# No VIN candidates found - try alternate OCR configurations
candidates = self._try_alternate_ocr(preprocessed_bytes)
if not candidates:
# Try grayscale-only (no thresholding) — the Tesseract
# LSTM engine often performs better on non-binarized input
# because it does its own internal preprocessing.
# Try grayscale-only (no thresholding) — OCR engines often
# perform better on non-binarized input because they do
# their own internal preprocessing.
gray_result = vin_preprocessor.preprocess(
image_bytes, apply_threshold=False
)
@@ -166,9 +164,9 @@ class VinExtractor(BaseExtractor):
raw_text, word_confidences = self._perform_ocr(
gray_result.image_bytes
)
logger.debug("Gray PSM 6 raw text: '%s'", raw_text)
logger.debug("Gray primary raw text: '%s'", raw_text)
candidates = vin_validator.extract_candidates(raw_text)
logger.debug("Gray PSM 6 candidates: %s", candidates)
logger.debug("Gray primary candidates: %s", candidates)
if not candidates:
candidates = self._try_alternate_ocr(
gray_result.image_bytes, prefix="Gray"
@@ -188,9 +186,9 @@ class VinExtractor(BaseExtractor):
)
raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
logger.debug("Otsu PSM 6 raw text: '%s'", raw_text)
logger.debug("Otsu primary raw text: '%s'", raw_text)
candidates = vin_validator.extract_candidates(raw_text)
logger.debug("Otsu PSM 6 candidates: %s", candidates)
logger.debug("Otsu primary candidates: %s", candidates)
if not candidates:
candidates = self._try_alternate_ocr(
otsu_result.image_bytes, prefix="Otsu"
@@ -280,52 +278,31 @@ class VinExtractor(BaseExtractor):
return detected or "application/octet-stream"
def _perform_ocr(
self, image_bytes: bytes, psm: int = 6
self,
image_bytes: bytes,
single_line: bool = False,
single_word: bool = False,
) -> tuple[str, list[float]]:
"""
Perform OCR with VIN-optimized settings.
Perform OCR with VIN-optimized settings via engine abstraction.
Args:
image_bytes: Preprocessed image bytes
psm: Tesseract page segmentation mode
6 = Uniform block of text
7 = Single text line
8 = Single word
single_line: Treat image as a single text line
single_word: Treat image as a single word
Returns:
Tuple of (raw_text, word_confidences)
"""
image = Image.open(io.BytesIO(image_bytes))
# Configure Tesseract for VIN extraction
# OEM 1 = LSTM neural network engine (best accuracy)
# NOTE: tessedit_char_whitelist does NOT work with OEM 1 (LSTM).
# Using it causes empty/erratic output. Character filtering is
# handled post-OCR by vin_validator.correct_ocr_errors() instead.
config = (
f"--psm {psm} "
f"--oem 1 "
f"-c load_system_dawg=false "
f"-c load_freq_dawg=false"
config = OcrConfig(
char_whitelist=self.VIN_WHITELIST,
single_line=single_line,
single_word=single_word,
use_angle_cls=True,
)
# Get detailed OCR data
ocr_data = pytesseract.image_to_data(
image, config=config, output_type=pytesseract.Output.DICT
)
# Extract words and confidences
words = []
confidences = []
for i, text in enumerate(ocr_data["text"]):
conf = int(ocr_data["conf"][i])
if text.strip() and conf > 0:
words.append(text.strip())
confidences.append(conf / 100.0)
raw_text = " ".join(words)
return raw_text, confidences
result = self._engine.recognize(image_bytes, config)
word_confidences = [wb.confidence for wb in result.word_boxes]
return result.text, word_confidences
def _try_alternate_ocr(
self,
@@ -335,21 +312,25 @@ class VinExtractor(BaseExtractor):
"""
Try alternate OCR configurations when initial extraction fails.
PSM modes tried in order:
7 - Single text line
8 - Single word
11 - Sparse text (finds text in any order, good for angled photos)
13 - Raw line (no Tesseract heuristics, good for clean VIN plates)
Modes tried:
single-line - Treat as a single text line
single-word - Treat as a single word
For PaddleOCR, angle classification handles rotated/angled text
inherently, replacing the need for Tesseract PSM mode fallbacks.
Returns:
List of VIN candidates
"""
tag = f"{prefix} " if prefix else ""
for psm in (7, 8, 11, 13):
raw_text, _ = self._perform_ocr(image_bytes, psm=psm)
logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text)
for mode_name, kwargs in [
("single-line", {"single_line": True}),
("single-word", {"single_word": True}),
]:
raw_text, _ = self._perform_ocr(image_bytes, **kwargs)
logger.debug("%s%s raw text: '%s'", tag, mode_name, raw_text)
candidates = vin_validator.extract_candidates(raw_text)
logger.debug("%sPSM %d candidates: %s", tag, psm, candidates)
logger.debug("%s%s candidates: %s", tag, mode_name, candidates)
if candidates:
return candidates