feat: migrate VIN/receipt extractors and OCR service to engine abstraction (refs #117)

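Replace direct pytesseract calls with the OcrEngine interface in vin_extractor.py,
receipt_extractor.py, and ocr_service.py. The Tesseract PSM fallbacks (7, 8, 11, 13)
are replaced with engine-agnostic single-line/single-word configs; the sparse-text
(PSM 11) and raw-line (PSM 13) fallbacks are dropped, with engine angle
classification expected to cover rotated/angled text. Dead _process_ocr_data removed.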

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-07 10:56:27 -06:00
parent ebc633fb36
commit 013fb0c67a
3 changed files with 60 additions and 113 deletions

View File

@@ -1,16 +1,13 @@
"""Receipt-specific OCR extractor with field extraction."""
import io
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Optional
import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener
from app.config import settings
from app.engines import OcrConfig, create_engine
from app.extractors.base import BaseExtractor
from app.preprocessors.receipt_preprocessor import receipt_preprocessor
from app.patterns import currency_matcher, date_matcher, fuel_matcher
@@ -53,8 +50,8 @@ class ReceiptExtractor(BaseExtractor):
}
def __init__(self) -> None:
"""Initialize receipt extractor."""
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
"""Initialize receipt extractor with engine from factory."""
self._engine = create_engine()
def extract(
self,
@@ -150,26 +147,19 @@ class ReceiptExtractor(BaseExtractor):
detected = mime.from_buffer(file_bytes)
return detected or "application/octet-stream"
def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str:
def _perform_ocr(self, image_bytes: bytes) -> str:
"""
Perform OCR on preprocessed image.
Perform OCR on preprocessed image via engine abstraction.
Args:
image_bytes: Preprocessed image bytes
psm: Tesseract page segmentation mode
4 = Assume single column of text
6 = Uniform block of text
Returns:
Raw OCR text
"""
image = Image.open(io.BytesIO(image_bytes))
# Configure Tesseract for receipt OCR
# PSM 4 works well for columnar receipt text
config = f"--psm {psm}"
return pytesseract.image_to_string(image, config=config)
config = OcrConfig()
result = self._engine.recognize(image_bytes, config)
return result.text
def _detect_receipt_type(self, text: str) -> str:
"""

View File

@@ -1,5 +1,4 @@
"""VIN-specific OCR extractor with preprocessing and validation."""
import io
import logging
import os
import time
@@ -8,11 +7,10 @@ from datetime import datetime
from typing import Optional
import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener
from app.config import settings
from app.engines import OcrConfig, create_engine
from app.extractors.base import BaseExtractor
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
from app.validators.vin_validator import vin_validator
@@ -56,15 +54,15 @@ class VinExtractor(BaseExtractor):
"image/heif",
}
# VIN character whitelist for Tesseract
# VIN character whitelist (passed to engine for post-OCR filtering)
VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
# Fixed debug output directory (inside container)
DEBUG_DIR = "/tmp/vin-debug"
def __init__(self) -> None:
"""Initialize VIN extractor."""
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
"""Initialize VIN extractor with engine from factory."""
self._engine = create_engine()
self._debug = settings.log_level.upper() == "DEBUG"
def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
@@ -135,21 +133,21 @@ class VinExtractor(BaseExtractor):
# Perform OCR with VIN-optimized settings
raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
logger.debug("PSM 6 raw text: '%s'", raw_text)
logger.debug("PSM 6 word confidences: %s", word_confidences)
logger.debug("Primary OCR raw text: '%s'", raw_text)
logger.debug("Primary OCR word confidences: %s", word_confidences)
# Extract VIN candidates from raw text
candidates = vin_validator.extract_candidates(raw_text)
logger.debug("PSM 6 candidates: %s", candidates)
logger.debug("Primary OCR candidates: %s", candidates)
if not candidates:
# No VIN candidates found - try with different PSM modes
# No VIN candidates found - try alternate OCR configurations
candidates = self._try_alternate_ocr(preprocessed_bytes)
if not candidates:
# Try grayscale-only (no thresholding) — the Tesseract
# LSTM engine often performs better on non-binarized input
# because it does its own internal preprocessing.
# Try grayscale-only (no thresholding) — OCR engines often
# perform better on non-binarized input because they do
# their own internal preprocessing.
gray_result = vin_preprocessor.preprocess(
image_bytes, apply_threshold=False
)
@@ -166,9 +164,9 @@ class VinExtractor(BaseExtractor):
raw_text, word_confidences = self._perform_ocr(
gray_result.image_bytes
)
logger.debug("Gray PSM 6 raw text: '%s'", raw_text)
logger.debug("Gray primary raw text: '%s'", raw_text)
candidates = vin_validator.extract_candidates(raw_text)
logger.debug("Gray PSM 6 candidates: %s", candidates)
logger.debug("Gray primary candidates: %s", candidates)
if not candidates:
candidates = self._try_alternate_ocr(
gray_result.image_bytes, prefix="Gray"
@@ -188,9 +186,9 @@ class VinExtractor(BaseExtractor):
)
raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
logger.debug("Otsu PSM 6 raw text: '%s'", raw_text)
logger.debug("Otsu primary raw text: '%s'", raw_text)
candidates = vin_validator.extract_candidates(raw_text)
logger.debug("Otsu PSM 6 candidates: %s", candidates)
logger.debug("Otsu primary candidates: %s", candidates)
if not candidates:
candidates = self._try_alternate_ocr(
otsu_result.image_bytes, prefix="Otsu"
@@ -280,52 +278,31 @@ class VinExtractor(BaseExtractor):
return detected or "application/octet-stream"
def _perform_ocr(
self, image_bytes: bytes, psm: int = 6
self,
image_bytes: bytes,
single_line: bool = False,
single_word: bool = False,
) -> tuple[str, list[float]]:
"""
Perform OCR with VIN-optimized settings.
Perform OCR with VIN-optimized settings via engine abstraction.
Args:
image_bytes: Preprocessed image bytes
psm: Tesseract page segmentation mode
6 = Uniform block of text
7 = Single text line
8 = Single word
single_line: Treat image as a single text line
single_word: Treat image as a single word
Returns:
Tuple of (raw_text, word_confidences)
"""
image = Image.open(io.BytesIO(image_bytes))
# Configure Tesseract for VIN extraction
# OEM 1 = LSTM neural network engine (best accuracy)
# NOTE: tessedit_char_whitelist does NOT work with OEM 1 (LSTM).
# Using it causes empty/erratic output. Character filtering is
# handled post-OCR by vin_validator.correct_ocr_errors() instead.
config = (
f"--psm {psm} "
f"--oem 1 "
f"-c load_system_dawg=false "
f"-c load_freq_dawg=false"
config = OcrConfig(
char_whitelist=self.VIN_WHITELIST,
single_line=single_line,
single_word=single_word,
use_angle_cls=True,
)
# Get detailed OCR data
ocr_data = pytesseract.image_to_data(
image, config=config, output_type=pytesseract.Output.DICT
)
# Extract words and confidences
words = []
confidences = []
for i, text in enumerate(ocr_data["text"]):
conf = int(ocr_data["conf"][i])
if text.strip() and conf > 0:
words.append(text.strip())
confidences.append(conf / 100.0)
raw_text = " ".join(words)
return raw_text, confidences
result = self._engine.recognize(image_bytes, config)
word_confidences = [wb.confidence for wb in result.word_boxes]
return result.text, word_confidences
def _try_alternate_ocr(
self,
@@ -335,21 +312,25 @@ class VinExtractor(BaseExtractor):
"""
Try alternate OCR configurations when initial extraction fails.
PSM modes tried in order:
7 - Single text line
8 - Single word
11 - Sparse text (finds text in any order, good for angled photos)
13 - Raw line (no Tesseract heuristics, good for clean VIN plates)
Modes tried:
single-line - Treat as a single text line
single-word - Treat as a single word
For PaddleOCR, angle classification handles rotated/angled text
inherently, replacing the need for Tesseract PSM mode fallbacks.
Returns:
List of VIN candidates
"""
tag = f"{prefix} " if prefix else ""
for psm in (7, 8, 11, 13):
raw_text, _ = self._perform_ocr(image_bytes, psm=psm)
logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text)
for mode_name, kwargs in [
("single-line", {"single_line": True}),
("single-word", {"single_word": True}),
]:
raw_text, _ = self._perform_ocr(image_bytes, **kwargs)
logger.debug("%s%s raw text: '%s'", tag, mode_name, raw_text)
candidates = vin_validator.extract_candidates(raw_text)
logger.debug("%sPSM %d candidates: %s", tag, psm, candidates)
logger.debug("%s%s candidates: %s", tag, mode_name, candidates)
if candidates:
return candidates