feat: migrate VIN/receipt extractors and OCR service to engine abstraction (refs #117)

Replace direct pytesseract calls with the OcrEngine interface in
vin_extractor.py, receipt_extractor.py, and ocr_service.py. Replace the
Tesseract PSM fallbacks (7, 8, 11, 13) in the VIN extractor with
engine-agnostic single-line and single-word configs. Remove
_process_ocr_data, which is unused after the migration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-07 10:56:27 -06:00
parent ebc633fb36
commit 013fb0c67a
3 changed files with 60 additions and 113 deletions

View File

@@ -1,16 +1,13 @@
"""Receipt-specific OCR extractor with field extraction.""" """Receipt-specific OCR extractor with field extraction."""
import io
import logging import logging
import time import time
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Any, Optional from typing import Any, Optional
import magic import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener from pillow_heif import register_heif_opener
from app.config import settings from app.engines import OcrConfig, create_engine
from app.extractors.base import BaseExtractor from app.extractors.base import BaseExtractor
from app.preprocessors.receipt_preprocessor import receipt_preprocessor from app.preprocessors.receipt_preprocessor import receipt_preprocessor
from app.patterns import currency_matcher, date_matcher, fuel_matcher from app.patterns import currency_matcher, date_matcher, fuel_matcher
@@ -53,8 +50,8 @@ class ReceiptExtractor(BaseExtractor):
} }
def __init__(self) -> None: def __init__(self) -> None:
"""Initialize receipt extractor.""" """Initialize receipt extractor with engine from factory."""
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd self._engine = create_engine()
def extract( def extract(
self, self,
@@ -150,26 +147,19 @@ class ReceiptExtractor(BaseExtractor):
detected = mime.from_buffer(file_bytes) detected = mime.from_buffer(file_bytes)
return detected or "application/octet-stream" return detected or "application/octet-stream"
def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str: def _perform_ocr(self, image_bytes: bytes) -> str:
""" """
Perform OCR on preprocessed image. Perform OCR on preprocessed image via engine abstraction.
Args: Args:
image_bytes: Preprocessed image bytes image_bytes: Preprocessed image bytes
psm: Tesseract page segmentation mode
4 = Assume single column of text
6 = Uniform block of text
Returns: Returns:
Raw OCR text Raw OCR text
""" """
image = Image.open(io.BytesIO(image_bytes)) config = OcrConfig()
result = self._engine.recognize(image_bytes, config)
# Configure Tesseract for receipt OCR return result.text
# PSM 4 works well for columnar receipt text
config = f"--psm {psm}"
return pytesseract.image_to_string(image, config=config)
def _detect_receipt_type(self, text: str) -> str: def _detect_receipt_type(self, text: str) -> str:
""" """

View File

@@ -1,5 +1,4 @@
"""VIN-specific OCR extractor with preprocessing and validation.""" """VIN-specific OCR extractor with preprocessing and validation."""
import io
import logging import logging
import os import os
import time import time
@@ -8,11 +7,10 @@ from datetime import datetime
from typing import Optional from typing import Optional
import magic import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener from pillow_heif import register_heif_opener
from app.config import settings from app.config import settings
from app.engines import OcrConfig, create_engine
from app.extractors.base import BaseExtractor from app.extractors.base import BaseExtractor
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
from app.validators.vin_validator import vin_validator from app.validators.vin_validator import vin_validator
@@ -56,15 +54,15 @@ class VinExtractor(BaseExtractor):
"image/heif", "image/heif",
} }
# VIN character whitelist for Tesseract # VIN character whitelist (passed to engine for post-OCR filtering)
VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789" VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
# Fixed debug output directory (inside container) # Fixed debug output directory (inside container)
DEBUG_DIR = "/tmp/vin-debug" DEBUG_DIR = "/tmp/vin-debug"
def __init__(self) -> None: def __init__(self) -> None:
"""Initialize VIN extractor.""" """Initialize VIN extractor with engine from factory."""
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd self._engine = create_engine()
self._debug = settings.log_level.upper() == "DEBUG" self._debug = settings.log_level.upper() == "DEBUG"
def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None: def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
@@ -135,21 +133,21 @@ class VinExtractor(BaseExtractor):
# Perform OCR with VIN-optimized settings # Perform OCR with VIN-optimized settings
raw_text, word_confidences = self._perform_ocr(preprocessed_bytes) raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
logger.debug("PSM 6 raw text: '%s'", raw_text) logger.debug("Primary OCR raw text: '%s'", raw_text)
logger.debug("PSM 6 word confidences: %s", word_confidences) logger.debug("Primary OCR word confidences: %s", word_confidences)
# Extract VIN candidates from raw text # Extract VIN candidates from raw text
candidates = vin_validator.extract_candidates(raw_text) candidates = vin_validator.extract_candidates(raw_text)
logger.debug("PSM 6 candidates: %s", candidates) logger.debug("Primary OCR candidates: %s", candidates)
if not candidates: if not candidates:
# No VIN candidates found - try with different PSM modes # No VIN candidates found - try alternate OCR configurations
candidates = self._try_alternate_ocr(preprocessed_bytes) candidates = self._try_alternate_ocr(preprocessed_bytes)
if not candidates: if not candidates:
# Try grayscale-only (no thresholding) — the Tesseract # Try grayscale-only (no thresholding) — OCR engines often
# LSTM engine often performs better on non-binarized input # perform better on non-binarized input because they do
# because it does its own internal preprocessing. # their own internal preprocessing.
gray_result = vin_preprocessor.preprocess( gray_result = vin_preprocessor.preprocess(
image_bytes, apply_threshold=False image_bytes, apply_threshold=False
) )
@@ -166,9 +164,9 @@ class VinExtractor(BaseExtractor):
raw_text, word_confidences = self._perform_ocr( raw_text, word_confidences = self._perform_ocr(
gray_result.image_bytes gray_result.image_bytes
) )
logger.debug("Gray PSM 6 raw text: '%s'", raw_text) logger.debug("Gray primary raw text: '%s'", raw_text)
candidates = vin_validator.extract_candidates(raw_text) candidates = vin_validator.extract_candidates(raw_text)
logger.debug("Gray PSM 6 candidates: %s", candidates) logger.debug("Gray primary candidates: %s", candidates)
if not candidates: if not candidates:
candidates = self._try_alternate_ocr( candidates = self._try_alternate_ocr(
gray_result.image_bytes, prefix="Gray" gray_result.image_bytes, prefix="Gray"
@@ -188,9 +186,9 @@ class VinExtractor(BaseExtractor):
) )
raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes) raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
logger.debug("Otsu PSM 6 raw text: '%s'", raw_text) logger.debug("Otsu primary raw text: '%s'", raw_text)
candidates = vin_validator.extract_candidates(raw_text) candidates = vin_validator.extract_candidates(raw_text)
logger.debug("Otsu PSM 6 candidates: %s", candidates) logger.debug("Otsu primary candidates: %s", candidates)
if not candidates: if not candidates:
candidates = self._try_alternate_ocr( candidates = self._try_alternate_ocr(
otsu_result.image_bytes, prefix="Otsu" otsu_result.image_bytes, prefix="Otsu"
@@ -280,52 +278,31 @@ class VinExtractor(BaseExtractor):
return detected or "application/octet-stream" return detected or "application/octet-stream"
def _perform_ocr( def _perform_ocr(
self, image_bytes: bytes, psm: int = 6 self,
image_bytes: bytes,
single_line: bool = False,
single_word: bool = False,
) -> tuple[str, list[float]]: ) -> tuple[str, list[float]]:
""" """
Perform OCR with VIN-optimized settings. Perform OCR with VIN-optimized settings via engine abstraction.
Args: Args:
image_bytes: Preprocessed image bytes image_bytes: Preprocessed image bytes
psm: Tesseract page segmentation mode single_line: Treat image as a single text line
6 = Uniform block of text single_word: Treat image as a single word
7 = Single text line
8 = Single word
Returns: Returns:
Tuple of (raw_text, word_confidences) Tuple of (raw_text, word_confidences)
""" """
image = Image.open(io.BytesIO(image_bytes)) config = OcrConfig(
char_whitelist=self.VIN_WHITELIST,
# Configure Tesseract for VIN extraction single_line=single_line,
# OEM 1 = LSTM neural network engine (best accuracy) single_word=single_word,
# NOTE: tessedit_char_whitelist does NOT work with OEM 1 (LSTM). use_angle_cls=True,
# Using it causes empty/erratic output. Character filtering is
# handled post-OCR by vin_validator.correct_ocr_errors() instead.
config = (
f"--psm {psm} "
f"--oem 1 "
f"-c load_system_dawg=false "
f"-c load_freq_dawg=false"
) )
result = self._engine.recognize(image_bytes, config)
# Get detailed OCR data word_confidences = [wb.confidence for wb in result.word_boxes]
ocr_data = pytesseract.image_to_data( return result.text, word_confidences
image, config=config, output_type=pytesseract.Output.DICT
)
# Extract words and confidences
words = []
confidences = []
for i, text in enumerate(ocr_data["text"]):
conf = int(ocr_data["conf"][i])
if text.strip() and conf > 0:
words.append(text.strip())
confidences.append(conf / 100.0)
raw_text = " ".join(words)
return raw_text, confidences
def _try_alternate_ocr( def _try_alternate_ocr(
self, self,
@@ -335,21 +312,25 @@ class VinExtractor(BaseExtractor):
""" """
Try alternate OCR configurations when initial extraction fails. Try alternate OCR configurations when initial extraction fails.
PSM modes tried in order: Modes tried:
7 - Single text line single-line - Treat as a single text line
8 - Single word single-word - Treat as a single word
11 - Sparse text (finds text in any order, good for angled photos)
13 - Raw line (no Tesseract heuristics, good for clean VIN plates) For PaddleOCR, angle classification handles rotated/angled text
inherently, replacing the need for Tesseract PSM mode fallbacks.
Returns: Returns:
List of VIN candidates List of VIN candidates
""" """
tag = f"{prefix} " if prefix else "" tag = f"{prefix} " if prefix else ""
for psm in (7, 8, 11, 13): for mode_name, kwargs in [
raw_text, _ = self._perform_ocr(image_bytes, psm=psm) ("single-line", {"single_line": True}),
logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text) ("single-word", {"single_word": True}),
]:
raw_text, _ = self._perform_ocr(image_bytes, **kwargs)
logger.debug("%s%s raw text: '%s'", tag, mode_name, raw_text)
candidates = vin_validator.extract_candidates(raw_text) candidates = vin_validator.extract_candidates(raw_text)
logger.debug("%sPSM %d candidates: %s", tag, psm, candidates) logger.debug("%s%s candidates: %s", tag, mode_name, candidates)
if candidates: if candidates:
return candidates return candidates

View File

@@ -1,15 +1,14 @@
"""Core OCR service using Tesseract with HEIC support.""" """Core OCR service with HEIC support, using pluggable engine abstraction."""
import io import io
import logging import logging
import time import time
from typing import Optional from typing import Optional
import magic import magic
import pytesseract
from PIL import Image from PIL import Image
from pillow_heif import register_heif_opener from pillow_heif import register_heif_opener
from app.config import settings from app.engines import OcrConfig, create_engine
from app.models import DocumentType, ExtractedField, OcrResponse from app.models import DocumentType, ExtractedField, OcrResponse
from app.services.preprocessor import preprocessor from app.services.preprocessor import preprocessor
@@ -32,8 +31,8 @@ class OcrService:
} }
def __init__(self) -> None: def __init__(self) -> None:
"""Initialize OCR service.""" """Initialize OCR service with engine from factory."""
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd self._engine = create_engine()
def extract( def extract(
self, self,
@@ -86,14 +85,11 @@ class OcrService:
file_bytes, deskew=True, denoise=True file_bytes, deskew=True, denoise=True
) )
# Perform OCR # Perform OCR via engine abstraction
image = Image.open(io.BytesIO(file_bytes)) config = OcrConfig()
ocr_data = pytesseract.image_to_data( result = self._engine.recognize(file_bytes, config)
image, output_type=pytesseract.Output.DICT raw_text = result.text
) confidence = result.confidence
# Extract text and calculate confidence
raw_text, confidence = self._process_ocr_data(ocr_data)
# Detect document type from content # Detect document type from content
document_type = self._detect_document_type(raw_text) document_type = self._detect_document_type(raw_text)
@@ -160,26 +156,6 @@ class OcrService:
return b"" return b""
def _process_ocr_data(
self, ocr_data: dict
) -> tuple[str, float]:
"""Process Tesseract output to extract text and confidence."""
words = []
confidences = []
for i, text in enumerate(ocr_data["text"]):
# Filter out empty strings and low-confidence results
conf = int(ocr_data["conf"][i])
if text.strip() and conf > 0:
words.append(text)
confidences.append(conf)
raw_text = " ".join(words)
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
# Normalize confidence to 0-1 range (Tesseract returns 0-100)
return raw_text, avg_confidence / 100.0
def _detect_document_type(self, text: str) -> DocumentType: def _detect_document_type(self, text: str) -> DocumentType:
"""Detect document type from extracted text content.""" """Detect document type from extracted text content."""
text_lower = text.lower() text_lower = text.lower()