feat: migrate VIN/receipt extractors and OCR service to engine abstraction (refs #117)

Replace direct pytesseract calls with the OcrEngine interface in vin_extractor.py,
receipt_extractor.py, and ocr_service.py. Tesseract PSM fallbacks are replaced with
engine-agnostic single-line/single-word configs. The dead _process_ocr_data code is removed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-07 10:56:27 -06:00
parent ebc633fb36
commit 013fb0c67a
3 changed files with 60 additions and 113 deletions

View File

@@ -1,16 +1,13 @@
"""Receipt-specific OCR extractor with field extraction."""
import io
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Optional
import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener
from app.config import settings
from app.engines import OcrConfig, create_engine
from app.extractors.base import BaseExtractor
from app.preprocessors.receipt_preprocessor import receipt_preprocessor
from app.patterns import currency_matcher, date_matcher, fuel_matcher
@@ -53,8 +50,8 @@ class ReceiptExtractor(BaseExtractor):
}
def __init__(self) -> None:
"""Initialize receipt extractor."""
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
"""Initialize receipt extractor with engine from factory."""
self._engine = create_engine()
def extract(
self,
@@ -150,26 +147,19 @@ class ReceiptExtractor(BaseExtractor):
detected = mime.from_buffer(file_bytes)
return detected or "application/octet-stream"
def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str:
def _perform_ocr(self, image_bytes: bytes) -> str:
"""
Perform OCR on preprocessed image.
Perform OCR on preprocessed image via engine abstraction.
Args:
image_bytes: Preprocessed image bytes
psm: Tesseract page segmentation mode
4 = Assume single column of text
6 = Uniform block of text
Returns:
Raw OCR text
"""
image = Image.open(io.BytesIO(image_bytes))
# Configure Tesseract for receipt OCR
# PSM 4 works well for columnar receipt text
config = f"--psm {psm}"
return pytesseract.image_to_string(image, config=config)
config = OcrConfig()
result = self._engine.recognize(image_bytes, config)
return result.text
def _detect_receipt_type(self, text: str) -> str:
"""