feat: migrate VIN/receipt extractors and OCR service to engine abstraction (refs #117)

Replace direct pytesseract calls with the OcrEngine interface in vin_extractor.py,
receipt_extractor.py, and ocr_service.py. Tesseract PSM (page segmentation mode)
fallbacks are replaced with engine-agnostic single-line/single-word configs. The
dead _process_ocr_data helper is removed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-07 10:56:27 -06:00
parent ebc633fb36
commit 013fb0c67a
3 changed files with 60 additions and 113 deletions

View File

@@ -1,15 +1,14 @@
"""Core OCR service using Tesseract with HEIC support."""
"""Core OCR service with HEIC support, using pluggable engine abstraction."""
import io
import logging
import time
from typing import Optional
import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener
from app.config import settings
from app.engines import OcrConfig, create_engine
from app.models import DocumentType, ExtractedField, OcrResponse
from app.services.preprocessor import preprocessor
@@ -32,8 +31,8 @@ class OcrService:
}
def __init__(self) -> None:
"""Initialize OCR service."""
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
"""Initialize OCR service with engine from factory."""
self._engine = create_engine()
def extract(
self,
@@ -86,14 +85,11 @@ class OcrService:
file_bytes, deskew=True, denoise=True
)
# Perform OCR
image = Image.open(io.BytesIO(file_bytes))
ocr_data = pytesseract.image_to_data(
image, output_type=pytesseract.Output.DICT
)
# Extract text and calculate confidence
raw_text, confidence = self._process_ocr_data(ocr_data)
# Perform OCR via engine abstraction
config = OcrConfig()
result = self._engine.recognize(file_bytes, config)
raw_text = result.text
confidence = result.confidence
# Detect document type from content
document_type = self._detect_document_type(raw_text)
@@ -160,26 +156,6 @@ class OcrService:
return b""
def _process_ocr_data(
self, ocr_data: dict
) -> tuple[str, float]:
"""Process Tesseract output to extract text and confidence."""
words = []
confidences = []
for i, text in enumerate(ocr_data["text"]):
# Filter out empty strings and low-confidence results
conf = int(ocr_data["conf"][i])
if text.strip() and conf > 0:
words.append(text)
confidences.append(conf)
raw_text = " ".join(words)
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
# Normalize confidence to 0-1 range (Tesseract returns 0-100)
return raw_text, avg_confidence / 100.0
def _detect_document_type(self, text: str) -> DocumentType:
"""Detect document type from extracted text content."""
text_lower = text.lower()