feat: migrate VIN/receipt extractors and OCR service to engine abstraction (refs #117)

Replace direct pytesseract calls with the OcrEngine interface in vin_extractor.py, receipt_extractor.py, and ocr_service.py. Replace PSM-mode fallbacks with engine-agnostic single-line/single-word configs. Remove the now-unused _process_ocr_data helper, whose only call site is replaced by the engine's recognize() result.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 10:56:27 -06:00
parent ebc633fb36
commit 013fb0c67a
3 changed files with 60 additions and 113 deletions
--- a/ocr/app/services/ocr_service.py
+++ b/ocr/app/services/ocr_service.py
@@ -1,15 +1,14 @@
-"""Core OCR service using Tesseract with HEIC support."""
+"""Core OCR service with HEIC support, using pluggable engine abstraction."""
 import io
 import logging
 import time
 from typing import Optional

 import magic
-import pytesseract
 from PIL import Image
 from pillow_heif import register_heif_opener

-from app.config import settings
+from app.engines import OcrConfig, create_engine
 from app.models import DocumentType, ExtractedField, OcrResponse
 from app.services.preprocessor import preprocessor

@@ -32,8 +31,8 @@ class OcrService:
    }

    def __init__(self) -> None:
-        """Initialize OCR service."""
-        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+        """Initialize OCR service with engine from factory."""
+        self._engine = create_engine()

    def extract(
        self,
@@ -86,14 +85,11 @@ class OcrService:
                    file_bytes, deskew=True, denoise=True
                )

-            # Perform OCR
-            image = Image.open(io.BytesIO(file_bytes))
-            ocr_data = pytesseract.image_to_data(
-                image, output_type=pytesseract.Output.DICT
-            )
-
-            # Extract text and calculate confidence
-            raw_text, confidence = self._process_ocr_data(ocr_data)
+            # Perform OCR via engine abstraction
+            config = OcrConfig()
+            result = self._engine.recognize(file_bytes, config)
+            raw_text = result.text
+            confidence = result.confidence

            # Detect document type from content
            document_type = self._detect_document_type(raw_text)
@@ -160,26 +156,6 @@ class OcrService:

        return b""

-    def _process_ocr_data(
-        self, ocr_data: dict
-    ) -> tuple[str, float]:
-        """Process Tesseract output to extract text and confidence."""
-        words = []
-        confidences = []
-
-        for i, text in enumerate(ocr_data["text"]):
-            # Filter out empty strings and low-confidence results
-            conf = int(ocr_data["conf"][i])
-            if text.strip() and conf > 0:
-                words.append(text)
-                confidences.append(conf)
-
-        raw_text = " ".join(words)
-        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
-
-        # Normalize confidence to 0-1 range (Tesseract returns 0-100)
-        return raw_text, avg_confidence / 100.0
-
    def _detect_document_type(self, text: str) -> DocumentType:
        """Detect document type from extracted text content."""
        text_lower = text.lower()