feat: migrate VIN/receipt extractors and OCR service to engine abstraction (refs #117)

Replace direct pytesseract calls with the OcrEngine interface in vin_extractor.py, receipt_extractor.py, and ocr_service.py. PSM mode fallbacks are replaced with engine-agnostic single-line/single-word configs. The dead _process_ocr_data method is removed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 10:56:27 -06:00
parent ebc633fb36
commit 013fb0c67a
3 changed files with 60 additions and 113 deletions
--- a/ocr/app/extractors/receipt_extractor.py
+++ b/ocr/app/extractors/receipt_extractor.py
@@ -1,16 +1,13 @@
 """Receipt-specific OCR extractor with field extraction."""
 import io
 import logging
 import time
 from dataclasses import dataclass, field
 from typing import Any, Optional
 import magic
 import pytesseract
 from PIL import Image
 from pillow_heif import register_heif_opener
-from app.config import settings
+from app.engines import OcrConfig, create_engine
 from app.extractors.base import BaseExtractor
 from app.preprocessors.receipt_preprocessor import receipt_preprocessor
 from app.patterns import currency_matcher, date_matcher, fuel_matcher
@@ -53,8 +50,8 @@ class ReceiptExtractor(BaseExtractor):
    }
    def __init__(self) -> None:
-        """Initialize receipt extractor."""
+        """Initialize receipt extractor with engine from factory."""
-        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+        self._engine = create_engine()
    def extract(
        self,
@@ -150,26 +147,19 @@ class ReceiptExtractor(BaseExtractor):
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"
-    def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str:
+    def _perform_ocr(self, image_bytes: bytes) -> str:
        """
-        Perform OCR on preprocessed image.
+        Perform OCR on preprocessed image via engine abstraction.
        Args:
            image_bytes: Preprocessed image bytes
            psm: Tesseract page segmentation mode
                 4 = Assume single column of text
                 6 = Uniform block of text
        Returns:
            Raw OCR text
        """
-        image = Image.open(io.BytesIO(image_bytes))
+        config = OcrConfig()
-
+        result = self._engine.recognize(image_bytes, config)
-        # Configure Tesseract for receipt OCR
+        return result.text
        # PSM 4 works well for columnar receipt text
        config = f"--psm {psm}"
        return pytesseract.image_to_string(image, config=config)
    def _detect_receipt_type(self, text: str) -> str:
        """
--- a/ocr/app/extractors/vin_extractor.py
+++ b/ocr/app/extractors/vin_extractor.py
@@ -1,5 +1,4 @@
 """VIN-specific OCR extractor with preprocessing and validation."""
 import io
 import logging
 import os
 import time
@@ -8,11 +7,10 @@ from datetime import datetime
 from typing import Optional
 import magic
 import pytesseract
 from PIL import Image
 from pillow_heif import register_heif_opener
 from app.config import settings
 from app.engines import OcrConfig, create_engine
 from app.extractors.base import BaseExtractor
 from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
 from app.validators.vin_validator import vin_validator
@@ -56,15 +54,15 @@ class VinExtractor(BaseExtractor):
        "image/heif",
    }
-    # VIN character whitelist for Tesseract
+    # VIN character whitelist (passed to engine for post-OCR filtering)
    VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
    # Fixed debug output directory (inside container)
    DEBUG_DIR = "/tmp/vin-debug"
    def __init__(self) -> None:
-        """Initialize VIN extractor."""
+        """Initialize VIN extractor with engine from factory."""
-        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+        self._engine = create_engine()
        self._debug = settings.log_level.upper() == "DEBUG"
    def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
@@ -135,21 +133,21 @@ class VinExtractor(BaseExtractor):
            # Perform OCR with VIN-optimized settings
            raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
-            logger.debug("PSM 6 raw text: '%s'", raw_text)
+            logger.debug("Primary OCR raw text: '%s'", raw_text)
-            logger.debug("PSM 6 word confidences: %s", word_confidences)
+            logger.debug("Primary OCR word confidences: %s", word_confidences)
            # Extract VIN candidates from raw text
            candidates = vin_validator.extract_candidates(raw_text)
-            logger.debug("PSM 6 candidates: %s", candidates)
+            logger.debug("Primary OCR candidates: %s", candidates)
            if not candidates:
-                # No VIN candidates found - try with different PSM modes
+                # No VIN candidates found - try alternate OCR configurations
                candidates = self._try_alternate_ocr(preprocessed_bytes)
            if not candidates:
-                # Try grayscale-only (no thresholding) — the Tesseract
+                # Try grayscale-only (no thresholding) — OCR engines often
-                # LSTM engine often performs better on non-binarized input
+                # perform better on non-binarized input because they do
-                # because it does its own internal preprocessing.
+                # their own internal preprocessing.
                gray_result = vin_preprocessor.preprocess(
                    image_bytes, apply_threshold=False
                )
@@ -166,9 +164,9 @@ class VinExtractor(BaseExtractor):
                raw_text, word_confidences = self._perform_ocr(
                    gray_result.image_bytes
                )
-                logger.debug("Gray PSM 6 raw text: '%s'", raw_text)
+                logger.debug("Gray primary raw text: '%s'", raw_text)
                candidates = vin_validator.extract_candidates(raw_text)
-                logger.debug("Gray PSM 6 candidates: %s", candidates)
+                logger.debug("Gray primary candidates: %s", candidates)
                if not candidates:
                    candidates = self._try_alternate_ocr(
                        gray_result.image_bytes, prefix="Gray"
@@ -188,9 +186,9 @@ class VinExtractor(BaseExtractor):
                    )
                raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
-                logger.debug("Otsu PSM 6 raw text: '%s'", raw_text)
+                logger.debug("Otsu primary raw text: '%s'", raw_text)
                candidates = vin_validator.extract_candidates(raw_text)
-                logger.debug("Otsu PSM 6 candidates: %s", candidates)
+                logger.debug("Otsu primary candidates: %s", candidates)
                if not candidates:
                    candidates = self._try_alternate_ocr(
                        otsu_result.image_bytes, prefix="Otsu"
@@ -280,52 +278,31 @@ class VinExtractor(BaseExtractor):
        return detected or "application/octet-stream"
    def _perform_ocr(
-        self, image_bytes: bytes, psm: int = 6
+        self,
        image_bytes: bytes,
        single_line: bool = False,
        single_word: bool = False,
    ) -> tuple[str, list[float]]:
        """
-        Perform OCR with VIN-optimized settings.
+        Perform OCR with VIN-optimized settings via engine abstraction.
        Args:
            image_bytes: Preprocessed image bytes
-            psm: Tesseract page segmentation mode
+            single_line: Treat image as a single text line
-                 6 = Uniform block of text
+            single_word: Treat image as a single word
                 7 = Single text line
                 8 = Single word
        Returns:
            Tuple of (raw_text, word_confidences)
        """
-        image = Image.open(io.BytesIO(image_bytes))
+        config = OcrConfig(
-
+            char_whitelist=self.VIN_WHITELIST,
-        # Configure Tesseract for VIN extraction
+            single_line=single_line,
-        # OEM 1 = LSTM neural network engine (best accuracy)
+            single_word=single_word,
-        # NOTE: tessedit_char_whitelist does NOT work with OEM 1 (LSTM).
+            use_angle_cls=True,
        # Using it causes empty/erratic output.  Character filtering is
        # handled post-OCR by vin_validator.correct_ocr_errors() instead.
        config = (
            f"--psm {psm} "
            f"--oem 1 "
            f"-c load_system_dawg=false "
            f"-c load_freq_dawg=false"
        )
-
+        result = self._engine.recognize(image_bytes, config)
-        # Get detailed OCR data
+        word_confidences = [wb.confidence for wb in result.word_boxes]
-        ocr_data = pytesseract.image_to_data(
+        return result.text, word_confidences
            image, config=config, output_type=pytesseract.Output.DICT
        )
        # Extract words and confidences
        words = []
        confidences = []
        for i, text in enumerate(ocr_data["text"]):
            conf = int(ocr_data["conf"][i])
            if text.strip() and conf > 0:
                words.append(text.strip())
                confidences.append(conf / 100.0)
        raw_text = " ".join(words)
        return raw_text, confidences
    def _try_alternate_ocr(
        self,
@@ -335,21 +312,25 @@ class VinExtractor(BaseExtractor):
        """
        Try alternate OCR configurations when initial extraction fails.
-        PSM modes tried in order:
+        Modes tried:
-            7  - Single text line
+            single-line - Treat as a single text line
-            8  - Single word
+            single-word - Treat as a single word
-            11 - Sparse text (finds text in any order, good for angled photos)
+
-            13 - Raw line (no Tesseract heuristics, good for clean VIN plates)
+        For PaddleOCR, angle classification handles rotated/angled text
        inherently, replacing the need for Tesseract PSM mode fallbacks.
        Returns:
            List of VIN candidates
        """
        tag = f"{prefix} " if prefix else ""
-        for psm in (7, 8, 11, 13):
+        for mode_name, kwargs in [
-            raw_text, _ = self._perform_ocr(image_bytes, psm=psm)
+            ("single-line", {"single_line": True}),
-            logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text)
+            ("single-word", {"single_word": True}),
        ]:
            raw_text, _ = self._perform_ocr(image_bytes, **kwargs)
            logger.debug("%s%s raw text: '%s'", tag, mode_name, raw_text)
            candidates = vin_validator.extract_candidates(raw_text)
-            logger.debug("%sPSM %d candidates: %s", tag, psm, candidates)
+            logger.debug("%s%s candidates: %s", tag, mode_name, candidates)
            if candidates:
                return candidates
--- a/ocr/app/services/ocr_service.py
+++ b/ocr/app/services/ocr_service.py
@@ -1,15 +1,14 @@
-"""Core OCR service using Tesseract with HEIC support."""
+"""Core OCR service with HEIC support, using pluggable engine abstraction."""
 import io
 import logging
 import time
 from typing import Optional
 import magic
 import pytesseract
 from PIL import Image
 from pillow_heif import register_heif_opener
-from app.config import settings
+from app.engines import OcrConfig, create_engine
 from app.models import DocumentType, ExtractedField, OcrResponse
 from app.services.preprocessor import preprocessor
@@ -32,8 +31,8 @@ class OcrService:
    }
    def __init__(self) -> None:
-        """Initialize OCR service."""
+        """Initialize OCR service with engine from factory."""
-        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+        self._engine = create_engine()
    def extract(
        self,
@@ -86,14 +85,11 @@ class OcrService:
                    file_bytes, deskew=True, denoise=True
                )
-            # Perform OCR
+            # Perform OCR via engine abstraction
-            image = Image.open(io.BytesIO(file_bytes))
+            config = OcrConfig()
-            ocr_data = pytesseract.image_to_data(
+            result = self._engine.recognize(file_bytes, config)
-                image, output_type=pytesseract.Output.DICT
+            raw_text = result.text
-            )
+            confidence = result.confidence
            # Extract text and calculate confidence
            raw_text, confidence = self._process_ocr_data(ocr_data)
            # Detect document type from content
            document_type = self._detect_document_type(raw_text)
@@ -160,26 +156,6 @@ class OcrService:
        return b""
    def _process_ocr_data(
        self, ocr_data: dict
    ) -> tuple[str, float]:
        """Collapse Tesseract word-level output into text and a mean confidence.

        Args:
            ocr_data: Mapping with parallel ``"text"`` and ``"conf"``
                sequences. Confidence entries may be ints, floats, or
                numeric strings (including float-formatted ones such as
                ``"96.5"``).

        Returns:
            Tuple of ``(raw_text, confidence)``: the kept words joined by
            single spaces, and their mean confidence scaled to the 0-1
            range (``0.0`` when no word is kept).
        """
        words = []
        confidences = []
        for text, raw_conf in zip(ocr_data["text"], ocr_data["conf"]):
            # Parse via float() first: int("96.5") raises ValueError, while
            # int(float(...)) accepts ints, floats and both string forms and
            # yields the same value as before for integer inputs.
            conf = int(float(raw_conf))
            # Filter out empty strings and non-positive confidence results
            if text.strip() and conf > 0:
                words.append(text)
                confidences.append(conf)

        raw_text = " ".join(words)
        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
        # Normalize confidence to 0-1 range (Tesseract returns 0-100)
        return raw_text, avg_confidence / 100.0
    def _detect_document_type(self, text: str) -> DocumentType:
        """Detect document type from extracted text content."""
        text_lower = text.lower()