feat: migrate VIN/receipt extractors and OCR service to engine abstraction (refs #117)

Replace direct pytesseract calls with the OcrEngine interface in vin_extractor.py, receipt_extractor.py, and ocr_service.py. Tesseract PSM mode fallbacks are replaced with engine-agnostic single-line/single-word configs. Dead _process_ocr_data is removed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 10:56:27 -06:00
parent ebc633fb36
commit 013fb0c67a
3 changed files with 60 additions and 113 deletions
--- a/ocr/app/extractors/receipt_extractor.py
+++ b/ocr/app/extractors/receipt_extractor.py
@@ -1,16 +1,13 @@
 """Receipt-specific OCR extractor with field extraction."""
-import io
 import logging
 import time
 from dataclasses import dataclass, field
 from typing import Any, Optional

 import magic
-import pytesseract
-from PIL import Image
 from pillow_heif import register_heif_opener

-from app.config import settings
+from app.engines import OcrConfig, create_engine
 from app.extractors.base import BaseExtractor
 from app.preprocessors.receipt_preprocessor import receipt_preprocessor
 from app.patterns import currency_matcher, date_matcher, fuel_matcher
@@ -53,8 +50,8 @@ class ReceiptExtractor(BaseExtractor):
    }

    def __init__(self) -> None:
-        """Initialize receipt extractor."""
-        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+        """Initialize receipt extractor with engine from factory."""
+        self._engine = create_engine()

    def extract(
        self,
@@ -150,26 +147,19 @@ class ReceiptExtractor(BaseExtractor):
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

-    def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str:
+    def _perform_ocr(self, image_bytes: bytes) -> str:
        """
-        Perform OCR on preprocessed image.
+        Perform OCR on preprocessed image via engine abstraction.

        Args:
            image_bytes: Preprocessed image bytes
-            psm: Tesseract page segmentation mode
-                 4 = Assume single column of text
-                 6 = Uniform block of text

        Returns:
            Raw OCR text
        """
-        image = Image.open(io.BytesIO(image_bytes))
-
-        # Configure Tesseract for receipt OCR
-        # PSM 4 works well for columnar receipt text
-        config = f"--psm {psm}"
-
-        return pytesseract.image_to_string(image, config=config)
+        config = OcrConfig()
+        result = self._engine.recognize(image_bytes, config)
+        return result.text

    def _detect_receipt_type(self, text: str) -> str:
        """
--- a/ocr/app/extractors/vin_extractor.py
+++ b/ocr/app/extractors/vin_extractor.py
@@ -1,5 +1,4 @@
 """VIN-specific OCR extractor with preprocessing and validation."""
-import io
 import logging
 import os
 import time
@@ -8,11 +7,10 @@ from datetime import datetime
 from typing import Optional

 import magic
-import pytesseract
-from PIL import Image
 from pillow_heif import register_heif_opener

 from app.config import settings
+from app.engines import OcrConfig, create_engine
 from app.extractors.base import BaseExtractor
 from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
 from app.validators.vin_validator import vin_validator
@@ -56,15 +54,15 @@ class VinExtractor(BaseExtractor):
        "image/heif",
    }

-    # VIN character whitelist for Tesseract
+    # VIN character whitelist (passed to engine for post-OCR filtering)
    VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"

    # Fixed debug output directory (inside container)
    DEBUG_DIR = "/tmp/vin-debug"

    def __init__(self) -> None:
-        """Initialize VIN extractor."""
-        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+        """Initialize VIN extractor with engine from factory."""
+        self._engine = create_engine()
        self._debug = settings.log_level.upper() == "DEBUG"

    def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
@@ -135,21 +133,21 @@ class VinExtractor(BaseExtractor):

            # Perform OCR with VIN-optimized settings
            raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
-            logger.debug("PSM 6 raw text: '%s'", raw_text)
-            logger.debug("PSM 6 word confidences: %s", word_confidences)
+            logger.debug("Primary OCR raw text: '%s'", raw_text)
+            logger.debug("Primary OCR word confidences: %s", word_confidences)

            # Extract VIN candidates from raw text
            candidates = vin_validator.extract_candidates(raw_text)
-            logger.debug("PSM 6 candidates: %s", candidates)
+            logger.debug("Primary OCR candidates: %s", candidates)

            if not candidates:
-                # No VIN candidates found - try with different PSM modes
+                # No VIN candidates found - try alternate OCR configurations
                candidates = self._try_alternate_ocr(preprocessed_bytes)

            if not candidates:
-                # Try grayscale-only (no thresholding) — the Tesseract
-                # LSTM engine often performs better on non-binarized input
-                # because it does its own internal preprocessing.
+                # Try grayscale-only (no thresholding) — OCR engines often
+                # perform better on non-binarized input because they do
+                # their own internal preprocessing.
                gray_result = vin_preprocessor.preprocess(
                    image_bytes, apply_threshold=False
                )
@@ -166,9 +164,9 @@ class VinExtractor(BaseExtractor):
                raw_text, word_confidences = self._perform_ocr(
                    gray_result.image_bytes
                )
-                logger.debug("Gray PSM 6 raw text: '%s'", raw_text)
+                logger.debug("Gray primary raw text: '%s'", raw_text)
                candidates = vin_validator.extract_candidates(raw_text)
-                logger.debug("Gray PSM 6 candidates: %s", candidates)
+                logger.debug("Gray primary candidates: %s", candidates)
                if not candidates:
                    candidates = self._try_alternate_ocr(
                        gray_result.image_bytes, prefix="Gray"
@@ -188,9 +186,9 @@ class VinExtractor(BaseExtractor):
                    )

                raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
-                logger.debug("Otsu PSM 6 raw text: '%s'", raw_text)
+                logger.debug("Otsu primary raw text: '%s'", raw_text)
                candidates = vin_validator.extract_candidates(raw_text)
-                logger.debug("Otsu PSM 6 candidates: %s", candidates)
+                logger.debug("Otsu primary candidates: %s", candidates)
                if not candidates:
                    candidates = self._try_alternate_ocr(
                        otsu_result.image_bytes, prefix="Otsu"
@@ -280,52 +278,31 @@ class VinExtractor(BaseExtractor):
        return detected or "application/octet-stream"

    def _perform_ocr(
-        self, image_bytes: bytes, psm: int = 6
+        self,
+        image_bytes: bytes,
+        single_line: bool = False,
+        single_word: bool = False,
    ) -> tuple[str, list[float]]:
        """
-        Perform OCR with VIN-optimized settings.
+        Perform OCR with VIN-optimized settings via engine abstraction.

        Args:
            image_bytes: Preprocessed image bytes
-            psm: Tesseract page segmentation mode
-                 6 = Uniform block of text
-                 7 = Single text line
-                 8 = Single word
+            single_line: Treat image as a single text line
+            single_word: Treat image as a single word

        Returns:
            Tuple of (raw_text, word_confidences)
        """
-        image = Image.open(io.BytesIO(image_bytes))
-
-        # Configure Tesseract for VIN extraction
-        # OEM 1 = LSTM neural network engine (best accuracy)
-        # NOTE: tessedit_char_whitelist does NOT work with OEM 1 (LSTM).
-        # Using it causes empty/erratic output.  Character filtering is
-        # handled post-OCR by vin_validator.correct_ocr_errors() instead.
-        config = (
-            f"--psm {psm} "
-            f"--oem 1 "
-            f"-c load_system_dawg=false "
-            f"-c load_freq_dawg=false"
+        config = OcrConfig(
+            char_whitelist=self.VIN_WHITELIST,
+            single_line=single_line,
+            single_word=single_word,
+            use_angle_cls=True,
        )
-
-        # Get detailed OCR data
-        ocr_data = pytesseract.image_to_data(
-            image, config=config, output_type=pytesseract.Output.DICT
-        )
-
-        # Extract words and confidences
-        words = []
-        confidences = []
-
-        for i, text in enumerate(ocr_data["text"]):
-            conf = int(ocr_data["conf"][i])
-            if text.strip() and conf > 0:
-                words.append(text.strip())
-                confidences.append(conf / 100.0)
-
-        raw_text = " ".join(words)
-        return raw_text, confidences
+        result = self._engine.recognize(image_bytes, config)
+        word_confidences = [wb.confidence for wb in result.word_boxes]
+        return result.text, word_confidences

    def _try_alternate_ocr(
        self,
@@ -335,21 +312,25 @@ class VinExtractor(BaseExtractor):
        """
        Try alternate OCR configurations when initial extraction fails.

-        PSM modes tried in order:
-            7  - Single text line
-            8  - Single word
-            11 - Sparse text (finds text in any order, good for angled photos)
-            13 - Raw line (no Tesseract heuristics, good for clean VIN plates)
+        Modes tried:
+            single-line - Treat as a single text line
+            single-word - Treat as a single word
+
+        For PaddleOCR, angle classification handles rotated/angled text
+        inherently, replacing the need for Tesseract PSM mode fallbacks.

        Returns:
            List of VIN candidates
        """
        tag = f"{prefix} " if prefix else ""
-        for psm in (7, 8, 11, 13):
-            raw_text, _ = self._perform_ocr(image_bytes, psm=psm)
-            logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text)
+        for mode_name, kwargs in [
+            ("single-line", {"single_line": True}),
+            ("single-word", {"single_word": True}),
+        ]:
+            raw_text, _ = self._perform_ocr(image_bytes, **kwargs)
+            logger.debug("%s%s raw text: '%s'", tag, mode_name, raw_text)
            candidates = vin_validator.extract_candidates(raw_text)
-            logger.debug("%sPSM %d candidates: %s", tag, psm, candidates)
+            logger.debug("%s%s candidates: %s", tag, mode_name, candidates)
            if candidates:
                return candidates