2026-02-08 01:13:35 +00:00
3 changed files with 60 additions and 113 deletions
--- a/ocr/app/extractors/receipt_extractor.py
+++ b/ocr/app/extractors/receipt_extractor.py
@@ -1,16 +1,13 @@
 """Receipt-specific OCR extractor with field extraction."""
-import io
 import logging
 import time
 from dataclasses import dataclass, field
 from typing import Any, Optional

 import magic
-import pytesseract
-from PIL import Image
 from pillow_heif import register_heif_opener

-from app.config import settings
+from app.engines import OcrConfig, create_engine
 from app.extractors.base import BaseExtractor
 from app.preprocessors.receipt_preprocessor import receipt_preprocessor
 from app.patterns import currency_matcher, date_matcher, fuel_matcher
@@ -53,8 +50,8 @@ class ReceiptExtractor(BaseExtractor):
    }

    def __init__(self) -> None:
-        """Initialize receipt extractor."""
-        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+        """Initialize receipt extractor with engine from factory."""
+        self._engine = create_engine()

    def extract(
        self,
@@ -150,26 +147,19 @@ class ReceiptExtractor(BaseExtractor):
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

-    def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str:
+    def _perform_ocr(self, image_bytes: bytes) -> str:
        """
-        Perform OCR on preprocessed image.
+        Perform OCR on preprocessed image via engine abstraction.

        Args:
            image_bytes: Preprocessed image bytes
-            psm: Tesseract page segmentation mode
-                 4 = Assume single column of text
-                 6 = Uniform block of text

        Returns:
            Raw OCR text
        """
-        image = Image.open(io.BytesIO(image_bytes))
-
-        # Configure Tesseract for receipt OCR
-        # PSM 4 works well for columnar receipt text
-        config = f"--psm {psm}"
-
-        return pytesseract.image_to_string(image, config=config)
+        config = OcrConfig()
+        result = self._engine.recognize(image_bytes, config)
+        return result.text

    def _detect_receipt_type(self, text: str) -> str:
        """
--- a/ocr/app/extractors/vin_extractor.py
+++ b/ocr/app/extractors/vin_extractor.py
@@ -1,5 +1,4 @@
 """VIN-specific OCR extractor with preprocessing and validation."""
-import io
 import logging
 import os
 import time
@@ -8,11 +7,10 @@ from datetime import datetime
 from typing import Optional

 import magic
-import pytesseract
-from PIL import Image
 from pillow_heif import register_heif_opener

 from app.config import settings
+from app.engines import OcrConfig, create_engine
 from app.extractors.base import BaseExtractor
 from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
 from app.validators.vin_validator import vin_validator
@@ -56,15 +54,15 @@ class VinExtractor(BaseExtractor):
        "image/heif",
    }

-    # VIN character whitelist for Tesseract
+    # VIN character whitelist (passed to engine for post-OCR filtering)
    VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"

    # Fixed debug output directory (inside container)
    DEBUG_DIR = "/tmp/vin-debug"

    def __init__(self) -> None:
-        """Initialize VIN extractor."""
-        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+        """Initialize VIN extractor with engine from factory."""
+        self._engine = create_engine()
        self._debug = settings.log_level.upper() == "DEBUG"

    def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
@@ -135,21 +133,21 @@ class VinExtractor(BaseExtractor):

            # Perform OCR with VIN-optimized settings
            raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
-            logger.debug("PSM 6 raw text: '%s'", raw_text)
-            logger.debug("PSM 6 word confidences: %s", word_confidences)
+            logger.debug("Primary OCR raw text: '%s'", raw_text)
+            logger.debug("Primary OCR word confidences: %s", word_confidences)

            # Extract VIN candidates from raw text
            candidates = vin_validator.extract_candidates(raw_text)
-            logger.debug("PSM 6 candidates: %s", candidates)
+            logger.debug("Primary OCR candidates: %s", candidates)

            if not candidates:
-                # No VIN candidates found - try with different PSM modes
+                # No VIN candidates found - try alternate OCR configurations
                candidates = self._try_alternate_ocr(preprocessed_bytes)

            if not candidates:
-                # Try grayscale-only (no thresholding) — the Tesseract
-                # LSTM engine often performs better on non-binarized input
-                # because it does its own internal preprocessing.
+                # Try grayscale-only (no thresholding) — OCR engines often
+                # perform better on non-binarized input because they do
+                # their own internal preprocessing.
                gray_result = vin_preprocessor.preprocess(
                    image_bytes, apply_threshold=False
                )
@@ -166,9 +164,9 @@ class VinExtractor(BaseExtractor):
                raw_text, word_confidences = self._perform_ocr(
                    gray_result.image_bytes
                )
-                logger.debug("Gray PSM 6 raw text: '%s'", raw_text)
+                logger.debug("Gray primary raw text: '%s'", raw_text)
                candidates = vin_validator.extract_candidates(raw_text)
-                logger.debug("Gray PSM 6 candidates: %s", candidates)
+                logger.debug("Gray primary candidates: %s", candidates)
                if not candidates:
                    candidates = self._try_alternate_ocr(
                        gray_result.image_bytes, prefix="Gray"
@@ -188,9 +186,9 @@ class VinExtractor(BaseExtractor):
                    )

                raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
-                logger.debug("Otsu PSM 6 raw text: '%s'", raw_text)
+                logger.debug("Otsu primary raw text: '%s'", raw_text)
                candidates = vin_validator.extract_candidates(raw_text)
-                logger.debug("Otsu PSM 6 candidates: %s", candidates)
+                logger.debug("Otsu primary candidates: %s", candidates)
                if not candidates:
                    candidates = self._try_alternate_ocr(
                        otsu_result.image_bytes, prefix="Otsu"
@@ -280,52 +278,31 @@ class VinExtractor(BaseExtractor):
        return detected or "application/octet-stream"

    def _perform_ocr(
-        self, image_bytes: bytes, psm: int = 6
+        self,
+        image_bytes: bytes,
+        single_line: bool = False,
+        single_word: bool = False,
    ) -> tuple[str, list[float]]:
        """
-        Perform OCR with VIN-optimized settings.
+        Perform OCR with VIN-optimized settings via engine abstraction.

        Args:
            image_bytes: Preprocessed image bytes
-            psm: Tesseract page segmentation mode
-                 6 = Uniform block of text
-                 7 = Single text line
-                 8 = Single word
+            single_line: Treat image as a single text line
+            single_word: Treat image as a single word

        Returns:
            Tuple of (raw_text, word_confidences)
        """
-        image = Image.open(io.BytesIO(image_bytes))
-
-        # Configure Tesseract for VIN extraction
-        # OEM 1 = LSTM neural network engine (best accuracy)
-        # NOTE: tessedit_char_whitelist does NOT work with OEM 1 (LSTM).
-        # Using it causes empty/erratic output.  Character filtering is
-        # handled post-OCR by vin_validator.correct_ocr_errors() instead.
-        config = (
-            f"--psm {psm} "
-            f"--oem 1 "
-            f"-c load_system_dawg=false "
-            f"-c load_freq_dawg=false"
+        config = OcrConfig(
+            char_whitelist=self.VIN_WHITELIST,
+            single_line=single_line,
+            single_word=single_word,
+            use_angle_cls=True,
        )
-
-        # Get detailed OCR data
-        ocr_data = pytesseract.image_to_data(
-            image, config=config, output_type=pytesseract.Output.DICT
-        )
-
-        # Extract words and confidences
-        words = []
-        confidences = []
-
-        for i, text in enumerate(ocr_data["text"]):
-            conf = int(ocr_data["conf"][i])
-            if text.strip() and conf > 0:
-                words.append(text.strip())
-                confidences.append(conf / 100.0)
-
-        raw_text = " ".join(words)
-        return raw_text, confidences
+        result = self._engine.recognize(image_bytes, config)
+        word_confidences = [wb.confidence for wb in result.word_boxes]
+        return result.text, word_confidences

    def _try_alternate_ocr(
        self,
@@ -335,21 +312,25 @@ class VinExtractor(BaseExtractor):
        """
        Try alternate OCR configurations when initial extraction fails.

-        PSM modes tried in order:
-            7  - Single text line
-            8  - Single word
-            11 - Sparse text (finds text in any order, good for angled photos)
-            13 - Raw line (no Tesseract heuristics, good for clean VIN plates)
+        Modes tried:
+            single-line - Treat as a single text line
+            single-word - Treat as a single word
+
+        For PaddleOCR, angle classification handles rotated/angled text
+        inherently, replacing the need for Tesseract PSM mode fallbacks.

        Returns:
            List of VIN candidates
        """
        tag = f"{prefix} " if prefix else ""
-        for psm in (7, 8, 11, 13):
-            raw_text, _ = self._perform_ocr(image_bytes, psm=psm)
-            logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text)
+        for mode_name, kwargs in [
+            ("single-line", {"single_line": True}),
+            ("single-word", {"single_word": True}),
+        ]:
+            raw_text, _ = self._perform_ocr(image_bytes, **kwargs)
+            logger.debug("%s%s raw text: '%s'", tag, mode_name, raw_text)
            candidates = vin_validator.extract_candidates(raw_text)
-            logger.debug("%sPSM %d candidates: %s", tag, psm, candidates)
+            logger.debug("%s%s candidates: %s", tag, mode_name, candidates)
            if candidates:
                return candidates

--- a/ocr/app/services/ocr_service.py
+++ b/ocr/app/services/ocr_service.py
@@ -1,15 +1,14 @@
-"""Core OCR service using Tesseract with HEIC support."""
+"""Core OCR service with HEIC support, using a pluggable engine abstraction."""
 import io
 import logging
 import time
 from typing import Optional

 import magic
-import pytesseract
 from PIL import Image
 from pillow_heif import register_heif_opener

-from app.config import settings
+from app.engines import OcrConfig, create_engine
 from app.models import DocumentType, ExtractedField, OcrResponse
 from app.services.preprocessor import preprocessor

@@ -32,8 +31,8 @@ class OcrService:
    }

    def __init__(self) -> None:
-        """Initialize OCR service."""
-        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+        """Initialize OCR service with engine from factory."""
+        self._engine = create_engine()

    def extract(
        self,
@@ -86,14 +85,11 @@ class OcrService:
                    file_bytes, deskew=True, denoise=True
                )

-            # Perform OCR
-            image = Image.open(io.BytesIO(file_bytes))
-            ocr_data = pytesseract.image_to_data(
-                image, output_type=pytesseract.Output.DICT
-            )
-
-            # Extract text and calculate confidence
-            raw_text, confidence = self._process_ocr_data(ocr_data)
+            # Perform OCR via engine abstraction
+            config = OcrConfig()
+            result = self._engine.recognize(file_bytes, config)
+            raw_text = result.text
+            confidence = result.confidence

            # Detect document type from content
            document_type = self._detect_document_type(raw_text)
@@ -160,26 +156,6 @@ class OcrService:

        return b""

-    def _process_ocr_data(
-        self, ocr_data: dict
-    ) -> tuple[str, float]:
-        """Process Tesseract output to extract text and confidence."""
-        words = []
-        confidences = []
-
-        for i, text in enumerate(ocr_data["text"]):
-            # Filter out empty strings and low-confidence results
-            conf = int(ocr_data["conf"][i])
-            if text.strip() and conf > 0:
-                words.append(text)
-                confidences.append(conf)
-
-        raw_text = " ".join(words)
-        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
-
-        # Normalize confidence to 0-1 range (Tesseract returns 0-100)
-        return raw_text, avg_confidence / 100.0
-
    def _detect_document_type(self, text: str) -> DocumentType:
        """Detect document type from extracted text content."""
        text_lower = text.lower()