From 013fb0c67a2a2ed4ab25503e1bf8f50d784ecde2 Mon Sep 17 00:00:00 2001
From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com>
Date: Sat, 7 Feb 2026 10:56:27 -0600
Subject: [PATCH] feat: migrate VIN/receipt extractors and OCR service to
 engine abstraction (refs #117)

Replace direct pytesseract calls with the OcrEngine interface in
vin_extractor.py, receipt_extractor.py, and ocr_service.py. The Tesseract
PSM 7/8 fallbacks become engine-agnostic single-line/single-word configs;
the PSM 11 (sparse text) and PSM 13 (raw line) fallbacks are dropped. The
_process_ocr_data helper, unused after this change, is removed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ocr/app/extractors/receipt_extractor.py |  26 ++----
 ocr/app/extractors/vin_extractor.py     | 105 ++++++++++--------------
 ocr/app/services/ocr_service.py         |  42 ++--------
 3 files changed, 60 insertions(+), 113 deletions(-)

diff --git a/ocr/app/extractors/receipt_extractor.py b/ocr/app/extractors/receipt_extractor.py
index 6134988..111cfb1 100644
--- a/ocr/app/extractors/receipt_extractor.py
+++ b/ocr/app/extractors/receipt_extractor.py
@@ -1,16 +1,13 @@
 """Receipt-specific OCR extractor with field extraction."""
-import io
 import logging
 import time
 from dataclasses import dataclass, field
 from typing import Any, Optional
 
 import magic
-import pytesseract
-from PIL import Image
 from pillow_heif import register_heif_opener
 
-from app.config import settings
+from app.engines import OcrConfig, create_engine
 from app.extractors.base import BaseExtractor
 from app.preprocessors.receipt_preprocessor import receipt_preprocessor
 from app.patterns import currency_matcher, date_matcher, fuel_matcher
@@ -53,8 +50,8 @@ class ReceiptExtractor(BaseExtractor):
     }
 
     def __init__(self) -> None:
-        """Initialize receipt extractor."""
-        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+        """Initialize receipt extractor with engine from factory."""
+        self._engine = create_engine()
 
     def extract(
         self,
@@ -150,26 +147,19 @@ class ReceiptExtractor(BaseExtractor):
         detected = mime.from_buffer(file_bytes)
         return detected or "application/octet-stream"
 
-    def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str:
+    def _perform_ocr(self, image_bytes: bytes) -> str:
         """
-        Perform OCR on preprocessed image.
+        Perform OCR on preprocessed image via engine abstraction.
 
         Args:
             image_bytes: Preprocessed image bytes
-            psm: Tesseract page segmentation mode
-                 4 = Assume single column of text
-                 6 = Uniform block of text
 
         Returns:
             Raw OCR text
         """
-        image = Image.open(io.BytesIO(image_bytes))
-
-        # Configure Tesseract for receipt OCR
-        # PSM 4 works well for columnar receipt text
-        config = f"--psm {psm}"
-
-        return pytesseract.image_to_string(image, config=config)
+        config = OcrConfig()
+        result = self._engine.recognize(image_bytes, config)
+        return result.text
 
     def _detect_receipt_type(self, text: str) -> str:
         """
diff --git a/ocr/app/extractors/vin_extractor.py b/ocr/app/extractors/vin_extractor.py
index 1edca3f..cce88e9 100644
--- a/ocr/app/extractors/vin_extractor.py
+++ b/ocr/app/extractors/vin_extractor.py
@@ -1,5 +1,4 @@
 """VIN-specific OCR extractor with preprocessing and validation."""
-import io
 import logging
 import os
 import time
@@ -8,11 +7,10 @@ from datetime import datetime
 from typing import Optional
 
 import magic
-import pytesseract
-from PIL import Image
 from pillow_heif import register_heif_opener
 
 from app.config import settings
+from app.engines import OcrConfig, create_engine
 from app.extractors.base import BaseExtractor
 from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
 from app.validators.vin_validator import vin_validator
@@ -56,15 +54,15 @@ class VinExtractor(BaseExtractor):
         "image/heif",
     }
 
-    # VIN character whitelist for Tesseract
+    # VIN character whitelist (passed to engine for post-OCR filtering)
     VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
 
     # Fixed debug output directory (inside container)
     DEBUG_DIR = "/tmp/vin-debug"
 
     def __init__(self) -> None:
-        """Initialize VIN extractor."""
-        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+        """Initialize VIN extractor with engine from factory."""
+        self._engine = create_engine()
         self._debug = settings.log_level.upper() == "DEBUG"
 
     def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
@@ -135,21 +133,21 @@ class VinExtractor(BaseExtractor):
 
             # Perform OCR with VIN-optimized settings
             raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
-            logger.debug("PSM 6 raw text: '%s'", raw_text)
-            logger.debug("PSM 6 word confidences: %s", word_confidences)
+            logger.debug("Primary OCR raw text: '%s'", raw_text)
+            logger.debug("Primary OCR word confidences: %s", word_confidences)
 
             # Extract VIN candidates from raw text
             candidates = vin_validator.extract_candidates(raw_text)
-            logger.debug("PSM 6 candidates: %s", candidates)
+            logger.debug("Primary OCR candidates: %s", candidates)
 
             if not candidates:
-                # No VIN candidates found - try with different PSM modes
+                # No VIN candidates found - try alternate OCR configurations
                 candidates = self._try_alternate_ocr(preprocessed_bytes)
 
             if not candidates:
-                # Try grayscale-only (no thresholding) — the Tesseract
-                # LSTM engine often performs better on non-binarized input
-                # because it does its own internal preprocessing.
+                # Try grayscale-only (no thresholding) — OCR engines often
+                # perform better on non-binarized input because they do
+                # their own internal preprocessing.
                 gray_result = vin_preprocessor.preprocess(
                     image_bytes, apply_threshold=False
                 )
@@ -166,9 +164,9 @@ class VinExtractor(BaseExtractor):
                 raw_text, word_confidences = self._perform_ocr(
                     gray_result.image_bytes
                 )
-                logger.debug("Gray PSM 6 raw text: '%s'", raw_text)
+                logger.debug("Gray primary raw text: '%s'", raw_text)
                 candidates = vin_validator.extract_candidates(raw_text)
-                logger.debug("Gray PSM 6 candidates: %s", candidates)
+                logger.debug("Gray primary candidates: %s", candidates)
                 if not candidates:
                     candidates = self._try_alternate_ocr(
                         gray_result.image_bytes, prefix="Gray"
@@ -188,9 +186,9 @@ class VinExtractor(BaseExtractor):
                     )
 
                 raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
-                logger.debug("Otsu PSM 6 raw text: '%s'", raw_text)
+                logger.debug("Otsu primary raw text: '%s'", raw_text)
                 candidates = vin_validator.extract_candidates(raw_text)
-                logger.debug("Otsu PSM 6 candidates: %s", candidates)
+                logger.debug("Otsu primary candidates: %s", candidates)
                 if not candidates:
                     candidates = self._try_alternate_ocr(
                         otsu_result.image_bytes, prefix="Otsu"
@@ -280,52 +278,31 @@ class VinExtractor(BaseExtractor):
         return detected or "application/octet-stream"
 
     def _perform_ocr(
-        self, image_bytes: bytes, psm: int = 6
+        self,
+        image_bytes: bytes,
+        single_line: bool = False,
+        single_word: bool = False,
     ) -> tuple[str, list[float]]:
         """
-        Perform OCR with VIN-optimized settings.
+        Perform OCR with VIN-optimized settings via engine abstraction.
 
         Args:
             image_bytes: Preprocessed image bytes
-            psm: Tesseract page segmentation mode
-                 6 = Uniform block of text
-                 7 = Single text line
-                 8 = Single word
+            single_line: Treat image as a single text line
+            single_word: Treat image as a single word
 
         Returns:
             Tuple of (raw_text, word_confidences)
         """
-        image = Image.open(io.BytesIO(image_bytes))
-
-        # Configure Tesseract for VIN extraction
-        # OEM 1 = LSTM neural network engine (best accuracy)
-        # NOTE: tessedit_char_whitelist does NOT work with OEM 1 (LSTM).
-        # Using it causes empty/erratic output.  Character filtering is
-        # handled post-OCR by vin_validator.correct_ocr_errors() instead.
-        config = (
-            f"--psm {psm} "
-            f"--oem 1 "
-            f"-c load_system_dawg=false "
-            f"-c load_freq_dawg=false"
+        config = OcrConfig(
+            char_whitelist=self.VIN_WHITELIST,
+            single_line=single_line,
+            single_word=single_word,
+            use_angle_cls=True,
         )
-
-        # Get detailed OCR data
-        ocr_data = pytesseract.image_to_data(
-            image, config=config, output_type=pytesseract.Output.DICT
-        )
-
-        # Extract words and confidences
-        words = []
-        confidences = []
-
-        for i, text in enumerate(ocr_data["text"]):
-            conf = int(ocr_data["conf"][i])
-            if text.strip() and conf > 0:
-                words.append(text.strip())
-                confidences.append(conf / 100.0)
-
-        raw_text = " ".join(words)
-        return raw_text, confidences
+        result = self._engine.recognize(image_bytes, config)
+        word_confidences = [wb.confidence for wb in result.word_boxes]
+        return result.text, word_confidences
 
     def _try_alternate_ocr(
         self,
@@ -335,21 +312,25 @@ class VinExtractor(BaseExtractor):
         """
         Try alternate OCR configurations when initial extraction fails.
 
-        PSM modes tried in order:
-            7  - Single text line
-            8  - Single word
-            11 - Sparse text (finds text in any order, good for angled photos)
-            13 - Raw line (no Tesseract heuristics, good for clean VIN plates)
+        Modes tried in order (first mode yielding candidates wins):
+            single-line - Treat as a single text line
+            single-word - Treat as a single word
+
+        For PaddleOCR, angle classification handles rotated/angled text
+        inherently, replacing the need for Tesseract PSM mode fallbacks.
 
         Returns:
             List of VIN candidates
         """
         tag = f"{prefix} " if prefix else ""
-        for psm in (7, 8, 11, 13):
-            raw_text, _ = self._perform_ocr(image_bytes, psm=psm)
-            logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text)
+        for mode_name, kwargs in [
+            ("single-line", {"single_line": True}),
+            ("single-word", {"single_word": True}),
+        ]:
+            raw_text, _ = self._perform_ocr(image_bytes, **kwargs)
+            logger.debug("%s%s raw text: '%s'", tag, mode_name, raw_text)
             candidates = vin_validator.extract_candidates(raw_text)
-            logger.debug("%sPSM %d candidates: %s", tag, psm, candidates)
+            logger.debug("%s%s candidates: %s", tag, mode_name, candidates)
             if candidates:
                 return candidates
 
diff --git a/ocr/app/services/ocr_service.py b/ocr/app/services/ocr_service.py
index 4c317b3..4d06452 100644
--- a/ocr/app/services/ocr_service.py
+++ b/ocr/app/services/ocr_service.py
@@ -1,15 +1,14 @@
-"""Core OCR service using Tesseract with HEIC support."""
+"""Core OCR service with HEIC support, using pluggable engine abstraction."""
 import io
 import logging
 import time
 from typing import Optional
 
 import magic
-import pytesseract
 from PIL import Image
 from pillow_heif import register_heif_opener
 
-from app.config import settings
+from app.engines import OcrConfig, create_engine
 from app.models import DocumentType, ExtractedField, OcrResponse
 from app.services.preprocessor import preprocessor
 
@@ -32,8 +31,8 @@ class OcrService:
     }
 
     def __init__(self) -> None:
-        """Initialize OCR service."""
-        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+        """Initialize OCR service with engine from factory."""
+        self._engine = create_engine()
 
     def extract(
         self,
@@ -86,14 +85,11 @@ class OcrService:
                     file_bytes, deskew=True, denoise=True
                 )
 
-            # Perform OCR
-            image = Image.open(io.BytesIO(file_bytes))
-            ocr_data = pytesseract.image_to_data(
-                image, output_type=pytesseract.Output.DICT
-            )
-
-            # Extract text and calculate confidence
-            raw_text, confidence = self._process_ocr_data(ocr_data)
+            # Perform OCR via engine abstraction
+            config = OcrConfig()
+            result = self._engine.recognize(file_bytes, config)
+            raw_text = result.text
+            confidence = result.confidence
 
             # Detect document type from content
             document_type = self._detect_document_type(raw_text)
@@ -160,26 +156,6 @@ class OcrService:
 
         return b""
 
-    def _process_ocr_data(
-        self, ocr_data: dict
-    ) -> tuple[str, float]:
-        """Process Tesseract output to extract text and confidence."""
-        words = []
-        confidences = []
-
-        for i, text in enumerate(ocr_data["text"]):
-            # Filter out empty strings and low-confidence results
-            conf = int(ocr_data["conf"][i])
-            if text.strip() and conf > 0:
-                words.append(text)
-                confidences.append(conf)
-
-        raw_text = " ".join(words)
-        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
-
-        # Normalize confidence to 0-1 range (Tesseract returns 0-100)
-        return raw_text, avg_confidence / 100.0
-
     def _detect_document_type(self, text: str) -> DocumentType:
         """Detect document type from extracted text content."""
         text_lower = text.lower()