feat: add OCR engine abstraction layer (refs #116)

Introduce pluggable OcrEngine ABC with PaddleOCR PP-OCRv4 as primary engine and Tesseract wrapper for backward compatibility. Engine factory reads OCR_PRIMARY_ENGINE config to instantiate the correct engine. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 10:47:40 -06:00
parent 6b0c18a41c
commit ebc633fb36
7 changed files with 422 additions and 0 deletions
--- a/ocr/app/config.py
+++ b/ocr/app/config.py
@@ -11,6 +11,12 @@ class Settings:
        self.port: int = int(os.getenv("PORT", "8000"))
        self.tesseract_cmd: str = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")

+        # OCR engine configuration
+        self.ocr_primary_engine: str = os.getenv("OCR_PRIMARY_ENGINE", "paddleocr")
+        self.ocr_confidence_threshold: float = float(
+            os.getenv("OCR_CONFIDENCE_THRESHOLD", "0.6")
+        )
+
        # Redis configuration for job queue
        self.redis_host: str = os.getenv("REDIS_HOST", "mvp-redis")
        self.redis_port: int = int(os.getenv("REDIS_PORT", "6379"))
--- a/ocr/app/engines/__init__.py
+++ b/ocr/app/engines/__init__.py
@@ -0,0 +1,27 @@
+"""OCR engine abstraction layer.
+
+Provides a pluggable engine interface for OCR processing,
+decoupling extractors from specific OCR libraries.
+"""
+
+from app.engines.base_engine import (
+    EngineError,
+    EngineProcessingError,
+    EngineUnavailableError,
+    OcrConfig,
+    OcrEngine,
+    OcrEngineResult,
+    WordBox,
+)
+from app.engines.engine_factory import create_engine
+
+# Names re-exported as the public API of the engines package.
+__all__ = [
+    # Engine interface and factory
+    "OcrEngine",
+    "create_engine",
+    # Shared data types
+    "OcrConfig",
+    "OcrEngineResult",
+    "WordBox",
+    # Exception hierarchy
+    "EngineError",
+    "EngineProcessingError",
+    "EngineUnavailableError",
+]
--- a/ocr/app/engines/base_engine.py
+++ b/ocr/app/engines/base_engine.py
@@ -0,0 +1,88 @@
+"""OCR engine abstract base class and shared data types."""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any
+
+
+# --- Exception hierarchy ---
+
+
+class EngineError(Exception):
+    """Base exception for all OCR engine errors.
+
+    Common parent of the engine exceptions, so callers can handle every
+    engine-related failure with a single ``except EngineError`` clause.
+    """
+
+
+class EngineUnavailableError(EngineError):
+    """Raised when an engine cannot be initialized (missing binary, bad config).
+
+    The engine factory also raises it for unknown engine names, and the
+    engine wrappers raise it when their backing library is not installed.
+    """
+
+
+class EngineProcessingError(EngineError):
+    """Raised when an engine fails to process an image."""
+
+
+# --- Data types ---
+
+
+@dataclass
+class WordBox:
+    """A single recognized word with position and confidence.
+
+    The position is an axis-aligned rectangle given as its top-left corner
+    (``x``, ``y``) plus ``width`` and ``height``; presumably in pixels of the
+    image handed to the engine.
+    """
+
+    text: str  # Recognized text (engines store it whitespace-stripped)
+    confidence: float  # 0.0-1.0
+    x: int = 0  # Left edge of the bounding box
+    y: int = 0  # Top edge of the bounding box
+    width: int = 0
+    height: int = 0
+
+
+@dataclass
+class OcrConfig:
+    """Engine-agnostic OCR configuration.
+
+    Common fields cover the most frequent needs. Engine-specific
+    parameters go into ``hints`` so the interface stays stable.
+
+    Not every engine honours every field: the Tesseract wrapper uses
+    ``char_whitelist``, ``single_line``, ``single_word`` and the ``"psm"``
+    hint, while the PaddleOCR wrapper uses ``char_whitelist`` (as a
+    post-filter on recognized text) and ``use_angle_cls``.
+    """
+
+    char_whitelist: str | None = None  # e.g. VIN: "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
+    single_line: bool = False  # Treat image as a single text line
+    single_word: bool = False  # Treat image as a single word (wins over single_line in Tesseract)
+    use_angle_cls: bool = True  # Enable angle classification (PaddleOCR)
+    # Free-form engine-specific options; default_factory gives each instance its own dict.
+    hints: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class OcrEngineResult:
+    """Normalized result returned by every engine implementation.
+
+    The bundled engines build ``text`` by joining the kept fragments with
+    single spaces and set ``confidence`` to the mean of their confidences
+    (``0.0`` when nothing was recognized).
+    """
+
+    text: str  # Full recognized text
+    confidence: float  # 0.0-1.0
+    word_boxes: list[WordBox]  # One entry per kept fragment, in engine output order
+    engine_name: str  # "paddleocr", "tesseract", "google_vision"
+
+
+# --- Abstract base ---
+
+
+class OcrEngine(ABC):
+    """Abstract base class that all OCR engines must implement.
+
+    Subclasses provide ``recognize`` and the ``name`` property; both are
+    abstract, so a subclass missing either one cannot be instantiated.
+    """
+
+    @abstractmethod
+    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
+        """Run OCR on preprocessed image bytes.
+
+        Args:
+            image_bytes: Raw image bytes (PNG/JPEG).
+            config: Engine-agnostic configuration.
+
+        Returns:
+            Normalized OCR result.
+
+        Raises:
+            EngineProcessingError: If recognition fails.
+            EngineUnavailableError: If the engine is not ready.
+        """
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Short identifier used in OcrEngineResult.engine_name."""
--- a/ocr/app/engines/engine_factory.py
+++ b/ocr/app/engines/engine_factory.py
@@ -0,0 +1,52 @@
+"""Factory function for creating OCR engine instances from configuration."""
+
+import logging
+
+from app.config import settings
+from app.engines.base_engine import EngineUnavailableError, OcrEngine
+
+logger = logging.getLogger(__name__)
+
+# Valid engine identifiers, each mapped to the dotted "module.ClassName" path
+# of its implementation. The paths are only resolved (via importlib) inside
+# create_engine(), so an engine's module is not imported until that engine
+# is actually requested.
+_ENGINE_REGISTRY: dict[str, str] = {
+    "paddleocr": "app.engines.paddle_engine.PaddleOcrEngine",
+    "tesseract": "app.engines.tesseract_engine.TesseractEngine",
+}
+
+
+def create_engine(engine_name: str | None = None) -> OcrEngine:
+    """Build the OCR engine registered under ``engine_name``.
+
+    The name is lower-cased and stripped of surrounding whitespace before
+    lookup. When ``engine_name`` is ``None`` or empty, the value of
+    ``settings.ocr_primary_engine`` is used instead.
+
+    Args:
+        engine_name: Engine identifier ("paddleocr", "tesseract").
+
+    Returns:
+        A newly constructed OcrEngine instance.
+
+    Raises:
+        EngineUnavailableError: If the name is not registered, or if
+            importing the engine module or constructing the engine fails.
+    """
+    import importlib
+
+    requested = engine_name if engine_name else settings.ocr_primary_engine
+    name = requested.lower().strip()
+
+    dotted_path = _ENGINE_REGISTRY.get(name)
+    if dotted_path is None:
+        raise EngineUnavailableError(
+            f"Unknown engine '{name}'. Available: {list(_ENGINE_REGISTRY.keys())}"
+        )
+
+    # "pkg.module.ClassName" -> ("pkg.module", "ClassName")
+    module_path, _, class_name = dotted_path.rpartition(".")
+
+    try:
+        engine_cls = getattr(importlib.import_module(module_path), class_name)
+        engine: OcrEngine = engine_cls()
+        logger.info("Created OCR engine: %s", name)
+    except EngineUnavailableError:
+        # Already the right type (raised by the engine itself); pass through.
+        raise
+    except Exception as exc:
+        # Import errors, missing class, constructor failures, ...
+        raise EngineUnavailableError(
+            f"Failed to create engine '{name}': {exc}"
+        ) from exc
+    return engine
--- a/ocr/app/engines/paddle_engine.py
+++ b/ocr/app/engines/paddle_engine.py
@@ -0,0 +1,133 @@
+"""PaddleOCR engine wrapper using PP-OCRv4 models."""
+
+import io
+import logging
+from typing import Any
+
+from app.engines.base_engine import (
+    EngineProcessingError,
+    EngineUnavailableError,
+    OcrConfig,
+    OcrEngine,
+    OcrEngineResult,
+    WordBox,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class PaddleOcrEngine(OcrEngine):
+    """PaddleOCR PP-OCRv4 engine with angle classification, CPU-only.
+
+    The underlying ``PaddleOCR`` object is created lazily on the first
+    ``recognize`` call, so constructing this class does not import
+    paddleocr; a missing or broken installation surfaces as
+    ``EngineUnavailableError`` at recognition time.
+
+    NOTE(review): the constructor keywords ``use_gpu`` / ``show_log`` and the
+    ``ocr(..., cls=...)`` call look like the PaddleOCR 2.x API -- confirm the
+    installed paddleocr version still accepts them.
+    """
+
+    def __init__(self) -> None:
+        # Populated by _get_ocr() on first use.
+        self._ocr: Any | None = None
+
+    @property
+    def name(self) -> str:
+        """Identifier reported in ``OcrEngineResult.engine_name``."""
+        return "paddleocr"
+
+    def _get_ocr(self) -> Any:
+        """Lazy-initialize PaddleOCR instance on first use.
+
+        Returns:
+            The cached ``PaddleOCR`` instance.
+
+        Raises:
+            EngineUnavailableError: If paddleocr is not installed or its
+                constructor fails.
+        """
+        if self._ocr is not None:
+            return self._ocr
+        try:
+            from paddleocr import PaddleOCR  # type: ignore[import-untyped]
+
+            self._ocr = PaddleOCR(
+                use_angle_cls=True,
+                lang="en",
+                use_gpu=False,
+                show_log=False,
+            )
+            logger.info("PaddleOCR PP-OCRv4 initialized (CPU, angle_cls=True)")
+            return self._ocr
+        except ImportError as exc:
+            raise EngineUnavailableError(
+                "paddleocr is not installed. "
+                "Install with: pip install paddlepaddle paddleocr"
+            ) from exc
+        except Exception as exc:
+            raise EngineUnavailableError(
+                f"Failed to initialize PaddleOCR: {exc}"
+            ) from exc
+
+    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
+        """Run PaddleOCR on image bytes.
+
+        PaddleOCR returns: list of pages, each page is a list of
+        ``[[box_coords], (text, confidence)]`` entries. Only the first page
+        is read.
+
+        Of the ``OcrConfig`` fields, only ``use_angle_cls`` and
+        ``char_whitelist`` are used here; ``single_line``, ``single_word``
+        and ``hints`` are ignored by this engine.
+
+        Args:
+            image_bytes: Encoded image readable by Pillow.
+            config: Engine-agnostic configuration.
+
+        Returns:
+            Result whose text is the kept fragments joined by spaces and
+            whose confidence is the mean of their confidences (empty text
+            and ``0.0`` when nothing is recognized).
+
+        Raises:
+            EngineUnavailableError: If PaddleOCR cannot be initialized.
+            EngineProcessingError: If decoding or recognition fails.
+        """
+        ocr = self._get_ocr()
+
+        try:
+            import numpy as np  # type: ignore[import-untyped]
+            from PIL import Image
+
+            image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+            # NOTE(review): this array is in RGB order; verify whether
+            # PaddleOCR expects OpenCV-style BGR for ndarray input.
+            img_array = np.array(image)
+
+            # PaddleOCR accepts numpy arrays
+            results = ocr.ocr(img_array, cls=config.use_angle_cls)
+
+            if not results or not results[0]:
+                return OcrEngineResult(
+                    text="",
+                    confidence=0.0,
+                    word_boxes=[],
+                    engine_name=self.name,
+                )
+
+            # Build the whitelist lookup once instead of once per detected line.
+            allowed = set(config.char_whitelist) if config.char_whitelist else None
+
+            word_boxes: list[WordBox] = []
+            texts: list[str] = []
+            confidences: list[float] = []
+
+            for line in results[0]:
+                box_coords = line[0]  # [[x1,y1],[x2,y2],[x3,y3],[x4,y4]]
+                text = line[1][0]
+                conf = float(line[1][1])
+
+                # Apply character whitelist filter if configured. Characters
+                # outside the whitelist (including spaces, unless listed)
+                # are dropped rather than replaced.
+                if allowed is not None:
+                    text = "".join(ch for ch in text if ch in allowed)
+
+                text = text.strip()
+                if not text:
+                    # Nothing left after filtering/stripping: skip the entry.
+                    continue
+
+                # Convert quadrilateral to its axis-aligned bounding box
+                xs = [pt[0] for pt in box_coords]
+                ys = [pt[1] for pt in box_coords]
+                x_min, y_min = int(min(xs)), int(min(ys))
+                x_max, y_max = int(max(xs)), int(max(ys))
+
+                word_boxes.append(
+                    WordBox(
+                        text=text,
+                        confidence=conf,
+                        x=x_min,
+                        y=y_min,
+                        width=x_max - x_min,
+                        height=y_max - y_min,
+                    )
+                )
+                texts.append(text)
+                confidences.append(conf)
+
+            combined_text = " ".join(texts)
+            avg_confidence = (
+                sum(confidences) / len(confidences) if confidences else 0.0
+            )
+
+            return OcrEngineResult(
+                text=combined_text,
+                confidence=avg_confidence,
+                word_boxes=word_boxes,
+                engine_name=self.name,
+            )
+
+        except (EngineUnavailableError, EngineProcessingError):
+            raise
+        except Exception as exc:
+            # Normalize decoding/inference failures to the engine error type.
+            raise EngineProcessingError(
+                f"PaddleOCR recognition failed: {exc}"
+            ) from exc
--- a/ocr/app/engines/tesseract_engine.py
+++ b/ocr/app/engines/tesseract_engine.py
@@ -0,0 +1,114 @@
+"""Tesseract engine wrapper for backward compatibility."""
+
+import io
+import logging
+
+from app.config import settings
+from app.engines.base_engine import (
+    EngineProcessingError,
+    EngineUnavailableError,
+    OcrConfig,
+    OcrEngine,
+    OcrEngineResult,
+    WordBox,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class TesseractEngine(OcrEngine):
+    """pytesseract wrapper conforming to the OcrEngine interface.
+
+    The constructor imports pytesseract and points it at
+    ``settings.tesseract_cmd``; it does not check that the binary exists.
+    """
+
+    def __init__(self) -> None:
+        """Import pytesseract and configure the Tesseract binary path.
+
+        Raises:
+            EngineUnavailableError: If pytesseract is not installed.
+        """
+        try:
+            import pytesseract  # type: ignore[import-untyped]
+
+            pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+            self._pytesseract = pytesseract
+            logger.info("TesseractEngine initialized (cmd=%s)", settings.tesseract_cmd)
+        except ImportError as exc:
+            raise EngineUnavailableError(
+                "pytesseract is not installed. "
+                "Install with: pip install pytesseract"
+            ) from exc
+
+    @property
+    def name(self) -> str:
+        """Identifier reported in ``OcrEngineResult.engine_name``."""
+        return "tesseract"
+
+    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
+        """Run Tesseract OCR on image bytes.
+
+        Args:
+            image_bytes: Encoded image readable by Pillow.
+            config: Engine-agnostic configuration (see ``_build_config``).
+
+        Returns:
+            Result whose text is the kept words joined by spaces and whose
+            confidence is the mean word confidence scaled to 0.0-1.0
+            (empty text and ``0.0`` when no word is kept).
+
+        Raises:
+            EngineProcessingError: If decoding or recognition fails.
+        """
+        try:
+            from PIL import Image
+
+            image = Image.open(io.BytesIO(image_bytes))
+
+            # Build Tesseract config string from OcrConfig
+            tess_config = self._build_config(config)
+
+            # Get word-level data
+            ocr_data = self._pytesseract.image_to_data(
+                image,
+                config=tess_config,
+                output_type=self._pytesseract.Output.DICT,
+            )
+
+            word_boxes: list[WordBox] = []
+            texts: list[str] = []
+            confidences: list[float] = []
+
+            for i, raw_text in enumerate(ocr_data["text"]):
+                # Depending on the Tesseract/pytesseract versions, "conf" may
+                # arrive as an int, a float, or a numeric string such as
+                # "96.27". Going through float() first avoids a ValueError on
+                # fractional strings while truncating exactly as before.
+                conf = int(float(ocr_data["conf"][i]))
+                text = raw_text.strip()
+
+                # Skip empty rows and rows with non-positive confidence
+                # (presumably layout rows, which Tesseract marks with -1).
+                if not text or conf <= 0:
+                    continue
+
+                normalized_conf = conf / 100.0
+                word_boxes.append(
+                    WordBox(
+                        text=text,
+                        confidence=normalized_conf,
+                        x=int(ocr_data["left"][i]),
+                        y=int(ocr_data["top"][i]),
+                        width=int(ocr_data["width"][i]),
+                        height=int(ocr_data["height"][i]),
+                    )
+                )
+                texts.append(text)
+                confidences.append(normalized_conf)
+
+            combined_text = " ".join(texts)
+            avg_confidence = (
+                sum(confidences) / len(confidences) if confidences else 0.0
+            )
+
+            return OcrEngineResult(
+                text=combined_text,
+                confidence=avg_confidence,
+                word_boxes=word_boxes,
+                engine_name=self.name,
+            )
+
+        except (EngineUnavailableError, EngineProcessingError):
+            raise
+        except Exception as exc:
+            # Normalize decoding/Tesseract failures to the engine error type.
+            raise EngineProcessingError(
+                f"Tesseract recognition failed: {exc}"
+            ) from exc
+
+    def _build_config(self, config: OcrConfig) -> str:
+        """Translate OcrConfig into a Tesseract CLI config string.
+
+        ``single_word`` takes precedence over ``single_line``; the ``"psm"``
+        hint is only consulted when neither flag is set. ``use_angle_cls``
+        is not used by this engine.
+
+        Args:
+            config: Engine-agnostic configuration.
+
+        Returns:
+            Space-separated Tesseract options, e.g.
+            ``"--psm 7 -c tessedit_char_whitelist=ABC"``.
+        """
+        parts: list[str] = []
+
+        # Page segmentation mode
+        if config.single_word:
+            parts.append("--psm 8")
+        elif config.single_line:
+            parts.append("--psm 7")
+        else:
+            # Default: assume uniform block of text
+            psm = config.hints.get("psm", 6)
+            parts.append(f"--psm {psm}")
+
+        # Character whitelist
+        # NOTE(review): the value is inserted unquoted; a whitelist containing
+        # whitespace or quotes would presumably be split into extra arguments.
+        if config.char_whitelist:
+            parts.append(f"-c tessedit_char_whitelist={config.char_whitelist}")
+
+        return " ".join(parts)
--- a/ocr/requirements.txt
+++ b/ocr/requirements.txt
@@ -15,6 +15,8 @@ numpy>=1.24.0

 # OCR Engines
 pytesseract>=0.3.10
+paddlepaddle>=2.6.0
+paddleocr>=2.8.0

 # PDF Processing
 PyMuPDF>=1.23.0