feat: add OCR engine abstraction layer (refs #116)

Introduce pluggable OcrEngine ABC with PaddleOCR PP-OCRv4 as primary engine and Tesseract wrapper for backward compatibility. Engine factory reads OCR_PRIMARY_ENGINE config to instantiate the correct engine. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 10:47:40 -06:00
parent 6b0c18a41c
commit ebc633fb36
7 changed files with 422 additions and 0 deletions
--- a/ocr/app/engines/base_engine.py
+++ b/ocr/app/engines/base_engine.py
@@ -0,0 +1,88 @@
+"""OCR engine abstract base class and shared data types."""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any
+
+
+# --- Exception hierarchy ---
+
+
+class EngineError(Exception):
+    """Base exception for all OCR engine errors; the other engine exceptions derive from it."""
+
+
+class EngineUnavailableError(EngineError):
+    """Raised when an engine cannot be initialized or is not ready (missing binary, bad config)."""
+
+
+class EngineProcessingError(EngineError):
+    """Raised when an engine fails to process an image, i.e. recognition itself fails."""
+
+
+# --- Data types ---
+
+
+@dataclass
+class WordBox:
+    """A single recognized word with its text, confidence, and bounding-box position/size."""
+
+    text: str  # the recognized word
+    confidence: float  # expected range 0.0-1.0; not validated by this class
+    x: int = 0  # box position; defaults to 0 (origin and units not defined here, presumably pixels - TODO confirm)
+    y: int = 0
+    width: int = 0  # box size; defaults to 0 (same unit caveat as the position fields)
+    height: int = 0
+
+
+@dataclass
+class OcrConfig:
+    """Engine-agnostic OCR configuration; every field has a default.
+
+    Common fields cover the most frequent needs. Engine-specific
+    parameters go into ``hints`` so the interface stays stable.
+    """
+
+    char_whitelist: str | None = None  # e.g. VIN: "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"; None presumably means no restriction (confirm per engine)
+    single_line: bool = False  # Treat image as a single text line
+    single_word: bool = False  # Treat image as a single word (no check here that it excludes single_line)
+    use_angle_cls: bool = True  # Enable angle classification (PaddleOCR)
+    hints: dict[str, Any] = field(default_factory=dict)  # engine-specific parameters; each instance gets its own dict
+
+
+@dataclass
+class OcrEngineResult:
+    """Normalized result returned by every engine implementation; all fields are required."""
+
+    text: str  # recognized text
+    confidence: float  # expected range 0.0-1.0; not validated by this class
+    word_boxes: list[WordBox]  # recognized words with position and confidence
+    engine_name: str  # identifier from OcrEngine.name, e.g. "paddleocr", "tesseract", "google_vision"
+
+
+# --- Abstract base ---
+
+
+class OcrEngine(ABC):
+    """Abstract interface for OCR engines; subclasses must implement ``recognize`` and ``name``."""
+
+    @abstractmethod
+    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
+        """Run OCR on preprocessed image bytes and return a normalized result.
+
+        Args:
+            image_bytes: Raw image bytes (PNG/JPEG).
+            config: Engine-agnostic configuration; engine-specific options travel in ``config.hints``.
+
+        Returns:
+            Normalized OCR result (text, confidence, word boxes, engine name).
+
+        Raises:
+            EngineProcessingError: If recognition fails.
+            EngineUnavailableError: If the engine is not ready.
+        """
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Short engine identifier used in OcrEngineResult.engine_name (e.g. "tesseract")."""