feat: add OCR engine abstraction layer (refs #116)

Introduce a pluggable OcrEngine ABC with PaddleOCR PP-OCRv4 as the primary engine and a Tesseract wrapper for backward compatibility. The engine factory reads the OCR_PRIMARY_ENGINE config to instantiate the correct engine.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 10:47:40 -06:00
parent 6b0c18a41c
commit ebc633fb36
7 changed files with 422 additions and 0 deletions
--- a/ocr/app/engines/tesseract_engine.py
+++ b/ocr/app/engines/tesseract_engine.py
@@ -0,0 +1,114 @@
+"""Tesseract engine wrapper for backward compatibility."""
+
+import io
+import logging
+
+from app.config import settings
+from app.engines.base_engine import (
+    EngineProcessingError,
+    EngineUnavailableError,
+    OcrConfig,
+    OcrEngine,
+    OcrEngineResult,
+    WordBox,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class TesseractEngine(OcrEngine):
+    """pytesseract wrapper conforming to the OcrEngine interface."""
+
+    def __init__(self) -> None:
+        try:
+            import pytesseract  # type: ignore[import-untyped]
+
+            pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+            self._pytesseract = pytesseract
+            logger.info("TesseractEngine initialized (cmd=%s)", settings.tesseract_cmd)
+        except ImportError as exc:
+            raise EngineUnavailableError(
+                "pytesseract is not installed. "
+                "Install with: pip install pytesseract"
+            ) from exc
+
+    @property
+    def name(self) -> str:
+        return "tesseract"
+
+    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
+        """Run Tesseract OCR on image bytes."""
+        try:
+            from PIL import Image
+
+            image = Image.open(io.BytesIO(image_bytes))
+
+            # Build Tesseract config string from OcrConfig
+            tess_config = self._build_config(config)
+
+            # Get word-level data
+            ocr_data = self._pytesseract.image_to_data(
+                image,
+                config=tess_config,
+                output_type=self._pytesseract.Output.DICT,
+            )
+
+            word_boxes: list[WordBox] = []
+            texts: list[str] = []
+            confidences: list[float] = []
+
+            for i, text in enumerate(ocr_data["text"]):
+                conf = int(ocr_data["conf"][i])
+                if text.strip() and conf > 0:
+                    normalized_conf = conf / 100.0
+                    word_boxes.append(
+                        WordBox(
+                            text=text.strip(),
+                            confidence=normalized_conf,
+                            x=int(ocr_data["left"][i]),
+                            y=int(ocr_data["top"][i]),
+                            width=int(ocr_data["width"][i]),
+                            height=int(ocr_data["height"][i]),
+                        )
+                    )
+                    texts.append(text.strip())
+                    confidences.append(normalized_conf)
+
+            combined_text = " ".join(texts)
+            avg_confidence = (
+                sum(confidences) / len(confidences) if confidences else 0.0
+            )
+
+            return OcrEngineResult(
+                text=combined_text,
+                confidence=avg_confidence,
+                word_boxes=word_boxes,
+                engine_name=self.name,
+            )
+
+        except (EngineUnavailableError, EngineProcessingError):
+            raise
+        except Exception as exc:
+            raise EngineProcessingError(
+                f"Tesseract recognition failed: {exc}"
+            ) from exc
+
+    def _build_config(self, config: OcrConfig) -> str:
+        """Translate OcrConfig into a Tesseract CLI config string."""
+        parts: list[str] = []
+
+        # Page segmentation mode
+        if config.single_word:
+            parts.append("--psm 8")
+        elif config.single_line:
+            parts.append("--psm 7")
+        else:
+            # Default: assume uniform block of text
+            psm = config.hints.get("psm", 6)
+            parts.append(f"--psm {psm}")
+
+        # Character whitelist
+        if config.char_whitelist:
+            parts.append(f"-c tessedit_char_whitelist={config.char_whitelist}")
+
+        return " ".join(parts)