fix: Build errors and tesseract removal

2026-02-07 12:12:04 -06:00
parent cf114fad3c
commit b9fe222f12
16 changed files with 35 additions and 238 deletions
--- a/ocr/app/engines/__init__.py
+++ b/ocr/app/engines/__init__.py
@@ -5,7 +5,6 @@ decoupling extractors from specific OCR libraries.

 Engines:
  - PaddleOcrEngine: PaddleOCR PP-OCRv4 (primary, CPU-only)
-  - TesseractEngine: pytesseract wrapper (backward compatibility)
  - CloudEngine: Google Vision TEXT_DETECTION (optional cloud fallback)
  - HybridEngine: Primary + fallback with confidence threshold
 """
--- a/ocr/app/engines/base_engine.py
+++ b/ocr/app/engines/base_engine.py
@@ -57,7 +57,7 @@ class OcrEngineResult:
    text: str
    confidence: float  # 0.0-1.0
    word_boxes: list[WordBox]
-    engine_name: str  # "paddleocr", "tesseract", "google_vision"
+    engine_name: str  # "paddleocr", "google_vision"


 # --- Abstract base ---
--- a/ocr/app/engines/engine_factory.py
+++ b/ocr/app/engines/engine_factory.py
@@ -11,7 +11,6 @@ logger = logging.getLogger(__name__)
 # Valid engine identifiers (primary engines only; hybrid is constructed separately)
 _ENGINE_REGISTRY: dict[str, str] = {
    "paddleocr": "app.engines.paddle_engine.PaddleOcrEngine",
-    "tesseract": "app.engines.tesseract_engine.TesseractEngine",
    "google_vision": "app.engines.cloud_engine.CloudEngine",
 }

@@ -46,7 +45,7 @@ def create_engine(engine_name: str | None = None) -> OcrEngine:
    returns a ``HybridEngine`` that wraps the primary with the fallback.

    Args:
-        engine_name: Engine identifier ("paddleocr", "tesseract").
+        engine_name: Engine identifier ("paddleocr", "google_vision").
                     Falls back to ``settings.ocr_primary_engine``.

    Returns:
--- a/ocr/app/engines/tesseract_engine.py
+++ b/ocr/app/engines/tesseract_engine.py
@@ -1,114 +0,0 @@
-"""Tesseract engine wrapper for backward compatibility."""
-
-import io
-import logging
-
-from app.config import settings
-from app.engines.base_engine import (
-    EngineProcessingError,
-    EngineUnavailableError,
-    OcrConfig,
-    OcrEngine,
-    OcrEngineResult,
-    WordBox,
-)
-
-logger = logging.getLogger(__name__)
-
-
-class TesseractEngine(OcrEngine):
-    """pytesseract wrapper conforming to the OcrEngine interface."""
-
-    def __init__(self) -> None:
-        try:
-            import pytesseract  # type: ignore[import-untyped]
-
-            pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
-            self._pytesseract = pytesseract
-            logger.info("TesseractEngine initialized (cmd=%s)", settings.tesseract_cmd)
-        except ImportError as exc:
-            raise EngineUnavailableError(
-                "pytesseract is not installed. "
-                "Install with: pip install pytesseract"
-            ) from exc
-
-    @property
-    def name(self) -> str:
-        return "tesseract"
-
-    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
-        """Run Tesseract OCR on image bytes."""
-        try:
-            from PIL import Image
-
-            image = Image.open(io.BytesIO(image_bytes))
-
-            # Build Tesseract config string from OcrConfig
-            tess_config = self._build_config(config)
-
-            # Get word-level data
-            ocr_data = self._pytesseract.image_to_data(
-                image,
-                config=tess_config,
-                output_type=self._pytesseract.Output.DICT,
-            )
-
-            word_boxes: list[WordBox] = []
-            texts: list[str] = []
-            confidences: list[float] = []
-
-            for i, text in enumerate(ocr_data["text"]):
-                conf = int(ocr_data["conf"][i])
-                if text.strip() and conf > 0:
-                    normalized_conf = conf / 100.0
-                    word_boxes.append(
-                        WordBox(
-                            text=text.strip(),
-                            confidence=normalized_conf,
-                            x=int(ocr_data["left"][i]),
-                            y=int(ocr_data["top"][i]),
-                            width=int(ocr_data["width"][i]),
-                            height=int(ocr_data["height"][i]),
-                        )
-                    )
-                    texts.append(text.strip())
-                    confidences.append(normalized_conf)
-
-            combined_text = " ".join(texts)
-            avg_confidence = (
-                sum(confidences) / len(confidences) if confidences else 0.0
-            )
-
-            return OcrEngineResult(
-                text=combined_text,
-                confidence=avg_confidence,
-                word_boxes=word_boxes,
-                engine_name=self.name,
-            )
-
-        except (EngineUnavailableError, EngineProcessingError):
-            raise
-        except Exception as exc:
-            raise EngineProcessingError(
-                f"Tesseract recognition failed: {exc}"
-            ) from exc
-
-    def _build_config(self, config: OcrConfig) -> str:
-        """Translate OcrConfig into a Tesseract CLI config string."""
-        parts: list[str] = []
-
-        # Page segmentation mode
-        if config.single_word:
-            parts.append("--psm 8")
-        elif config.single_line:
-            parts.append("--psm 7")
-        else:
-            # Default: assume uniform block of text
-            psm = config.hints.get("psm", 6)
-            parts.append(f"--psm {psm}")
-
-        # Character whitelist
-        if config.char_whitelist:
-            parts.append(f"-c tessedit_char_whitelist={config.char_whitelist}")
-
-        return " ".join(parts)