# motovaultpro/ocr/app/engines/tesseract_engine.py

"""Tesseract engine wrapper for backward compatibility."""

import io
import logging

from app.config import settings
from app.engines.base_engine import (
    EngineProcessingError,
    EngineUnavailableError,
    OcrConfig,
    OcrEngine,
    OcrEngineResult,
    WordBox,
)

logger = logging.getLogger(__name__)


class TesseractEngine(OcrEngine):
    """pytesseract wrapper conforming to the OcrEngine interface.

    The pytesseract module is imported lazily in ``__init__`` so that this
    module can be imported even when the optional dependency is missing; the
    failure is then reported as ``EngineUnavailableError`` at construction.
    """

    def __init__(self) -> None:
        """Import pytesseract and point it at the configured Tesseract binary.

        Raises:
            EngineUnavailableError: If pytesseract cannot be imported.
        """
        try:
            import pytesseract  # type: ignore[import-untyped]
        except ImportError as exc:
            raise EngineUnavailableError(
                "pytesseract is not installed. "
                "Install with: pip install pytesseract"
            ) from exc

        # NOTE: this sets a module-level attribute on pytesseract, so it
        # applies to every user of pytesseract in the process.
        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
        self._pytesseract = pytesseract
        logger.info("TesseractEngine initialized (cmd=%s)", settings.tesseract_cmd)

    @property
    def name(self) -> str:
        """Return the engine identifier, ``"tesseract"``."""
        return "tesseract"

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run Tesseract OCR on image bytes.

        Words with empty text or a non-positive confidence are dropped. The
        remaining words are joined with single spaces to form the result
        text, and the result confidence is the mean of the per-word
        confidences scaled to the 0.0-1.0 range (0.0 when no word survives).

        Args:
            image_bytes: Encoded image data readable by ``PIL.Image.open``.
            config: Recognition options, translated by ``_build_config``.

        Returns:
            An ``OcrEngineResult`` carrying the combined text, the average
            confidence, the per-word boxes and this engine's name.

        Raises:
            EngineProcessingError: If image decoding or recognition fails.
        """
        try:
            from PIL import Image

            # Build Tesseract config string from OcrConfig
            tess_config = self._build_config(config)

            # The context manager releases the decoded image once Tesseract
            # has produced its word-level data.
            with Image.open(io.BytesIO(image_bytes)) as image:
                ocr_data = self._pytesseract.image_to_data(
                    image,
                    config=tess_config,
                    output_type=self._pytesseract.Output.DICT,
                )

            word_boxes: list[WordBox] = []
            texts: list[str] = []
            confidences: list[float] = []

            # A missing "text" column is treated as "no words found"
            # (pytesseract appears to return an empty dict when the TSV
            # output has no data rows -- confirm against the pinned version).
            for i, raw_text in enumerate(ocr_data.get("text", [])):
                text = raw_text.strip()
                if not text:
                    continue

                # Parse via float(): depending on the Tesseract/pytesseract
                # versions the confidence may arrive as an int, a float, or a
                # string such as "96.06", which int() would reject.
                conf = float(ocr_data["conf"][i])
                if conf <= 0:
                    # Tesseract reports -1 for non-word rows; zero-confidence
                    # words are discarded as well.
                    continue

                normalized_conf = conf / 100.0
                word_boxes.append(
                    WordBox(
                        text=text,
                        confidence=normalized_conf,
                        x=int(ocr_data["left"][i]),
                        y=int(ocr_data["top"][i]),
                        width=int(ocr_data["width"][i]),
                        height=int(ocr_data["height"][i]),
                    )
                )
                texts.append(text)
                confidences.append(normalized_conf)

            combined_text = " ".join(texts)
            avg_confidence = (
                sum(confidences) / len(confidences) if confidences else 0.0
            )

            return OcrEngineResult(
                text=combined_text,
                confidence=avg_confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )

        except (EngineUnavailableError, EngineProcessingError):
            # Already in the engine's own exception vocabulary; pass through.
            raise
        except Exception as exc:
            # Boundary handler: surface any other failure as a processing
            # error while keeping the original exception as the cause.
            raise EngineProcessingError(
                f"Tesseract recognition failed: {exc}"
            ) from exc

    def _build_config(self, config: OcrConfig) -> str:
        """Translate OcrConfig into a Tesseract CLI config string.

        ``single_word`` takes precedence over ``single_line``; when neither
        is set, the ``"psm"`` entry of ``config.hints`` is used, defaulting
        to 6. A non-empty ``char_whitelist`` is appended as a
        ``tessedit_char_whitelist`` variable.

        Args:
            config: Recognition options to translate.

        Returns:
            Space-separated Tesseract command-line options.
        """
        parts: list[str] = []

        # Page segmentation mode
        if config.single_word:
            parts.append("--psm 8")
        elif config.single_line:
            parts.append("--psm 7")
        else:
            # Default: assume uniform block of text
            psm = config.hints.get("psm", 6)
            parts.append(f"--psm {psm}")

        # Character whitelist
        # NOTE(review): the whitelist is inserted verbatim; a value containing
        # whitespace or quotes would presumably be split when the config
        # string is tokenized -- confirm callers only pass plain characters.
        if config.char_whitelist:
            parts.append(f"-c tessedit_char_whitelist={config.char_whitelist}")

        return " ".join(parts)