# motovaultpro/ocr/app/engines/paddle_engine.py

"""PaddleOCR engine wrapper using PP-OCRv4 models."""

import io
import logging
from typing import Any

from app.engines.base_engine import (
    EngineProcessingError,
    EngineUnavailableError,
    OcrConfig,
    OcrEngine,
    OcrEngineResult,
    WordBox,
)

logger = logging.getLogger(__name__)


class PaddleOcrEngine(OcrEngine):
    """PaddleOCR PP-OCRv4 engine with angle classification, CPU-only.

    The underlying ``PaddleOCR`` object is created lazily on the first call
    that needs it and is then reused for the lifetime of this instance.
    """

    def __init__(self) -> None:
        # Filled in by ``_get_ocr`` on first use; ``None`` until then.
        self._ocr: Any | None = None

    @property
    def name(self) -> str:
        """Engine identifier reported as ``engine_name`` on results."""
        return "paddleocr"

    def _get_ocr(self) -> Any:
        """Return the PaddleOCR instance, creating it on first use.

        Raises:
            EngineUnavailableError: If ``paddleocr`` cannot be imported or
                the ``PaddleOCR`` object fails to initialize.
        """
        if self._ocr is not None:
            return self._ocr
        try:
            # Imported here so this module can be loaded without paddleocr
            # installed; the failure surfaces only when the engine is used.
            from paddleocr import PaddleOCR  # type: ignore[import-untyped]

            self._ocr = PaddleOCR(
                ocr_version="PP-OCRv4",
                use_textline_orientation=True,
                lang="en",
                device="cpu",
            )
            logger.info("PaddleOCR PP-OCRv4 initialized (CPU, textline_orientation=True)")
            return self._ocr
        except ImportError as exc:
            raise EngineUnavailableError(
                "paddleocr is not installed. "
                "Install with: pip install paddlepaddle paddleocr"
            ) from exc
        except Exception as exc:
            raise EngineUnavailableError(
                f"Failed to initialize PaddleOCR: {exc}"
            ) from exc

    def _empty_result(self) -> OcrEngineResult:
        """Build the result returned when no text was recognized."""
        return OcrEngineResult(
            text="",
            confidence=0.0,
            word_boxes=[],
            engine_name=self.name,
        )

    @staticmethod
    def _poly_to_bbox(poly: Any) -> tuple[int, int, int, int]:
        """Convert a polygon of ``(x, y)`` points to ``(x, y, width, height)``.

        The box is the axis-aligned extent of the points, truncated to ints.
        A polygon with no points yields ``(0, 0, 0, 0)``.
        """
        points = list(poly)
        if not points:
            return 0, 0, 0, 0
        xs = [pt[0] for pt in points]
        ys = [pt[1] for pt in points]
        x_min, y_min = int(min(xs)), int(min(ys))
        x_max, y_max = int(max(xs)), int(max(ys))
        return x_min, y_min, x_max - x_min, y_max - y_min

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run PaddleOCR on image bytes.

        This wrapper consumes ``predict()`` as an iterable of result objects
        and reads ``dt_polys``, ``rec_texts`` and ``rec_scores`` from the
        ``res`` mapping of the first result only.

        Each recognized line is optionally filtered through
        ``config.char_whitelist``, stripped, and dropped if nothing remains.

        Args:
            image_bytes: Encoded image data readable by Pillow.
            config: OCR options; only ``char_whitelist`` is read here.

        Returns:
            A result whose ``text`` is the kept lines joined by single
            spaces, whose ``confidence`` is the mean score of the kept
            lines (``0.0`` if none), and whose ``word_boxes`` has one entry
            per kept line.

        Raises:
            EngineUnavailableError: If the PaddleOCR instance cannot be
                created.
            EngineProcessingError: If decoding, prediction, or result
                parsing fails for any other reason.
        """
        ocr = self._get_ocr()

        try:
            import numpy as np  # type: ignore[import-untyped]
            from PIL import Image

            # ``convert`` returns a new image, so the source image can be
            # released as soon as the array has been built.
            with Image.open(io.BytesIO(image_bytes)) as source_image:
                img_array = np.array(source_image.convert("RGB"))

            results = list(ocr.predict(img_array))
            if not results:
                return self._empty_result()

            res = results[0].res
            dt_polys = res.get("dt_polys", [])
            rec_texts = res.get("rec_texts", [])
            rec_scores = res.get("rec_scores", [])
            if not rec_texts:
                return self._empty_result()

            # The whitelist does not change between lines: build it once.
            allowed = (
                frozenset(config.char_whitelist) if config.char_whitelist else None
            )

            word_boxes: list[WordBox] = []
            texts: list[str] = []
            confidences: list[float] = []

            for i, raw_text in enumerate(rec_texts):
                # Scores and polygons are index-aligned with the texts; a
                # missing entry falls back to 0.0 / a zero-sized box.
                conf = float(rec_scores[i]) if i < len(rec_scores) else 0.0

                text = raw_text
                if allowed is not None:
                    text = "".join(ch for ch in text if ch in allowed)
                text = text.strip()
                if not text:
                    continue

                if i < len(dt_polys):
                    x, y, width, height = self._poly_to_bbox(dt_polys[i])
                else:
                    x, y, width, height = 0, 0, 0, 0

                word_boxes.append(
                    WordBox(
                        text=text,
                        confidence=conf,
                        x=x,
                        y=y,
                        width=width,
                        height=height,
                    )
                )
                texts.append(text)
                confidences.append(conf)

            avg_confidence = (
                sum(confidences) / len(confidences) if confidences else 0.0
            )

            return OcrEngineResult(
                text=" ".join(texts),
                confidence=avg_confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )

        except (EngineUnavailableError, EngineProcessingError):
            # Already an engine-level error: propagate without re-wrapping.
            raise
        except Exception as exc:
            raise EngineProcessingError(
                f"PaddleOCR recognition failed: {exc}"
            ) from exc