"""PaddleOCR engine wrapper using PP-OCRv4 models."""

import io
import logging
from typing import Any

from app.engines.base_engine import (
    EngineProcessingError,
    EngineUnavailableError,
    OcrConfig,
    OcrEngine,
    OcrEngineResult,
    WordBox,
)

logger = logging.getLogger(__name__)


class PaddleOcrEngine(OcrEngine):
    """PaddleOCR PP-OCRv4 engine with angle classification, CPU-only.

    The underlying ``PaddleOCR`` object is created lazily on the first call
    that needs it and then cached on the instance, so constructing this
    wrapper does not import ``paddleocr``.
    """

    def __init__(self) -> None:
        # Cached PaddleOCR instance; stays None until _get_ocr() succeeds.
        self._ocr: Any | None = None

    @property
    def name(self) -> str:
        """Identifier reported in ``OcrEngineResult.engine_name``."""
        return "paddleocr"

    def _get_ocr(self) -> Any:
        """Lazy-initialize PaddleOCR instance on first use.

        Returns:
            The cached ``PaddleOCR`` instance.

        Raises:
            EngineUnavailableError: If ``paddleocr`` cannot be imported or
                the ``PaddleOCR`` constructor raises.
        """
        if self._ocr is not None:
            return self._ocr

        try:
            from paddleocr import PaddleOCR  # type: ignore[import-untyped]

            self._ocr = PaddleOCR(
                ocr_version="PP-OCRv4",
                use_textline_orientation=True,
                lang="en",
                device="cpu",
            )
            logger.info("PaddleOCR PP-OCRv4 initialized (CPU, textline_orientation=True)")
            return self._ocr
        except ImportError as exc:
            raise EngineUnavailableError(
                "paddleocr is not installed. "
                "Install with: pip install paddlepaddle paddleocr"
            ) from exc
        except Exception as exc:
            raise EngineUnavailableError(
                f"Failed to initialize PaddleOCR: {exc}"
            ) from exc

    @staticmethod
    def _poly_to_bbox(poly: Any) -> tuple[int, int, int, int]:
        """Convert a polygon (iterable of points) to ``(x, y, width, height)``.

        Only the first two components of each point are read. The extremes
        are truncated to ``int`` before the width/height are computed. A
        polygon with no points yields ``(0, 0, 0, 0)`` instead of raising.
        """
        xs = [pt[0] for pt in poly]
        ys = [pt[1] for pt in poly]
        if not xs:
            # Same fallback the caller uses when no polygon exists at all.
            return 0, 0, 0, 0
        x_min, y_min = int(min(xs)), int(min(ys))
        x_max, y_max = int(max(xs)), int(max(ys))
        return x_min, y_min, x_max - x_min, y_max - y_min

    def _empty_result(self) -> OcrEngineResult:
        """Build the result returned when nothing was recognized."""
        return OcrEngineResult(
            text="",
            confidence=0.0,
            word_boxes=[],
            engine_name=self.name,
        )

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run PaddleOCR on image bytes.

        PaddleOCR v3.x ``predict()`` returns an iterator of result objects.
        Each result has a ``res`` dict with ``dt_polys``, ``rec_texts``, and
        ``rec_scores``. Only the first result is consumed.

        Args:
            image_bytes: Encoded image data readable by Pillow.
            config: OCR options; only ``char_whitelist`` is read here. When
                it is truthy, every character not in it is dropped from each
                recognized text line.

        Returns:
            An ``OcrEngineResult`` whose ``text`` is the surviving text lines
            joined by single spaces, whose ``confidence`` is the arithmetic
            mean of the surviving lines' scores (``0.0`` when none survive),
            and whose ``word_boxes`` holds one axis-aligned box per
            surviving line.

        Raises:
            EngineUnavailableError: If PaddleOCR cannot be initialized.
            EngineProcessingError: If decoding or recognition fails for any
                other reason.
        """
        ocr = self._get_ocr()

        try:
            import numpy as np  # type: ignore[import-untyped]
            from PIL import Image

            # Close the source image as soon as the RGB copy has been turned
            # into an array; convert() returns an independent image.
            with Image.open(io.BytesIO(image_bytes)) as source_image:
                img_array = np.array(source_image.convert("RGB"))

            results = list(ocr.predict(img_array))
            if not results:
                return self._empty_result()

            # NOTE(review): relies on the result object exposing a ``res``
            # attribute - confirm against the pinned PaddleOCR version.
            res = results[0].res
            dt_polys = res.get("dt_polys", [])
            rec_texts = res.get("rec_texts", [])
            rec_scores = res.get("rec_scores", [])

            if not rec_texts:
                return self._empty_result()

            # Build the whitelist set once instead of once per text line.
            allowed = set(config.char_whitelist) if config.char_whitelist else None

            word_boxes: list[WordBox] = []
            texts: list[str] = []
            confidences: list[float] = []

            for i, raw_text in enumerate(rec_texts):
                # A missing score counts as zero confidence.
                conf = float(rec_scores[i]) if i < len(rec_scores) else 0.0

                text = raw_text
                if allowed is not None:
                    text = "".join(ch for ch in text if ch in allowed)

                text = text.strip()
                if not text:
                    # Nothing left after filtering/stripping: drop the line
                    # so it does not affect the text or the mean confidence.
                    continue

                # A missing polygon falls back to a zero-sized box at origin.
                if i < len(dt_polys):
                    x_min, y_min, width, height = self._poly_to_bbox(dt_polys[i])
                else:
                    x_min, y_min, width, height = 0, 0, 0, 0

                word_boxes.append(
                    WordBox(
                        text=text,
                        confidence=conf,
                        x=x_min,
                        y=y_min,
                        width=width,
                        height=height,
                    )
                )
                texts.append(text)
                confidences.append(conf)

            combined_text = " ".join(texts)
            avg_confidence = (
                sum(confidences) / len(confidences) if confidences else 0.0
            )

            return OcrEngineResult(
                text=combined_text,
                confidence=avg_confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )

        except (EngineUnavailableError, EngineProcessingError):
            raise
        except Exception as exc:
            raise EngineProcessingError(
                f"PaddleOCR recognition failed: {exc}"
            ) from exc