All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 5m6s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 51s
Deploy to Staging / Verify Staging (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
150 lines
4.7 KiB
Python
150 lines
4.7 KiB
Python
"""PaddleOCR engine wrapper using PP-OCRv4 models."""
|
|
|
|
import io
|
|
import logging
|
|
from typing import Any
|
|
|
|
from app.engines.base_engine import (
|
|
EngineProcessingError,
|
|
EngineUnavailableError,
|
|
OcrConfig,
|
|
OcrEngine,
|
|
OcrEngineResult,
|
|
WordBox,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class PaddleOcrEngine(OcrEngine):
    """OCR engine backed by PaddleOCR's PP-OCRv4 models.

    The underlying ``PaddleOCR`` object is created lazily on first use with
    text-line orientation enabled, the English language models and the CPU
    device.
    """

    def __init__(self) -> None:
        """Create the wrapper without loading PaddleOCR yet."""
        # Filled in by ``_get_ocr`` the first time the engine is needed.
        self._ocr: Any | None = None

    @property
    def name(self) -> str:
        """Return the identifier reported in results: ``"paddleocr"``."""
        return "paddleocr"

    def _get_ocr(self) -> Any:
        """Return the PaddleOCR instance, creating it on first use.

        Raises:
            EngineUnavailableError: if ``paddleocr`` cannot be imported or
                the ``PaddleOCR`` constructor fails for any reason.
        """
        if self._ocr is not None:
            return self._ocr
        try:
            # Imported here so the module can be loaded without paddleocr.
            from paddleocr import PaddleOCR  # type: ignore[import-untyped]

            self._ocr = PaddleOCR(
                ocr_version="PP-OCRv4",
                use_textline_orientation=True,
                lang="en",
                device="cpu",
            )
            logger.info("PaddleOCR PP-OCRv4 initialized (CPU, textline_orientation=True)")
            return self._ocr
        except ImportError as exc:
            raise EngineUnavailableError(
                "paddleocr is not installed. "
                "Install with: pip install paddlepaddle paddleocr"
            ) from exc
        except Exception as exc:
            raise EngineUnavailableError(
                f"Failed to initialize PaddleOCR: {exc}"
            ) from exc

    def _empty_result(self) -> OcrEngineResult:
        """Build the result returned when no text was recognized."""
        return OcrEngineResult(
            text="",
            confidence=0.0,
            word_boxes=[],
            engine_name=self.name,
        )

    @staticmethod
    def _polygon_to_box(poly: Any) -> tuple[int, int, int, int]:
        """Convert a polygon (iterable of points) to ``(x, y, width, height)``.

        Each point is indexed as ``pt[0]`` (x) and ``pt[1]`` (y); coordinates
        are truncated to ``int`` before the extent is computed.
        """
        xs = [pt[0] for pt in poly]
        ys = [pt[1] for pt in poly]
        x_min, y_min = int(min(xs)), int(min(ys))
        x_max, y_max = int(max(xs)), int(max(ys))
        return x_min, y_min, x_max - x_min, y_max - y_min

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run PaddleOCR on encoded image bytes.

        The image is decoded with Pillow, converted to RGB and passed to
        ``predict()`` as a NumPy array. Only the first result yielded by
        ``predict()`` is used; its ``dt_polys``, ``rec_texts`` and
        ``rec_scores`` entries are read with ``.get``.

        Args:
            image_bytes: Encoded image data readable by ``PIL.Image.open``.
            config: OCR settings; only ``char_whitelist`` is read here. When
                it is truthy, characters outside it are removed from each
                recognized line.

        Returns:
            An ``OcrEngineResult`` whose ``text`` is the kept lines joined by
            single spaces, whose ``confidence`` is the mean score of the kept
            lines (``0.0`` when none are kept), and whose ``word_boxes`` hold
            one ``WordBox`` per kept line.

        Raises:
            EngineUnavailableError: if PaddleOCR cannot be initialized.
            EngineProcessingError: for any other failure while decoding the
                image or running recognition.
        """
        ocr = self._get_ocr()

        try:
            import numpy as np  # type: ignore[import-untyped]
            from PIL import Image

            with Image.open(io.BytesIO(image_bytes)) as source_image:
                img_array = np.array(source_image.convert("RGB"))

            results = list(ocr.predict(img_array))
            if not results:
                return self._empty_result()

            first = results[0]
            # Use the ``res`` attribute when present (the original contract).
            # NOTE(review): PaddleOCR 3.x result objects appear to be
            # dict-like themselves and may lack ``.res`` -- confirm against
            # the installed version. Falling back to the object keeps both
            # shapes working.
            res = getattr(first, "res", first)
            dt_polys = res.get("dt_polys", [])
            rec_texts = res.get("rec_texts", [])
            rec_scores = res.get("rec_scores", [])
            if not rec_texts:
                return self._empty_result()

            # Build the whitelist set once rather than once per line.
            allowed = set(config.char_whitelist) if config.char_whitelist else None

            word_boxes: list[WordBox] = []
            texts: list[str] = []
            confidences: list[float] = []

            for i, text in enumerate(rec_texts):
                # A missing score is treated as zero confidence.
                conf = float(rec_scores[i]) if i < len(rec_scores) else 0.0

                if allowed is not None:
                    text = "".join(ch for ch in text if ch in allowed)

                cleaned = text.strip()
                if not cleaned:
                    # Nothing left after filtering/stripping: drop the line.
                    continue

                # Lines without a matching polygon get an all-zero box.
                x_min, y_min, width, height = 0, 0, 0, 0
                if i < len(dt_polys):
                    x_min, y_min, width, height = self._polygon_to_box(dt_polys[i])

                word_boxes.append(
                    WordBox(
                        text=cleaned,
                        confidence=conf,
                        x=x_min,
                        y=y_min,
                        width=width,
                        height=height,
                    )
                )
                texts.append(cleaned)
                confidences.append(conf)

            combined_text = " ".join(texts)
            avg_confidence = (
                sum(confidences) / len(confidences) if confidences else 0.0
            )

            return OcrEngineResult(
                text=combined_text,
                confidence=avg_confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )

        except (EngineUnavailableError, EngineProcessingError):
            # Already in the engine's own error vocabulary; pass through.
            raise
        except Exception as exc:
            raise EngineProcessingError(
                f"PaddleOCR recognition failed: {exc}"
            ) from exc