feat: add OCR engine abstraction layer (refs #116)
Introduce pluggable OcrEngine ABC with PaddleOCR PP-OCRv4 as primary engine and Tesseract wrapper for backward compatibility. Engine factory reads OCR_PRIMARY_ENGINE config to instantiate the correct engine. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
133
ocr/app/engines/paddle_engine.py
Normal file
133
ocr/app/engines/paddle_engine.py
Normal file
@@ -0,0 +1,133 @@
|
||||
"""PaddleOCR engine wrapper using PP-OCRv4 models."""
|
||||
|
||||
import io
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from app.engines.base_engine import (
|
||||
EngineProcessingError,
|
||||
EngineUnavailableError,
|
||||
OcrConfig,
|
||||
OcrEngine,
|
||||
OcrEngineResult,
|
||||
WordBox,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PaddleOcrEngine(OcrEngine):
    """PaddleOCR PP-OCRv4 engine with angle classification, CPU-only.

    The underlying ``PaddleOCR`` object is created lazily by ``_get_ocr`` on
    the first call that needs it and is then cached on the instance.
    """

    def __init__(self) -> None:
        # Cached PaddleOCR instance; stays None until _get_ocr() succeeds.
        self._ocr: Any | None = None

    @property
    def name(self) -> str:
        """Short engine identifier; ``recognize`` reports it as ``engine_name``."""
        return "paddleocr"

    def _get_ocr(self) -> Any:
        """Lazy-initialize the PaddleOCR instance on first use.

        Returns:
            The cached ``PaddleOCR`` object.

        Raises:
            EngineUnavailableError: If ``paddleocr`` cannot be imported or the
                ``PaddleOCR`` constructor fails for any reason.
        """
        if self._ocr is not None:
            return self._ocr

        try:
            # Imported here so the module can be loaded without paddleocr.
            from paddleocr import PaddleOCR  # type: ignore[import-untyped]

            self._ocr = PaddleOCR(
                use_angle_cls=True,
                lang="en",
                use_gpu=False,
                show_log=False,
            )
            logger.info("PaddleOCR PP-OCRv4 initialized (CPU, angle_cls=True)")
            return self._ocr
        except ImportError as exc:
            raise EngineUnavailableError(
                "paddleocr is not installed. "
                "Install with: pip install paddlepaddle paddleocr"
            ) from exc
        except Exception as exc:
            raise EngineUnavailableError(
                f"Failed to initialize PaddleOCR: {exc}"
            ) from exc

    @staticmethod
    def _bounding_box(quad: Any) -> tuple[int, int, int, int]:
        """Return ``(x, y, width, height)`` of the axis-aligned box around *quad*.

        *quad* is an iterable of ``[x, y]`` points; coordinates are truncated
        to ``int`` after taking the min/max.
        """
        xs = [pt[0] for pt in quad]
        ys = [pt[1] for pt in quad]
        x_min, y_min = int(min(xs)), int(min(ys))
        x_max, y_max = int(max(xs)), int(max(ys))
        return x_min, y_min, x_max - x_min, y_max - y_min

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run PaddleOCR on image bytes.

        PaddleOCR returns: list of pages, each page is a list of
        ``[[box_coords], (text, confidence)]`` entries. Only the first page
        is read.

        Args:
            image_bytes: Encoded image data readable by ``PIL.Image.open``.
            config: Supplies ``use_angle_cls`` (forwarded as ``cls``) and an
                optional ``char_whitelist``; when the whitelist is truthy,
                characters not in it are removed from each recognized line.

        Returns:
            An ``OcrEngineResult`` whose ``text`` is the kept lines joined by
            single spaces and whose ``confidence`` is the mean confidence of
            those lines (``0.0`` when nothing is kept). Lines that are empty
            after filtering and stripping are skipped.

        Raises:
            EngineUnavailableError: If the PaddleOCR instance cannot be
                created.
            EngineProcessingError: If decoding the image or running/parsing
                the OCR output fails.
        """
        ocr = self._get_ocr()

        try:
            import numpy as np  # type: ignore[import-untyped]
            from PIL import Image

            # The context manager closes the decoder's handle; convert()
            # returns an independent image, so the array outlives the block.
            with Image.open(io.BytesIO(image_bytes)) as source:
                img_array = np.array(source.convert("RGB"))

            # PaddleOCR accepts numpy arrays
            results = ocr.ocr(img_array, cls=config.use_angle_cls)

            if not results or not results[0]:
                return OcrEngineResult(
                    text="",
                    confidence=0.0,
                    word_boxes=[],
                    engine_name=self.name,
                )

            # Build the whitelist lookup once instead of once per line.
            allowed = set(config.char_whitelist) if config.char_whitelist else None

            word_boxes: list[WordBox] = []
            texts: list[str] = []
            confidences: list[float] = []

            for line in results[0]:
                box_coords = line[0]  # [[x1,y1],[x2,y2],[x3,y3],[x4,y4]]
                text = line[1][0]
                conf = float(line[1][1])

                # Apply character whitelist filter if configured
                if allowed is not None:
                    text = "".join(ch for ch in text if ch in allowed)

                text = text.strip()
                if not text:
                    continue

                # Convert quadrilateral to bounding box
                x, y, width, height = self._bounding_box(box_coords)

                word_boxes.append(
                    WordBox(
                        text=text,
                        confidence=conf,
                        x=x,
                        y=y,
                        width=width,
                        height=height,
                    )
                )
                texts.append(text)
                confidences.append(conf)

            combined_text = " ".join(texts)
            avg_confidence = (
                sum(confidences) / len(confidences) if confidences else 0.0
            )

            return OcrEngineResult(
                text=combined_text,
                confidence=avg_confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )

        except (EngineUnavailableError, EngineProcessingError):
            # Already one of the engine's own error types; do not re-wrap.
            raise
        except Exception as exc:
            raise EngineProcessingError(
                f"PaddleOCR recognition failed: {exc}"
            ) from exc
|
||||
Reference in New Issue
Block a user