feat: add OCR engine abstraction layer (refs #116)
Introduce a pluggable OcrEngine ABC, with PaddleOCR PP-OCRv4 as the primary engine and a Tesseract wrapper for backward compatibility. The engine factory reads the OCR_PRIMARY_ENGINE config value to instantiate the correct engine.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -11,6 +11,12 @@ class Settings:
|
|||||||
self.port: int = int(os.getenv("PORT", "8000"))
|
self.port: int = int(os.getenv("PORT", "8000"))
|
||||||
self.tesseract_cmd: str = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
|
self.tesseract_cmd: str = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
|
||||||
|
|
||||||
|
# OCR engine configuration
|
||||||
|
self.ocr_primary_engine: str = os.getenv("OCR_PRIMARY_ENGINE", "paddleocr")
|
||||||
|
self.ocr_confidence_threshold: float = float(
|
||||||
|
os.getenv("OCR_CONFIDENCE_THRESHOLD", "0.6")
|
||||||
|
)
|
||||||
|
|
||||||
# Redis configuration for job queue
|
# Redis configuration for job queue
|
||||||
self.redis_host: str = os.getenv("REDIS_HOST", "mvp-redis")
|
self.redis_host: str = os.getenv("REDIS_HOST", "mvp-redis")
|
||||||
self.redis_port: int = int(os.getenv("REDIS_PORT", "6379"))
|
self.redis_port: int = int(os.getenv("REDIS_PORT", "6379"))
|
||||||
|
|||||||
27
ocr/app/engines/__init__.py
Normal file
27
ocr/app/engines/__init__.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
"""OCR engine abstraction layer.
|
||||||
|
|
||||||
|
Provides a pluggable engine interface for OCR processing,
|
||||||
|
decoupling extractors from specific OCR libraries.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from app.engines.base_engine import (
|
||||||
|
EngineError,
|
||||||
|
EngineProcessingError,
|
||||||
|
EngineUnavailableError,
|
||||||
|
OcrConfig,
|
||||||
|
OcrEngine,
|
||||||
|
OcrEngineResult,
|
||||||
|
WordBox,
|
||||||
|
)
|
||||||
|
from app.engines.engine_factory import create_engine
|
||||||
|
|
||||||
|
# Public API of the engines package: the abstract interface, its shared data
# types, the exception hierarchy, and the factory, all re-exported from
# app.engines.base_engine and app.engines.engine_factory.
__all__ = [
    "OcrEngine",
    "OcrConfig",
    "OcrEngineResult",
    "WordBox",
    "EngineError",
    "EngineUnavailableError",
    "EngineProcessingError",
    "create_engine",
]
|
||||||
88
ocr/app/engines/base_engine.py
Normal file
88
ocr/app/engines/base_engine.py
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
"""OCR engine abstract base class and shared data types."""
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
# --- Exception hierarchy ---
|
||||||
|
|
||||||
|
|
||||||
|
class EngineError(Exception):
    """Base exception for all OCR engine errors.

    ``EngineUnavailableError`` and ``EngineProcessingError`` both derive from
    this class, so callers can catch ``EngineError`` to handle either kind.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class EngineUnavailableError(EngineError):
    """Raised when an engine cannot be initialized (missing binary, bad config).

    Also used for an unknown engine name and for a backing OCR library that
    is not installed or fails to load.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class EngineProcessingError(EngineError):
    """Raised when an engine fails to process an image.

    Engine ``recognize`` implementations wrap unexpected exceptions from the
    underlying OCR library in this type, chaining the original as the cause.
    """
|
||||||
|
|
||||||
|
|
||||||
|
# --- Data types ---
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class WordBox:
    """A single recognized word with position and confidence.

    The position is an axis-aligned rectangle given by its left/top edge and
    its extent. All geometry fields default to 0.
    """

    text: str  # Recognized text (engines store it stripped of outer whitespace)
    confidence: float  # 0.0-1.0
    x: int = 0  # Left edge of the box (presumably pixels -- confirm with callers)
    y: int = 0  # Top edge of the box
    width: int = 0  # Horizontal extent of the box
    height: int = 0  # Vertical extent of the box
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class OcrConfig:
    """Engine-agnostic OCR configuration.

    Common fields cover the most frequent needs. Engine-specific
    parameters go into ``hints`` so the interface stays stable.

    Not every engine consults every field; each engine translates the
    fields it understands into its own options.
    """

    # Characters allowed in the output; None means no restriction.
    char_whitelist: str | None = None  # e.g. VIN: "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
    single_line: bool = False  # Treat image as a single text line
    single_word: bool = False  # Treat image as a single word
    use_angle_cls: bool = True  # Enable angle classification (PaddleOCR)
    # Free-form engine-specific options (the Tesseract engine reads "psm").
    # default_factory gives each instance its own dict.
    hints: dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class OcrEngineResult:
    """Normalized result returned by every engine implementation.

    Gives callers one shape to consume regardless of which OCR library
    produced the output.
    """

    text: str  # Full recognized text (engines join the box texts with spaces)
    confidence: float  # 0.0-1.0
    word_boxes: list[WordBox]  # Per-box detail; empty when nothing was recognized
    engine_name: str  # "paddleocr", "tesseract", "google_vision"
|
||||||
|
|
||||||
|
|
||||||
|
# --- Abstract base ---
|
||||||
|
|
||||||
|
|
||||||
|
class OcrEngine(ABC):
    """Abstract base class that all OCR engines must implement.

    Concrete engines provide ``recognize`` and the ``name`` property. Both
    are abstract, so a subclass missing either one cannot be instantiated.
    """

    @abstractmethod
    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run OCR on preprocessed image bytes.

        Args:
            image_bytes: Raw image bytes (PNG/JPEG).
            config: Engine-agnostic configuration.

        Returns:
            Normalized OCR result.

        Raises:
            EngineProcessingError: If recognition fails.
            EngineUnavailableError: If the engine is not ready.
        """

    @property
    @abstractmethod
    def name(self) -> str:
        """Short identifier used in OcrEngineResult.engine_name."""
|
||||||
52
ocr/app/engines/engine_factory.py
Normal file
52
ocr/app/engines/engine_factory.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
"""Factory function for creating OCR engine instances from configuration."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from app.config import settings
|
||||||
|
from app.engines.base_engine import EngineUnavailableError, OcrEngine
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Valid engine identifiers, each mapped to the dotted import path
# ("package.module.ClassName") of the class implementing it. create_engine()
# imports the module only when that engine is requested.
_ENGINE_REGISTRY: dict[str, str] = {
    "paddleocr": "app.engines.paddle_engine.PaddleOcrEngine",
    "tesseract": "app.engines.tesseract_engine.TesseractEngine",
}
|
||||||
|
|
||||||
|
|
||||||
|
def create_engine(engine_name: str | None = None) -> OcrEngine:
    """Build the OCR engine registered under ``engine_name``.

    The identifier is lower-cased and stripped before lookup, so values such
    as ``" Tesseract "`` resolve to ``"tesseract"``.

    Args:
        engine_name: Engine identifier ("paddleocr", "tesseract"). When
            omitted or empty, ``settings.ocr_primary_engine`` is used.

    Returns:
        A freshly constructed OcrEngine instance.

    Raises:
        EngineUnavailableError: If the identifier is not registered, or the
            engine module/class cannot be imported or instantiated.
    """
    requested = engine_name if engine_name else settings.ocr_primary_engine
    name = requested.lower().strip()

    dotted_path = _ENGINE_REGISTRY.get(name)
    if dotted_path is None:
        raise EngineUnavailableError(
            f"Unknown engine '{name}'. Available: {list(_ENGINE_REGISTRY.keys())}"
        )

    module_path, class_name = dotted_path.rsplit(".", 1)

    try:
        import importlib

        engine_module = importlib.import_module(module_path)
        engine_cls = getattr(engine_module, class_name)
        engine: OcrEngine = engine_cls()
        logger.info("Created OCR engine: %s", name)
        return engine
    except EngineUnavailableError:
        # Already the right type (raised by the engine constructor); keep it
        # instead of wrapping it a second time.
        raise
    except Exception as exc:
        raise EngineUnavailableError(
            f"Failed to create engine '{name}': {exc}"
        ) from exc
|
||||||
133
ocr/app/engines/paddle_engine.py
Normal file
133
ocr/app/engines/paddle_engine.py
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
"""PaddleOCR engine wrapper using PP-OCRv4 models."""
|
||||||
|
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from app.engines.base_engine import (
|
||||||
|
EngineProcessingError,
|
||||||
|
EngineUnavailableError,
|
||||||
|
OcrConfig,
|
||||||
|
OcrEngine,
|
||||||
|
OcrEngineResult,
|
||||||
|
WordBox,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PaddleOcrEngine(OcrEngine):
    """PaddleOCR PP-OCRv4 engine with angle classification, CPU-only.

    The underlying ``PaddleOCR`` object is created lazily on the first
    ``recognize`` call. A missing or broken paddleocr installation is
    therefore reported at first use, not when the engine is constructed.
    """

    def __init__(self) -> None:
        # Populated by _get_ocr() on first use.
        self._ocr: Any | None = None

    @property
    def name(self) -> str:
        """Identifier reported in ``OcrEngineResult.engine_name``."""
        return "paddleocr"

    def _get_ocr(self) -> Any:
        """Lazy-initialize PaddleOCR instance on first use.

        Returns:
            The cached ``PaddleOCR`` instance.

        Raises:
            EngineUnavailableError: If paddleocr is not installed or its
                constructor fails.
        """
        if self._ocr is not None:
            return self._ocr
        try:
            from paddleocr import PaddleOCR  # type: ignore[import-untyped]

            self._ocr = PaddleOCR(
                use_angle_cls=True,
                lang="en",
                use_gpu=False,
                show_log=False,
            )
            logger.info("PaddleOCR PP-OCRv4 initialized (CPU, angle_cls=True)")
            return self._ocr
        except ImportError as exc:
            raise EngineUnavailableError(
                "paddleocr is not installed. "
                "Install with: pip install paddlepaddle paddleocr"
            ) from exc
        except Exception as exc:
            raise EngineUnavailableError(
                f"Failed to initialize PaddleOCR: {exc}"
            ) from exc

    @staticmethod
    def _quad_to_bbox(box_coords: Any) -> tuple[int, int, int, int]:
        """Convert a quadrilateral into an axis-aligned (x, y, width, height)."""
        xs = [pt[0] for pt in box_coords]
        ys = [pt[1] for pt in box_coords]
        x_min, y_min = int(min(xs)), int(min(ys))
        x_max, y_max = int(max(xs)), int(max(ys))
        return x_min, y_min, x_max - x_min, y_max - y_min

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run PaddleOCR on image bytes.

        PaddleOCR returns: list of pages, each page is a list of
        ``[[box_coords], (text, confidence)]`` entries. Only the first page
        is read.

        Args:
            image_bytes: Encoded image data readable by Pillow.
            config: ``use_angle_cls`` is forwarded to PaddleOCR and
                ``char_whitelist`` is applied as a post-filter on each
                recognized text. Other fields are not consulted here.

        Returns:
            Result whose text is the space-joined box texts and whose
            confidence is the mean box confidence (0.0 when nothing remains).

        Raises:
            EngineUnavailableError: If PaddleOCR cannot be initialized.
            EngineProcessingError: If decoding or recognition fails.
        """
        ocr = self._get_ocr()

        try:
            import numpy as np  # type: ignore[import-untyped]
            from PIL import Image

            # Close the decoded source image as soon as the RGB array exists.
            with Image.open(io.BytesIO(image_bytes)) as source:
                img_array = np.array(source.convert("RGB"))

            # PaddleOCR accepts numpy arrays
            results = ocr.ocr(img_array, cls=config.use_angle_cls)

            if not results or not results[0]:
                return OcrEngineResult(
                    text="",
                    confidence=0.0,
                    word_boxes=[],
                    engine_name=self.name,
                )

            # Build the whitelist set once instead of once per detection.
            allowed = set(config.char_whitelist) if config.char_whitelist else None

            word_boxes: list[WordBox] = []
            for line in results[0]:
                box_coords = line[0]  # [[x1,y1],[x2,y2],[x3,y3],[x4,y4]]
                text = line[1][0]
                conf = float(line[1][1])

                # Drops every character outside the whitelist, including
                # spaces unless the whitelist contains one.
                if allowed is not None:
                    text = "".join(ch for ch in text if ch in allowed)

                text = text.strip()
                if not text:
                    continue

                x, y, width, height = self._quad_to_bbox(box_coords)
                word_boxes.append(
                    WordBox(
                        text=text,
                        confidence=conf,
                        x=x,
                        y=y,
                        width=width,
                        height=height,
                    )
                )

            combined_text = " ".join(box.text for box in word_boxes)
            avg_confidence = (
                sum(box.confidence for box in word_boxes) / len(word_boxes)
                if word_boxes
                else 0.0
            )

            return OcrEngineResult(
                text=combined_text,
                confidence=avg_confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )

        except (EngineUnavailableError, EngineProcessingError):
            raise
        except Exception as exc:
            raise EngineProcessingError(
                f"PaddleOCR recognition failed: {exc}"
            ) from exc
|
||||||
114
ocr/app/engines/tesseract_engine.py
Normal file
114
ocr/app/engines/tesseract_engine.py
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
"""Tesseract engine wrapper for backward compatibility."""
|
||||||
|
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from app.config import settings
|
||||||
|
from app.engines.base_engine import (
|
||||||
|
EngineProcessingError,
|
||||||
|
EngineUnavailableError,
|
||||||
|
OcrConfig,
|
||||||
|
OcrEngine,
|
||||||
|
OcrEngineResult,
|
||||||
|
WordBox,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class TesseractEngine(OcrEngine):
    """pytesseract wrapper conforming to the OcrEngine interface."""

    def __init__(self) -> None:
        """Import pytesseract and point it at the configured binary.

        Raises:
            EngineUnavailableError: If pytesseract is not installed.
        """
        try:
            import pytesseract  # type: ignore[import-untyped]

            pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
            self._pytesseract = pytesseract
            logger.info("TesseractEngine initialized (cmd=%s)", settings.tesseract_cmd)
        except ImportError as exc:
            raise EngineUnavailableError(
                "pytesseract is not installed. "
                "Install with: pip install pytesseract"
            ) from exc

    @property
    def name(self) -> str:
        """Identifier reported in ``OcrEngineResult.engine_name``."""
        return "tesseract"

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run Tesseract OCR on image bytes.

        Args:
            image_bytes: Encoded image data readable by Pillow.
            config: Translated into a Tesseract config string by
                ``_build_config``.

        Returns:
            Result built from entries with non-blank text and a confidence
            above 0. Text is the space-joined words; confidence is the mean
            word confidence scaled to 0.0-1.0 (0.0 when no word qualifies).

        Raises:
            EngineProcessingError: If decoding or recognition fails.
        """
        try:
            from PIL import Image

            # Keep the image open only for the duration of the OCR call.
            with Image.open(io.BytesIO(image_bytes)) as image:
                # Build Tesseract config string from OcrConfig
                tess_config = self._build_config(config)

                # Get word-level data
                ocr_data = self._pytesseract.image_to_data(
                    image,
                    config=tess_config,
                    output_type=self._pytesseract.Output.DICT,
                )

            word_boxes: list[WordBox] = []
            texts: list[str] = []
            confidences: list[float] = []

            for i, text in enumerate(ocr_data["text"]):
                # Confidence may arrive as an int, or (depending on the
                # Tesseract/pytesseract version) as a float-formatted string
                # such as "96.5", which plain int() rejects. Going through
                # float() accepts both and still truncates to an integer.
                conf = int(float(ocr_data["conf"][i]))
                word = text.strip()
                if word and conf > 0:
                    normalized_conf = conf / 100.0
                    word_boxes.append(
                        WordBox(
                            text=word,
                            confidence=normalized_conf,
                            x=int(ocr_data["left"][i]),
                            y=int(ocr_data["top"][i]),
                            width=int(ocr_data["width"][i]),
                            height=int(ocr_data["height"][i]),
                        )
                    )
                    texts.append(word)
                    confidences.append(normalized_conf)

            combined_text = " ".join(texts)
            avg_confidence = (
                sum(confidences) / len(confidences) if confidences else 0.0
            )

            return OcrEngineResult(
                text=combined_text,
                confidence=avg_confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )

        except (EngineUnavailableError, EngineProcessingError):
            raise
        except Exception as exc:
            raise EngineProcessingError(
                f"Tesseract recognition failed: {exc}"
            ) from exc

    def _build_config(self, config: OcrConfig) -> str:
        """Translate OcrConfig into a Tesseract CLI config string.

        ``single_word`` takes precedence over ``single_line``. The ``psm``
        hint is consulted only when neither flag is set.
        """
        parts: list[str] = []

        # Page segmentation mode
        if config.single_word:
            parts.append("--psm 8")
        elif config.single_line:
            parts.append("--psm 7")
        else:
            # Default: assume uniform block of text
            psm = config.hints.get("psm", 6)
            parts.append(f"--psm {psm}")

        # Character whitelist
        if config.char_whitelist:
            parts.append(f"-c tessedit_char_whitelist={config.char_whitelist}")

        return " ".join(parts)
|
||||||
@@ -15,6 +15,8 @@ numpy>=1.24.0
|
|||||||
|
|
||||||
# OCR Engines
|
# OCR Engines
|
||||||
pytesseract>=0.3.10
|
pytesseract>=0.3.10
|
||||||
|
paddlepaddle>=2.6.0
|
||||||
|
paddleocr>=2.8.0
|
||||||
|
|
||||||
# PDF Processing
|
# PDF Processing
|
||||||
PyMuPDF>=1.23.0
|
PyMuPDF>=1.23.0
|
||||||
|
|||||||
Reference in New Issue
Block a user