diff --git a/ocr/app/config.py b/ocr/app/config.py
index 384ffdc..c222693 100644
--- a/ocr/app/config.py
+++ b/ocr/app/config.py
@@ -11,6 +11,12 @@ class Settings:
         self.port: int = int(os.getenv("PORT", "8000"))
         self.tesseract_cmd: str = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
 
+        # OCR engine configuration
+        self.ocr_primary_engine: str = os.getenv("OCR_PRIMARY_ENGINE", "paddleocr")
+        self.ocr_confidence_threshold: float = float(
+            os.getenv("OCR_CONFIDENCE_THRESHOLD", "0.6")
+        )
+
         # Redis configuration for job queue
         self.redis_host: str = os.getenv("REDIS_HOST", "mvp-redis")
         self.redis_port: int = int(os.getenv("REDIS_PORT", "6379"))
diff --git a/ocr/app/engines/__init__.py b/ocr/app/engines/__init__.py
new file mode 100644
index 0000000..dcc565b
--- /dev/null
+++ b/ocr/app/engines/__init__.py
@@ -0,0 +1,27 @@
+"""OCR engine abstraction layer.
+
+Provides a pluggable engine interface for OCR processing,
+decoupling extractors from specific OCR libraries.
+"""
+
+from app.engines.base_engine import (
+    EngineError,
+    EngineProcessingError,
+    EngineUnavailableError,
+    OcrConfig,
+    OcrEngine,
+    OcrEngineResult,
+    WordBox,
+)
+from app.engines.engine_factory import create_engine
+
+__all__ = [
+    "OcrEngine",
+    "OcrConfig",
+    "OcrEngineResult",
+    "WordBox",
+    "EngineError",
+    "EngineUnavailableError",
+    "EngineProcessingError",
+    "create_engine",
+]
diff --git a/ocr/app/engines/base_engine.py b/ocr/app/engines/base_engine.py
new file mode 100644
index 0000000..ddca084
--- /dev/null
+++ b/ocr/app/engines/base_engine.py
@@ -0,0 +1,88 @@
+"""OCR engine abstract base class and shared data types."""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any
+
+
+# --- Exception hierarchy ---
+
+
+class EngineError(Exception):
+    """Base exception for all OCR engine errors."""
+
+
+class EngineUnavailableError(EngineError):
+    """Raised when an engine cannot be initialized (missing binary, bad config)."""
+
+
+class EngineProcessingError(EngineError):
+    """Raised when an engine fails to process an image."""
+
+
+# --- Data types ---
+
+
+@dataclass
+class WordBox:
+    """A single recognized word with position and confidence."""
+
+    text: str
+    confidence: float  # 0.0-1.0
+    x: int = 0
+    y: int = 0
+    width: int = 0
+    height: int = 0
+
+
+@dataclass
+class OcrConfig:
+    """Engine-agnostic OCR configuration.
+
+    Common fields cover the most frequent needs. Engine-specific
+    parameters go into ``hints`` so the interface stays stable.
+    """
+
+    char_whitelist: str | None = None  # e.g. VIN: "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
+    single_line: bool = False  # Treat image as a single text line
+    single_word: bool = False  # Treat image as a single word
+    use_angle_cls: bool = True  # Enable angle classification (PaddleOCR)
+    hints: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class OcrEngineResult:
+    """Normalized result returned by every engine implementation."""
+
+    text: str
+    confidence: float  # 0.0-1.0
+    word_boxes: list[WordBox]
+    engine_name: str  # "paddleocr", "tesseract", "google_vision"
+
+
+# --- Abstract base ---
+
+
+class OcrEngine(ABC):
+    """Abstract base class that all OCR engines must implement."""
+
+    @abstractmethod
+    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
+        """Run OCR on preprocessed image bytes.
+
+        Args:
+            image_bytes: Raw image bytes (PNG/JPEG).
+            config: Engine-agnostic configuration.
+
+        Returns:
+            Normalized OCR result.
+
+        Raises:
+            EngineProcessingError: If recognition fails.
+            EngineUnavailableError: If the engine is not ready.
+        """
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Short identifier used in OcrEngineResult.engine_name."""
diff --git a/ocr/app/engines/engine_factory.py b/ocr/app/engines/engine_factory.py
new file mode 100644
index 0000000..dad2f16
--- /dev/null
+++ b/ocr/app/engines/engine_factory.py
@@ -0,0 +1,53 @@
+"""Factory function for creating OCR engine instances from configuration."""
+
+import importlib
+import logging
+
+from app.config import settings
+from app.engines.base_engine import EngineUnavailableError, OcrEngine
+
+logger = logging.getLogger(__name__)
+
+# Valid engine identifiers
+_ENGINE_REGISTRY: dict[str, str] = {
+    "paddleocr": "app.engines.paddle_engine.PaddleOcrEngine",
+    "tesseract": "app.engines.tesseract_engine.TesseractEngine",
+}
+
+
+def create_engine(engine_name: str | None = None) -> OcrEngine:
+    """Instantiate an OCR engine by name (defaults to config value).
+
+    Args:
+        engine_name: Engine identifier ("paddleocr", "tesseract").
+            Falls back to ``settings.ocr_primary_engine``.
+
+    Returns:
+        Initialized OcrEngine instance.
+
+    Raises:
+        EngineUnavailableError: If the engine cannot be loaded or initialized.
+    """
+    name = (engine_name or settings.ocr_primary_engine).lower().strip()
+
+    if name not in _ENGINE_REGISTRY:
+        raise EngineUnavailableError(
+            f"Unknown engine '{name}'. Available: {list(_ENGINE_REGISTRY)}"
+        )
+
+    module_path, class_name = _ENGINE_REGISTRY[name].rsplit(".", 1)
+
+    try:
+        # Import by dotted path so only the selected engine's module is loaded.
+        module = importlib.import_module(module_path)
+        engine_cls = getattr(module, class_name)
+        engine: OcrEngine = engine_cls()
+    except EngineUnavailableError:
+        raise
+    except Exception as exc:
+        raise EngineUnavailableError(
+            f"Failed to create engine '{name}': {exc}"
+        ) from exc
+
+    logger.info("Created OCR engine: %s", name)
+    return engine
diff --git a/ocr/app/engines/paddle_engine.py b/ocr/app/engines/paddle_engine.py
new file mode 100644
index 0000000..41433f1
--- /dev/null
+++ b/ocr/app/engines/paddle_engine.py
@@ -0,0 +1,147 @@
+"""PaddleOCR engine wrapper using PP-OCRv4 models."""
+
+import io
+import logging
+from typing import Any
+
+from app.engines.base_engine import (
+    EngineProcessingError,
+    EngineUnavailableError,
+    OcrConfig,
+    OcrEngine,
+    OcrEngineResult,
+    WordBox,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class PaddleOcrEngine(OcrEngine):
+    """PaddleOCR PP-OCRv4 engine with angle classification, CPU-only."""
+
+    def __init__(self) -> None:
+        self._ocr: Any | None = None
+
+    @property
+    def name(self) -> str:
+        return "paddleocr"
+
+    def _get_ocr(self) -> Any:
+        """Lazy-initialize PaddleOCR instance on first use.
+
+        Raises:
+            EngineUnavailableError: If paddleocr is missing or fails to initialize.
+        """
+        if self._ocr is not None:
+            return self._ocr
+        try:
+            from paddleocr import PaddleOCR  # type: ignore[import-untyped]
+
+            # NOTE: use_gpu/show_log (and the cls= argument used in recognize)
+            # are PaddleOCR 2.x API; requirements.txt caps paddleocr below 3.0.
+            self._ocr = PaddleOCR(
+                use_angle_cls=True,
+                lang="en",
+                use_gpu=False,
+                show_log=False,
+            )
+            logger.info("PaddleOCR PP-OCRv4 initialized (CPU, angle_cls=True)")
+            return self._ocr
+        except ImportError as exc:
+            raise EngineUnavailableError(
+                "paddleocr is not installed. "
+                "Install with: pip install paddlepaddle paddleocr"
+            ) from exc
+        except Exception as exc:
+            raise EngineUnavailableError(
+                f"Failed to initialize PaddleOCR: {exc}"
+            ) from exc
+
+    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
+        """Run PaddleOCR on image bytes.
+
+        PaddleOCR returns: list of pages, each page is a list of
+        ``[[box_coords], (text, confidence)]`` entries.
+
+        Raises:
+            EngineUnavailableError: If PaddleOCR cannot be imported or initialized.
+            EngineProcessingError: If decoding or recognition fails.
+        """
+        ocr = self._get_ocr()
+
+        try:
+            import numpy as np  # type: ignore[import-untyped]
+            from PIL import Image
+
+            # Context manager releases the decoded image once it is copied out.
+            with Image.open(io.BytesIO(image_bytes)) as image:
+                img_array = np.array(image.convert("RGB"))
+
+            # PaddleOCR accepts numpy arrays
+            results = ocr.ocr(img_array, cls=config.use_angle_cls)
+
+            if not results or not results[0]:
+                return OcrEngineResult(
+                    text="",
+                    confidence=0.0,
+                    word_boxes=[],
+                    engine_name=self.name,
+                )
+
+            word_boxes: list[WordBox] = []
+            texts: list[str] = []
+            confidences: list[float] = []
+
+            # Build the whitelist lookup once; it does not change per line.
+            allowed = set(config.char_whitelist) if config.char_whitelist else None
+
+            for line in results[0]:
+                box_coords = line[0]  # [[x1,y1],[x2,y2],[x3,y3],[x4,y4]]
+                text = line[1][0]
+                conf = float(line[1][1])
+
+                # Apply character whitelist filter if configured
+                if allowed is not None:
+                    text = "".join(ch for ch in text if ch in allowed)
+
+                text = text.strip()
+                if not text:
+                    continue
+
+                # Convert quadrilateral to bounding box
+                xs = [pt[0] for pt in box_coords]
+                ys = [pt[1] for pt in box_coords]
+                x_min, y_min = int(min(xs)), int(min(ys))
+                x_max, y_max = int(max(xs)), int(max(ys))
+
+                word_boxes.append(
+                    WordBox(
+                        text=text,
+                        confidence=conf,
+                        x=x_min,
+                        y=y_min,
+                        width=x_max - x_min,
+                        height=y_max - y_min,
+                    )
+                )
+                texts.append(text)
+                confidences.append(conf)
+
+            combined_text = " ".join(texts)
+            avg_confidence = (
+                sum(confidences) / len(confidences) if confidences else 0.0
+            )
+
+            return OcrEngineResult(
+                text=combined_text,
+                confidence=avg_confidence,
+                word_boxes=word_boxes,
+                engine_name=self.name,
+            )
+
+        except (EngineUnavailableError, EngineProcessingError):
+            raise
+        except Exception as exc:
+            raise EngineProcessingError(
+                f"PaddleOCR recognition failed: {exc}"
+            ) from exc
diff --git a/ocr/app/engines/tesseract_engine.py b/ocr/app/engines/tesseract_engine.py
new file mode 100644
index 0000000..02108ec
--- /dev/null
+++ b/ocr/app/engines/tesseract_engine.py
@@ -0,0 +1,131 @@
+"""Tesseract engine wrapper for backward compatibility."""
+
+import io
+import logging
+
+from app.config import settings
+from app.engines.base_engine import (
+    EngineProcessingError,
+    EngineUnavailableError,
+    OcrConfig,
+    OcrEngine,
+    OcrEngineResult,
+    WordBox,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class TesseractEngine(OcrEngine):
+    """pytesseract wrapper conforming to the OcrEngine interface."""
+
+    def __init__(self) -> None:
+        try:
+            import pytesseract  # type: ignore[import-untyped]
+
+            pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+            self._pytesseract = pytesseract
+            logger.info("TesseractEngine initialized (cmd=%s)", settings.tesseract_cmd)
+        except ImportError as exc:
+            raise EngineUnavailableError(
+                "pytesseract is not installed. "
+                "Install with: pip install pytesseract"
+            ) from exc
+
+    @property
+    def name(self) -> str:
+        return "tesseract"
+
+    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
+        """Run Tesseract OCR on image bytes.
+
+        Raises:
+            EngineUnavailableError: If the Tesseract binary cannot be found.
+            EngineProcessingError: If decoding or recognition fails.
+        """
+        try:
+            from PIL import Image
+
+            # Build Tesseract config string from OcrConfig
+            tess_config = self._build_config(config)
+
+            # Get word-level data; the context manager closes the decoded image.
+            with Image.open(io.BytesIO(image_bytes)) as image:
+                ocr_data = self._pytesseract.image_to_data(
+                    image,
+                    config=tess_config,
+                    output_type=self._pytesseract.Output.DICT,
+                )
+
+            word_boxes: list[WordBox] = []
+            texts: list[str] = []
+            confidences: list[float] = []
+
+            for i, raw_text in enumerate(ocr_data["text"]):
+                conf = int(ocr_data["conf"][i])
+                text = raw_text.strip()
+                # Skip empty cells and rows without a positive confidence.
+                if not text or conf <= 0:
+                    continue
+
+                normalized_conf = conf / 100.0
+                word_boxes.append(
+                    WordBox(
+                        text=text,
+                        confidence=normalized_conf,
+                        x=int(ocr_data["left"][i]),
+                        y=int(ocr_data["top"][i]),
+                        width=int(ocr_data["width"][i]),
+                        height=int(ocr_data["height"][i]),
+                    )
+                )
+                texts.append(text)
+                confidences.append(normalized_conf)
+
+            combined_text = " ".join(texts)
+            avg_confidence = (
+                sum(confidences) / len(confidences) if confidences else 0.0
+            )
+
+            return OcrEngineResult(
+                text=combined_text,
+                confidence=avg_confidence,
+                word_boxes=word_boxes,
+                engine_name=self.name,
+            )
+
+        except (EngineUnavailableError, EngineProcessingError):
+            raise
+        except self._pytesseract.TesseractNotFoundError as exc:
+            # A missing binary is an availability problem, not a per-image failure.
+            raise EngineUnavailableError(
+                f"Tesseract binary not found (cmd={settings.tesseract_cmd}): {exc}"
+            ) from exc
+        except Exception as exc:
+            raise EngineProcessingError(
+                f"Tesseract recognition failed: {exc}"
+            ) from exc
+
+    def _build_config(self, config: OcrConfig) -> str:
+        """Translate OcrConfig into a Tesseract CLI config string.
+
+        ``single_word`` takes precedence over ``single_line``; ``hints["psm"]``
+        is consulted only when neither flag is set.
+        """
+        parts: list[str] = []
+
+        # Page segmentation mode
+        if config.single_word:
+            parts.append("--psm 8")
+        elif config.single_line:
+            parts.append("--psm 7")
+        else:
+            # Default: assume uniform block of text
+            psm = config.hints.get("psm", 6)
+            parts.append(f"--psm {psm}")
+
+        # Character whitelist
+        if config.char_whitelist:
+            parts.append(f"-c tessedit_char_whitelist={config.char_whitelist}")
+
+        return " ".join(parts)
diff --git a/ocr/requirements.txt b/ocr/requirements.txt
index 8138d85..fb3c268 100644
--- a/ocr/requirements.txt
+++ b/ocr/requirements.txt
@@ -15,6 +15,8 @@ numpy>=1.24.0
 
 # OCR Engines
 pytesseract>=0.3.10
+paddlepaddle>=2.6.0
+paddleocr>=2.8.0,<3.0  # engine code uses the 2.x API (use_gpu, show_log, cls=)
 
 # PDF Processing
 PyMuPDF>=1.23.0