Introduce pluggable OcrEngine ABC with PaddleOCR PP-OCRv4 as primary engine and Tesseract wrapper for backward compatibility. Engine factory reads OCR_PRIMARY_ENGINE config to instantiate the correct engine. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
115 lines
3.6 KiB
Python
115 lines
3.6 KiB
Python
"""Tesseract engine wrapper for backward compatibility."""
|
|
|
|
import io
|
|
import logging
|
|
|
|
from app.config import settings
|
|
from app.engines.base_engine import (
|
|
EngineProcessingError,
|
|
EngineUnavailableError,
|
|
OcrConfig,
|
|
OcrEngine,
|
|
OcrEngineResult,
|
|
WordBox,
|
|
)
|
|
|
|
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class TesseractEngine(OcrEngine):
    """pytesseract wrapper conforming to the OcrEngine interface.

    The ``pytesseract`` module is imported lazily in ``__init__`` so that the
    rest of the application can be imported when the package is not
    installed; in that case construction raises ``EngineUnavailableError``.
    """

    def __init__(self) -> None:
        """Import pytesseract and point it at the configured binary.

        Raises:
            EngineUnavailableError: If ``pytesseract`` cannot be imported.
        """
        # Keep the try body limited to the only statement expected to raise
        # ImportError.
        try:
            import pytesseract  # type: ignore[import-untyped]
        except ImportError as exc:
            raise EngineUnavailableError(
                "pytesseract is not installed. "
                "Install with: pip install pytesseract"
            ) from exc

        # NOTE: this assigns a module-level attribute of pytesseract, so it
        # affects every user of the module in this process, not only this
        # instance.
        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
        self._pytesseract = pytesseract
        logger.info("TesseractEngine initialized (cmd=%s)", settings.tesseract_cmd)

    @property
    def name(self) -> str:
        """Return the engine identifier, ``"tesseract"``."""
        return "tesseract"

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run Tesseract OCR on image bytes.

        Args:
            image_bytes: Encoded image data, opened with ``PIL.Image.open``.
            config: Recognition options; translated into a Tesseract config
                string by ``_build_config``.

        Returns:
            An ``OcrEngineResult`` whose ``text`` is the recognized words
            joined by single spaces, whose ``confidence`` is the mean word
            confidence scaled to 0..1 (``0.0`` when no word was kept), and
            whose ``word_boxes`` holds one ``WordBox`` per kept word.

        Raises:
            EngineProcessingError: If any step fails (image decoding, the
                Tesseract call, or parsing of its output).
        """
        try:
            from PIL import Image

            # Close the decoded image deterministically once Tesseract has
            # consumed it.
            with Image.open(io.BytesIO(image_bytes)) as image:
                # Build Tesseract config string from OcrConfig
                tess_config = self._build_config(config)

                # Get word-level data
                ocr_data = self._pytesseract.image_to_data(
                    image,
                    config=tess_config,
                    output_type=self._pytesseract.Output.DICT,
                )

            word_boxes: list[WordBox] = []
            texts: list[str] = []
            confidences: list[float] = []

            for i, raw_text in enumerate(ocr_data["text"]):
                word = raw_text.strip()
                # Parse as float rather than int: depending on the Tesseract
                # and pytesseract versions the confidence may arrive as a
                # fractional string (e.g. "96.06"), which int() rejects.
                conf = float(ocr_data["conf"][i])
                # Skip blank entries and entries without a positive
                # confidence score.
                if not word or conf <= 0:
                    continue

                # Confidence is divided by 100 to land on a 0..1 scale.
                normalized_conf = conf / 100.0
                word_boxes.append(
                    WordBox(
                        text=word,
                        confidence=normalized_conf,
                        x=int(ocr_data["left"][i]),
                        y=int(ocr_data["top"][i]),
                        width=int(ocr_data["width"][i]),
                        height=int(ocr_data["height"][i]),
                    )
                )
                texts.append(word)
                confidences.append(normalized_conf)

            combined_text = " ".join(texts)
            avg_confidence = (
                sum(confidences) / len(confidences) if confidences else 0.0
            )

            return OcrEngineResult(
                text=combined_text,
                confidence=avg_confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )

        except (EngineUnavailableError, EngineProcessingError):
            # Already one of the engine's own error types; pass it through
            # unwrapped.
            raise
        except Exception as exc:
            # Boundary handler: present every other failure to callers as a
            # single engine-level error type, keeping the cause chained.
            raise EngineProcessingError(
                f"Tesseract recognition failed: {exc}"
            ) from exc

    def _build_config(self, config: OcrConfig) -> str:
        """Translate OcrConfig into a Tesseract CLI config string.

        Args:
            config: Options to translate. ``single_word`` takes precedence
                over ``single_line``; when neither is set the page
                segmentation mode comes from ``config.hints["psm"]``,
                defaulting to 6.

        Returns:
            A space-separated argument string, e.g.
            ``"--psm 7 -c tessedit_char_whitelist=0123456789"``.
        """
        import shlex

        # Page segmentation mode
        if config.single_word:
            psm = 8
        elif config.single_line:
            psm = 7
        else:
            # Default: assume uniform block of text
            psm = config.hints.get("psm", 6)
        parts: list[str] = [f"--psm {psm}"]

        # Character whitelist
        if config.char_whitelist:
            # The config string is later tokenized shell-style, so a
            # whitelist containing spaces or quotes must be quoted to stay a
            # single argument. Plain alphanumeric whitelists are returned
            # unchanged by shlex.quote.
            whitelist_arg = shlex.quote(
                f"tessedit_char_whitelist={config.char_whitelist}"
            )
            parts.append(f"-c {whitelist_arg}")

        return " ".join(parts)
|