feat: add OCR engine abstraction layer (refs #116)

Introduce pluggable OcrEngine ABC with PaddleOCR PP-OCRv4 as primary
engine and Tesseract wrapper for backward compatibility. Engine factory
reads OCR_PRIMARY_ENGINE config to instantiate the correct engine.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-07 10:47:40 -06:00
parent 6b0c18a41c
commit ebc633fb36
7 changed files with 422 additions and 0 deletions

View File

@@ -0,0 +1,114 @@
"""Tesseract engine wrapper for backward compatibility."""
import io
import logging
from app.config import settings
from app.engines.base_engine import (
EngineProcessingError,
EngineUnavailableError,
OcrConfig,
OcrEngine,
OcrEngineResult,
WordBox,
)
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class TesseractEngine(OcrEngine):
    """pytesseract wrapper conforming to the OcrEngine interface.

    The pytesseract package is imported lazily in ``__init__`` so that this
    module can be imported even when pytesseract is not installed; the
    failure surfaces as ``EngineUnavailableError`` at construction time.
    """

    def __init__(self) -> None:
        """Import pytesseract and point it at the configured binary.

        Raises:
            EngineUnavailableError: If the pytesseract package cannot be
                imported.
        """
        # Keep the try body down to the import itself so that only a missing
        # package is reported as "not installed".
        try:
            import pytesseract  # type: ignore[import-untyped]
        except ImportError as exc:
            raise EngineUnavailableError(
                "pytesseract is not installed. "
                "Install with: pip install pytesseract"
            ) from exc

        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
        self._pytesseract = pytesseract
        logger.info("TesseractEngine initialized (cmd=%s)", settings.tesseract_cmd)

    @property
    def name(self) -> str:
        """Return the engine identifier, ``"tesseract"``."""
        return "tesseract"

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run Tesseract OCR on image bytes.

        Args:
            image_bytes: Encoded image data; it is opened with
                ``PIL.Image.open``.
            config: Recognition options, translated into a Tesseract config
                string by ``_build_config``.

        Returns:
            An ``OcrEngineResult`` whose ``text`` is the accepted words joined
            by single spaces, whose ``confidence`` is the mean of the accepted
            words' confidences scaled to 0.0-1.0 (0.0 when no word was
            accepted), and whose ``word_boxes`` holds one ``WordBox`` per
            accepted word.

        Raises:
            EngineProcessingError: If decoding the image or running Tesseract
                fails for any reason.
        """
        try:
            from PIL import Image

            # Close the decoded image as soon as Tesseract is done with it.
            with Image.open(io.BytesIO(image_bytes)) as image:
                # Build Tesseract config string from OcrConfig
                tess_config = self._build_config(config)

                # Get word-level data
                ocr_data = self._pytesseract.image_to_data(
                    image,
                    config=tess_config,
                    output_type=self._pytesseract.Output.DICT,
                )

            word_boxes: list[WordBox] = []
            texts: list[str] = []
            confidences: list[float] = []

            for i, raw_text in enumerate(ocr_data["text"]):
                word = raw_text.strip()
                # Parse with float(): depending on the Tesseract/pytesseract
                # versions the confidence may arrive as an int, a float, or a
                # string such as "96.063751", which int() would reject.
                conf = float(ocr_data["conf"][i])

                # Skip blank entries and rows without a positive confidence
                # (Tesseract reports -1 for rows that are not words).
                if not word or conf <= 0:
                    continue

                normalized_conf = conf / 100.0
                word_boxes.append(
                    WordBox(
                        text=word,
                        confidence=normalized_conf,
                        x=int(ocr_data["left"][i]),
                        y=int(ocr_data["top"][i]),
                        width=int(ocr_data["width"][i]),
                        height=int(ocr_data["height"][i]),
                    )
                )
                texts.append(word)
                confidences.append(normalized_conf)

            combined_text = " ".join(texts)
            avg_confidence = (
                sum(confidences) / len(confidences) if confidences else 0.0
            )

            return OcrEngineResult(
                text=combined_text,
                confidence=avg_confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )
        except (EngineUnavailableError, EngineProcessingError):
            # Already one of the engine's own error types; pass it through.
            raise
        except Exception as exc:
            # Boundary: wrap anything else (PIL decode errors, Tesseract
            # failures, malformed output) in the engine's error type.
            raise EngineProcessingError(
                f"Tesseract recognition failed: {exc}"
            ) from exc

    def _build_config(self, config: OcrConfig) -> str:
        """Translate OcrConfig into a Tesseract CLI config string.

        Args:
            config: Options to translate. Reads ``single_word``,
                ``single_line``, ``hints`` and ``char_whitelist``.

        Returns:
            Space-separated Tesseract arguments: always a ``--psm`` option,
            followed by a ``tessedit_char_whitelist`` setting when a
            whitelist is given.
        """
        parts: list[str] = []

        # Page segmentation mode: single_word takes precedence over
        # single_line; otherwise the "psm" hint is used.
        if config.single_word:
            parts.append("--psm 8")
        elif config.single_line:
            parts.append("--psm 7")
        else:
            # Default: assume uniform block of text
            psm = config.hints.get("psm", 6)
            parts.append(f"--psm {psm}")

        # Character whitelist
        if config.char_whitelist:
            parts.append(f"-c tessedit_char_whitelist={config.char_whitelist}")

        return " ".join(parts)