"""Tesseract engine wrapper for backward compatibility.""" import io import logging from app.config import settings from app.engines.base_engine import ( EngineProcessingError, EngineUnavailableError, OcrConfig, OcrEngine, OcrEngineResult, WordBox, ) logger = logging.getLogger(__name__) class TesseractEngine(OcrEngine): """pytesseract wrapper conforming to the OcrEngine interface.""" def __init__(self) -> None: try: import pytesseract # type: ignore[import-untyped] pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd self._pytesseract = pytesseract logger.info("TesseractEngine initialized (cmd=%s)", settings.tesseract_cmd) except ImportError as exc: raise EngineUnavailableError( "pytesseract is not installed. " "Install with: pip install pytesseract" ) from exc @property def name(self) -> str: return "tesseract" def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult: """Run Tesseract OCR on image bytes.""" try: from PIL import Image image = Image.open(io.BytesIO(image_bytes)) # Build Tesseract config string from OcrConfig tess_config = self._build_config(config) # Get word-level data ocr_data = self._pytesseract.image_to_data( image, config=tess_config, output_type=self._pytesseract.Output.DICT, ) word_boxes: list[WordBox] = [] texts: list[str] = [] confidences: list[float] = [] for i, text in enumerate(ocr_data["text"]): conf = int(ocr_data["conf"][i]) if text.strip() and conf > 0: normalized_conf = conf / 100.0 word_boxes.append( WordBox( text=text.strip(), confidence=normalized_conf, x=int(ocr_data["left"][i]), y=int(ocr_data["top"][i]), width=int(ocr_data["width"][i]), height=int(ocr_data["height"][i]), ) ) texts.append(text.strip()) confidences.append(normalized_conf) combined_text = " ".join(texts) avg_confidence = ( sum(confidences) / len(confidences) if confidences else 0.0 ) return OcrEngineResult( text=combined_text, confidence=avg_confidence, word_boxes=word_boxes, engine_name=self.name, ) except (EngineUnavailableError, EngineProcessingError): raise except Exception as exc: raise EngineProcessingError( f"Tesseract recognition failed: {exc}" ) from exc def _build_config(self, config: OcrConfig) -> str: """Translate OcrConfig into a Tesseract CLI config string.""" parts: list[str] = [] # Page segmentation mode if config.single_word: parts.append("--psm 8") elif config.single_line: parts.append("--psm 7") else: # Default: assume uniform block of text psm = config.hints.get("psm", 6) parts.append(f"--psm {psm}") # Character whitelist if config.char_whitelist: parts.append(f"-c tessedit_char_whitelist={config.char_whitelist}") return " ".join(parts)