fix: Build errors and tesseract removal
Some checks failed
Deploy to Staging / Build Images (pull_request) Failing after 4m14s
Deploy to Staging / Deploy to Staging (pull_request) Has been skipped
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 8s
Some checks failed
Deploy to Staging / Build Images (pull_request) Failing after 4m14s
Deploy to Staging / Deploy to Staging (pull_request) Has been skipped
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 8s
This commit is contained in:
@@ -5,7 +5,6 @@ decoupling extractors from specific OCR libraries.
|
||||
|
||||
Engines:
|
||||
- PaddleOcrEngine: PaddleOCR PP-OCRv4 (primary, CPU-only)
|
||||
- TesseractEngine: pytesseract wrapper (backward compatibility)
|
||||
- CloudEngine: Google Vision TEXT_DETECTION (optional cloud fallback)
|
||||
- HybridEngine: Primary + fallback with confidence threshold
|
||||
"""
|
||||
|
||||
@@ -57,7 +57,7 @@ class OcrEngineResult:
|
||||
text: str
|
||||
confidence: float # 0.0-1.0
|
||||
word_boxes: list[WordBox]
|
||||
engine_name: str # "paddleocr", "tesseract", "google_vision"
|
||||
engine_name: str # "paddleocr", "google_vision"
|
||||
|
||||
|
||||
# --- Abstract base ---
|
||||
|
||||
@@ -11,7 +11,6 @@ logger = logging.getLogger(__name__)
|
||||
# Valid engine identifiers (primary engines only; hybrid is constructed separately)
|
||||
_ENGINE_REGISTRY: dict[str, str] = {
|
||||
"paddleocr": "app.engines.paddle_engine.PaddleOcrEngine",
|
||||
"tesseract": "app.engines.tesseract_engine.TesseractEngine",
|
||||
"google_vision": "app.engines.cloud_engine.CloudEngine",
|
||||
}
|
||||
|
||||
@@ -46,7 +45,7 @@ def create_engine(engine_name: str | None = None) -> OcrEngine:
|
||||
returns a ``HybridEngine`` that wraps the primary with the fallback.
|
||||
|
||||
Args:
|
||||
engine_name: Engine identifier ("paddleocr", "tesseract").
|
||||
engine_name: Engine identifier ("paddleocr", "google_vision").
|
||||
Falls back to ``settings.ocr_primary_engine``.
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -1,114 +0,0 @@
|
||||
"""Tesseract engine wrapper for backward compatibility."""
|
||||
|
||||
import io
|
||||
import logging
|
||||
|
||||
from app.config import settings
|
||||
from app.engines.base_engine import (
|
||||
EngineProcessingError,
|
||||
EngineUnavailableError,
|
||||
OcrConfig,
|
||||
OcrEngine,
|
||||
OcrEngineResult,
|
||||
WordBox,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TesseractEngine(OcrEngine):
    """pytesseract wrapper conforming to the OcrEngine interface."""

    def __init__(self) -> None:
        """Import pytesseract lazily and point it at the configured binary.

        Raises:
            EngineUnavailableError: If pytesseract cannot be imported.
        """
        # Only the import itself is guarded, so unrelated failures are not
        # misreported as "pytesseract is not installed".
        try:
            import pytesseract  # type: ignore[import-untyped]
        except ImportError as exc:
            raise EngineUnavailableError(
                "pytesseract is not installed. "
                "Install with: pip install pytesseract"
            ) from exc

        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
        self._pytesseract = pytesseract
        logger.info("TesseractEngine initialized (cmd=%s)", settings.tesseract_cmd)

    @property
    def name(self) -> str:
        """Return the identifier reported in ``OcrEngineResult.engine_name``."""
        return "tesseract"

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run Tesseract OCR on image bytes.

        Args:
            image_bytes: Encoded image data, opened with ``PIL.Image.open``.
            config: Engine-agnostic options, translated by ``_build_config``.

        Returns:
            An ``OcrEngineResult`` whose text is the space-joined accepted
            words and whose confidence is their mean (0.0 when no word was
            accepted).

        Raises:
            EngineProcessingError: If any step of recognition fails.
        """
        try:
            from PIL import Image

            # Build Tesseract config string from OcrConfig
            tess_config = self._build_config(config)

            # The context manager releases the image once OCR has run.
            with Image.open(io.BytesIO(image_bytes)) as image:
                # Get word-level data
                ocr_data = self._pytesseract.image_to_data(
                    image,
                    config=tess_config,
                    output_type=self._pytesseract.Output.DICT,
                )

            word_boxes: list[WordBox] = []
            texts: list[str] = []
            confidences: list[float] = []

            for i, raw_text in enumerate(ocr_data["text"]):
                # float() rather than int(): a fractional confidence delivered
                # as a string (e.g. "96.5") makes int() raise ValueError.
                conf = float(ocr_data["conf"][i])
                text = raw_text.strip()
                # Skip empty cells and rows without a positive confidence.
                if not text or conf <= 0:
                    continue

                normalized_conf = conf / 100.0
                word_boxes.append(
                    WordBox(
                        text=text,
                        confidence=normalized_conf,
                        x=int(ocr_data["left"][i]),
                        y=int(ocr_data["top"][i]),
                        width=int(ocr_data["width"][i]),
                        height=int(ocr_data["height"][i]),
                    )
                )
                texts.append(text)
                confidences.append(normalized_conf)

            combined_text = " ".join(texts)
            avg_confidence = (
                sum(confidences) / len(confidences) if confidences else 0.0
            )

            return OcrEngineResult(
                text=combined_text,
                confidence=avg_confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )

        except (EngineUnavailableError, EngineProcessingError):
            # Already in the engine's own exception vocabulary; pass through.
            raise
        except Exception as exc:
            raise EngineProcessingError(
                f"Tesseract recognition failed: {exc}"
            ) from exc

    def _build_config(self, config: OcrConfig) -> str:
        """Translate OcrConfig into a Tesseract CLI config string.

        Args:
            config: Options read here are ``single_word``, ``single_line``,
                ``hints`` (for an optional ``"psm"`` override) and
                ``char_whitelist``.

        Returns:
            Space-separated Tesseract arguments.
        """
        parts: list[str] = []

        # Page segmentation mode
        if config.single_word:
            parts.append("--psm 8")
        elif config.single_line:
            parts.append("--psm 7")
        else:
            # Default: assume uniform block of text
            psm = config.hints.get("psm", 6)
            parts.append(f"--psm {psm}")

        # Character whitelist
        if config.char_whitelist:
            parts.append(f"-c tessedit_char_whitelist={config.char_whitelist}")

        return " ".join(parts)
|
||||
Reference in New Issue
Block a user