fix: Build errors and tesseract removal
Some checks failed
Deploy to Staging / Build Images (pull_request) Failing after 4m14s
Deploy to Staging / Deploy to Staging (pull_request) Has been skipped
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 8s
Some checks failed
Deploy to Staging / Build Images (pull_request) Failing after 4m14s
Deploy to Staging / Deploy to Staging (pull_request) Has been skipped
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 8s
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
"""Tests for OCR engine abstraction layer.
|
||||
|
||||
Covers: base types, exception hierarchy, PaddleOcrEngine,
|
||||
TesseractEngine, CloudEngine, HybridEngine, and engine_factory.
|
||||
CloudEngine, HybridEngine, and engine_factory.
|
||||
"""
|
||||
|
||||
import io
|
||||
@@ -124,7 +124,7 @@ class TestOcrEngineResult:
|
||||
|
||||
def test_empty_result(self) -> None:
|
||||
result = OcrEngineResult(
|
||||
text="", confidence=0.0, word_boxes=[], engine_name="tesseract"
|
||||
text="", confidence=0.0, word_boxes=[], engine_name="paddleocr"
|
||||
)
|
||||
assert result.text == ""
|
||||
assert result.word_boxes == []
|
||||
@@ -303,85 +303,6 @@ class TestPaddleOcrEngine:
|
||||
engine.recognize(_create_test_image_bytes(), OcrConfig())
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TesseractEngine
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestTesseractEngine:
|
||||
"""Tests for TesseractEngine using mocked pytesseract."""
|
||||
|
||||
@pytest.fixture()
|
||||
def engine(self) -> "TesseractEngine": # type: ignore[name-defined]
|
||||
"""Create a TesseractEngine with mocked pytesseract dependency."""
|
||||
mock_pytesseract = MagicMock()
|
||||
mock_pytesseract.Output.DICT = "dict"
|
||||
|
||||
with patch.dict("sys.modules", {"pytesseract": mock_pytesseract}):
|
||||
with patch("app.engines.tesseract_engine.settings") as mock_settings:
|
||||
mock_settings.tesseract_cmd = "/usr/bin/tesseract"
|
||||
from app.engines.tesseract_engine import TesseractEngine
|
||||
|
||||
eng = TesseractEngine()
|
||||
eng._mock_pytesseract = mock_pytesseract # type: ignore[attr-defined]
|
||||
return eng
|
||||
|
||||
def test_name(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined]
|
||||
assert engine.name == "tesseract"
|
||||
|
||||
def test_build_config_default_psm(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined]
|
||||
config_str = engine._build_config(OcrConfig())
|
||||
assert "--psm 6" in config_str
|
||||
|
||||
def test_build_config_single_line(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined]
|
||||
config_str = engine._build_config(OcrConfig(single_line=True))
|
||||
assert "--psm 7" in config_str
|
||||
|
||||
def test_build_config_single_word(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined]
|
||||
config_str = engine._build_config(OcrConfig(single_word=True))
|
||||
assert "--psm 8" in config_str
|
||||
|
||||
def test_build_config_whitelist(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined]
|
||||
config_str = engine._build_config(OcrConfig(char_whitelist="ABC123"))
|
||||
assert "-c tessedit_char_whitelist=ABC123" in config_str
|
||||
|
||||
def test_build_config_psm_hint(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined]
|
||||
config_str = engine._build_config(OcrConfig(hints={"psm": 11}))
|
||||
assert "--psm 11" in config_str
|
||||
|
||||
def test_recognize_normalizes_confidence(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined]
|
||||
"""Tesseract returns 0-100 confidence; engine normalizes to 0.0-1.0."""
|
||||
engine._pytesseract.image_to_data.return_value = {
|
||||
"text": ["HELLO", ""],
|
||||
"conf": [92, -1],
|
||||
"left": [10],
|
||||
"top": [20],
|
||||
"width": [100],
|
||||
"height": [30],
|
||||
}
|
||||
|
||||
result = engine.recognize(_create_test_image_bytes(), OcrConfig())
|
||||
assert result.text == "HELLO"
|
||||
assert abs(result.confidence - 0.92) < 0.01
|
||||
assert result.engine_name == "tesseract"
|
||||
|
||||
def test_import_error_raises_unavailable(self) -> None:
|
||||
with patch.dict("sys.modules", {"pytesseract": None}):
|
||||
with patch("app.engines.tesseract_engine.settings") as mock_settings:
|
||||
mock_settings.tesseract_cmd = "/usr/bin/tesseract"
|
||||
|
||||
def mock_import(name, *args, **kwargs):
|
||||
if name == "pytesseract":
|
||||
raise ImportError("No module named 'pytesseract'")
|
||||
return __import__(name, *args, **kwargs)
|
||||
|
||||
with patch("builtins.__import__", side_effect=mock_import):
|
||||
from app.engines.tesseract_engine import TesseractEngine
|
||||
|
||||
with pytest.raises(EngineUnavailableError, match="pytesseract"):
|
||||
TesseractEngine()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CloudEngine
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -637,8 +558,8 @@ class TestEngineFactory:
|
||||
|
||||
from app.engines.engine_factory import create_engine
|
||||
|
||||
create_engine("tesseract")
|
||||
mock_create.assert_called_once_with("tesseract")
|
||||
create_engine("google_vision")
|
||||
mock_create.assert_called_once_with("google_vision")
|
||||
|
||||
@patch("app.engines.engine_factory.settings")
|
||||
@patch("app.engines.engine_factory._create_single_engine")
|
||||
|
||||
Reference in New Issue
Block a user