feat: add optional Google Vision cloud fallback engine (refs #118)
CloudEngine wraps Google Vision TEXT_DETECTION with lazy initialization. HybridEngine runs the primary engine and falls back to the cloud engine when confidence is below the configured threshold. The fallback is disabled by default (OCR_FALLBACK_ENGINE=none). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
"""Factory function for creating OCR engine instances from configuration."""
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
|
||||
from app.config import settings
|
||||
@@ -7,28 +8,16 @@ from app.engines.base_engine import EngineUnavailableError, OcrEngine
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Valid engine identifiers
|
||||
# Valid engine identifiers (primary engines only; hybrid is constructed separately)
|
||||
_ENGINE_REGISTRY: dict[str, str] = {
|
||||
"paddleocr": "app.engines.paddle_engine.PaddleOcrEngine",
|
||||
"tesseract": "app.engines.tesseract_engine.TesseractEngine",
|
||||
"google_vision": "app.engines.cloud_engine.CloudEngine",
|
||||
}
|
||||
|
||||
|
||||
def create_engine(engine_name: str | None = None) -> OcrEngine:
|
||||
"""Instantiate an OCR engine by name (defaults to config value).
|
||||
|
||||
Args:
|
||||
engine_name: Engine identifier ("paddleocr", "tesseract").
|
||||
Falls back to ``settings.ocr_primary_engine``.
|
||||
|
||||
Returns:
|
||||
Initialized OcrEngine instance.
|
||||
|
||||
Raises:
|
||||
EngineUnavailableError: If the engine cannot be loaded or initialized.
|
||||
"""
|
||||
name = (engine_name or settings.ocr_primary_engine).lower().strip()
|
||||
|
||||
def _create_single_engine(name: str) -> OcrEngine:
|
||||
"""Instantiate a single engine by registry name."""
|
||||
if name not in _ENGINE_REGISTRY:
|
||||
raise EngineUnavailableError(
|
||||
f"Unknown engine '{name}'. Available: {list(_ENGINE_REGISTRY.keys())}"
|
||||
@@ -37,8 +26,6 @@ def create_engine(engine_name: str | None = None) -> OcrEngine:
|
||||
module_path, class_name = _ENGINE_REGISTRY[name].rsplit(".", 1)
|
||||
|
||||
try:
|
||||
import importlib
|
||||
|
||||
module = importlib.import_module(module_path)
|
||||
engine_cls = getattr(module, class_name)
|
||||
engine: OcrEngine = engine_cls()
|
||||
@@ -50,3 +37,51 @@ def create_engine(engine_name: str | None = None) -> OcrEngine:
|
||||
raise EngineUnavailableError(
|
||||
f"Failed to create engine '{name}': {exc}"
|
||||
) from exc
|
||||
|
||||
|
||||
def create_engine(engine_name: str | None = None) -> OcrEngine:
    """Instantiate an OCR engine by name (defaults to config value).

    When a fallback engine is configured (``OCR_FALLBACK_ENGINE`` is set to
    something other than ``"none"``), returns a ``HybridEngine`` that wraps
    the primary engine with the fallback. If the fallback engine cannot be
    created, a warning is logged and the primary engine is returned alone.

    Args:
        engine_name: Engine identifier; any key of ``_ENGINE_REGISTRY``
            ("paddleocr", "tesseract", "google_vision").
            Falls back to ``settings.ocr_primary_engine``.

    Returns:
        Initialized OcrEngine instance (possibly a HybridEngine wrapper).

    Raises:
        EngineUnavailableError: If the primary engine cannot be loaded.
    """
    name = (engine_name or settings.ocr_primary_engine).lower().strip()
    primary = _create_single_engine(name)

    # Check for cloud fallback configuration. An unset (None), empty,
    # whitespace-only, or "none" value all mean "fallback disabled"; the
    # `or ""` guard keeps a None setting from raising AttributeError.
    fallback_name = (settings.ocr_fallback_engine or "").lower().strip()
    if not fallback_name or fallback_name == "none":
        return primary

    # Create fallback engine (failure is non-fatal -- log and return primary only)
    try:
        fallback = _create_single_engine(fallback_name)
    except EngineUnavailableError as exc:
        logger.warning(
            "Fallback engine '%s' unavailable, proceeding without fallback: %s",
            fallback_name,
            exc,
        )
        return primary

    # Imported at call time rather than module level, so the hybrid module is
    # only loaded when a fallback is actually configured.
    # NOTE(review): presumably this also avoids a circular import -- confirm.
    from app.engines.hybrid_engine import HybridEngine

    threshold = settings.ocr_fallback_threshold
    hybrid = HybridEngine(primary=primary, fallback=fallback, threshold=threshold)
    logger.info(
        "Created hybrid engine: primary=%s, fallback=%s, threshold=%.2f",
        name,
        fallback_name,
        threshold,
    )
    return hybrid
|
||||
|
||||
Reference in New Issue
Block a user