Files
motovaultpro/ocr/app/engines/hybrid_engine.py
Eric Gullickson 4ef942cb9d feat: add optional Google Vision cloud fallback engine (refs #118)
CloudEngine wraps Google Vision TEXT_DETECTION with lazy init.
HybridEngine runs primary engine, falls back to cloud when confidence
is below threshold. Disabled by default (OCR_FALLBACK_ENGINE=none).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 11:12:08 -06:00

117 lines
3.9 KiB
Python

"""Hybrid OCR engine: primary engine with optional cloud fallback."""
import concurrent.futures
import logging
import time

from app.engines.base_engine import (
    EngineError,
    EngineProcessingError,
    OcrConfig,
    OcrEngine,
    OcrEngineResult,
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Time budget (seconds) for the cloud fallback call; HybridEngine returns the
# primary result when the fallback takes longer than this.
_CLOUD_TIMEOUT_SECONDS = 5.0
class HybridEngine(OcrEngine):
    """Run a primary OCR engine with an optional cloud fallback.

    The primary engine always runs first. When its confidence is below
    ``threshold`` and a fallback engine is configured, the fallback is
    invoked with a hard wait limit of ``_CLOUD_TIMEOUT_SECONDS`` and the
    higher-confidence result of the two is returned.

    If the fallback is ``None`` (default), this engine behaves identically
    to the primary engine. Cloud failures are handled gracefully -- the
    primary result is returned whenever the fallback is unavailable,
    times out, or errors.
    """

    def __init__(
        self,
        primary: OcrEngine,
        fallback: OcrEngine | None = None,
        threshold: float = 0.6,
    ) -> None:
        """Store the engines and the confidence threshold.

        Args:
            primary: Engine that is always run first.
            fallback: Optional engine tried when the primary confidence
                is below ``threshold``; ``None`` disables the fallback.
            threshold: Primary confidence at or above which the primary
                result is returned without trying the fallback.
        """
        self._primary = primary
        self._fallback = fallback
        self._threshold = threshold

    @property
    def name(self) -> str:
        """Return ``hybrid(<primary>+<fallback>)``, with ``none`` if no fallback."""
        fallback_name = self._fallback.name if self._fallback else "none"
        return f"hybrid({self._primary.name}+{fallback_name})"

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run primary OCR, optionally falling back to the cloud engine.

        Args:
            image_bytes: Raw image data, passed unchanged to both engines.
            config: OCR configuration, passed unchanged to both engines.

        Returns:
            The primary result, unless the fallback ran within the time
            limit and reported a strictly higher confidence.

        Raises:
            Whatever the primary engine raises; errors from the fallback
            are logged and never propagated.
        """
        primary_result = self._primary.recognize(image_bytes, config)

        # Happy path: primary confidence meets threshold
        if primary_result.confidence >= self._threshold:
            logger.debug(
                "Primary engine confidence %.2f >= threshold %.2f, no fallback",
                primary_result.confidence,
                self._threshold,
            )
            return primary_result

        # No fallback configured -- return primary result as-is
        if self._fallback is None:
            logger.debug(
                "Primary confidence %.2f < threshold %.2f but no fallback configured",
                primary_result.confidence,
                self._threshold,
            )
            return primary_result

        logger.info(
            "Primary confidence %.2f < threshold %.2f, trying fallback (%s)",
            primary_result.confidence,
            self._threshold,
            self._fallback.name,
        )

        # The confidence comparison stays inside the try so that a malformed
        # fallback result degrades to the primary result instead of raising.
        try:
            fallback_result = self._recognize_with_timeout(image_bytes, config)

            # Return whichever result has higher confidence
            if fallback_result.confidence > primary_result.confidence:
                logger.info(
                    "Fallback confidence %.2f > primary %.2f, using fallback result",
                    fallback_result.confidence,
                    primary_result.confidence,
                )
                return fallback_result

            logger.info(
                "Primary confidence %.2f >= fallback %.2f, keeping primary result",
                primary_result.confidence,
                fallback_result.confidence,
            )
            return primary_result
        except concurrent.futures.TimeoutError:
            # Must precede ``except Exception``: the timeout error is an
            # Exception subclass and deserves its own log message.
            logger.warning(
                "Cloud fallback exceeded %.1fs limit, using primary result",
                _CLOUD_TIMEOUT_SECONDS,
            )
            return primary_result
        except EngineError as exc:
            logger.warning(
                "Cloud fallback failed (%s), returning primary result: %s",
                self._fallback.name,
                exc,
            )
            return primary_result
        except Exception as exc:
            # Deliberate best-effort boundary: the fallback is optional, so
            # no failure in it may break the primary OCR path.
            logger.warning(
                "Unexpected cloud fallback error, returning primary result: %s",
                exc,
            )
            return primary_result

    def _recognize_with_timeout(
        self, image_bytes: bytes, config: OcrConfig
    ) -> OcrEngineResult:
        """Run the fallback engine, waiting at most ``_CLOUD_TIMEOUT_SECONDS``.

        The call runs on a worker thread so the wait can be bounded.

        Raises:
            concurrent.futures.TimeoutError: The fallback did not finish
                within the limit.
            Exception: Anything raised by the fallback's ``recognize``.
        """
        executor = concurrent.futures.ThreadPoolExecutor(
            max_workers=1, thread_name_prefix="ocr-fallback"
        )
        started = time.monotonic()
        try:
            future = executor.submit(self._fallback.recognize, image_bytes, config)
            result = future.result(timeout=_CLOUD_TIMEOUT_SECONDS)
        finally:
            # wait=False: never block the caller on a worker that is still
            # running. A running call cannot be cancelled; it finishes in
            # the background and its result is discarded.
            executor.shutdown(wait=False)
        logger.debug("Cloud fallback completed in %.2fs", time.monotonic() - started)
        return result