CloudEngine wraps Google Vision TEXT_DETECTION with lazy init. HybridEngine runs primary engine, falls back to cloud when confidence is below threshold. Disabled by default (OCR_FALLBACK_ENGINE=none). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
117 lines
3.9 KiB
Python
117 lines
3.9 KiB
Python
"""Hybrid OCR engine: primary engine with optional cloud fallback."""
|
|
|
|
import concurrent.futures
import logging
import time

from app.engines.base_engine import (
    EngineError,
    EngineProcessingError,
    OcrConfig,
    OcrEngine,
    OcrEngineResult,
)
|
|
|
|
# Time budget (seconds) for the cloud fallback; a fallback result that
# takes longer than this is not used.
_CLOUD_TIMEOUT_SECONDS = 5.0

# Logger for this module, keyed by the module's import path.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class HybridEngine(OcrEngine):
    """Run a primary OCR engine with an optional, time-bounded cloud fallback.

    The primary engine always runs first. When its confidence is below
    ``threshold`` and a fallback engine is configured, the fallback is run
    and whichever result has the strictly higher confidence is returned
    (the primary result wins ties).

    If the fallback is ``None`` (default), this engine behaves identically
    to the primary engine. Cloud failures are handled gracefully -- the
    primary result is returned whenever the fallback is unavailable,
    times out, or errors.

    The fallback call is executed on a worker thread so that the wait can
    actually be bounded: once ``timeout_seconds`` elapses the primary
    result is returned immediately. A timed-out fallback call is not
    interrupted; it keeps running in the background and its result is
    discarded.
    """

    def __init__(
        self,
        primary: OcrEngine,
        fallback: OcrEngine | None = None,
        threshold: float = 0.6,
        *,
        timeout_seconds: float = _CLOUD_TIMEOUT_SECONDS,
    ) -> None:
        """Store the engines and the fallback policy.

        Args:
            primary: Engine that is always run first.
            fallback: Engine tried when the primary confidence is below
                ``threshold``; ``None`` disables the fallback entirely.
            threshold: Minimum primary confidence that is accepted without
                consulting the fallback.
            timeout_seconds: Maximum time to wait for the fallback before
                giving up and returning the primary result.
        """
        self._primary = primary
        self._fallback = fallback
        self._threshold = threshold
        self._timeout_seconds = timeout_seconds

    @property
    def name(self) -> str:
        """Composite name ``hybrid(<primary>+<fallback>)``; the fallback
        part is ``none`` when no fallback engine is configured."""
        fallback_name = self._fallback.name if self._fallback else "none"
        return f"hybrid({self._primary.name}+{fallback_name})"

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run primary OCR, optionally falling back to cloud engine.

        Args:
            image_bytes: Image payload, passed unchanged to the engines.
            config: OCR configuration, passed unchanged to the engines.

        Returns:
            The primary result, unless the fallback ran to completion
            within the time limit and reported a strictly higher
            confidence, in which case the fallback result is returned.

        Raises:
            Whatever the primary engine raises. Exceptions raised by the
            fallback engine are logged and swallowed.
        """
        primary_result = self._primary.recognize(image_bytes, config)

        # Happy path: primary confidence meets threshold
        if primary_result.confidence >= self._threshold:
            logger.debug(
                "Primary engine confidence %.2f >= threshold %.2f, no fallback",
                primary_result.confidence,
                self._threshold,
            )
            return primary_result

        # No fallback configured -- return primary result as-is
        if self._fallback is None:
            logger.debug(
                "Primary confidence %.2f < threshold %.2f but no fallback configured",
                primary_result.confidence,
                self._threshold,
            )
            return primary_result

        logger.info(
            "Primary confidence %.2f < threshold %.2f, trying fallback (%s)",
            primary_result.confidence,
            self._threshold,
            self._fallback.name,
        )

        # One single-worker pool per call. It is deliberately NOT used as a
        # context manager: leaving a ``with`` block waits for the worker,
        # which would defeat the timeout.
        executor = concurrent.futures.ThreadPoolExecutor(
            max_workers=1,
            thread_name_prefix="ocr-fallback",
        )
        try:
            start = time.monotonic()
            future = executor.submit(self._fallback.recognize, image_bytes, config)

            # wait() (rather than future.result(timeout=...)) keeps a real
            # timeout distinguishable from a TimeoutError raised by the
            # fallback engine itself.
            finished, _ = concurrent.futures.wait(
                [future], timeout=self._timeout_seconds
            )
            if not finished:
                logger.warning(
                    "Cloud fallback still running after %.1fs (> %.1fs limit), "
                    "using primary result",
                    time.monotonic() - start,
                    self._timeout_seconds,
                )
                return primary_result

            # Re-raises any exception from the worker thread so the
            # handlers below see it.
            fallback_result = future.result()
            logger.debug(
                "Cloud fallback completed in %.2fs", time.monotonic() - start
            )

            # Return whichever result has higher confidence
            if fallback_result.confidence > primary_result.confidence:
                logger.info(
                    "Fallback confidence %.2f > primary %.2f, using fallback result",
                    fallback_result.confidence,
                    primary_result.confidence,
                )
                return fallback_result

            logger.info(
                "Primary confidence %.2f >= fallback %.2f, keeping primary result",
                primary_result.confidence,
                fallback_result.confidence,
            )
            return primary_result

        except EngineError as exc:
            logger.warning(
                "Cloud fallback failed (%s), returning primary result: %s",
                self._fallback.name,
                exc,
            )
            return primary_result
        except Exception as exc:
            # Best-effort boundary: a broken fallback must never take down
            # a request that already has a usable primary result.
            logger.warning(
                "Unexpected cloud fallback error, returning primary result: %s",
                exc,
            )
            return primary_result
        finally:
            # Never block on a slow or hung cloud call while cleaning up.
            executor.shutdown(wait=False)
|