feat: add optional Google Vision cloud fallback engine (refs #118)

CloudEngine wraps Google Vision TEXT_DETECTION with lazy initialization. HybridEngine runs the primary engine and falls back to the cloud engine when confidence is below the threshold. The fallback is disabled by default (OCR_FALLBACK_ENGINE=none). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 11:12:08 -06:00
parent 013fb0c67a
commit 4ef942cb9d
6 changed files with 351 additions and 18 deletions
--- a/ocr/app/engines/cloud_engine.py
+++ b/ocr/app/engines/cloud_engine.py
@@ -0,0 +1,166 @@
+"""Google Vision cloud OCR engine with lazy initialization."""
+
+import logging
+import os
+from typing import Any
+
+from app.engines.base_engine import (
+    EngineProcessingError,
+    EngineUnavailableError,
+    OcrConfig,
+    OcrEngine,
+    OcrEngineResult,
+    WordBox,
+)
+
+# Module-scoped logger, named after this module's import path.
+logger = logging.getLogger(__name__)
+
+# Default path for Google Vision service account key (Docker secret mount).
+# Used when no key path is passed to CloudEngine and the
+# GOOGLE_VISION_KEY_PATH environment variable is not set.
+_DEFAULT_KEY_PATH = "/run/secrets/google-vision-key.json"
+
+
+class CloudEngine(OcrEngine):
+    """Google Vision TEXT_DETECTION wrapper with lazy initialization.
+
+    The client is not created until the first ``recognize()`` call,
+    so the container starts normally even when the secret file is
+    missing or the dependency is not installed.
+    """
+
+    # Google Vision TEXT_DETECTION does not return per-word confidence
+    # in ``text_annotations``.  Every word is assigned this fixed value
+    # (chosen as the typical accuracy for clear images) so comparisons
+    # with PaddleOCR are meaningful.
+    _ASSUMED_WORD_CONFIDENCE = 0.95
+
+    def __init__(self, key_path: str | None = None) -> None:
+        """Record where the service account key lives; no I/O happens here.
+
+        Args:
+            key_path: Path to the service account JSON key.  When omitted
+                or empty, ``GOOGLE_VISION_KEY_PATH`` is used; when that is
+                unset or empty too, the Docker secret default applies.
+        """
+        # ``or`` chaining (rather than a getenv default) means an
+        # environment variable that is set but empty falls through to
+        # the default instead of producing an empty path.
+        self._key_path = (
+            key_path
+            or os.getenv("GOOGLE_VISION_KEY_PATH")
+            or _DEFAULT_KEY_PATH
+        )
+        self._client: Any | None = None
+
+    @property
+    def name(self) -> str:
+        """Identifier reported in ``OcrEngineResult.engine_name``."""
+        return "google_vision"
+
+    # ------------------------------------------------------------------
+    # Lazy init
+    # ------------------------------------------------------------------
+
+    def _get_client(self) -> Any:
+        """Return the Vision client, creating and caching it on first use.
+
+        Returns:
+            The cached ``ImageAnnotatorClient`` instance.
+
+        Raises:
+            EngineUnavailableError: The key file does not exist, the
+                ``google-cloud-vision`` package cannot be imported, or
+                constructing the client fails for any other reason.
+        """
+        if self._client is not None:
+            return self._client
+
+        # Verify credentials file exists
+        if not os.path.isfile(self._key_path):
+            raise EngineUnavailableError(
+                f"Google Vision key not found at {self._key_path}. "
+                "Set GOOGLE_VISION_KEY_PATH or mount the secret."
+            )
+
+        # Import separately from construction so that only a genuinely
+        # missing SDK is reported as "not installed".
+        try:
+            from google.cloud import vision  # type: ignore[import-untyped]
+        except ImportError as exc:
+            raise EngineUnavailableError(
+                "google-cloud-vision is not installed. "
+                "Install with: pip install google-cloud-vision"
+            ) from exc
+
+        try:
+            # Hand the key to this client only.  Exporting
+            # GOOGLE_APPLICATION_CREDENTIALS would mutate process-wide
+            # state and redirect every other Google SDK client too.
+            self._client = (
+                vision.ImageAnnotatorClient.from_service_account_file(
+                    self._key_path
+                )
+            )
+        except Exception as exc:
+            raise EngineUnavailableError(
+                f"Failed to initialize Google Vision client: {exc}"
+            ) from exc
+
+        logger.info(
+            "Google Vision client initialized (key: %s)", self._key_path
+        )
+        return self._client
+
+    # ------------------------------------------------------------------
+    # OCR
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _bounding_box(vertices: Any) -> tuple[int, int, int, int]:
+        """Collapse polygon vertices into an axis-aligned ``(x, y, w, h)``."""
+        xs = [vertex.x for vertex in vertices]
+        ys = [vertex.y for vertex in vertices]
+        x_min, y_min = min(xs), min(ys)
+        return x_min, y_min, max(xs) - x_min, max(ys) - y_min
+
+    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
+        """Run Google Vision TEXT_DETECTION on image bytes.
+
+        Args:
+            image_bytes: Encoded image content sent to the API as-is.
+            config: OCR options; only ``char_whitelist`` is consulted.
+                When set, characters outside it are dropped from every
+                word, and from the full text except spaces and newlines.
+
+        Returns:
+            The recognized text, one ``WordBox`` per non-empty word, and
+            the mean word confidence (``0.0`` when no words remain).
+
+        Raises:
+            EngineUnavailableError: The client could not be initialized.
+            EngineProcessingError: The API reported an error, or any
+                other exception occurred while processing the response.
+        """
+        client = self._get_client()
+
+        try:
+            from google.cloud import vision  # type: ignore[import-untyped]
+
+            image = vision.Image(content=image_bytes)
+            response = client.text_detection(image=image)
+
+            if response.error.message:
+                raise EngineProcessingError(
+                    f"Google Vision API error: {response.error.message}"
+                )
+
+            annotations = response.text_annotations
+            if not annotations:
+                return OcrEngineResult(
+                    text="",
+                    confidence=0.0,
+                    word_boxes=[],
+                    engine_name=self.name,
+                )
+
+            # Build the whitelist lookup once instead of once per word.
+            allowed = (
+                set(config.char_whitelist) if config.char_whitelist else None
+            )
+
+            # First annotation is the full-page text; the rest are words
+            full_text = annotations[0].description.strip()
+            word_boxes: list[WordBox] = []
+            confidences: list[float] = []
+
+            for annotation in annotations[1:]:
+                text = annotation.description
+                if allowed is not None:
+                    text = "".join(ch for ch in text if ch in allowed)
+
+                text = text.strip()
+                if not text:
+                    # Nothing left after filtering; drop the word.
+                    continue
+
+                x, y, width, height = self._bounding_box(
+                    annotation.bounding_poly.vertices
+                )
+                word_conf = self._ASSUMED_WORD_CONFIDENCE
+                word_boxes.append(
+                    WordBox(
+                        text=text,
+                        confidence=word_conf,
+                        x=x,
+                        y=y,
+                        width=width,
+                        height=height,
+                    )
+                )
+                confidences.append(word_conf)
+
+            # Apply whitelist to full text too, keeping the separators
+            # so word and line boundaries survive.
+            if allowed is not None:
+                keep = allowed | {" ", "\n"}
+                full_text = "".join(ch for ch in full_text if ch in keep)
+
+            avg_confidence = (
+                sum(confidences) / len(confidences) if confidences else 0.0
+            )
+
+            return OcrEngineResult(
+                text=full_text,
+                confidence=avg_confidence,
+                word_boxes=word_boxes,
+                engine_name=self.name,
+            )
+
+        except (EngineUnavailableError, EngineProcessingError):
+            # Already in the engine's own error vocabulary; pass through.
+            raise
+        except Exception as exc:
+            raise EngineProcessingError(
+                f"Google Vision recognition failed: {exc}"
+            ) from exc