# motovaultpro/ocr/app/engines/cloud_engine.py

"""Google Vision cloud OCR engine with lazy initialization."""

import logging
import os
from typing import Any

from app.engines.base_engine import (
    EngineProcessingError,
    EngineUnavailableError,
    OcrConfig,
    OcrEngine,
    OcrEngineResult,
    WordBox,
)

logger = logging.getLogger(__name__)

# Default path for Google WIF credential config (Docker secret mount)
_DEFAULT_KEY_PATH = "/run/secrets/google-wif-config.json"


class CloudEngine(OcrEngine):
    """Google Vision TEXT_DETECTION wrapper with lazy initialization.

    The client is not created until the first ``recognize()`` call,
    so the container starts normally even when the secret file is
    missing or the dependency is not installed.
    """

    # Google Vision TEXT_DETECTION does not return per-word confidence in
    # its annotations.  0.95 is used as the documented typical accuracy for
    # clear images so comparisons with PaddleOCR are meaningful.
    _ASSUMED_WORD_CONFIDENCE = 0.95

    def __init__(self, key_path: str | None = None) -> None:
        """Store the credential config path; no client is created here.

        Args:
            key_path: Path to the WIF credential config.  When falsy, the
                ``GOOGLE_VISION_KEY_PATH`` environment variable is used,
                falling back to the default Docker secret mount path.
        """
        self._key_path = key_path or os.getenv(
            "GOOGLE_VISION_KEY_PATH", _DEFAULT_KEY_PATH
        )
        self._client: Any | None = None

    @property
    def name(self) -> str:
        """Identifier reported in results produced by this engine."""
        return "google_vision"

    # ------------------------------------------------------------------
    # Lazy init
    # ------------------------------------------------------------------

    def _get_client(self) -> Any:
        """Create the Vision client on first use and cache it.

        Uses Application Default Credentials (ADC) pointed at a WIF
        credential config file.  The WIF config references an executable
        that fetches an Auth0 M2M JWT.

        Side effect: sets ``GOOGLE_APPLICATION_CREDENTIALS`` and
        ``GOOGLE_EXTERNAL_ACCOUNT_ALLOW_EXECUTABLES`` in ``os.environ``
        for the whole process.

        Returns:
            The cached ``vision.ImageAnnotatorClient`` instance.

        Raises:
            EngineUnavailableError: If the credential config file is
                missing, ``google-cloud-vision`` cannot be imported, or
                client construction fails for any other reason.
        """
        if self._client is not None:
            return self._client

        # Verify credentials config exists
        if not os.path.isfile(self._key_path):
            raise EngineUnavailableError(
                f"Google Vision credential config not found at {self._key_path}. "
                "Set GOOGLE_VISION_KEY_PATH or mount the secret."
            )

        try:
            from google.cloud import vision  # type: ignore[import-untyped]

            # Point ADC at the WIF credential config
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._key_path
            # Required for executable-sourced credentials
            os.environ["GOOGLE_EXTERNAL_ACCOUNT_ALLOW_EXECUTABLES"] = "1"
            self._client = vision.ImageAnnotatorClient()
            logger.info(
                "Google Vision client initialized via WIF (config: %s)",
                self._key_path,
            )
            return self._client
        except ImportError as exc:
            raise EngineUnavailableError(
                "google-cloud-vision is not installed. "
                "Install with: pip install google-cloud-vision"
            ) from exc
        except Exception as exc:
            raise EngineUnavailableError(
                f"Failed to initialize Google Vision client: {exc}"
            ) from exc

    # ------------------------------------------------------------------
    # OCR
    # ------------------------------------------------------------------

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run Google Vision TEXT_DETECTION on image bytes.

        Args:
            image_bytes: Encoded image content sent to the Vision API.
            config: OCR settings; only ``char_whitelist`` is read here.
                When it is non-empty, characters outside it are removed
                from each word and from the full text (spaces and
                newlines are always kept in the full text).

        Returns:
            An ``OcrEngineResult`` holding the full-page text, one
            ``WordBox`` per non-empty word that has a bounding polygon,
            and an overall confidence.  The confidence is the assumed
            per-word constant when at least one word box was produced,
            otherwise ``0.0``.

        Raises:
            EngineUnavailableError: If the Vision client cannot be created.
            EngineProcessingError: If the API reports an error or any
                other failure occurs while processing the response.
        """
        client = self._get_client()

        try:
            from google.cloud import vision  # type: ignore[import-untyped]

            image = vision.Image(content=image_bytes)
            response = client.text_detection(image=image)

            if response.error.message:
                raise EngineProcessingError(
                    f"Google Vision API error: {response.error.message}"
                )

            annotations = response.text_annotations
            if not annotations:
                return OcrEngineResult(
                    text="",
                    confidence=0.0,
                    word_boxes=[],
                    engine_name=self.name,
                )

            # Build the whitelist lookup once instead of once per word.
            allowed = set(config.char_whitelist) if config.char_whitelist else None
            word_conf = self._ASSUMED_WORD_CONFIDENCE

            # First annotation is the full-page text; the rest are words
            full_text = annotations[0].description.strip()
            word_boxes: list[WordBox] = []

            for annotation in annotations[1:]:
                text = annotation.description

                # Apply character whitelist filter if configured
                if allowed is not None:
                    text = "".join(ch for ch in text if ch in allowed)

                text = text.strip()
                if not text:
                    continue

                vertices = annotation.bounding_poly.vertices
                xs = [v.x for v in vertices]
                ys = [v.y for v in vertices]
                if not xs:
                    # No geometry for this word: skip it rather than let
                    # min()/max() on an empty list fail the whole page.
                    continue

                x_min, y_min = min(xs), min(ys)
                x_max, y_max = max(xs), max(ys)

                word_boxes.append(
                    WordBox(
                        text=text,
                        confidence=word_conf,
                        x=x_min,
                        y=y_min,
                        width=x_max - x_min,
                        height=y_max - y_min,
                    )
                )

            # Apply whitelist to full text too
            if allowed is not None:
                full_text = "".join(
                    ch for ch in full_text if ch in allowed or ch in " \n"
                )

            # Every word carries the same assumed confidence, so the mean
            # is that constant; returning it directly avoids the rounding
            # drift of summing and dividing floats.
            avg_confidence = word_conf if word_boxes else 0.0

            return OcrEngineResult(
                text=full_text,
                confidence=avg_confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )

        except (EngineUnavailableError, EngineProcessingError):
            raise
        except Exception as exc:
            raise EngineProcessingError(
                f"Google Vision recognition failed: {exc}"
            ) from exc