"""Google Vision cloud OCR engine with lazy initialization.""" import logging import os from typing import Any from app.engines.base_engine import ( EngineProcessingError, EngineUnavailableError, OcrConfig, OcrEngine, OcrEngineResult, WordBox, ) logger = logging.getLogger(__name__) # Default path for Google WIF credential config (Docker secret mount) _DEFAULT_KEY_PATH = "/run/secrets/google-wif-config.json" class CloudEngine(OcrEngine): """Google Vision TEXT_DETECTION wrapper with lazy initialization. The client is not created until the first ``recognize()`` call, so the container starts normally even when the secret file is missing or the dependency is not installed. """ def __init__(self, key_path: str | None = None) -> None: self._key_path = key_path or os.getenv( "GOOGLE_VISION_KEY_PATH", _DEFAULT_KEY_PATH ) self._client: Any | None = None @property def name(self) -> str: return "google_vision" # ------------------------------------------------------------------ # Lazy init # ------------------------------------------------------------------ def _get_client(self) -> Any: """Create the Vision client on first use. Uses Application Default Credentials (ADC) pointed at a WIF credential config file. The WIF config references an executable that fetches an Auth0 M2M JWT. """ if self._client is not None: return self._client # Verify credentials config exists if not os.path.isfile(self._key_path): raise EngineUnavailableError( f"Google Vision credential config not found at {self._key_path}. " "Set GOOGLE_VISION_KEY_PATH or mount the secret." ) try: from google.cloud import vision # type: ignore[import-untyped] # Point ADC at the WIF credential config os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._key_path # Required for executable-sourced credentials os.environ["GOOGLE_EXTERNAL_ACCOUNT_ALLOW_EXECUTABLES"] = "1" self._client = vision.ImageAnnotatorClient() logger.info( "Google Vision client initialized via WIF (config: %s)", self._key_path, ) return self._client except ImportError as exc: raise EngineUnavailableError( "google-cloud-vision is not installed. " "Install with: pip install google-cloud-vision" ) from exc except Exception as exc: raise EngineUnavailableError( f"Failed to initialize Google Vision client: {exc}" ) from exc # ------------------------------------------------------------------ # OCR # ------------------------------------------------------------------ def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult: """Run Google Vision TEXT_DETECTION on image bytes.""" client = self._get_client() try: from google.cloud import vision # type: ignore[import-untyped] image = vision.Image(content=image_bytes) response = client.text_detection(image=image) if response.error.message: raise EngineProcessingError( f"Google Vision API error: {response.error.message}" ) annotations = response.text_annotations if not annotations: return OcrEngineResult( text="", confidence=0.0, word_boxes=[], engine_name=self.name, ) # First annotation is the full-page text; the rest are words full_text = annotations[0].description.strip() word_boxes: list[WordBox] = [] confidences: list[float] = [] for annotation in annotations[1:]: text = annotation.description vertices = annotation.bounding_poly.vertices # Apply character whitelist filter if configured if config.char_whitelist: allowed = set(config.char_whitelist) text = "".join(ch for ch in text if ch in allowed) if not text.strip(): continue xs = [v.x for v in vertices] ys = [v.y for v in vertices] x_min, y_min = min(xs), min(ys) x_max, y_max = max(xs), max(ys) # Google Vision TEXT_DETECTION does not return per-word # confidence in annotations. Use 0.95 as the documented # typical accuracy for clear images so comparisons with # PaddleOCR are meaningful. word_conf = 0.95 word_boxes.append( WordBox( text=text.strip(), confidence=word_conf, x=x_min, y=y_min, width=x_max - x_min, height=y_max - y_min, ) ) confidences.append(word_conf) # Apply whitelist to full text too if config.char_whitelist: allowed = set(config.char_whitelist) full_text = "".join( ch for ch in full_text if ch in allowed or ch in " \n" ) avg_confidence = ( sum(confidences) / len(confidences) if confidences else 0.0 ) return OcrEngineResult( text=full_text, confidence=avg_confidence, word_boxes=word_boxes, engine_name=self.name, ) except (EngineUnavailableError, EngineProcessingError): raise except Exception as exc: raise EngineProcessingError( f"Google Vision recognition failed: {exc}" ) from exc