feat: add optional Google Vision cloud fallback engine (refs #118)
CloudEngine wraps Google Vision TEXT_DETECTION with lazy init. HybridEngine runs primary engine, falls back to cloud when confidence is below threshold. Disabled by default (OCR_FALLBACK_ENGINE=none). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
166
ocr/app/engines/cloud_engine.py
Normal file
166
ocr/app/engines/cloud_engine.py
Normal file
@@ -0,0 +1,166 @@
|
||||
"""Google Vision cloud OCR engine with lazy initialization."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from app.engines.base_engine import (
|
||||
EngineProcessingError,
|
||||
EngineUnavailableError,
|
||||
OcrConfig,
|
||||
OcrEngine,
|
||||
OcrEngineResult,
|
||||
WordBox,
|
||||
)
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default path for the Google Vision service account key (Docker secret
# mount). Used when neither an explicit ``key_path`` argument nor the
# GOOGLE_VISION_KEY_PATH environment variable supplies a path.
_DEFAULT_KEY_PATH = "/run/secrets/google-vision-key.json"
|
||||
|
||||
|
||||
class CloudEngine(OcrEngine):
    """Google Vision TEXT_DETECTION wrapper with lazy initialization.

    The client is not created until the first ``recognize()`` call,
    so the container starts normally even when the secret file is
    missing or the dependency is not installed.
    """

    # Google Vision TEXT_DETECTION annotations carry no per-word confidence,
    # so every recognized word is reported with this fixed placeholder value.
    # The overall result confidence is this same value whenever at least one
    # word box was produced.
    _ASSUMED_WORD_CONFIDENCE = 0.95

    def __init__(self, key_path: str | None = None) -> None:
        """Store the key location; no client is created here.

        Args:
            key_path: Path to the Google Vision service account key. When
                omitted or empty, the ``GOOGLE_VISION_KEY_PATH`` environment
                variable is used; when that is unset or empty as well, the
                module default ``_DEFAULT_KEY_PATH`` applies.
        """
        # Chained ``or`` so an env var that is set but empty falls through
        # to the default instead of producing an empty key path.
        self._key_path = (
            key_path
            or os.getenv("GOOGLE_VISION_KEY_PATH")
            or _DEFAULT_KEY_PATH
        )
        self._client: Any | None = None

    @property
    def name(self) -> str:
        """Engine identifier reported in results."""
        return "google_vision"

    # ------------------------------------------------------------------
    # Lazy init
    # ------------------------------------------------------------------

    def _get_client(self) -> Any:
        """Create the Vision client on first use and cache it.

        Returns:
            The cached ``ImageAnnotatorClient`` instance.

        Raises:
            EngineUnavailableError: If the key file does not exist, the
                ``google-cloud-vision`` package cannot be imported, or the
                client cannot be constructed from the key file.
        """
        if self._client is not None:
            return self._client

        # Verify credentials file exists
        if not os.path.isfile(self._key_path):
            raise EngineUnavailableError(
                f"Google Vision key not found at {self._key_path}. "
                "Set GOOGLE_VISION_KEY_PATH or mount the secret."
            )

        try:
            from google.cloud import vision  # type: ignore[import-untyped]
        except ImportError as exc:
            raise EngineUnavailableError(
                "google-cloud-vision is not installed. "
                "Install with: pip install google-cloud-vision"
            ) from exc

        try:
            # Hand the key file to this client only. Writing the path into
            # GOOGLE_APPLICATION_CREDENTIALS would change credentials for
            # every other Google SDK client in the process.
            client = vision.ImageAnnotatorClient.from_service_account_file(
                self._key_path
            )
        except Exception as exc:
            raise EngineUnavailableError(
                f"Failed to initialize Google Vision client: {exc}"
            ) from exc

        self._client = client
        logger.info(
            "Google Vision client initialized (key: %s)", self._key_path
        )
        return client

    # ------------------------------------------------------------------
    # OCR
    # ------------------------------------------------------------------

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run Google Vision TEXT_DETECTION on image bytes.

        Args:
            image_bytes: Encoded image content sent to the API as-is.
            config: OCR options; only ``char_whitelist`` is read here. When
                it is non-empty, characters outside it are removed from each
                word and from the full text (spaces and newlines are kept in
                the full text).

        Returns:
            The recognized text, word boxes and a confidence value. The
            result is empty with confidence ``0.0`` when the API returns no
            annotations; confidence is also ``0.0`` when no word box
            survives filtering.

        Raises:
            EngineUnavailableError: If the client cannot be initialized.
            EngineProcessingError: If the API reports an error or any other
                failure occurs while calling it or reading its response.
        """
        client = self._get_client()

        try:
            from google.cloud import vision  # type: ignore[import-untyped]

            image = vision.Image(content=image_bytes)
            response = client.text_detection(image=image)

            if response.error.message:
                raise EngineProcessingError(
                    f"Google Vision API error: {response.error.message}"
                )

            annotations = response.text_annotations
            if not annotations:
                return OcrEngineResult(
                    text="",
                    confidence=0.0,
                    word_boxes=[],
                    engine_name=self.name,
                )

            # Build the whitelist lookup once; it is shared by the per-word
            # filter and the full-text filter.
            allowed = (
                set(config.char_whitelist) if config.char_whitelist else None
            )

            # First annotation is the full-page text; the rest are words
            full_text = annotations[0].description.strip()
            if allowed is not None:
                full_text = "".join(
                    ch for ch in full_text if ch in allowed or ch in " \n"
                )

            word_boxes = self._build_word_boxes(annotations[1:], allowed)

            # Every word carries the same placeholder confidence, so the
            # mean is that value itself. Returning it directly avoids the
            # rounding drift of summing and dividing identical floats.
            confidence = self._ASSUMED_WORD_CONFIDENCE if word_boxes else 0.0

            return OcrEngineResult(
                text=full_text,
                confidence=confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )

        except (EngineUnavailableError, EngineProcessingError):
            raise
        except Exception as exc:
            raise EngineProcessingError(
                f"Google Vision recognition failed: {exc}"
            ) from exc

    def _build_word_boxes(
        self, word_annotations: Any, allowed: set[str] | None
    ) -> list[WordBox]:
        """Convert per-word annotations into axis-aligned ``WordBox`` items.

        Args:
            word_annotations: Iterable of word-level annotations, each
                exposing ``description`` and ``bounding_poly.vertices``.
            allowed: Characters to keep in each word, or ``None`` to keep
                every character.

        Returns:
            One ``WordBox`` per annotation that still has text after
            filtering and has at least one bounding vertex.
        """
        boxes: list[WordBox] = []

        for annotation in word_annotations:
            text = annotation.description
            if allowed is not None:
                text = "".join(ch for ch in text if ch in allowed)
            text = text.strip()
            if not text:
                continue

            vertices = annotation.bounding_poly.vertices
            xs = [v.x for v in vertices]
            ys = [v.y for v in vertices]
            if not xs:
                # No geometry to build a box from; skip this word rather
                # than let min()/max() on an empty list fail the whole image.
                logger.debug("Skipping word without bounding vertices: %r", text)
                continue

            x_min, y_min = min(xs), min(ys)
            x_max, y_max = max(xs), max(ys)

            boxes.append(
                WordBox(
                    text=text,
                    confidence=self._ASSUMED_WORD_CONFIDENCE,
                    x=x_min,
                    y=y_min,
                    width=x_max - x_min,
                    height=y_max - y_min,
                )
            )

        return boxes
|
||||
Reference in New Issue
Block a user