chore: update OCR tests and documentation (refs #121)

Add engine abstraction tests and update the docs to reflect the PaddleOCR-primary architecture with an optional Google Vision cloud fallback.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 11:42:51 -06:00
parent 1e96baca6f
commit 47c5676498
7 changed files with 870 additions and 68 deletions
--- a/.ai/context.json
+++ b/.ai/context.json
@@ -108,7 +108,7 @@
    },
    "mvp-ocr": {
      "type": "ocr_service",
-      "description": "Python-based OCR for document text extraction",
+      "description": "Python OCR service with pluggable engine abstraction (PaddleOCR PP-OCRv4 primary, optional Google Vision cloud fallback, Tesseract backward compat)",
      "port": 8000
    },
    "mvp-loki": {
--- a/docs/CLAUDE.md
+++ b/docs/CLAUDE.md
@@ -18,5 +18,5 @@
 | `AUDIT.md` | Audit documentation | Security audits, compliance |
 | `MVP-COLOR-SCHEME.md` | Color scheme reference | UI styling decisions |
 | `LOGGING.md` | Unified logging system | Log levels, correlation IDs, Grafana |
-| `ocr-pipeline-tech-stack.md` | OCR pipeline technology decisions | OCR architecture, Tesseract setup |
+| `ocr-pipeline-tech-stack.md` | OCR pipeline technology decisions | OCR architecture, PaddleOCR engine abstraction |
 | `TIER-GATING.md` | Subscription tier gating rules | Feature access by tier, vehicle limits |
--- a/docs/ocr-pipeline-tech-stack.md
+++ b/docs/ocr-pipeline-tech-stack.md
@@ -118,35 +118,48 @@
        │       ├─────────────────────────────────────────────────────────┤
        │       │                                                         │
        │       │   ┌─────────────────────────────────────────────────┐   │
-        │       │   │  5a. Primary OCR: Tesseract 5.x                 │   │
+        │       │   │  5a. Engine Abstraction Layer                    │   │
        │       │   │                                                  │   │
-        │       │   │  • Engine: LSTM (--oem 1)                       │   │
+        │       │   │  OcrEngine ABC -> PaddleOcrEngine (primary)      │   │
-        │       │   │  • Page segmentation: Auto (--psm 3)            │   │
+        │       │   │                -> CloudEngine (optional fallback) │   │
-        │       │   │  • Output: hOCR with word confidence            │   │
+        │       │   │                -> TesseractEngine (backward compat)│  │
        │       │   │                -> HybridEngine (primary+fallback) │   │
        │       │   └─────────────────────────────────────────────────┘   │
        │       │                         │                               │
        │       │                         ▼                               │
        │       │   ┌─────────────────────────────────────────────────┐   │
        │       │   │  5b. Primary OCR: PaddleOCR PP-OCRv4             │   │
        │       │   │                                                  │   │
        │       │   │  • Scene text detection + angle classification   │   │
        │       │   │  • CPU-only, models baked into Docker image      │   │
        │       │   │  • Normalized output: text, confidence, word boxes│  │
        │       │   └─────────────────────────────────────────────────┘   │
        │       │                         │                               │
        │       │                         ▼                               │
        │       │                 ┌───────────────┐                       │
        │       │                 │  Confidence   │                       │
-        │       │                 │    > 80% ?    │                       │
+        │       │                 │   >= 60% ?    │                       │
        │       │                 └───────────────┘                       │
        │       │                    │         │                          │
-        │       │              YES ──┘         └── NO                     │
+        │       │              YES ──┘         └── NO (and cloud enabled) │
        │       │               │                   │                     │
        │       │               │                   ▼                     │
        │       │               │   ┌─────────────────────────────────┐   │
-        │       │               │   │  5b. Fallback: PaddleOCR        │   │
+        │       │               │   │  5c. Optional Cloud Fallback     │   │
        │       │               │   │      (Google Vision API)         │   │
        │       │               │   │                                  │   │
-        │       │               │   │  • Better for degraded images   │   │
+        │       │               │   │  • Disabled by default           │   │
-        │       │               │   │  • Better table detection       │   │
+        │       │               │   │  • 5-second timeout guard        │   │
-        │       │               │   │  • Slower but more accurate     │   │
+        │       │               │   │  • Returns higher-confidence     │   │
        │       │               │   │    result of primary vs fallback │   │
        │       │               │   └─────────────────────────────────┘   │
        │       │               │                   │                     │
        │       │               ▼                   ▼                     │
        │       │         ┌─────────────────────────────────┐             │
-        │       │         │  5c. Result Merging             │             │
+        │       │         │  5d. HybridEngine Result        │             │
-        │       │         │  • Merge by bounding box        │             │
+        │       │         │  • Compare confidences          │             │
        │       │         │  • Keep highest confidence      │             │
        │       │         │  • Graceful fallback on error   │             │
        │       │         └─────────────────────────────────┘             │
        │       │                                                         │
        │       └─────────────────────────────────────────────────────────┘
@@ -257,10 +270,10 @@
 | Component              | Tool                  | Purpose                              |
 |------------------------|-----------------------|--------------------------------------|
-| **Primary OCR**        | Tesseract 5.x         | Fast, reliable text extraction       |
+| **Primary OCR**        | PaddleOCR PP-OCRv4    | Highest accuracy scene text, CPU-only |
-| **Python Binding**     | pytesseract           | Tesseract Python wrapper             |
+| **Cloud Fallback**     | Google Vision API     | Optional cloud fallback (disabled by default) |
-| **Fallback OCR**       | PaddleOCR             | Higher accuracy, better tables       |
+| **Backward Compat**    | Tesseract 5.x / pytesseract | Legacy engine, configurable via env var |
-| **Layout Analysis**    | PaddleOCR / LayoutParser | Document structure detection      |
+| **Engine Abstraction** | `OcrEngine` ABC       | Pluggable engine interface in `ocr/app/engines/` |
 ### Data Extraction
@@ -291,85 +304,93 @@
 fastapi>=0.100.0
 uvicorn[standard]>=0.23.0
 python-multipart>=0.0.6
-
+pydantic>=2.0.0
 # Task Queue
 celery>=5.3.0
 redis>=4.6.0
 # File Detection & Handling
 python-magic>=0.4.27
 pillow>=10.0.0
 pillow-heif>=0.13.0
 # PDF Processing
 pymupdf>=1.23.0
 # Image Preprocessing
 opencv-python-headless>=4.8.0
 deskew>=1.4.0
 scikit-image>=0.21.0
 numpy>=1.24.0
 # OCR Engines
 pytesseract>=0.3.10
-paddlepaddle>=2.5.0
+paddlepaddle>=2.6.0
-paddleocr>=2.7.0
+paddleocr>=2.8.0
 google-cloud-vision>=3.7.0
-# Table Extraction
+# PDF Processing
-img2table>=1.2.0
+PyMuPDF>=1.23.0
 camelot-py[cv]>=0.11.0
-# NLP & Data
+# Redis for job queue
-spacy>=3.6.0
+redis>=5.0.0
 pandas>=2.0.0
-# Storage & Database
+# HTTP client for callbacks
-boto3>=1.28.0
+httpx>=0.24.0
-psycopg2-binary>=2.9.0
+
-sqlalchemy>=2.0.0
+# Testing
 pytest>=7.4.0
 pytest-asyncio>=0.21.0
 ```
 ### System Package Requirements (Ubuntu/Debian)
 ```bash
-# Tesseract OCR
+# Tesseract OCR (backward compatibility engine)
-apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev
+apt-get install tesseract-ocr tesseract-ocr-eng
 # PaddlePaddle OpenMP runtime
 apt-get install libgomp1
 # HEIC Support
-apt-get install libheif-examples libheif-dev
+apt-get install libheif1 libheif-dev
-# OpenCV dependencies
+# GLib (OpenCV dependency)
-apt-get install libgl1-mesa-glx libglib2.0-0
+apt-get install libglib2.0-0
-# PDF rendering dependencies
+# File type detection
-apt-get install libmupdf-dev mupdf-tools
+apt-get install libmagic1
 # Image processing
 apt-get install libmagic1 ghostscript
 # Camelot dependencies
 apt-get install ghostscript python3-tk
 ```
 ### Environment Variables
 | Variable | Default | Description |
 |----------|---------|-------------|
 | `OCR_PRIMARY_ENGINE` | `paddleocr` | Primary OCR engine (`paddleocr`, `tesseract`) |
 | `OCR_CONFIDENCE_THRESHOLD` | `0.6` | Minimum confidence threshold |
 | `OCR_FALLBACK_ENGINE` | `none` | Fallback engine (`google_vision`, `none`) |
 | `OCR_FALLBACK_THRESHOLD` | `0.6` | Confidence below this triggers fallback |
 | `GOOGLE_VISION_KEY_PATH` | `/run/secrets/google-vision-key.json` | Path to Google Vision service account key |
 ---
 ## DOCKERFILE
 ```dockerfile
-FROM python:3.11-slim
+# Primary engine: PaddleOCR PP-OCRv4 (models baked into image)
 # Backward compat: Tesseract 5.x (optional, via TesseractEngine)
 # Cloud fallback: Google Vision (optional, requires API key at runtime)
 FROM python:3.13-slim
 # System dependencies
 # - tesseract-ocr/eng: Backward-compatible OCR engine
 # - libgomp1: OpenMP runtime required by PaddlePaddle
 # - libheif1/libheif-dev: HEIF image support (iPhone photos)
 # - libglib2.0-0: GLib shared library (OpenCV dependency)
 # - libmagic1: File type detection
 # - curl: Health check endpoint
 RUN apt-get update && apt-get install -y --no-install-recommends \
    tesseract-ocr \
    tesseract-ocr-eng \
-    libtesseract-dev \
+    libgomp1 \
-    libheif-examples \
+    libheif1 \
    libheif-dev \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libmagic1 \
-    ghostscript \
+    curl \
    poppler-utils \
    && rm -rf /var/lib/apt/lists/*
 # Python dependencies
@@ -377,11 +398,9 @@ WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
-# Download spaCy model
+# Pre-download PaddleOCR PP-OCRv4 models during build (not at runtime)
-RUN python -m spacy download en_core_web_sm
+RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=False)" \
-
+    && echo "PaddleOCR PP-OCRv4 models downloaded and verified"
 # Download PaddleOCR models (cached in image)
 RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en')"
 COPY . .
--- a/ocr/CLAUDE.md
+++ b/ocr/CLAUDE.md
@@ -1,10 +1,12 @@
 # ocr/
 Python OCR microservice. Primary engine: PaddleOCR PP-OCRv4 with optional Google Vision cloud fallback. Pluggable engine abstraction in `app/engines/`.
 ## Files
 | File | What | When to read |
 | ---- | ---- | ------------ |
-| `Dockerfile` | Container build definition | Docker builds, deployment |
+| `Dockerfile` | Container build (PaddleOCR models baked in) | Docker builds, deployment |
 | `requirements.txt` | Python dependencies | Adding dependencies |
 ## Subdirectories
@@ -12,4 +14,5 @@
 | Directory | What | When to read |
 | --------- | ---- | ------------ |
 | `app/` | FastAPI application source | OCR endpoint development |
 | `app/engines/` | Engine abstraction layer (OcrEngine ABC, factory, hybrid) | Adding or changing OCR engines |
 | `tests/` | Test suite | Adding or modifying tests |
--- a/ocr/app/CLAUDE.md
+++ b/ocr/app/CLAUDE.md
@@ -12,6 +12,7 @@
 | Directory | What | When to read |
 | --------- | ---- | ------------ |
 | `engines/` | OCR engine abstraction (PaddleOCR primary, Google Vision fallback, Tesseract compat) | Engine changes, adding new engines |
 | `extractors/` | Data extraction logic | Adding new extraction types |
 | `models/` | Data models and schemas | Request/response types |
 | `patterns/` | Regex and parsing patterns | Pattern matching rules |
--- a/ocr/tests/test_engine_abstraction.py
+++ b/ocr/tests/test_engine_abstraction.py
@@ -0,0 +1,675 @@
 """Tests for OCR engine abstraction layer.
 Covers: base types, exception hierarchy, PaddleOcrEngine,
 TesseractEngine, CloudEngine, HybridEngine, and engine_factory.
 """
 import io
 from unittest.mock import MagicMock, patch
 import pytest
 from PIL import Image
 from app.engines.base_engine import (
    EngineError,
    EngineProcessingError,
    EngineUnavailableError,
    OcrConfig,
    OcrEngine,
    OcrEngineResult,
    WordBox,
 )
 # --- Helpers ---
 def _create_test_image_bytes() -> bytes:
    """Return the PNG encoding of a blank white 100x50 RGB image."""
    blank = Image.new("RGB", (100, 50), (255, 255, 255))
    with io.BytesIO() as stream:
        blank.save(stream, format="PNG")
        return stream.getvalue()
 def _make_result(text: str, confidence: float, engine_name: str) -> OcrEngineResult:
    """Build an OcrEngineResult without word boxes from the given values."""
    fields = {
        "text": text,
        "confidence": confidence,
        "word_boxes": [],
        "engine_name": engine_name,
    }
    return OcrEngineResult(**fields)
 # ---------------------------------------------------------------------------
 # Exception hierarchy
 # ---------------------------------------------------------------------------
 class TestExceptionHierarchy:
    """EngineError is the shared base class of the engine-specific errors."""
    def test_unavailable_is_engine_error(self) -> None:
        """EngineUnavailableError derives from EngineError."""
        assert issubclass(EngineUnavailableError, EngineError)
    def test_processing_is_engine_error(self) -> None:
        """EngineProcessingError derives from EngineError."""
        assert issubclass(EngineProcessingError, EngineError)
    def test_engine_error_is_exception(self) -> None:
        """EngineError itself is an ordinary Exception subclass."""
        assert issubclass(EngineError, Exception)
    def test_catch_base_catches_subtypes(self) -> None:
        """Expecting the base class accepts each raised subtype in turn."""
        raised = (
            EngineUnavailableError("not installed"),
            EngineProcessingError("OCR failed"),
        )
        for error in raised:
            with pytest.raises(EngineError):
                raise error
 # ---------------------------------------------------------------------------
 # Data types
 # ---------------------------------------------------------------------------
 class TestWordBox:
    """WordBox holds text, confidence and box geometry (x, y, width, height)."""
    def test_default_positions(self) -> None:
        """Geometry fields are 0 when only text and confidence are supplied."""
        wb = WordBox(text="VIN", confidence=0.95)
        assert wb.x == 0
        assert wb.y == 0
        assert wb.width == 0
        assert wb.height == 0
    def test_all_fields(self) -> None:
        """Every explicitly supplied field reads back unchanged."""
        wb = WordBox(text="ABC", confidence=0.88, x=10, y=20, width=100, height=30)
        assert wb.text == "ABC"
        assert wb.confidence == 0.88
        assert wb.x == 10
        # y and height are checked too, so a swapped or dropped field is caught.
        assert wb.y == 20
        assert wb.width == 100
        assert wb.height == 30
 class TestOcrConfig:
    """Default values and per-instance isolation of OcrConfig."""
    def test_defaults(self) -> None:
        """A bare OcrConfig has no whitelist, both modes off, angle cls on, no hints."""
        cfg = OcrConfig()
        assert cfg.char_whitelist is None
        assert cfg.single_line is False
        assert cfg.single_word is False
        assert cfg.use_angle_cls is True
        assert cfg.hints == {}
    def test_vin_whitelist_excludes_ioq(self) -> None:
        """The VIN whitelist stored on the config contains none of I, O, Q."""
        vin_chars = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
        cfg = OcrConfig(char_whitelist=vin_chars)
        for banned in ("I", "O", "Q"):
            assert banned not in cfg.char_whitelist
    def test_hints_are_independent_across_instances(self) -> None:
        """Mutating one instance's hints must not show up on another instance."""
        first = OcrConfig()
        second = OcrConfig()
        first.hints["psm"] = 7
        assert "psm" not in second.hints
 class TestOcrEngineResult:
    """OcrEngineResult exposes the values it was constructed with."""
    def test_construction(self) -> None:
        """All four fields are readable after construction."""
        vin = "1HGBH41JXMN109186"
        boxes = [WordBox(text=vin, confidence=0.94)]
        outcome = OcrEngineResult(
            text=vin,
            confidence=0.94,
            word_boxes=boxes,
            engine_name="paddleocr",
        )
        assert outcome.text == "1HGBH41JXMN109186"
        assert outcome.confidence == 0.94
        assert len(outcome.word_boxes) == 1
        assert outcome.engine_name == "paddleocr"
    def test_empty_result(self) -> None:
        """An empty result keeps its empty text and empty word-box list."""
        outcome = OcrEngineResult(
            text="", confidence=0.0, word_boxes=[], engine_name="tesseract"
        )
        assert outcome.text == ""
        assert outcome.word_boxes == []
 # ---------------------------------------------------------------------------
 # OcrEngine ABC
 # ---------------------------------------------------------------------------
 class TestOcrEngineABC:
    """OcrEngine is abstract; a subclass supplying name and recognize is usable."""
    def test_cannot_instantiate_directly(self) -> None:
        """Instantiating the abstract base raises TypeError."""
        with pytest.raises(TypeError):
            OcrEngine()  # type: ignore[abstract]
    def test_concrete_subclass_works(self) -> None:
        """A subclass implementing both members can be created and called."""
        class FixedEngine(OcrEngine):
            """Engine that always returns the same canned result."""
            @property
            def name(self) -> str:
                return "stub"
            def recognize(
                self, image_bytes: bytes, config: OcrConfig
            ) -> OcrEngineResult:
                canned = OcrEngineResult(
                    text="ok", confidence=1.0, word_boxes=[], engine_name="stub"
                )
                return canned
        stub = FixedEngine()
        assert stub.name == "stub"
        outcome = stub.recognize(b"", OcrConfig())
        assert outcome.text == "ok"
 # ---------------------------------------------------------------------------
 # PaddleOcrEngine
 # ---------------------------------------------------------------------------
 class TestPaddleOcrEngine:
    """PaddleOcrEngine behaviour, exercised through a mocked backend in _ocr."""
    def test_name(self) -> None:
        """The engine reports 'paddleocr' as its name."""
        from app.engines.paddle_engine import PaddleOcrEngine
        engine = PaddleOcrEngine()
        assert engine.name == "paddleocr"
    def test_lazy_init_not_loaded_at_construction(self) -> None:
        """Construction alone leaves the backend handle (_ocr) unset."""
        from app.engines.paddle_engine import PaddleOcrEngine
        engine = PaddleOcrEngine()
        assert engine._ocr is None
    def test_recognize_empty_results(self) -> None:
        """A backend return of [None] yields an empty, zero-confidence result."""
        from app.engines.paddle_engine import PaddleOcrEngine
        engine = PaddleOcrEngine()
        mock_ocr = MagicMock()
        mock_ocr.ocr.return_value = [None]
        engine._ocr = mock_ocr
        result = engine.recognize(_create_test_image_bytes(), OcrConfig())
        assert result.text == ""
        assert result.confidence == 0.0
        assert result.word_boxes == []
        assert result.engine_name == "paddleocr"
    def test_recognize_with_results(self) -> None:
        """Two detections give space-joined text and a confidence near 0.915."""
        from app.engines.paddle_engine import PaddleOcrEngine
        engine = PaddleOcrEngine()
        mock_ocr = MagicMock()
        # Each detection is [quadrilateral corner points, (text, confidence)].
        mock_ocr.ocr.return_value = [
            [
                [[[10, 20], [110, 20], [110, 50], [10, 50]], ("HELLO", 0.95)],
                [[[10, 60], [110, 60], [110, 90], [10, 90]], ("WORLD", 0.88)],
            ]
        ]
        engine._ocr = mock_ocr
        result = engine.recognize(_create_test_image_bytes(), OcrConfig())
        assert result.text == "HELLO WORLD"
        # 0.915 is the mean of the two detection confidences (0.95 and 0.88).
        assert abs(result.confidence - 0.915) < 0.01
        assert len(result.word_boxes) == 2
        assert result.word_boxes[0].text == "HELLO"
        assert result.word_boxes[0].confidence == 0.95
        assert result.word_boxes[1].text == "WORLD"
        assert result.engine_name == "paddleocr"
    def test_recognize_whitelist_filters_characters(self) -> None:
        """Characters outside char_whitelist are removed from recognized text."""
        from app.engines.paddle_engine import PaddleOcrEngine
        engine = PaddleOcrEngine()
        mock_ocr = MagicMock()
        mock_ocr.ocr.return_value = [
            [
                [[[0, 0], [100, 0], [100, 30], [0, 30]], ("1HG-BH4!", 0.9)],
            ]
        ]
        engine._ocr = mock_ocr
        config = OcrConfig(char_whitelist="ABCDEFGHJKLMNPRSTUVWXYZ0123456789")
        result = engine.recognize(_create_test_image_bytes(), config)
        assert "-" not in result.text
        assert "!" not in result.text
        assert result.word_boxes[0].text == "1HGBH4"
    def test_recognize_quadrilateral_to_bounding_box(self) -> None:
        """A rotated quad is reduced to its axis-aligned bounding box."""
        from app.engines.paddle_engine import PaddleOcrEngine
        engine = PaddleOcrEngine()
        mock_ocr = MagicMock()
        # Slightly rotated quad: min x=8, min y=20, max x=110, max y=55
        mock_ocr.ocr.return_value = [
            [
                [[[10, 20], [110, 25], [108, 55], [8, 50]], ("TEXT", 0.9)],
            ]
        ]
        engine._ocr = mock_ocr
        result = engine.recognize(_create_test_image_bytes(), OcrConfig())
        wb = result.word_boxes[0]
        assert wb.x == 8
        assert wb.y == 20
        assert wb.width == 102  # 110 - 8
        assert wb.height == 35  # 55 - 20
    def test_recognize_skips_empty_after_whitelist(self) -> None:
        """Text consisting only of non-whitelisted characters is skipped."""
        from app.engines.paddle_engine import PaddleOcrEngine
        engine = PaddleOcrEngine()
        mock_ocr = MagicMock()
        mock_ocr.ocr.return_value = [
            [
                [[[0, 0], [50, 0], [50, 20], [0, 20]], ("---", 0.9)],
            ]
        ]
        engine._ocr = mock_ocr
        config = OcrConfig(char_whitelist="ABC")
        result = engine.recognize(_create_test_image_bytes(), config)
        assert result.text == ""
        assert result.word_boxes == []
        assert result.confidence == 0.0
    def test_import_error_raises_unavailable(self) -> None:
        """_get_ocr() raises EngineUnavailableError when paddleocr is not importable."""
        from app.engines.paddle_engine import PaddleOcrEngine
        engine = PaddleOcrEngine()
        engine._ocr = None
        # A None entry in sys.modules makes both `import paddleocr` and
        # importlib.import_module("paddleocr") raise ImportError, so neither
        # builtins.__import__ nor importlib.import_module has to be replaced
        # process-wide (which can interfere with mock.patch's own target lookup).
        with patch.dict("sys.modules", {"paddleocr": None}):
            with pytest.raises(EngineUnavailableError, match="paddleocr"):
                engine._get_ocr()
    def test_processing_error_on_exception(self) -> None:
        """A backend exception is surfaced as EngineProcessingError."""
        from app.engines.paddle_engine import PaddleOcrEngine
        engine = PaddleOcrEngine()
        mock_ocr = MagicMock()
        mock_ocr.ocr.side_effect = RuntimeError("OCR crashed")
        engine._ocr = mock_ocr
        with pytest.raises(EngineProcessingError, match="PaddleOCR recognition failed"):
            engine.recognize(_create_test_image_bytes(), OcrConfig())
 # ---------------------------------------------------------------------------
 # TesseractEngine
 # ---------------------------------------------------------------------------
 class TestTesseractEngine:
    """Tests for TesseractEngine using mocked pytesseract."""
    @pytest.fixture()
    def engine(self) -> "TesseractEngine":  # type: ignore[name-defined]
        """Create a TesseractEngine with mocked pytesseract dependency."""
        mock_pytesseract = MagicMock()
        mock_pytesseract.Output.DICT = "dict"
        # NOTE(review): both patches end when this fixture returns, so only the
        # references captured by the engine during construction stay mocked.
        with patch.dict("sys.modules", {"pytesseract": mock_pytesseract}):
            with patch("app.engines.tesseract_engine.settings") as mock_settings:
                mock_settings.tesseract_cmd = "/usr/bin/tesseract"
                from app.engines.tesseract_engine import TesseractEngine
                eng = TesseractEngine()
                eng._mock_pytesseract = mock_pytesseract  # type: ignore[attr-defined]
                return eng
    def test_name(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        """The engine reports 'tesseract' as its name."""
        assert engine.name == "tesseract"
    def test_build_config_default_psm(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        """A default OcrConfig produces page segmentation mode 6."""
        config_str = engine._build_config(OcrConfig())
        assert "--psm 6" in config_str
    def test_build_config_single_line(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        """single_line=True produces page segmentation mode 7."""
        config_str = engine._build_config(OcrConfig(single_line=True))
        assert "--psm 7" in config_str
    def test_build_config_single_word(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        """single_word=True produces page segmentation mode 8."""
        config_str = engine._build_config(OcrConfig(single_word=True))
        assert "--psm 8" in config_str
    def test_build_config_whitelist(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        """char_whitelist is forwarded as tessedit_char_whitelist."""
        config_str = engine._build_config(OcrConfig(char_whitelist="ABC123"))
        assert "-c tessedit_char_whitelist=ABC123" in config_str
    def test_build_config_psm_hint(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        """An explicit psm hint appears in the generated config string."""
        config_str = engine._build_config(OcrConfig(hints={"psm": 11}))
        assert "--psm 11" in config_str
    def test_recognize_normalizes_confidence(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        """Tesseract returns 0-100 confidence; engine normalizes to 0.0-1.0."""
        # All columns have one entry per row, as in real image_to_data output;
        # the second row is an empty-text entry with conf -1.
        engine._pytesseract.image_to_data.return_value = {
            "text": ["HELLO", ""],
            "conf": [92, -1],
            "left": [10, 0],
            "top": [20, 0],
            "width": [100, 0],
            "height": [30, 0],
        }
        result = engine.recognize(_create_test_image_bytes(), OcrConfig())
        assert result.text == "HELLO"
        assert abs(result.confidence - 0.92) < 0.01
        assert result.engine_name == "tesseract"
    def test_import_error_raises_unavailable(self) -> None:
        """Constructing the engine without pytesseract raises EngineUnavailableError."""
        # A None entry in sys.modules makes any import of pytesseract raise
        # ImportError. builtins.__import__ is deliberately not patched: a
        # side_effect that calls __import__ would resolve to the patched mock
        # and recurse on every other import made inside the block.
        with patch.dict("sys.modules", {"pytesseract": None}):
            with patch("app.engines.tesseract_engine.settings") as mock_settings:
                mock_settings.tesseract_cmd = "/usr/bin/tesseract"
                from app.engines.tesseract_engine import TesseractEngine
                with pytest.raises(EngineUnavailableError, match="pytesseract"):
                    TesseractEngine()
 # ---------------------------------------------------------------------------
 # CloudEngine
 # ---------------------------------------------------------------------------
 class TestCloudEngine:
    """CloudEngine behaviour with the Google Vision client and library mocked."""
    def test_name(self) -> None:
        """The engine reports 'google_vision' as its name."""
        from app.engines.cloud_engine import CloudEngine
        engine = CloudEngine(key_path="/fake/path.json")
        assert engine.name == "google_vision"
    def test_lazy_init_not_loaded_at_construction(self) -> None:
        """Construction alone leaves the client handle (_client) unset."""
        from app.engines.cloud_engine import CloudEngine
        engine = CloudEngine(key_path="/fake/path.json")
        assert engine._client is None
    def test_missing_key_file_raises_unavailable(self) -> None:
        """A key path that does not exist makes _get_client() raise."""
        from app.engines.cloud_engine import CloudEngine
        engine = CloudEngine(key_path="/nonexistent/key.json")
        with pytest.raises(EngineUnavailableError, match="key not found"):
            engine._get_client()
    @patch("os.path.isfile", return_value=True)
    def test_missing_library_raises_unavailable(self, _mock_isfile: MagicMock) -> None:
        """A failing google.cloud import makes _get_client() raise."""
        import builtins
        from app.engines.cloud_engine import CloudEngine
        engine = CloudEngine(key_path="/fake/key.json")
        # Bind the real import function before patching. Looking up __import__
        # inside mock_import would resolve to the patched mock and recurse for
        # every module other than google.cloud.
        real_import = builtins.__import__
        def mock_import(name, *args, **kwargs):
            if "google.cloud" in name:
                raise ImportError("No module named 'google.cloud'")
            return real_import(name, *args, **kwargs)
        with patch("builtins.__import__", side_effect=mock_import):
            with pytest.raises(EngineUnavailableError, match="google-cloud-vision"):
                engine._get_client()
    def test_recognize_empty_annotations(self) -> None:
        """A response without text annotations yields an empty result."""
        from app.engines.cloud_engine import CloudEngine
        engine = CloudEngine(key_path="/fake/key.json")
        mock_client = MagicMock()
        mock_response = MagicMock()
        mock_response.error.message = ""
        mock_response.text_annotations = []
        mock_client.text_detection.return_value = mock_response
        engine._client = mock_client
        # Mock the google.cloud.vision import inside recognize()
        mock_vision = MagicMock()
        fake_modules = {
            "google.cloud.vision": mock_vision,
            "google.cloud": MagicMock(),
            "google": MagicMock(),
        }
        with patch.dict("sys.modules", fake_modules):
            result = engine.recognize(b"fake_image", OcrConfig())
        assert result.text == ""
        assert result.confidence == 0.0
        assert result.engine_name == "google_vision"
    def test_recognize_api_error_raises_processing_error(self) -> None:
        """A non-empty response error message raises EngineProcessingError."""
        from app.engines.cloud_engine import CloudEngine
        engine = CloudEngine(key_path="/fake/key.json")
        mock_client = MagicMock()
        mock_response = MagicMock()
        mock_response.error.message = "API quota exceeded"
        mock_client.text_detection.return_value = mock_response
        engine._client = mock_client
        mock_vision = MagicMock()
        fake_modules = {
            "google.cloud.vision": mock_vision,
            "google.cloud": MagicMock(),
            "google": MagicMock(),
        }
        with patch.dict("sys.modules", fake_modules):
            with pytest.raises(EngineProcessingError, match="API quota exceeded"):
                engine.recognize(b"fake_image", OcrConfig())
 # ---------------------------------------------------------------------------
 # HybridEngine
 # ---------------------------------------------------------------------------
 class TestHybridEngine:
    def test_name_with_fallback(self) -> None:
        """The hybrid name embeds both the primary and the fallback name."""
        from app.engines.hybrid_engine import HybridEngine
        main_engine = MagicMock(spec=OcrEngine)
        backup_engine = MagicMock(spec=OcrEngine)
        main_engine.name = "paddleocr"
        backup_engine.name = "google_vision"
        hybrid = HybridEngine(primary=main_engine, fallback=backup_engine)
        assert hybrid.name == "hybrid(paddleocr+google_vision)"
    def test_name_without_fallback(self) -> None:
        """Without a fallback the hybrid name uses 'none' in its place."""
        from app.engines.hybrid_engine import HybridEngine
        main_engine = MagicMock(spec=OcrEngine)
        main_engine.name = "paddleocr"
        hybrid = HybridEngine(primary=main_engine)
        assert hybrid.name == "hybrid(paddleocr+none)"
    def test_high_confidence_skips_fallback(self) -> None:
        """A primary result above the threshold is returned without a fallback call."""
        from app.engines.hybrid_engine import HybridEngine
        main_engine = MagicMock(spec=OcrEngine)
        main_engine.name = "paddleocr"
        main_engine.recognize.return_value = _make_result("VIN123", 0.95, "paddleocr")
        backup_engine = MagicMock(spec=OcrEngine)
        backup_engine.name = "cloud"
        hybrid = HybridEngine(primary=main_engine, fallback=backup_engine, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())
        assert outcome.text == "VIN123"
        assert outcome.engine_name == "paddleocr"
        backup_engine.recognize.assert_not_called()
    def test_low_confidence_triggers_fallback(self) -> None:
        """A primary result below the threshold yields the stronger fallback result."""
        from app.engines.hybrid_engine import HybridEngine
        main_engine = MagicMock(spec=OcrEngine)
        main_engine.name = "paddleocr"
        main_engine.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
        backup_engine = MagicMock(spec=OcrEngine)
        backup_engine.name = "google_vision"
        backup_engine.recognize.return_value = _make_result(
            "VIN456", 0.92, "google_vision"
        )
        hybrid = HybridEngine(primary=main_engine, fallback=backup_engine, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())
        assert outcome.text == "VIN456"
        assert outcome.engine_name == "google_vision"
        backup_engine.recognize.assert_called_once()
    def test_low_confidence_no_fallback_returns_primary(self) -> None:
        """With no fallback configured the low-confidence primary result is kept."""
        from app.engines.hybrid_engine import HybridEngine
        main_engine = MagicMock(spec=OcrEngine)
        main_engine.name = "paddleocr"
        main_engine.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
        hybrid = HybridEngine(primary=main_engine, fallback=None, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())
        assert outcome.text == "VIN123"
    def test_fallback_lower_confidence_returns_primary(self) -> None:
        """When the fallback scores below the primary, the primary text is returned."""
        from app.engines.hybrid_engine import HybridEngine

        primary_result = _make_result("VIN123", 0.4, "paddleocr")
        worse_fallback_result = _make_result("VIN456", 0.3, "google_vision")

        primary_engine = MagicMock(spec=OcrEngine)
        primary_engine.name = "paddleocr"
        primary_engine.recognize.return_value = primary_result
        fallback_engine = MagicMock(spec=OcrEngine)
        fallback_engine.name = "google_vision"
        fallback_engine.recognize.return_value = worse_fallback_result

        hybrid = HybridEngine(
            primary=primary_engine,
            fallback=fallback_engine,
            threshold=0.6,
        )
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN123"
    def test_fallback_engine_error_returns_primary(self) -> None:
        """An EngineUnavailableError from the fallback leaves the primary text as the result."""
        from app.engines.hybrid_engine import HybridEngine

        primary_engine = MagicMock(spec=OcrEngine)
        primary_engine.name = "paddleocr"
        primary_engine.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")

        # The fallback raises instead of returning a result.
        failing_fallback = MagicMock(spec=OcrEngine)
        failing_fallback.name = "google_vision"
        failing_fallback.recognize.side_effect = EngineUnavailableError("key missing")

        hybrid = HybridEngine(
            primary=primary_engine,
            fallback=failing_fallback,
            threshold=0.6,
        )
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN123"
    def test_fallback_unexpected_error_returns_primary(self) -> None:
        """A RuntimeError raised by the fallback leaves the primary text as the result."""
        from app.engines.hybrid_engine import HybridEngine

        primary_engine = MagicMock(spec=OcrEngine)
        primary_engine.name = "paddleocr"
        primary_engine.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")

        # A non-engine exception type, unlike the EngineUnavailableError case.
        crashing_fallback = MagicMock(spec=OcrEngine)
        crashing_fallback.name = "google_vision"
        crashing_fallback.recognize.side_effect = RuntimeError("network error")

        hybrid = HybridEngine(
            primary=primary_engine,
            fallback=crashing_fallback,
            threshold=0.6,
        )
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN123"
    @patch("app.engines.hybrid_engine.time")
    def test_fallback_timeout_returns_primary(self, mock_time: MagicMock) -> None:
        """A fallback that takes too long is discarded in favour of the primary result.

        The ``time`` name inside ``hybrid_engine`` is patched so that two
        consecutive ``monotonic()`` readings are 6 seconds apart.
        """
        from app.engines.hybrid_engine import HybridEngine

        # Clock readings 6 s apart (exceeds the 5s limit).
        clock_readings = [0.0, 6.0]
        mock_time.monotonic.side_effect = clock_readings

        primary_engine = MagicMock(spec=OcrEngine)
        primary_engine.name = "paddleocr"
        primary_engine.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
        slow_fallback = MagicMock(spec=OcrEngine)
        slow_fallback.name = "google_vision"
        slow_fallback.recognize.return_value = _make_result("VIN456", 0.92, "google_vision")

        hybrid = HybridEngine(
            primary=primary_engine,
            fallback=slow_fallback,
            threshold=0.6,
        )
        outcome = hybrid.recognize(b"img", OcrConfig())

        # Timed out, so the more confident fallback answer is not used.
        assert outcome.text == "VIN123"
    def test_exact_threshold_skips_fallback(self) -> None:
        """A primary confidence exactly equal to the threshold does not invoke the fallback."""
        from app.engines.hybrid_engine import HybridEngine

        borderline_result = _make_result("VIN", 0.6, "paddleocr")

        primary_engine = MagicMock(spec=OcrEngine)
        primary_engine.name = "paddleocr"
        primary_engine.recognize.return_value = borderline_result
        fallback_engine = MagicMock(spec=OcrEngine)
        fallback_engine.name = "cloud"

        hybrid = HybridEngine(
            primary=primary_engine,
            fallback=fallback_engine,
            threshold=0.6,
        )
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.engine_name == "paddleocr"
        fallback_engine.recognize.assert_not_called()
 # ---------------------------------------------------------------------------
 # Engine factory
 # ---------------------------------------------------------------------------
 class TestEngineFactory:
    """Tests for ``create_engine`` and ``_create_single_engine`` in ``engine_factory``."""

    def test_unknown_engine_raises(self) -> None:
        """An unrecognised engine name raises EngineUnavailableError matching "Unknown engine"."""
        from app.engines.engine_factory import _create_single_engine

        expected_failure = pytest.raises(EngineUnavailableError, match="Unknown engine")
        with expected_failure:
            _create_single_engine("nonexistent")

    @patch("app.engines.engine_factory.settings")
    @patch("app.engines.engine_factory._create_single_engine")
    def test_defaults_to_settings_primary(
        self, mock_create: MagicMock, mock_settings: MagicMock
    ) -> None:
        """Called with no argument, create_engine builds the engine named in settings."""
        from app.engines.engine_factory import create_engine

        mock_settings.configure_mock(
            ocr_primary_engine="paddleocr",
            ocr_fallback_engine="none",
        )
        built_engine = MagicMock(spec=OcrEngine)
        mock_create.return_value = built_engine

        produced = create_engine()

        mock_create.assert_called_once_with("paddleocr")
        assert produced == built_engine

    @patch("app.engines.engine_factory.settings")
    @patch("app.engines.engine_factory._create_single_engine")
    def test_explicit_name_overrides_settings(
        self, mock_create: MagicMock, mock_settings: MagicMock
    ) -> None:
        """An explicit engine name is the one handed to _create_single_engine."""
        from app.engines.engine_factory import create_engine

        mock_settings.ocr_fallback_engine = "none"
        mock_create.return_value = MagicMock(spec=OcrEngine)

        create_engine("tesseract")

        mock_create.assert_called_once_with("tesseract")

    @patch("app.engines.engine_factory.settings")
    @patch("app.engines.engine_factory._create_single_engine")
    def test_creates_hybrid_when_fallback_configured(
        self, mock_create: MagicMock, mock_settings: MagicMock
    ) -> None:
        """With a fallback engine configured, create_engine returns a HybridEngine."""
        from app.engines.engine_factory import create_engine
        from app.engines.hybrid_engine import HybridEngine

        mock_settings.configure_mock(
            ocr_primary_engine="paddleocr",
            ocr_fallback_engine="google_vision",
            ocr_fallback_threshold=0.7,
        )
        # Successive factory calls yield these two engines, in this order.
        primary_stub = MagicMock(spec=OcrEngine)
        fallback_stub = MagicMock(spec=OcrEngine)
        mock_create.side_effect = [primary_stub, fallback_stub]

        produced = create_engine()

        assert isinstance(produced, HybridEngine)

    @patch("app.engines.engine_factory.settings")
    @patch("app.engines.engine_factory._create_single_engine")
    def test_fallback_failure_returns_primary_only(
        self, mock_create: MagicMock, mock_settings: MagicMock
    ) -> None:
        """If the second engine creation raises, the first created engine is returned."""
        from app.engines.engine_factory import create_engine

        mock_settings.configure_mock(
            ocr_primary_engine="paddleocr",
            ocr_fallback_engine="google_vision",
            ocr_fallback_threshold=0.6,
        )
        primary_stub = MagicMock(spec=OcrEngine)
        # First call succeeds; the second raises EngineUnavailableError.
        mock_create.side_effect = [primary_stub, EngineUnavailableError("no key")]

        produced = create_engine()

        assert produced == primary_stub
--- a/ocr/tests/test_vin_extraction.py
+++ b/ocr/tests/test_vin_extraction.py
@@ -1,11 +1,12 @@
-"""Integration tests for VIN extraction endpoint."""
+"""Integration tests for VIN extraction endpoint and engine integration."""
 import io
 from unittest.mock import patch, MagicMock
 import pytest
 from fastapi.testclient import TestClient
-from PIL import Image, ImageDraw, ImageFont
+from PIL import Image, ImageDraw
 from app.engines.base_engine import OcrConfig, OcrEngineResult, WordBox
 from app.main import app
@@ -240,3 +241,106 @@ class TestVinExtractionContentTypes:
        )
        assert response.status_code == 200
 # ---------------------------------------------------------------------------
 # VIN extractor engine integration tests
 # ---------------------------------------------------------------------------
 class TestVinExtractorEngineIntegration:
    """Tests verifying VinExtractor integrates correctly with engine abstraction."""
    @patch("app.extractors.vin_extractor.create_engine")
    def test_perform_ocr_calls_engine_with_vin_config(
        self, mock_create_engine: MagicMock
    ) -> None:
        """_perform_ocr hands the engine an OcrConfig with the VIN whitelist and angle_cls on."""
        from app.extractors.vin_extractor import VinExtractor

        canned_result = OcrEngineResult(
            text="1HGBH41JXMN109186",
            confidence=0.94,
            word_boxes=[WordBox(text="1HGBH41JXMN109186", confidence=0.94)],
            engine_name="paddleocr",
        )
        stub_engine = MagicMock()
        stub_engine.recognize.return_value = canned_result
        mock_create_engine.return_value = stub_engine

        vin_extractor = VinExtractor()
        ocr_text, word_confidences = vin_extractor._perform_ocr(b"fake_image")

        stub_engine.recognize.assert_called_once()
        # The config is the second positional argument of recognize().
        positional_args = stub_engine.recognize.call_args[0]
        passed_config = positional_args[1]
        assert isinstance(passed_config, OcrConfig)
        assert passed_config.char_whitelist == VinExtractor.VIN_WHITELIST
        assert passed_config.use_angle_cls is True
        assert passed_config.single_line is False
        assert passed_config.single_word is False

        # The returned text and per-word confidences mirror the engine result.
        assert ocr_text == "1HGBH41JXMN109186"
        assert word_confidences == [0.94]
    @patch("app.extractors.vin_extractor.create_engine")
    def test_perform_ocr_single_line_mode(
        self, mock_create_engine: MagicMock
    ) -> None:
        """single_line=True reaches the engine config while single_word stays False."""
        from app.extractors.vin_extractor import VinExtractor

        canned_result = OcrEngineResult(
            text="VIN123",
            confidence=0.9,
            word_boxes=[],
            engine_name="paddleocr",
        )
        stub_engine = MagicMock()
        stub_engine.recognize.return_value = canned_result
        mock_create_engine.return_value = stub_engine

        vin_extractor = VinExtractor()
        vin_extractor._perform_ocr(b"img", single_line=True)

        # The config is the second positional argument of recognize().
        positional_args = stub_engine.recognize.call_args[0]
        passed_config = positional_args[1]
        assert passed_config.single_line is True
        assert passed_config.single_word is False
    @patch("app.extractors.vin_extractor.create_engine")
    def test_perform_ocr_single_word_mode(
        self, mock_create_engine: MagicMock
    ) -> None:
        """single_word=True reaches the engine config while single_line stays False."""
        from app.extractors.vin_extractor import VinExtractor

        canned_result = OcrEngineResult(
            text="VIN123",
            confidence=0.9,
            word_boxes=[],
            engine_name="paddleocr",
        )
        stub_engine = MagicMock()
        stub_engine.recognize.return_value = canned_result
        mock_create_engine.return_value = stub_engine

        vin_extractor = VinExtractor()
        vin_extractor._perform_ocr(b"img", single_word=True)

        # The config is the second positional argument of recognize().
        positional_args = stub_engine.recognize.call_args[0]
        passed_config = positional_args[1]
        assert passed_config.single_word is True
        assert passed_config.single_line is False
    def test_calculate_base_confidence_empty_returns_default(self) -> None:
        """With no word confidences at all, the base confidence is 0.5."""
        from app.extractors.vin_extractor import VinExtractor

        # __new__ skips __init__, so the extractor is built without running its constructor.
        bare_extractor = VinExtractor.__new__(VinExtractor)

        no_confidences = []
        assert bare_extractor._calculate_base_confidence(no_confidences) == 0.5
    def test_calculate_base_confidence_weighted_blend(self) -> None:
        """Two confidences blend as 70% of their average plus 30% of their minimum."""
        from app.extractors.vin_extractor import VinExtractor

        # __new__ skips __init__, so the extractor is built without running its constructor.
        bare_extractor = VinExtractor.__new__(VinExtractor)

        blended = bare_extractor._calculate_base_confidence([0.9, 0.8])

        # average = (0.9 + 0.8) / 2 = 0.85 and minimum = 0.8, so the
        # expected blend is 0.7 * 0.85 + 0.3 * 0.8 = 0.595 + 0.24 = 0.835.
        deviation = abs(blended - 0.835)
        assert deviation < 0.001
    def test_calculate_base_confidence_single_value(self) -> None:
        """A lone confidence is its own average and minimum, so it comes back unchanged."""
        from app.extractors.vin_extractor import VinExtractor

        # __new__ skips __init__, so the extractor is built without running its constructor.
        bare_extractor = VinExtractor.__new__(VinExtractor)

        blended = bare_extractor._calculate_base_confidence([0.92])

        deviation = abs(blended - 0.92)
        assert deviation < 0.001