diff --git a/.ai/context.json b/.ai/context.json index 1e2c4bd..06b6032 100644 --- a/.ai/context.json +++ b/.ai/context.json @@ -108,7 +108,7 @@ }, "mvp-ocr": { "type": "ocr_service", - "description": "Python-based OCR for document text extraction", + "description": "Python OCR service with pluggable engine abstraction (PaddleOCR PP-OCRv4 primary, optional Google Vision cloud fallback, Tesseract backward compat)", "port": 8000 }, "mvp-loki": { diff --git a/docs/CLAUDE.md b/docs/CLAUDE.md index e17808d..59ba6eb 100644 --- a/docs/CLAUDE.md +++ b/docs/CLAUDE.md @@ -18,5 +18,5 @@ | `AUDIT.md` | Audit documentation | Security audits, compliance | | `MVP-COLOR-SCHEME.md` | Color scheme reference | UI styling decisions | | `LOGGING.md` | Unified logging system | Log levels, correlation IDs, Grafana | -| `ocr-pipeline-tech-stack.md` | OCR pipeline technology decisions | OCR architecture, Tesseract setup | +| `ocr-pipeline-tech-stack.md` | OCR pipeline technology decisions | OCR architecture, PaddleOCR engine abstraction | | `TIER-GATING.md` | Subscription tier gating rules | Feature access by tier, vehicle limits | diff --git a/docs/ocr-pipeline-tech-stack.md b/docs/ocr-pipeline-tech-stack.md index 5160a9a..a80090c 100644 --- a/docs/ocr-pipeline-tech-stack.md +++ b/docs/ocr-pipeline-tech-stack.md @@ -118,35 +118,48 @@ │ ├─────────────────────────────────────────────────────────┤ │ │ │ │ │ ┌─────────────────────────────────────────────────┐ │ - │ │ │ 5a. Primary OCR: Tesseract 5.x │ │ - │ │ │ │ │ - │ │ │ • Engine: LSTM (--oem 1) │ │ - │ │ │ • Page segmentation: Auto (--psm 3) │ │ - │ │ │ • Output: hOCR with word confidence │ │ + │ │ │ 5a. Engine Abstraction Layer │ │ + │ │ │ │ │ + │ │ │ OcrEngine ABC -> PaddleOcrEngine (primary) │ │ + │ │ │ -> CloudEngine (optional fallback) │ │ + │ │ │ -> TesseractEngine (backward compat)│ │ + │ │ │ -> HybridEngine (primary+fallback) │ │ + │ │ └─────────────────────────────────────────────────┘ │ + │ │ │ │ + │ │ ▼ │ + │ │ ┌─────────────────────────────────────────────────┐ │ + │ │ │ 5b. Primary OCR: PaddleOCR PP-OCRv4 │ │ + │ │ │ │ │ + │ │ │ • Scene text detection + angle classification │ │ + │ │ │ • CPU-only, models baked into Docker image │ │ + │ │ │ • Normalized output: text, confidence, word boxes│ │ │ │ └─────────────────────────────────────────────────┘ │ │ │ │ │ │ │ ▼ │ │ │ ┌───────────────┐ │ │ │ │ Confidence │ │ - │ │ │ > 80% ? │ │ + │ │ │ >= 60% ? │ │ │ │ └───────────────┘ │ │ │ │ │ │ - │ │ YES ──┘ └── NO │ + │ │ YES ──┘ └── NO (and cloud enabled) │ │ │ │ │ │ │ │ │ ▼ │ │ │ │ ┌─────────────────────────────────┐ │ - │ │ │ │ 5b. Fallback: PaddleOCR │ │ - │ │ │ │ │ │ - │ │ │ │ • Better for degraded images │ │ - │ │ │ │ • Better table detection │ │ - │ │ │ │ • Slower but more accurate │ │ + │ │ │ │ 5c. Optional Cloud Fallback │ │ + │ │ │ │ (Google Vision API) │ │ + │ │ │ │ │ │ + │ │ │ │ • Disabled by default │ │ + │ │ │ │ • 5-second timeout guard │ │ + │ │ │ │ • Returns higher-confidence │ │ + │ │ │ │ result of primary vs fallback │ │ │ │ │ └─────────────────────────────────┘ │ │ │ │ │ │ │ │ ▼ ▼ │ │ │ ┌─────────────────────────────────┐ │ - │ │ │ 5c. Result Merging │ │ - │ │ │ • Merge by bounding box │ │ + │ │ │ 5d. HybridEngine Result │ │ + │ │ │ • Compare confidences │ │ │ │ │ • Keep highest confidence │ │ + │ │ │ • Graceful fallback on error │ │ │ │ └─────────────────────────────────┘ │ │ │ │ │ └─────────────────────────────────────────────────────────┘ @@ -257,10 +270,10 @@ | Component | Tool | Purpose | |------------------------|-----------------------|--------------------------------------| -| **Primary OCR** | Tesseract 5.x | Fast, reliable text extraction | -| **Python Binding** | pytesseract | Tesseract Python wrapper | -| **Fallback OCR** | PaddleOCR | Higher accuracy, better tables | -| **Layout Analysis** | PaddleOCR / LayoutParser | Document structure detection | +| **Primary OCR** | PaddleOCR PP-OCRv4 | Highest accuracy scene text, CPU-only | +| **Cloud Fallback** | Google Vision API | Optional cloud fallback (disabled by default) | +| **Backward Compat** | Tesseract 5.x / pytesseract | Legacy engine, configurable via env var | +| **Engine Abstraction** | `OcrEngine` ABC | Pluggable engine interface in `ocr/app/engines/` | ### Data Extraction @@ -291,85 +304,93 @@ fastapi>=0.100.0 uvicorn[standard]>=0.23.0 python-multipart>=0.0.6 - -# Task Queue -celery>=5.3.0 -redis>=4.6.0 +pydantic>=2.0.0 # File Detection & Handling python-magic>=0.4.27 pillow>=10.0.0 pillow-heif>=0.13.0 -# PDF Processing -pymupdf>=1.23.0 - # Image Preprocessing opencv-python-headless>=4.8.0 -deskew>=1.4.0 -scikit-image>=0.21.0 numpy>=1.24.0 # OCR Engines pytesseract>=0.3.10 -paddlepaddle>=2.5.0 -paddleocr>=2.7.0 +paddlepaddle>=2.6.0 +paddleocr>=2.8.0 +google-cloud-vision>=3.7.0 -# Table Extraction -img2table>=1.2.0 -camelot-py[cv]>=0.11.0 +# PDF Processing +PyMuPDF>=1.23.0 -# NLP & Data -spacy>=3.6.0 -pandas>=2.0.0 +# Redis for job queue +redis>=5.0.0 -# Storage & Database -boto3>=1.28.0 -psycopg2-binary>=2.9.0 -sqlalchemy>=2.0.0 +# HTTP client for callbacks +httpx>=0.24.0 + +# Testing +pytest>=7.4.0 +pytest-asyncio>=0.21.0 ``` ### System Package Requirements (Ubuntu/Debian) ```bash -# Tesseract OCR -apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev +# Tesseract OCR (backward compatibility engine) +apt-get install tesseract-ocr tesseract-ocr-eng + +# PaddlePaddle OpenMP runtime +apt-get install libgomp1 # HEIC Support -apt-get install libheif-examples libheif-dev +apt-get install libheif1 libheif-dev -# OpenCV dependencies -apt-get install libgl1-mesa-glx libglib2.0-0 +# GLib (OpenCV dependency) +apt-get install libglib2.0-0 -# PDF rendering dependencies -apt-get install libmupdf-dev mupdf-tools - -# Image processing -apt-get install libmagic1 ghostscript - -# Camelot dependencies -apt-get install ghostscript python3-tk +# File type detection +apt-get install libmagic1 ``` +### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `OCR_PRIMARY_ENGINE` | `paddleocr` | Primary OCR engine (`paddleocr`, `tesseract`) | +| `OCR_CONFIDENCE_THRESHOLD` | `0.6` | Minimum confidence threshold | +| `OCR_FALLBACK_ENGINE` | `none` | Fallback engine (`google_vision`, `none`) | +| `OCR_FALLBACK_THRESHOLD` | `0.6` | Confidence below this triggers fallback | +| `GOOGLE_VISION_KEY_PATH` | `/run/secrets/google-vision-key.json` | Path to Google Vision service account key | + --- ## DOCKERFILE ```dockerfile -FROM python:3.11-slim +# Primary engine: PaddleOCR PP-OCRv4 (models baked into image) +# Backward compat: Tesseract 5.x (optional, via TesseractEngine) +# Cloud fallback: Google Vision (optional, requires API key at runtime) + +FROM python:3.13-slim # System dependencies +# - tesseract-ocr/eng: Backward-compatible OCR engine +# - libgomp1: OpenMP runtime required by PaddlePaddle +# - libheif1/libheif-dev: HEIF image support (iPhone photos) +# - libglib2.0-0: GLib shared library (OpenCV dependency) +# - libmagic1: File type detection +# - curl: Health check endpoint RUN apt-get update && apt-get install -y --no-install-recommends \ tesseract-ocr \ tesseract-ocr-eng \ - libtesseract-dev \ - libheif-examples \ + libgomp1 \ + libheif1 \ libheif-dev \ - libgl1-mesa-glx \ libglib2.0-0 \ libmagic1 \ - ghostscript \ - poppler-utils \ + curl \ && rm -rf /var/lib/apt/lists/* # Python dependencies @@ -377,11 +398,9 @@ WORKDIR /app COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt -# Download spaCy model -RUN python -m spacy download en_core_web_sm - -# Download PaddleOCR models (cached in image) -RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en')" +# Pre-download PaddleOCR PP-OCRv4 models during build (not at runtime) +RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=False)" \ + && echo "PaddleOCR PP-OCRv4 models downloaded and verified" COPY . . diff --git a/ocr/CLAUDE.md b/ocr/CLAUDE.md index 2020969..1f3988d 100644 --- a/ocr/CLAUDE.md +++ b/ocr/CLAUDE.md @@ -1,10 +1,12 @@ # ocr/ +Python OCR microservice. Primary engine: PaddleOCR PP-OCRv4 with optional Google Vision cloud fallback. Pluggable engine abstraction in `app/engines/`. + ## Files | File | What | When to read | | ---- | ---- | ------------ | -| `Dockerfile` | Container build definition | Docker builds, deployment | +| `Dockerfile` | Container build (PaddleOCR models baked in) | Docker builds, deployment | | `requirements.txt` | Python dependencies | Adding dependencies | ## Subdirectories @@ -12,4 +14,5 @@ | Directory | What | When to read | | --------- | ---- | ------------ | | `app/` | FastAPI application source | OCR endpoint development | +| `app/engines/` | Engine abstraction layer (OcrEngine ABC, factory, hybrid) | Adding or changing OCR engines | | `tests/` | Test suite | Adding or modifying tests | diff --git a/ocr/app/CLAUDE.md b/ocr/app/CLAUDE.md index 26c799a..8fbc7f1 100644 --- a/ocr/app/CLAUDE.md +++ b/ocr/app/CLAUDE.md @@ -12,6 +12,7 @@ | Directory | What | When to read | | --------- | ---- | ------------ | +| `engines/` | OCR engine abstraction (PaddleOCR primary, Google Vision fallback, Tesseract compat) | Engine changes, adding new engines | | `extractors/` | Data extraction logic | Adding new extraction types | | `models/` | Data models and schemas | Request/response types | | `patterns/` | Regex and parsing patterns | Pattern matching rules | diff --git a/ocr/tests/test_engine_abstraction.py b/ocr/tests/test_engine_abstraction.py new file mode 100644 index 0000000..2e8c150 --- /dev/null +++ b/ocr/tests/test_engine_abstraction.py @@ -0,0 +1,675 @@ +"""Tests for OCR engine abstraction layer. + +Covers: base types, exception hierarchy, PaddleOcrEngine, +TesseractEngine, CloudEngine, HybridEngine, and engine_factory. +""" + +import io +from unittest.mock import MagicMock, patch + +import pytest +from PIL import Image + +from app.engines.base_engine import ( + EngineError, + EngineProcessingError, + EngineUnavailableError, + OcrConfig, + OcrEngine, + OcrEngineResult, + WordBox, +) + + +# --- Helpers --- + + +def _create_test_image_bytes() -> bytes: + """Create minimal PNG image bytes for engine testing.""" + img = Image.new("RGB", (100, 50), (255, 255, 255)) + buf = io.BytesIO() + img.save(buf, format="PNG") + return buf.getvalue() + + +def _make_result( + text: str, confidence: float, engine_name: str +) -> OcrEngineResult: + """Create a minimal OcrEngineResult for testing.""" + return OcrEngineResult( + text=text, confidence=confidence, word_boxes=[], engine_name=engine_name + ) + + +# --------------------------------------------------------------------------- +# Exception hierarchy +# --------------------------------------------------------------------------- + + +class TestExceptionHierarchy: + """Engine errors form a proper hierarchy under EngineError.""" + + def test_unavailable_is_engine_error(self) -> None: + assert issubclass(EngineUnavailableError, EngineError) + + def test_processing_is_engine_error(self) -> None: + assert issubclass(EngineProcessingError, EngineError) + + def test_engine_error_is_exception(self) -> None: + assert issubclass(EngineError, Exception) + + def test_catch_base_catches_subtypes(self) -> None: + with pytest.raises(EngineError): + raise EngineUnavailableError("not installed") + with pytest.raises(EngineError): + raise EngineProcessingError("OCR failed") + + +# --------------------------------------------------------------------------- +# Data types +# --------------------------------------------------------------------------- + + +class TestWordBox: + def test_default_positions(self) -> None: + wb = WordBox(text="VIN", confidence=0.95) + assert wb.x == 0 + assert wb.y == 0 + assert wb.width == 0 + assert wb.height == 0 + + def test_all_fields(self) -> None: + wb = WordBox(text="ABC", confidence=0.88, x=10, y=20, width=100, height=30) + assert wb.text == "ABC" + assert wb.confidence == 0.88 + assert wb.x == 10 + assert wb.width == 100 + + +class TestOcrConfig: + def test_defaults(self) -> None: + config = OcrConfig() + assert config.char_whitelist is None + assert config.single_line is False + assert config.single_word is False + assert config.use_angle_cls is True + assert config.hints == {} + + def test_vin_whitelist_excludes_ioq(self) -> None: + whitelist = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789" + config = OcrConfig(char_whitelist=whitelist) + assert "I" not in config.char_whitelist + assert "O" not in config.char_whitelist + assert "Q" not in config.char_whitelist + + def test_hints_are_independent_across_instances(self) -> None: + c1 = OcrConfig() + c2 = OcrConfig() + c1.hints["psm"] = 7 + assert "psm" not in c2.hints + + +class TestOcrEngineResult: + def test_construction(self) -> None: + result = OcrEngineResult( + text="1HGBH41JXMN109186", + confidence=0.94, + word_boxes=[WordBox(text="1HGBH41JXMN109186", confidence=0.94)], + engine_name="paddleocr", + ) + assert result.text == "1HGBH41JXMN109186" + assert result.confidence == 0.94 + assert len(result.word_boxes) == 1 + assert result.engine_name == "paddleocr" + + def test_empty_result(self) -> None: + result = OcrEngineResult( + text="", confidence=0.0, word_boxes=[], engine_name="tesseract" + ) + assert result.text == "" + assert result.word_boxes == [] + + +# --------------------------------------------------------------------------- +# OcrEngine ABC +# --------------------------------------------------------------------------- + + +class TestOcrEngineABC: + def test_cannot_instantiate_directly(self) -> None: + with pytest.raises(TypeError): + OcrEngine() # type: ignore[abstract] + + def test_concrete_subclass_works(self) -> None: + class StubEngine(OcrEngine): + @property + def name(self) -> str: + return "stub" + + def recognize( + self, image_bytes: bytes, config: OcrConfig + ) -> OcrEngineResult: + return OcrEngineResult( + text="ok", confidence=1.0, word_boxes=[], engine_name="stub" + ) + + engine = StubEngine() + assert engine.name == "stub" + result = engine.recognize(b"", OcrConfig()) + assert result.text == "ok" + + +# --------------------------------------------------------------------------- +# PaddleOcrEngine +# --------------------------------------------------------------------------- + + +class TestPaddleOcrEngine: + def test_name(self) -> None: + from app.engines.paddle_engine import PaddleOcrEngine + + engine = PaddleOcrEngine() + assert engine.name == "paddleocr" + + def test_lazy_init_not_loaded_at_construction(self) -> None: + from app.engines.paddle_engine import PaddleOcrEngine + + engine = PaddleOcrEngine() + assert engine._ocr is None + + def test_recognize_empty_results(self) -> None: + from app.engines.paddle_engine import PaddleOcrEngine + + engine = PaddleOcrEngine() + mock_ocr = MagicMock() + mock_ocr.ocr.return_value = [None] + engine._ocr = mock_ocr + + result = engine.recognize(_create_test_image_bytes(), OcrConfig()) + assert result.text == "" + assert result.confidence == 0.0 + assert result.word_boxes == [] + assert result.engine_name == "paddleocr" + + def test_recognize_with_results(self) -> None: + from app.engines.paddle_engine import PaddleOcrEngine + + engine = PaddleOcrEngine() + mock_ocr = MagicMock() + mock_ocr.ocr.return_value = [ + [ + [[[10, 20], [110, 20], [110, 50], [10, 50]], ("HELLO", 0.95)], + [[[10, 60], [110, 60], [110, 90], [10, 90]], ("WORLD", 0.88)], + ] + ] + engine._ocr = mock_ocr + + result = engine.recognize(_create_test_image_bytes(), OcrConfig()) + assert result.text == "HELLO WORLD" + assert abs(result.confidence - 0.915) < 0.01 + assert len(result.word_boxes) == 2 + assert result.word_boxes[0].text == "HELLO" + assert result.word_boxes[0].confidence == 0.95 + assert result.word_boxes[1].text == "WORLD" + assert result.engine_name == "paddleocr" + + def test_recognize_whitelist_filters_characters(self) -> None: + from app.engines.paddle_engine import PaddleOcrEngine + + engine = PaddleOcrEngine() + mock_ocr = MagicMock() + mock_ocr.ocr.return_value = [ + [ + [[[0, 0], [100, 0], [100, 30], [0, 30]], ("1HG-BH4!", 0.9)], + ] + ] + engine._ocr = mock_ocr + + config = OcrConfig(char_whitelist="ABCDEFGHJKLMNPRSTUVWXYZ0123456789") + result = engine.recognize(_create_test_image_bytes(), config) + assert "-" not in result.text + assert "!" not in result.text + assert result.word_boxes[0].text == "1HGBH4" + + def test_recognize_quadrilateral_to_bounding_box(self) -> None: + from app.engines.paddle_engine import PaddleOcrEngine + + engine = PaddleOcrEngine() + mock_ocr = MagicMock() + # Slightly rotated quad: min x=8, min y=20, max x=110, max y=55 + mock_ocr.ocr.return_value = [ + [ + [[[10, 20], [110, 25], [108, 55], [8, 50]], ("TEXT", 0.9)], + ] + ] + engine._ocr = mock_ocr + + result = engine.recognize(_create_test_image_bytes(), OcrConfig()) + wb = result.word_boxes[0] + assert wb.x == 8 + assert wb.y == 20 + assert wb.width == 102 # 110 - 8 + assert wb.height == 35 # 55 - 20 + + def test_recognize_skips_empty_after_whitelist(self) -> None: + """Text consisting only of non-whitelisted characters is skipped.""" + from app.engines.paddle_engine import PaddleOcrEngine + + engine = PaddleOcrEngine() + mock_ocr = MagicMock() + mock_ocr.ocr.return_value = [ + [ + [[[0, 0], [50, 0], [50, 20], [0, 20]], ("---", 0.9)], + ] + ] + engine._ocr = mock_ocr + + config = OcrConfig(char_whitelist="ABC") + result = engine.recognize(_create_test_image_bytes(), config) + assert result.text == "" + assert result.word_boxes == [] + assert result.confidence == 0.0 + + def test_import_error_raises_unavailable(self) -> None: + from app.engines.paddle_engine import PaddleOcrEngine + + engine = PaddleOcrEngine() + engine._ocr = None + with patch.dict("sys.modules", {"paddleocr": None}): + with patch( + "app.engines.paddle_engine.importlib.import_module", + side_effect=ImportError("No module"), + ): + # Force re-import by removing cached paddleocr + original_import = __builtins__.__import__ if hasattr(__builtins__, '__import__') else __import__ + def mock_import(name, *args, **kwargs): + if name == "paddleocr": + raise ImportError("No module named 'paddleocr'") + return original_import(name, *args, **kwargs) + + with patch("builtins.__import__", side_effect=mock_import): + with pytest.raises(EngineUnavailableError, match="paddleocr"): + engine._get_ocr() + + def test_processing_error_on_exception(self) -> None: + from app.engines.paddle_engine import PaddleOcrEngine + + engine = PaddleOcrEngine() + mock_ocr = MagicMock() + mock_ocr.ocr.side_effect = RuntimeError("OCR crashed") + engine._ocr = mock_ocr + + with pytest.raises(EngineProcessingError, match="PaddleOCR recognition failed"): + engine.recognize(_create_test_image_bytes(), OcrConfig()) + + +# --------------------------------------------------------------------------- +# TesseractEngine +# --------------------------------------------------------------------------- + + +class TestTesseractEngine: + """Tests for TesseractEngine using mocked pytesseract.""" + + @pytest.fixture() + def engine(self) -> "TesseractEngine": # type: ignore[name-defined] + """Create a TesseractEngine with mocked pytesseract dependency.""" + mock_pytesseract = MagicMock() + mock_pytesseract.Output.DICT = "dict" + + with patch.dict("sys.modules", {"pytesseract": mock_pytesseract}): + with patch("app.engines.tesseract_engine.settings") as mock_settings: + mock_settings.tesseract_cmd = "/usr/bin/tesseract" + from app.engines.tesseract_engine import TesseractEngine + + eng = TesseractEngine() + eng._mock_pytesseract = mock_pytesseract # type: ignore[attr-defined] + return eng + + def test_name(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined] + assert engine.name == "tesseract" + + def test_build_config_default_psm(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined] + config_str = engine._build_config(OcrConfig()) + assert "--psm 6" in config_str + + def test_build_config_single_line(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined] + config_str = engine._build_config(OcrConfig(single_line=True)) + assert "--psm 7" in config_str + + def test_build_config_single_word(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined] + config_str = engine._build_config(OcrConfig(single_word=True)) + assert "--psm 8" in config_str + + def test_build_config_whitelist(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined] + config_str = engine._build_config(OcrConfig(char_whitelist="ABC123")) + assert "-c tessedit_char_whitelist=ABC123" in config_str + + def test_build_config_psm_hint(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined] + config_str = engine._build_config(OcrConfig(hints={"psm": 11})) + assert "--psm 11" in config_str + + def test_recognize_normalizes_confidence(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined] + """Tesseract returns 0-100 confidence; engine normalizes to 0.0-1.0.""" + engine._pytesseract.image_to_data.return_value = { + "text": ["HELLO", ""], + "conf": [92, -1], + "left": [10], + "top": [20], + "width": [100], + "height": [30], + } + + result = engine.recognize(_create_test_image_bytes(), OcrConfig()) + assert result.text == "HELLO" + assert abs(result.confidence - 0.92) < 0.01 + assert result.engine_name == "tesseract" + + def test_import_error_raises_unavailable(self) -> None: + with patch.dict("sys.modules", {"pytesseract": None}): + with patch("app.engines.tesseract_engine.settings") as mock_settings: + mock_settings.tesseract_cmd = "/usr/bin/tesseract" + + def mock_import(name, *args, **kwargs): + if name == "pytesseract": + raise ImportError("No module named 'pytesseract'") + return __import__(name, *args, **kwargs) + + with patch("builtins.__import__", side_effect=mock_import): + from app.engines.tesseract_engine import TesseractEngine + + with pytest.raises(EngineUnavailableError, match="pytesseract"): + TesseractEngine() + + +# --------------------------------------------------------------------------- +# CloudEngine +# --------------------------------------------------------------------------- + + +class TestCloudEngine: + def test_name(self) -> None: + from app.engines.cloud_engine import CloudEngine + + engine = CloudEngine(key_path="/fake/path.json") + assert engine.name == "google_vision" + + def test_lazy_init_not_loaded_at_construction(self) -> None: + from app.engines.cloud_engine import CloudEngine + + engine = CloudEngine(key_path="/fake/path.json") + assert engine._client is None + + def test_missing_key_file_raises_unavailable(self) -> None: + from app.engines.cloud_engine import CloudEngine + + engine = CloudEngine(key_path="/nonexistent/key.json") + with pytest.raises(EngineUnavailableError, match="key not found"): + engine._get_client() + + @patch("os.path.isfile", return_value=True) + def test_missing_library_raises_unavailable(self, _mock_isfile: MagicMock) -> None: + from app.engines.cloud_engine import CloudEngine + + engine = CloudEngine(key_path="/fake/key.json") + + def mock_import(name, *args, **kwargs): + if "google.cloud" in name: + raise ImportError("No module named 'google.cloud'") + return __import__(name, *args, **kwargs) + + with patch("builtins.__import__", side_effect=mock_import): + with pytest.raises(EngineUnavailableError, match="google-cloud-vision"): + engine._get_client() + + def test_recognize_empty_annotations(self) -> None: + from app.engines.cloud_engine import CloudEngine + + engine = CloudEngine(key_path="/fake/key.json") + mock_client = MagicMock() + mock_response = MagicMock() + mock_response.error.message = "" + mock_response.text_annotations = [] + mock_client.text_detection.return_value = mock_response + engine._client = mock_client + + # Mock the google.cloud.vision import inside recognize() + mock_vision = MagicMock() + with patch.dict("sys.modules", {"google.cloud.vision": mock_vision, "google.cloud": MagicMock(), "google": MagicMock()}): + result = engine.recognize(b"fake_image", OcrConfig()) + assert result.text == "" + assert result.confidence == 0.0 + assert result.engine_name == "google_vision" + + def test_recognize_api_error_raises_processing_error(self) -> None: + from app.engines.cloud_engine import CloudEngine + + engine = CloudEngine(key_path="/fake/key.json") + mock_client = MagicMock() + mock_response = MagicMock() + mock_response.error.message = "API quota exceeded" + mock_client.text_detection.return_value = mock_response + engine._client = mock_client + + mock_vision = MagicMock() + with patch.dict("sys.modules", {"google.cloud.vision": mock_vision, "google.cloud": MagicMock(), "google": MagicMock()}): + with pytest.raises(EngineProcessingError, match="API quota exceeded"): + engine.recognize(b"fake_image", OcrConfig()) + + +# --------------------------------------------------------------------------- +# HybridEngine +# --------------------------------------------------------------------------- + + +class TestHybridEngine: + def test_name_with_fallback(self) -> None: + from app.engines.hybrid_engine import HybridEngine + + primary = MagicMock(spec=OcrEngine) + primary.name = "paddleocr" + fallback = MagicMock(spec=OcrEngine) + fallback.name = "google_vision" + engine = HybridEngine(primary=primary, fallback=fallback) + assert engine.name == "hybrid(paddleocr+google_vision)" + + def test_name_without_fallback(self) -> None: + from app.engines.hybrid_engine import HybridEngine + + primary = MagicMock(spec=OcrEngine) + primary.name = "paddleocr" + engine = HybridEngine(primary=primary) + assert engine.name == "hybrid(paddleocr+none)" + + def test_high_confidence_skips_fallback(self) -> None: + from app.engines.hybrid_engine import HybridEngine + + primary = MagicMock(spec=OcrEngine) + fallback = MagicMock(spec=OcrEngine) + primary.name = "paddleocr" + fallback.name = "cloud" + primary.recognize.return_value = _make_result("VIN123", 0.95, "paddleocr") + + engine = HybridEngine(primary=primary, fallback=fallback, threshold=0.6) + result = engine.recognize(b"img", OcrConfig()) + assert result.text == "VIN123" + assert result.engine_name == "paddleocr" + fallback.recognize.assert_not_called() + + def test_low_confidence_triggers_fallback(self) -> None: + from app.engines.hybrid_engine import HybridEngine + + primary = MagicMock(spec=OcrEngine) + fallback = MagicMock(spec=OcrEngine) + primary.name = "paddleocr" + fallback.name = "google_vision" + primary.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr") + fallback.recognize.return_value = _make_result("VIN456", 0.92, "google_vision") + + engine = HybridEngine(primary=primary, fallback=fallback, threshold=0.6) + result = engine.recognize(b"img", OcrConfig()) + assert result.text == "VIN456" + assert result.engine_name == "google_vision" + fallback.recognize.assert_called_once() + + def test_low_confidence_no_fallback_returns_primary(self) -> None: + from app.engines.hybrid_engine import HybridEngine + + primary = MagicMock(spec=OcrEngine) + primary.name = "paddleocr" + primary.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr") + + engine = HybridEngine(primary=primary, fallback=None, threshold=0.6) + result = engine.recognize(b"img", OcrConfig()) + assert result.text == "VIN123" + + def test_fallback_lower_confidence_returns_primary(self) -> None: + from app.engines.hybrid_engine import HybridEngine + + primary = MagicMock(spec=OcrEngine) + fallback = MagicMock(spec=OcrEngine) + primary.name = "paddleocr" + fallback.name = "google_vision" + primary.recognize.return_value = _make_result("VIN123", 0.4, "paddleocr") + fallback.recognize.return_value = _make_result("VIN456", 0.3, "google_vision") + + engine = HybridEngine(primary=primary, fallback=fallback, threshold=0.6) + result = engine.recognize(b"img", OcrConfig()) + assert result.text == "VIN123" + + def test_fallback_engine_error_returns_primary(self) -> None: + from app.engines.hybrid_engine import HybridEngine + + primary = MagicMock(spec=OcrEngine) + fallback = MagicMock(spec=OcrEngine) + primary.name = "paddleocr" + fallback.name = "google_vision" + primary.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr") + fallback.recognize.side_effect = EngineUnavailableError("key missing") + + engine = HybridEngine(primary=primary, fallback=fallback, threshold=0.6) + result = engine.recognize(b"img", OcrConfig()) + assert result.text == "VIN123" + + def test_fallback_unexpected_error_returns_primary(self) -> None: + from app.engines.hybrid_engine import HybridEngine + + primary = MagicMock(spec=OcrEngine) + fallback = MagicMock(spec=OcrEngine) + primary.name = "paddleocr" + fallback.name = "google_vision" + primary.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr") + fallback.recognize.side_effect = RuntimeError("network error") + + engine = HybridEngine(primary=primary, fallback=fallback, threshold=0.6) + result = engine.recognize(b"img", OcrConfig()) + assert result.text == "VIN123" + + @patch("app.engines.hybrid_engine.time") + def test_fallback_timeout_returns_primary(self, mock_time: MagicMock) -> None: + from app.engines.hybrid_engine import HybridEngine + + primary = MagicMock(spec=OcrEngine) + fallback = MagicMock(spec=OcrEngine) + primary.name = "paddleocr" + fallback.name = "google_vision" + primary.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr") + fallback.recognize.return_value = _make_result("VIN456", 0.92, "google_vision") + # Simulate 6-second delay (exceeds 5s limit) + mock_time.monotonic.side_effect = [0.0, 6.0] + + engine = HybridEngine(primary=primary, fallback=fallback, threshold=0.6) + result = engine.recognize(b"img", OcrConfig()) + assert result.text == "VIN123" # timeout -> use primary + + def test_exact_threshold_skips_fallback(self) -> None: + """When confidence == threshold, no fallback needed (>= check).""" + from app.engines.hybrid_engine import HybridEngine + + primary = MagicMock(spec=OcrEngine) + fallback = MagicMock(spec=OcrEngine) + primary.name = "paddleocr" + fallback.name = "cloud" + primary.recognize.return_value = _make_result("VIN", 0.6, "paddleocr") + + engine = HybridEngine(primary=primary, fallback=fallback, threshold=0.6) + result = engine.recognize(b"img", OcrConfig()) + assert result.engine_name == "paddleocr" + fallback.recognize.assert_not_called() + + +# --------------------------------------------------------------------------- +# Engine factory +# --------------------------------------------------------------------------- + + +class TestEngineFactory: + def test_unknown_engine_raises(self) -> None: + from app.engines.engine_factory import _create_single_engine + + with pytest.raises(EngineUnavailableError, match="Unknown engine"): + _create_single_engine("nonexistent") + + @patch("app.engines.engine_factory.settings") + @patch("app.engines.engine_factory._create_single_engine") + def test_defaults_to_settings_primary( + self, mock_create: MagicMock, mock_settings: MagicMock + ) -> None: + mock_settings.ocr_primary_engine = "paddleocr" + mock_settings.ocr_fallback_engine = "none" + mock_engine = MagicMock(spec=OcrEngine) + mock_create.return_value = mock_engine + + from app.engines.engine_factory import create_engine + + result = create_engine() + mock_create.assert_called_once_with("paddleocr") + assert result == mock_engine + + @patch("app.engines.engine_factory.settings") + @patch("app.engines.engine_factory._create_single_engine") + def test_explicit_name_overrides_settings( + self, mock_create: MagicMock, mock_settings: MagicMock + ) -> None: + mock_settings.ocr_fallback_engine = "none" + mock_engine = MagicMock(spec=OcrEngine) + mock_create.return_value = mock_engine + + from app.engines.engine_factory import create_engine + + create_engine("tesseract") + mock_create.assert_called_once_with("tesseract") + + @patch("app.engines.engine_factory.settings") + @patch("app.engines.engine_factory._create_single_engine") + def test_creates_hybrid_when_fallback_configured( + self, mock_create: MagicMock, mock_settings: MagicMock + ) -> None: + mock_settings.ocr_primary_engine = "paddleocr" + mock_settings.ocr_fallback_engine = "google_vision" + mock_settings.ocr_fallback_threshold = 0.7 + mock_primary = MagicMock(spec=OcrEngine) + mock_fallback = MagicMock(spec=OcrEngine) + mock_create.side_effect = [mock_primary, mock_fallback] + + from app.engines.engine_factory import create_engine + from app.engines.hybrid_engine import HybridEngine + + result = create_engine() + assert isinstance(result, HybridEngine) + + @patch("app.engines.engine_factory.settings") + @patch("app.engines.engine_factory._create_single_engine") + def test_fallback_failure_returns_primary_only( + self, mock_create: MagicMock, mock_settings: MagicMock + ) -> None: + mock_settings.ocr_primary_engine = "paddleocr" + mock_settings.ocr_fallback_engine = "google_vision" + mock_settings.ocr_fallback_threshold = 0.6 + mock_primary = MagicMock(spec=OcrEngine) + mock_create.side_effect = [mock_primary, EngineUnavailableError("no key")] + + from app.engines.engine_factory import create_engine + + result = create_engine() + assert result == mock_primary diff --git a/ocr/tests/test_vin_extraction.py b/ocr/tests/test_vin_extraction.py index b2c8170..3a42c8b 100644 --- a/ocr/tests/test_vin_extraction.py +++ b/ocr/tests/test_vin_extraction.py @@ -1,11 +1,12 @@ -"""Integration tests for VIN extraction endpoint.""" +"""Integration tests for VIN extraction endpoint and engine integration.""" import io from unittest.mock import patch, MagicMock import pytest from fastapi.testclient import TestClient -from PIL import Image, ImageDraw, ImageFont +from PIL import Image, ImageDraw +from app.engines.base_engine import OcrConfig, OcrEngineResult, WordBox from app.main import app @@ -240,3 +241,106 @@ class TestVinExtractionContentTypes: ) assert response.status_code == 200 + + +# --------------------------------------------------------------------------- +# VIN extractor engine integration tests +# --------------------------------------------------------------------------- + + +class TestVinExtractorEngineIntegration: + """Tests verifying VinExtractor integrates correctly with engine abstraction.""" + + @patch("app.extractors.vin_extractor.create_engine") + def test_perform_ocr_calls_engine_with_vin_config( + self, mock_create_engine: MagicMock + ) -> None: + """_perform_ocr passes VIN whitelist and angle_cls to engine.""" + from app.extractors.vin_extractor import VinExtractor + + mock_engine = MagicMock() + mock_engine.recognize.return_value = OcrEngineResult( + text="1HGBH41JXMN109186", + confidence=0.94, + word_boxes=[WordBox(text="1HGBH41JXMN109186", confidence=0.94)], + engine_name="paddleocr", + ) + mock_create_engine.return_value = mock_engine + + extractor = VinExtractor() + text, confidences = extractor._perform_ocr(b"fake_image") + + mock_engine.recognize.assert_called_once() + call_config = mock_engine.recognize.call_args[0][1] + assert isinstance(call_config, OcrConfig) + assert call_config.char_whitelist == VinExtractor.VIN_WHITELIST + assert call_config.use_angle_cls is True + assert call_config.single_line is False + assert call_config.single_word is False + assert text == "1HGBH41JXMN109186" + assert confidences == [0.94] + + @patch("app.extractors.vin_extractor.create_engine") + def test_perform_ocr_single_line_mode( + self, mock_create_engine: MagicMock + ) -> None: + """_perform_ocr passes single_line flag to engine config.""" + from app.extractors.vin_extractor import VinExtractor + + mock_engine = MagicMock() + mock_engine.recognize.return_value = OcrEngineResult( + text="VIN123", confidence=0.9, word_boxes=[], engine_name="paddleocr" + ) + mock_create_engine.return_value = mock_engine + + extractor = VinExtractor() + extractor._perform_ocr(b"img", single_line=True) + + call_config = mock_engine.recognize.call_args[0][1] + assert call_config.single_line is True + assert call_config.single_word is False + + @patch("app.extractors.vin_extractor.create_engine") + def test_perform_ocr_single_word_mode( + self, mock_create_engine: MagicMock + ) -> None: + """_perform_ocr passes single_word flag to engine config.""" + from app.extractors.vin_extractor import VinExtractor + + mock_engine = MagicMock() + mock_engine.recognize.return_value = OcrEngineResult( + text="VIN123", confidence=0.9, word_boxes=[], engine_name="paddleocr" + ) + mock_create_engine.return_value = mock_engine + + extractor = VinExtractor() + extractor._perform_ocr(b"img", single_word=True) + + call_config = mock_engine.recognize.call_args[0][1] + assert call_config.single_word is True + assert call_config.single_line is False + + def test_calculate_base_confidence_empty_returns_default(self) -> None: + """Empty word confidences return 0.5 default.""" + from app.extractors.vin_extractor import VinExtractor + + extractor = VinExtractor.__new__(VinExtractor) + assert extractor._calculate_base_confidence([]) == 0.5 + + def test_calculate_base_confidence_weighted_blend(self) -> None: + """Confidence = 70% average + 30% minimum.""" + from app.extractors.vin_extractor import VinExtractor + + extractor = VinExtractor.__new__(VinExtractor) + # avg = (0.9 + 0.8) / 2 = 0.85, min = 0.8 + # result = 0.7 * 0.85 + 0.3 * 0.8 = 0.595 + 0.24 = 0.835 + result = extractor._calculate_base_confidence([0.9, 0.8]) + assert abs(result - 0.835) < 0.001 + + def test_calculate_base_confidence_single_value(self) -> None: + """Single confidence value: avg == min, so result equals that value.""" + from app.extractors.vin_extractor import VinExtractor + + extractor = VinExtractor.__new__(VinExtractor) + result = extractor._calculate_base_confidence([0.92]) + assert abs(result - 0.92) < 0.001