chore: update OCR tests and documentation (refs #121)
Some checks failed
Deploy to Staging / Build Images (pull_request) Failing after 7m4s
Deploy to Staging / Deploy to Staging (pull_request) Has been skipped
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 7s

Add engine abstraction tests and update docs to reflect PaddleOCR primary
architecture with optional Google Vision cloud fallback.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-07 11:42:51 -06:00
parent 1e96baca6f
commit 47c5676498
7 changed files with 870 additions and 68 deletions

View File

@@ -108,7 +108,7 @@
}, },
"mvp-ocr": { "mvp-ocr": {
"type": "ocr_service", "type": "ocr_service",
"description": "Python-based OCR for document text extraction", "description": "Python OCR service with pluggable engine abstraction (PaddleOCR PP-OCRv4 primary, optional Google Vision cloud fallback, Tesseract backward compat)",
"port": 8000 "port": 8000
}, },
"mvp-loki": { "mvp-loki": {

View File

@@ -18,5 +18,5 @@
| `AUDIT.md` | Audit documentation | Security audits, compliance | | `AUDIT.md` | Audit documentation | Security audits, compliance |
| `MVP-COLOR-SCHEME.md` | Color scheme reference | UI styling decisions | | `MVP-COLOR-SCHEME.md` | Color scheme reference | UI styling decisions |
| `LOGGING.md` | Unified logging system | Log levels, correlation IDs, Grafana | | `LOGGING.md` | Unified logging system | Log levels, correlation IDs, Grafana |
| `ocr-pipeline-tech-stack.md` | OCR pipeline technology decisions | OCR architecture, Tesseract setup | | `ocr-pipeline-tech-stack.md` | OCR pipeline technology decisions | OCR architecture, PaddleOCR engine abstraction |
| `TIER-GATING.md` | Subscription tier gating rules | Feature access by tier, vehicle limits | | `TIER-GATING.md` | Subscription tier gating rules | Feature access by tier, vehicle limits |

View File

@@ -118,35 +118,48 @@
│ ├─────────────────────────────────────────────────────────┤ │ ├─────────────────────────────────────────────────────────┤
│ │ │ │ │ │
│ │ ┌─────────────────────────────────────────────────┐ │ │ │ ┌─────────────────────────────────────────────────┐ │
│ │ │ 5a. Primary OCR: Tesseract 5.x │ │ │ │ │ 5a. Engine Abstraction Layer │ │
│ │ │ │ │ │ │ │ │ │
│ │ │ Engine: LSTM (--oem 1) │ │ │ │ │ OcrEngine ABC -> PaddleOcrEngine (primary) │ │
│ │ │ • Page segmentation: Auto (--psm 3) │ │ │ │ │ -> CloudEngine (optional fallback) │ │
│ │ │ • Output: hOCR with word confidence │ │ │ │ -> TesseractEngine (backward compat)│
│ │ │ -> HybridEngine (primary+fallback) │ │
│ │ └─────────────────────────────────────────────────┘ │
│ │ │ │
│ │ ▼ │
│ │ ┌─────────────────────────────────────────────────┐ │
│ │ │ 5b. Primary OCR: PaddleOCR PP-OCRv4 │ │
│ │ │ │ │
│ │ │ • Scene text detection + angle classification │ │
│ │ │ • CPU-only, models baked into Docker image │ │
│ │ │ • Normalized output: text, confidence, word boxes│ │
│ │ └─────────────────────────────────────────────────┘ │ │ │ └─────────────────────────────────────────────────┘ │
│ │ │ │ │ │ │ │
│ │ ▼ │ │ │ ▼ │
│ │ ┌───────────────┐ │ │ │ ┌───────────────┐ │
│ │ │ Confidence │ │ │ │ │ Confidence │ │
│ │ │ > 80% ? │ │ │ │ │ >= 60% ? │ │
│ │ └───────────────┘ │ │ │ └───────────────┘ │
│ │ │ │ │ │ │ │ │ │
│ │ YES ──┘ └── NO │ │ YES ──┘ └── NO (and cloud enabled)
│ │ │ │ │ │ │ │ │ │
│ │ │ ▼ │ │ │ │ ▼ │
│ │ │ ┌─────────────────────────────────┐ │ │ │ │ ┌─────────────────────────────────┐ │
│ │ │ │ 5b. Fallback: PaddleOCR │ │ │ │ │ │ 5c. Optional Cloud Fallback │ │
│ │ │ │ (Google Vision API) │ │
│ │ │ │ │ │ │ │ │ │ │ │
│ │ │ │ • Better for degraded images │ │ │ │ │ │ • Disabled by default │ │
│ │ │ │ • Better table detection │ │ │ │ │ │ • 5-second timeout guard │ │
│ │ │ │ • Slower but more accurate │ │ │ │ │ │ • Returns higher-confidence │ │
│ │ │ │ result of primary vs fallback │ │
│ │ │ └─────────────────────────────────┘ │ │ │ │ └─────────────────────────────────┘ │
│ │ │ │ │ │ │ │ │ │
│ │ ▼ ▼ │ │ │ ▼ ▼ │
│ │ ┌─────────────────────────────────┐ │ │ │ ┌─────────────────────────────────┐ │
│ │ │ 5c. Result Merging │ │ │ │ │ 5d. HybridEngine Result │ │
│ │ │ • Merge by bounding box │ │ │ │ │ • Compare confidences │ │
│ │ │ • Keep highest confidence │ │ │ │ │ • Keep highest confidence │ │
│ │ │ • Graceful fallback on error │ │
│ │ └─────────────────────────────────┘ │ │ │ └─────────────────────────────────┘ │
│ │ │ │ │ │
│ └─────────────────────────────────────────────────────────┘ │ └─────────────────────────────────────────────────────────┘
@@ -257,10 +270,10 @@
| Component | Tool | Purpose | | Component | Tool | Purpose |
|------------------------|-----------------------|--------------------------------------| |------------------------|-----------------------|--------------------------------------|
| **Primary OCR** | Tesseract 5.x | Fast, reliable text extraction | | **Primary OCR** | PaddleOCR PP-OCRv4 | Highest accuracy scene text, CPU-only |
| **Python Binding** | pytesseract | Tesseract Python wrapper | | **Cloud Fallback** | Google Vision API | Optional cloud fallback (disabled by default) |
| **Fallback OCR** | PaddleOCR | Higher accuracy, better tables | | **Backward Compat** | Tesseract 5.x / pytesseract | Legacy engine, configurable via env var |
| **Layout Analysis** | PaddleOCR / LayoutParser | Document structure detection | | **Engine Abstraction** | `OcrEngine` ABC | Pluggable engine interface in `ocr/app/engines/` |
### Data Extraction ### Data Extraction
@@ -291,85 +304,93 @@
fastapi>=0.100.0 fastapi>=0.100.0
uvicorn[standard]>=0.23.0 uvicorn[standard]>=0.23.0
python-multipart>=0.0.6 python-multipart>=0.0.6
pydantic>=2.0.0
# Task Queue
celery>=5.3.0
redis>=4.6.0
# File Detection & Handling # File Detection & Handling
python-magic>=0.4.27 python-magic>=0.4.27
pillow>=10.0.0 pillow>=10.0.0
pillow-heif>=0.13.0 pillow-heif>=0.13.0
# PDF Processing
pymupdf>=1.23.0
# Image Preprocessing # Image Preprocessing
opencv-python-headless>=4.8.0 opencv-python-headless>=4.8.0
deskew>=1.4.0
scikit-image>=0.21.0
numpy>=1.24.0 numpy>=1.24.0
# OCR Engines # OCR Engines
pytesseract>=0.3.10 pytesseract>=0.3.10
paddlepaddle>=2.5.0 paddlepaddle>=2.6.0
paddleocr>=2.7.0 paddleocr>=2.8.0
google-cloud-vision>=3.7.0
# Table Extraction # PDF Processing
img2table>=1.2.0 PyMuPDF>=1.23.0
camelot-py[cv]>=0.11.0
# NLP & Data # Redis for job queue
spacy>=3.6.0 redis>=5.0.0
pandas>=2.0.0
# Storage & Database # HTTP client for callbacks
boto3>=1.28.0 httpx>=0.24.0
psycopg2-binary>=2.9.0
sqlalchemy>=2.0.0 # Testing
pytest>=7.4.0
pytest-asyncio>=0.21.0
``` ```
### System Package Requirements (Ubuntu/Debian) ### System Package Requirements (Ubuntu/Debian)
```bash ```bash
# Tesseract OCR # Tesseract OCR (backward compatibility engine)
apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev apt-get install tesseract-ocr tesseract-ocr-eng
# PaddlePaddle OpenMP runtime
apt-get install libgomp1
# HEIC Support # HEIC Support
apt-get install libheif-examples libheif-dev apt-get install libheif1 libheif-dev
# OpenCV dependencies # GLib (OpenCV dependency)
apt-get install libgl1-mesa-glx libglib2.0-0 apt-get install libglib2.0-0
# PDF rendering dependencies # File type detection
apt-get install libmupdf-dev mupdf-tools apt-get install libmagic1
# Image processing
apt-get install libmagic1 ghostscript
# Camelot dependencies
apt-get install ghostscript python3-tk
``` ```
### Environment Variables
| Variable | Default | Description |
|----------|---------|-------------|
| `OCR_PRIMARY_ENGINE` | `paddleocr` | Primary OCR engine (`paddleocr`, `tesseract`) |
| `OCR_CONFIDENCE_THRESHOLD` | `0.6` | Minimum confidence threshold |
| `OCR_FALLBACK_ENGINE` | `none` | Fallback engine (`google_vision`, `none`) |
| `OCR_FALLBACK_THRESHOLD` | `0.6` | Confidence below this triggers fallback |
| `GOOGLE_VISION_KEY_PATH` | `/run/secrets/google-vision-key.json` | Path to Google Vision service account key |
--- ---
## DOCKERFILE ## DOCKERFILE
```dockerfile ```dockerfile
FROM python:3.11-slim # Primary engine: PaddleOCR PP-OCRv4 (models baked into image)
# Backward compat: Tesseract 5.x (optional, via TesseractEngine)
# Cloud fallback: Google Vision (optional, requires API key at runtime)
FROM python:3.13-slim
# System dependencies # System dependencies
# - tesseract-ocr/eng: Backward-compatible OCR engine
# - libgomp1: OpenMP runtime required by PaddlePaddle
# - libheif1/libheif-dev: HEIF image support (iPhone photos)
# - libglib2.0-0: GLib shared library (OpenCV dependency)
# - libmagic1: File type detection
# - curl: HTTP client used to probe the health check endpoint
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
tesseract-ocr \ tesseract-ocr \
tesseract-ocr-eng \ tesseract-ocr-eng \
libtesseract-dev \ libgomp1 \
libheif-examples \ libheif1 \
libheif-dev \ libheif-dev \
libgl1-mesa-glx \
libglib2.0-0 \ libglib2.0-0 \
libmagic1 \ libmagic1 \
ghostscript \ curl \
poppler-utils \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Python dependencies # Python dependencies
@@ -377,11 +398,9 @@ WORKDIR /app
COPY requirements.txt . COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -r requirements.txt
# Download spaCy model # Pre-download PaddleOCR PP-OCRv4 models during build (not at runtime)
RUN python -m spacy download en_core_web_sm RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=False)" \
&& echo "PaddleOCR PP-OCRv4 models downloaded and verified"
# Download PaddleOCR models (cached in image)
RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en')"
COPY . . COPY . .

View File

@@ -1,10 +1,12 @@
# ocr/ # ocr/
Python OCR microservice. Primary engine: PaddleOCR PP-OCRv4 with optional Google Vision cloud fallback. Pluggable engine abstraction in `app/engines/`.
## Files ## Files
| File | What | When to read | | File | What | When to read |
| ---- | ---- | ------------ | | ---- | ---- | ------------ |
| `Dockerfile` | Container build definition | Docker builds, deployment | | `Dockerfile` | Container build (PaddleOCR models baked in) | Docker builds, deployment |
| `requirements.txt` | Python dependencies | Adding dependencies | | `requirements.txt` | Python dependencies | Adding dependencies |
## Subdirectories ## Subdirectories
@@ -12,4 +14,5 @@
| Directory | What | When to read | | Directory | What | When to read |
| --------- | ---- | ------------ | | --------- | ---- | ------------ |
| `app/` | FastAPI application source | OCR endpoint development | | `app/` | FastAPI application source | OCR endpoint development |
| `app/engines/` | Engine abstraction layer (OcrEngine ABC, factory, hybrid) | Adding or changing OCR engines |
| `tests/` | Test suite | Adding or modifying tests | | `tests/` | Test suite | Adding or modifying tests |

View File

@@ -12,6 +12,7 @@
| Directory | What | When to read | | Directory | What | When to read |
| --------- | ---- | ------------ | | --------- | ---- | ------------ |
| `engines/` | OCR engine abstraction (PaddleOCR primary, Google Vision fallback, Tesseract compat) | Engine changes, adding new engines |
| `extractors/` | Data extraction logic | Adding new extraction types | | `extractors/` | Data extraction logic | Adding new extraction types |
| `models/` | Data models and schemas | Request/response types | | `models/` | Data models and schemas | Request/response types |
| `patterns/` | Regex and parsing patterns | Pattern matching rules | | `patterns/` | Regex and parsing patterns | Pattern matching rules |

View File

@@ -0,0 +1,675 @@
"""Tests for OCR engine abstraction layer.
Covers: base types, exception hierarchy, PaddleOcrEngine,
TesseractEngine, CloudEngine, HybridEngine, and engine_factory.
"""
import io
from unittest.mock import MagicMock, patch
import pytest
from PIL import Image
from app.engines.base_engine import (
EngineError,
EngineProcessingError,
EngineUnavailableError,
OcrConfig,
OcrEngine,
OcrEngineResult,
WordBox,
)
# --- Helpers ---
def _create_test_image_bytes() -> bytes:
    """Return the PNG encoding of a blank white 100x50 RGB image."""
    png_stream = io.BytesIO()
    Image.new("RGB", (100, 50), (255, 255, 255)).save(png_stream, format="PNG")
    return png_stream.getvalue()
def _make_result(
    text: str, confidence: float, engine_name: str
) -> OcrEngineResult:
    """Build an OcrEngineResult carrying no word boxes, for use in tests."""
    fields = {
        "text": text,
        "confidence": confidence,
        "word_boxes": [],
        "engine_name": engine_name,
    }
    return OcrEngineResult(**fields)
# ---------------------------------------------------------------------------
# Exception hierarchy
# ---------------------------------------------------------------------------
class TestExceptionHierarchy:
    """Engine exceptions all derive from EngineError, itself an Exception."""

    def test_unavailable_is_engine_error(self) -> None:
        derives = issubclass(EngineUnavailableError, EngineError)
        assert derives

    def test_processing_is_engine_error(self) -> None:
        derives = issubclass(EngineProcessingError, EngineError)
        assert derives

    def test_engine_error_is_exception(self) -> None:
        derives = issubclass(EngineError, Exception)
        assert derives

    def test_catch_base_catches_subtypes(self) -> None:
        # Each subtype must be catchable through the shared base class.
        cases = (
            (EngineUnavailableError, "not installed"),
            (EngineProcessingError, "OCR failed"),
        )
        for error_type, message in cases:
            with pytest.raises(EngineError):
                raise error_type(message)
# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------
class TestWordBox:
    """WordBox geometry defaults and explicit field assignment."""

    def test_default_positions(self) -> None:
        box = WordBox(text="VIN", confidence=0.95)
        # Geometry fields that were not supplied are expected to be zero.
        for field_name in ("x", "y", "width", "height"):
            assert getattr(box, field_name) == 0

    def test_all_fields(self) -> None:
        geometry = {"x": 10, "y": 20, "width": 100, "height": 30}
        box = WordBox(text="ABC", confidence=0.88, **geometry)
        assert box.text == "ABC"
        assert box.confidence == 0.88
        assert box.x == 10
        assert box.width == 100
class TestOcrConfig:
    """OcrConfig default values, whitelist storage, and hints isolation."""

    def test_defaults(self) -> None:
        cfg = OcrConfig()
        assert cfg.char_whitelist is None
        assert cfg.single_line is False
        assert cfg.single_word is False
        assert cfg.use_angle_cls is True
        assert cfg.hints == {}

    def test_vin_whitelist_excludes_ioq(self) -> None:
        vin_alphabet = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
        cfg = OcrConfig(char_whitelist=vin_alphabet)
        for banned in ("I", "O", "Q"):
            assert banned not in cfg.char_whitelist

    def test_hints_are_independent_across_instances(self) -> None:
        # Mutating one instance's hints must not leak into another instance.
        mutated = OcrConfig()
        untouched = OcrConfig()
        mutated.hints["psm"] = 7
        assert "psm" not in untouched.hints
class TestOcrEngineResult:
    """OcrEngineResult exposes the values it was constructed with."""

    def test_construction(self) -> None:
        vin = "1HGBH41JXMN109186"
        boxes = [WordBox(text=vin, confidence=0.94)]
        outcome = OcrEngineResult(
            text=vin, confidence=0.94, word_boxes=boxes, engine_name="paddleocr"
        )
        assert outcome.text == "1HGBH41JXMN109186"
        assert outcome.confidence == 0.94
        assert len(outcome.word_boxes) == 1
        assert outcome.engine_name == "paddleocr"

    def test_empty_result(self) -> None:
        blank = OcrEngineResult(
            text="", confidence=0.0, word_boxes=[], engine_name="tesseract"
        )
        assert blank.text == ""
        assert blank.word_boxes == []
# ---------------------------------------------------------------------------
# OcrEngine ABC
# ---------------------------------------------------------------------------
class TestOcrEngineABC:
    """OcrEngine cannot be instantiated itself; a full subclass can."""

    def test_cannot_instantiate_directly(self) -> None:
        with pytest.raises(TypeError):
            OcrEngine()  # type: ignore[abstract]

    def test_concrete_subclass_works(self) -> None:
        class StubEngine(OcrEngine):
            """Smallest subclass the test needs: a name and a recognize()."""

            @property
            def name(self) -> str:
                return "stub"

            def recognize(
                self, image_bytes: bytes, config: OcrConfig
            ) -> OcrEngineResult:
                canned = OcrEngineResult(
                    text="ok", confidence=1.0, word_boxes=[], engine_name="stub"
                )
                return canned

        stub = StubEngine()
        assert stub.name == "stub"
        outcome = stub.recognize(b"", OcrConfig())
        assert outcome.text == "ok"
# ---------------------------------------------------------------------------
# PaddleOcrEngine
# ---------------------------------------------------------------------------
class TestPaddleOcrEngine:
    """Tests for PaddleOcrEngine with the PaddleOCR backend replaced by a mock."""

    @staticmethod
    def _engine_with_ocr_output(raw_output):
        """Return a PaddleOcrEngine whose mocked backend's ocr() yields raw_output."""
        from app.engines.paddle_engine import PaddleOcrEngine

        engine = PaddleOcrEngine()
        backend = MagicMock()
        backend.ocr.return_value = raw_output
        # Pre-seeding _ocr means recognize() never has to load the real backend.
        engine._ocr = backend
        return engine

    def test_name(self) -> None:
        from app.engines.paddle_engine import PaddleOcrEngine

        engine = PaddleOcrEngine()
        assert engine.name == "paddleocr"

    def test_lazy_init_not_loaded_at_construction(self) -> None:
        from app.engines.paddle_engine import PaddleOcrEngine

        engine = PaddleOcrEngine()
        assert engine._ocr is None

    def test_recognize_empty_results(self) -> None:
        """A backend result of [None] yields an empty, zero-confidence result."""
        engine = self._engine_with_ocr_output([None])

        result = engine.recognize(_create_test_image_bytes(), OcrConfig())

        assert result.text == ""
        assert result.confidence == 0.0
        assert result.word_boxes == []
        assert result.engine_name == "paddleocr"

    def test_recognize_with_results(self) -> None:
        """Two detections are expected space-joined, with confidence near their mean."""
        engine = self._engine_with_ocr_output(
            [
                [
                    [[[10, 20], [110, 20], [110, 50], [10, 50]], ("HELLO", 0.95)],
                    [[[10, 60], [110, 60], [110, 90], [10, 90]], ("WORLD", 0.88)],
                ]
            ]
        )

        result = engine.recognize(_create_test_image_bytes(), OcrConfig())

        assert result.text == "HELLO WORLD"
        assert abs(result.confidence - 0.915) < 0.01
        assert len(result.word_boxes) == 2
        assert result.word_boxes[0].text == "HELLO"
        assert result.word_boxes[0].confidence == 0.95
        assert result.word_boxes[1].text == "WORLD"
        assert result.engine_name == "paddleocr"

    def test_recognize_whitelist_filters_characters(self) -> None:
        """Characters outside the whitelist are removed from the recognized text."""
        engine = self._engine_with_ocr_output(
            [
                [
                    [[[0, 0], [100, 0], [100, 30], [0, 30]], ("1HG-BH4!", 0.9)],
                ]
            ]
        )
        config = OcrConfig(char_whitelist="ABCDEFGHJKLMNPRSTUVWXYZ0123456789")

        result = engine.recognize(_create_test_image_bytes(), config)

        assert "-" not in result.text
        assert "!" not in result.text
        assert result.word_boxes[0].text == "1HGBH4"

    def test_recognize_quadrilateral_to_bounding_box(self) -> None:
        """A four-point quad is expected to be reduced to x/y/width/height."""
        # Slightly rotated quad: min x=8, min y=20, max x=110, max y=55
        engine = self._engine_with_ocr_output(
            [
                [
                    [[[10, 20], [110, 25], [108, 55], [8, 50]], ("TEXT", 0.9)],
                ]
            ]
        )

        result = engine.recognize(_create_test_image_bytes(), OcrConfig())

        wb = result.word_boxes[0]
        assert wb.x == 8
        assert wb.y == 20
        assert wb.width == 102  # 110 - 8
        assert wb.height == 35  # 55 - 20

    def test_recognize_skips_empty_after_whitelist(self) -> None:
        """Text consisting only of non-whitelisted characters is skipped."""
        engine = self._engine_with_ocr_output(
            [
                [
                    [[[0, 0], [50, 0], [50, 20], [0, 20]], ("---", 0.9)],
                ]
            ]
        )
        config = OcrConfig(char_whitelist="ABC")

        result = engine.recognize(_create_test_image_bytes(), config)

        assert result.text == ""
        assert result.word_boxes == []
        assert result.confidence == 0.0

    def test_import_error_raises_unavailable(self) -> None:
        """A failing paddleocr import surfaces as EngineUnavailableError."""
        import builtins

        from app.engines.paddle_engine import PaddleOcrEngine

        engine = PaddleOcrEngine()
        engine._ocr = None

        # Capture the genuine importer before patching. `__builtins__` is a
        # CPython implementation detail (a dict in imported modules, a module
        # in __main__), so go through the `builtins` module instead.
        real_import = builtins.__import__

        def mock_import(name, *args, **kwargs):
            if name == "paddleocr":
                raise ImportError("No module named 'paddleocr'")
            return real_import(name, *args, **kwargs)

        with patch.dict("sys.modules", {"paddleocr": None}):
            # NOTE(review): this target assumes paddle_engine exposes an
            # `importlib` attribute -- confirm against the engine module.
            with patch(
                "app.engines.paddle_engine.importlib.import_module",
                side_effect=ImportError("No module"),
            ):
                with patch("builtins.__import__", side_effect=mock_import):
                    with pytest.raises(EngineUnavailableError, match="paddleocr"):
                        engine._get_ocr()

    def test_processing_error_on_exception(self) -> None:
        """A backend exception during ocr() is re-raised as EngineProcessingError."""
        from app.engines.paddle_engine import PaddleOcrEngine

        engine = PaddleOcrEngine()
        mock_ocr = MagicMock()
        mock_ocr.ocr.side_effect = RuntimeError("OCR crashed")
        engine._ocr = mock_ocr

        with pytest.raises(EngineProcessingError, match="PaddleOCR recognition failed"):
            engine.recognize(_create_test_image_bytes(), OcrConfig())
# ---------------------------------------------------------------------------
# TesseractEngine
# ---------------------------------------------------------------------------
class TestTesseractEngine:
    """Tests for TesseractEngine using mocked pytesseract."""

    @pytest.fixture()
    def engine(self) -> "TesseractEngine":  # type: ignore[name-defined]
        """Create a TesseractEngine with mocked pytesseract dependency."""
        mock_pytesseract = MagicMock()
        mock_pytesseract.Output.DICT = "dict"

        with patch.dict("sys.modules", {"pytesseract": mock_pytesseract}):
            with patch("app.engines.tesseract_engine.settings") as mock_settings:
                mock_settings.tesseract_cmd = "/usr/bin/tesseract"
                from app.engines.tesseract_engine import TesseractEngine

                eng = TesseractEngine()
                # Keep a handle on the mock so tests can inspect it if needed.
                eng._mock_pytesseract = mock_pytesseract  # type: ignore[attr-defined]
                return eng

    def test_name(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        assert engine.name == "tesseract"

    def test_build_config_default_psm(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        config_str = engine._build_config(OcrConfig())
        assert "--psm 6" in config_str

    def test_build_config_single_line(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        config_str = engine._build_config(OcrConfig(single_line=True))
        assert "--psm 7" in config_str

    def test_build_config_single_word(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        config_str = engine._build_config(OcrConfig(single_word=True))
        assert "--psm 8" in config_str

    def test_build_config_whitelist(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        config_str = engine._build_config(OcrConfig(char_whitelist="ABC123"))
        assert "-c tessedit_char_whitelist=ABC123" in config_str

    def test_build_config_psm_hint(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        config_str = engine._build_config(OcrConfig(hints={"psm": 11}))
        assert "--psm 11" in config_str

    def test_recognize_normalizes_confidence(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        """Tesseract returns 0-100 confidence; engine normalizes to 0.0-1.0."""
        # All columns have one entry per detected item, so the fixture does
        # not depend on the order in which the engine reads the columns.
        # The second row (empty text, conf -1) is expected to be ignored.
        engine._pytesseract.image_to_data.return_value = {
            "text": ["HELLO", ""],
            "conf": [92, -1],
            "left": [10, 0],
            "top": [20, 0],
            "width": [100, 0],
            "height": [30, 0],
        }

        result = engine.recognize(_create_test_image_bytes(), OcrConfig())

        assert result.text == "HELLO"
        assert abs(result.confidence - 0.92) < 0.01
        assert result.engine_name == "tesseract"

    def test_import_error_raises_unavailable(self) -> None:
        """A failing pytesseract import surfaces as EngineUnavailableError."""
        import builtins

        # Capture the genuine importer BEFORE patching. Delegating to the bare
        # name `__import__` inside mock_import would resolve to the patched
        # mock at call time and recurse until RecursionError.
        real_import = builtins.__import__

        def mock_import(name, *args, **kwargs):
            if name == "pytesseract":
                raise ImportError("No module named 'pytesseract'")
            return real_import(name, *args, **kwargs)

        with patch.dict("sys.modules", {"pytesseract": None}):
            with patch("app.engines.tesseract_engine.settings") as mock_settings:
                mock_settings.tesseract_cmd = "/usr/bin/tesseract"

                with patch("builtins.__import__", side_effect=mock_import):
                    from app.engines.tesseract_engine import TesseractEngine

                    with pytest.raises(EngineUnavailableError, match="pytesseract"):
                        TesseractEngine()
# ---------------------------------------------------------------------------
# CloudEngine
# ---------------------------------------------------------------------------
class TestCloudEngine:
    """Tests for CloudEngine with the Google Vision client and key mocked."""

    def test_name(self) -> None:
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/path.json")
        assert engine.name == "google_vision"

    def test_lazy_init_not_loaded_at_construction(self) -> None:
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/path.json")
        assert engine._client is None

    def test_missing_key_file_raises_unavailable(self) -> None:
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/nonexistent/key.json")
        with pytest.raises(EngineUnavailableError, match="key not found"):
            engine._get_client()

    @patch("os.path.isfile", return_value=True)
    def test_missing_library_raises_unavailable(self, _mock_isfile: MagicMock) -> None:
        """With the key present, a failing google.cloud import is reported as unavailable."""
        import builtins

        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/key.json")

        # Capture the genuine importer BEFORE patching. Delegating to the bare
        # name `__import__` inside mock_import would resolve to the patched
        # mock and recurse for any import that is not google.cloud.
        real_import = builtins.__import__

        def mock_import(name, *args, **kwargs):
            if "google.cloud" in name:
                raise ImportError("No module named 'google.cloud'")
            return real_import(name, *args, **kwargs)

        with patch("builtins.__import__", side_effect=mock_import):
            with pytest.raises(EngineUnavailableError, match="google-cloud-vision"):
                engine._get_client()

    def test_recognize_empty_annotations(self) -> None:
        """A response with no text annotations yields an empty, zero-confidence result."""
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/key.json")

        mock_client = MagicMock()
        mock_response = MagicMock()
        mock_response.error.message = ""
        mock_response.text_annotations = []
        mock_client.text_detection.return_value = mock_response
        engine._client = mock_client

        # Mock the google.cloud.vision import inside recognize()
        mock_vision = MagicMock()
        fake_modules = {
            "google.cloud.vision": mock_vision,
            "google.cloud": MagicMock(),
            "google": MagicMock(),
        }
        with patch.dict("sys.modules", fake_modules):
            result = engine.recognize(b"fake_image", OcrConfig())

        assert result.text == ""
        assert result.confidence == 0.0
        assert result.engine_name == "google_vision"

    def test_recognize_api_error_raises_processing_error(self) -> None:
        """A non-empty response error message is raised as EngineProcessingError."""
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/key.json")

        mock_client = MagicMock()
        mock_response = MagicMock()
        mock_response.error.message = "API quota exceeded"
        mock_client.text_detection.return_value = mock_response
        engine._client = mock_client

        mock_vision = MagicMock()
        fake_modules = {
            "google.cloud.vision": mock_vision,
            "google.cloud": MagicMock(),
            "google": MagicMock(),
        }
        with patch.dict("sys.modules", fake_modules):
            with pytest.raises(EngineProcessingError, match="API quota exceeded"):
                engine.recognize(b"fake_image", OcrConfig())
# ---------------------------------------------------------------------------
# HybridEngine
# ---------------------------------------------------------------------------
class TestHybridEngine:
    """HybridEngine naming, threshold handling, and fallback selection."""

    @staticmethod
    def _stub_engine(engine_name: str) -> MagicMock:
        """Return an OcrEngine-spec'd mock that reports the given name."""
        stub = MagicMock(spec=OcrEngine)
        stub.name = engine_name
        return stub

    def test_name_with_fallback(self) -> None:
        from app.engines.hybrid_engine import HybridEngine

        hybrid = HybridEngine(
            primary=self._stub_engine("paddleocr"),
            fallback=self._stub_engine("google_vision"),
        )
        assert hybrid.name == "hybrid(paddleocr+google_vision)"

    def test_name_without_fallback(self) -> None:
        from app.engines.hybrid_engine import HybridEngine

        hybrid = HybridEngine(primary=self._stub_engine("paddleocr"))
        assert hybrid.name == "hybrid(paddleocr+none)"

    def test_high_confidence_skips_fallback(self) -> None:
        from app.engines.hybrid_engine import HybridEngine

        main = self._stub_engine("paddleocr")
        backup = self._stub_engine("cloud")
        main.recognize.return_value = _make_result("VIN123", 0.95, "paddleocr")

        hybrid = HybridEngine(primary=main, fallback=backup, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN123"
        assert outcome.engine_name == "paddleocr"
        backup.recognize.assert_not_called()

    def test_low_confidence_triggers_fallback(self) -> None:
        from app.engines.hybrid_engine import HybridEngine

        main = self._stub_engine("paddleocr")
        backup = self._stub_engine("google_vision")
        main.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
        backup.recognize.return_value = _make_result("VIN456", 0.92, "google_vision")

        hybrid = HybridEngine(primary=main, fallback=backup, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN456"
        assert outcome.engine_name == "google_vision"
        backup.recognize.assert_called_once()

    def test_low_confidence_no_fallback_returns_primary(self) -> None:
        from app.engines.hybrid_engine import HybridEngine

        main = self._stub_engine("paddleocr")
        main.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")

        hybrid = HybridEngine(primary=main, fallback=None, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN123"

    def test_fallback_lower_confidence_returns_primary(self) -> None:
        from app.engines.hybrid_engine import HybridEngine

        main = self._stub_engine("paddleocr")
        backup = self._stub_engine("google_vision")
        main.recognize.return_value = _make_result("VIN123", 0.4, "paddleocr")
        backup.recognize.return_value = _make_result("VIN456", 0.3, "google_vision")

        hybrid = HybridEngine(primary=main, fallback=backup, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN123"

    def test_fallback_engine_error_returns_primary(self) -> None:
        from app.engines.hybrid_engine import HybridEngine

        main = self._stub_engine("paddleocr")
        backup = self._stub_engine("google_vision")
        main.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
        backup.recognize.side_effect = EngineUnavailableError("key missing")

        hybrid = HybridEngine(primary=main, fallback=backup, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN123"

    def test_fallback_unexpected_error_returns_primary(self) -> None:
        from app.engines.hybrid_engine import HybridEngine

        main = self._stub_engine("paddleocr")
        backup = self._stub_engine("google_vision")
        main.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
        backup.recognize.side_effect = RuntimeError("network error")

        hybrid = HybridEngine(primary=main, fallback=backup, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN123"

    @patch("app.engines.hybrid_engine.time")
    def test_fallback_timeout_returns_primary(self, mock_time: MagicMock) -> None:
        from app.engines.hybrid_engine import HybridEngine

        main = self._stub_engine("paddleocr")
        backup = self._stub_engine("google_vision")
        main.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
        backup.recognize.return_value = _make_result("VIN456", 0.92, "google_vision")
        # Simulate 6-second delay (exceeds 5s limit)
        mock_time.monotonic.side_effect = [0.0, 6.0]

        hybrid = HybridEngine(primary=main, fallback=backup, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN123"  # timeout -> use primary

    def test_exact_threshold_skips_fallback(self) -> None:
        """When confidence == threshold, no fallback needed (>= check)."""
        from app.engines.hybrid_engine import HybridEngine

        main = self._stub_engine("paddleocr")
        backup = self._stub_engine("cloud")
        main.recognize.return_value = _make_result("VIN", 0.6, "paddleocr")

        hybrid = HybridEngine(primary=main, fallback=backup, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.engine_name == "paddleocr"
        backup.recognize.assert_not_called()
# ---------------------------------------------------------------------------
# Engine factory
# ---------------------------------------------------------------------------
class TestEngineFactory:
def test_unknown_engine_raises(self) -> None:
    """An unrecognized engine name is rejected with EngineUnavailableError."""
    from app.engines.engine_factory import _create_single_engine

    expectation = pytest.raises(EngineUnavailableError, match="Unknown engine")
    with expectation:
        _create_single_engine("nonexistent")
@patch("app.engines.engine_factory.settings")
@patch("app.engines.engine_factory._create_single_engine")
def test_defaults_to_settings_primary(
    self, mock_create: MagicMock, mock_settings: MagicMock
) -> None:
    """With no explicit name, the engine named in settings is created."""
    from app.engines.engine_factory import create_engine

    built = MagicMock(spec=OcrEngine)
    mock_create.return_value = built
    mock_settings.ocr_primary_engine = "paddleocr"
    mock_settings.ocr_fallback_engine = "none"

    produced = create_engine()

    mock_create.assert_called_once_with("paddleocr")
    assert produced == built
@patch("app.engines.engine_factory.settings")
@patch("app.engines.engine_factory._create_single_engine")
def test_explicit_name_overrides_settings(
self, mock_create: MagicMock, mock_settings: MagicMock
) -> None:
mock_settings.ocr_fallback_engine = "none"
mock_engine = MagicMock(spec=OcrEngine)
mock_create.return_value = mock_engine
from app.engines.engine_factory import create_engine
create_engine("tesseract")
mock_create.assert_called_once_with("tesseract")
@patch("app.engines.engine_factory.settings")
@patch("app.engines.engine_factory._create_single_engine")
def test_creates_hybrid_when_fallback_configured(
self, mock_create: MagicMock, mock_settings: MagicMock
) -> None:
mock_settings.ocr_primary_engine = "paddleocr"
mock_settings.ocr_fallback_engine = "google_vision"
mock_settings.ocr_fallback_threshold = 0.7
mock_primary = MagicMock(spec=OcrEngine)
mock_fallback = MagicMock(spec=OcrEngine)
mock_create.side_effect = [mock_primary, mock_fallback]
from app.engines.engine_factory import create_engine
from app.engines.hybrid_engine import HybridEngine
result = create_engine()
assert isinstance(result, HybridEngine)
@patch("app.engines.engine_factory.settings")
@patch("app.engines.engine_factory._create_single_engine")
def test_fallback_failure_returns_primary_only(
self, mock_create: MagicMock, mock_settings: MagicMock
) -> None:
mock_settings.ocr_primary_engine = "paddleocr"
mock_settings.ocr_fallback_engine = "google_vision"
mock_settings.ocr_fallback_threshold = 0.6
mock_primary = MagicMock(spec=OcrEngine)
mock_create.side_effect = [mock_primary, EngineUnavailableError("no key")]
from app.engines.engine_factory import create_engine
result = create_engine()
assert result == mock_primary

View File

@@ -1,11 +1,12 @@
"""Integration tests for VIN extraction endpoint.""" """Integration tests for VIN extraction endpoint and engine integration."""
import io import io
from unittest.mock import patch, MagicMock from unittest.mock import patch, MagicMock
import pytest import pytest
from fastapi.testclient import TestClient from fastapi.testclient import TestClient
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw
from app.engines.base_engine import OcrConfig, OcrEngineResult, WordBox
from app.main import app from app.main import app
@@ -240,3 +241,106 @@ class TestVinExtractionContentTypes:
) )
assert response.status_code == 200 assert response.status_code == 200
# ---------------------------------------------------------------------------
# VIN extractor engine integration tests
# ---------------------------------------------------------------------------
class TestVinExtractorEngineIntegration:
    """Checks that VinExtractor talks to the OCR engine abstraction as expected."""

    @staticmethod
    def _stub_engine(
        mock_create_engine: MagicMock, ocr_result: OcrEngineResult
    ) -> MagicMock:
        """Make the patched ``create_engine`` return a mock yielding ``ocr_result``."""
        engine = MagicMock()
        engine.recognize.return_value = ocr_result
        mock_create_engine.return_value = engine
        return engine

    @patch("app.extractors.vin_extractor.create_engine")
    def test_perform_ocr_calls_engine_with_vin_config(
        self, mock_create_engine: MagicMock
    ) -> None:
        """_perform_ocr hands the engine a config with the VIN whitelist and angle_cls."""
        from app.extractors.vin_extractor import VinExtractor

        engine = self._stub_engine(
            mock_create_engine,
            OcrEngineResult(
                text="1HGBH41JXMN109186",
                confidence=0.94,
                word_boxes=[WordBox(text="1HGBH41JXMN109186", confidence=0.94)],
                engine_name="paddleocr",
            ),
        )

        vin_extractor = VinExtractor()
        text, confidences = vin_extractor._perform_ocr(b"fake_image")

        engine.recognize.assert_called_once()
        # The config is read as the second positional argument of recognize().
        positional_args = engine.recognize.call_args[0]
        passed_config = positional_args[1]
        assert isinstance(passed_config, OcrConfig)
        assert passed_config.char_whitelist == VinExtractor.VIN_WHITELIST
        assert passed_config.use_angle_cls is True
        assert passed_config.single_line is False
        assert passed_config.single_word is False
        assert text == "1HGBH41JXMN109186"
        assert confidences == [0.94]

    @patch("app.extractors.vin_extractor.create_engine")
    def test_perform_ocr_single_line_mode(
        self, mock_create_engine: MagicMock
    ) -> None:
        """single_line=True reaches the engine config; single_word stays False."""
        from app.extractors.vin_extractor import VinExtractor

        engine = self._stub_engine(
            mock_create_engine,
            OcrEngineResult(
                text="VIN123", confidence=0.9, word_boxes=[], engine_name="paddleocr"
            ),
        )

        vin_extractor = VinExtractor()
        vin_extractor._perform_ocr(b"img", single_line=True)

        passed_config = engine.recognize.call_args[0][1]
        assert passed_config.single_line is True
        assert passed_config.single_word is False

    @patch("app.extractors.vin_extractor.create_engine")
    def test_perform_ocr_single_word_mode(
        self, mock_create_engine: MagicMock
    ) -> None:
        """single_word=True reaches the engine config; single_line stays False."""
        from app.extractors.vin_extractor import VinExtractor

        engine = self._stub_engine(
            mock_create_engine,
            OcrEngineResult(
                text="VIN123", confidence=0.9, word_boxes=[], engine_name="paddleocr"
            ),
        )

        vin_extractor = VinExtractor()
        vin_extractor._perform_ocr(b"img", single_word=True)

        passed_config = engine.recognize.call_args[0][1]
        assert passed_config.single_word is True
        assert passed_config.single_line is False

    def test_calculate_base_confidence_empty_returns_default(self) -> None:
        """An empty confidence list gives the 0.5 default."""
        from app.extractors.vin_extractor import VinExtractor

        # __new__ creates the instance without running __init__.
        bare_extractor = VinExtractor.__new__(VinExtractor)

        assert bare_extractor._calculate_base_confidence([]) == 0.5

    def test_calculate_base_confidence_weighted_blend(self) -> None:
        """Two values blend as 70% of the average plus 30% of the minimum."""
        from app.extractors.vin_extractor import VinExtractor

        bare_extractor = VinExtractor.__new__(VinExtractor)

        blended = bare_extractor._calculate_base_confidence([0.9, 0.8])

        # avg = (0.9 + 0.8) / 2 = 0.85, min = 0.8
        # expected = 0.7 * 0.85 + 0.3 * 0.8 = 0.595 + 0.24 = 0.835
        assert abs(blended - 0.835) < 0.001

    def test_calculate_base_confidence_single_value(self) -> None:
        """With one value, average and minimum coincide, so the value comes back."""
        from app.extractors.vin_extractor import VinExtractor

        bare_extractor = VinExtractor.__new__(VinExtractor)

        blended = bare_extractor._calculate_base_confidence([0.92])

        assert abs(blended - 0.92) < 0.001