feat: Improve OCR process - replace Tesseract with PaddleOCR (#115) #122
@@ -108,7 +108,7 @@
|
||||
},
|
||||
"mvp-ocr": {
|
||||
"type": "ocr_service",
|
||||
"description": "Python-based OCR for document text extraction",
|
||||
"description": "Python OCR service with pluggable engine abstraction (PaddleOCR PP-OCRv4 primary, optional Google Vision cloud fallback, Tesseract backward compat)",
|
||||
"port": 8000
|
||||
},
|
||||
"mvp-loki": {
|
||||
|
||||
@@ -18,5 +18,5 @@
|
||||
| `AUDIT.md` | Audit documentation | Security audits, compliance |
|
||||
| `MVP-COLOR-SCHEME.md` | Color scheme reference | UI styling decisions |
|
||||
| `LOGGING.md` | Unified logging system | Log levels, correlation IDs, Grafana |
|
||||
| `ocr-pipeline-tech-stack.md` | OCR pipeline technology decisions | OCR architecture, Tesseract setup |
|
||||
| `ocr-pipeline-tech-stack.md` | OCR pipeline technology decisions | OCR architecture, PaddleOCR engine abstraction |
|
||||
| `TIER-GATING.md` | Subscription tier gating rules | Feature access by tier, vehicle limits |
|
||||
|
||||
@@ -118,35 +118,48 @@
|
||||
│ ├─────────────────────────────────────────────────────────┤
|
||||
│ │ │
|
||||
│ │ ┌─────────────────────────────────────────────────┐ │
|
||||
│ │ │ 5a. Primary OCR: Tesseract 5.x │ │
|
||||
│ │ │ │ │
|
||||
│ │ │ • Engine: LSTM (--oem 1) │ │
|
||||
│ │ │ • Page segmentation: Auto (--psm 3) │ │
|
||||
│ │ │ • Output: hOCR with word confidence │ │
|
||||
│ │ │ 5a. Engine Abstraction Layer │ │
|
||||
│ │ │ │ │
|
||||
│ │ │ OcrEngine ABC -> PaddleOcrEngine (primary) │ │
|
||||
│ │ │ -> CloudEngine (optional fallback) │ │
|
||||
│ │ │ -> TesseractEngine (backward compat)│ │
|
||||
│ │ │ -> HybridEngine (primary+fallback) │ │
|
||||
│ │ └─────────────────────────────────────────────────┘ │
|
||||
│ │ │ │
|
||||
│ │ ▼ │
|
||||
│ │ ┌─────────────────────────────────────────────────┐ │
|
||||
│ │ │ 5b. Primary OCR: PaddleOCR PP-OCRv4 │ │
|
||||
│ │ │ │ │
|
||||
│ │ │ • Scene text detection + angle classification │ │
|
||||
│ │ │ • CPU-only, models baked into Docker image │ │
|
||||
│ │ │ • Normalized output: text, confidence, word boxes│ │
|
||||
│ │ └─────────────────────────────────────────────────┘ │
|
||||
│ │ │ │
|
||||
│ │ ▼ │
|
||||
│ │ ┌───────────────┐ │
|
||||
│ │ │ Confidence │ │
|
||||
│ │ │ > 80% ? │ │
|
||||
│ │ │ >= 60% ? │ │
|
||||
│ │ └───────────────┘ │
|
||||
│ │ │ │ │
|
||||
│ │ YES ──┘ └── NO │
|
||||
│ │ YES ──┘ └── NO (and cloud enabled) │
|
||||
│ │ │ │ │
|
||||
│ │ │ ▼ │
|
||||
│ │ │ ┌─────────────────────────────────┐ │
|
||||
│ │ │ │ 5b. Fallback: PaddleOCR │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ │ │ • Better for degraded images │ │
|
||||
│ │ │ │ • Better table detection │ │
|
||||
│ │ │ │ • Slower but more accurate │ │
|
||||
│ │ │ │ 5c. Optional Cloud Fallback │ │
|
||||
│ │ │ │ (Google Vision API) │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ │ │ • Disabled by default │ │
|
||||
│ │ │ │ • 5-second timeout guard │ │
|
||||
│ │ │ │ • Returns higher-confidence │ │
|
||||
│ │ │ │ result of primary vs fallback │ │
|
||||
│ │ │ └─────────────────────────────────┘ │
|
||||
│ │ │ │ │
|
||||
│ │ ▼ ▼ │
|
||||
│ │ ┌─────────────────────────────────┐ │
|
||||
│ │ │ 5c. Result Merging │ │
|
||||
│ │ │ • Merge by bounding box │ │
|
||||
│ │ │ 5d. HybridEngine Result │ │
|
||||
│ │ │ • Compare confidences │ │
|
||||
│ │ │ • Keep highest confidence │ │
|
||||
│ │ │ • Graceful fallback on error │ │
|
||||
│ │ └─────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ └─────────────────────────────────────────────────────────┘
|
||||
@@ -257,10 +270,10 @@
|
||||
|
||||
| Component | Tool | Purpose |
|
||||
|------------------------|-----------------------|--------------------------------------|
|
||||
| **Primary OCR** | Tesseract 5.x | Fast, reliable text extraction |
|
||||
| **Python Binding** | pytesseract | Tesseract Python wrapper |
|
||||
| **Fallback OCR** | PaddleOCR | Higher accuracy, better tables |
|
||||
| **Layout Analysis** | PaddleOCR / LayoutParser | Document structure detection |
|
||||
| **Primary OCR** | PaddleOCR PP-OCRv4 | Highest accuracy scene text, CPU-only |
|
||||
| **Cloud Fallback** | Google Vision API | Optional cloud fallback (disabled by default) |
|
||||
| **Backward Compat** | Tesseract 5.x / pytesseract | Legacy engine, configurable via env var |
|
||||
| **Engine Abstraction** | `OcrEngine` ABC | Pluggable engine interface in `ocr/app/engines/` |
|
||||
|
||||
### Data Extraction
|
||||
|
||||
@@ -291,85 +304,93 @@
|
||||
fastapi>=0.100.0
|
||||
uvicorn[standard]>=0.23.0
|
||||
python-multipart>=0.0.6
|
||||
|
||||
# Task Queue
|
||||
celery>=5.3.0
|
||||
redis>=4.6.0
|
||||
pydantic>=2.0.0
|
||||
|
||||
# File Detection & Handling
|
||||
python-magic>=0.4.27
|
||||
pillow>=10.0.0
|
||||
pillow-heif>=0.13.0
|
||||
|
||||
# PDF Processing
|
||||
pymupdf>=1.23.0
|
||||
|
||||
# Image Preprocessing
|
||||
opencv-python-headless>=4.8.0
|
||||
deskew>=1.4.0
|
||||
scikit-image>=0.21.0
|
||||
numpy>=1.24.0
|
||||
|
||||
# OCR Engines
|
||||
pytesseract>=0.3.10
|
||||
paddlepaddle>=2.5.0
|
||||
paddleocr>=2.7.0
|
||||
paddlepaddle>=2.6.0
|
||||
paddleocr>=2.8.0
|
||||
google-cloud-vision>=3.7.0
|
||||
|
||||
# Table Extraction
|
||||
img2table>=1.2.0
|
||||
camelot-py[cv]>=0.11.0
|
||||
# PDF Processing
|
||||
PyMuPDF>=1.23.0
|
||||
|
||||
# NLP & Data
|
||||
spacy>=3.6.0
|
||||
pandas>=2.0.0
|
||||
# Redis for job queue
|
||||
redis>=5.0.0
|
||||
|
||||
# Storage & Database
|
||||
boto3>=1.28.0
|
||||
psycopg2-binary>=2.9.0
|
||||
sqlalchemy>=2.0.0
|
||||
# HTTP client for callbacks
|
||||
httpx>=0.24.0
|
||||
|
||||
# Testing
|
||||
pytest>=7.4.0
|
||||
pytest-asyncio>=0.21.0
|
||||
```
|
||||
|
||||
### System Package Requirements (Ubuntu/Debian)
|
||||
|
||||
```bash
|
||||
# Tesseract OCR
|
||||
apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev
|
||||
# Tesseract OCR (backward compatibility engine)
|
||||
apt-get install tesseract-ocr tesseract-ocr-eng
|
||||
|
||||
# PaddlePaddle OpenMP runtime
|
||||
apt-get install libgomp1
|
||||
|
||||
# HEIC Support
|
||||
apt-get install libheif-examples libheif-dev
|
||||
apt-get install libheif1 libheif-dev
|
||||
|
||||
# OpenCV dependencies
|
||||
apt-get install libgl1-mesa-glx libglib2.0-0
|
||||
# GLib (OpenCV dependency)
|
||||
apt-get install libglib2.0-0
|
||||
|
||||
# PDF rendering dependencies
|
||||
apt-get install libmupdf-dev mupdf-tools
|
||||
|
||||
# Image processing
|
||||
apt-get install libmagic1 ghostscript
|
||||
|
||||
# Camelot dependencies
|
||||
apt-get install ghostscript python3-tk
|
||||
# File type detection
|
||||
apt-get install libmagic1
|
||||
```
|
||||
|
||||
### Environment Variables
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `OCR_PRIMARY_ENGINE` | `paddleocr` | Primary OCR engine (`paddleocr`, `tesseract`) |
|
||||
| `OCR_CONFIDENCE_THRESHOLD` | `0.6` | Minimum confidence threshold |
|
||||
| `OCR_FALLBACK_ENGINE` | `none` | Fallback engine (`google_vision`, `none`) |
|
||||
| `OCR_FALLBACK_THRESHOLD` | `0.6` | Confidence below this triggers fallback |
|
||||
| `GOOGLE_VISION_KEY_PATH` | `/run/secrets/google-vision-key.json` | Path to Google Vision service account key |
|
||||
|
||||
---
|
||||
|
||||
## DOCKERFILE
|
||||
|
||||
```dockerfile
|
||||
FROM python:3.11-slim
|
||||
# Primary engine: PaddleOCR PP-OCRv4 (models baked into image)
|
||||
# Backward compat: Tesseract 5.x (optional, via TesseractEngine)
|
||||
# Cloud fallback: Google Vision (optional, requires API key at runtime)
|
||||
|
||||
FROM python:3.13-slim
|
||||
|
||||
# System dependencies
|
||||
# - tesseract-ocr/eng: Backward-compatible OCR engine
|
||||
# - libgomp1: OpenMP runtime required by PaddlePaddle
|
||||
# - libheif1/libheif-dev: HEIF image support (iPhone photos)
|
||||
# - libglib2.0-0: GLib shared library (OpenCV dependency)
|
||||
# - libmagic1: File type detection
|
||||
# - curl: Health check endpoint
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-eng \
|
||||
libtesseract-dev \
|
||||
libheif-examples \
|
||||
libgomp1 \
|
||||
libheif1 \
|
||||
libheif-dev \
|
||||
libgl1-mesa-glx \
|
||||
libglib2.0-0 \
|
||||
libmagic1 \
|
||||
ghostscript \
|
||||
poppler-utils \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Python dependencies
|
||||
@@ -377,11 +398,9 @@ WORKDIR /app
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Download spaCy model
|
||||
RUN python -m spacy download en_core_web_sm
|
||||
|
||||
# Download PaddleOCR models (cached in image)
|
||||
RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en')"
|
||||
# Pre-download PaddleOCR PP-OCRv4 models during build (not at runtime)
|
||||
RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=False)" \
|
||||
&& echo "PaddleOCR PP-OCRv4 models downloaded and verified"
|
||||
|
||||
COPY . .
|
||||
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
# ocr/
|
||||
|
||||
Python OCR microservice. Primary engine: PaddleOCR PP-OCRv4 with optional Google Vision cloud fallback. Pluggable engine abstraction in `app/engines/`.
|
||||
|
||||
## Files
|
||||
|
||||
| File | What | When to read |
|
||||
| ---- | ---- | ------------ |
|
||||
| `Dockerfile` | Container build definition | Docker builds, deployment |
|
||||
| `Dockerfile` | Container build (PaddleOCR models baked in) | Docker builds, deployment |
|
||||
| `requirements.txt` | Python dependencies | Adding dependencies |
|
||||
|
||||
## Subdirectories
|
||||
@@ -12,4 +14,5 @@
|
||||
| Directory | What | When to read |
|
||||
| --------- | ---- | ------------ |
|
||||
| `app/` | FastAPI application source | OCR endpoint development |
|
||||
| `app/engines/` | Engine abstraction layer (OcrEngine ABC, factory, hybrid) | Adding or changing OCR engines |
|
||||
| `tests/` | Test suite | Adding or modifying tests |
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
|
||||
| Directory | What | When to read |
|
||||
| --------- | ---- | ------------ |
|
||||
| `engines/` | OCR engine abstraction (PaddleOCR primary, Google Vision fallback, Tesseract compat) | Engine changes, adding new engines |
|
||||
| `extractors/` | Data extraction logic | Adding new extraction types |
|
||||
| `models/` | Data models and schemas | Request/response types |
|
||||
| `patterns/` | Regex and parsing patterns | Pattern matching rules |
|
||||
|
||||
675
ocr/tests/test_engine_abstraction.py
Normal file
675
ocr/tests/test_engine_abstraction.py
Normal file
@@ -0,0 +1,675 @@
|
||||
"""Tests for OCR engine abstraction layer.
|
||||
|
||||
Covers: base types, exception hierarchy, PaddleOcrEngine,
|
||||
TesseractEngine, CloudEngine, HybridEngine, and engine_factory.
|
||||
"""
|
||||
|
||||
import io
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from app.engines.base_engine import (
|
||||
EngineError,
|
||||
EngineProcessingError,
|
||||
EngineUnavailableError,
|
||||
OcrConfig,
|
||||
OcrEngine,
|
||||
OcrEngineResult,
|
||||
WordBox,
|
||||
)
|
||||
|
||||
|
||||
# --- Helpers ---
|
||||
|
||||
|
||||
def _create_test_image_bytes() -> bytes:
    """Return the PNG encoding of a blank white 100x50 RGB image."""
    png_buffer = io.BytesIO()
    blank = Image.new("RGB", (100, 50), (255, 255, 255))
    blank.save(png_buffer, format="PNG")
    return png_buffer.getvalue()
|
||||
|
||||
|
||||
def _make_result(text: str, confidence: float, engine_name: str) -> OcrEngineResult:
    """Build an OcrEngineResult with the given fields and no word boxes."""
    fields = {
        "text": text,
        "confidence": confidence,
        "word_boxes": [],
        "engine_name": engine_name,
    }
    return OcrEngineResult(**fields)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Exception hierarchy
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestExceptionHierarchy:
    """Engine errors form a proper hierarchy under EngineError."""

    def test_unavailable_is_engine_error(self) -> None:
        """EngineUnavailableError derives from EngineError."""
        assert issubclass(EngineUnavailableError, EngineError)

    def test_processing_is_engine_error(self) -> None:
        """EngineProcessingError derives from EngineError."""
        assert issubclass(EngineProcessingError, EngineError)

    def test_engine_error_is_exception(self) -> None:
        """The base EngineError is itself an ordinary Exception."""
        assert issubclass(EngineError, Exception)

    def test_catch_base_catches_subtypes(self) -> None:
        """Raising either subtype satisfies an expectation on the base type."""
        subtype_instances = (
            EngineUnavailableError("not installed"),
            EngineProcessingError("OCR failed"),
        )
        for raised in subtype_instances:
            with pytest.raises(EngineError):
                raise raised
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data types
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestWordBox:
    """WordBox construction with default and explicit geometry."""

    def test_default_positions(self) -> None:
        """Geometry fields are zero when only text and confidence are given."""
        box = WordBox(text="VIN", confidence=0.95)
        for geometry_field in ("x", "y", "width", "height"):
            assert getattr(box, geometry_field) == 0

    def test_all_fields(self) -> None:
        """Explicitly supplied fields are stored unchanged."""
        box = WordBox(text="ABC", confidence=0.88, x=10, y=20, width=100, height=30)
        expected = {"text": "ABC", "confidence": 0.88, "x": 10, "width": 100}
        for field_name, value in expected.items():
            assert getattr(box, field_name) == value
|
||||
|
||||
|
||||
class TestOcrConfig:
    """OcrConfig defaults, whitelist storage, and per-instance hints."""

    def test_defaults(self) -> None:
        """A bare OcrConfig has no whitelist, no mode flags, angle cls on, empty hints."""
        cfg = OcrConfig()
        assert cfg.char_whitelist is None
        assert cfg.single_line is False
        assert cfg.single_word is False
        assert cfg.use_angle_cls is True
        assert cfg.hints == {}

    def test_vin_whitelist_excludes_ioq(self) -> None:
        """The VIN alphabet passed as whitelist contains none of I, O, Q."""
        vin_alphabet = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
        cfg = OcrConfig(char_whitelist=vin_alphabet)
        for banned_letter in ("I", "O", "Q"):
            assert banned_letter not in cfg.char_whitelist

    def test_hints_are_independent_across_instances(self) -> None:
        """Mutating one instance's hints must not leak into another instance."""
        first, second = OcrConfig(), OcrConfig()
        first.hints["psm"] = 7
        assert "psm" not in second.hints
|
||||
|
||||
|
||||
class TestOcrEngineResult:
    """OcrEngineResult stores the values it is constructed with."""

    def test_construction(self) -> None:
        """A populated result exposes text, confidence, boxes, and engine name."""
        vin_box = WordBox(text="1HGBH41JXMN109186", confidence=0.94)
        outcome = OcrEngineResult(
            text="1HGBH41JXMN109186",
            confidence=0.94,
            word_boxes=[vin_box],
            engine_name="paddleocr",
        )
        assert outcome.text == "1HGBH41JXMN109186"
        assert outcome.confidence == 0.94
        assert len(outcome.word_boxes) == 1
        assert outcome.engine_name == "paddleocr"

    def test_empty_result(self) -> None:
        """An empty result keeps its empty text and empty box list."""
        empty_fields = {
            "text": "",
            "confidence": 0.0,
            "word_boxes": [],
            "engine_name": "tesseract",
        }
        outcome = OcrEngineResult(**empty_fields)
        assert outcome.text == ""
        assert outcome.word_boxes == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OcrEngine ABC
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestOcrEngineABC:
    """OcrEngine is abstract; a subclass defining name and recognize is usable."""

    def test_cannot_instantiate_directly(self) -> None:
        """Instantiating the abstract base raises TypeError."""
        with pytest.raises(TypeError):
            OcrEngine()  # type: ignore[abstract]

    def test_concrete_subclass_works(self) -> None:
        """A minimal concrete subclass can be constructed and called."""

        class StubEngine(OcrEngine):
            """Smallest concrete engine: fixed name, canned result."""

            @property
            def name(self) -> str:
                return "stub"

            def recognize(
                self, image_bytes: bytes, config: OcrConfig
            ) -> OcrEngineResult:
                canned = {
                    "text": "ok",
                    "confidence": 1.0,
                    "word_boxes": [],
                    "engine_name": "stub",
                }
                return OcrEngineResult(**canned)

        stub = StubEngine()
        assert stub.name == "stub"
        outcome = stub.recognize(b"", OcrConfig())
        assert outcome.text == "ok"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# PaddleOcrEngine
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestPaddleOcrEngine:
    """PaddleOcrEngine behavior, exercised against a mocked PaddleOCR backend."""

    @staticmethod
    def _engine_returning(ocr_output):
        """Return a PaddleOcrEngine whose backend ``ocr()`` yields *ocr_output*.

        The backend is a MagicMock assigned directly to ``engine._ocr`` so the
        real PaddleOCR models are never loaded.
        """
        from app.engines.paddle_engine import PaddleOcrEngine

        backend = MagicMock()
        backend.ocr.return_value = ocr_output
        engine = PaddleOcrEngine()
        engine._ocr = backend
        return engine

    def test_name(self) -> None:
        """The engine reports its name as ``paddleocr``."""
        from app.engines.paddle_engine import PaddleOcrEngine

        assert PaddleOcrEngine().name == "paddleocr"

    def test_lazy_init_not_loaded_at_construction(self) -> None:
        """Constructing the engine leaves the backend handle unset."""
        from app.engines.paddle_engine import PaddleOcrEngine

        assert PaddleOcrEngine()._ocr is None

    def test_recognize_empty_results(self) -> None:
        """A ``[None]`` backend response maps to an empty, zero-confidence result."""
        engine = self._engine_returning([None])

        result = engine.recognize(_create_test_image_bytes(), OcrConfig())
        assert result.text == ""
        assert result.confidence == 0.0
        assert result.word_boxes == []
        assert result.engine_name == "paddleocr"

    def test_recognize_with_results(self) -> None:
        """Two detections are joined with a space and their confidences averaged."""
        engine = self._engine_returning(
            [
                [
                    [[[10, 20], [110, 20], [110, 50], [10, 50]], ("HELLO", 0.95)],
                    [[[10, 60], [110, 60], [110, 90], [10, 90]], ("WORLD", 0.88)],
                ]
            ]
        )

        result = engine.recognize(_create_test_image_bytes(), OcrConfig())
        assert result.text == "HELLO WORLD"
        assert abs(result.confidence - 0.915) < 0.01
        assert len(result.word_boxes) == 2
        assert result.word_boxes[0].text == "HELLO"
        assert result.word_boxes[0].confidence == 0.95
        assert result.word_boxes[1].text == "WORLD"
        assert result.engine_name == "paddleocr"

    def test_recognize_whitelist_filters_characters(self) -> None:
        """Characters outside the whitelist are removed from recognized text."""
        engine = self._engine_returning(
            [
                [
                    [[[0, 0], [100, 0], [100, 30], [0, 30]], ("1HG-BH4!", 0.9)],
                ]
            ]
        )

        config = OcrConfig(char_whitelist="ABCDEFGHJKLMNPRSTUVWXYZ0123456789")
        result = engine.recognize(_create_test_image_bytes(), config)
        assert "-" not in result.text
        assert "!" not in result.text
        assert result.word_boxes[0].text == "1HGBH4"

    def test_recognize_quadrilateral_to_bounding_box(self) -> None:
        """A four-point quad is reduced to its axis-aligned bounding box."""
        # Slightly rotated quad: min x=8, min y=20, max x=110, max y=55
        engine = self._engine_returning(
            [
                [
                    [[[10, 20], [110, 25], [108, 55], [8, 50]], ("TEXT", 0.9)],
                ]
            ]
        )

        result = engine.recognize(_create_test_image_bytes(), OcrConfig())
        box = result.word_boxes[0]
        assert box.x == 8
        assert box.y == 20
        assert box.width == 102  # 110 - 8
        assert box.height == 35  # 55 - 20

    def test_recognize_skips_empty_after_whitelist(self) -> None:
        """Text consisting only of non-whitelisted characters is skipped."""
        engine = self._engine_returning(
            [
                [
                    [[[0, 0], [50, 0], [50, 20], [0, 20]], ("---", 0.9)],
                ]
            ]
        )

        config = OcrConfig(char_whitelist="ABC")
        result = engine.recognize(_create_test_image_bytes(), config)
        assert result.text == ""
        assert result.word_boxes == []
        assert result.confidence == 0.0

    def test_import_error_raises_unavailable(self) -> None:
        """A missing ``paddleocr`` package surfaces as EngineUnavailableError."""
        from app.engines.paddle_engine import PaddleOcrEngine

        engine = PaddleOcrEngine()
        engine._ocr = None

        # A ``None`` entry in sys.modules makes the import system raise
        # ImportError for that name, whether it is imported via an ``import``
        # statement or via importlib.import_module(). No patching of
        # importlib or builtins.__import__ is needed; replacing
        # importlib.import_module globally would also break unrelated
        # imports performed while the patch is active.
        with patch.dict("sys.modules", {"paddleocr": None}):
            with pytest.raises(EngineUnavailableError, match="paddleocr"):
                engine._get_ocr()

    def test_processing_error_on_exception(self) -> None:
        """A backend exception is re-raised as EngineProcessingError."""
        from app.engines.paddle_engine import PaddleOcrEngine

        crashing_backend = MagicMock()
        crashing_backend.ocr.side_effect = RuntimeError("OCR crashed")
        engine = PaddleOcrEngine()
        engine._ocr = crashing_backend

        with pytest.raises(EngineProcessingError, match="PaddleOCR recognition failed"):
            engine.recognize(_create_test_image_bytes(), OcrConfig())
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TesseractEngine
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestTesseractEngine:
    """Tests for TesseractEngine using mocked pytesseract."""

    @pytest.fixture()
    def engine(self) -> "TesseractEngine":  # type: ignore[name-defined]
        """Create a TesseractEngine with mocked pytesseract dependency.

        The engine is constructed while ``pytesseract`` in sys.modules and the
        module-level ``settings`` object are both replaced by mocks.
        """
        fake_pytesseract = MagicMock()
        fake_pytesseract.Output.DICT = "dict"

        with patch.dict("sys.modules", {"pytesseract": fake_pytesseract}):
            with patch("app.engines.tesseract_engine.settings") as fake_settings:
                fake_settings.tesseract_cmd = "/usr/bin/tesseract"
                from app.engines.tesseract_engine import TesseractEngine

                eng = TesseractEngine()
                # Keep a handle on the mock so tests can reach it if needed.
                eng._mock_pytesseract = fake_pytesseract  # type: ignore[attr-defined]
                return eng

    def test_name(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        """The engine reports its name as ``tesseract``."""
        assert engine.name == "tesseract"

    def test_build_config_default_psm(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        """A default OcrConfig yields page segmentation mode 6."""
        assert "--psm 6" in engine._build_config(OcrConfig())

    def test_build_config_single_line(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        """``single_line=True`` selects page segmentation mode 7."""
        assert "--psm 7" in engine._build_config(OcrConfig(single_line=True))

    def test_build_config_single_word(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        """``single_word=True`` selects page segmentation mode 8."""
        assert "--psm 8" in engine._build_config(OcrConfig(single_word=True))

    def test_build_config_whitelist(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        """A character whitelist is passed as a tessedit config variable."""
        config_str = engine._build_config(OcrConfig(char_whitelist="ABC123"))
        assert "-c tessedit_char_whitelist=ABC123" in config_str

    def test_build_config_psm_hint(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        """An explicit ``psm`` hint is used as the page segmentation mode."""
        config_str = engine._build_config(OcrConfig(hints={"psm": 11}))
        assert "--psm 11" in config_str

    def test_recognize_normalizes_confidence(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        """Tesseract returns 0-100 confidence; engine normalizes to 0.0-1.0."""
        # NOTE(review): the geometry lists are shorter than "text"/"conf";
        # this presumably relies on the engine skipping the empty entry
        # before reading its box — confirm against TesseractEngine.recognize.
        engine._pytesseract.image_to_data.return_value = {
            "text": ["HELLO", ""],
            "conf": [92, -1],
            "left": [10],
            "top": [20],
            "width": [100],
            "height": [30],
        }

        result = engine.recognize(_create_test_image_bytes(), OcrConfig())
        assert result.text == "HELLO"
        assert abs(result.confidence - 0.92) < 0.01
        assert result.engine_name == "tesseract"

    def test_import_error_raises_unavailable(self) -> None:
        """A missing ``pytesseract`` package surfaces as EngineUnavailableError."""
        # A ``None`` entry in sys.modules makes ``import pytesseract`` raise
        # ImportError. builtins.__import__ is deliberately NOT patched here:
        # a hook that itself calls ``__import__`` while the patch is active
        # resolves to the hook again and recurses until RecursionError.
        with patch.dict("sys.modules", {"pytesseract": None}):
            with patch("app.engines.tesseract_engine.settings") as fake_settings:
                fake_settings.tesseract_cmd = "/usr/bin/tesseract"
                from app.engines.tesseract_engine import TesseractEngine

                with pytest.raises(EngineUnavailableError, match="pytesseract"):
                    TesseractEngine()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CloudEngine
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCloudEngine:
    """CloudEngine (Google Vision) behavior with the client and SDK mocked out."""

    @staticmethod
    def _fake_google_modules() -> dict:
        """Return sys.modules stubs for the ``google.cloud.vision`` import chain."""
        return {
            "google.cloud.vision": MagicMock(),
            "google.cloud": MagicMock(),
            "google": MagicMock(),
        }

    @staticmethod
    def _engine_with_response(response: MagicMock):
        """Return a CloudEngine whose client answers text_detection with *response*."""
        from app.engines.cloud_engine import CloudEngine

        client = MagicMock()
        client.text_detection.return_value = response
        engine = CloudEngine(key_path="/fake/key.json")
        engine._client = client
        return engine

    def test_name(self) -> None:
        """The engine reports its name as ``google_vision``."""
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/path.json")
        assert engine.name == "google_vision"

    def test_lazy_init_not_loaded_at_construction(self) -> None:
        """Constructing the engine leaves the client handle unset."""
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/path.json")
        assert engine._client is None

    def test_missing_key_file_raises_unavailable(self) -> None:
        """A key path that does not exist surfaces as EngineUnavailableError."""
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/nonexistent/key.json")
        with pytest.raises(EngineUnavailableError, match="key not found"):
            engine._get_client()

    @patch("os.path.isfile", return_value=True)
    def test_missing_library_raises_unavailable(self, _mock_isfile: MagicMock) -> None:
        """A missing google-cloud-vision SDK surfaces as EngineUnavailableError."""
        import builtins

        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/key.json")

        # Capture the real importer BEFORE patching. Looking up ``__import__``
        # inside the hook would resolve to the patched hook itself and recurse
        # without bound for any module other than google.cloud.
        real_import = builtins.__import__

        def blocking_import(name, *args, **kwargs):
            if "google.cloud" in name:
                raise ImportError("No module named 'google.cloud'")
            return real_import(name, *args, **kwargs)

        with patch("builtins.__import__", side_effect=blocking_import):
            with pytest.raises(EngineUnavailableError, match="google-cloud-vision"):
                engine._get_client()

    def test_recognize_empty_annotations(self) -> None:
        """A response with no text annotations maps to an empty result."""
        response = MagicMock()
        response.error.message = ""
        response.text_annotations = []
        engine = self._engine_with_response(response)

        # Mock the google.cloud.vision import inside recognize()
        with patch.dict("sys.modules", self._fake_google_modules()):
            result = engine.recognize(b"fake_image", OcrConfig())

        assert result.text == ""
        assert result.confidence == 0.0
        assert result.engine_name == "google_vision"

    def test_recognize_api_error_raises_processing_error(self) -> None:
        """A non-empty API error message is raised as EngineProcessingError."""
        response = MagicMock()
        response.error.message = "API quota exceeded"
        engine = self._engine_with_response(response)

        with patch.dict("sys.modules", self._fake_google_modules()):
            with pytest.raises(EngineProcessingError, match="API quota exceeded"):
                engine.recognize(b"fake_image", OcrConfig())
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HybridEngine
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestHybridEngine:
|
||||
def test_name_with_fallback(self) -> None:
|
||||
from app.engines.hybrid_engine import HybridEngine
|
||||
|
||||
primary = MagicMock(spec=OcrEngine)
|
||||
primary.name = "paddleocr"
|
||||
fallback = MagicMock(spec=OcrEngine)
|
||||
fallback.name = "google_vision"
|
||||
engine = HybridEngine(primary=primary, fallback=fallback)
|
||||
assert engine.name == "hybrid(paddleocr+google_vision)"
|
||||
|
||||
def test_name_without_fallback(self) -> None:
|
||||
from app.engines.hybrid_engine import HybridEngine
|
||||
|
||||
primary = MagicMock(spec=OcrEngine)
|
||||
primary.name = "paddleocr"
|
||||
engine = HybridEngine(primary=primary)
|
||||
assert engine.name == "hybrid(paddleocr+none)"
|
||||
|
||||
def test_high_confidence_skips_fallback(self) -> None:
|
||||
from app.engines.hybrid_engine import HybridEngine
|
||||
|
||||
primary = MagicMock(spec=OcrEngine)
|
||||
fallback = MagicMock(spec=OcrEngine)
|
||||
primary.name = "paddleocr"
|
||||
fallback.name = "cloud"
|
||||
primary.recognize.return_value = _make_result("VIN123", 0.95, "paddleocr")
|
||||
|
||||
engine = HybridEngine(primary=primary, fallback=fallback, threshold=0.6)
|
||||
result = engine.recognize(b"img", OcrConfig())
|
||||
assert result.text == "VIN123"
|
||||
assert result.engine_name == "paddleocr"
|
||||
fallback.recognize.assert_not_called()
|
||||
|
||||
def test_low_confidence_triggers_fallback(self) -> None:
|
||||
from app.engines.hybrid_engine import HybridEngine
|
||||
|
||||
primary = MagicMock(spec=OcrEngine)
|
||||
fallback = MagicMock(spec=OcrEngine)
|
||||
primary.name = "paddleocr"
|
||||
fallback.name = "google_vision"
|
||||
primary.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
|
||||
fallback.recognize.return_value = _make_result("VIN456", 0.92, "google_vision")
|
||||
|
||||
engine = HybridEngine(primary=primary, fallback=fallback, threshold=0.6)
|
||||
result = engine.recognize(b"img", OcrConfig())
|
||||
assert result.text == "VIN456"
|
||||
assert result.engine_name == "google_vision"
|
||||
fallback.recognize.assert_called_once()
|
||||
|
||||
def test_low_confidence_no_fallback_returns_primary(self) -> None:
|
||||
from app.engines.hybrid_engine import HybridEngine
|
||||
|
||||
primary = MagicMock(spec=OcrEngine)
|
||||
primary.name = "paddleocr"
|
||||
primary.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
|
||||
|
||||
engine = HybridEngine(primary=primary, fallback=None, threshold=0.6)
|
||||
result = engine.recognize(b"img", OcrConfig())
|
||||
assert result.text == "VIN123"
|
||||
|
||||
def test_fallback_lower_confidence_returns_primary(self) -> None:
|
||||
from app.engines.hybrid_engine import HybridEngine
|
||||
|
||||
primary = MagicMock(spec=OcrEngine)
|
||||
fallback = MagicMock(spec=OcrEngine)
|
||||
primary.name = "paddleocr"
|
||||
fallback.name = "google_vision"
|
||||
primary.recognize.return_value = _make_result("VIN123", 0.4, "paddleocr")
|
||||
fallback.recognize.return_value = _make_result("VIN456", 0.3, "google_vision")
|
||||
|
||||
engine = HybridEngine(primary=primary, fallback=fallback, threshold=0.6)
|
||||
result = engine.recognize(b"img", OcrConfig())
|
||||
assert result.text == "VIN123"
|
||||
|
||||
def test_fallback_engine_error_returns_primary(self) -> None:
    """An EngineUnavailableError from the fallback leaves the primary result in place."""
    from app.engines.hybrid_engine import HybridEngine

    local_engine = MagicMock(spec=OcrEngine)
    local_engine.name = "paddleocr"
    local_engine.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")

    broken_fallback = MagicMock(spec=OcrEngine)
    broken_fallback.name = "google_vision"
    # The fallback raises instead of returning a result.
    broken_fallback.recognize.side_effect = EngineUnavailableError("key missing")

    hybrid = HybridEngine(
        primary=local_engine, fallback=broken_fallback, threshold=0.6
    )
    outcome = hybrid.recognize(b"img", OcrConfig())

    assert outcome.text == "VIN123"
|
||||
|
||||
def test_fallback_unexpected_error_returns_primary(self) -> None:
    """A non-engine exception (RuntimeError) from the fallback leaves the primary result in place."""
    from app.engines.hybrid_engine import HybridEngine

    local_engine = MagicMock(spec=OcrEngine)
    local_engine.name = "paddleocr"
    local_engine.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")

    broken_fallback = MagicMock(spec=OcrEngine)
    broken_fallback.name = "google_vision"
    # A generic failure rather than the engine-specific error type.
    broken_fallback.recognize.side_effect = RuntimeError("network error")

    hybrid = HybridEngine(
        primary=local_engine, fallback=broken_fallback, threshold=0.6
    )
    outcome = hybrid.recognize(b"img", OcrConfig())

    assert outcome.text == "VIN123"
|
||||
|
||||
@patch("app.engines.hybrid_engine.time")
def test_fallback_timeout_returns_primary(self, mock_time: MagicMock) -> None:
    """A fallback call measured as too slow is discarded in favour of the primary result."""
    from app.engines.hybrid_engine import HybridEngine

    local_engine = MagicMock(spec=OcrEngine)
    local_engine.name = "paddleocr"
    local_engine.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")

    slow_fallback = MagicMock(spec=OcrEngine)
    slow_fallback.name = "google_vision"
    slow_fallback.recognize.return_value = _make_result(
        "VIN456", 0.92, "google_vision"
    )

    # Two successive clock readings 6 seconds apart, simulating a 6-second
    # delay (exceeds 5s limit).
    mock_time.monotonic.side_effect = [0.0, 6.0]

    hybrid = HybridEngine(primary=local_engine, fallback=slow_fallback, threshold=0.6)
    outcome = hybrid.recognize(b"img", OcrConfig())

    # timeout -> use primary, even though the fallback scored higher
    assert outcome.text == "VIN123"
|
||||
|
||||
def test_exact_threshold_skips_fallback(self) -> None:
    """Confidence exactly equal to the threshold counts as good enough (>= check)."""
    from app.engines.hybrid_engine import HybridEngine

    local_engine = MagicMock(spec=OcrEngine)
    local_engine.name = "paddleocr"
    # 0.6 matches the threshold passed to HybridEngine below.
    local_engine.recognize.return_value = _make_result("VIN", 0.6, "paddleocr")

    cloud_engine = MagicMock(spec=OcrEngine)
    cloud_engine.name = "cloud"

    hybrid = HybridEngine(primary=local_engine, fallback=cloud_engine, threshold=0.6)
    outcome = hybrid.recognize(b"img", OcrConfig())

    assert outcome.engine_name == "paddleocr"
    cloud_engine.recognize.assert_not_called()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Engine factory
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestEngineFactory:
    """Tests for engine construction in ``app.engines.engine_factory``.

    Apart from the first test, ``settings`` and ``_create_single_engine`` are
    patched, so only the selection and wiring logic of ``create_engine`` runs.
    """

    def test_unknown_engine_raises(self) -> None:
        """An unrecognised engine name raises EngineUnavailableError."""
        from app.engines.engine_factory import _create_single_engine

        with pytest.raises(EngineUnavailableError, match="Unknown engine"):
            _create_single_engine("nonexistent")

    # Decorators apply bottom-up: the innermost patch (_create_single_engine)
    # supplies the first mock argument, the outer one (settings) the second.
    @patch("app.engines.engine_factory.settings")
    @patch("app.engines.engine_factory._create_single_engine")
    def test_defaults_to_settings_primary(
        self, mock_create: MagicMock, mock_settings: MagicMock
    ) -> None:
        """With no explicit name, the engine named in settings is created."""
        mock_settings.ocr_primary_engine = "paddleocr"
        mock_settings.ocr_fallback_engine = "none"
        built_engine = MagicMock(spec=OcrEngine)
        mock_create.return_value = built_engine

        from app.engines.engine_factory import create_engine

        produced = create_engine()

        mock_create.assert_called_once_with("paddleocr")
        assert produced == built_engine

    @patch("app.engines.engine_factory.settings")
    @patch("app.engines.engine_factory._create_single_engine")
    def test_explicit_name_overrides_settings(
        self, mock_create: MagicMock, mock_settings: MagicMock
    ) -> None:
        """An explicitly passed engine name is the one handed to the single-engine factory."""
        mock_settings.ocr_fallback_engine = "none"
        built_engine = MagicMock(spec=OcrEngine)
        mock_create.return_value = built_engine

        from app.engines.engine_factory import create_engine

        create_engine("tesseract")

        mock_create.assert_called_once_with("tesseract")

    @patch("app.engines.engine_factory.settings")
    @patch("app.engines.engine_factory._create_single_engine")
    def test_creates_hybrid_when_fallback_configured(
        self, mock_create: MagicMock, mock_settings: MagicMock
    ) -> None:
        """A configured fallback engine makes the factory return a HybridEngine."""
        mock_settings.ocr_primary_engine = "paddleocr"
        mock_settings.ocr_fallback_engine = "google_vision"
        mock_settings.ocr_fallback_threshold = 0.7
        primary_stub = MagicMock(spec=OcrEngine)
        fallback_stub = MagicMock(spec=OcrEngine)
        # Successive factory calls yield the primary first, then the fallback.
        mock_create.side_effect = [primary_stub, fallback_stub]

        from app.engines.engine_factory import create_engine
        from app.engines.hybrid_engine import HybridEngine

        produced = create_engine()

        assert isinstance(produced, HybridEngine)

    @patch("app.engines.engine_factory.settings")
    @patch("app.engines.engine_factory._create_single_engine")
    def test_fallback_failure_returns_primary_only(
        self, mock_create: MagicMock, mock_settings: MagicMock
    ) -> None:
        """If building the fallback fails, the bare primary engine is returned."""
        mock_settings.ocr_primary_engine = "paddleocr"
        mock_settings.ocr_fallback_engine = "google_vision"
        mock_settings.ocr_fallback_threshold = 0.6
        primary_stub = MagicMock(spec=OcrEngine)
        # First call succeeds with the primary; the second call raises.
        mock_create.side_effect = [primary_stub, EngineUnavailableError("no key")]

        from app.engines.engine_factory import create_engine

        produced = create_engine()

        assert produced == primary_stub
|
||||
@@ -1,11 +1,12 @@
|
||||
"""Integration tests for VIN extraction endpoint."""
|
||||
"""Integration tests for VIN extraction endpoint and engine integration."""
|
||||
import io
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
from PIL import Image, ImageDraw
|
||||
|
||||
from app.engines.base_engine import OcrConfig, OcrEngineResult, WordBox
|
||||
from app.main import app
|
||||
|
||||
|
||||
@@ -240,3 +241,106 @@ class TestVinExtractionContentTypes:
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# VIN extractor engine integration tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestVinExtractorEngineIntegration:
    """Checks that VinExtractor drives the OCR engine abstraction as expected."""

    @staticmethod
    def _install_engine(mock_create_engine: MagicMock, ocr_result) -> MagicMock:
        """Make the patched ``create_engine`` return an engine yielding *ocr_result*."""
        stub_engine = MagicMock()
        stub_engine.recognize.return_value = ocr_result
        mock_create_engine.return_value = stub_engine
        return stub_engine

    @patch("app.extractors.vin_extractor.create_engine")
    def test_perform_ocr_calls_engine_with_vin_config(
        self, mock_create_engine: MagicMock
    ) -> None:
        """_perform_ocr hands the engine a config with the VIN whitelist and angle_cls on."""
        from app.extractors.vin_extractor import VinExtractor

        stub_engine = self._install_engine(
            mock_create_engine,
            OcrEngineResult(
                text="1HGBH41JXMN109186",
                confidence=0.94,
                word_boxes=[WordBox(text="1HGBH41JXMN109186", confidence=0.94)],
                engine_name="paddleocr",
            ),
        )

        vin_extractor = VinExtractor()
        text, confidences = vin_extractor._perform_ocr(b"fake_image")

        stub_engine.recognize.assert_called_once()
        # The config is the second positional argument of recognize().
        positional_args = stub_engine.recognize.call_args[0]
        call_config = positional_args[1]
        assert isinstance(call_config, OcrConfig)
        assert call_config.char_whitelist == VinExtractor.VIN_WHITELIST
        assert call_config.use_angle_cls is True
        assert call_config.single_line is False
        assert call_config.single_word is False
        assert text == "1HGBH41JXMN109186"
        assert confidences == [0.94]

    @patch("app.extractors.vin_extractor.create_engine")
    def test_perform_ocr_single_line_mode(
        self, mock_create_engine: MagicMock
    ) -> None:
        """single_line=True is forwarded into the engine config; single_word stays off."""
        from app.extractors.vin_extractor import VinExtractor

        stub_engine = self._install_engine(
            mock_create_engine,
            OcrEngineResult(
                text="VIN123", confidence=0.9, word_boxes=[], engine_name="paddleocr"
            ),
        )

        vin_extractor = VinExtractor()
        vin_extractor._perform_ocr(b"img", single_line=True)

        positional_args = stub_engine.recognize.call_args[0]
        call_config = positional_args[1]
        assert call_config.single_line is True
        assert call_config.single_word is False

    @patch("app.extractors.vin_extractor.create_engine")
    def test_perform_ocr_single_word_mode(
        self, mock_create_engine: MagicMock
    ) -> None:
        """single_word=True is forwarded into the engine config; single_line stays off."""
        from app.extractors.vin_extractor import VinExtractor

        stub_engine = self._install_engine(
            mock_create_engine,
            OcrEngineResult(
                text="VIN123", confidence=0.9, word_boxes=[], engine_name="paddleocr"
            ),
        )

        vin_extractor = VinExtractor()
        vin_extractor._perform_ocr(b"img", single_word=True)

        positional_args = stub_engine.recognize.call_args[0]
        call_config = positional_args[1]
        assert call_config.single_word is True
        assert call_config.single_line is False

    def test_calculate_base_confidence_empty_returns_default(self) -> None:
        """An empty confidence list yields the 0.5 default."""
        from app.extractors.vin_extractor import VinExtractor

        # __new__ skips __init__, so no engine is constructed for this pure calculation.
        bare_extractor = VinExtractor.__new__(VinExtractor)
        assert bare_extractor._calculate_base_confidence([]) == 0.5

    def test_calculate_base_confidence_weighted_blend(self) -> None:
        """The score blends 70% of the average with 30% of the minimum."""
        from app.extractors.vin_extractor import VinExtractor

        bare_extractor = VinExtractor.__new__(VinExtractor)
        # average = (0.9 + 0.8) / 2 = 0.85; minimum = 0.8
        # expected = 0.7 * 0.85 + 0.3 * 0.8 = 0.595 + 0.24 = 0.835
        blended = bare_extractor._calculate_base_confidence([0.9, 0.8])
        assert abs(blended - 0.835) < 0.001

    def test_calculate_base_confidence_single_value(self) -> None:
        """With one value the average equals the minimum, so the value comes back unchanged."""
        from app.extractors.vin_extractor import VinExtractor

        bare_extractor = VinExtractor.__new__(VinExtractor)
        blended = bare_extractor._calculate_base_confidence([0.92])
        assert abs(blended - 0.92) < 0.001
|
||||
|
||||
Reference in New Issue
Block a user