Merge pull request 'feat: Improve OCR process - replace Tesseract with PaddleOCR (#115)' (#122) from issue-115-improve-ocr-paddleocr into main
All checks were successful
Deploy to Staging / Build Images (push) Successful in 36s
Deploy to Staging / Deploy to Staging (push) Successful in 51s
Deploy to Staging / Verify Staging (push) Successful in 9s
Deploy to Staging / Notify Staging Ready (push) Successful in 7s
Deploy to Staging / Notify Staging Failure (push) Has been skipped
Mirror Base Images / Mirror Base Images (push) Successful in 51s
All checks were successful
Deploy to Staging / Build Images (push) Successful in 36s
Deploy to Staging / Deploy to Staging (push) Successful in 51s
Deploy to Staging / Verify Staging (push) Successful in 9s
Deploy to Staging / Notify Staging Ready (push) Successful in 7s
Deploy to Staging / Notify Staging Failure (push) Has been skipped
Mirror Base Images / Mirror Base Images (push) Successful in 51s
Reviewed-on: #122
This commit was merged in pull request #122.
This commit is contained in:
@@ -108,7 +108,7 @@
|
|||||||
},
|
},
|
||||||
"mvp-ocr": {
|
"mvp-ocr": {
|
||||||
"type": "ocr_service",
|
"type": "ocr_service",
|
||||||
"description": "Python-based OCR for document text extraction",
|
"description": "Python OCR service with pluggable engine abstraction (PaddleOCR PP-OCRv4 primary, optional Google Vision cloud fallback, Tesseract backward compat)",
|
||||||
"port": 8000
|
"port": 8000
|
||||||
},
|
},
|
||||||
"mvp-loki": {
|
"mvp-loki": {
|
||||||
|
|||||||
@@ -45,7 +45,7 @@
|
|||||||
"parent_issue": "The original feature issue. Tracks overall status. Only the parent gets status label transitions.",
|
"parent_issue": "The original feature issue. Tracks overall status. Only the parent gets status label transitions.",
|
||||||
"sub_issue_title_format": "{type}: {summary} (#{parent_index})",
|
"sub_issue_title_format": "{type}: {summary} (#{parent_index})",
|
||||||
"sub_issue_body": "First line must be 'Relates to #{parent_index}'. Each sub-issue is a self-contained unit of work.",
|
"sub_issue_body": "First line must be 'Relates to #{parent_index}'. Each sub-issue is a self-contained unit of work.",
|
||||||
"sub_issue_labels": "status/backlog + same type/* as parent. Sub-issues stay in backlog; parent issue tracks status.",
|
"sub_issue_labels": "status/in-progress + same type/* as parent. Sub-issues move to in-progress as they are worked on.",
|
||||||
"sub_issue_milestone": "Same sprint milestone as parent.",
|
"sub_issue_milestone": "Same sprint milestone as parent.",
|
||||||
"rules": [
|
"rules": [
|
||||||
"ONE branch for the parent issue. Never create branches per sub-issue.",
|
"ONE branch for the parent issue. Never create branches per sub-issue.",
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import type { JobResponse, OcrResponse, VinExtractionResponse } from '../domain/
|
|||||||
|
|
||||||
/** OCR service configuration */
|
/** OCR service configuration */
|
||||||
const OCR_SERVICE_URL = process.env.OCR_SERVICE_URL || 'http://mvp-ocr:8000';
|
const OCR_SERVICE_URL = process.env.OCR_SERVICE_URL || 'http://mvp-ocr:8000';
|
||||||
const OCR_TIMEOUT_MS = 30000; // 30 seconds for sync operations
|
const OCR_TIMEOUT_MS = 120000; // 120 seconds for sync operations (PaddleOCR model loading on first call)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* HTTP client for communicating with the OCR service.
|
* HTTP client for communicating with the OCR service.
|
||||||
|
|||||||
@@ -38,13 +38,17 @@ services:
|
|||||||
STRIPE_ENTERPRISE_MONTHLY_PRICE_ID: prod_Toj8xGEui9jl6j
|
STRIPE_ENTERPRISE_MONTHLY_PRICE_ID: prod_Toj8xGEui9jl6j
|
||||||
STRIPE_ENTERPRISE_YEARLY_PRICE_ID: prod_Toj9A7A773xrdn
|
STRIPE_ENTERPRISE_YEARLY_PRICE_ID: prod_Toj9A7A773xrdn
|
||||||
|
|
||||||
# OCR - Production log level
|
# OCR - Production log level + engine config
|
||||||
mvp-ocr:
|
mvp-ocr:
|
||||||
environment:
|
environment:
|
||||||
LOG_LEVEL: error
|
LOG_LEVEL: error
|
||||||
REDIS_HOST: mvp-redis
|
REDIS_HOST: mvp-redis
|
||||||
REDIS_PORT: 6379
|
REDIS_PORT: 6379
|
||||||
REDIS_DB: 1
|
REDIS_DB: 1
|
||||||
|
OCR_PRIMARY_ENGINE: paddleocr
|
||||||
|
OCR_FALLBACK_ENGINE: ${OCR_FALLBACK_ENGINE:-none}
|
||||||
|
OCR_FALLBACK_THRESHOLD: ${OCR_FALLBACK_THRESHOLD:-0.6}
|
||||||
|
GOOGLE_VISION_KEY_PATH: /run/secrets/google-vision-key.json
|
||||||
|
|
||||||
# PostgreSQL - Remove dev ports, production log level
|
# PostgreSQL - Remove dev ports, production log level
|
||||||
mvp-postgres:
|
mvp-postgres:
|
||||||
|
|||||||
@@ -63,6 +63,15 @@ services:
|
|||||||
mvp-ocr:
|
mvp-ocr:
|
||||||
image: ${OCR_IMAGE:-git.motovaultpro.com/egullickson/ocr:latest}
|
image: ${OCR_IMAGE:-git.motovaultpro.com/egullickson/ocr:latest}
|
||||||
container_name: mvp-ocr-staging
|
container_name: mvp-ocr-staging
|
||||||
|
environment:
|
||||||
|
LOG_LEVEL: debug
|
||||||
|
REDIS_HOST: mvp-redis
|
||||||
|
REDIS_PORT: 6379
|
||||||
|
REDIS_DB: 1
|
||||||
|
OCR_PRIMARY_ENGINE: paddleocr
|
||||||
|
OCR_FALLBACK_ENGINE: ${OCR_FALLBACK_ENGINE:-none}
|
||||||
|
OCR_FALLBACK_THRESHOLD: ${OCR_FALLBACK_THRESHOLD:-0.6}
|
||||||
|
GOOGLE_VISION_KEY_PATH: /run/secrets/google-vision-key.json
|
||||||
|
|
||||||
# ========================================
|
# ========================================
|
||||||
# PostgreSQL (Staging - Separate Database)
|
# PostgreSQL (Staging - Separate Database)
|
||||||
|
|||||||
@@ -193,8 +193,16 @@ services:
|
|||||||
REDIS_HOST: mvp-redis
|
REDIS_HOST: mvp-redis
|
||||||
REDIS_PORT: 6379
|
REDIS_PORT: 6379
|
||||||
REDIS_DB: 1
|
REDIS_DB: 1
|
||||||
|
# OCR engine configuration (PaddleOCR primary, cloud fallback optional)
|
||||||
|
OCR_PRIMARY_ENGINE: paddleocr
|
||||||
|
OCR_FALLBACK_ENGINE: ${OCR_FALLBACK_ENGINE:-none}
|
||||||
|
OCR_FALLBACK_THRESHOLD: ${OCR_FALLBACK_THRESHOLD:-0.6}
|
||||||
|
GOOGLE_VISION_KEY_PATH: /run/secrets/google-vision-key.json
|
||||||
volumes:
|
volumes:
|
||||||
- /tmp/vin-debug:/tmp/vin-debug
|
- /tmp/vin-debug:/tmp/vin-debug
|
||||||
|
# Optional: Uncomment to enable Google Vision cloud fallback.
|
||||||
|
# Requires: secrets/app/google-vision-key.json and OCR_FALLBACK_ENGINE=google_vision
|
||||||
|
# - ./secrets/app/google-vision-key.json:/run/secrets/google-vision-key.json:ro
|
||||||
networks:
|
networks:
|
||||||
- backend
|
- backend
|
||||||
- database
|
- database
|
||||||
|
|||||||
@@ -18,5 +18,5 @@
|
|||||||
| `AUDIT.md` | Audit documentation | Security audits, compliance |
|
| `AUDIT.md` | Audit documentation | Security audits, compliance |
|
||||||
| `MVP-COLOR-SCHEME.md` | Color scheme reference | UI styling decisions |
|
| `MVP-COLOR-SCHEME.md` | Color scheme reference | UI styling decisions |
|
||||||
| `LOGGING.md` | Unified logging system | Log levels, correlation IDs, Grafana |
|
| `LOGGING.md` | Unified logging system | Log levels, correlation IDs, Grafana |
|
||||||
| `ocr-pipeline-tech-stack.md` | OCR pipeline technology decisions | OCR architecture, Tesseract setup |
|
| `ocr-pipeline-tech-stack.md` | OCR pipeline technology decisions | OCR architecture, PaddleOCR engine abstraction |
|
||||||
| `TIER-GATING.md` | Subscription tier gating rules | Feature access by tier, vehicle limits |
|
| `TIER-GATING.md` | Subscription tier gating rules | Feature access by tier, vehicle limits |
|
||||||
|
|||||||
@@ -118,35 +118,48 @@
|
|||||||
│ ├─────────────────────────────────────────────────────────┤
|
│ ├─────────────────────────────────────────────────────────┤
|
||||||
│ │ │
|
│ │ │
|
||||||
│ │ ┌─────────────────────────────────────────────────┐ │
|
│ │ ┌─────────────────────────────────────────────────┐ │
|
||||||
│ │ │ 5a. Primary OCR: Tesseract 5.x │ │
|
│ │ │ 5a. Engine Abstraction Layer │ │
|
||||||
│ │ │ │ │
|
│ │ │ │ │
|
||||||
│ │ │ • Engine: LSTM (--oem 1) │ │
|
│ │ │ OcrEngine ABC -> PaddleOcrEngine (primary) │ │
|
||||||
│ │ │ • Page segmentation: Auto (--psm 3) │ │
|
│ │ │ -> CloudEngine (optional fallback) │ │
|
||||||
│ │ │ • Output: hOCR with word confidence │ │
|
│ │ │ -> TesseractEngine (backward compat)│ │
|
||||||
|
│ │ │ -> HybridEngine (primary+fallback) │ │
|
||||||
|
│ │ └─────────────────────────────────────────────────┘ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ ▼ │
|
||||||
|
│ │ ┌─────────────────────────────────────────────────┐ │
|
||||||
|
│ │ │ 5b. Primary OCR: PaddleOCR PP-OCRv4 │ │
|
||||||
|
│ │ │ │ │
|
||||||
|
│ │ │ • Scene text detection + angle classification │ │
|
||||||
|
│ │ │ • CPU-only, models baked into Docker image │ │
|
||||||
|
│ │ │ • Normalized output: text, confidence, word boxes│ │
|
||||||
│ │ └─────────────────────────────────────────────────┘ │
|
│ │ └─────────────────────────────────────────────────┘ │
|
||||||
│ │ │ │
|
│ │ │ │
|
||||||
│ │ ▼ │
|
│ │ ▼ │
|
||||||
│ │ ┌───────────────┐ │
|
│ │ ┌───────────────┐ │
|
||||||
│ │ │ Confidence │ │
|
│ │ │ Confidence │ │
|
||||||
│ │ │ > 80% ? │ │
|
│ │ │ >= 60% ? │ │
|
||||||
│ │ └───────────────┘ │
|
│ │ └───────────────┘ │
|
||||||
│ │ │ │ │
|
│ │ │ │ │
|
||||||
│ │ YES ──┘ └── NO │
|
│ │ YES ──┘ └── NO (and cloud enabled) │
|
||||||
│ │ │ │ │
|
│ │ │ │ │
|
||||||
│ │ │ ▼ │
|
│ │ │ ▼ │
|
||||||
│ │ │ ┌─────────────────────────────────┐ │
|
│ │ │ ┌─────────────────────────────────┐ │
|
||||||
│ │ │ │ 5b. Fallback: PaddleOCR │ │
|
│ │ │ │ 5c. Optional Cloud Fallback │ │
|
||||||
|
│ │ │ │ (Google Vision API) │ │
|
||||||
│ │ │ │ │ │
|
│ │ │ │ │ │
|
||||||
│ │ │ │ • Better for degraded images │ │
|
│ │ │ │ • Disabled by default │ │
|
||||||
│ │ │ │ • Better table detection │ │
|
│ │ │ │ • 5-second timeout guard │ │
|
||||||
│ │ │ │ • Slower but more accurate │ │
|
│ │ │ │ • Returns higher-confidence │ │
|
||||||
|
│ │ │ │ result of primary vs fallback │ │
|
||||||
│ │ │ └─────────────────────────────────┘ │
|
│ │ │ └─────────────────────────────────┘ │
|
||||||
│ │ │ │ │
|
│ │ │ │ │
|
||||||
│ │ ▼ ▼ │
|
│ │ ▼ ▼ │
|
||||||
│ │ ┌─────────────────────────────────┐ │
|
│ │ ┌─────────────────────────────────┐ │
|
||||||
│ │ │ 5c. Result Merging │ │
|
│ │ │ 5d. HybridEngine Result │ │
|
||||||
│ │ │ • Merge by bounding box │ │
|
│ │ │ • Compare confidences │ │
|
||||||
│ │ │ • Keep highest confidence │ │
|
│ │ │ • Keep highest confidence │ │
|
||||||
|
│ │ │ • Graceful fallback on error │ │
|
||||||
│ │ └─────────────────────────────────┘ │
|
│ │ └─────────────────────────────────┘ │
|
||||||
│ │ │
|
│ │ │
|
||||||
│ └─────────────────────────────────────────────────────────┘
|
│ └─────────────────────────────────────────────────────────┘
|
||||||
@@ -257,10 +270,10 @@
|
|||||||
|
|
||||||
| Component | Tool | Purpose |
|
| Component | Tool | Purpose |
|
||||||
|------------------------|-----------------------|--------------------------------------|
|
|------------------------|-----------------------|--------------------------------------|
|
||||||
| **Primary OCR** | Tesseract 5.x | Fast, reliable text extraction |
|
| **Primary OCR** | PaddleOCR PP-OCRv4 | Highest accuracy scene text, CPU-only |
|
||||||
| **Python Binding** | pytesseract | Tesseract Python wrapper |
|
| **Cloud Fallback** | Google Vision API | Optional cloud fallback (disabled by default) |
|
||||||
| **Fallback OCR** | PaddleOCR | Higher accuracy, better tables |
|
| **Backward Compat** | Tesseract 5.x / pytesseract | Legacy engine, configurable via env var |
|
||||||
| **Layout Analysis** | PaddleOCR / LayoutParser | Document structure detection |
|
| **Engine Abstraction** | `OcrEngine` ABC | Pluggable engine interface in `ocr/app/engines/` |
|
||||||
|
|
||||||
### Data Extraction
|
### Data Extraction
|
||||||
|
|
||||||
@@ -291,85 +304,93 @@
|
|||||||
fastapi>=0.100.0
|
fastapi>=0.100.0
|
||||||
uvicorn[standard]>=0.23.0
|
uvicorn[standard]>=0.23.0
|
||||||
python-multipart>=0.0.6
|
python-multipart>=0.0.6
|
||||||
|
pydantic>=2.0.0
|
||||||
# Task Queue
|
|
||||||
celery>=5.3.0
|
|
||||||
redis>=4.6.0
|
|
||||||
|
|
||||||
# File Detection & Handling
|
# File Detection & Handling
|
||||||
python-magic>=0.4.27
|
python-magic>=0.4.27
|
||||||
pillow>=10.0.0
|
pillow>=10.0.0
|
||||||
pillow-heif>=0.13.0
|
pillow-heif>=0.13.0
|
||||||
|
|
||||||
# PDF Processing
|
|
||||||
pymupdf>=1.23.0
|
|
||||||
|
|
||||||
# Image Preprocessing
|
# Image Preprocessing
|
||||||
opencv-python-headless>=4.8.0
|
opencv-python-headless>=4.8.0
|
||||||
deskew>=1.4.0
|
|
||||||
scikit-image>=0.21.0
|
|
||||||
numpy>=1.24.0
|
numpy>=1.24.0
|
||||||
|
|
||||||
# OCR Engines
|
# OCR Engines
|
||||||
pytesseract>=0.3.10
|
pytesseract>=0.3.10
|
||||||
paddlepaddle>=2.5.0
|
paddlepaddle>=2.6.0
|
||||||
paddleocr>=2.7.0
|
paddleocr>=2.8.0
|
||||||
|
google-cloud-vision>=3.7.0
|
||||||
|
|
||||||
# Table Extraction
|
# PDF Processing
|
||||||
img2table>=1.2.0
|
PyMuPDF>=1.23.0
|
||||||
camelot-py[cv]>=0.11.0
|
|
||||||
|
|
||||||
# NLP & Data
|
# Redis for job queue
|
||||||
spacy>=3.6.0
|
redis>=5.0.0
|
||||||
pandas>=2.0.0
|
|
||||||
|
|
||||||
# Storage & Database
|
# HTTP client for callbacks
|
||||||
boto3>=1.28.0
|
httpx>=0.24.0
|
||||||
psycopg2-binary>=2.9.0
|
|
||||||
sqlalchemy>=2.0.0
|
# Testing
|
||||||
|
pytest>=7.4.0
|
||||||
|
pytest-asyncio>=0.21.0
|
||||||
```
|
```
|
||||||
|
|
||||||
### System Package Requirements (Ubuntu/Debian)
|
### System Package Requirements (Ubuntu/Debian)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Tesseract OCR
|
# Tesseract OCR (backward compatibility engine)
|
||||||
apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev
|
apt-get install tesseract-ocr tesseract-ocr-eng
|
||||||
|
|
||||||
|
# PaddlePaddle OpenMP runtime
|
||||||
|
apt-get install libgomp1
|
||||||
|
|
||||||
# HEIC Support
|
# HEIC Support
|
||||||
apt-get install libheif-examples libheif-dev
|
apt-get install libheif1 libheif-dev
|
||||||
|
|
||||||
# OpenCV dependencies
|
# GLib (OpenCV dependency)
|
||||||
apt-get install libgl1-mesa-glx libglib2.0-0
|
apt-get install libglib2.0-0
|
||||||
|
|
||||||
# PDF rendering dependencies
|
# File type detection
|
||||||
apt-get install libmupdf-dev mupdf-tools
|
apt-get install libmagic1
|
||||||
|
|
||||||
# Image processing
|
|
||||||
apt-get install libmagic1 ghostscript
|
|
||||||
|
|
||||||
# Camelot dependencies
|
|
||||||
apt-get install ghostscript python3-tk
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Environment Variables
|
||||||
|
|
||||||
|
| Variable | Default | Description |
|
||||||
|
|----------|---------|-------------|
|
||||||
|
| `OCR_PRIMARY_ENGINE` | `paddleocr` | Primary OCR engine (`paddleocr`, `tesseract`) |
|
||||||
|
| `OCR_CONFIDENCE_THRESHOLD` | `0.6` | Minimum confidence threshold |
|
||||||
|
| `OCR_FALLBACK_ENGINE` | `none` | Fallback engine (`google_vision`, `none`) |
|
||||||
|
| `OCR_FALLBACK_THRESHOLD` | `0.6` | Confidence below this triggers fallback |
|
||||||
|
| `GOOGLE_VISION_KEY_PATH` | `/run/secrets/google-vision-key.json` | Path to Google Vision service account key |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## DOCKERFILE
|
## DOCKERFILE
|
||||||
|
|
||||||
```dockerfile
|
```dockerfile
|
||||||
FROM python:3.11-slim
|
# Primary engine: PaddleOCR PP-OCRv4 (models baked into image)
|
||||||
|
# Backward compat: Tesseract 5.x (optional, via TesseractEngine)
|
||||||
|
# Cloud fallback: Google Vision (optional, requires API key at runtime)
|
||||||
|
|
||||||
|
FROM python:3.13-slim
|
||||||
|
|
||||||
# System dependencies
|
# System dependencies
|
||||||
|
# - tesseract-ocr/eng: Backward-compatible OCR engine
|
||||||
|
# - libgomp1: OpenMP runtime required by PaddlePaddle
|
||||||
|
# - libheif1/libheif-dev: HEIF image support (iPhone photos)
|
||||||
|
# - libglib2.0-0: GLib shared library (OpenCV dependency)
|
||||||
|
# - libmagic1: File type detection
|
||||||
|
# - curl: Health check endpoint
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
tesseract-ocr \
|
tesseract-ocr \
|
||||||
tesseract-ocr-eng \
|
tesseract-ocr-eng \
|
||||||
libtesseract-dev \
|
libgomp1 \
|
||||||
libheif-examples \
|
libheif1 \
|
||||||
libheif-dev \
|
libheif-dev \
|
||||||
libgl1-mesa-glx \
|
|
||||||
libglib2.0-0 \
|
libglib2.0-0 \
|
||||||
libmagic1 \
|
libmagic1 \
|
||||||
ghostscript \
|
curl \
|
||||||
poppler-utils \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Python dependencies
|
# Python dependencies
|
||||||
@@ -377,11 +398,9 @@ WORKDIR /app
|
|||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
# Download spaCy model
|
# Pre-download PaddleOCR PP-OCRv4 models during build (not at runtime)
|
||||||
RUN python -m spacy download en_core_web_sm
|
RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=False)" \
|
||||||
|
&& echo "PaddleOCR PP-OCRv4 models downloaded and verified"
|
||||||
# Download PaddleOCR models (cached in image)
|
|
||||||
RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en')"
|
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
@@ -49,7 +49,7 @@ async function extractVinFromImage(file: File): Promise<VinOcrResult> {
|
|||||||
|
|
||||||
const response = await apiClient.post('/ocr/extract/vin', formData, {
|
const response = await apiClient.post('/ocr/extract/vin', formData, {
|
||||||
headers: { 'Content-Type': 'multipart/form-data' },
|
headers: { 'Content-Type': 'multipart/form-data' },
|
||||||
timeout: 30000, // 30 seconds for OCR processing
|
timeout: 120000, // 120 seconds for OCR processing
|
||||||
});
|
});
|
||||||
|
|
||||||
const data = response.data;
|
const data = response.data;
|
||||||
|
|||||||
@@ -245,7 +245,7 @@ export const CameraCapture: React.FC<CameraCaptureProps> = ({
|
|||||||
return (
|
return (
|
||||||
<CropTool
|
<CropTool
|
||||||
imageSrc={capturedImageSrc}
|
imageSrc={capturedImageSrc}
|
||||||
lockAspectRatio={guidanceType !== 'none'}
|
lockAspectRatio={guidanceType !== 'none' && guidanceType !== 'vin'}
|
||||||
aspectRatio={cropAspectRatio}
|
aspectRatio={cropAspectRatio}
|
||||||
onConfirm={handleCropConfirm}
|
onConfirm={handleCropConfirm}
|
||||||
onReset={handleCropReset}
|
onReset={handleCropReset}
|
||||||
|
|||||||
@@ -95,10 +95,6 @@ export function useImageCrop(options: UseImageCropOptions = {}): UseImageCropRet
|
|||||||
const drawOriginRef = useRef({ x: 0, y: 0 });
|
const drawOriginRef = useRef({ x: 0, y: 0 });
|
||||||
const cropAreaRef = useRef(cropArea);
|
const cropAreaRef = useRef(cropArea);
|
||||||
|
|
||||||
useEffect(() => {
|
|
||||||
cropAreaRef.current = cropArea;
|
|
||||||
}, [cropArea]);
|
|
||||||
|
|
||||||
const setCropArea = useCallback(
|
const setCropArea = useCallback(
|
||||||
(area: CropArea) => {
|
(area: CropArea) => {
|
||||||
setCropAreaState(getAspectRatioAdjustedCrop(area));
|
setCropAreaState(getAspectRatioAdjustedCrop(area));
|
||||||
@@ -177,7 +173,9 @@ export function useImageCrop(options: UseImageCropOptions = {}): UseImageCropRet
|
|||||||
startPosRef.current = { x: clientX, y: clientY };
|
startPosRef.current = { x: clientX, y: clientY };
|
||||||
drawOriginRef.current = { x, y };
|
drawOriginRef.current = { x, y };
|
||||||
|
|
||||||
setCropAreaState({ x, y, width: 0, height: 0 });
|
const initial = { x, y, width: 0, height: 0 };
|
||||||
|
setCropAreaState(initial);
|
||||||
|
cropAreaRef.current = initial;
|
||||||
|
|
||||||
isDrawingRef.current = true;
|
isDrawingRef.current = true;
|
||||||
activeHandleRef.current = null;
|
activeHandleRef.current = null;
|
||||||
@@ -203,18 +201,24 @@ export function useImageCrop(options: UseImageCropOptions = {}): UseImageCropRet
|
|||||||
const originX = drawOriginRef.current.x;
|
const originX = drawOriginRef.current.x;
|
||||||
const originY = drawOriginRef.current.y;
|
const originY = drawOriginRef.current.y;
|
||||||
|
|
||||||
let newCrop: CropArea = {
|
const drawnWidth = Math.abs(currentX - originX);
|
||||||
|
const drawnHeight = aspectRatio
|
||||||
|
? drawnWidth / aspectRatio
|
||||||
|
: Math.abs(currentY - originY);
|
||||||
|
let drawnY = Math.min(originY, currentY);
|
||||||
|
// Clamp so crop doesn't exceed container bounds when aspect ratio forces height
|
||||||
|
if (aspectRatio && drawnY + drawnHeight > 100) {
|
||||||
|
drawnY = Math.max(0, 100 - drawnHeight);
|
||||||
|
}
|
||||||
|
const newCrop: CropArea = {
|
||||||
x: Math.min(originX, currentX),
|
x: Math.min(originX, currentX),
|
||||||
y: Math.min(originY, currentY),
|
y: drawnY,
|
||||||
width: Math.abs(currentX - originX),
|
width: drawnWidth,
|
||||||
height: Math.abs(currentY - originY),
|
height: drawnHeight,
|
||||||
};
|
};
|
||||||
|
|
||||||
if (aspectRatio) {
|
|
||||||
newCrop.height = newCrop.width / aspectRatio;
|
|
||||||
}
|
|
||||||
|
|
||||||
setCropAreaState(newCrop);
|
setCropAreaState(newCrop);
|
||||||
|
cropAreaRef.current = newCrop;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -303,7 +307,9 @@ export function useImageCrop(options: UseImageCropOptions = {}): UseImageCropRet
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
setCropAreaState(constrainCrop(newCrop));
|
const constrained = constrainCrop(newCrop);
|
||||||
|
setCropAreaState(constrained);
|
||||||
|
cropAreaRef.current = constrained;
|
||||||
},
|
},
|
||||||
[isDragging, constrainCrop, aspectRatio]
|
[isDragging, constrainCrop, aspectRatio]
|
||||||
);
|
);
|
||||||
@@ -312,7 +318,9 @@ export function useImageCrop(options: UseImageCropOptions = {}): UseImageCropRet
|
|||||||
if (isDrawingRef.current) {
|
if (isDrawingRef.current) {
|
||||||
isDrawingRef.current = false;
|
isDrawingRef.current = false;
|
||||||
const area = cropAreaRef.current;
|
const area = cropAreaRef.current;
|
||||||
if (area.width >= minSize && area.height >= minSize) {
|
// Accept crop if at least one dimension is meaningful (allows thin strips like VINs)
|
||||||
|
const meetsMinSize = area.width >= minSize || area.height >= minSize;
|
||||||
|
if (meetsMinSize) {
|
||||||
setCropDrawn(true);
|
setCropDrawn(true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
# ocr/
|
# ocr/
|
||||||
|
|
||||||
|
Python OCR microservice. Primary engine: PaddleOCR PP-OCRv4 with optional Google Vision cloud fallback. Pluggable engine abstraction in `app/engines/`.
|
||||||
|
|
||||||
## Files
|
## Files
|
||||||
|
|
||||||
| File | What | When to read |
|
| File | What | When to read |
|
||||||
| ---- | ---- | ------------ |
|
| ---- | ---- | ------------ |
|
||||||
| `Dockerfile` | Container build definition | Docker builds, deployment |
|
| `Dockerfile` | Container build (PaddleOCR models baked in) | Docker builds, deployment |
|
||||||
| `requirements.txt` | Python dependencies | Adding dependencies |
|
| `requirements.txt` | Python dependencies | Adding dependencies |
|
||||||
|
|
||||||
## Subdirectories
|
## Subdirectories
|
||||||
@@ -12,4 +14,5 @@
|
|||||||
| Directory | What | When to read |
|
| Directory | What | When to read |
|
||||||
| --------- | ---- | ------------ |
|
| --------- | ---- | ------------ |
|
||||||
| `app/` | FastAPI application source | OCR endpoint development |
|
| `app/` | FastAPI application source | OCR endpoint development |
|
||||||
|
| `app/engines/` | Engine abstraction layer (OcrEngine ABC, factory, hybrid) | Adding or changing OCR engines |
|
||||||
| `tests/` | Test suite | Adding or modifying tests |
|
| `tests/` | Test suite | Adding or modifying tests |
|
||||||
|
|||||||
@@ -1,5 +1,8 @@
|
|||||||
# Production Dockerfile for MotoVaultPro OCR Service
|
# Production Dockerfile for MotoVaultPro OCR Service
|
||||||
# Uses mirrored base images from Gitea Package Registry
|
# Uses mirrored base images from Gitea Package Registry
|
||||||
|
#
|
||||||
|
# Primary engine: PaddleOCR PP-OCRv4 (models baked into image)
|
||||||
|
# Cloud fallback: Google Vision (optional, requires API key at runtime)
|
||||||
|
|
||||||
# Build argument for registry (defaults to Gitea mirrors, falls back to Docker Hub)
|
# Build argument for registry (defaults to Gitea mirrors, falls back to Docker Hub)
|
||||||
ARG REGISTRY_MIRRORS=git.motovaultpro.com/egullickson/mirrors
|
ARG REGISTRY_MIRRORS=git.motovaultpro.com/egullickson/mirrors
|
||||||
@@ -7,10 +10,13 @@ ARG REGISTRY_MIRRORS=git.motovaultpro.com/egullickson/mirrors
|
|||||||
FROM ${REGISTRY_MIRRORS}/python:3.13-slim
|
FROM ${REGISTRY_MIRRORS}/python:3.13-slim
|
||||||
|
|
||||||
# System dependencies
|
# System dependencies
|
||||||
|
# - libgomp1: OpenMP runtime required by PaddlePaddle
|
||||||
|
# - libheif1/libheif-dev: HEIF image support (iPhone photos)
|
||||||
|
# - libglib2.0-0: GLib shared library (OpenCV dependency)
|
||||||
|
# - libmagic1: File type detection
|
||||||
|
# - curl: Health check endpoint
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
tesseract-ocr \
|
libgomp1 \
|
||||||
tesseract-ocr-eng \
|
|
||||||
libtesseract-dev \
|
|
||||||
libheif1 \
|
libheif1 \
|
||||||
libheif-dev \
|
libheif-dev \
|
||||||
libglib2.0-0 \
|
libglib2.0-0 \
|
||||||
@@ -21,7 +27,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||||||
# Python dependencies
|
# Python dependencies
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
# Install dependencies. PaddleX (transitive via paddleocr) pulls in the full
|
||||||
|
# opencv-python which requires libGL.so.1. Force-reinstall the headless
|
||||||
|
# variant afterwards so the container stays GUI-free.
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt \
|
||||||
|
&& pip install --no-cache-dir --force-reinstall opencv-python-headless
|
||||||
|
|
||||||
|
# Pre-download PaddleOCR PP-OCRv4 models during build (not at runtime).
|
||||||
|
# Models are baked into the image so container starts are fast and
|
||||||
|
# no network access is needed at runtime for model download.
|
||||||
|
ENV PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK=True
|
||||||
|
RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(ocr_version='PP-OCRv4', use_textline_orientation=True, lang='en', device='cpu', enable_mkldnn=False)" \
|
||||||
|
&& echo "PaddleOCR PP-OCRv4 models downloaded and verified"
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
|
|||||||
@@ -12,6 +12,7 @@
|
|||||||
|
|
||||||
| Directory | What | When to read |
|
| Directory | What | When to read |
|
||||||
| --------- | ---- | ------------ |
|
| --------- | ---- | ------------ |
|
||||||
|
| `engines/` | OCR engine abstraction (PaddleOCR primary, Google Vision fallback) | Engine changes, adding new engines |
|
||||||
| `extractors/` | Data extraction logic | Adding new extraction types |
|
| `extractors/` | Data extraction logic | Adding new extraction types |
|
||||||
| `models/` | Data models and schemas | Request/response types |
|
| `models/` | Data models and schemas | Request/response types |
|
||||||
| `patterns/` | Regex and parsing patterns | Pattern matching rules |
|
| `patterns/` | Regex and parsing patterns | Pattern matching rules |
|
||||||
|
|||||||
@@ -9,7 +9,20 @@ class Settings:
|
|||||||
self.log_level: str = os.getenv("LOG_LEVEL", "info")
|
self.log_level: str = os.getenv("LOG_LEVEL", "info")
|
||||||
self.host: str = os.getenv("HOST", "0.0.0.0")
|
self.host: str = os.getenv("HOST", "0.0.0.0")
|
||||||
self.port: int = int(os.getenv("PORT", "8000"))
|
self.port: int = int(os.getenv("PORT", "8000"))
|
||||||
self.tesseract_cmd: str = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
|
# OCR engine configuration
|
||||||
|
self.ocr_primary_engine: str = os.getenv("OCR_PRIMARY_ENGINE", "paddleocr")
|
||||||
|
self.ocr_confidence_threshold: float = float(
|
||||||
|
os.getenv("OCR_CONFIDENCE_THRESHOLD", "0.6")
|
||||||
|
)
|
||||||
|
|
||||||
|
# Cloud fallback configuration (disabled by default)
|
||||||
|
self.ocr_fallback_engine: str = os.getenv("OCR_FALLBACK_ENGINE", "none")
|
||||||
|
self.ocr_fallback_threshold: float = float(
|
||||||
|
os.getenv("OCR_FALLBACK_THRESHOLD", "0.6")
|
||||||
|
)
|
||||||
|
self.google_vision_key_path: str = os.getenv(
|
||||||
|
"GOOGLE_VISION_KEY_PATH", "/run/secrets/google-vision-key.json"
|
||||||
|
)
|
||||||
|
|
||||||
# Redis configuration for job queue
|
# Redis configuration for job queue
|
||||||
self.redis_host: str = os.getenv("REDIS_HOST", "mvp-redis")
|
self.redis_host: str = os.getenv("REDIS_HOST", "mvp-redis")
|
||||||
|
|||||||
32
ocr/app/engines/__init__.py
Normal file
32
ocr/app/engines/__init__.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
"""OCR engine abstraction layer.
|
||||||
|
|
||||||
|
Provides a pluggable engine interface for OCR processing,
|
||||||
|
decoupling extractors from specific OCR libraries.
|
||||||
|
|
||||||
|
Engines:
|
||||||
|
- PaddleOcrEngine: PaddleOCR PP-OCRv4 (primary, CPU-only)
|
||||||
|
- CloudEngine: Google Vision TEXT_DETECTION (optional cloud fallback)
|
||||||
|
- HybridEngine: Primary + fallback with confidence threshold
|
||||||
|
"""
|
||||||
|
|
||||||
|
from app.engines.base_engine import (
|
||||||
|
EngineError,
|
||||||
|
EngineProcessingError,
|
||||||
|
EngineUnavailableError,
|
||||||
|
OcrConfig,
|
||||||
|
OcrEngine,
|
||||||
|
OcrEngineResult,
|
||||||
|
WordBox,
|
||||||
|
)
|
||||||
|
from app.engines.engine_factory import create_engine
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"OcrEngine",
|
||||||
|
"OcrConfig",
|
||||||
|
"OcrEngineResult",
|
||||||
|
"WordBox",
|
||||||
|
"EngineError",
|
||||||
|
"EngineUnavailableError",
|
||||||
|
"EngineProcessingError",
|
||||||
|
"create_engine",
|
||||||
|
]
|
||||||
88
ocr/app/engines/base_engine.py
Normal file
88
ocr/app/engines/base_engine.py
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
"""OCR engine abstract base class and shared data types."""
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
# --- Exception hierarchy ---
|
||||||
|
|
||||||
|
|
||||||
|
class EngineError(Exception):
|
||||||
|
"""Base exception for all OCR engine errors."""
|
||||||
|
|
||||||
|
|
||||||
|
class EngineUnavailableError(EngineError):
|
||||||
|
"""Raised when an engine cannot be initialized (missing binary, bad config)."""
|
||||||
|
|
||||||
|
|
||||||
|
class EngineProcessingError(EngineError):
|
||||||
|
"""Raised when an engine fails to process an image."""
|
||||||
|
|
||||||
|
|
||||||
|
# --- Data types ---
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class WordBox:
|
||||||
|
"""A single recognized word with position and confidence."""
|
||||||
|
|
||||||
|
text: str
|
||||||
|
confidence: float # 0.0-1.0
|
||||||
|
x: int = 0
|
||||||
|
y: int = 0
|
||||||
|
width: int = 0
|
||||||
|
height: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class OcrConfig:
|
||||||
|
"""Engine-agnostic OCR configuration.
|
||||||
|
|
||||||
|
Common fields cover the most frequent needs. Engine-specific
|
||||||
|
parameters go into ``hints`` so the interface stays stable.
|
||||||
|
"""
|
||||||
|
|
||||||
|
char_whitelist: str | None = None # e.g. VIN: "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
|
||||||
|
single_line: bool = False # Treat image as a single text line
|
||||||
|
single_word: bool = False # Treat image as a single word
|
||||||
|
use_angle_cls: bool = True # Enable angle classification (PaddleOCR)
|
||||||
|
hints: dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class OcrEngineResult:
    """The common result shape every engine implementation hands back."""

    # Recognized text for the whole image
    text: str
    # Overall confidence, expected range 0.0-1.0
    confidence: float
    # Per-word details; may be empty
    word_boxes: list[WordBox]
    # Identifier of the producing engine: "paddleocr", "google_vision"
    engine_name: str
|
||||||
|
|
||||||
|
|
||||||
|
# --- Abstract base ---
|
||||||
|
|
||||||
|
|
||||||
|
class OcrEngine(ABC):
    """Contract that every concrete OCR engine has to fulfil.

    Subclasses must provide both ``recognize()`` and the ``name`` property;
    the class cannot be instantiated until they do.
    """

    @abstractmethod
    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Perform OCR on image bytes that have already been preprocessed.

        Args:
            image_bytes: Raw image bytes (PNG/JPEG).
            config: Engine-agnostic configuration.

        Returns:
            Normalized OCR result.

        Raises:
            EngineProcessingError: If recognition fails.
            EngineUnavailableError: If the engine is not ready.
        """

    @property
    @abstractmethod
    def name(self) -> str:
        """Short identifier reported as ``OcrEngineResult.engine_name``."""
|
||||||
166
ocr/app/engines/cloud_engine.py
Normal file
166
ocr/app/engines/cloud_engine.py
Normal file
@@ -0,0 +1,166 @@
|
|||||||
|
"""Google Vision cloud OCR engine with lazy initialization."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from app.engines.base_engine import (
|
||||||
|
EngineProcessingError,
|
||||||
|
EngineUnavailableError,
|
||||||
|
OcrConfig,
|
||||||
|
OcrEngine,
|
||||||
|
OcrEngineResult,
|
||||||
|
WordBox,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Default path for Google Vision service account key (Docker secret mount)
_DEFAULT_KEY_PATH = "/run/secrets/google-vision-key.json"


class CloudEngine(OcrEngine):
    """Google Vision TEXT_DETECTION wrapper with lazy initialization.

    The Vision client is built on the first ``recognize()`` call rather than
    in ``__init__``, so constructing the engine never fails when the secret
    file is missing or ``google-cloud-vision`` is not installed. Those
    problems surface as ``EngineUnavailableError`` at recognition time.
    """

    # TEXT_DETECTION annotations carry no per-word confidence, so every word
    # is reported with this fixed stand-in value to keep results comparable
    # with engines that do report one.
    _ASSUMED_WORD_CONFIDENCE = 0.95

    def __init__(self, key_path: str | None = None) -> None:
        """Remember where the key lives; no client is created yet.

        Args:
            key_path: Path to the service-account JSON key. When omitted or
                empty, ``GOOGLE_VISION_KEY_PATH`` is consulted, falling back
                to the default Docker secret mount path.
        """
        self._key_path = key_path or os.getenv(
            "GOOGLE_VISION_KEY_PATH", _DEFAULT_KEY_PATH
        )
        self._client: Any | None = None

    @property
    def name(self) -> str:
        """Identifier reported in ``OcrEngineResult.engine_name``."""
        return "google_vision"

    # ------------------------------------------------------------------
    # Lazy init
    # ------------------------------------------------------------------

    def _get_client(self) -> Any:
        """Return the cached Vision client, creating it on first use.

        Raises:
            EngineUnavailableError: If the key file does not exist, the
                ``google-cloud-vision`` package cannot be imported, or client
                construction fails for any other reason.
        """
        if self._client is not None:
            return self._client

        # Fail early with a clear message when the credentials are missing.
        if not os.path.isfile(self._key_path):
            raise EngineUnavailableError(
                f"Google Vision key not found at {self._key_path}. "
                "Set GOOGLE_VISION_KEY_PATH or mount the secret."
            )

        try:
            from google.cloud import vision  # type: ignore[import-untyped]

            # Give the key file to this client only. Exporting
            # GOOGLE_APPLICATION_CREDENTIALS would silently change the
            # credentials of every other Google client in the process.
            self._client = vision.ImageAnnotatorClient.from_service_account_file(
                self._key_path
            )
            logger.info(
                "Google Vision client initialized (key: %s)", self._key_path
            )
            return self._client
        except ImportError as exc:
            raise EngineUnavailableError(
                "google-cloud-vision is not installed. "
                "Install with: pip install google-cloud-vision"
            ) from exc
        except Exception as exc:
            raise EngineUnavailableError(
                f"Failed to initialize Google Vision client: {exc}"
            ) from exc

    # ------------------------------------------------------------------
    # OCR
    # ------------------------------------------------------------------

    def _empty_result(self) -> OcrEngineResult:
        """Build the result returned when Vision detects no text."""
        return OcrEngineResult(
            text="",
            confidence=0.0,
            word_boxes=[],
            engine_name=self.name,
        )

    @staticmethod
    def _bounding_box(vertices: Any) -> tuple[int, int, int, int]:
        """Collapse polygon vertices into ``(x, y, width, height)``.

        An annotation without vertices yields an all-zero box instead of
        raising, so one malformed annotation cannot fail the whole image.
        """
        xs = [vertex.x for vertex in vertices]
        ys = [vertex.y for vertex in vertices]
        if not xs:
            return 0, 0, 0, 0
        x_min, y_min = min(xs), min(ys)
        return x_min, y_min, max(xs) - x_min, max(ys) - y_min

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run Google Vision TEXT_DETECTION on image bytes.

        Only ``config.char_whitelist`` is consulted: when set, characters
        outside it are dropped from every word, and from the full text
        (where spaces and newlines are additionally kept).

        Args:
            image_bytes: Encoded image content sent to the API as-is.
            config: Engine-agnostic configuration.

        Returns:
            The full text, the word boxes that survive filtering, and the
            mean of their (fixed) confidences, or 0.0 when no word survives.

        Raises:
            EngineUnavailableError: If the client cannot be created.
            EngineProcessingError: If the API reports an error or anything
                else goes wrong during recognition.
        """
        client = self._get_client()

        try:
            from google.cloud import vision  # type: ignore[import-untyped]

            image = vision.Image(content=image_bytes)
            response = client.text_detection(image=image)

            if response.error.message:
                raise EngineProcessingError(
                    f"Google Vision API error: {response.error.message}"
                )

            annotations = response.text_annotations
            if not annotations:
                return self._empty_result()

            # Build the whitelist lookup once instead of once per annotation.
            allowed = set(config.char_whitelist) if config.char_whitelist else None

            # First annotation is the full-page text; the rest are words
            full_text = annotations[0].description.strip()
            if allowed is not None:
                full_text = "".join(
                    ch for ch in full_text if ch in allowed or ch in " \n"
                )

            word_boxes: list[WordBox] = []
            for annotation in annotations[1:]:
                text = annotation.description
                if allowed is not None:
                    text = "".join(ch for ch in text if ch in allowed)
                text = text.strip()
                if not text:
                    continue

                x, y, width, height = self._bounding_box(
                    annotation.bounding_poly.vertices
                )
                word_boxes.append(
                    WordBox(
                        text=text,
                        confidence=self._ASSUMED_WORD_CONFIDENCE,
                        x=x,
                        y=y,
                        width=width,
                        height=height,
                    )
                )

            confidences = [box.confidence for box in word_boxes]
            avg_confidence = (
                sum(confidences) / len(confidences) if confidences else 0.0
            )

            return OcrEngineResult(
                text=full_text,
                confidence=avg_confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )

        except (EngineUnavailableError, EngineProcessingError):
            raise
        except Exception as exc:
            raise EngineProcessingError(
                f"Google Vision recognition failed: {exc}"
            ) from exc
|
||||||
86
ocr/app/engines/engine_factory.py
Normal file
86
ocr/app/engines/engine_factory.py
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
"""Factory function for creating OCR engine instances from configuration."""
|
||||||
|
|
||||||
|
import importlib
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from app.config import settings
|
||||||
|
from app.engines.base_engine import EngineUnavailableError, OcrEngine
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Maps each engine identifier to the dotted path of its implementing class.
# Only standalone engines are listed; the hybrid wrapper is built separately.
_ENGINE_REGISTRY: dict[str, str] = {
    "paddleocr": "app.engines.paddle_engine.PaddleOcrEngine",
    "google_vision": "app.engines.cloud_engine.CloudEngine",
}


def _create_single_engine(name: str) -> OcrEngine:
    """Build one engine from its registry identifier.

    The engine's module is imported on demand, so a module is loaded only
    when its engine is actually requested.

    Args:
        name: Key into ``_ENGINE_REGISTRY``.

    Returns:
        A new instance of the registered engine class.

    Raises:
        EngineUnavailableError: If ``name`` is not registered, or if
            importing the module or constructing the engine fails.
    """
    if name not in _ENGINE_REGISTRY:
        known = list(_ENGINE_REGISTRY.keys())
        raise EngineUnavailableError(
            f"Unknown engine '{name}'. Available: {known}"
        )

    dotted_path = _ENGINE_REGISTRY[name]
    module_path, class_name = dotted_path.rsplit(".", 1)

    try:
        engine_cls = getattr(importlib.import_module(module_path), class_name)
        instance: OcrEngine = engine_cls()
        logger.info("Created OCR engine: %s", name)
        return instance
    except EngineUnavailableError:
        # Already the right type; let it through untouched.
        raise
    except Exception as exc:
        raise EngineUnavailableError(
            f"Failed to create engine '{name}': {exc}"
        ) from exc
|
||||||
|
|
||||||
|
|
||||||
|
def create_engine(engine_name: str | None = None) -> OcrEngine:
    """Instantiate an OCR engine by name (defaults to config value).

    When a fallback engine is configured (``OCR_FALLBACK_ENGINE != "none"``),
    returns a ``HybridEngine`` that wraps the primary with the fallback.
    The primary engine is returned on its own when:

    * the fallback setting is unset, empty, or ``"none"``;
    * the fallback names the same engine as the primary, since re-running
      the same engine on the same image cannot improve the result;
    * the fallback engine cannot be created (logged, non-fatal).

    Args:
        engine_name: Engine identifier ("paddleocr", "google_vision").
            Falls back to ``settings.ocr_primary_engine``.

    Returns:
        Initialized OcrEngine instance (possibly a HybridEngine wrapper).

    Raises:
        EngineUnavailableError: If the primary engine cannot be loaded.
    """
    name = (engine_name or settings.ocr_primary_engine).lower().strip()
    primary = _create_single_engine(name)

    # Check for cloud fallback configuration. An unset (None) value is
    # treated like "none" instead of crashing on ``.lower()``.
    fallback_name = (settings.ocr_fallback_engine or "none").lower().strip()
    if fallback_name == "none" or not fallback_name:
        return primary

    if fallback_name == name:
        logger.warning(
            "Fallback engine '%s' is the same as the primary engine, "
            "proceeding without fallback",
            fallback_name,
        )
        return primary

    # Create fallback engine (failure is non-fatal -- log and return primary only)
    try:
        fallback = _create_single_engine(fallback_name)
    except EngineUnavailableError as exc:
        logger.warning(
            "Fallback engine '%s' unavailable, proceeding without fallback: %s",
            fallback_name,
            exc,
        )
        return primary

    # Imported here so the hybrid module is loaded only when it is needed.
    from app.engines.hybrid_engine import HybridEngine

    threshold = settings.ocr_fallback_threshold
    hybrid = HybridEngine(primary=primary, fallback=fallback, threshold=threshold)
    logger.info(
        "Created hybrid engine: primary=%s, fallback=%s, threshold=%.2f",
        name,
        fallback_name,
        threshold,
    )
    return hybrid
|
||||||
116
ocr/app/engines/hybrid_engine.py
Normal file
116
ocr/app/engines/hybrid_engine.py
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
"""Hybrid OCR engine: primary engine with optional cloud fallback."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
|
||||||
|
from app.engines.base_engine import (
|
||||||
|
EngineError,
|
||||||
|
EngineProcessingError,
|
||||||
|
OcrConfig,
|
||||||
|
OcrEngine,
|
||||||
|
OcrEngineResult,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Time budget (seconds) for the cloud fallback. It is compared against the
# measured duration after the fallback call has returned.
_CLOUD_TIMEOUT_SECONDS = 5.0


class HybridEngine(OcrEngine):
    """Primary engine plus an optional fallback for low-confidence results.

    The fallback is consulted only when the primary result's confidence is
    below ``threshold``. With no fallback (the default) this engine simply
    returns whatever the primary returns. Fallback problems never fail the
    call: if the fallback raises or exceeds the time budget, the primary
    result is returned.

    Note that the time budget does not interrupt a slow fallback call; the
    duration is measured and checked once the call has completed.
    """

    def __init__(
        self,
        primary: OcrEngine,
        fallback: OcrEngine | None = None,
        threshold: float = 0.6,
    ) -> None:
        """Store the two engines and the confidence cut-off.

        Args:
            primary: Engine that is always run.
            fallback: Engine tried when the primary confidence is too low.
            threshold: Minimum primary confidence that avoids the fallback.
        """
        self._primary = primary
        self._fallback = fallback
        self._threshold = threshold

    @property
    def name(self) -> str:
        """Composite identifier of the form ``hybrid(<primary>+<fallback>)``."""
        if self._fallback:
            secondary = self._fallback.name
        else:
            secondary = "none"
        return f"hybrid({self._primary.name}+{secondary})"

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run the primary engine and, if needed, compare with the fallback.

        Args:
            image_bytes: Image content passed unchanged to both engines.
            config: Configuration passed unchanged to both engines.

        Returns:
            The fallback result only when it finished within the time budget
            and its confidence is strictly higher than the primary's;
            otherwise the primary result.

        Raises:
            Whatever the primary engine raises; fallback errors are logged
            and swallowed.
        """
        primary_result = self._primary.recognize(image_bytes, config)

        # Good enough already: skip the fallback entirely.
        if primary_result.confidence >= self._threshold:
            logger.debug(
                "Primary engine confidence %.2f >= threshold %.2f, no fallback",
                primary_result.confidence,
                self._threshold,
            )
            return primary_result

        # Low confidence, but there is nothing to fall back to.
        if self._fallback is None:
            logger.debug(
                "Primary confidence %.2f < threshold %.2f but no fallback configured",
                primary_result.confidence,
                self._threshold,
            )
            return primary_result

        logger.info(
            "Primary confidence %.2f < threshold %.2f, trying fallback (%s)",
            primary_result.confidence,
            self._threshold,
            self._fallback.name,
        )

        try:
            started_at = time.monotonic()
            fallback_result = self._fallback.recognize(image_bytes, config)
            duration = time.monotonic() - started_at

            if duration > _CLOUD_TIMEOUT_SECONDS:
                # Too slow: discard the fallback answer.
                logger.warning(
                    "Cloud fallback took %.1fs (> %.1fs limit), using primary result",
                    duration,
                    _CLOUD_TIMEOUT_SECONDS,
                )
            elif fallback_result.confidence > primary_result.confidence:
                logger.info(
                    "Fallback confidence %.2f > primary %.2f, using fallback result",
                    fallback_result.confidence,
                    primary_result.confidence,
                )
                return fallback_result
            else:
                logger.info(
                    "Primary confidence %.2f >= fallback %.2f, keeping primary result",
                    primary_result.confidence,
                    fallback_result.confidence,
                )
        except EngineError as exc:
            logger.warning(
                "Cloud fallback failed (%s), returning primary result: %s",
                self._fallback.name,
                exc,
            )
        except Exception as exc:
            logger.warning(
                "Unexpected cloud fallback error, returning primary result: %s",
                exc,
            )

        # Every path that did not pick the fallback ends up here.
        return primary_result
|
||||||
157
ocr/app/engines/paddle_engine.py
Normal file
157
ocr/app/engines/paddle_engine.py
Normal file
@@ -0,0 +1,157 @@
|
|||||||
|
"""PaddleOCR engine wrapper using PP-OCRv4 models."""
|
||||||
|
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from app.engines.base_engine import (
|
||||||
|
EngineProcessingError,
|
||||||
|
EngineUnavailableError,
|
||||||
|
OcrConfig,
|
||||||
|
OcrEngine,
|
||||||
|
OcrEngineResult,
|
||||||
|
WordBox,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PaddleOcrEngine(OcrEngine):
    """PaddleOCR PP-OCRv4 engine with angle classification, CPU-only.

    The PaddleOCR pipeline is created on the first ``recognize()`` call, so
    constructing the engine never fails when ``paddleocr`` is missing; that
    surfaces as ``EngineUnavailableError`` at recognition time.
    """

    def __init__(self) -> None:
        """Create the engine without loading PaddleOCR yet."""
        self._ocr: Any | None = None

    @property
    def name(self) -> str:
        """Identifier reported in ``OcrEngineResult.engine_name``."""
        return "paddleocr"

    def _get_ocr(self) -> Any:
        """Lazy-initialize PaddleOCR instance on first use.

        Raises:
            EngineUnavailableError: If ``paddleocr`` cannot be imported or
                the pipeline fails to initialize.
        """
        if self._ocr is not None:
            return self._ocr
        try:
            from paddleocr import PaddleOCR  # type: ignore[import-untyped]

            self._ocr = PaddleOCR(
                ocr_version="PP-OCRv4",
                use_textline_orientation=True,
                lang="en",
                device="cpu",
                enable_mkldnn=False,
            )
            logger.info("PaddleOCR PP-OCRv4 initialized (CPU, textline_orientation=True)")
            return self._ocr
        except ImportError as exc:
            raise EngineUnavailableError(
                "paddleocr is not installed. "
                "Install with: pip install paddlepaddle paddleocr"
            ) from exc
        except Exception as exc:
            raise EngineUnavailableError(
                f"Failed to initialize PaddleOCR: {exc}"
            ) from exc

    def _empty_result(self) -> OcrEngineResult:
        """Build the result returned when PaddleOCR detects no text."""
        return OcrEngineResult(
            text="",
            confidence=0.0,
            word_boxes=[],
            engine_name=self.name,
        )

    @staticmethod
    def _bounding_box(poly: Any) -> tuple[int, int, int, int]:
        """Convert a polygon of ``(x, y)`` points into ``(x, y, width, height)``.

        An empty polygon yields an all-zero box instead of raising, so one
        malformed detection cannot fail the whole image.
        """
        xs = [pt[0] for pt in poly]
        ys = [pt[1] for pt in poly]
        if not xs:
            return 0, 0, 0, 0
        x_min, y_min = int(min(xs)), int(min(ys))
        x_max, y_max = int(max(xs)), int(max(ys))
        return x_min, y_min, x_max - x_min, y_max - y_min

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run PaddleOCR on image bytes.

        PaddleOCR v3.x ``predict()`` returns an iterator of result objects.
        Each result's ``.json`` property returns a dict. The OCR fields
        (``dt_polys``, ``rec_texts``, ``rec_scores``) may be at the top
        level or nested under a ``"res"`` key depending on the version.
        Only the first result is used.

        Args:
            image_bytes: Encoded image; decoded with Pillow and converted
                to RGB before being handed to PaddleOCR.
            config: ``char_whitelist`` filters the recognized characters;
                ``use_angle_cls=False`` turns textline orientation off for
                this call.

        Returns:
            Recognized lines joined with single spaces, one ``WordBox`` per
            kept line, and the mean of their scores (0.0 when none remain).

        Raises:
            EngineUnavailableError: If PaddleOCR cannot be initialized.
            EngineProcessingError: If decoding or recognition fails.
        """
        ocr = self._get_ocr()

        try:
            import numpy as np  # type: ignore[import-untyped]
            from PIL import Image

            # Close the decoded image as soon as the array has been copied.
            with Image.open(io.BytesIO(image_bytes)) as image:
                img_array = np.array(image.convert("RGB"))

            # The pipeline is built with textline orientation enabled. It is
            # overridden only when the caller opts out, so the default path
            # issues exactly the same predict() call as before.
            predict_kwargs: dict[str, Any] = (
                {} if config.use_angle_cls else {"use_textline_orientation": False}
            )
            results = list(ocr.predict(img_array, **predict_kwargs))

            if not results:
                return self._empty_result()

            raw = results[0].json
            # Unwrap nested "res" key if present (save_to_json format)
            res = raw.get("res", raw) if isinstance(raw, dict) else raw
            logger.debug(
                "PaddleOCR result keys: %s",
                list(res.keys()) if isinstance(res, dict) else type(res).__name__,
            )
            dt_polys = res.get("dt_polys", [])
            rec_texts = res.get("rec_texts", [])
            rec_scores = res.get("rec_scores", [])

            if not rec_texts:
                return self._empty_result()

            # Build the whitelist lookup once instead of once per text line.
            allowed = set(config.char_whitelist) if config.char_whitelist else None

            word_boxes: list[WordBox] = []
            for i, text in enumerate(rec_texts):
                # A missing score counts as zero confidence.
                conf = float(rec_scores[i]) if i < len(rec_scores) else 0.0

                if allowed is not None:
                    text = "".join(ch for ch in text if ch in allowed)
                text = text.strip()
                if not text:
                    continue

                # A text without a matching polygon keeps the zero box.
                if i < len(dt_polys):
                    x, y, width, height = self._bounding_box(dt_polys[i])
                else:
                    x, y, width, height = 0, 0, 0, 0

                word_boxes.append(
                    WordBox(
                        text=text,
                        confidence=conf,
                        x=x,
                        y=y,
                        width=width,
                        height=height,
                    )
                )

            combined_text = " ".join(box.text for box in word_boxes)
            confidences = [box.confidence for box in word_boxes]
            avg_confidence = (
                sum(confidences) / len(confidences) if confidences else 0.0
            )

            return OcrEngineResult(
                text=combined_text,
                confidence=avg_confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )

        except (EngineUnavailableError, EngineProcessingError):
            raise
        except Exception as exc:
            raise EngineProcessingError(
                f"PaddleOCR recognition failed: {exc}"
            ) from exc
|
||||||
@@ -5,9 +5,9 @@ import time
|
|||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Callable, Optional
|
from typing import Callable, Optional
|
||||||
|
|
||||||
import pytesseract
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
|
from app.engines import create_engine, OcrConfig
|
||||||
from app.preprocessors.pdf_preprocessor import pdf_preprocessor, PdfInfo
|
from app.preprocessors.pdf_preprocessor import pdf_preprocessor, PdfInfo
|
||||||
from app.table_extraction.detector import table_detector, DetectedTable
|
from app.table_extraction.detector import table_detector, DetectedTable
|
||||||
from app.table_extraction.parser import table_parser, ParsedScheduleRow
|
from app.table_extraction.parser import table_parser, ParsedScheduleRow
|
||||||
@@ -243,8 +243,9 @@ class ManualExtractor:
|
|||||||
|
|
||||||
# OCR the full page
|
# OCR the full page
|
||||||
try:
|
try:
|
||||||
image = Image.open(io.BytesIO(image_bytes))
|
engine = create_engine()
|
||||||
ocr_text = pytesseract.image_to_string(image)
|
ocr_result = engine.recognize(image_bytes, OcrConfig())
|
||||||
|
ocr_text = ocr_result.text
|
||||||
|
|
||||||
# Mark tables as maintenance if page contains maintenance keywords
|
# Mark tables as maintenance if page contains maintenance keywords
|
||||||
for table in detected_tables:
|
for table in detected_tables:
|
||||||
@@ -358,8 +359,9 @@ class ManualExtractor:
|
|||||||
|
|
||||||
if not text and first_page.image_bytes:
|
if not text and first_page.image_bytes:
|
||||||
# OCR first page
|
# OCR first page
|
||||||
image = Image.open(io.BytesIO(first_page.image_bytes))
|
engine = create_engine()
|
||||||
text = pytesseract.image_to_string(image)
|
ocr_result = engine.recognize(first_page.image_bytes, OcrConfig())
|
||||||
|
text = ocr_result.text
|
||||||
|
|
||||||
if text:
|
if text:
|
||||||
return self._parse_vehicle_from_text(text)
|
return self._parse_vehicle_from_text(text)
|
||||||
|
|||||||
@@ -1,16 +1,13 @@
|
|||||||
"""Receipt-specific OCR extractor with field extraction."""
|
"""Receipt-specific OCR extractor with field extraction."""
|
||||||
import io
|
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
import magic
|
import magic
|
||||||
import pytesseract
|
|
||||||
from PIL import Image
|
|
||||||
from pillow_heif import register_heif_opener
|
from pillow_heif import register_heif_opener
|
||||||
|
|
||||||
from app.config import settings
|
from app.engines import OcrConfig, create_engine
|
||||||
from app.extractors.base import BaseExtractor
|
from app.extractors.base import BaseExtractor
|
||||||
from app.preprocessors.receipt_preprocessor import receipt_preprocessor
|
from app.preprocessors.receipt_preprocessor import receipt_preprocessor
|
||||||
from app.patterns import currency_matcher, date_matcher, fuel_matcher
|
from app.patterns import currency_matcher, date_matcher, fuel_matcher
|
||||||
@@ -53,8 +50,8 @@ class ReceiptExtractor(BaseExtractor):
|
|||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
"""Initialize receipt extractor."""
|
"""Initialize receipt extractor with engine from factory."""
|
||||||
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
|
self._engine = create_engine()
|
||||||
|
|
||||||
def extract(
|
def extract(
|
||||||
self,
|
self,
|
||||||
@@ -150,26 +147,19 @@ class ReceiptExtractor(BaseExtractor):
|
|||||||
detected = mime.from_buffer(file_bytes)
|
detected = mime.from_buffer(file_bytes)
|
||||||
return detected or "application/octet-stream"
|
return detected or "application/octet-stream"
|
||||||
|
|
||||||
def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str:
|
def _perform_ocr(self, image_bytes: bytes) -> str:
|
||||||
"""
|
"""
|
||||||
Perform OCR on preprocessed image.
|
Perform OCR on preprocessed image via engine abstraction.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image_bytes: Preprocessed image bytes
|
image_bytes: Preprocessed image bytes
|
||||||
psm: Tesseract page segmentation mode
|
|
||||||
4 = Assume single column of text
|
|
||||||
6 = Uniform block of text
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Raw OCR text
|
Raw OCR text
|
||||||
"""
|
"""
|
||||||
image = Image.open(io.BytesIO(image_bytes))
|
config = OcrConfig()
|
||||||
|
result = self._engine.recognize(image_bytes, config)
|
||||||
# Configure Tesseract for receipt OCR
|
return result.text
|
||||||
# PSM 4 works well for columnar receipt text
|
|
||||||
config = f"--psm {psm}"
|
|
||||||
|
|
||||||
return pytesseract.image_to_string(image, config=config)
|
|
||||||
|
|
||||||
def _detect_receipt_type(self, text: str) -> str:
|
def _detect_receipt_type(self, text: str) -> str:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
"""VIN-specific OCR extractor with preprocessing and validation."""
|
"""VIN-specific OCR extractor with preprocessing and validation."""
|
||||||
import io
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
@@ -8,11 +7,10 @@ from datetime import datetime
|
|||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import magic
|
import magic
|
||||||
import pytesseract
|
|
||||||
from PIL import Image
|
|
||||||
from pillow_heif import register_heif_opener
|
from pillow_heif import register_heif_opener
|
||||||
|
|
||||||
from app.config import settings
|
from app.config import settings
|
||||||
|
from app.engines import OcrConfig, create_engine
|
||||||
from app.extractors.base import BaseExtractor
|
from app.extractors.base import BaseExtractor
|
||||||
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
|
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
|
||||||
from app.validators.vin_validator import vin_validator
|
from app.validators.vin_validator import vin_validator
|
||||||
@@ -56,15 +54,15 @@ class VinExtractor(BaseExtractor):
|
|||||||
"image/heif",
|
"image/heif",
|
||||||
}
|
}
|
||||||
|
|
||||||
# VIN character whitelist for Tesseract
|
# VIN character whitelist (passed to engine for post-OCR filtering)
|
||||||
VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
|
VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
|
||||||
|
|
||||||
# Fixed debug output directory (inside container)
|
# Fixed debug output directory (inside container)
|
||||||
DEBUG_DIR = "/tmp/vin-debug"
|
DEBUG_DIR = "/tmp/vin-debug"
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
"""Initialize VIN extractor."""
|
"""Initialize VIN extractor with engine from factory."""
|
||||||
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
|
self._engine = create_engine()
|
||||||
self._debug = settings.log_level.upper() == "DEBUG"
|
self._debug = settings.log_level.upper() == "DEBUG"
|
||||||
|
|
||||||
def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
|
def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
|
||||||
@@ -135,21 +133,21 @@ class VinExtractor(BaseExtractor):
|
|||||||
|
|
||||||
# Perform OCR with VIN-optimized settings
|
# Perform OCR with VIN-optimized settings
|
||||||
raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
|
raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
|
||||||
logger.debug("PSM 6 raw text: '%s'", raw_text)
|
logger.debug("Primary OCR raw text: '%s'", raw_text)
|
||||||
logger.debug("PSM 6 word confidences: %s", word_confidences)
|
logger.debug("Primary OCR word confidences: %s", word_confidences)
|
||||||
|
|
||||||
# Extract VIN candidates from raw text
|
# Extract VIN candidates from raw text
|
||||||
candidates = vin_validator.extract_candidates(raw_text)
|
candidates = vin_validator.extract_candidates(raw_text)
|
||||||
logger.debug("PSM 6 candidates: %s", candidates)
|
logger.debug("Primary OCR candidates: %s", candidates)
|
||||||
|
|
||||||
if not candidates:
|
if not candidates:
|
||||||
# No VIN candidates found - try with different PSM modes
|
# No VIN candidates found - try alternate OCR configurations
|
||||||
candidates = self._try_alternate_ocr(preprocessed_bytes)
|
candidates = self._try_alternate_ocr(preprocessed_bytes)
|
||||||
|
|
||||||
if not candidates:
|
if not candidates:
|
||||||
# Try grayscale-only (no thresholding) — the Tesseract
|
# Try grayscale-only (no thresholding) — OCR engines often
|
||||||
# LSTM engine often performs better on non-binarized input
|
# perform better on non-binarized input because they do
|
||||||
# because it does its own internal preprocessing.
|
# their own internal preprocessing.
|
||||||
gray_result = vin_preprocessor.preprocess(
|
gray_result = vin_preprocessor.preprocess(
|
||||||
image_bytes, apply_threshold=False
|
image_bytes, apply_threshold=False
|
||||||
)
|
)
|
||||||
@@ -166,9 +164,9 @@ class VinExtractor(BaseExtractor):
|
|||||||
raw_text, word_confidences = self._perform_ocr(
|
raw_text, word_confidences = self._perform_ocr(
|
||||||
gray_result.image_bytes
|
gray_result.image_bytes
|
||||||
)
|
)
|
||||||
logger.debug("Gray PSM 6 raw text: '%s'", raw_text)
|
logger.debug("Gray primary raw text: '%s'", raw_text)
|
||||||
candidates = vin_validator.extract_candidates(raw_text)
|
candidates = vin_validator.extract_candidates(raw_text)
|
||||||
logger.debug("Gray PSM 6 candidates: %s", candidates)
|
logger.debug("Gray primary candidates: %s", candidates)
|
||||||
if not candidates:
|
if not candidates:
|
||||||
candidates = self._try_alternate_ocr(
|
candidates = self._try_alternate_ocr(
|
||||||
gray_result.image_bytes, prefix="Gray"
|
gray_result.image_bytes, prefix="Gray"
|
||||||
@@ -188,9 +186,9 @@ class VinExtractor(BaseExtractor):
|
|||||||
)
|
)
|
||||||
|
|
||||||
raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
|
raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
|
||||||
logger.debug("Otsu PSM 6 raw text: '%s'", raw_text)
|
logger.debug("Otsu primary raw text: '%s'", raw_text)
|
||||||
candidates = vin_validator.extract_candidates(raw_text)
|
candidates = vin_validator.extract_candidates(raw_text)
|
||||||
logger.debug("Otsu PSM 6 candidates: %s", candidates)
|
logger.debug("Otsu primary candidates: %s", candidates)
|
||||||
if not candidates:
|
if not candidates:
|
||||||
candidates = self._try_alternate_ocr(
|
candidates = self._try_alternate_ocr(
|
||||||
otsu_result.image_bytes, prefix="Otsu"
|
otsu_result.image_bytes, prefix="Otsu"
|
||||||
@@ -280,52 +278,31 @@ class VinExtractor(BaseExtractor):
|
|||||||
return detected or "application/octet-stream"
|
return detected or "application/octet-stream"
|
||||||
|
|
||||||
def _perform_ocr(
|
def _perform_ocr(
|
||||||
self, image_bytes: bytes, psm: int = 6
|
self,
|
||||||
|
image_bytes: bytes,
|
||||||
|
single_line: bool = False,
|
||||||
|
single_word: bool = False,
|
||||||
) -> tuple[str, list[float]]:
|
) -> tuple[str, list[float]]:
|
||||||
"""
|
"""
|
||||||
Perform OCR with VIN-optimized settings.
|
Perform OCR with VIN-optimized settings via engine abstraction.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image_bytes: Preprocessed image bytes
|
image_bytes: Preprocessed image bytes
|
||||||
psm: Tesseract page segmentation mode
|
single_line: Treat image as a single text line
|
||||||
6 = Uniform block of text
|
single_word: Treat image as a single word
|
||||||
7 = Single text line
|
|
||||||
8 = Single word
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple of (raw_text, word_confidences)
|
Tuple of (raw_text, word_confidences)
|
||||||
"""
|
"""
|
||||||
image = Image.open(io.BytesIO(image_bytes))
|
config = OcrConfig(
|
||||||
|
char_whitelist=self.VIN_WHITELIST,
|
||||||
# Configure Tesseract for VIN extraction
|
single_line=single_line,
|
||||||
# OEM 1 = LSTM neural network engine (best accuracy)
|
single_word=single_word,
|
||||||
# NOTE: tessedit_char_whitelist does NOT work with OEM 1 (LSTM).
|
use_angle_cls=True,
|
||||||
# Using it causes empty/erratic output. Character filtering is
|
|
||||||
# handled post-OCR by vin_validator.correct_ocr_errors() instead.
|
|
||||||
config = (
|
|
||||||
f"--psm {psm} "
|
|
||||||
f"--oem 1 "
|
|
||||||
f"-c load_system_dawg=false "
|
|
||||||
f"-c load_freq_dawg=false"
|
|
||||||
)
|
)
|
||||||
|
result = self._engine.recognize(image_bytes, config)
|
||||||
# Get detailed OCR data
|
word_confidences = [wb.confidence for wb in result.word_boxes]
|
||||||
ocr_data = pytesseract.image_to_data(
|
return result.text, word_confidences
|
||||||
image, config=config, output_type=pytesseract.Output.DICT
|
|
||||||
)
|
|
||||||
|
|
||||||
# Extract words and confidences
|
|
||||||
words = []
|
|
||||||
confidences = []
|
|
||||||
|
|
||||||
for i, text in enumerate(ocr_data["text"]):
|
|
||||||
conf = int(ocr_data["conf"][i])
|
|
||||||
if text.strip() and conf > 0:
|
|
||||||
words.append(text.strip())
|
|
||||||
confidences.append(conf / 100.0)
|
|
||||||
|
|
||||||
raw_text = " ".join(words)
|
|
||||||
return raw_text, confidences
|
|
||||||
|
|
||||||
def _try_alternate_ocr(
|
def _try_alternate_ocr(
|
||||||
self,
|
self,
|
||||||
@@ -335,21 +312,25 @@ class VinExtractor(BaseExtractor):
|
|||||||
"""
|
"""
|
||||||
Try alternate OCR configurations when initial extraction fails.
|
Try alternate OCR configurations when initial extraction fails.
|
||||||
|
|
||||||
PSM modes tried in order:
|
Modes tried:
|
||||||
7 - Single text line
|
single-line - Treat as a single text line
|
||||||
8 - Single word
|
single-word - Treat as a single word
|
||||||
11 - Sparse text (finds text in any order, good for angled photos)
|
|
||||||
13 - Raw line (no Tesseract heuristics, good for clean VIN plates)
|
PaddleOCR angle classification handles rotated/angled text
|
||||||
|
inherently, so no PSM mode fallbacks are needed.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of VIN candidates
|
List of VIN candidates
|
||||||
"""
|
"""
|
||||||
tag = f"{prefix} " if prefix else ""
|
tag = f"{prefix} " if prefix else ""
|
||||||
for psm in (7, 8, 11, 13):
|
for mode_name, kwargs in [
|
||||||
raw_text, _ = self._perform_ocr(image_bytes, psm=psm)
|
("single-line", {"single_line": True}),
|
||||||
logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text)
|
("single-word", {"single_word": True}),
|
||||||
|
]:
|
||||||
|
raw_text, _ = self._perform_ocr(image_bytes, **kwargs)
|
||||||
|
logger.debug("%s%s raw text: '%s'", tag, mode_name, raw_text)
|
||||||
candidates = vin_validator.extract_candidates(raw_text)
|
candidates = vin_validator.extract_candidates(raw_text)
|
||||||
logger.debug("%sPSM %d candidates: %s", tag, psm, candidates)
|
logger.debug("%s%s candidates: %s", tag, mode_name, candidates)
|
||||||
if candidates:
|
if candidates:
|
||||||
return candidates
|
return candidates
|
||||||
|
|
||||||
|
|||||||
@@ -93,7 +93,7 @@ class VinPreprocessor:
|
|||||||
gray = cv_image
|
gray = cv_image
|
||||||
steps_applied.append("grayscale")
|
steps_applied.append("grayscale")
|
||||||
|
|
||||||
# Upscale small images for better OCR (Tesseract needs ~300 DPI)
|
# Upscale small images for better OCR (~300 DPI recommended)
|
||||||
gray = self._ensure_minimum_resolution(gray)
|
gray = self._ensure_minimum_resolution(gray)
|
||||||
steps_applied.append("resolution_check")
|
steps_applied.append("resolution_check")
|
||||||
|
|
||||||
@@ -129,14 +129,14 @@ class VinPreprocessor:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Minimum width in pixels for reliable VIN OCR.
|
# Minimum width in pixels for reliable VIN OCR.
|
||||||
# A 17-char VIN needs ~30px per character for Tesseract accuracy.
|
# A 17-char VIN needs ~30px per character for reliable OCR accuracy.
|
||||||
MIN_WIDTH_FOR_VIN = 600
|
MIN_WIDTH_FOR_VIN = 600
|
||||||
|
|
||||||
def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray:
|
def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray:
|
||||||
"""
|
"""
|
||||||
Upscale image if too small for reliable OCR.
|
Upscale image if too small for reliable OCR.
|
||||||
|
|
||||||
Tesseract works best at ~300 DPI. Mobile photos of VINs may have
|
OCR works best at ~300 DPI. Mobile photos of VINs may have
|
||||||
the text occupy only a small portion of the frame, resulting in
|
the text occupy only a small portion of the frame, resulting in
|
||||||
low effective resolution for the VIN characters.
|
low effective resolution for the VIN characters.
|
||||||
"""
|
"""
|
||||||
@@ -160,7 +160,7 @@ class VinPreprocessor:
|
|||||||
Colored backgrounds have a low min value (e.g. green sticker:
|
Colored backgrounds have a low min value (e.g. green sticker:
|
||||||
min(130,230,150) = 130) → inverted to 125 (medium gray).
|
min(130,230,150) = 130) → inverted to 125 (medium gray).
|
||||||
|
|
||||||
The inversion ensures Tesseract always receives dark-text-on-
|
The inversion ensures the OCR engine always receives dark-text-on-
|
||||||
light-background, which is the polarity it expects.
|
light-background, which is the polarity it expects.
|
||||||
"""
|
"""
|
||||||
b_channel, g_channel, r_channel = cv2.split(bgr_image)
|
b_channel, g_channel, r_channel = cv2.split(bgr_image)
|
||||||
@@ -168,8 +168,8 @@ class VinPreprocessor:
|
|||||||
min_channel = np.minimum(np.minimum(b_channel, g_channel), r_channel)
|
min_channel = np.minimum(np.minimum(b_channel, g_channel), r_channel)
|
||||||
|
|
||||||
# Invert so white text (min=255) becomes black (0) and colored
|
# Invert so white text (min=255) becomes black (0) and colored
|
||||||
# backgrounds (min~130) become lighter gray (~125). Tesseract
|
# backgrounds (min~130) become lighter gray (~125). OCR engines
|
||||||
# expects dark text on light background.
|
# expect dark text on light background.
|
||||||
inverted = cv2.bitwise_not(min_channel)
|
inverted = cv2.bitwise_not(min_channel)
|
||||||
|
|
||||||
gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
|
gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
|
||||||
|
|||||||
@@ -1,15 +1,14 @@
|
|||||||
"""Core OCR service using Tesseract with HEIC support."""
|
"""Core OCR service with HEIC support, using pluggable engine abstraction."""
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import magic
|
import magic
|
||||||
import pytesseract
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from pillow_heif import register_heif_opener
|
from pillow_heif import register_heif_opener
|
||||||
|
|
||||||
from app.config import settings
|
from app.engines import OcrConfig, create_engine
|
||||||
from app.models import DocumentType, ExtractedField, OcrResponse
|
from app.models import DocumentType, ExtractedField, OcrResponse
|
||||||
from app.services.preprocessor import preprocessor
|
from app.services.preprocessor import preprocessor
|
||||||
|
|
||||||
@@ -32,8 +31,8 @@ class OcrService:
|
|||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
"""Initialize OCR service."""
|
"""Initialize OCR service with engine from factory."""
|
||||||
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
|
self._engine = create_engine()
|
||||||
|
|
||||||
def extract(
|
def extract(
|
||||||
self,
|
self,
|
||||||
@@ -86,14 +85,11 @@ class OcrService:
|
|||||||
file_bytes, deskew=True, denoise=True
|
file_bytes, deskew=True, denoise=True
|
||||||
)
|
)
|
||||||
|
|
||||||
# Perform OCR
|
# Perform OCR via engine abstraction
|
||||||
image = Image.open(io.BytesIO(file_bytes))
|
config = OcrConfig()
|
||||||
ocr_data = pytesseract.image_to_data(
|
result = self._engine.recognize(file_bytes, config)
|
||||||
image, output_type=pytesseract.Output.DICT
|
raw_text = result.text
|
||||||
)
|
confidence = result.confidence
|
||||||
|
|
||||||
# Extract text and calculate confidence
|
|
||||||
raw_text, confidence = self._process_ocr_data(ocr_data)
|
|
||||||
|
|
||||||
# Detect document type from content
|
# Detect document type from content
|
||||||
document_type = self._detect_document_type(raw_text)
|
document_type = self._detect_document_type(raw_text)
|
||||||
@@ -160,26 +156,6 @@ class OcrService:
|
|||||||
|
|
||||||
return b""
|
return b""
|
||||||
|
|
||||||
def _process_ocr_data(
|
|
||||||
self, ocr_data: dict
|
|
||||||
) -> tuple[str, float]:
|
|
||||||
"""Process Tesseract output to extract text and confidence."""
|
|
||||||
words = []
|
|
||||||
confidences = []
|
|
||||||
|
|
||||||
for i, text in enumerate(ocr_data["text"]):
|
|
||||||
# Filter out empty strings and low-confidence results
|
|
||||||
conf = int(ocr_data["conf"][i])
|
|
||||||
if text.strip() and conf > 0:
|
|
||||||
words.append(text)
|
|
||||||
confidences.append(conf)
|
|
||||||
|
|
||||||
raw_text = " ".join(words)
|
|
||||||
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
|
|
||||||
|
|
||||||
# Normalize confidence to 0-1 range (Tesseract returns 0-100)
|
|
||||||
return raw_text, avg_confidence / 100.0
|
|
||||||
|
|
||||||
def _detect_document_type(self, text: str) -> DocumentType:
|
def _detect_document_type(self, text: str) -> DocumentType:
|
||||||
"""Detect document type from extracted text content."""
|
"""Detect document type from extracted text content."""
|
||||||
text_lower = text.lower()
|
text_lower = text.lower()
|
||||||
|
|||||||
@@ -312,7 +312,7 @@ class TableDetector:
|
|||||||
Returns:
|
Returns:
|
||||||
2D list of cell contents
|
2D list of cell contents
|
||||||
"""
|
"""
|
||||||
# This would use Tesseract on the cropped region
|
# This would use OCR on the cropped region
|
||||||
# For now, return empty - actual OCR will be done in manual_extractor
|
# For now, return empty - actual OCR will be done in manual_extractor
|
||||||
logger.debug(f"Table region: ({table.x}, {table.y}) {table.width}x{table.height}")
|
logger.debug(f"Table region: ({table.x}, {table.y}) {table.width}x{table.height}")
|
||||||
return []
|
return []
|
||||||
|
|||||||
@@ -226,7 +226,7 @@ class VinValidator:
|
|||||||
Uses two strategies:
|
Uses two strategies:
|
||||||
1. Find continuous 11-20 char alphanumeric runs (handles intact VINs)
|
1. Find continuous 11-20 char alphanumeric runs (handles intact VINs)
|
||||||
2. Concatenate adjacent short fragments separated by spaces/dashes
|
2. Concatenate adjacent short fragments separated by spaces/dashes
|
||||||
(handles Tesseract fragmenting VINs into multiple words)
|
(handles OCR fragmenting VINs into multiple words)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: Raw OCR text
|
text: Raw OCR text
|
||||||
|
|||||||
@@ -14,7 +14,9 @@ opencv-python-headless>=4.8.0
|
|||||||
numpy>=1.24.0
|
numpy>=1.24.0
|
||||||
|
|
||||||
# OCR Engines
|
# OCR Engines
|
||||||
pytesseract>=0.3.10
|
paddlepaddle>=2.6.0
|
||||||
|
paddleocr>=2.8.0
|
||||||
|
google-cloud-vision>=3.7.0
|
||||||
|
|
||||||
# PDF Processing
|
# PDF Processing
|
||||||
PyMuPDF>=1.23.0
|
PyMuPDF>=1.23.0
|
||||||
|
|||||||
626
ocr/tests/test_engine_abstraction.py
Normal file
626
ocr/tests/test_engine_abstraction.py
Normal file
@@ -0,0 +1,626 @@
|
|||||||
|
"""Tests for OCR engine abstraction layer.
|
||||||
|
|
||||||
|
Covers: base types, exception hierarchy, PaddleOcrEngine,
|
||||||
|
CloudEngine, HybridEngine, and engine_factory.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import io
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from app.engines.base_engine import (
|
||||||
|
EngineError,
|
||||||
|
EngineProcessingError,
|
||||||
|
EngineUnavailableError,
|
||||||
|
OcrConfig,
|
||||||
|
OcrEngine,
|
||||||
|
OcrEngineResult,
|
||||||
|
WordBox,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# --- Helpers ---
|
||||||
|
|
||||||
|
|
||||||
|
def _create_test_image_bytes() -> bytes:
    """Return the encoded bytes of a small blank PNG for engine tests.

    The image is a 100x50 RGB canvas filled with white, encoded in memory.
    """
    buffer = io.BytesIO()
    Image.new("RGB", (100, 50), (255, 255, 255)).save(buffer, format="PNG")
    return buffer.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
def _make_result(
    text: str, confidence: float, engine_name: str
) -> OcrEngineResult:
    """Build an OcrEngineResult with an empty word-box list for tests."""
    fields = {
        "text": text,
        "confidence": confidence,
        "word_boxes": [],
        "engine_name": engine_name,
    }
    return OcrEngineResult(**fields)
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_paddle_result(
|
||||||
|
dt_polys: list, rec_texts: list[str], rec_scores: list[float]
|
||||||
|
) -> MagicMock:
|
||||||
|
"""Create a mock PaddleOCR v3.x predict() result object.
|
||||||
|
|
||||||
|
Wraps data under ``"res"`` key to match save_to_json format.
|
||||||
|
"""
|
||||||
|
result = MagicMock()
|
||||||
|
result.json = {
|
||||||
|
"res": {
|
||||||
|
"dt_polys": dt_polys,
|
||||||
|
"rec_texts": rec_texts,
|
||||||
|
"rec_scores": rec_scores,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Exception hierarchy
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestExceptionHierarchy:
    """Engine errors form a proper hierarchy under EngineError."""

    def test_unavailable_is_engine_error(self) -> None:
        """EngineUnavailableError derives from EngineError."""
        derived, base = EngineUnavailableError, EngineError
        assert issubclass(derived, base)

    def test_processing_is_engine_error(self) -> None:
        """EngineProcessingError derives from EngineError."""
        derived, base = EngineProcessingError, EngineError
        assert issubclass(derived, base)

    def test_engine_error_is_exception(self) -> None:
        """EngineError itself derives from the built-in Exception."""
        assert issubclass(EngineError, Exception)

    def test_catch_base_catches_subtypes(self) -> None:
        """Raising either subtype satisfies an expectation of EngineError."""
        subtype_errors = (
            EngineUnavailableError("not installed"),
            EngineProcessingError("OCR failed"),
        )
        for error in subtype_errors:
            with pytest.raises(EngineError):
                raise error
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Data types
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestWordBox:
    """Construction tests for the WordBox value type."""

    def test_default_positions(self) -> None:
        """Geometry fields are 0 when only text and confidence are given."""
        wb = WordBox(text="VIN", confidence=0.95)
        assert wb.x == 0
        assert wb.y == 0
        assert wb.width == 0
        assert wb.height == 0

    def test_all_fields(self) -> None:
        """Every constructor argument is stored on the matching attribute."""
        wb = WordBox(text="ABC", confidence=0.88, x=10, y=20, width=100, height=30)
        assert wb.text == "ABC"
        assert wb.confidence == 0.88
        assert wb.x == 10
        # y and height were previously unchecked even though the test is
        # named "all fields"; a swapped or dropped field would have passed.
        assert wb.y == 20
        assert wb.width == 100
        assert wb.height == 30
|
||||||
|
|
||||||
|
|
||||||
|
class TestOcrConfig:
    """Default values and instance isolation of OcrConfig."""

    def test_defaults(self) -> None:
        """A bare OcrConfig has no whitelist, no line/word mode, angle cls on."""
        defaults = OcrConfig()
        assert defaults.char_whitelist is None
        assert defaults.single_line is False
        assert defaults.single_word is False
        assert defaults.use_angle_cls is True
        assert defaults.hints == {}

    def test_vin_whitelist_excludes_ioq(self) -> None:
        """The VIN whitelist passed in contains none of I, O or Q."""
        vin_chars = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
        config = OcrConfig(char_whitelist=vin_chars)
        for excluded in ("I", "O", "Q"):
            assert excluded not in config.char_whitelist

    def test_hints_are_independent_across_instances(self) -> None:
        """Mutating one instance's hints does not leak into another."""
        first = OcrConfig()
        second = OcrConfig()
        first.hints["psm"] = 7
        assert "psm" not in second.hints
|
||||||
|
|
||||||
|
|
||||||
|
class TestOcrEngineResult:
    """Construction tests for OcrEngineResult."""

    def test_construction(self) -> None:
        """All four fields passed to the constructor are readable back."""
        vin = "1HGBH41JXMN109186"
        boxes = [WordBox(text=vin, confidence=0.94)]
        result = OcrEngineResult(
            text=vin,
            confidence=0.94,
            word_boxes=boxes,
            engine_name="paddleocr",
        )
        assert result.text == "1HGBH41JXMN109186"
        assert result.confidence == 0.94
        assert len(result.word_boxes) == 1
        assert result.engine_name == "paddleocr"

    def test_empty_result(self) -> None:
        """An empty result keeps its empty text and empty word-box list."""
        empty = OcrEngineResult(
            text="", confidence=0.0, word_boxes=[], engine_name="paddleocr"
        )
        assert empty.text == ""
        assert empty.word_boxes == []
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# OcrEngine ABC
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestOcrEngineABC:
    """OcrEngine is abstract; a subclass defining name and recognize is usable."""

    def test_cannot_instantiate_directly(self) -> None:
        """Instantiating the abstract base raises TypeError."""
        with pytest.raises(TypeError):
            OcrEngine()  # type: ignore[abstract]

    def test_concrete_subclass_works(self) -> None:
        """A subclass implementing both members can be built and called."""

        class StubEngine(OcrEngine):
            @property
            def name(self) -> str:
                return "stub"

            def recognize(
                self, image_bytes: bytes, config: OcrConfig
            ) -> OcrEngineResult:
                canned = OcrEngineResult(
                    text="ok", confidence=1.0, word_boxes=[], engine_name="stub"
                )
                return canned

        stub = StubEngine()
        assert stub.name == "stub"
        outcome = stub.recognize(b"", OcrConfig())
        assert outcome.text == "ok"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# PaddleOcrEngine
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestPaddleOcrEngine:
    """Tests for PaddleOcrEngine with the underlying PaddleOCR object mocked."""

    @staticmethod
    def _engine_with_predictions(*paddle_results: MagicMock):
        """Return a PaddleOcrEngine whose ``_ocr.predict`` yields ``paddle_results``.

        The engine class is imported locally, as every test in this class does.
        ``predict`` returns a one-shot iterator, so the engine built here
        supports a single ``recognize`` call.
        """
        from app.engines.paddle_engine import PaddleOcrEngine

        engine = PaddleOcrEngine()
        backend = MagicMock()
        backend.predict.return_value = iter(paddle_results)
        engine._ocr = backend
        return engine

    def test_name(self) -> None:
        """The engine reports its name as "paddleocr"."""
        from app.engines.paddle_engine import PaddleOcrEngine

        engine = PaddleOcrEngine()
        assert engine.name == "paddleocr"

    def test_lazy_init_not_loaded_at_construction(self) -> None:
        """Constructing the engine leaves the backend handle unset."""
        from app.engines.paddle_engine import PaddleOcrEngine

        engine = PaddleOcrEngine()
        assert engine._ocr is None

    def test_recognize_empty_results(self) -> None:
        """A prediction with no detections maps to an empty result."""
        engine = self._engine_with_predictions(
            _mock_paddle_result(dt_polys=[], rec_texts=[], rec_scores=[])
        )

        result = engine.recognize(_create_test_image_bytes(), OcrConfig())
        assert result.text == ""
        assert result.confidence == 0.0
        assert result.word_boxes == []
        assert result.engine_name == "paddleocr"

    def test_recognize_with_results(self) -> None:
        """Two detections are joined with a space and their scores averaged."""
        engine = self._engine_with_predictions(
            _mock_paddle_result(
                dt_polys=[
                    [[10, 20], [110, 20], [110, 50], [10, 50]],
                    [[10, 60], [110, 60], [110, 90], [10, 90]],
                ],
                rec_texts=["HELLO", "WORLD"],
                rec_scores=[0.95, 0.88],
            )
        )

        result = engine.recognize(_create_test_image_bytes(), OcrConfig())
        assert result.text == "HELLO WORLD"
        assert abs(result.confidence - 0.915) < 0.01
        assert len(result.word_boxes) == 2
        assert result.word_boxes[0].text == "HELLO"
        assert result.word_boxes[0].confidence == 0.95
        assert result.word_boxes[1].text == "WORLD"
        assert result.engine_name == "paddleocr"

    def test_recognize_whitelist_filters_characters(self) -> None:
        """Characters outside ``char_whitelist`` are removed from the text."""
        engine = self._engine_with_predictions(
            _mock_paddle_result(
                dt_polys=[[[0, 0], [100, 0], [100, 30], [0, 30]]],
                rec_texts=["1HG-BH4!"],
                rec_scores=[0.9],
            )
        )

        config = OcrConfig(char_whitelist="ABCDEFGHJKLMNPRSTUVWXYZ0123456789")
        result = engine.recognize(_create_test_image_bytes(), config)
        assert "-" not in result.text
        assert "!" not in result.text
        assert result.word_boxes[0].text == "1HGBH4"

    def test_recognize_quadrilateral_to_bounding_box(self) -> None:
        """A four-point polygon is reduced to its axis-aligned bounding box."""
        # Slightly rotated quad: min x=8, min y=20, max x=110, max y=55
        engine = self._engine_with_predictions(
            _mock_paddle_result(
                dt_polys=[[[10, 20], [110, 25], [108, 55], [8, 50]]],
                rec_texts=["TEXT"],
                rec_scores=[0.9],
            )
        )

        result = engine.recognize(_create_test_image_bytes(), OcrConfig())
        wb = result.word_boxes[0]
        assert wb.x == 8
        assert wb.y == 20
        assert wb.width == 102  # 110 - 8
        assert wb.height == 35  # 55 - 20

    def test_recognize_skips_empty_after_whitelist(self) -> None:
        """Text consisting only of non-whitelisted characters is skipped."""
        engine = self._engine_with_predictions(
            _mock_paddle_result(
                dt_polys=[[[0, 0], [50, 0], [50, 20], [0, 20]]],
                rec_texts=["---"],
                rec_scores=[0.9],
            )
        )

        config = OcrConfig(char_whitelist="ABC")
        result = engine.recognize(_create_test_image_bytes(), config)
        assert result.text == ""
        assert result.word_boxes == []
        assert result.confidence == 0.0

    def test_import_error_raises_unavailable(self) -> None:
        """A failing ``paddleocr`` import surfaces as EngineUnavailableError."""
        from app.engines.paddle_engine import PaddleOcrEngine

        engine = PaddleOcrEngine()
        engine._ocr = None
        # A None entry in sys.modules makes any import of that name raise
        # ModuleNotFoundError (an ImportError), whether it is issued by an
        # ``import`` statement or by importlib.import_module. That alone
        # simulates the missing package, so no patching of builtins.__import__
        # or of the engine module's attributes is needed.
        with patch.dict("sys.modules", {"paddleocr": None}):
            with pytest.raises(EngineUnavailableError, match="paddleocr"):
                engine._get_ocr()

    def test_processing_error_on_exception(self) -> None:
        """An exception from ``predict`` is wrapped in EngineProcessingError."""
        from app.engines.paddle_engine import PaddleOcrEngine

        engine = PaddleOcrEngine()
        backend = MagicMock()
        backend.predict.side_effect = RuntimeError("OCR crashed")
        engine._ocr = backend

        with pytest.raises(EngineProcessingError, match="PaddleOCR recognition failed"):
            engine.recognize(_create_test_image_bytes(), OcrConfig())
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CloudEngine
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestCloudEngine:
    """Tests for CloudEngine with the Google Vision client mocked."""

    @staticmethod
    def _fake_google_modules() -> dict:
        """Return sys.modules entries that stand in for the google.cloud packages."""
        return {
            "google.cloud.vision": MagicMock(),
            "google.cloud": MagicMock(),
            "google": MagicMock(),
        }

    def test_name(self) -> None:
        """The engine reports its name as "google_vision"."""
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/path.json")
        assert engine.name == "google_vision"

    def test_lazy_init_not_loaded_at_construction(self) -> None:
        """Constructing the engine leaves the client handle unset."""
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/path.json")
        assert engine._client is None

    def test_missing_key_file_raises_unavailable(self) -> None:
        """A non-existent key path makes client creation fail as unavailable."""
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/nonexistent/key.json")
        with pytest.raises(EngineUnavailableError, match="key not found"):
            engine._get_client()

    @patch("os.path.isfile", return_value=True)
    def test_missing_library_raises_unavailable(self, _mock_isfile: MagicMock) -> None:
        """A failing google.cloud import surfaces as EngineUnavailableError."""
        import builtins

        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/key.json")

        # Capture the real importer BEFORE patching. Looking up ``__import__``
        # inside mock_import would resolve to the patched builtin and call
        # mock_import again, recursing without bound for any module other
        # than google.cloud.
        real_import = builtins.__import__

        def mock_import(name, *args, **kwargs):
            if "google.cloud" in name:
                raise ImportError("No module named 'google.cloud'")
            return real_import(name, *args, **kwargs)

        with patch("builtins.__import__", side_effect=mock_import):
            with pytest.raises(EngineUnavailableError, match="google-cloud-vision"):
                engine._get_client()

    def test_recognize_empty_annotations(self) -> None:
        """A response with no text annotations maps to an empty result."""
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/key.json")
        mock_client = MagicMock()
        mock_response = MagicMock()
        mock_response.error.message = ""
        mock_response.text_annotations = []
        mock_client.text_detection.return_value = mock_response
        engine._client = mock_client

        # Mock the google.cloud.vision import inside recognize()
        with patch.dict("sys.modules", self._fake_google_modules()):
            result = engine.recognize(b"fake_image", OcrConfig())
        assert result.text == ""
        assert result.confidence == 0.0
        assert result.engine_name == "google_vision"

    def test_recognize_api_error_raises_processing_error(self) -> None:
        """A non-empty response error message raises EngineProcessingError."""
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/key.json")
        mock_client = MagicMock()
        mock_response = MagicMock()
        mock_response.error.message = "API quota exceeded"
        mock_client.text_detection.return_value = mock_response
        engine._client = mock_client

        with patch.dict("sys.modules", self._fake_google_modules()):
            with pytest.raises(EngineProcessingError, match="API quota exceeded"):
                engine.recognize(b"fake_image", OcrConfig())
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# HybridEngine
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestHybridEngine:
|
||||||
|
def test_name_with_fallback(self) -> None:
    """The hybrid name embeds both the primary and the fallback engine name."""
    from app.engines.hybrid_engine import HybridEngine

    paddle = MagicMock(spec=OcrEngine)
    vision = MagicMock(spec=OcrEngine)
    paddle.name, vision.name = "paddleocr", "google_vision"

    hybrid = HybridEngine(primary=paddle, fallback=vision)
    assert hybrid.name == "hybrid(paddleocr+google_vision)"
|
||||||
|
|
||||||
|
def test_name_without_fallback(self) -> None:
    """With no fallback engine the name's second slot reads "none"."""
    from app.engines.hybrid_engine import HybridEngine

    paddle = MagicMock(spec=OcrEngine)
    paddle.name = "paddleocr"

    hybrid = HybridEngine(primary=paddle)
    assert hybrid.name == "hybrid(paddleocr+none)"
|
||||||
|
|
||||||
|
def test_high_confidence_skips_fallback(self) -> None:
    """A 0.95 primary result against a 0.6 threshold never calls the fallback."""
    from app.engines.hybrid_engine import HybridEngine

    paddle = MagicMock(spec=OcrEngine)
    cloud = MagicMock(spec=OcrEngine)
    paddle.name, cloud.name = "paddleocr", "cloud"
    paddle.recognize.return_value = _make_result("VIN123", 0.95, "paddleocr")

    hybrid = HybridEngine(primary=paddle, fallback=cloud, threshold=0.6)
    outcome = hybrid.recognize(b"img", OcrConfig())

    assert outcome.text == "VIN123"
    assert outcome.engine_name == "paddleocr"
    cloud.recognize.assert_not_called()
|
||||||
|
|
||||||
|
def test_low_confidence_triggers_fallback(self) -> None:
    """A 0.3 primary result is replaced by the fallback's 0.92 result."""
    from app.engines.hybrid_engine import HybridEngine

    paddle = MagicMock(spec=OcrEngine)
    vision = MagicMock(spec=OcrEngine)
    paddle.name, vision.name = "paddleocr", "google_vision"
    paddle.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
    vision.recognize.return_value = _make_result("VIN456", 0.92, "google_vision")

    hybrid = HybridEngine(primary=paddle, fallback=vision, threshold=0.6)
    outcome = hybrid.recognize(b"img", OcrConfig())

    assert outcome.text == "VIN456"
    assert outcome.engine_name == "google_vision"
    vision.recognize.assert_called_once()
|
||||||
|
|
||||||
|
def test_low_confidence_no_fallback_returns_primary(self) -> None:
    """Below threshold with ``fallback=None`` the primary text is returned."""
    from app.engines.hybrid_engine import HybridEngine

    paddle = MagicMock(spec=OcrEngine)
    paddle.name = "paddleocr"
    paddle.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")

    hybrid = HybridEngine(primary=paddle, fallback=None, threshold=0.6)
    outcome = hybrid.recognize(b"img", OcrConfig())

    assert outcome.text == "VIN123"
|
||||||
|
|
||||||
|
def test_fallback_lower_confidence_returns_primary(self) -> None:
    """When the fallback scores 0.3 against the primary's 0.4, primary text wins."""
    from app.engines.hybrid_engine import HybridEngine

    paddle = MagicMock(spec=OcrEngine)
    vision = MagicMock(spec=OcrEngine)
    paddle.name, vision.name = "paddleocr", "google_vision"
    paddle.recognize.return_value = _make_result("VIN123", 0.4, "paddleocr")
    vision.recognize.return_value = _make_result("VIN456", 0.3, "google_vision")

    hybrid = HybridEngine(primary=paddle, fallback=vision, threshold=0.6)
    outcome = hybrid.recognize(b"img", OcrConfig())

    assert outcome.text == "VIN123"
|
||||||
|
|
||||||
|
def test_fallback_engine_error_returns_primary(self) -> None:
|
||||||
|
from app.engines.hybrid_engine import HybridEngine
|
||||||
|
|
||||||
|
primary = MagicMock(spec=OcrEngine)
|
||||||
|
fallback = MagicMock(spec=OcrEngine)
|
||||||
|
primary.name = "paddleocr"
|
||||||
|
fallback.name = "google_vision"
|
||||||
|
primary.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
|
||||||
|
fallback.recognize.side_effect = EngineUnavailableError("key missing")
|
||||||
|
|
||||||
|
engine = HybridEngine(primary=primary, fallback=fallback, threshold=0.6)
|
||||||
|
result = engine.recognize(b"img", OcrConfig())
|
||||||
|
assert result.text == "VIN123"
|
||||||
|
|
||||||
|
def test_fallback_unexpected_error_returns_primary(self) -> None:
|
||||||
|
from app.engines.hybrid_engine import HybridEngine
|
||||||
|
|
||||||
|
primary = MagicMock(spec=OcrEngine)
|
||||||
|
fallback = MagicMock(spec=OcrEngine)
|
||||||
|
primary.name = "paddleocr"
|
||||||
|
fallback.name = "google_vision"
|
||||||
|
primary.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
|
||||||
|
fallback.recognize.side_effect = RuntimeError("network error")
|
||||||
|
|
||||||
|
engine = HybridEngine(primary=primary, fallback=fallback, threshold=0.6)
|
||||||
|
result = engine.recognize(b"img", OcrConfig())
|
||||||
|
assert result.text == "VIN123"
|
||||||
|
|
||||||
|
@patch("app.engines.hybrid_engine.time")
|
||||||
|
def test_fallback_timeout_returns_primary(self, mock_time: MagicMock) -> None:
|
||||||
|
from app.engines.hybrid_engine import HybridEngine
|
||||||
|
|
||||||
|
primary = MagicMock(spec=OcrEngine)
|
||||||
|
fallback = MagicMock(spec=OcrEngine)
|
||||||
|
primary.name = "paddleocr"
|
||||||
|
fallback.name = "google_vision"
|
||||||
|
primary.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
|
||||||
|
fallback.recognize.return_value = _make_result("VIN456", 0.92, "google_vision")
|
||||||
|
# Simulate 6-second delay (exceeds 5s limit)
|
||||||
|
mock_time.monotonic.side_effect = [0.0, 6.0]
|
||||||
|
|
||||||
|
engine = HybridEngine(primary=primary, fallback=fallback, threshold=0.6)
|
||||||
|
result = engine.recognize(b"img", OcrConfig())
|
||||||
|
assert result.text == "VIN123" # timeout -> use primary
|
||||||
|
|
||||||
|
def test_exact_threshold_skips_fallback(self) -> None:
|
||||||
|
"""When confidence == threshold, no fallback needed (>= check)."""
|
||||||
|
from app.engines.hybrid_engine import HybridEngine
|
||||||
|
|
||||||
|
primary = MagicMock(spec=OcrEngine)
|
||||||
|
fallback = MagicMock(spec=OcrEngine)
|
||||||
|
primary.name = "paddleocr"
|
||||||
|
fallback.name = "cloud"
|
||||||
|
primary.recognize.return_value = _make_result("VIN", 0.6, "paddleocr")
|
||||||
|
|
||||||
|
engine = HybridEngine(primary=primary, fallback=fallback, threshold=0.6)
|
||||||
|
result = engine.recognize(b"img", OcrConfig())
|
||||||
|
assert result.engine_name == "paddleocr"
|
||||||
|
fallback.recognize.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Engine factory
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestEngineFactory:
|
||||||
|
def test_unknown_engine_raises(self) -> None:
|
||||||
|
from app.engines.engine_factory import _create_single_engine
|
||||||
|
|
||||||
|
with pytest.raises(EngineUnavailableError, match="Unknown engine"):
|
||||||
|
_create_single_engine("nonexistent")
|
||||||
|
|
||||||
|
@patch("app.engines.engine_factory.settings")
|
||||||
|
@patch("app.engines.engine_factory._create_single_engine")
|
||||||
|
def test_defaults_to_settings_primary(
|
||||||
|
self, mock_create: MagicMock, mock_settings: MagicMock
|
||||||
|
) -> None:
|
||||||
|
mock_settings.ocr_primary_engine = "paddleocr"
|
||||||
|
mock_settings.ocr_fallback_engine = "none"
|
||||||
|
mock_engine = MagicMock(spec=OcrEngine)
|
||||||
|
mock_create.return_value = mock_engine
|
||||||
|
|
||||||
|
from app.engines.engine_factory import create_engine
|
||||||
|
|
||||||
|
result = create_engine()
|
||||||
|
mock_create.assert_called_once_with("paddleocr")
|
||||||
|
assert result == mock_engine
|
||||||
|
|
||||||
|
@patch("app.engines.engine_factory.settings")
|
||||||
|
@patch("app.engines.engine_factory._create_single_engine")
|
||||||
|
def test_explicit_name_overrides_settings(
|
||||||
|
self, mock_create: MagicMock, mock_settings: MagicMock
|
||||||
|
) -> None:
|
||||||
|
mock_settings.ocr_fallback_engine = "none"
|
||||||
|
mock_engine = MagicMock(spec=OcrEngine)
|
||||||
|
mock_create.return_value = mock_engine
|
||||||
|
|
||||||
|
from app.engines.engine_factory import create_engine
|
||||||
|
|
||||||
|
create_engine("google_vision")
|
||||||
|
mock_create.assert_called_once_with("google_vision")
|
||||||
|
|
||||||
|
@patch("app.engines.engine_factory.settings")
|
||||||
|
@patch("app.engines.engine_factory._create_single_engine")
|
||||||
|
def test_creates_hybrid_when_fallback_configured(
|
||||||
|
self, mock_create: MagicMock, mock_settings: MagicMock
|
||||||
|
) -> None:
|
||||||
|
mock_settings.ocr_primary_engine = "paddleocr"
|
||||||
|
mock_settings.ocr_fallback_engine = "google_vision"
|
||||||
|
mock_settings.ocr_fallback_threshold = 0.7
|
||||||
|
mock_primary = MagicMock(spec=OcrEngine)
|
||||||
|
mock_fallback = MagicMock(spec=OcrEngine)
|
||||||
|
mock_create.side_effect = [mock_primary, mock_fallback]
|
||||||
|
|
||||||
|
from app.engines.engine_factory import create_engine
|
||||||
|
from app.engines.hybrid_engine import HybridEngine
|
||||||
|
|
||||||
|
result = create_engine()
|
||||||
|
assert isinstance(result, HybridEngine)
|
||||||
|
|
||||||
|
@patch("app.engines.engine_factory.settings")
|
||||||
|
@patch("app.engines.engine_factory._create_single_engine")
|
||||||
|
def test_fallback_failure_returns_primary_only(
|
||||||
|
self, mock_create: MagicMock, mock_settings: MagicMock
|
||||||
|
) -> None:
|
||||||
|
mock_settings.ocr_primary_engine = "paddleocr"
|
||||||
|
mock_settings.ocr_fallback_engine = "google_vision"
|
||||||
|
mock_settings.ocr_fallback_threshold = 0.6
|
||||||
|
mock_primary = MagicMock(spec=OcrEngine)
|
||||||
|
mock_create.side_effect = [mock_primary, EngineUnavailableError("no key")]
|
||||||
|
|
||||||
|
from app.engines.engine_factory import create_engine
|
||||||
|
|
||||||
|
result = create_engine()
|
||||||
|
assert result == mock_primary
|
||||||
@@ -39,14 +39,9 @@ def test_pillow_heif_can_register():
|
|||||||
assert "HEIF" in Image.registered_extensions().values()
|
assert "HEIF" in Image.registered_extensions().values()
|
||||||
|
|
||||||
|
|
||||||
def test_tesseract_available():
|
def test_paddleocr_engine_available():
|
||||||
"""Tesseract OCR is available and can process images."""
|
"""PaddleOCR engine can be created."""
|
||||||
import pytesseract
|
from app.engines.paddle_engine import PaddleOcrEngine
|
||||||
|
|
||||||
# Create a simple test image with text
|
engine = PaddleOcrEngine()
|
||||||
img = Image.new("RGB", (200, 50), color="white")
|
assert engine.name == "paddleocr"
|
||||||
|
|
||||||
# Verify pytesseract can call tesseract (will return empty string for blank image)
|
|
||||||
result = pytesseract.image_to_string(img)
|
|
||||||
# Just verify it doesn't raise an exception - blank image returns empty/whitespace
|
|
||||||
assert isinstance(result, str)
|
|
||||||
|
|||||||
@@ -1,11 +1,12 @@
|
|||||||
"""Integration tests for VIN extraction endpoint."""
|
"""Integration tests for VIN extraction endpoint and engine integration."""
|
||||||
import io
|
import io
|
||||||
from unittest.mock import patch, MagicMock
|
from unittest.mock import patch, MagicMock
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from fastapi.testclient import TestClient
|
from fastapi.testclient import TestClient
|
||||||
from PIL import Image, ImageDraw, ImageFont
|
from PIL import Image, ImageDraw
|
||||||
|
|
||||||
|
from app.engines.base_engine import OcrConfig, OcrEngineResult, WordBox
|
||||||
from app.main import app
|
from app.main import app
|
||||||
|
|
||||||
|
|
||||||
@@ -240,3 +241,106 @@ class TestVinExtractionContentTypes:
|
|||||||
)
|
)
|
||||||
|
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# VIN extractor engine integration tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestVinExtractorEngineIntegration:
|
||||||
|
"""Tests verifying VinExtractor integrates correctly with engine abstraction."""
|
||||||
|
|
||||||
|
@patch("app.extractors.vin_extractor.create_engine")
|
||||||
|
def test_perform_ocr_calls_engine_with_vin_config(
|
||||||
|
self, mock_create_engine: MagicMock
|
||||||
|
) -> None:
|
||||||
|
"""_perform_ocr passes VIN whitelist and angle_cls to engine."""
|
||||||
|
from app.extractors.vin_extractor import VinExtractor
|
||||||
|
|
||||||
|
mock_engine = MagicMock()
|
||||||
|
mock_engine.recognize.return_value = OcrEngineResult(
|
||||||
|
text="1HGBH41JXMN109186",
|
||||||
|
confidence=0.94,
|
||||||
|
word_boxes=[WordBox(text="1HGBH41JXMN109186", confidence=0.94)],
|
||||||
|
engine_name="paddleocr",
|
||||||
|
)
|
||||||
|
mock_create_engine.return_value = mock_engine
|
||||||
|
|
||||||
|
extractor = VinExtractor()
|
||||||
|
text, confidences = extractor._perform_ocr(b"fake_image")
|
||||||
|
|
||||||
|
mock_engine.recognize.assert_called_once()
|
||||||
|
call_config = mock_engine.recognize.call_args[0][1]
|
||||||
|
assert isinstance(call_config, OcrConfig)
|
||||||
|
assert call_config.char_whitelist == VinExtractor.VIN_WHITELIST
|
||||||
|
assert call_config.use_angle_cls is True
|
||||||
|
assert call_config.single_line is False
|
||||||
|
assert call_config.single_word is False
|
||||||
|
assert text == "1HGBH41JXMN109186"
|
||||||
|
assert confidences == [0.94]
|
||||||
|
|
||||||
|
@patch("app.extractors.vin_extractor.create_engine")
|
||||||
|
def test_perform_ocr_single_line_mode(
|
||||||
|
self, mock_create_engine: MagicMock
|
||||||
|
) -> None:
|
||||||
|
"""_perform_ocr passes single_line flag to engine config."""
|
||||||
|
from app.extractors.vin_extractor import VinExtractor
|
||||||
|
|
||||||
|
mock_engine = MagicMock()
|
||||||
|
mock_engine.recognize.return_value = OcrEngineResult(
|
||||||
|
text="VIN123", confidence=0.9, word_boxes=[], engine_name="paddleocr"
|
||||||
|
)
|
||||||
|
mock_create_engine.return_value = mock_engine
|
||||||
|
|
||||||
|
extractor = VinExtractor()
|
||||||
|
extractor._perform_ocr(b"img", single_line=True)
|
||||||
|
|
||||||
|
call_config = mock_engine.recognize.call_args[0][1]
|
||||||
|
assert call_config.single_line is True
|
||||||
|
assert call_config.single_word is False
|
||||||
|
|
||||||
|
@patch("app.extractors.vin_extractor.create_engine")
|
||||||
|
def test_perform_ocr_single_word_mode(
|
||||||
|
self, mock_create_engine: MagicMock
|
||||||
|
) -> None:
|
||||||
|
"""_perform_ocr passes single_word flag to engine config."""
|
||||||
|
from app.extractors.vin_extractor import VinExtractor
|
||||||
|
|
||||||
|
mock_engine = MagicMock()
|
||||||
|
mock_engine.recognize.return_value = OcrEngineResult(
|
||||||
|
text="VIN123", confidence=0.9, word_boxes=[], engine_name="paddleocr"
|
||||||
|
)
|
||||||
|
mock_create_engine.return_value = mock_engine
|
||||||
|
|
||||||
|
extractor = VinExtractor()
|
||||||
|
extractor._perform_ocr(b"img", single_word=True)
|
||||||
|
|
||||||
|
call_config = mock_engine.recognize.call_args[0][1]
|
||||||
|
assert call_config.single_word is True
|
||||||
|
assert call_config.single_line is False
|
||||||
|
|
||||||
|
def test_calculate_base_confidence_empty_returns_default(self) -> None:
|
||||||
|
"""Empty word confidences return 0.5 default."""
|
||||||
|
from app.extractors.vin_extractor import VinExtractor
|
||||||
|
|
||||||
|
extractor = VinExtractor.__new__(VinExtractor)
|
||||||
|
assert extractor._calculate_base_confidence([]) == 0.5
|
||||||
|
|
||||||
|
def test_calculate_base_confidence_weighted_blend(self) -> None:
|
||||||
|
"""Confidence = 70% average + 30% minimum."""
|
||||||
|
from app.extractors.vin_extractor import VinExtractor
|
||||||
|
|
||||||
|
extractor = VinExtractor.__new__(VinExtractor)
|
||||||
|
# avg = (0.9 + 0.8) / 2 = 0.85, min = 0.8
|
||||||
|
# result = 0.7 * 0.85 + 0.3 * 0.8 = 0.595 + 0.24 = 0.835
|
||||||
|
result = extractor._calculate_base_confidence([0.9, 0.8])
|
||||||
|
assert abs(result - 0.835) < 0.001
|
||||||
|
|
||||||
|
def test_calculate_base_confidence_single_value(self) -> None:
|
||||||
|
"""Single confidence value: avg == min, so result equals that value."""
|
||||||
|
from app.extractors.vin_extractor import VinExtractor
|
||||||
|
|
||||||
|
extractor = VinExtractor.__new__(VinExtractor)
|
||||||
|
result = extractor._calculate_base_confidence([0.92])
|
||||||
|
assert abs(result - 0.92) < 0.001
|
||||||
|
|||||||
@@ -165,7 +165,7 @@ class TestVinValidator:
|
|||||||
"""Test candidate extraction handles space-fragmented VINs from OCR."""
|
"""Test candidate extraction handles space-fragmented VINs from OCR."""
|
||||||
validator = VinValidator()
|
validator = VinValidator()
|
||||||
|
|
||||||
# Tesseract often fragments VINs into multiple words
|
# OCR engines sometimes fragment VINs into multiple words
|
||||||
text = "1HGBH 41JXMN 109186"
|
text = "1HGBH 41JXMN 109186"
|
||||||
candidates = validator.extract_candidates(text)
|
candidates = validator.extract_candidates(text)
|
||||||
|
|
||||||
|
|||||||
18
secrets/app/google-vision-key.json.example
Normal file
18
secrets/app/google-vision-key.json.example
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
{
|
||||||
|
"_comment": "Google Vision API service account key for OCR cloud fallback",
|
||||||
|
"_instructions": [
|
||||||
|
"1. Create a Google Cloud service account with Vision API access",
|
||||||
|
"2. Download the JSON key file",
|
||||||
|
"3. Save it as secrets/app/google-vision-key.json (gitignored)",
|
||||||
|
"4. Uncomment the volume mount in docker-compose.yml",
|
||||||
|
"5. Set OCR_FALLBACK_ENGINE=google_vision"
|
||||||
|
],
|
||||||
|
"type": "service_account",
|
||||||
|
"project_id": "your-project-id",
|
||||||
|
"private_key_id": "",
|
||||||
|
"private_key": "",
|
||||||
|
"client_email": "your-sa@your-project-id.iam.gserviceaccount.com",
|
||||||
|
"client_id": "",
|
||||||
|
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
|
||||||
|
"token_uri": "https://oauth2.googleapis.com/token"
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user