feat: Improve OCR process - replace Tesseract with PaddleOCR (#115) #122

Merged
egullickson merged 16 commits from issue-115-improve-ocr-paddleocr into main 2026-02-08 01:13:35 +00:00
35 changed files with 2560 additions and 232 deletions

View File

@@ -108,7 +108,7 @@
}, },
"mvp-ocr": { "mvp-ocr": {
"type": "ocr_service", "type": "ocr_service",
"description": "Python-based OCR for document text extraction", "description": "Python OCR service with pluggable engine abstraction (PaddleOCR PP-OCRv4 primary, optional Google Vision cloud fallback, Tesseract backward compat)",
"port": 8000 "port": 8000
}, },
"mvp-loki": { "mvp-loki": {

View File

@@ -45,7 +45,7 @@
"parent_issue": "The original feature issue. Tracks overall status. Only the parent gets status label transitions.", "parent_issue": "The original feature issue. Tracks overall status. Only the parent gets status label transitions.",
"sub_issue_title_format": "{type}: {summary} (#{parent_index})", "sub_issue_title_format": "{type}: {summary} (#{parent_index})",
"sub_issue_body": "First line must be 'Relates to #{parent_index}'. Each sub-issue is a self-contained unit of work.", "sub_issue_body": "First line must be 'Relates to #{parent_index}'. Each sub-issue is a self-contained unit of work.",
"sub_issue_labels": "status/backlog + same type/* as parent. Sub-issues stay in backlog; parent issue tracks status.", "sub_issue_labels": "status/in-progress + same type/* as parent. Sub-issues move to in-progress as they are worked on.",
"sub_issue_milestone": "Same sprint milestone as parent.", "sub_issue_milestone": "Same sprint milestone as parent.",
"rules": [ "rules": [
"ONE branch for the parent issue. Never create branches per sub-issue.", "ONE branch for the parent issue. Never create branches per sub-issue.",

View File

@@ -6,7 +6,7 @@ import type { JobResponse, OcrResponse, VinExtractionResponse } from '../domain/
/** OCR service configuration */ /** OCR service configuration */
const OCR_SERVICE_URL = process.env.OCR_SERVICE_URL || 'http://mvp-ocr:8000'; const OCR_SERVICE_URL = process.env.OCR_SERVICE_URL || 'http://mvp-ocr:8000';
const OCR_TIMEOUT_MS = 30000; // 30 seconds for sync operations const OCR_TIMEOUT_MS = 120000; // 120 seconds for sync operations (PaddleOCR model loading on first call)
/** /**
* HTTP client for communicating with the OCR service. * HTTP client for communicating with the OCR service.

View File

@@ -38,13 +38,17 @@ services:
STRIPE_ENTERPRISE_MONTHLY_PRICE_ID: prod_Toj8xGEui9jl6j STRIPE_ENTERPRISE_MONTHLY_PRICE_ID: prod_Toj8xGEui9jl6j
STRIPE_ENTERPRISE_YEARLY_PRICE_ID: prod_Toj9A7A773xrdn STRIPE_ENTERPRISE_YEARLY_PRICE_ID: prod_Toj9A7A773xrdn
# OCR - Production log level # OCR - Production log level + engine config
mvp-ocr: mvp-ocr:
environment: environment:
LOG_LEVEL: error LOG_LEVEL: error
REDIS_HOST: mvp-redis REDIS_HOST: mvp-redis
REDIS_PORT: 6379 REDIS_PORT: 6379
REDIS_DB: 1 REDIS_DB: 1
OCR_PRIMARY_ENGINE: paddleocr
OCR_FALLBACK_ENGINE: ${OCR_FALLBACK_ENGINE:-none}
OCR_FALLBACK_THRESHOLD: ${OCR_FALLBACK_THRESHOLD:-0.6}
GOOGLE_VISION_KEY_PATH: /run/secrets/google-vision-key.json
# PostgreSQL - Remove dev ports, production log level # PostgreSQL - Remove dev ports, production log level
mvp-postgres: mvp-postgres:

View File

@@ -63,6 +63,15 @@ services:
mvp-ocr: mvp-ocr:
image: ${OCR_IMAGE:-git.motovaultpro.com/egullickson/ocr:latest} image: ${OCR_IMAGE:-git.motovaultpro.com/egullickson/ocr:latest}
container_name: mvp-ocr-staging container_name: mvp-ocr-staging
environment:
LOG_LEVEL: debug
REDIS_HOST: mvp-redis
REDIS_PORT: 6379
REDIS_DB: 1
OCR_PRIMARY_ENGINE: paddleocr
OCR_FALLBACK_ENGINE: ${OCR_FALLBACK_ENGINE:-none}
OCR_FALLBACK_THRESHOLD: ${OCR_FALLBACK_THRESHOLD:-0.6}
GOOGLE_VISION_KEY_PATH: /run/secrets/google-vision-key.json
# ======================================== # ========================================
# PostgreSQL (Staging - Separate Database) # PostgreSQL (Staging - Separate Database)

View File

@@ -193,8 +193,16 @@ services:
REDIS_HOST: mvp-redis REDIS_HOST: mvp-redis
REDIS_PORT: 6379 REDIS_PORT: 6379
REDIS_DB: 1 REDIS_DB: 1
# OCR engine configuration (PaddleOCR primary, cloud fallback optional)
OCR_PRIMARY_ENGINE: paddleocr
OCR_FALLBACK_ENGINE: ${OCR_FALLBACK_ENGINE:-none}
OCR_FALLBACK_THRESHOLD: ${OCR_FALLBACK_THRESHOLD:-0.6}
GOOGLE_VISION_KEY_PATH: /run/secrets/google-vision-key.json
volumes: volumes:
- /tmp/vin-debug:/tmp/vin-debug - /tmp/vin-debug:/tmp/vin-debug
# Optional: Uncomment to enable Google Vision cloud fallback.
# Requires: secrets/app/google-vision-key.json and OCR_FALLBACK_ENGINE=google_vision
# - ./secrets/app/google-vision-key.json:/run/secrets/google-vision-key.json:ro
networks: networks:
- backend - backend
- database - database

View File

@@ -18,5 +18,5 @@
| `AUDIT.md` | Audit documentation | Security audits, compliance | | `AUDIT.md` | Audit documentation | Security audits, compliance |
| `MVP-COLOR-SCHEME.md` | Color scheme reference | UI styling decisions | | `MVP-COLOR-SCHEME.md` | Color scheme reference | UI styling decisions |
| `LOGGING.md` | Unified logging system | Log levels, correlation IDs, Grafana | | `LOGGING.md` | Unified logging system | Log levels, correlation IDs, Grafana |
| `ocr-pipeline-tech-stack.md` | OCR pipeline technology decisions | OCR architecture, Tesseract setup | | `ocr-pipeline-tech-stack.md` | OCR pipeline technology decisions | OCR architecture, PaddleOCR engine abstraction |
| `TIER-GATING.md` | Subscription tier gating rules | Feature access by tier, vehicle limits | | `TIER-GATING.md` | Subscription tier gating rules | Feature access by tier, vehicle limits |

View File

@@ -118,35 +118,48 @@
│ ├─────────────────────────────────────────────────────────┤ │ ├─────────────────────────────────────────────────────────┤
│ │ │ │ │ │
│ │ ┌─────────────────────────────────────────────────┐ │ │ │ ┌─────────────────────────────────────────────────┐ │
│ │ │ 5a. Primary OCR: Tesseract 5.x │ │ │ │ │ 5a. Engine Abstraction Layer │ │
│ │ │ │ │ │ │ │ │ │
│ │ │ Engine: LSTM (--oem 1) │ │ │ │ │ OcrEngine ABC -> PaddleOcrEngine (primary) │ │
│ │ │ • Page segmentation: Auto (--psm 3) │ │ │ │ │ -> CloudEngine (optional fallback) │ │
│ │ │ • Output: hOCR with word confidence │ │ │ │ -> TesseractEngine (backward compat)│
│ │ │ -> HybridEngine (primary+fallback) │ │
│ │ └─────────────────────────────────────────────────┘ │
│ │ │ │
│ │ ▼ │
│ │ ┌─────────────────────────────────────────────────┐ │
│ │ │ 5b. Primary OCR: PaddleOCR PP-OCRv4 │ │
│ │ │ │ │
│ │ │ • Scene text detection + angle classification │ │
│ │ │ • CPU-only, models baked into Docker image │ │
│ │ │ • Normalized output: text, confidence, word boxes│ │
│ │ └─────────────────────────────────────────────────┘ │ │ │ └─────────────────────────────────────────────────┘ │
│ │ │ │ │ │ │ │
│ │ ▼ │ │ │ ▼ │
│ │ ┌───────────────┐ │ │ │ ┌───────────────┐ │
│ │ │ Confidence │ │ │ │ │ Confidence │ │
│ │ │ > 80% ? │ │ │ │ │ >= 60% ? │ │
│ │ └───────────────┘ │ │ │ └───────────────┘ │
│ │ │ │ │ │ │ │ │ │
│ │ YES ──┘ └── NO │ │ YES ──┘ └── NO (and cloud enabled)
│ │ │ │ │ │ │ │ │ │
│ │ │ ▼ │ │ │ │ ▼ │
│ │ │ ┌─────────────────────────────────┐ │ │ │ │ ┌─────────────────────────────────┐ │
│ │ │ │ 5b. Fallback: PaddleOCR │ │ │ │ │ │ 5c. Optional Cloud Fallback │ │
│ │ │ │ (Google Vision API) │ │
│ │ │ │ │ │ │ │ │ │ │ │
│ │ │ │ • Better for degraded images │ │ │ │ │ │ • Disabled by default │ │
│ │ │ │ • Better table detection │ │ │ │ │ │ • 5-second timeout guard │ │
│ │ │ │ • Slower but more accurate │ │ │ │ │ │ • Returns higher-confidence │ │
│ │ │ │ result of primary vs fallback │ │
│ │ │ └─────────────────────────────────┘ │ │ │ │ └─────────────────────────────────┘ │
│ │ │ │ │ │ │ │ │ │
│ │ ▼ ▼ │ │ │ ▼ ▼ │
│ │ ┌─────────────────────────────────┐ │ │ │ ┌─────────────────────────────────┐ │
│ │ │ 5c. Result Merging │ │ │ │ │ 5d. HybridEngine Result │ │
│ │ │ • Merge by bounding box │ │ │ │ │ • Compare confidences │ │
│ │ │ • Keep highest confidence │ │ │ │ │ • Keep highest confidence │ │
│ │ │ • Graceful fallback on error │ │
│ │ └─────────────────────────────────┘ │ │ │ └─────────────────────────────────┘ │
│ │ │ │ │ │
│ └─────────────────────────────────────────────────────────┘ │ └─────────────────────────────────────────────────────────┘
@@ -257,10 +270,10 @@
| Component | Tool | Purpose | | Component | Tool | Purpose |
|------------------------|-----------------------|--------------------------------------| |------------------------|-----------------------|--------------------------------------|
| **Primary OCR** | Tesseract 5.x | Fast, reliable text extraction | | **Primary OCR** | PaddleOCR PP-OCRv4 | Highest accuracy scene text, CPU-only |
| **Python Binding** | pytesseract | Tesseract Python wrapper | | **Cloud Fallback** | Google Vision API | Optional cloud fallback (disabled by default) |
| **Fallback OCR** | PaddleOCR | Higher accuracy, better tables | | **Backward Compat** | Tesseract 5.x / pytesseract | Legacy engine, configurable via env var |
| **Layout Analysis** | PaddleOCR / LayoutParser | Document structure detection | | **Engine Abstraction** | `OcrEngine` ABC | Pluggable engine interface in `ocr/app/engines/` |
### Data Extraction ### Data Extraction
@@ -291,85 +304,93 @@
fastapi>=0.100.0 fastapi>=0.100.0
uvicorn[standard]>=0.23.0 uvicorn[standard]>=0.23.0
python-multipart>=0.0.6 python-multipart>=0.0.6
pydantic>=2.0.0
# Task Queue
celery>=5.3.0
redis>=4.6.0
# File Detection & Handling # File Detection & Handling
python-magic>=0.4.27 python-magic>=0.4.27
pillow>=10.0.0 pillow>=10.0.0
pillow-heif>=0.13.0 pillow-heif>=0.13.0
# PDF Processing
pymupdf>=1.23.0
# Image Preprocessing # Image Preprocessing
opencv-python-headless>=4.8.0 opencv-python-headless>=4.8.0
deskew>=1.4.0
scikit-image>=0.21.0
numpy>=1.24.0 numpy>=1.24.0
# OCR Engines # OCR Engines
pytesseract>=0.3.10 pytesseract>=0.3.10
paddlepaddle>=2.5.0 paddlepaddle>=2.6.0
paddleocr>=2.7.0 paddleocr>=2.8.0
google-cloud-vision>=3.7.0
# Table Extraction # PDF Processing
img2table>=1.2.0 PyMuPDF>=1.23.0
camelot-py[cv]>=0.11.0
# NLP & Data # Redis for job queue
spacy>=3.6.0 redis>=5.0.0
pandas>=2.0.0
# Storage & Database # HTTP client for callbacks
boto3>=1.28.0 httpx>=0.24.0
psycopg2-binary>=2.9.0
sqlalchemy>=2.0.0 # Testing
pytest>=7.4.0
pytest-asyncio>=0.21.0
``` ```
### System Package Requirements (Ubuntu/Debian) ### System Package Requirements (Ubuntu/Debian)
```bash ```bash
# Tesseract OCR # Tesseract OCR (backward compatibility engine)
apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev apt-get install tesseract-ocr tesseract-ocr-eng
# PaddlePaddle OpenMP runtime
apt-get install libgomp1
# HEIC Support # HEIC Support
apt-get install libheif-examples libheif-dev apt-get install libheif1 libheif-dev
# OpenCV dependencies # GLib (OpenCV dependency)
apt-get install libgl1-mesa-glx libglib2.0-0 apt-get install libglib2.0-0
# PDF rendering dependencies # File type detection
apt-get install libmupdf-dev mupdf-tools apt-get install libmagic1
# Image processing
apt-get install libmagic1 ghostscript
# Camelot dependencies
apt-get install ghostscript python3-tk
``` ```
### Environment Variables
| Variable | Default | Description |
|----------|---------|-------------|
| `OCR_PRIMARY_ENGINE` | `paddleocr` | Primary OCR engine (`paddleocr`, `tesseract`) |
| `OCR_CONFIDENCE_THRESHOLD` | `0.6` | Minimum confidence threshold |
| `OCR_FALLBACK_ENGINE` | `none` | Fallback engine (`google_vision`, `none`) |
| `OCR_FALLBACK_THRESHOLD` | `0.6` | Confidence below this triggers fallback |
| `GOOGLE_VISION_KEY_PATH` | `/run/secrets/google-vision-key.json` | Path to Google Vision service account key |
--- ---
## DOCKERFILE ## DOCKERFILE
```dockerfile ```dockerfile
FROM python:3.11-slim # Primary engine: PaddleOCR PP-OCRv4 (models baked into image)
# Backward compat: Tesseract 5.x (optional, via TesseractEngine)
# Cloud fallback: Google Vision (optional, requires API key at runtime)
FROM python:3.13-slim
# System dependencies # System dependencies
# - tesseract-ocr/eng: Backward-compatible OCR engine
# - libgomp1: OpenMP runtime required by PaddlePaddle
# - libheif1/libheif-dev: HEIF image support (iPhone photos)
# - libglib2.0-0: GLib shared library (OpenCV dependency)
# - libmagic1: File type detection
# - curl: Health check endpoint
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
tesseract-ocr \ tesseract-ocr \
tesseract-ocr-eng \ tesseract-ocr-eng \
libtesseract-dev \ libgomp1 \
libheif-examples \ libheif1 \
libheif-dev \ libheif-dev \
libgl1-mesa-glx \
libglib2.0-0 \ libglib2.0-0 \
libmagic1 \ libmagic1 \
ghostscript \ curl \
poppler-utils \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Python dependencies # Python dependencies
@@ -377,11 +398,9 @@ WORKDIR /app
COPY requirements.txt . COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -r requirements.txt
# Download spaCy model # Pre-download PaddleOCR PP-OCRv4 models during build (not at runtime)
RUN python -m spacy download en_core_web_sm RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=False)" \
&& echo "PaddleOCR PP-OCRv4 models downloaded and verified"
# Download PaddleOCR models (cached in image)
RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en')"
COPY . . COPY . .

File diff suppressed because one or more lines are too long

View File

@@ -49,7 +49,7 @@ async function extractVinFromImage(file: File): Promise<VinOcrResult> {
const response = await apiClient.post('/ocr/extract/vin', formData, { const response = await apiClient.post('/ocr/extract/vin', formData, {
headers: { 'Content-Type': 'multipart/form-data' }, headers: { 'Content-Type': 'multipart/form-data' },
timeout: 30000, // 30 seconds for OCR processing timeout: 120000, // 120 seconds for OCR processing
}); });
const data = response.data; const data = response.data;

View File

@@ -245,7 +245,7 @@ export const CameraCapture: React.FC<CameraCaptureProps> = ({
return ( return (
<CropTool <CropTool
imageSrc={capturedImageSrc} imageSrc={capturedImageSrc}
lockAspectRatio={guidanceType !== 'none'} lockAspectRatio={guidanceType !== 'none' && guidanceType !== 'vin'}
aspectRatio={cropAspectRatio} aspectRatio={cropAspectRatio}
onConfirm={handleCropConfirm} onConfirm={handleCropConfirm}
onReset={handleCropReset} onReset={handleCropReset}

View File

@@ -95,10 +95,6 @@ export function useImageCrop(options: UseImageCropOptions = {}): UseImageCropRet
const drawOriginRef = useRef({ x: 0, y: 0 }); const drawOriginRef = useRef({ x: 0, y: 0 });
const cropAreaRef = useRef(cropArea); const cropAreaRef = useRef(cropArea);
useEffect(() => {
cropAreaRef.current = cropArea;
}, [cropArea]);
const setCropArea = useCallback( const setCropArea = useCallback(
(area: CropArea) => { (area: CropArea) => {
setCropAreaState(getAspectRatioAdjustedCrop(area)); setCropAreaState(getAspectRatioAdjustedCrop(area));
@@ -177,7 +173,9 @@ export function useImageCrop(options: UseImageCropOptions = {}): UseImageCropRet
startPosRef.current = { x: clientX, y: clientY }; startPosRef.current = { x: clientX, y: clientY };
drawOriginRef.current = { x, y }; drawOriginRef.current = { x, y };
setCropAreaState({ x, y, width: 0, height: 0 }); const initial = { x, y, width: 0, height: 0 };
setCropAreaState(initial);
cropAreaRef.current = initial;
isDrawingRef.current = true; isDrawingRef.current = true;
activeHandleRef.current = null; activeHandleRef.current = null;
@@ -203,18 +201,24 @@ export function useImageCrop(options: UseImageCropOptions = {}): UseImageCropRet
const originX = drawOriginRef.current.x; const originX = drawOriginRef.current.x;
const originY = drawOriginRef.current.y; const originY = drawOriginRef.current.y;
let newCrop: CropArea = { const drawnWidth = Math.abs(currentX - originX);
const drawnHeight = aspectRatio
? drawnWidth / aspectRatio
: Math.abs(currentY - originY);
let drawnY = Math.min(originY, currentY);
// Clamp so crop doesn't exceed container bounds when aspect ratio forces height
if (aspectRatio && drawnY + drawnHeight > 100) {
drawnY = Math.max(0, 100 - drawnHeight);
}
const newCrop: CropArea = {
x: Math.min(originX, currentX), x: Math.min(originX, currentX),
y: Math.min(originY, currentY), y: drawnY,
width: Math.abs(currentX - originX), width: drawnWidth,
height: Math.abs(currentY - originY), height: drawnHeight,
}; };
if (aspectRatio) {
newCrop.height = newCrop.width / aspectRatio;
}
setCropAreaState(newCrop); setCropAreaState(newCrop);
cropAreaRef.current = newCrop;
return; return;
} }
@@ -303,7 +307,9 @@ export function useImageCrop(options: UseImageCropOptions = {}): UseImageCropRet
break; break;
} }
setCropAreaState(constrainCrop(newCrop)); const constrained = constrainCrop(newCrop);
setCropAreaState(constrained);
cropAreaRef.current = constrained;
}, },
[isDragging, constrainCrop, aspectRatio] [isDragging, constrainCrop, aspectRatio]
); );
@@ -312,7 +318,9 @@ export function useImageCrop(options: UseImageCropOptions = {}): UseImageCropRet
if (isDrawingRef.current) { if (isDrawingRef.current) {
isDrawingRef.current = false; isDrawingRef.current = false;
const area = cropAreaRef.current; const area = cropAreaRef.current;
if (area.width >= minSize && area.height >= minSize) { // Accept crop if at least one dimension is meaningful (allows thin strips like VINs)
const meetsMinSize = area.width >= minSize || area.height >= minSize;
if (meetsMinSize) {
setCropDrawn(true); setCropDrawn(true);
} }
} }

View File

@@ -1,10 +1,12 @@
# ocr/ # ocr/
Python OCR microservice. Primary engine: PaddleOCR PP-OCRv4 with optional Google Vision cloud fallback. Pluggable engine abstraction in `app/engines/`.
## Files ## Files
| File | What | When to read | | File | What | When to read |
| ---- | ---- | ------------ | | ---- | ---- | ------------ |
| `Dockerfile` | Container build definition | Docker builds, deployment | | `Dockerfile` | Container build (PaddleOCR models baked in) | Docker builds, deployment |
| `requirements.txt` | Python dependencies | Adding dependencies | | `requirements.txt` | Python dependencies | Adding dependencies |
## Subdirectories ## Subdirectories
@@ -12,4 +14,5 @@
| Directory | What | When to read | | Directory | What | When to read |
| --------- | ---- | ------------ | | --------- | ---- | ------------ |
| `app/` | FastAPI application source | OCR endpoint development | | `app/` | FastAPI application source | OCR endpoint development |
| `app/engines/` | Engine abstraction layer (OcrEngine ABC, factory, hybrid) | Adding or changing OCR engines |
| `tests/` | Test suite | Adding or modifying tests | | `tests/` | Test suite | Adding or modifying tests |

View File

@@ -1,5 +1,8 @@
# Production Dockerfile for MotoVaultPro OCR Service # Production Dockerfile for MotoVaultPro OCR Service
# Uses mirrored base images from Gitea Package Registry # Uses mirrored base images from Gitea Package Registry
#
# Primary engine: PaddleOCR PP-OCRv4 (models baked into image)
# Cloud fallback: Google Vision (optional, requires API key at runtime)
# Build argument for registry (defaults to Gitea mirrors, falls back to Docker Hub) # Build argument for registry (defaults to Gitea mirrors, falls back to Docker Hub)
ARG REGISTRY_MIRRORS=git.motovaultpro.com/egullickson/mirrors ARG REGISTRY_MIRRORS=git.motovaultpro.com/egullickson/mirrors
@@ -7,10 +10,13 @@ ARG REGISTRY_MIRRORS=git.motovaultpro.com/egullickson/mirrors
FROM ${REGISTRY_MIRRORS}/python:3.13-slim FROM ${REGISTRY_MIRRORS}/python:3.13-slim
# System dependencies # System dependencies
# - libgomp1: OpenMP runtime required by PaddlePaddle
# - libheif1/libheif-dev: HEIF image support (iPhone photos)
# - libglib2.0-0: GLib shared library (OpenCV dependency)
# - libmagic1: File type detection
# - curl: Health check endpoint
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
tesseract-ocr \ libgomp1 \
tesseract-ocr-eng \
libtesseract-dev \
libheif1 \ libheif1 \
libheif-dev \ libheif-dev \
libglib2.0-0 \ libglib2.0-0 \
@@ -21,7 +27,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
# Python dependencies # Python dependencies
WORKDIR /app WORKDIR /app
COPY requirements.txt . COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt # Install dependencies. PaddleX (transitive via paddleocr) pulls in the full
# opencv-python which requires libGL.so.1. Force-reinstall the headless
# variant afterwards so the container stays GUI-free.
RUN pip install --no-cache-dir -r requirements.txt \
&& pip install --no-cache-dir --force-reinstall opencv-python-headless
# Pre-download PaddleOCR PP-OCRv4 models during build (not at runtime).
# Models are baked into the image so container starts are fast and
# no network access is needed at runtime for model download.
ENV PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK=True
RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(ocr_version='PP-OCRv4', use_textline_orientation=True, lang='en', device='cpu', enable_mkldnn=False)" \
&& echo "PaddleOCR PP-OCRv4 models downloaded and verified"
COPY . . COPY . .

View File

@@ -12,6 +12,7 @@
| Directory | What | When to read | | Directory | What | When to read |
| --------- | ---- | ------------ | | --------- | ---- | ------------ |
| `engines/` | OCR engine abstraction (PaddleOCR primary, Google Vision fallback) | Engine changes, adding new engines |
| `extractors/` | Data extraction logic | Adding new extraction types | | `extractors/` | Data extraction logic | Adding new extraction types |
| `models/` | Data models and schemas | Request/response types | | `models/` | Data models and schemas | Request/response types |
| `patterns/` | Regex and parsing patterns | Pattern matching rules | | `patterns/` | Regex and parsing patterns | Pattern matching rules |

View File

@@ -9,7 +9,20 @@ class Settings:
self.log_level: str = os.getenv("LOG_LEVEL", "info") self.log_level: str = os.getenv("LOG_LEVEL", "info")
self.host: str = os.getenv("HOST", "0.0.0.0") self.host: str = os.getenv("HOST", "0.0.0.0")
self.port: int = int(os.getenv("PORT", "8000")) self.port: int = int(os.getenv("PORT", "8000"))
self.tesseract_cmd: str = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract") # OCR engine configuration
self.ocr_primary_engine: str = os.getenv("OCR_PRIMARY_ENGINE", "paddleocr")
self.ocr_confidence_threshold: float = float(
os.getenv("OCR_CONFIDENCE_THRESHOLD", "0.6")
)
# Cloud fallback configuration (disabled by default)
self.ocr_fallback_engine: str = os.getenv("OCR_FALLBACK_ENGINE", "none")
self.ocr_fallback_threshold: float = float(
os.getenv("OCR_FALLBACK_THRESHOLD", "0.6")
)
self.google_vision_key_path: str = os.getenv(
"GOOGLE_VISION_KEY_PATH", "/run/secrets/google-vision-key.json"
)
# Redis configuration for job queue # Redis configuration for job queue
self.redis_host: str = os.getenv("REDIS_HOST", "mvp-redis") self.redis_host: str = os.getenv("REDIS_HOST", "mvp-redis")

View File

@@ -0,0 +1,32 @@
"""OCR engine abstraction layer.
Provides a pluggable engine interface for OCR processing,
decoupling extractors from specific OCR libraries.
Engines:
- PaddleOcrEngine: PaddleOCR PP-OCRv4 (primary, CPU-only)
- CloudEngine: Google Vision TEXT_DETECTION (optional cloud fallback)
- HybridEngine: Primary + fallback with confidence threshold
"""
from app.engines.base_engine import (
EngineError,
EngineProcessingError,
EngineUnavailableError,
OcrConfig,
OcrEngine,
OcrEngineResult,
WordBox,
)
from app.engines.engine_factory import create_engine
# Names re-exported by this package: the abstract engine interface, the shared
# data types, the exception hierarchy, and the factory imported above.
__all__ = [
"OcrEngine",
"OcrConfig",
"OcrEngineResult",
"WordBox",
"EngineError",
"EngineUnavailableError",
"EngineProcessingError",
"create_engine",
]

View File

@@ -0,0 +1,88 @@
"""OCR engine abstract base class and shared data types."""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any
# --- Exception hierarchy ---
class EngineError(Exception):
    """Root of the OCR engine exception hierarchy.

    Catch this type to handle any failure raised by an engine.
    """


class EngineUnavailableError(EngineError):
    """The engine could not be initialized (e.g. missing binary or bad config)."""


class EngineProcessingError(EngineError):
    """The engine was invoked but failed while processing an image."""
# --- Data types ---
@dataclass
class WordBox:
    """One recognized word together with its bounding box and confidence.

    Only ``text`` and ``confidence`` are required; the box geometry
    defaults to zero when an engine supplies no position.
    """

    # Recognized text of the word.
    text: str
    # Recognition confidence, 0.0-1.0.
    confidence: float
    # Bounding box: top-left corner (x, y) plus extent (width, height).
    x: int = 0
    y: int = 0
    width: int = 0
    height: int = 0
@dataclass
class OcrConfig:
"""Engine-agnostic OCR configuration.
Common fields cover the most frequent needs. Engine-specific
parameters go into ``hints`` so the interface stays stable.
"""
char_whitelist: str | None = None # e.g. VIN: "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
single_line: bool = False # Treat image as a single text line
single_word: bool = False # Treat image as a single word
use_angle_cls: bool = True # Enable angle classification (PaddleOCR)
hints: dict[str, Any] = field(default_factory=dict)
@dataclass
class OcrEngineResult:
    """Engine-independent OCR result that every engine implementation returns."""

    # Full recognized text.
    text: str
    # Overall confidence, 0.0-1.0.
    confidence: float
    # Per-word results with positions.
    word_boxes: list[WordBox]
    # Identifier of the producing engine, e.g. "paddleocr", "google_vision".
    engine_name: str
# --- Abstract base ---
class OcrEngine(ABC):
    """Interface that every OCR engine implementation has to provide.

    Concrete engines supply a ``name`` and a ``recognize`` method that
    returns a normalized ``OcrEngineResult``.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Short identifier used in OcrEngineResult.engine_name."""

    @abstractmethod
    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run OCR on preprocessed image bytes.

        Args:
            image_bytes: Raw image bytes (PNG/JPEG).
            config: Engine-agnostic configuration.

        Returns:
            Normalized OCR result.

        Raises:
            EngineProcessingError: If recognition fails.
            EngineUnavailableError: If the engine is not ready.
        """

View File

@@ -0,0 +1,166 @@
"""Google Vision cloud OCR engine with lazy initialization."""
import logging
import os
from typing import Any
from app.engines.base_engine import (
EngineProcessingError,
EngineUnavailableError,
OcrConfig,
OcrEngine,
OcrEngineResult,
WordBox,
)
logger = logging.getLogger(__name__)
# Default path for Google Vision service account key (Docker secret mount)
_DEFAULT_KEY_PATH = "/run/secrets/google-vision-key.json"
class CloudEngine(OcrEngine):
    """Google Vision TEXT_DETECTION wrapper with lazy initialization.

    The client is not created until the first ``recognize()`` call,
    so the container starts normally even when the secret file is
    missing or the dependency is not installed.
    """

    # Google Vision TEXT_DETECTION does not return per-word confidence in
    # annotations. Use 0.95 as the documented typical accuracy for clear
    # images so comparisons with PaddleOCR are meaningful.
    _ASSUMED_WORD_CONFIDENCE = 0.95

    def __init__(self, key_path: str | None = None) -> None:
        """Store the key location; no client is created and no I/O happens here.

        Args:
            key_path: Path to the service account key. When omitted (or empty),
                ``GOOGLE_VISION_KEY_PATH`` is read from the environment, with
                the Docker secret mount path as the final default.
        """
        self._key_path = key_path or os.getenv(
            "GOOGLE_VISION_KEY_PATH", _DEFAULT_KEY_PATH
        )
        self._client: Any | None = None

    @property
    def name(self) -> str:
        """Identifier reported in ``OcrEngineResult.engine_name``."""
        return "google_vision"

    # ------------------------------------------------------------------
    # Lazy init
    # ------------------------------------------------------------------

    def _get_client(self) -> Any:
        """Create the Vision client on first use and cache it.

        Raises:
            EngineUnavailableError: If the key file is missing, the
                google-cloud-vision package is not installed, or client
                construction fails for any other reason.
        """
        if self._client is not None:
            return self._client

        # Verify credentials file exists
        if not os.path.isfile(self._key_path):
            raise EngineUnavailableError(
                f"Google Vision key not found at {self._key_path}. "
                "Set GOOGLE_VISION_KEY_PATH or mount the secret."
            )

        try:
            from google.cloud import vision  # type: ignore[import-untyped]

            # Point the SDK at the service account key.
            # NOTE(review): this mutates process-wide environment state.
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._key_path
            self._client = vision.ImageAnnotatorClient()
            logger.info(
                "Google Vision client initialized (key: %s)", self._key_path
            )
            return self._client
        except ImportError as exc:
            raise EngineUnavailableError(
                "google-cloud-vision is not installed. "
                "Install with: pip install google-cloud-vision"
            ) from exc
        except Exception as exc:
            raise EngineUnavailableError(
                f"Failed to initialize Google Vision client: {exc}"
            ) from exc

    # ------------------------------------------------------------------
    # OCR
    # ------------------------------------------------------------------

    @staticmethod
    def _bounding_box(vertices: Any) -> tuple[int, int, int, int]:
        """Return ``(x, y, width, height)`` of the box enclosing ``vertices``.

        An annotation without vertices yields an all-zero box instead of
        raising, so one malformed annotation does not fail the whole page.
        """
        xs = [v.x for v in vertices]
        ys = [v.y for v in vertices]
        if not xs:
            return 0, 0, 0, 0
        x_min, y_min = min(xs), min(ys)
        return x_min, y_min, max(xs) - x_min, max(ys) - y_min

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run Google Vision TEXT_DETECTION on image bytes.

        Args:
            image_bytes: Encoded image content sent to the API unchanged.
            config: Only ``char_whitelist`` is honoured by this engine; when
                set, characters outside it are dropped from every word and
                from the full text (spaces and newlines are kept there).

        Returns:
            Normalized result. The confidence is the mean of the per-word
            confidences, or 0.0 when no word survives filtering or the API
            returns no annotations.

        Raises:
            EngineUnavailableError: If the client cannot be created.
            EngineProcessingError: If the API reports an error or the call
                fails for any other reason.
        """
        client = self._get_client()

        try:
            from google.cloud import vision  # type: ignore[import-untyped]

            image = vision.Image(content=image_bytes)
            response = client.text_detection(image=image)

            if response.error.message:
                raise EngineProcessingError(
                    f"Google Vision API error: {response.error.message}"
                )

            annotations = response.text_annotations
            if not annotations:
                return OcrEngineResult(
                    text="",
                    confidence=0.0,
                    word_boxes=[],
                    engine_name=self.name,
                )

            # Build the whitelist lookup once instead of once per word.
            allowed = set(config.char_whitelist) if config.char_whitelist else None

            # First annotation is the full-page text; the rest are words
            full_text = annotations[0].description.strip()
            word_boxes: list[WordBox] = []

            for annotation in annotations[1:]:
                text = annotation.description

                # Apply character whitelist filter if configured
                if allowed is not None:
                    text = "".join(ch for ch in text if ch in allowed)

                text = text.strip()
                if not text:
                    continue

                x, y, width, height = self._bounding_box(
                    annotation.bounding_poly.vertices
                )
                word_boxes.append(
                    WordBox(
                        text=text,
                        confidence=self._ASSUMED_WORD_CONFIDENCE,
                        x=x,
                        y=y,
                        width=width,
                        height=height,
                    )
                )

            # Apply whitelist to full text too (whitespace is preserved)
            if allowed is not None:
                full_text = "".join(
                    ch for ch in full_text if ch in allowed or ch in " \n"
                )

            avg_confidence = (
                sum(box.confidence for box in word_boxes) / len(word_boxes)
                if word_boxes
                else 0.0
            )

            return OcrEngineResult(
                text=full_text,
                confidence=avg_confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )

        except (EngineUnavailableError, EngineProcessingError):
            raise
        except Exception as exc:
            raise EngineProcessingError(
                f"Google Vision recognition failed: {exc}"
            ) from exc

View File

@@ -0,0 +1,86 @@
"""Factory function for creating OCR engine instances from configuration."""
import importlib
import logging
from app.config import settings
from app.engines.base_engine import EngineUnavailableError, OcrEngine
logger = logging.getLogger(__name__)
# Maps each valid engine identifier to the dotted "module.ClassName" path of its
# implementation. Paths are resolved with importlib only when an engine is
# created, so an engine's module is not imported unless it is requested.
# Covers single engines only; the hybrid wrapper is constructed separately.
_ENGINE_REGISTRY: dict[str, str] = {
"paddleocr": "app.engines.paddle_engine.PaddleOcrEngine",
"google_vision": "app.engines.cloud_engine.CloudEngine",
}
def _create_single_engine(name: str) -> OcrEngine:
    """Build one engine instance from its registry identifier.

    The registry entry is a dotted ``module.ClassName`` path; the module is
    imported on demand and the class is instantiated with no arguments.

    Raises:
        EngineUnavailableError: If ``name`` is not registered, or if the
            import or construction fails for any reason.
    """
    dotted_path = _ENGINE_REGISTRY.get(name)
    if dotted_path is None:
        raise EngineUnavailableError(
            f"Unknown engine '{name}'. Available: {list(_ENGINE_REGISTRY.keys())}"
        )

    module_path, _, class_name = dotted_path.rpartition(".")

    try:
        engine_cls = getattr(importlib.import_module(module_path), class_name)
        instance: OcrEngine = engine_cls()
        logger.info("Created OCR engine: %s", name)
        return instance
    except EngineUnavailableError:
        # Already the right type; let it propagate untouched.
        raise
    except Exception as exc:
        raise EngineUnavailableError(
            f"Failed to create engine '{name}': {exc}"
        ) from exc
def create_engine(engine_name: str | None = None) -> OcrEngine:
    """Instantiate an OCR engine by name (defaults to config value).

    When a fallback engine is configured (``OCR_FALLBACK_ENGINE != "none"``),
    returns a ``HybridEngine`` that wraps the primary with the fallback.
    A fallback that names the same engine as the primary is ignored:
    re-running the identical engine on a low-confidence result only
    repeats the work (and, for a cloud engine, the API call).

    Args:
        engine_name: Engine identifier ("paddleocr", "google_vision").
            Falls back to ``settings.ocr_primary_engine``.

    Returns:
        Initialized OcrEngine instance (possibly a HybridEngine wrapper).

    Raises:
        EngineUnavailableError: If the primary engine cannot be loaded.
    """
    name = (engine_name or settings.ocr_primary_engine).lower().strip()
    primary = _create_single_engine(name)

    # Check for cloud fallback configuration
    fallback_name = settings.ocr_fallback_engine.lower().strip()
    if not fallback_name or fallback_name == "none":
        return primary

    # An engine cannot usefully be its own fallback.
    if fallback_name == name:
        logger.warning(
            "Fallback engine '%s' is the same as the primary engine, "
            "proceeding without fallback",
            fallback_name,
        )
        return primary

    # Create fallback engine (failure is non-fatal -- log and return primary only)
    try:
        fallback = _create_single_engine(fallback_name)
    except EngineUnavailableError as exc:
        logger.warning(
            "Fallback engine '%s' unavailable, proceeding without fallback: %s",
            fallback_name,
            exc,
        )
        return primary

    # Imported here rather than at module top.
    # NOTE(review): presumably avoids a circular import -- confirm.
    from app.engines.hybrid_engine import HybridEngine

    threshold = settings.ocr_fallback_threshold
    hybrid = HybridEngine(primary=primary, fallback=fallback, threshold=threshold)
    logger.info(
        "Created hybrid engine: primary=%s, fallback=%s, threshold=%.2f",
        name,
        fallback_name,
        threshold,
    )
    return hybrid

View File

@@ -0,0 +1,116 @@
"""Hybrid OCR engine: primary engine with optional cloud fallback."""
import logging
import time
from app.engines.base_engine import (
EngineError,
EngineProcessingError,
OcrConfig,
OcrEngine,
OcrEngineResult,
)
logger = logging.getLogger(__name__)
# Time budget (seconds) for the cloud fallback call.
# NOTE: this is not an enforced timeout. The fallback call is never
# interrupted; its duration is measured after it returns, and a result that
# took longer than this budget is discarded in favour of the primary result.
_CLOUD_TIMEOUT_SECONDS = 5.0
class HybridEngine(OcrEngine):
    """Primary OCR engine with an optional second-opinion fallback engine.

    The primary engine always runs. The fallback is consulted only when the
    primary confidence is below ``threshold``; whichever result has the
    higher confidence is returned (ties keep the primary result).

    With ``fallback=None`` (the default) the primary result is always
    returned. Any exception raised by the fallback is logged and the primary
    result is returned. A fallback call that takes longer than
    ``_CLOUD_TIMEOUT_SECONDS`` is not interrupted -- its duration is checked
    after it returns and its result is then discarded.
    """

    def __init__(
        self,
        primary: OcrEngine,
        fallback: OcrEngine | None = None,
        threshold: float = 0.6,
    ) -> None:
        self._primary = primary
        self._fallback = fallback
        self._threshold = threshold

    @property
    def name(self) -> str:
        """Composite name of the form ``hybrid(<primary>+<fallback|none>)``."""
        if self._fallback:
            secondary_label = self._fallback.name
        else:
            secondary_label = "none"
        return f"hybrid({self._primary.name}+{secondary_label})"

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run the primary engine and consult the fallback if confidence is low.

        Args:
            image_bytes: Encoded image handed unchanged to both engines.
            config: OCR configuration handed unchanged to both engines.

        Returns:
            The primary result, or the fallback result when the fallback ran
            within the time budget and reported a strictly higher confidence.
        """
        primary_result = self._primary.recognize(image_bytes, config)

        # Primary is confident enough: nothing else to do.
        if primary_result.confidence >= self._threshold:
            logger.debug(
                "Primary engine confidence %.2f >= threshold %.2f, no fallback",
                primary_result.confidence,
                self._threshold,
            )
            return primary_result

        # Low confidence, but there is no second engine to ask.
        if self._fallback is None:
            logger.debug(
                "Primary confidence %.2f < threshold %.2f but no fallback configured",
                primary_result.confidence,
                self._threshold,
            )
            return primary_result

        logger.info(
            "Primary confidence %.2f < threshold %.2f, trying fallback (%s)",
            primary_result.confidence,
            self._threshold,
            self._fallback.name,
        )

        # Every failure inside the fallback path degrades to the primary result.
        try:
            return self._consult_fallback(primary_result, image_bytes, config)
        except EngineError as exc:
            logger.warning(
                "Cloud fallback failed (%s), returning primary result: %s",
                self._fallback.name,
                exc,
            )
            return primary_result
        except Exception as exc:
            logger.warning(
                "Unexpected cloud fallback error, returning primary result: %s",
                exc,
            )
            return primary_result

    def _consult_fallback(
        self,
        primary_result: OcrEngineResult,
        image_bytes: bytes,
        config: OcrConfig,
    ) -> OcrEngineResult:
        """Run the fallback engine and pick between its result and the primary's."""
        started_at = time.monotonic()
        fallback_result = self._fallback.recognize(image_bytes, config)
        duration = time.monotonic() - started_at

        # Post-hoc budget check: the call has already completed by now.
        if duration > _CLOUD_TIMEOUT_SECONDS:
            logger.warning(
                "Cloud fallback took %.1fs (> %.1fs limit), using primary result",
                duration,
                _CLOUD_TIMEOUT_SECONDS,
            )
            return primary_result

        # Only a strictly better confidence replaces the primary result.
        if fallback_result.confidence > primary_result.confidence:
            logger.info(
                "Fallback confidence %.2f > primary %.2f, using fallback result",
                fallback_result.confidence,
                primary_result.confidence,
            )
            return fallback_result

        logger.info(
            "Primary confidence %.2f >= fallback %.2f, keeping primary result",
            primary_result.confidence,
            fallback_result.confidence,
        )
        return primary_result

View File

@@ -0,0 +1,157 @@
"""PaddleOCR engine wrapper using PP-OCRv4 models."""
import io
import logging
from typing import Any
from app.engines.base_engine import (
EngineProcessingError,
EngineUnavailableError,
OcrConfig,
OcrEngine,
OcrEngineResult,
WordBox,
)
logger = logging.getLogger(__name__)
class PaddleOcrEngine(OcrEngine):
    """PaddleOCR PP-OCRv4 engine with textline orientation, CPU-only.

    The underlying ``PaddleOCR`` object is created lazily on the first call
    to ``recognize`` so that constructing this wrapper stays cheap.

    Of the ``OcrConfig`` fields, only ``char_whitelist`` is consulted here;
    it is applied as a post-recognition character filter.
    """

    def __init__(self) -> None:
        # Populated on first use by _get_ocr().
        self._ocr: Any | None = None

    @property
    def name(self) -> str:
        return "paddleocr"

    def _get_ocr(self) -> Any:
        """Lazy-initialize PaddleOCR instance on first use.

        Raises:
            EngineUnavailableError: If paddleocr is not installed or the
                PaddleOCR object cannot be constructed.
        """
        if self._ocr is not None:
            return self._ocr
        try:
            from paddleocr import PaddleOCR  # type: ignore[import-untyped]

            self._ocr = PaddleOCR(
                ocr_version="PP-OCRv4",
                use_textline_orientation=True,
                lang="en",
                device="cpu",
                enable_mkldnn=False,
            )
            logger.info("PaddleOCR PP-OCRv4 initialized (CPU, textline_orientation=True)")
            return self._ocr
        except ImportError as exc:
            raise EngineUnavailableError(
                "paddleocr is not installed. "
                "Install with: pip install paddlepaddle paddleocr"
            ) from exc
        except Exception as exc:
            raise EngineUnavailableError(
                f"Failed to initialize PaddleOCR: {exc}"
            ) from exc

    def _empty_result(self) -> OcrEngineResult:
        """Result returned when no text was recognized."""
        return OcrEngineResult(
            text="",
            confidence=0.0,
            word_boxes=[],
            engine_name=self.name,
        )

    @staticmethod
    def _as_list(value: Any) -> list[Any]:
        """Normalize a result field to a list; a missing/None field becomes []."""
        return [] if value is None else list(value)

    @staticmethod
    def _polygon_to_box(poly: Any) -> tuple[int, int, int, int]:
        """Convert a polygon of (x, y) points to ``(x, y, width, height)``.

        An empty or missing polygon yields an all-zero box, matching the
        default used when no polygon exists for a text line.
        """
        points = [] if poly is None else list(poly)
        if not points:
            return 0, 0, 0, 0
        xs = [pt[0] for pt in points]
        ys = [pt[1] for pt in points]
        x_min, y_min = int(min(xs)), int(min(ys))
        x_max, y_max = int(max(xs)), int(max(ys))
        return x_min, y_min, x_max - x_min, y_max - y_min

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run PaddleOCR on image bytes.

        PaddleOCR v3.x ``predict()`` returns an iterator of result objects.
        Each result's ``.json`` property returns a dict. The OCR fields
        (``dt_polys``, ``rec_texts``, ``rec_scores``) may be at the top
        level or nested under a ``"res"`` key depending on the version.
        Only the first result object is used.

        Args:
            image_bytes: Encoded image readable by Pillow.
            config: OCR configuration; only ``char_whitelist`` is used.

        Returns:
            Recognized lines joined by single spaces, the mean of the
            per-line scores as confidence, and one ``WordBox`` per kept line.

        Raises:
            EngineUnavailableError: If PaddleOCR cannot be initialized.
            EngineProcessingError: If decoding or recognition fails.
        """
        ocr = self._get_ocr()
        try:
            import numpy as np  # type: ignore[import-untyped]
            from PIL import Image

            # Close the decoded image as soon as the pixel array is copied out.
            with Image.open(io.BytesIO(image_bytes)) as image:
                img_array = np.array(image.convert("RGB"))

            results = list(ocr.predict(img_array))
            if not results:
                return self._empty_result()

            raw = results[0].json
            # Unwrap nested "res" key if present (save_to_json format)
            res = raw.get("res", raw) if isinstance(raw, dict) else raw
            logger.debug(
                "PaddleOCR result keys: %s",
                list(res.keys()) if isinstance(res, dict) else type(res).__name__,
            )

            dt_polys = self._as_list(res.get("dt_polys"))
            rec_texts = self._as_list(res.get("rec_texts"))
            rec_scores = self._as_list(res.get("rec_scores"))
            if not rec_texts:
                return self._empty_result()

            # Build the whitelist lookup once instead of once per text line.
            allowed = set(config.char_whitelist) if config.char_whitelist else None

            word_boxes: list[WordBox] = []
            texts: list[str] = []
            confidences: list[float] = []
            for i, text in enumerate(rec_texts):
                conf = float(rec_scores[i]) if i < len(rec_scores) else 0.0

                # Apply character whitelist filter if configured
                if allowed is not None:
                    text = "".join(ch for ch in text if ch in allowed)

                text = text.strip()
                if not text:
                    continue

                # Convert quadrilateral polygon to bounding box
                if i < len(dt_polys):
                    x_min, y_min, width, height = self._polygon_to_box(dt_polys[i])
                else:
                    x_min, y_min, width, height = 0, 0, 0, 0

                word_boxes.append(
                    WordBox(
                        text=text,
                        confidence=conf,
                        x=x_min,
                        y=y_min,
                        width=width,
                        height=height,
                    )
                )
                texts.append(text)
                confidences.append(conf)

            combined_text = " ".join(texts)
            avg_confidence = (
                sum(confidences) / len(confidences) if confidences else 0.0
            )
            return OcrEngineResult(
                text=combined_text,
                confidence=avg_confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )
        except (EngineUnavailableError, EngineProcessingError):
            raise
        except Exception as exc:
            raise EngineProcessingError(
                f"PaddleOCR recognition failed: {exc}"
            ) from exc

View File

@@ -5,9 +5,9 @@ import time
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Callable, Optional from typing import Callable, Optional
import pytesseract
from PIL import Image from PIL import Image
from app.engines import create_engine, OcrConfig
from app.preprocessors.pdf_preprocessor import pdf_preprocessor, PdfInfo from app.preprocessors.pdf_preprocessor import pdf_preprocessor, PdfInfo
from app.table_extraction.detector import table_detector, DetectedTable from app.table_extraction.detector import table_detector, DetectedTable
from app.table_extraction.parser import table_parser, ParsedScheduleRow from app.table_extraction.parser import table_parser, ParsedScheduleRow
@@ -243,8 +243,9 @@ class ManualExtractor:
# OCR the full page # OCR the full page
try: try:
image = Image.open(io.BytesIO(image_bytes)) engine = create_engine()
ocr_text = pytesseract.image_to_string(image) ocr_result = engine.recognize(image_bytes, OcrConfig())
ocr_text = ocr_result.text
# Mark tables as maintenance if page contains maintenance keywords # Mark tables as maintenance if page contains maintenance keywords
for table in detected_tables: for table in detected_tables:
@@ -358,8 +359,9 @@ class ManualExtractor:
if not text and first_page.image_bytes: if not text and first_page.image_bytes:
# OCR first page # OCR first page
image = Image.open(io.BytesIO(first_page.image_bytes)) engine = create_engine()
text = pytesseract.image_to_string(image) ocr_result = engine.recognize(first_page.image_bytes, OcrConfig())
text = ocr_result.text
if text: if text:
return self._parse_vehicle_from_text(text) return self._parse_vehicle_from_text(text)

View File

@@ -1,16 +1,13 @@
"""Receipt-specific OCR extractor with field extraction.""" """Receipt-specific OCR extractor with field extraction."""
import io
import logging import logging
import time import time
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Any, Optional from typing import Any, Optional
import magic import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener from pillow_heif import register_heif_opener
from app.config import settings from app.engines import OcrConfig, create_engine
from app.extractors.base import BaseExtractor from app.extractors.base import BaseExtractor
from app.preprocessors.receipt_preprocessor import receipt_preprocessor from app.preprocessors.receipt_preprocessor import receipt_preprocessor
from app.patterns import currency_matcher, date_matcher, fuel_matcher from app.patterns import currency_matcher, date_matcher, fuel_matcher
@@ -53,8 +50,8 @@ class ReceiptExtractor(BaseExtractor):
} }
def __init__(self) -> None: def __init__(self) -> None:
"""Initialize receipt extractor.""" """Initialize receipt extractor with engine from factory."""
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd self._engine = create_engine()
def extract( def extract(
self, self,
@@ -150,26 +147,19 @@ class ReceiptExtractor(BaseExtractor):
detected = mime.from_buffer(file_bytes) detected = mime.from_buffer(file_bytes)
return detected or "application/octet-stream" return detected or "application/octet-stream"
def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str: def _perform_ocr(self, image_bytes: bytes) -> str:
""" """
Perform OCR on preprocessed image. Perform OCR on preprocessed image via engine abstraction.
Args: Args:
image_bytes: Preprocessed image bytes image_bytes: Preprocessed image bytes
psm: Tesseract page segmentation mode
4 = Assume single column of text
6 = Uniform block of text
Returns: Returns:
Raw OCR text Raw OCR text
""" """
image = Image.open(io.BytesIO(image_bytes)) config = OcrConfig()
result = self._engine.recognize(image_bytes, config)
# Configure Tesseract for receipt OCR return result.text
# PSM 4 works well for columnar receipt text
config = f"--psm {psm}"
return pytesseract.image_to_string(image, config=config)
def _detect_receipt_type(self, text: str) -> str: def _detect_receipt_type(self, text: str) -> str:
""" """

View File

@@ -1,5 +1,4 @@
"""VIN-specific OCR extractor with preprocessing and validation.""" """VIN-specific OCR extractor with preprocessing and validation."""
import io
import logging import logging
import os import os
import time import time
@@ -8,11 +7,10 @@ from datetime import datetime
from typing import Optional from typing import Optional
import magic import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener from pillow_heif import register_heif_opener
from app.config import settings from app.config import settings
from app.engines import OcrConfig, create_engine
from app.extractors.base import BaseExtractor from app.extractors.base import BaseExtractor
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
from app.validators.vin_validator import vin_validator from app.validators.vin_validator import vin_validator
@@ -56,15 +54,15 @@ class VinExtractor(BaseExtractor):
"image/heif", "image/heif",
} }
# VIN character whitelist for Tesseract # VIN character whitelist (passed to engine for post-OCR filtering)
VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789" VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
# Fixed debug output directory (inside container) # Fixed debug output directory (inside container)
DEBUG_DIR = "/tmp/vin-debug" DEBUG_DIR = "/tmp/vin-debug"
def __init__(self) -> None: def __init__(self) -> None:
"""Initialize VIN extractor.""" """Initialize VIN extractor with engine from factory."""
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd self._engine = create_engine()
self._debug = settings.log_level.upper() == "DEBUG" self._debug = settings.log_level.upper() == "DEBUG"
def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None: def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
@@ -135,21 +133,21 @@ class VinExtractor(BaseExtractor):
# Perform OCR with VIN-optimized settings # Perform OCR with VIN-optimized settings
raw_text, word_confidences = self._perform_ocr(preprocessed_bytes) raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
logger.debug("PSM 6 raw text: '%s'", raw_text) logger.debug("Primary OCR raw text: '%s'", raw_text)
logger.debug("PSM 6 word confidences: %s", word_confidences) logger.debug("Primary OCR word confidences: %s", word_confidences)
# Extract VIN candidates from raw text # Extract VIN candidates from raw text
candidates = vin_validator.extract_candidates(raw_text) candidates = vin_validator.extract_candidates(raw_text)
logger.debug("PSM 6 candidates: %s", candidates) logger.debug("Primary OCR candidates: %s", candidates)
if not candidates: if not candidates:
# No VIN candidates found - try with different PSM modes # No VIN candidates found - try alternate OCR configurations
candidates = self._try_alternate_ocr(preprocessed_bytes) candidates = self._try_alternate_ocr(preprocessed_bytes)
if not candidates: if not candidates:
# Try grayscale-only (no thresholding) — the Tesseract # Try grayscale-only (no thresholding) — OCR engines often
# LSTM engine often performs better on non-binarized input # perform better on non-binarized input because they do
# because it does its own internal preprocessing. # their own internal preprocessing.
gray_result = vin_preprocessor.preprocess( gray_result = vin_preprocessor.preprocess(
image_bytes, apply_threshold=False image_bytes, apply_threshold=False
) )
@@ -166,9 +164,9 @@ class VinExtractor(BaseExtractor):
raw_text, word_confidences = self._perform_ocr( raw_text, word_confidences = self._perform_ocr(
gray_result.image_bytes gray_result.image_bytes
) )
logger.debug("Gray PSM 6 raw text: '%s'", raw_text) logger.debug("Gray primary raw text: '%s'", raw_text)
candidates = vin_validator.extract_candidates(raw_text) candidates = vin_validator.extract_candidates(raw_text)
logger.debug("Gray PSM 6 candidates: %s", candidates) logger.debug("Gray primary candidates: %s", candidates)
if not candidates: if not candidates:
candidates = self._try_alternate_ocr( candidates = self._try_alternate_ocr(
gray_result.image_bytes, prefix="Gray" gray_result.image_bytes, prefix="Gray"
@@ -188,9 +186,9 @@ class VinExtractor(BaseExtractor):
) )
raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes) raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
logger.debug("Otsu PSM 6 raw text: '%s'", raw_text) logger.debug("Otsu primary raw text: '%s'", raw_text)
candidates = vin_validator.extract_candidates(raw_text) candidates = vin_validator.extract_candidates(raw_text)
logger.debug("Otsu PSM 6 candidates: %s", candidates) logger.debug("Otsu primary candidates: %s", candidates)
if not candidates: if not candidates:
candidates = self._try_alternate_ocr( candidates = self._try_alternate_ocr(
otsu_result.image_bytes, prefix="Otsu" otsu_result.image_bytes, prefix="Otsu"
@@ -280,52 +278,31 @@ class VinExtractor(BaseExtractor):
return detected or "application/octet-stream" return detected or "application/octet-stream"
def _perform_ocr( def _perform_ocr(
self, image_bytes: bytes, psm: int = 6 self,
image_bytes: bytes,
single_line: bool = False,
single_word: bool = False,
) -> tuple[str, list[float]]: ) -> tuple[str, list[float]]:
""" """
Perform OCR with VIN-optimized settings. Perform OCR with VIN-optimized settings via engine abstraction.
Args: Args:
image_bytes: Preprocessed image bytes image_bytes: Preprocessed image bytes
psm: Tesseract page segmentation mode single_line: Treat image as a single text line
6 = Uniform block of text single_word: Treat image as a single word
7 = Single text line
8 = Single word
Returns: Returns:
Tuple of (raw_text, word_confidences) Tuple of (raw_text, word_confidences)
""" """
image = Image.open(io.BytesIO(image_bytes)) config = OcrConfig(
char_whitelist=self.VIN_WHITELIST,
# Configure Tesseract for VIN extraction single_line=single_line,
# OEM 1 = LSTM neural network engine (best accuracy) single_word=single_word,
# NOTE: tessedit_char_whitelist does NOT work with OEM 1 (LSTM). use_angle_cls=True,
# Using it causes empty/erratic output. Character filtering is
# handled post-OCR by vin_validator.correct_ocr_errors() instead.
config = (
f"--psm {psm} "
f"--oem 1 "
f"-c load_system_dawg=false "
f"-c load_freq_dawg=false"
) )
result = self._engine.recognize(image_bytes, config)
# Get detailed OCR data word_confidences = [wb.confidence for wb in result.word_boxes]
ocr_data = pytesseract.image_to_data( return result.text, word_confidences
image, config=config, output_type=pytesseract.Output.DICT
)
# Extract words and confidences
words = []
confidences = []
for i, text in enumerate(ocr_data["text"]):
conf = int(ocr_data["conf"][i])
if text.strip() and conf > 0:
words.append(text.strip())
confidences.append(conf / 100.0)
raw_text = " ".join(words)
return raw_text, confidences
def _try_alternate_ocr( def _try_alternate_ocr(
self, self,
@@ -335,21 +312,25 @@ class VinExtractor(BaseExtractor):
""" """
Try alternate OCR configurations when initial extraction fails. Try alternate OCR configurations when initial extraction fails.
PSM modes tried in order: Modes tried:
7 - Single text line single-line - Treat as a single text line
8 - Single word single-word - Treat as a single word
11 - Sparse text (finds text in any order, good for angled photos)
13 - Raw line (no Tesseract heuristics, good for clean VIN plates) PaddleOCR angle classification handles rotated/angled text
inherently, so no PSM mode fallbacks are needed.
Returns: Returns:
List of VIN candidates List of VIN candidates
""" """
tag = f"{prefix} " if prefix else "" tag = f"{prefix} " if prefix else ""
for psm in (7, 8, 11, 13): for mode_name, kwargs in [
raw_text, _ = self._perform_ocr(image_bytes, psm=psm) ("single-line", {"single_line": True}),
logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text) ("single-word", {"single_word": True}),
]:
raw_text, _ = self._perform_ocr(image_bytes, **kwargs)
logger.debug("%s%s raw text: '%s'", tag, mode_name, raw_text)
candidates = vin_validator.extract_candidates(raw_text) candidates = vin_validator.extract_candidates(raw_text)
logger.debug("%sPSM %d candidates: %s", tag, psm, candidates) logger.debug("%s%s candidates: %s", tag, mode_name, candidates)
if candidates: if candidates:
return candidates return candidates

View File

@@ -93,7 +93,7 @@ class VinPreprocessor:
gray = cv_image gray = cv_image
steps_applied.append("grayscale") steps_applied.append("grayscale")
# Upscale small images for better OCR (Tesseract needs ~300 DPI) # Upscale small images for better OCR (~300 DPI recommended)
gray = self._ensure_minimum_resolution(gray) gray = self._ensure_minimum_resolution(gray)
steps_applied.append("resolution_check") steps_applied.append("resolution_check")
@@ -129,14 +129,14 @@ class VinPreprocessor:
) )
# Minimum width in pixels for reliable VIN OCR. # Minimum width in pixels for reliable VIN OCR.
# A 17-char VIN needs ~30px per character for Tesseract accuracy. # A 17-char VIN needs ~30px per character for reliable OCR accuracy.
MIN_WIDTH_FOR_VIN = 600 MIN_WIDTH_FOR_VIN = 600
def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray: def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray:
""" """
Upscale image if too small for reliable OCR. Upscale image if too small for reliable OCR.
Tesseract works best at ~300 DPI. Mobile photos of VINs may have OCR works best at ~300 DPI. Mobile photos of VINs may have
the text occupy only a small portion of the frame, resulting in the text occupy only a small portion of the frame, resulting in
low effective resolution for the VIN characters. low effective resolution for the VIN characters.
""" """
@@ -160,7 +160,7 @@ class VinPreprocessor:
Colored backgrounds have a low min value (e.g. green sticker: Colored backgrounds have a low min value (e.g. green sticker:
min(130,230,150) = 130) → inverted to 125 (medium gray). min(130,230,150) = 130) → inverted to 125 (medium gray).
The inversion ensures Tesseract always receives dark-text-on- The inversion ensures the OCR engine always receives dark-text-on-
light-background, which is the polarity it expects. light-background, which is the polarity it expects.
""" """
b_channel, g_channel, r_channel = cv2.split(bgr_image) b_channel, g_channel, r_channel = cv2.split(bgr_image)
@@ -168,8 +168,8 @@ class VinPreprocessor:
min_channel = np.minimum(np.minimum(b_channel, g_channel), r_channel) min_channel = np.minimum(np.minimum(b_channel, g_channel), r_channel)
# Invert so white text (min=255) becomes black (0) and colored # Invert so white text (min=255) becomes black (0) and colored
# backgrounds (min~130) become lighter gray (~125). Tesseract # backgrounds (min~130) become lighter gray (~125). OCR engines
# expects dark text on light background. # expect dark text on light background.
inverted = cv2.bitwise_not(min_channel) inverted = cv2.bitwise_not(min_channel)
gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY) gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)

View File

@@ -1,15 +1,14 @@
"""Core OCR service using Tesseract with HEIC support.""" """Core OCR service with HEIC support, using pluggable engine abstraction."""
import io import io
import logging import logging
import time import time
from typing import Optional from typing import Optional
import magic import magic
import pytesseract
from PIL import Image from PIL import Image
from pillow_heif import register_heif_opener from pillow_heif import register_heif_opener
from app.config import settings from app.engines import OcrConfig, create_engine
from app.models import DocumentType, ExtractedField, OcrResponse from app.models import DocumentType, ExtractedField, OcrResponse
from app.services.preprocessor import preprocessor from app.services.preprocessor import preprocessor
@@ -32,8 +31,8 @@ class OcrService:
} }
def __init__(self) -> None: def __init__(self) -> None:
"""Initialize OCR service.""" """Initialize OCR service with engine from factory."""
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd self._engine = create_engine()
def extract( def extract(
self, self,
@@ -86,14 +85,11 @@ class OcrService:
file_bytes, deskew=True, denoise=True file_bytes, deskew=True, denoise=True
) )
# Perform OCR # Perform OCR via engine abstraction
image = Image.open(io.BytesIO(file_bytes)) config = OcrConfig()
ocr_data = pytesseract.image_to_data( result = self._engine.recognize(file_bytes, config)
image, output_type=pytesseract.Output.DICT raw_text = result.text
) confidence = result.confidence
# Extract text and calculate confidence
raw_text, confidence = self._process_ocr_data(ocr_data)
# Detect document type from content # Detect document type from content
document_type = self._detect_document_type(raw_text) document_type = self._detect_document_type(raw_text)
@@ -160,26 +156,6 @@ class OcrService:
return b"" return b""
def _process_ocr_data(
self, ocr_data: dict
) -> tuple[str, float]:
"""Process Tesseract output to extract text and confidence."""
words = []
confidences = []
for i, text in enumerate(ocr_data["text"]):
# Filter out empty strings and low-confidence results
conf = int(ocr_data["conf"][i])
if text.strip() and conf > 0:
words.append(text)
confidences.append(conf)
raw_text = " ".join(words)
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
# Normalize confidence to 0-1 range (Tesseract returns 0-100)
return raw_text, avg_confidence / 100.0
def _detect_document_type(self, text: str) -> DocumentType: def _detect_document_type(self, text: str) -> DocumentType:
"""Detect document type from extracted text content.""" """Detect document type from extracted text content."""
text_lower = text.lower() text_lower = text.lower()

View File

@@ -312,7 +312,7 @@ class TableDetector:
Returns: Returns:
2D list of cell contents 2D list of cell contents
""" """
# This would use Tesseract on the cropped region # This would use OCR on the cropped region
# For now, return empty - actual OCR will be done in manual_extractor # For now, return empty - actual OCR will be done in manual_extractor
logger.debug(f"Table region: ({table.x}, {table.y}) {table.width}x{table.height}") logger.debug(f"Table region: ({table.x}, {table.y}) {table.width}x{table.height}")
return [] return []

View File

@@ -226,7 +226,7 @@ class VinValidator:
Uses two strategies: Uses two strategies:
1. Find continuous 11-20 char alphanumeric runs (handles intact VINs) 1. Find continuous 11-20 char alphanumeric runs (handles intact VINs)
2. Concatenate adjacent short fragments separated by spaces/dashes 2. Concatenate adjacent short fragments separated by spaces/dashes
(handles Tesseract fragmenting VINs into multiple words) (handles OCR fragmenting VINs into multiple words)
Args: Args:
text: Raw OCR text text: Raw OCR text

View File

@@ -14,7 +14,9 @@ opencv-python-headless>=4.8.0
numpy>=1.24.0 numpy>=1.24.0
# OCR Engines # OCR Engines
pytesseract>=0.3.10 paddlepaddle>=2.6.0
paddleocr>=2.8.0
google-cloud-vision>=3.7.0
# PDF Processing # PDF Processing
PyMuPDF>=1.23.0 PyMuPDF>=1.23.0

View File

@@ -0,0 +1,626 @@
"""Tests for OCR engine abstraction layer.
Covers: base types, exception hierarchy, PaddleOcrEngine,
CloudEngine, HybridEngine, and engine_factory.
"""
import io
from unittest.mock import MagicMock, patch
import pytest
from PIL import Image
from app.engines.base_engine import (
EngineError,
EngineProcessingError,
EngineUnavailableError,
OcrConfig,
OcrEngine,
OcrEngineResult,
WordBox,
)
# --- Helpers ---
def _create_test_image_bytes() -> bytes:
    """Return the PNG encoding of a blank white 100x50 RGB image."""
    buffer = io.BytesIO()
    Image.new("RGB", (100, 50), (255, 255, 255)).save(buffer, format="PNG")
    return buffer.getvalue()
def _make_result(
    text: str, confidence: float, engine_name: str
) -> OcrEngineResult:
    """Build an OcrEngineResult carrying no word boxes, for use in tests."""
    fields = {
        "text": text,
        "confidence": confidence,
        "word_boxes": [],
        "engine_name": engine_name,
    }
    return OcrEngineResult(**fields)
def _mock_paddle_result(
dt_polys: list, rec_texts: list[str], rec_scores: list[float]
) -> MagicMock:
"""Create a mock PaddleOCR v3.x predict() result object.
Wraps data under ``"res"`` key to match save_to_json format.
"""
result = MagicMock()
result.json = {
"res": {
"dt_polys": dt_polys,
"rec_texts": rec_texts,
"rec_scores": rec_scores,
}
}
return result
# ---------------------------------------------------------------------------
# Exception hierarchy
# ---------------------------------------------------------------------------
class TestExceptionHierarchy:
    """All engine exceptions derive from the shared EngineError base."""

    def test_unavailable_is_engine_error(self) -> None:
        """EngineUnavailableError is a kind of EngineError."""
        is_sub = issubclass(EngineUnavailableError, EngineError)
        assert is_sub

    def test_processing_is_engine_error(self) -> None:
        """EngineProcessingError is a kind of EngineError."""
        is_sub = issubclass(EngineProcessingError, EngineError)
        assert is_sub

    def test_engine_error_is_exception(self) -> None:
        """The base error itself is an ordinary Exception."""
        is_sub = issubclass(EngineError, Exception)
        assert is_sub

    def test_catch_base_catches_subtypes(self) -> None:
        """Catching EngineError also catches both concrete subtypes."""
        raised_cases = (
            (EngineUnavailableError, "not installed"),
            (EngineProcessingError, "OCR failed"),
        )
        for error_type, message in raised_cases:
            with pytest.raises(EngineError):
                raise error_type(message)
# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------
class TestWordBox:
    """WordBox geometry defaults and explicit field assignment."""

    def test_default_positions(self) -> None:
        """Omitted geometry fields default to zero."""
        box = WordBox(text="VIN", confidence=0.95)
        for value in (box.x, box.y, box.width, box.height):
            assert value == 0

    def test_all_fields(self) -> None:
        """Explicitly supplied fields are stored unchanged."""
        box = WordBox(text="ABC", confidence=0.88, x=10, y=20, width=100, height=30)
        observed = (box.text, box.confidence, box.x, box.width)
        assert observed == ("ABC", 0.88, 10, 100)
class TestOcrConfig:
    """OcrConfig default values and per-instance isolation."""

    def test_defaults(self) -> None:
        """A bare OcrConfig has no whitelist, no line/word mode, angle cls on."""
        cfg = OcrConfig()
        assert cfg.char_whitelist is None
        assert cfg.single_line is False
        assert cfg.single_word is False
        assert cfg.use_angle_cls is True
        assert cfg.hints == {}

    def test_vin_whitelist_excludes_ioq(self) -> None:
        """The VIN alphabet stored on the config lacks I, O and Q."""
        vin_alphabet = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
        cfg = OcrConfig(char_whitelist=vin_alphabet)
        for banned in ("I", "O", "Q"):
            assert banned not in cfg.char_whitelist

    def test_hints_are_independent_across_instances(self) -> None:
        """Mutating one instance's hints must not leak into another."""
        first = OcrConfig()
        second = OcrConfig()
        first.hints["psm"] = 7
        assert "psm" not in second.hints
class TestOcrEngineResult:
    """OcrEngineResult stores exactly what it is constructed with."""

    def test_construction(self) -> None:
        """A populated result exposes text, confidence, boxes and engine name."""
        vin = "1HGBH41JXMN109186"
        boxes = [WordBox(text=vin, confidence=0.94)]
        outcome = OcrEngineResult(
            text=vin,
            confidence=0.94,
            word_boxes=boxes,
            engine_name="paddleocr",
        )
        assert outcome.text == "1HGBH41JXMN109186"
        assert outcome.confidence == 0.94
        assert len(outcome.word_boxes) == 1
        assert outcome.engine_name == "paddleocr"

    def test_empty_result(self) -> None:
        """An empty result keeps empty text and an empty box list."""
        outcome = OcrEngineResult(
            text="", confidence=0.0, word_boxes=[], engine_name="paddleocr"
        )
        assert outcome.text == ""
        assert outcome.word_boxes == []
# ---------------------------------------------------------------------------
# OcrEngine ABC
# ---------------------------------------------------------------------------
class TestOcrEngineABC:
    """OcrEngine is abstract; a subclass providing both members is usable."""

    def test_cannot_instantiate_directly(self) -> None:
        """Instantiating the abstract base raises TypeError."""
        with pytest.raises(TypeError):
            OcrEngine()  # type: ignore[abstract]

    def test_concrete_subclass_works(self) -> None:
        """A subclass defining ``name`` and ``recognize`` can be instantiated and used."""

        class StubEngine(OcrEngine):
            @property
            def name(self) -> str:
                return "stub"

            def recognize(
                self, image_bytes: bytes, config: OcrConfig
            ) -> OcrEngineResult:
                return OcrEngineResult(
                    text="ok", confidence=1.0, word_boxes=[], engine_name="stub"
                )

        stub = StubEngine()
        assert stub.name == "stub"
        outcome = stub.recognize(b"", OcrConfig())
        assert outcome.text == "ok"
# ---------------------------------------------------------------------------
# PaddleOcrEngine
# ---------------------------------------------------------------------------
class TestPaddleOcrEngine:
    """Unit tests for ``PaddleOcrEngine`` driven by a mocked PaddleOCR backend."""

    @staticmethod
    def _engine_with_prediction(**fields):
        """Build an engine whose mocked backend yields a single prediction.

        ``fields`` are forwarded unchanged to ``_mock_paddle_result``; the
        resulting object is the only item produced by ``predict``.
        """
        from app.engines.paddle_engine import PaddleOcrEngine

        engine = PaddleOcrEngine()
        backend = MagicMock()
        backend.predict.return_value = iter([_mock_paddle_result(**fields)])
        engine._ocr = backend
        return engine

    def test_name(self) -> None:
        """The engine reports the name ``"paddleocr"``."""
        from app.engines.paddle_engine import PaddleOcrEngine

        assert PaddleOcrEngine().name == "paddleocr"

    def test_lazy_init_not_loaded_at_construction(self) -> None:
        """A freshly constructed engine has no backend attached yet."""
        from app.engines.paddle_engine import PaddleOcrEngine

        fresh = PaddleOcrEngine()
        assert fresh._ocr is None

    def test_recognize_empty_results(self) -> None:
        """A prediction without detections yields an empty, zero-confidence result."""
        engine = self._engine_with_prediction(dt_polys=[], rec_texts=[], rec_scores=[])

        outcome = engine.recognize(_create_test_image_bytes(), OcrConfig())

        assert outcome.text == ""
        assert outcome.confidence == 0.0
        assert outcome.word_boxes == []
        assert outcome.engine_name == "paddleocr"

    def test_recognize_with_results(self) -> None:
        """Two detections are joined with a space and their scores averaged."""
        engine = self._engine_with_prediction(
            dt_polys=[
                [[10, 20], [110, 20], [110, 50], [10, 50]],
                [[10, 60], [110, 60], [110, 90], [10, 90]],
            ],
            rec_texts=["HELLO", "WORLD"],
            rec_scores=[0.95, 0.88],
        )

        outcome = engine.recognize(_create_test_image_bytes(), OcrConfig())

        assert outcome.text == "HELLO WORLD"
        assert abs(outcome.confidence - 0.915) < 0.01
        assert len(outcome.word_boxes) == 2
        first_box, second_box = outcome.word_boxes
        assert first_box.text == "HELLO"
        assert first_box.confidence == 0.95
        assert second_box.text == "WORLD"
        assert outcome.engine_name == "paddleocr"

    def test_recognize_whitelist_filters_characters(self) -> None:
        """Characters outside ``char_whitelist`` are removed from the recognized text."""
        engine = self._engine_with_prediction(
            dt_polys=[[[0, 0], [100, 0], [100, 30], [0, 30]]],
            rec_texts=["1HG-BH4!"],
            rec_scores=[0.9],
        )
        vin_config = OcrConfig(char_whitelist="ABCDEFGHJKLMNPRSTUVWXYZ0123456789")

        outcome = engine.recognize(_create_test_image_bytes(), vin_config)

        for rejected in ("-", "!"):
            assert rejected not in outcome.text
        assert outcome.word_boxes[0].text == "1HGBH4"

    def test_recognize_quadrilateral_to_bounding_box(self) -> None:
        """A detection quadrilateral is reduced to its axis-aligned bounding box."""
        # Slightly rotated quad: min x=8, min y=20, max x=110, max y=55
        engine = self._engine_with_prediction(
            dt_polys=[[[10, 20], [110, 25], [108, 55], [8, 50]]],
            rec_texts=["TEXT"],
            rec_scores=[0.9],
        )

        outcome = engine.recognize(_create_test_image_bytes(), OcrConfig())
        box = outcome.word_boxes[0]

        assert box.x == 8
        assert box.y == 20
        assert box.width == 102  # 110 - 8
        assert box.height == 35  # 55 - 20

    def test_recognize_skips_empty_after_whitelist(self) -> None:
        """Text consisting only of non-whitelisted characters is skipped."""
        engine = self._engine_with_prediction(
            dt_polys=[[[0, 0], [50, 0], [50, 20], [0, 20]]],
            rec_texts=["---"],
            rec_scores=[0.9],
        )

        outcome = engine.recognize(_create_test_image_bytes(), OcrConfig(char_whitelist="ABC"))

        assert outcome.text == ""
        assert outcome.word_boxes == []
        assert outcome.confidence == 0.0

    def test_import_error_raises_unavailable(self) -> None:
        """``_get_ocr`` raises ``EngineUnavailableError`` when ``paddleocr`` cannot be imported."""
        from app.engines.paddle_engine import PaddleOcrEngine

        engine = PaddleOcrEngine()
        engine._ocr = None

        hide_module = patch.dict("sys.modules", {"paddleocr": None})
        break_import_module = patch(
            "app.engines.paddle_engine.importlib.import_module",
            side_effect=ImportError("No module"),
        )
        with hide_module, break_import_module:
            # Force re-import by removing cached paddleocr; grab the genuine
            # import function before ``builtins.__import__`` is replaced below.
            real_import = getattr(__builtins__, "__import__", __import__)

            def failing_import(name, *args, **kwargs):
                if name != "paddleocr":
                    return real_import(name, *args, **kwargs)
                raise ImportError("No module named 'paddleocr'")

            with patch("builtins.__import__", side_effect=failing_import):
                with pytest.raises(EngineUnavailableError, match="paddleocr"):
                    engine._get_ocr()

    def test_processing_error_on_exception(self) -> None:
        """A backend exception surfaces as ``EngineProcessingError``."""
        from app.engines.paddle_engine import PaddleOcrEngine

        engine = PaddleOcrEngine()
        crashing_backend = MagicMock()
        crashing_backend.predict.side_effect = RuntimeError("OCR crashed")
        engine._ocr = crashing_backend

        with pytest.raises(EngineProcessingError, match="PaddleOCR recognition failed"):
            engine.recognize(_create_test_image_bytes(), OcrConfig())
# ---------------------------------------------------------------------------
# CloudEngine
# ---------------------------------------------------------------------------
class TestCloudEngine:
    """Unit tests for ``CloudEngine`` (Google Vision) using mocked clients and modules."""

    def test_name(self) -> None:
        """The engine reports the name ``"google_vision"``."""
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/path.json")
        assert engine.name == "google_vision"

    def test_lazy_init_not_loaded_at_construction(self) -> None:
        """Constructing the engine does not create a client."""
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/path.json")
        assert engine._client is None

    def test_missing_key_file_raises_unavailable(self) -> None:
        """A non-existent key path makes ``_get_client`` raise ``EngineUnavailableError``."""
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/nonexistent/key.json")
        with pytest.raises(EngineUnavailableError, match="key not found"):
            engine._get_client()

    @patch("os.path.isfile", return_value=True)
    def test_missing_library_raises_unavailable(self, _mock_isfile: MagicMock) -> None:
        """With the key file "present", a failing ``google.cloud`` import is reported as unavailable."""
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/key.json")

        # Capture the genuine import function BEFORE patching. Looking up
        # ``__import__`` inside ``mock_import`` would resolve to the patched
        # mock, which calls ``mock_import`` again and recurses without bound
        # for every module other than google.cloud.
        real_import = __import__

        def mock_import(name, *args, **kwargs):
            if "google.cloud" in name:
                raise ImportError("No module named 'google.cloud'")
            return real_import(name, *args, **kwargs)

        with patch("builtins.__import__", side_effect=mock_import):
            with pytest.raises(EngineUnavailableError, match="google-cloud-vision"):
                engine._get_client()

    def test_recognize_empty_annotations(self) -> None:
        """A response without text annotations yields an empty, zero-confidence result."""
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/key.json")
        mock_client = MagicMock()
        mock_response = MagicMock()
        mock_response.error.message = ""
        mock_response.text_annotations = []
        mock_client.text_detection.return_value = mock_response
        engine._client = mock_client

        # Mock the google.cloud.vision import inside recognize()
        mock_vision = MagicMock()
        fake_modules = {
            "google.cloud.vision": mock_vision,
            "google.cloud": MagicMock(),
            "google": MagicMock(),
        }
        with patch.dict("sys.modules", fake_modules):
            result = engine.recognize(b"fake_image", OcrConfig())

        assert result.text == ""
        assert result.confidence == 0.0
        assert result.engine_name == "google_vision"

    def test_recognize_api_error_raises_processing_error(self) -> None:
        """A non-empty ``response.error.message`` is raised as ``EngineProcessingError``."""
        from app.engines.cloud_engine import CloudEngine

        engine = CloudEngine(key_path="/fake/key.json")
        mock_client = MagicMock()
        mock_response = MagicMock()
        mock_response.error.message = "API quota exceeded"
        mock_client.text_detection.return_value = mock_response
        engine._client = mock_client

        mock_vision = MagicMock()
        fake_modules = {
            "google.cloud.vision": mock_vision,
            "google.cloud": MagicMock(),
            "google": MagicMock(),
        }
        with patch.dict("sys.modules", fake_modules):
            with pytest.raises(EngineProcessingError, match="API quota exceeded"):
                engine.recognize(b"fake_image", OcrConfig())
# ---------------------------------------------------------------------------
# HybridEngine
# ---------------------------------------------------------------------------
class TestHybridEngine:
    """Unit tests for ``HybridEngine`` primary/fallback selection using mocked engines."""

    @staticmethod
    def _named_engine(label: str) -> MagicMock:
        """Return a ``MagicMock`` specced on ``OcrEngine`` whose ``name`` is ``label``."""
        stub = MagicMock(spec=OcrEngine)
        stub.name = label
        return stub

    def test_name_with_fallback(self) -> None:
        """The hybrid name combines the primary and fallback engine names."""
        from app.engines.hybrid_engine import HybridEngine

        hybrid = HybridEngine(
            primary=self._named_engine("paddleocr"),
            fallback=self._named_engine("google_vision"),
        )
        assert hybrid.name == "hybrid(paddleocr+google_vision)"

    def test_name_without_fallback(self) -> None:
        """Without a fallback the name ends in ``+none``."""
        from app.engines.hybrid_engine import HybridEngine

        hybrid = HybridEngine(primary=self._named_engine("paddleocr"))
        assert hybrid.name == "hybrid(paddleocr+none)"

    def test_high_confidence_skips_fallback(self) -> None:
        """A primary result above the threshold is returned without calling the fallback."""
        from app.engines.hybrid_engine import HybridEngine

        main_engine = self._named_engine("paddleocr")
        backup_engine = self._named_engine("cloud")
        main_engine.recognize.return_value = _make_result("VIN123", 0.95, "paddleocr")

        hybrid = HybridEngine(primary=main_engine, fallback=backup_engine, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN123"
        assert outcome.engine_name == "paddleocr"
        backup_engine.recognize.assert_not_called()

    def test_low_confidence_triggers_fallback(self) -> None:
        """A primary result below the threshold defers to a more confident fallback."""
        from app.engines.hybrid_engine import HybridEngine

        main_engine = self._named_engine("paddleocr")
        backup_engine = self._named_engine("google_vision")
        main_engine.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
        backup_engine.recognize.return_value = _make_result("VIN456", 0.92, "google_vision")

        hybrid = HybridEngine(primary=main_engine, fallback=backup_engine, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN456"
        assert outcome.engine_name == "google_vision"
        backup_engine.recognize.assert_called_once()

    def test_low_confidence_no_fallback_returns_primary(self) -> None:
        """With no fallback configured, a low-confidence primary result is still returned."""
        from app.engines.hybrid_engine import HybridEngine

        main_engine = self._named_engine("paddleocr")
        main_engine.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")

        hybrid = HybridEngine(primary=main_engine, fallback=None, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN123"

    def test_fallback_lower_confidence_returns_primary(self) -> None:
        """A fallback result less confident than the primary is discarded."""
        from app.engines.hybrid_engine import HybridEngine

        main_engine = self._named_engine("paddleocr")
        backup_engine = self._named_engine("google_vision")
        main_engine.recognize.return_value = _make_result("VIN123", 0.4, "paddleocr")
        backup_engine.recognize.return_value = _make_result("VIN456", 0.3, "google_vision")

        hybrid = HybridEngine(primary=main_engine, fallback=backup_engine, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN123"

    def test_fallback_engine_error_returns_primary(self) -> None:
        """An ``EngineUnavailableError`` from the fallback leaves the primary result in place."""
        from app.engines.hybrid_engine import HybridEngine

        main_engine = self._named_engine("paddleocr")
        backup_engine = self._named_engine("google_vision")
        main_engine.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
        backup_engine.recognize.side_effect = EngineUnavailableError("key missing")

        hybrid = HybridEngine(primary=main_engine, fallback=backup_engine, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN123"

    def test_fallback_unexpected_error_returns_primary(self) -> None:
        """An unexpected fallback exception leaves the primary result in place."""
        from app.engines.hybrid_engine import HybridEngine

        main_engine = self._named_engine("paddleocr")
        backup_engine = self._named_engine("google_vision")
        main_engine.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
        backup_engine.recognize.side_effect = RuntimeError("network error")

        hybrid = HybridEngine(primary=main_engine, fallback=backup_engine, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN123"

    @patch("app.engines.hybrid_engine.time")
    def test_fallback_timeout_returns_primary(self, mock_time: MagicMock) -> None:
        """A fallback that takes too long is ignored in favour of the primary result."""
        from app.engines.hybrid_engine import HybridEngine

        main_engine = self._named_engine("paddleocr")
        backup_engine = self._named_engine("google_vision")
        main_engine.recognize.return_value = _make_result("VIN123", 0.3, "paddleocr")
        backup_engine.recognize.return_value = _make_result("VIN456", 0.92, "google_vision")
        # Simulate 6-second delay (exceeds 5s limit)
        mock_time.monotonic.side_effect = [0.0, 6.0]

        hybrid = HybridEngine(primary=main_engine, fallback=backup_engine, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.text == "VIN123"  # timeout -> use primary

    def test_exact_threshold_skips_fallback(self) -> None:
        """When confidence == threshold, no fallback needed (>= check)."""
        from app.engines.hybrid_engine import HybridEngine

        main_engine = self._named_engine("paddleocr")
        backup_engine = self._named_engine("cloud")
        main_engine.recognize.return_value = _make_result("VIN", 0.6, "paddleocr")

        hybrid = HybridEngine(primary=main_engine, fallback=backup_engine, threshold=0.6)
        outcome = hybrid.recognize(b"img", OcrConfig())

        assert outcome.engine_name == "paddleocr"
        backup_engine.recognize.assert_not_called()
# ---------------------------------------------------------------------------
# Engine factory
# ---------------------------------------------------------------------------
class TestEngineFactory:
    """Unit tests for the engine factory with settings and engine construction mocked."""

    def test_unknown_engine_raises(self) -> None:
        """An unrecognized engine name raises ``EngineUnavailableError``."""
        from app.engines.engine_factory import _create_single_engine

        with pytest.raises(EngineUnavailableError, match="Unknown engine"):
            _create_single_engine("nonexistent")

    @patch("app.engines.engine_factory.settings")
    @patch("app.engines.engine_factory._create_single_engine")
    def test_defaults_to_settings_primary(
        self, mock_create: MagicMock, mock_settings: MagicMock
    ) -> None:
        """Without an explicit name, the primary engine from settings is created."""
        from app.engines.engine_factory import create_engine

        mock_settings.ocr_primary_engine = "paddleocr"
        mock_settings.ocr_fallback_engine = "none"
        built_engine = MagicMock(spec=OcrEngine)
        mock_create.return_value = built_engine

        produced = create_engine()

        mock_create.assert_called_once_with("paddleocr")
        assert produced == built_engine

    @patch("app.engines.engine_factory.settings")
    @patch("app.engines.engine_factory._create_single_engine")
    def test_explicit_name_overrides_settings(
        self, mock_create: MagicMock, mock_settings: MagicMock
    ) -> None:
        """An explicit engine name is the one passed to the single-engine constructor."""
        from app.engines.engine_factory import create_engine

        mock_settings.ocr_fallback_engine = "none"
        mock_create.return_value = MagicMock(spec=OcrEngine)

        create_engine("google_vision")

        mock_create.assert_called_once_with("google_vision")

    @patch("app.engines.engine_factory.settings")
    @patch("app.engines.engine_factory._create_single_engine")
    def test_creates_hybrid_when_fallback_configured(
        self, mock_create: MagicMock, mock_settings: MagicMock
    ) -> None:
        """A configured fallback engine makes the factory return a ``HybridEngine``."""
        from app.engines.engine_factory import create_engine
        from app.engines.hybrid_engine import HybridEngine

        mock_settings.ocr_primary_engine = "paddleocr"
        mock_settings.ocr_fallback_engine = "google_vision"
        mock_settings.ocr_fallback_threshold = 0.7
        primary_stub = MagicMock(spec=OcrEngine)
        fallback_stub = MagicMock(spec=OcrEngine)
        # First construction call yields the primary, second the fallback.
        mock_create.side_effect = [primary_stub, fallback_stub]

        produced = create_engine()

        assert isinstance(produced, HybridEngine)

    @patch("app.engines.engine_factory.settings")
    @patch("app.engines.engine_factory._create_single_engine")
    def test_fallback_failure_returns_primary_only(
        self, mock_create: MagicMock, mock_settings: MagicMock
    ) -> None:
        """If creating the fallback fails, the bare primary engine is returned."""
        from app.engines.engine_factory import create_engine

        mock_settings.ocr_primary_engine = "paddleocr"
        mock_settings.ocr_fallback_engine = "google_vision"
        mock_settings.ocr_fallback_threshold = 0.6
        primary_stub = MagicMock(spec=OcrEngine)
        # Second construction call (the fallback) raises.
        mock_create.side_effect = [primary_stub, EngineUnavailableError("no key")]

        produced = create_engine()

        assert produced == primary_stub

View File

@@ -39,14 +39,9 @@ def test_pillow_heif_can_register():
assert "HEIF" in Image.registered_extensions().values() assert "HEIF" in Image.registered_extensions().values()
def test_tesseract_available(): def test_paddleocr_engine_available():
"""Tesseract OCR is available and can process images.""" """PaddleOCR engine can be created."""
import pytesseract from app.engines.paddle_engine import PaddleOcrEngine
# Create a simple test image with text engine = PaddleOcrEngine()
img = Image.new("RGB", (200, 50), color="white") assert engine.name == "paddleocr"
# Verify pytesseract can call tesseract (will return empty string for blank image)
result = pytesseract.image_to_string(img)
# Just verify it doesn't raise an exception - blank image returns empty/whitespace
assert isinstance(result, str)

View File

@@ -1,11 +1,12 @@
"""Integration tests for VIN extraction endpoint.""" """Integration tests for VIN extraction endpoint and engine integration."""
import io import io
from unittest.mock import patch, MagicMock from unittest.mock import patch, MagicMock
import pytest import pytest
from fastapi.testclient import TestClient from fastapi.testclient import TestClient
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw
from app.engines.base_engine import OcrConfig, OcrEngineResult, WordBox
from app.main import app from app.main import app
@@ -240,3 +241,106 @@ class TestVinExtractionContentTypes:
) )
assert response.status_code == 200 assert response.status_code == 200
# ---------------------------------------------------------------------------
# VIN extractor engine integration tests
# ---------------------------------------------------------------------------
class TestVinExtractorEngineIntegration:
    """Tests verifying VinExtractor integrates correctly with engine abstraction."""

    @staticmethod
    def _engine_returning(result: OcrEngineResult) -> MagicMock:
        """Return a mock engine whose ``recognize`` call yields ``result``."""
        stub = MagicMock()
        stub.recognize.return_value = result
        return stub

    @patch("app.extractors.vin_extractor.create_engine")
    def test_perform_ocr_calls_engine_with_vin_config(
        self, mock_create_engine: MagicMock
    ) -> None:
        """_perform_ocr passes VIN whitelist and angle_cls to engine."""
        from app.extractors.vin_extractor import VinExtractor

        engine_stub = self._engine_returning(
            OcrEngineResult(
                text="1HGBH41JXMN109186",
                confidence=0.94,
                word_boxes=[WordBox(text="1HGBH41JXMN109186", confidence=0.94)],
                engine_name="paddleocr",
            )
        )
        mock_create_engine.return_value = engine_stub

        extractor = VinExtractor()
        text, confidences = extractor._perform_ocr(b"fake_image")

        engine_stub.recognize.assert_called_once()
        # The config is the second positional argument of recognize().
        positional_args = engine_stub.recognize.call_args[0]
        passed_config = positional_args[1]
        assert isinstance(passed_config, OcrConfig)
        assert passed_config.char_whitelist == VinExtractor.VIN_WHITELIST
        assert passed_config.use_angle_cls is True
        assert passed_config.single_line is False
        assert passed_config.single_word is False
        assert text == "1HGBH41JXMN109186"
        assert confidences == [0.94]

    @patch("app.extractors.vin_extractor.create_engine")
    def test_perform_ocr_single_line_mode(
        self, mock_create_engine: MagicMock
    ) -> None:
        """_perform_ocr passes single_line flag to engine config."""
        from app.extractors.vin_extractor import VinExtractor

        engine_stub = self._engine_returning(
            OcrEngineResult(
                text="VIN123", confidence=0.9, word_boxes=[], engine_name="paddleocr"
            )
        )
        mock_create_engine.return_value = engine_stub

        VinExtractor()._perform_ocr(b"img", single_line=True)

        passed_config = engine_stub.recognize.call_args[0][1]
        assert passed_config.single_line is True
        assert passed_config.single_word is False

    @patch("app.extractors.vin_extractor.create_engine")
    def test_perform_ocr_single_word_mode(
        self, mock_create_engine: MagicMock
    ) -> None:
        """_perform_ocr passes single_word flag to engine config."""
        from app.extractors.vin_extractor import VinExtractor

        engine_stub = self._engine_returning(
            OcrEngineResult(
                text="VIN123", confidence=0.9, word_boxes=[], engine_name="paddleocr"
            )
        )
        mock_create_engine.return_value = engine_stub

        VinExtractor()._perform_ocr(b"img", single_word=True)

        passed_config = engine_stub.recognize.call_args[0][1]
        assert passed_config.single_word is True
        assert passed_config.single_line is False

    def test_calculate_base_confidence_empty_returns_default(self) -> None:
        """Empty word confidences return 0.5 default."""
        from app.extractors.vin_extractor import VinExtractor

        # __new__ bypasses __init__, so no engine is created for this pure calculation.
        bare_extractor = VinExtractor.__new__(VinExtractor)
        assert bare_extractor._calculate_base_confidence([]) == 0.5

    def test_calculate_base_confidence_weighted_blend(self) -> None:
        """Confidence = 70% average + 30% minimum."""
        from app.extractors.vin_extractor import VinExtractor

        bare_extractor = VinExtractor.__new__(VinExtractor)
        # avg = (0.9 + 0.8) / 2 = 0.85, min = 0.8
        # result = 0.7 * 0.85 + 0.3 * 0.8 = 0.595 + 0.24 = 0.835
        blended = bare_extractor._calculate_base_confidence([0.9, 0.8])
        assert abs(blended - 0.835) < 0.001

    def test_calculate_base_confidence_single_value(self) -> None:
        """Single confidence value: avg == min, so result equals that value."""
        from app.extractors.vin_extractor import VinExtractor

        bare_extractor = VinExtractor.__new__(VinExtractor)
        blended = bare_extractor._calculate_base_confidence([0.92])
        assert abs(blended - 0.92) < 0.001

View File

@@ -165,7 +165,7 @@ class TestVinValidator:
"""Test candidate extraction handles space-fragmented VINs from OCR.""" """Test candidate extraction handles space-fragmented VINs from OCR."""
validator = VinValidator() validator = VinValidator()
# Tesseract often fragments VINs into multiple words # OCR engines sometimes fragment VINs into multiple words
text = "1HGBH 41JXMN 109186" text = "1HGBH 41JXMN 109186"
candidates = validator.extract_candidates(text) candidates = validator.extract_candidates(text)

View File

@@ -0,0 +1,18 @@
{
"_comment": "Google Vision API service account key for OCR cloud fallback",
"_instructions": [
"1. Create a Google Cloud service account with Vision API access",
"2. Download the JSON key file",
"3. Save it as secrets/app/google-vision-key.json (gitignored)",
"4. Uncomment the volume mount in docker-compose.yml",
"5. Set OCR_FALLBACK_ENGINE=google_vision"
],
"type": "service_account",
"project_id": "your-project-id",
"private_key_id": "",
"private_key": "",
"client_email": "your-sa@your-project-id.iam.gserviceaccount.com",
"client_id": "",
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
"token_uri": "https://oauth2.googleapis.com/token"
}