feat: add OCR service container (refs #64)
Some checks failed
Deploy to Staging / Build Images (pull_request) Successful in 7m41s
Deploy to Staging / Deploy to Staging (pull_request) Failing after 13s
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 8s

Add Python-based OCR service container (mvp-ocr) as the 6th service:
- Python 3.11-slim with FastAPI/uvicorn
- Tesseract OCR with English language pack
- pillow-heif for HEIC image support
- opencv-python-headless for image preprocessing
- Health endpoint at /health
- Unit tests for health, HEIC support, and Tesseract availability

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-01 13:06:16 -06:00
parent e3a482e00f
commit 1ba491144b
8 changed files with 156 additions and 0 deletions

View File

@@ -164,6 +164,24 @@ services:
- "traefik.http.services.mvp-backend.loadbalancer.healthcheck.timeout=10s" - "traefik.http.services.mvp-backend.loadbalancer.healthcheck.timeout=10s"
- "traefik.http.services.mvp-backend.loadbalancer.passhostheader=true" - "traefik.http.services.mvp-backend.loadbalancer.passhostheader=true"
# Application Services - OCR Processing
# OCR service container, built locally from the ./ocr directory.
mvp-ocr:
  build:
    context: ./ocr
    dockerfile: Dockerfile
  container_name: mvp-ocr
  restart: unless-stopped
  environment:
    # Consumed by the service's Settings class (LOG_LEVEL, default "info").
    LOG_LEVEL: info
  networks:
    - backend
  healthcheck:
    # Probes the FastAPI /health route on port 8000 from inside the
    # container; this relies on curl being installed in the image.
    test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
    interval: 30s
    timeout: 10s
    retries: 3
    # Grace period before failed probes count against the retry budget.
    start_period: 30s
# Database Services - Application PostgreSQL # Database Services - Application PostgreSQL
mvp-postgres: mvp-postgres:
image: ${REGISTRY_MIRRORS:-git.motovaultpro.com/egullickson/mirrors}/postgres:18-alpine image: ${REGISTRY_MIRRORS:-git.motovaultpro.com/egullickson/mirrors}/postgres:18-alpine

23
ocr/Dockerfile Normal file
View File

@@ -0,0 +1,23 @@
FROM python:3.11-slim

# Flush stdout/stderr immediately so container logs are not delayed.
ENV PYTHONUNBUFFERED=1

# System dependencies:
#   tesseract-ocr / tesseract-ocr-eng - OCR engine and English language data
#   libheif1 / libheif-dev            - HEIC/HEIF decoding support
#   libmagic1                         - backing library for python-magic
#   curl                              - used by the container health check
#   libglib2.0-0, libtesseract-dev    - NOTE(review): presumably runtime/build
#                                       requirements of OpenCV and Tesseract
#                                       bindings; confirm they are needed
RUN apt-get update && apt-get install -y --no-install-recommends \
        tesseract-ocr \
        tesseract-ocr-eng \
        libtesseract-dev \
        libheif1 \
        libheif-dev \
        libglib2.0-0 \
        libmagic1 \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies. requirements.txt is copied on its own first so the
# pip layer is reused until the dependency list changes.
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Run the service as an unprivileged user instead of root. Port 8000 is not
# privileged, so the server does not need root to bind it.
RUN useradd --system --create-home --shell /usr/sbin/nologin ocr \
    && chown ocr:ocr /app
COPY --chown=ocr:ocr . .
USER ocr

EXPOSE 8000

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

1
ocr/app/__init__.py Normal file
View File

@@ -0,0 +1 @@
# OCR Service Application

15
ocr/app/config.py Normal file
View File

@@ -0,0 +1,15 @@
"""OCR Service Configuration."""
import os
class Settings:
    """Runtime configuration for the OCR service.

    Every attribute is read from an environment variable once, at
    construction time, and falls back to a built-in default when the
    variable is unset.
    """

    def __init__(self) -> None:
        env = os.environ

        # LOG_LEVEL, default "info".
        self.log_level: str = env.get("LOG_LEVEL", "info")
        # HOST, default "0.0.0.0".
        self.host: str = env.get("HOST", "0.0.0.0")
        # PORT, default 8000; a non-numeric value raises ValueError.
        raw_port = env.get("PORT", "8000")
        self.port: int = int(raw_port)
        # TESSERACT_CMD, default "/usr/bin/tesseract".
        self.tesseract_cmd: str = env.get("TESSERACT_CMD", "/usr/bin/tesseract")


# Module-level instance that other modules import.
settings = Settings()

26
ocr/app/main.py Normal file
View File

@@ -0,0 +1,26 @@
"""OCR Service FastAPI Application."""
from fastapi import FastAPI
from app.config import settings
app = FastAPI(
title="MotoVaultPro OCR Service",
description="OCR processing service for vehicle documents",
version="1.0.0",
)
@app.get("/health")
async def health_check() -> dict:
    """Report liveness for the container health probe.

    Returns:
        A mapping with a single ``status`` key set to ``"healthy"``.
    """
    payload = {"status": "healthy"}
    return payload
@app.get("/")
async def root() -> dict:
    """Describe the running service.

    Returns:
        A mapping with the service name, the application version and the
        configured log level.
    """
    return {
        "service": "mvp-ocr",
        # Read the version from the FastAPI application object so it cannot
        # drift from the value passed to the ``FastAPI(...)`` constructor.
        "version": app.version,
        "log_level": settings.log_level,
    }

20
ocr/requirements.txt Normal file
View File

@@ -0,0 +1,20 @@
# API Framework
fastapi>=0.100.0
uvicorn[standard]>=0.23.0
python-multipart>=0.0.6
# File Detection & Handling
python-magic>=0.4.27
pillow>=10.0.0
pillow-heif>=0.13.0
# Image Preprocessing
opencv-python-headless>=4.8.0
numpy>=1.24.0
# OCR Engines
pytesseract>=0.3.10
# Testing
pytest>=7.4.0
httpx>=0.24.0

1
ocr/tests/__init__.py Normal file
View File

@@ -0,0 +1 @@
# OCR Service Tests

52
ocr/tests/test_health.py Normal file
View File

@@ -0,0 +1,52 @@
"""Tests for OCR service health and core functionality."""
import io
import pytest
from fastapi.testclient import TestClient
from PIL import Image
from app.main import app
@pytest.fixture
def client():
    """Provide a ``TestClient`` bound to the OCR FastAPI application."""
    test_client = TestClient(app)
    return test_client
def test_health_endpoint(client):
    """GET /health answers 200 with the fixed healthy payload."""
    resp = client.get("/health")
    expected_body = {"status": "healthy"}

    assert resp.status_code == 200
    assert resp.json() == expected_body
def test_root_endpoint(client):
    """GET / answers 200 and identifies the service."""
    resp = client.get("/")
    assert resp.status_code == 200

    body = resp.json()
    # The service name is pinned; for the version only its presence is checked.
    assert body["service"] == "mvp-ocr"
    assert "version" in body
def test_pillow_heif_can_register():
    """Registering pillow-heif makes the HEIF format known to Pillow."""
    from pillow_heif import register_heif_opener

    register_heif_opener()

    # registered_extensions() maps file extensions to format names, so the
    # format shows up among the mapping's values once registration succeeded.
    known_formats = Image.registered_extensions().values()
    assert "HEIF" in known_formats
def test_tesseract_available():
    """Tesseract can be invoked through pytesseract without raising."""
    import pytesseract

    # A blank white canvas: nothing is drawn on it, so the OCR output is
    # expected to be empty or whitespace. The test only checks that the
    # call completes and yields a string.
    blank_image = Image.new("RGB", (200, 50), color="white")
    recognized = pytesseract.image_to_string(blank_image)

    assert isinstance(recognized, str)