feat: add OCR service container (refs #64)
Some checks failed
Deploy to Staging / Build Images (pull_request) Successful in 7m41s
Deploy to Staging / Deploy to Staging (pull_request) Failing after 13s
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 8s
Some checks failed
Deploy to Staging / Build Images (pull_request) Successful in 7m41s
Deploy to Staging / Deploy to Staging (pull_request) Failing after 13s
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 8s
Add Python-based OCR service container (mvp-ocr) as the 6th service:
- Python 3.11-slim with FastAPI/uvicorn
- Tesseract OCR with English language pack
- pillow-heif for HEIC image support
- opencv-python-headless for image preprocessing
- Health endpoint at /health
- Unit tests for health, HEIC support, and Tesseract availability

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -164,6 +164,24 @@ services:
|
|||||||
- "traefik.http.services.mvp-backend.loadbalancer.healthcheck.timeout=10s"
|
- "traefik.http.services.mvp-backend.loadbalancer.healthcheck.timeout=10s"
|
||||||
- "traefik.http.services.mvp-backend.loadbalancer.passhostheader=true"
|
- "traefik.http.services.mvp-backend.loadbalancer.passhostheader=true"
|
||||||
|
|
||||||
|
# Application Services - OCR Processing
# Built locally from ./ocr; attached to the backend network only and
# publishes no host ports.
mvp-ocr:
  container_name: mvp-ocr
  build:
    context: ./ocr
    dockerfile: Dockerfile
  restart: unless-stopped
  environment:
    LOG_LEVEL: info
  networks:
    - backend
  # Probe the service's /health route from inside the container.
  healthcheck:
    test:
      - "CMD"
      - "curl"
      - "-f"
      - "http://localhost:8000/health"
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 30s
|
||||||
|
|
||||||
# Database Services - Application PostgreSQL
|
# Database Services - Application PostgreSQL
|
||||||
mvp-postgres:
|
mvp-postgres:
|
||||||
image: ${REGISTRY_MIRRORS:-git.motovaultpro.com/egullickson/mirrors}/postgres:18-alpine
|
image: ${REGISTRY_MIRRORS:-git.motovaultpro.com/egullickson/mirrors}/postgres:18-alpine
|
||||||
|
|||||||
23
ocr/Dockerfile
Normal file
23
ocr/Dockerfile
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
FROM python:3.11-slim

# System dependencies: Tesseract (with English data), libheif for HEIC
# decoding, glib for OpenCV, libmagic for python-magic, and curl for the
# container healthcheck.
RUN apt-get update && apt-get install -y --no-install-recommends \
    tesseract-ocr \
    tesseract-ocr-eng \
    libtesseract-dev \
    libheif1 \
    libheif-dev \
    libglib2.0-0 \
    libmagic1 \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies (copied first so this layer is cached until
# requirements.txt changes).
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Run the service as an unprivileged user instead of root. /app is handed
# over so the process can still write caches next to the code.
RUN useradd --create-home --uid 10001 --shell /usr/sbin/nologin appuser \
    && chown appuser:appuser /app

COPY --chown=appuser:appuser . .

USER appuser

EXPOSE 8000

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
1
ocr/app/__init__.py
Normal file
1
ocr/app/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
# OCR Service Application
|
||||||
15
ocr/app/config.py
Normal file
15
ocr/app/config.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
"""OCR Service Configuration."""
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
class Settings:
    """Application settings loaded from environment variables.

    Attributes:
        log_level: Value of ``LOG_LEVEL`` (default ``"info"``).
        host: Value of ``HOST`` (default ``"0.0.0.0"``).
        port: Value of ``PORT`` parsed as an integer (default ``8000``).
        tesseract_cmd: Value of ``TESSERACT_CMD``
            (default ``"/usr/bin/tesseract"``).

    Raises:
        ValueError: If ``PORT`` is set to something other than an integer
            in the range 0-65535.
    """

    def __init__(self) -> None:
        self.log_level: str = os.getenv("LOG_LEVEL", "info")
        self.host: str = os.getenv("HOST", "0.0.0.0")
        self.port: int = self._read_port()
        self.tesseract_cmd: str = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")

    @staticmethod
    def _read_port() -> int:
        """Parse ``PORT`` from the environment with a descriptive error."""
        # An empty or whitespace-only value (e.g. ``PORT=`` in an env file)
        # is treated the same as an unset variable.
        raw = os.getenv("PORT", "").strip() or "8000"
        try:
            port = int(raw)
        except ValueError:
            # Name the variable so a bad deployment config is easy to spot.
            raise ValueError(f"PORT must be an integer, got {raw!r}") from None
        if not 0 <= port <= 65535:
            raise ValueError(f"PORT must be between 0 and 65535, got {port}")
        return port


# Module-level singleton imported by the rest of the service.
settings = Settings()
|
||||||
26
ocr/app/main.py
Normal file
26
ocr/app/main.py
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
"""OCR Service FastAPI Application."""
|
||||||
|
from fastapi import FastAPI
|
||||||
|
|
||||||
|
from app.config import settings
|
||||||
|
|
||||||
|
# The ASGI application object; the Dockerfile's uvicorn command loads it
# as "app.main:app".
app = FastAPI(
    title="MotoVaultPro OCR Service",
    description="OCR processing service for vehicle documents",
    version="1.0.0",
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
async def health_check() -> dict:
    """Report liveness for container orchestration.

    Returns:
        The constant payload ``{"status": "healthy"}``.
    """
    payload = {"status": "healthy"}
    return payload
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/")
async def root() -> dict:
    """Describe the running service.

    Returns:
        A mapping with the service name, its version, and the configured
        log level.
    """
    return {
        "service": "mvp-ocr",
        # Read the version from the application object so it cannot drift
        # from the value passed to FastAPI(...).
        "version": app.version,
        "log_level": settings.log_level,
    }
|
||||||
20
ocr/requirements.txt
Normal file
20
ocr/requirements.txt
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
# API Framework
|
||||||
|
fastapi>=0.100.0
|
||||||
|
uvicorn[standard]>=0.23.0
|
||||||
|
python-multipart>=0.0.6
|
||||||
|
|
||||||
|
# File Detection & Handling
|
||||||
|
python-magic>=0.4.27
|
||||||
|
pillow>=10.0.0
|
||||||
|
pillow-heif>=0.13.0
|
||||||
|
|
||||||
|
# Image Preprocessing
|
||||||
|
opencv-python-headless>=4.8.0
|
||||||
|
numpy>=1.24.0
|
||||||
|
|
||||||
|
# OCR Engines
|
||||||
|
pytesseract>=0.3.10
|
||||||
|
|
||||||
|
# Testing
|
||||||
|
pytest>=7.4.0
|
||||||
|
httpx>=0.24.0
|
||||||
1
ocr/tests/__init__.py
Normal file
1
ocr/tests/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
# OCR Service Tests
|
||||||
52
ocr/tests/test_health.py
Normal file
52
ocr/tests/test_health.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
"""Tests for OCR service health and core functionality."""
|
||||||
|
import io
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from app.main import app
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def client():
    """Provide a ``TestClient`` bound to the OCR FastAPI application."""
    test_client = TestClient(app)
    return test_client
|
||||||
|
|
||||||
|
|
||||||
|
def test_health_endpoint(client):
    """``GET /health`` answers 200 with the healthy-status payload."""
    expected_body = {"status": "healthy"}
    resp = client.get("/health")
    assert resp.status_code == 200
    assert resp.json() == expected_body
|
||||||
|
|
||||||
|
|
||||||
|
def test_root_endpoint(client):
    """``GET /`` answers 200 and identifies the service and its version."""
    resp = client.get("/")
    assert resp.status_code == 200
    body = resp.json()
    assert body["service"] == "mvp-ocr"
    assert "version" in body
|
||||||
|
|
||||||
|
|
||||||
|
def test_pillow_heif_can_register():
    """Registering pillow-heif makes the HEIF format known to Pillow."""
    import pillow_heif

    pillow_heif.register_heif_opener()

    # registered_extensions() maps file extension -> format name, so the
    # format shows up among the values once the opener is registered.
    known_formats = Image.registered_extensions().values()
    assert "HEIF" in known_formats
|
||||||
|
|
||||||
|
|
||||||
|
def test_tesseract_available():
    """pytesseract can invoke Tesseract on an image without raising."""
    import pytesseract

    # A blank white canvas: no text is drawn on it, so the OCR output is
    # expected to be empty or whitespace.
    blank_image = Image.new("RGB", (200, 50), color="white")

    # Smoke test only: the call must complete and yield a string.
    ocr_text = pytesseract.image_to_string(blank_image)
    assert isinstance(ocr_text, str)
|
||||||
Reference in New Issue
Block a user