# Patch summary: introduces the `mvp-ocr` service.
#   - docker-compose.yml: adds an `mvp-ocr` service built from ./ocr, attached to the
#     `backend` network, with a curl-based healthcheck against http://localhost:8000/health.
#   - ocr/Dockerfile: python:3.11-slim image that installs tesseract-ocr, libheif, libmagic
#     and curl, installs requirements.txt, and starts uvicorn for `app.main:app` on port 8000.
#   - ocr/app/config.py: `Settings` reads LOG_LEVEL, HOST, PORT and TESSERACT_CMD from the environment.
#   - ocr/app/main.py: FastAPI app exposing GET /health and GET /.
#   - ocr/requirements.txt and ocr/tests/: dependency list and pytest checks for the endpoints,
#     pillow-heif registration and a pytesseract call.
# NOTE(review): line breaks and indentation in this patch appear to have been collapsed to
# single spaces; restore them from the original commit before trying to apply it.
diff --git a/docker-compose.yml b/docker-compose.yml index c2cca9f..095dd93 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -164,6 +164,24 @@ services: - "traefik.http.services.mvp-backend.loadbalancer.healthcheck.timeout=10s" - "traefik.http.services.mvp-backend.loadbalancer.passhostheader=true" + # Application Services - OCR Processing + mvp-ocr: + build: + context: ./ocr + dockerfile: Dockerfile + container_name: mvp-ocr + restart: unless-stopped + environment: + LOG_LEVEL: info + networks: + - backend + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + # Database Services - Application PostgreSQL mvp-postgres: image: ${REGISTRY_MIRRORS:-git.motovaultpro.com/egullickson/mirrors}/postgres:18-alpine diff --git a/ocr/Dockerfile b/ocr/Dockerfile new file mode 100644 index 0000000..364ba97 --- /dev/null +++ b/ocr/Dockerfile @@ -0,0 +1,23 @@ +FROM python:3.11-slim + +# System dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + tesseract-ocr \ + tesseract-ocr-eng \ + libtesseract-dev \ + libheif1 \ + libheif-dev \ + libglib2.0-0 \ + libmagic1 \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Python dependencies +WORKDIR /app +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . 
+ +EXPOSE 8000 +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/ocr/app/__init__.py b/ocr/app/__init__.py new file mode 100644 index 0000000..f6ef689 --- /dev/null +++ b/ocr/app/__init__.py @@ -0,0 +1 @@ +# OCR Service Application diff --git a/ocr/app/config.py b/ocr/app/config.py new file mode 100644 index 0000000..a0f4ada --- /dev/null +++ b/ocr/app/config.py @@ -0,0 +1,15 @@ +"""OCR Service Configuration.""" +import os + + +class Settings: + """Application settings loaded from environment variables.""" + + def __init__(self) -> None: + self.log_level: str = os.getenv("LOG_LEVEL", "info") + self.host: str = os.getenv("HOST", "0.0.0.0") + self.port: int = int(os.getenv("PORT", "8000")) + self.tesseract_cmd: str = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract") + + +settings = Settings() diff --git a/ocr/app/main.py b/ocr/app/main.py new file mode 100644 index 0000000..4553aa9 --- /dev/null +++ b/ocr/app/main.py @@ -0,0 +1,26 @@ +"""OCR Service FastAPI Application.""" +from fastapi import FastAPI + +from app.config import settings + +app = FastAPI( + title="MotoVaultPro OCR Service", + description="OCR processing service for vehicle documents", + version="1.0.0", +) + + +@app.get("/health") +async def health_check() -> dict: + """Health check endpoint for container orchestration.""" + return {"status": "healthy"} + + +@app.get("/") +async def root() -> dict: + """Root endpoint with service information.""" + return { + "service": "mvp-ocr", + "version": "1.0.0", + "log_level": settings.log_level, + } diff --git a/ocr/requirements.txt b/ocr/requirements.txt new file mode 100644 index 0000000..d14a652 --- /dev/null +++ b/ocr/requirements.txt @@ -0,0 +1,20 @@ +# API Framework +fastapi>=0.100.0 +uvicorn[standard]>=0.23.0 +python-multipart>=0.0.6 + +# File Detection & Handling +python-magic>=0.4.27 +pillow>=10.0.0 +pillow-heif>=0.13.0 + +# Image Preprocessing +opencv-python-headless>=4.8.0 +numpy>=1.24.0 + +# OCR Engines 
+pytesseract>=0.3.10 + +# Testing +pytest>=7.4.0 +httpx>=0.24.0 diff --git a/ocr/tests/__init__.py b/ocr/tests/__init__.py new file mode 100644 index 0000000..a0355b8 --- /dev/null +++ b/ocr/tests/__init__.py @@ -0,0 +1 @@ +# OCR Service Tests diff --git a/ocr/tests/test_health.py b/ocr/tests/test_health.py new file mode 100644 index 0000000..cd1e914 --- /dev/null +++ b/ocr/tests/test_health.py @@ -0,0 +1,52 @@ +"""Tests for OCR service health and core functionality.""" +import io + +import pytest +from fastapi.testclient import TestClient +from PIL import Image + +from app.main import app + + +@pytest.fixture +def client(): + """Create test client for FastAPI app.""" + return TestClient(app) + + +def test_health_endpoint(client): + """Health endpoint returns healthy status.""" + response = client.get("/health") + assert response.status_code == 200 + assert response.json() == {"status": "healthy"} + + +def test_root_endpoint(client): + """Root endpoint returns service information.""" + response = client.get("/") + assert response.status_code == 200 + data = response.json() + assert data["service"] == "mvp-ocr" + assert "version" in data + + +def test_pillow_heif_can_register(): + """pillow-heif can register with Pillow for HEIC support.""" + import pillow_heif + + pillow_heif.register_heif_opener() + # Verify the "HEIF" format id is among Pillow's registered extensions + assert "HEIF" in Image.registered_extensions().values() + + +def test_tesseract_available(): + """Tesseract OCR is available and can process images.""" + import pytesseract + + # Create a blank white test image (nothing is drawn on it) + img = Image.new("RGB", (200, 50), color="white") + + # Verify pytesseract can call tesseract (will return empty string for blank image) + result = pytesseract.image_to_string(img) + # Just verify it doesn't raise an exception - blank image returns empty/whitespace + assert isinstance(result, str)