chore: update Docker and compose files for PaddleOCR engine (refs #119)
- Replace libtesseract-dev with libgomp1 (OpenMP for PaddlePaddle)
- Pre-download PP-OCRv4 models during Docker build
- Add OCR engine env vars to all compose files (base, staging, prod)
- Add optional Google Vision secret mount (commented, enable on demand)
- Create google-vision-key.json.example placeholder

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,9 @@
|
||||
# Production Dockerfile for MotoVaultPro OCR Service
|
||||
# Uses mirrored base images from Gitea Package Registry
|
||||
#
|
||||
# Primary engine: PaddleOCR PP-OCRv4 (models baked into image)
|
||||
# Backward compat: Tesseract 5.x (optional, via TesseractEngine)
|
||||
# Cloud fallback: Google Vision (optional, requires API key at runtime)
|
||||
|
||||
# Build argument for registry (defaults to Gitea mirrors, falls back to Docker Hub)
|
||||
ARG REGISTRY_MIRRORS=git.motovaultpro.com/egullickson/mirrors
|
||||
@@ -7,10 +11,16 @@ ARG REGISTRY_MIRRORS=git.motovaultpro.com/egullickson/mirrors
|
||||
FROM ${REGISTRY_MIRRORS}/python:3.13-slim
|
||||
|
||||
# System dependencies
|
||||
# - tesseract-ocr/eng: Backward-compatible OCR engine (used by TesseractEngine)
|
||||
# - libgomp1: OpenMP runtime required by PaddlePaddle
|
||||
# - libheif1/libheif-dev: HEIF image support (iPhone photos)
|
||||
# - libglib2.0-0: GLib shared library (OpenCV dependency)
|
||||
# - libmagic1: File type detection
|
||||
# - curl: HTTP client used to probe the health check endpoint
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-eng \
|
||||
libtesseract-dev \
|
||||
libgomp1 \
|
||||
libheif1 \
|
||||
libheif-dev \
|
||||
libglib2.0-0 \
|
||||
@@ -23,6 +33,12 @@ WORKDIR /app
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Pre-download PaddleOCR PP-OCRv4 models during build (not at runtime).
|
||||
# Models are baked into the image so container starts are fast and
|
||||
# no network access is needed at runtime for model download.
|
||||
RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=False)" \
|
||||
&& echo "PaddleOCR PP-OCRv4 models downloaded and verified"
|
||||
|
||||
COPY . .
|
||||
|
||||
EXPOSE 8000
|
||||
|
||||
Reference in New Issue
Block a user