From b9fe222f125359f3b288ecdad0f10deba832b5e0 Mon Sep 17 00:00:00 2001
From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com>
Date: Sat, 7 Feb 2026 12:12:04 -0600
Subject: [PATCH] fix: Resolve build errors and remove Tesseract engine

---
 ocr/Dockerfile                            |  12 +--
 ocr/app/CLAUDE.md                         |   2 +-
 ocr/app/config.py                         |   2 -
 ocr/app/engines/__init__.py               |   1 -
 ocr/app/engines/base_engine.py            |   2 +-
 ocr/app/engines/engine_factory.py         |   3 +-
 ocr/app/engines/tesseract_engine.py       | 114 ----------------------
 ocr/app/extractors/manual_extractor.py    |  12 ++-
 ocr/app/extractors/vin_extractor.py       |   4 +-
 ocr/app/preprocessors/vin_preprocessor.py |  12 +--
 ocr/app/table_extraction/detector.py      |   2 +-
 ocr/app/validators/vin_validator.py       |   2 +-
 ocr/requirements.txt                      |   1 -
 ocr/tests/test_engine_abstraction.py      |  87 +----------------
 ocr/tests/test_health.py                  |  15 +--
 ocr/tests/test_vin_validator.py           |   2 +-
 16 files changed, 35 insertions(+), 238 deletions(-)
 delete mode 100644 ocr/app/engines/tesseract_engine.py

diff --git a/ocr/Dockerfile b/ocr/Dockerfile
index d1c52e4..8028575 100644
--- a/ocr/Dockerfile
+++ b/ocr/Dockerfile
@@ -2,7 +2,6 @@
 # Uses mirrored base images from Gitea Package Registry
 #
 # Primary engine: PaddleOCR PP-OCRv4 (models baked into image)
-# Backward compat: Tesseract 5.x (optional, via TesseractEngine)
 # Cloud fallback: Google Vision (optional, requires API key at runtime)
 
 # Build argument for registry (defaults to Gitea mirrors, falls back to Docker Hub)
@@ -11,21 +10,16 @@ ARG REGISTRY_MIRRORS=git.motovaultpro.com/egullickson/mirrors
 FROM ${REGISTRY_MIRRORS}/python:3.13-slim
 
 # System dependencies
-# - tesseract-ocr/eng: Backward-compatible OCR engine (used by TesseractEngine)
 # - libgomp1: OpenMP runtime required by PaddlePaddle
 # - libheif1/libheif-dev: HEIF image support (iPhone photos)
 # - libglib2.0-0: GLib shared library (OpenCV dependency)
-# - libgl1-mesa-glx: OpenGL runtime (OpenCV cv2 dependency, pulled by PaddleX)
 # - libmagic1: File type detection
 # - curl: Health check endpoint
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    tesseract-ocr \
-    tesseract-ocr-eng \
     libgomp1 \
     libheif1 \
     libheif-dev \
     libglib2.0-0 \
-    libgl1-mesa-glx \
     libmagic1 \
     curl \
     && rm -rf /var/lib/apt/lists/*
@@ -33,7 +27,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 # Python dependencies
 WORKDIR /app
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+# Install dependencies. PaddleX (a transitive dependency of paddleocr) pulls in
+# the full opencv-python, which requires libGL.so.1.  Force-reinstall the
+# headless variant afterwards so the container stays GUI-free.
+RUN pip install --no-cache-dir -r requirements.txt \
+    && pip install --no-cache-dir --force-reinstall opencv-python-headless
 
 # Pre-download PaddleOCR PP-OCRv4 models during build (not at runtime).
 # Models are baked into the image so container starts are fast and
diff --git a/ocr/app/CLAUDE.md b/ocr/app/CLAUDE.md
index 8fbc7f1..7d0441b 100644
--- a/ocr/app/CLAUDE.md
+++ b/ocr/app/CLAUDE.md
@@ -12,7 +12,7 @@
 
 | Directory | What | When to read |
 | --------- | ---- | ------------ |
-| `engines/` | OCR engine abstraction (PaddleOCR primary, Google Vision fallback, Tesseract compat) | Engine changes, adding new engines |
+| `engines/` | OCR engine abstraction (PaddleOCR primary, Google Vision fallback) | Engine changes, adding new engines |
 | `extractors/` | Data extraction logic | Adding new extraction types |
 | `models/` | Data models and schemas | Request/response types |
 | `patterns/` | Regex and parsing patterns | Pattern matching rules |
diff --git a/ocr/app/config.py b/ocr/app/config.py
index 4a15906..e933d4b 100644
--- a/ocr/app/config.py
+++ b/ocr/app/config.py
@@ -9,8 +9,6 @@ class Settings:
         self.log_level: str = os.getenv("LOG_LEVEL", "info")
         self.host: str = os.getenv("HOST", "0.0.0.0")
         self.port: int = int(os.getenv("PORT", "8000"))
-        self.tesseract_cmd: str = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
-
         # OCR engine configuration
         self.ocr_primary_engine: str = os.getenv("OCR_PRIMARY_ENGINE", "paddleocr")
         self.ocr_confidence_threshold: float = float(
diff --git a/ocr/app/engines/__init__.py b/ocr/app/engines/__init__.py
index abc8b05..df38155 100644
--- a/ocr/app/engines/__init__.py
+++ b/ocr/app/engines/__init__.py
@@ -5,7 +5,6 @@ decoupling extractors from specific OCR libraries.
 
 Engines:
   - PaddleOcrEngine: PaddleOCR PP-OCRv4 (primary, CPU-only)
-  - TesseractEngine: pytesseract wrapper (backward compatibility)
   - CloudEngine: Google Vision TEXT_DETECTION (optional cloud fallback)
   - HybridEngine: Primary + fallback with confidence threshold
 """
diff --git a/ocr/app/engines/base_engine.py b/ocr/app/engines/base_engine.py
index ddca084..d10ca26 100644
--- a/ocr/app/engines/base_engine.py
+++ b/ocr/app/engines/base_engine.py
@@ -57,7 +57,7 @@ class OcrEngineResult:
     text: str
     confidence: float  # 0.0-1.0
     word_boxes: list[WordBox]
-    engine_name: str  # "paddleocr", "tesseract", "google_vision"
+    engine_name: str  # "paddleocr", "google_vision"
 
 
 # --- Abstract base ---
diff --git a/ocr/app/engines/engine_factory.py b/ocr/app/engines/engine_factory.py
index 49464d2..f52926f 100644
--- a/ocr/app/engines/engine_factory.py
+++ b/ocr/app/engines/engine_factory.py
@@ -11,7 +11,6 @@ logger = logging.getLogger(__name__)
 # Valid engine identifiers (primary engines only; hybrid is constructed separately)
 _ENGINE_REGISTRY: dict[str, str] = {
     "paddleocr": "app.engines.paddle_engine.PaddleOcrEngine",
-    "tesseract": "app.engines.tesseract_engine.TesseractEngine",
     "google_vision": "app.engines.cloud_engine.CloudEngine",
 }
 
@@ -46,7 +45,7 @@ def create_engine(engine_name: str | None = None) -> OcrEngine:
     returns a ``HybridEngine`` that wraps the primary with the fallback.
 
     Args:
-        engine_name: Engine identifier ("paddleocr", "tesseract").
+        engine_name: Engine identifier ("paddleocr", "google_vision").
                      Falls back to ``settings.ocr_primary_engine``.
 
     Returns:
diff --git a/ocr/app/engines/tesseract_engine.py b/ocr/app/engines/tesseract_engine.py
deleted file mode 100644
index 02108ec..0000000
--- a/ocr/app/engines/tesseract_engine.py
+++ /dev/null
@@ -1,114 +0,0 @@
-"""Tesseract engine wrapper for backward compatibility."""
-
-import io
-import logging
-
-from app.config import settings
-from app.engines.base_engine import (
-    EngineProcessingError,
-    EngineUnavailableError,
-    OcrConfig,
-    OcrEngine,
-    OcrEngineResult,
-    WordBox,
-)
-
-logger = logging.getLogger(__name__)
-
-
-class TesseractEngine(OcrEngine):
-    """pytesseract wrapper conforming to the OcrEngine interface."""
-
-    def __init__(self) -> None:
-        try:
-            import pytesseract  # type: ignore[import-untyped]
-
-            pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
-            self._pytesseract = pytesseract
-            logger.info("TesseractEngine initialized (cmd=%s)", settings.tesseract_cmd)
-        except ImportError as exc:
-            raise EngineUnavailableError(
-                "pytesseract is not installed. "
-                "Install with: pip install pytesseract"
-            ) from exc
-
-    @property
-    def name(self) -> str:
-        return "tesseract"
-
-    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
-        """Run Tesseract OCR on image bytes."""
-        try:
-            from PIL import Image
-
-            image = Image.open(io.BytesIO(image_bytes))
-
-            # Build Tesseract config string from OcrConfig
-            tess_config = self._build_config(config)
-
-            # Get word-level data
-            ocr_data = self._pytesseract.image_to_data(
-                image,
-                config=tess_config,
-                output_type=self._pytesseract.Output.DICT,
-            )
-
-            word_boxes: list[WordBox] = []
-            texts: list[str] = []
-            confidences: list[float] = []
-
-            for i, text in enumerate(ocr_data["text"]):
-                conf = int(ocr_data["conf"][i])
-                if text.strip() and conf > 0:
-                    normalized_conf = conf / 100.0
-                    word_boxes.append(
-                        WordBox(
-                            text=text.strip(),
-                            confidence=normalized_conf,
-                            x=int(ocr_data["left"][i]),
-                            y=int(ocr_data["top"][i]),
-                            width=int(ocr_data["width"][i]),
-                            height=int(ocr_data["height"][i]),
-                        )
-                    )
-                    texts.append(text.strip())
-                    confidences.append(normalized_conf)
-
-            combined_text = " ".join(texts)
-            avg_confidence = (
-                sum(confidences) / len(confidences) if confidences else 0.0
-            )
-
-            return OcrEngineResult(
-                text=combined_text,
-                confidence=avg_confidence,
-                word_boxes=word_boxes,
-                engine_name=self.name,
-            )
-
-        except (EngineUnavailableError, EngineProcessingError):
-            raise
-        except Exception as exc:
-            raise EngineProcessingError(
-                f"Tesseract recognition failed: {exc}"
-            ) from exc
-
-    def _build_config(self, config: OcrConfig) -> str:
-        """Translate OcrConfig into a Tesseract CLI config string."""
-        parts: list[str] = []
-
-        # Page segmentation mode
-        if config.single_word:
-            parts.append("--psm 8")
-        elif config.single_line:
-            parts.append("--psm 7")
-        else:
-            # Default: assume uniform block of text
-            psm = config.hints.get("psm", 6)
-            parts.append(f"--psm {psm}")
-
-        # Character whitelist
-        if config.char_whitelist:
-            parts.append(f"-c tessedit_char_whitelist={config.char_whitelist}")
-
-        return " ".join(parts)
diff --git a/ocr/app/extractors/manual_extractor.py b/ocr/app/extractors/manual_extractor.py
index e447882..ad5f159 100644
--- a/ocr/app/extractors/manual_extractor.py
+++ b/ocr/app/extractors/manual_extractor.py
@@ -5,9 +5,9 @@ import time
 from dataclasses import dataclass, field
 from typing import Callable, Optional
 
-import pytesseract
 from PIL import Image
 
+from app.engines import create_engine, OcrConfig
 from app.preprocessors.pdf_preprocessor import pdf_preprocessor, PdfInfo
 from app.table_extraction.detector import table_detector, DetectedTable
 from app.table_extraction.parser import table_parser, ParsedScheduleRow
@@ -243,8 +243,9 @@ class ManualExtractor:
 
         # OCR the full page
         try:
-            image = Image.open(io.BytesIO(image_bytes))
-            ocr_text = pytesseract.image_to_string(image)
+            engine = create_engine()
+            ocr_result = engine.recognize(image_bytes, OcrConfig())
+            ocr_text = ocr_result.text
 
             # Mark tables as maintenance if page contains maintenance keywords
             for table in detected_tables:
@@ -358,8 +359,9 @@ class ManualExtractor:
 
             if not text and first_page.image_bytes:
                 # OCR first page
-                image = Image.open(io.BytesIO(first_page.image_bytes))
-                text = pytesseract.image_to_string(image)
+                engine = create_engine()
+                ocr_result = engine.recognize(first_page.image_bytes, OcrConfig())
+                text = ocr_result.text
 
             if text:
                 return self._parse_vehicle_from_text(text)
diff --git a/ocr/app/extractors/vin_extractor.py b/ocr/app/extractors/vin_extractor.py
index cce88e9..01a9343 100644
--- a/ocr/app/extractors/vin_extractor.py
+++ b/ocr/app/extractors/vin_extractor.py
@@ -316,8 +316,8 @@ class VinExtractor(BaseExtractor):
             single-line - Treat as a single text line
             single-word - Treat as a single word
 
-        For PaddleOCR, angle classification handles rotated/angled text
-        inherently, replacing the need for Tesseract PSM mode fallbacks.
+        PaddleOCR's angle classification handles rotated/angled text
+        inherently, so no page-segmentation-mode (PSM) fallbacks are needed.
 
         Returns:
             List of VIN candidates
diff --git a/ocr/app/preprocessors/vin_preprocessor.py b/ocr/app/preprocessors/vin_preprocessor.py
index 290fb5b..4128e68 100644
--- a/ocr/app/preprocessors/vin_preprocessor.py
+++ b/ocr/app/preprocessors/vin_preprocessor.py
@@ -93,7 +93,7 @@ class VinPreprocessor:
             gray = cv_image
         steps_applied.append("grayscale")
 
-        # Upscale small images for better OCR (Tesseract needs ~300 DPI)
+        # Upscale small images for better OCR (~300 DPI recommended)
         gray = self._ensure_minimum_resolution(gray)
         steps_applied.append("resolution_check")
 
@@ -129,14 +129,14 @@ class VinPreprocessor:
         )
 
     # Minimum width in pixels for reliable VIN OCR.
-    # A 17-char VIN needs ~30px per character for Tesseract accuracy.
+    # A 17-char VIN needs ~30px per character for accurate recognition.
     MIN_WIDTH_FOR_VIN = 600
 
     def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray:
         """
         Upscale image if too small for reliable OCR.
 
-        Tesseract works best at ~300 DPI. Mobile photos of VINs may have
+        OCR works best at ~300 DPI. Mobile photos of VINs may have
         the text occupy only a small portion of the frame, resulting in
         low effective resolution for the VIN characters.
         """
@@ -160,7 +160,7 @@ class VinPreprocessor:
         Colored backgrounds have a low min value (e.g. green sticker:
         min(130,230,150) = 130) → inverted to 125 (medium gray).
 
-        The inversion ensures Tesseract always receives dark-text-on-
+        The inversion ensures the OCR engine always receives dark-text-on-
         light-background, which is the polarity it expects.
         """
         b_channel, g_channel, r_channel = cv2.split(bgr_image)
@@ -168,8 +168,8 @@ class VinPreprocessor:
         min_channel = np.minimum(np.minimum(b_channel, g_channel), r_channel)
 
         # Invert so white text (min=255) becomes black (0) and colored
-        # backgrounds (min~130) become lighter gray (~125).  Tesseract
-        # expects dark text on light background.
+        # backgrounds (min~130) become lighter gray (~125).  OCR engines
+        # expect dark text on light background.
         inverted = cv2.bitwise_not(min_channel)
 
         gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
diff --git a/ocr/app/table_extraction/detector.py b/ocr/app/table_extraction/detector.py
index 362990f..9c5af54 100644
--- a/ocr/app/table_extraction/detector.py
+++ b/ocr/app/table_extraction/detector.py
@@ -312,7 +312,7 @@ class TableDetector:
         Returns:
             2D list of cell contents
         """
-        # This would use Tesseract on the cropped region
+        # This would use OCR on the cropped region
         # For now, return empty - actual OCR will be done in manual_extractor
         logger.debug(f"Table region: ({table.x}, {table.y}) {table.width}x{table.height}")
         return []
diff --git a/ocr/app/validators/vin_validator.py b/ocr/app/validators/vin_validator.py
index c9c60ef..79a2062 100644
--- a/ocr/app/validators/vin_validator.py
+++ b/ocr/app/validators/vin_validator.py
@@ -226,7 +226,7 @@ class VinValidator:
         Uses two strategies:
         1. Find continuous 11-20 char alphanumeric runs (handles intact VINs)
         2. Concatenate adjacent short fragments separated by spaces/dashes
-           (handles Tesseract fragmenting VINs into multiple words)
+           (handles OCR fragmenting VINs into multiple words)
 
         Args:
             text: Raw OCR text
diff --git a/ocr/requirements.txt b/ocr/requirements.txt
index 9ac83ad..946f645 100644
--- a/ocr/requirements.txt
+++ b/ocr/requirements.txt
@@ -14,7 +14,6 @@ opencv-python-headless>=4.8.0
 numpy>=1.24.0
 
 # OCR Engines
-pytesseract>=0.3.10
 paddlepaddle>=2.6.0
 paddleocr>=2.8.0
 google-cloud-vision>=3.7.0
diff --git a/ocr/tests/test_engine_abstraction.py b/ocr/tests/test_engine_abstraction.py
index 2e8c150..44c314f 100644
--- a/ocr/tests/test_engine_abstraction.py
+++ b/ocr/tests/test_engine_abstraction.py
@@ -1,7 +1,7 @@
 """Tests for OCR engine abstraction layer.
 
 Covers: base types, exception hierarchy, PaddleOcrEngine,
-TesseractEngine, CloudEngine, HybridEngine, and engine_factory.
+CloudEngine, HybridEngine, and engine_factory.
 """
 
 import io
@@ -124,7 +124,7 @@ class TestOcrEngineResult:
 
     def test_empty_result(self) -> None:
         result = OcrEngineResult(
-            text="", confidence=0.0, word_boxes=[], engine_name="tesseract"
+            text="", confidence=0.0, word_boxes=[], engine_name="paddleocr"
         )
         assert result.text == ""
         assert result.word_boxes == []
@@ -303,85 +303,6 @@ class TestPaddleOcrEngine:
             engine.recognize(_create_test_image_bytes(), OcrConfig())
 
 
-# ---------------------------------------------------------------------------
-# TesseractEngine
-# ---------------------------------------------------------------------------
-
-
-class TestTesseractEngine:
-    """Tests for TesseractEngine using mocked pytesseract."""
-
-    @pytest.fixture()
-    def engine(self) -> "TesseractEngine":  # type: ignore[name-defined]
-        """Create a TesseractEngine with mocked pytesseract dependency."""
-        mock_pytesseract = MagicMock()
-        mock_pytesseract.Output.DICT = "dict"
-
-        with patch.dict("sys.modules", {"pytesseract": mock_pytesseract}):
-            with patch("app.engines.tesseract_engine.settings") as mock_settings:
-                mock_settings.tesseract_cmd = "/usr/bin/tesseract"
-                from app.engines.tesseract_engine import TesseractEngine
-
-                eng = TesseractEngine()
-                eng._mock_pytesseract = mock_pytesseract  # type: ignore[attr-defined]
-                return eng
-
-    def test_name(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
-        assert engine.name == "tesseract"
-
-    def test_build_config_default_psm(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
-        config_str = engine._build_config(OcrConfig())
-        assert "--psm 6" in config_str
-
-    def test_build_config_single_line(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
-        config_str = engine._build_config(OcrConfig(single_line=True))
-        assert "--psm 7" in config_str
-
-    def test_build_config_single_word(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
-        config_str = engine._build_config(OcrConfig(single_word=True))
-        assert "--psm 8" in config_str
-
-    def test_build_config_whitelist(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
-        config_str = engine._build_config(OcrConfig(char_whitelist="ABC123"))
-        assert "-c tessedit_char_whitelist=ABC123" in config_str
-
-    def test_build_config_psm_hint(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
-        config_str = engine._build_config(OcrConfig(hints={"psm": 11}))
-        assert "--psm 11" in config_str
-
-    def test_recognize_normalizes_confidence(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
-        """Tesseract returns 0-100 confidence; engine normalizes to 0.0-1.0."""
-        engine._pytesseract.image_to_data.return_value = {
-            "text": ["HELLO", ""],
-            "conf": [92, -1],
-            "left": [10],
-            "top": [20],
-            "width": [100],
-            "height": [30],
-        }
-
-        result = engine.recognize(_create_test_image_bytes(), OcrConfig())
-        assert result.text == "HELLO"
-        assert abs(result.confidence - 0.92) < 0.01
-        assert result.engine_name == "tesseract"
-
-    def test_import_error_raises_unavailable(self) -> None:
-        with patch.dict("sys.modules", {"pytesseract": None}):
-            with patch("app.engines.tesseract_engine.settings") as mock_settings:
-                mock_settings.tesseract_cmd = "/usr/bin/tesseract"
-
-                def mock_import(name, *args, **kwargs):
-                    if name == "pytesseract":
-                        raise ImportError("No module named 'pytesseract'")
-                    return __import__(name, *args, **kwargs)
-
-                with patch("builtins.__import__", side_effect=mock_import):
-                    from app.engines.tesseract_engine import TesseractEngine
-
-                    with pytest.raises(EngineUnavailableError, match="pytesseract"):
-                        TesseractEngine()
-
-
 # ---------------------------------------------------------------------------
 # CloudEngine
 # ---------------------------------------------------------------------------
@@ -637,8 +558,8 @@ class TestEngineFactory:
 
         from app.engines.engine_factory import create_engine
 
-        create_engine("tesseract")
-        mock_create.assert_called_once_with("tesseract")
+        create_engine("google_vision")
+        mock_create.assert_called_once_with("google_vision")
 
     @patch("app.engines.engine_factory.settings")
     @patch("app.engines.engine_factory._create_single_engine")
diff --git a/ocr/tests/test_health.py b/ocr/tests/test_health.py
index cd1e914..a127293 100644
--- a/ocr/tests/test_health.py
+++ b/ocr/tests/test_health.py
@@ -39,14 +39,9 @@ def test_pillow_heif_can_register():
     assert "HEIF" in Image.registered_extensions().values()
 
 
-def test_tesseract_available():
-    """Tesseract OCR is available and can process images."""
-    import pytesseract
+def test_paddleocr_engine_available():
+    """PaddleOCR engine can be created."""
+    from app.engines.paddle_engine import PaddleOcrEngine
 
-    # Create a simple test image with text
-    img = Image.new("RGB", (200, 50), color="white")
-
-    # Verify pytesseract can call tesseract (will return empty string for blank image)
-    result = pytesseract.image_to_string(img)
-    # Just verify it doesn't raise an exception - blank image returns empty/whitespace
-    assert isinstance(result, str)
+    engine = PaddleOcrEngine()
+    assert engine.name == "paddleocr"
diff --git a/ocr/tests/test_vin_validator.py b/ocr/tests/test_vin_validator.py
index 241eabd..e6c65e1 100644
--- a/ocr/tests/test_vin_validator.py
+++ b/ocr/tests/test_vin_validator.py
@@ -165,7 +165,7 @@ class TestVinValidator:
         """Test candidate extraction handles space-fragmented VINs from OCR."""
         validator = VinValidator()
 
-        # Tesseract often fragments VINs into multiple words
+        # OCR engines sometimes fragment VINs into multiple words
         text = "1HGBH 41JXMN 109186"
         candidates = validator.extract_candidates(text)