From b9fe222f125359f3b288ecdad0f10deba832b5e0 Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Sat, 7 Feb 2026 12:12:04 -0600 Subject: [PATCH] fix: Build errors and tesseract removal --- ocr/Dockerfile | 12 +-- ocr/app/CLAUDE.md | 2 +- ocr/app/config.py | 2 - ocr/app/engines/__init__.py | 1 - ocr/app/engines/base_engine.py | 2 +- ocr/app/engines/engine_factory.py | 3 +- ocr/app/engines/tesseract_engine.py | 114 ---------------------- ocr/app/extractors/manual_extractor.py | 12 ++- ocr/app/extractors/vin_extractor.py | 4 +- ocr/app/preprocessors/vin_preprocessor.py | 12 +-- ocr/app/table_extraction/detector.py | 2 +- ocr/app/validators/vin_validator.py | 2 +- ocr/requirements.txt | 1 - ocr/tests/test_engine_abstraction.py | 87 +---------------- ocr/tests/test_health.py | 15 +-- ocr/tests/test_vin_validator.py | 2 +- 16 files changed, 35 insertions(+), 238 deletions(-) delete mode 100644 ocr/app/engines/tesseract_engine.py diff --git a/ocr/Dockerfile b/ocr/Dockerfile index d1c52e4..8028575 100644 --- a/ocr/Dockerfile +++ b/ocr/Dockerfile @@ -2,7 +2,6 @@ # Uses mirrored base images from Gitea Package Registry # # Primary engine: PaddleOCR PP-OCRv4 (models baked into image) -# Backward compat: Tesseract 5.x (optional, via TesseractEngine) # Cloud fallback: Google Vision (optional, requires API key at runtime) # Build argument for registry (defaults to Gitea mirrors, falls back to Docker Hub) @@ -11,21 +10,16 @@ ARG REGISTRY_MIRRORS=git.motovaultpro.com/egullickson/mirrors FROM ${REGISTRY_MIRRORS}/python:3.13-slim # System dependencies -# - tesseract-ocr/eng: Backward-compatible OCR engine (used by TesseractEngine) # - libgomp1: OpenMP runtime required by PaddlePaddle # - libheif1/libheif-dev: HEIF image support (iPhone photos) # - libglib2.0-0: GLib shared library (OpenCV dependency) -# - libgl1-mesa-glx: OpenGL runtime (OpenCV cv2 dependency, pulled by PaddleX) # - libmagic1: File type detection # - curl: Health check endpoint RUN apt-get update && apt-get install -y --no-install-recommends \ - tesseract-ocr \ - tesseract-ocr-eng \ libgomp1 \ libheif1 \ libheif-dev \ libglib2.0-0 \ - libgl1-mesa-glx \ libmagic1 \ curl \ && rm -rf /var/lib/apt/lists/* @@ -33,7 +27,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # Python dependencies WORKDIR /app COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt +# Install dependencies. PaddleX (transitive via paddleocr) pulls in the full +# opencv-python which requires libGL.so.1. Force-reinstall the headless +# variant afterwards so the container stays GUI-free. +RUN pip install --no-cache-dir -r requirements.txt \ + && pip install --no-cache-dir --force-reinstall opencv-python-headless # Pre-download PaddleOCR PP-OCRv4 models during build (not at runtime). # Models are baked into the image so container starts are fast and diff --git a/ocr/app/CLAUDE.md b/ocr/app/CLAUDE.md index 8fbc7f1..7d0441b 100644 --- a/ocr/app/CLAUDE.md +++ b/ocr/app/CLAUDE.md @@ -12,7 +12,7 @@ | Directory | What | When to read | | --------- | ---- | ------------ | -| `engines/` | OCR engine abstraction (PaddleOCR primary, Google Vision fallback, Tesseract compat) | Engine changes, adding new engines | +| `engines/` | OCR engine abstraction (PaddleOCR primary, Google Vision fallback) | Engine changes, adding new engines | | `extractors/` | Data extraction logic | Adding new extraction types | | `models/` | Data models and schemas | Request/response types | | `patterns/` | Regex and parsing patterns | Pattern matching rules | diff --git a/ocr/app/config.py b/ocr/app/config.py index 4a15906..e933d4b 100644 --- a/ocr/app/config.py +++ b/ocr/app/config.py @@ -9,8 +9,6 @@ class Settings: self.log_level: str = os.getenv("LOG_LEVEL", "info") self.host: str = os.getenv("HOST", "0.0.0.0") self.port: int = int(os.getenv("PORT", "8000")) - self.tesseract_cmd: str = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract") - # OCR engine configuration self.ocr_primary_engine: str = os.getenv("OCR_PRIMARY_ENGINE", "paddleocr") self.ocr_confidence_threshold: float = float( diff --git a/ocr/app/engines/__init__.py b/ocr/app/engines/__init__.py index abc8b05..df38155 100644 --- a/ocr/app/engines/__init__.py +++ b/ocr/app/engines/__init__.py @@ -5,7 +5,6 @@ decoupling extractors from specific OCR libraries. Engines: - PaddleOcrEngine: PaddleOCR PP-OCRv4 (primary, CPU-only) - - TesseractEngine: pytesseract wrapper (backward compatibility) - CloudEngine: Google Vision TEXT_DETECTION (optional cloud fallback) - HybridEngine: Primary + fallback with confidence threshold """ diff --git a/ocr/app/engines/base_engine.py b/ocr/app/engines/base_engine.py index ddca084..d10ca26 100644 --- a/ocr/app/engines/base_engine.py +++ b/ocr/app/engines/base_engine.py @@ -57,7 +57,7 @@ class OcrEngineResult: text: str confidence: float # 0.0-1.0 word_boxes: list[WordBox] - engine_name: str # "paddleocr", "tesseract", "google_vision" + engine_name: str # "paddleocr", "google_vision" # --- Abstract base --- diff --git a/ocr/app/engines/engine_factory.py b/ocr/app/engines/engine_factory.py index 49464d2..f52926f 100644 --- a/ocr/app/engines/engine_factory.py +++ b/ocr/app/engines/engine_factory.py @@ -11,7 +11,6 @@ logger = logging.getLogger(__name__) # Valid engine identifiers (primary engines only; hybrid is constructed separately) _ENGINE_REGISTRY: dict[str, str] = { "paddleocr": "app.engines.paddle_engine.PaddleOcrEngine", - "tesseract": "app.engines.tesseract_engine.TesseractEngine", "google_vision": "app.engines.cloud_engine.CloudEngine", } @@ -46,7 +45,7 @@ def create_engine(engine_name: str | None = None) -> OcrEngine: returns a ``HybridEngine`` that wraps the primary with the fallback. Args: - engine_name: Engine identifier ("paddleocr", "tesseract"). + engine_name: Engine identifier ("paddleocr", "google_vision"). Falls back to ``settings.ocr_primary_engine``. Returns: diff --git a/ocr/app/engines/tesseract_engine.py b/ocr/app/engines/tesseract_engine.py deleted file mode 100644 index 02108ec..0000000 --- a/ocr/app/engines/tesseract_engine.py +++ /dev/null @@ -1,114 +0,0 @@ -"""Tesseract engine wrapper for backward compatibility.""" - -import io -import logging - -from app.config import settings -from app.engines.base_engine import ( - EngineProcessingError, - EngineUnavailableError, - OcrConfig, - OcrEngine, - OcrEngineResult, - WordBox, -) - -logger = logging.getLogger(__name__) - - -class TesseractEngine(OcrEngine): - """pytesseract wrapper conforming to the OcrEngine interface.""" - - def __init__(self) -> None: - try: - import pytesseract # type: ignore[import-untyped] - - pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd - self._pytesseract = pytesseract - logger.info("TesseractEngine initialized (cmd=%s)", settings.tesseract_cmd) - except ImportError as exc: - raise EngineUnavailableError( - "pytesseract is not installed. " - "Install with: pip install pytesseract" - ) from exc - - @property - def name(self) -> str: - return "tesseract" - - def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult: - """Run Tesseract OCR on image bytes.""" - try: - from PIL import Image - - image = Image.open(io.BytesIO(image_bytes)) - - # Build Tesseract config string from OcrConfig - tess_config = self._build_config(config) - - # Get word-level data - ocr_data = self._pytesseract.image_to_data( - image, - config=tess_config, - output_type=self._pytesseract.Output.DICT, - ) - - word_boxes: list[WordBox] = [] - texts: list[str] = [] - confidences: list[float] = [] - - for i, text in enumerate(ocr_data["text"]): - conf = int(ocr_data["conf"][i]) - if text.strip() and conf > 0: - normalized_conf = conf / 100.0 - word_boxes.append( - WordBox( - text=text.strip(), - confidence=normalized_conf, - x=int(ocr_data["left"][i]), - y=int(ocr_data["top"][i]), - width=int(ocr_data["width"][i]), - height=int(ocr_data["height"][i]), - ) - ) - texts.append(text.strip()) - confidences.append(normalized_conf) - - combined_text = " ".join(texts) - avg_confidence = ( - sum(confidences) / len(confidences) if confidences else 0.0 - ) - - return OcrEngineResult( - text=combined_text, - confidence=avg_confidence, - word_boxes=word_boxes, - engine_name=self.name, - ) - - except (EngineUnavailableError, EngineProcessingError): - raise - except Exception as exc: - raise EngineProcessingError( - f"Tesseract recognition failed: {exc}" - ) from exc - - def _build_config(self, config: OcrConfig) -> str: - """Translate OcrConfig into a Tesseract CLI config string.""" - parts: list[str] = [] - - # Page segmentation mode - if config.single_word: - parts.append("--psm 8") - elif config.single_line: - parts.append("--psm 7") - else: - # Default: assume uniform block of text - psm = config.hints.get("psm", 6) - parts.append(f"--psm {psm}") - - # Character whitelist - if config.char_whitelist: - parts.append(f"-c tessedit_char_whitelist={config.char_whitelist}") - - return " ".join(parts) diff --git a/ocr/app/extractors/manual_extractor.py b/ocr/app/extractors/manual_extractor.py index e447882..ad5f159 100644 --- a/ocr/app/extractors/manual_extractor.py +++ b/ocr/app/extractors/manual_extractor.py @@ -5,9 +5,9 @@ import time from dataclasses import dataclass, field from typing import Callable, Optional -import pytesseract from PIL import Image +from app.engines import create_engine, OcrConfig from app.preprocessors.pdf_preprocessor import pdf_preprocessor, PdfInfo from app.table_extraction.detector import table_detector, DetectedTable from app.table_extraction.parser import table_parser, ParsedScheduleRow @@ -243,8 +243,9 @@ class ManualExtractor: # OCR the full page try: - image = Image.open(io.BytesIO(image_bytes)) - ocr_text = pytesseract.image_to_string(image) + engine = create_engine() + ocr_result = engine.recognize(image_bytes, OcrConfig()) + ocr_text = ocr_result.text # Mark tables as maintenance if page contains maintenance keywords for table in detected_tables: @@ -358,8 +359,9 @@ class ManualExtractor: if not text and first_page.image_bytes: # OCR first page - image = Image.open(io.BytesIO(first_page.image_bytes)) - text = pytesseract.image_to_string(image) + engine = create_engine() + ocr_result = engine.recognize(first_page.image_bytes, OcrConfig()) + text = ocr_result.text if text: return self._parse_vehicle_from_text(text) diff --git a/ocr/app/extractors/vin_extractor.py b/ocr/app/extractors/vin_extractor.py index cce88e9..01a9343 100644 --- a/ocr/app/extractors/vin_extractor.py +++ b/ocr/app/extractors/vin_extractor.py @@ -316,8 +316,8 @@ class VinExtractor(BaseExtractor): single-line - Treat as a single text line single-word - Treat as a single word - For PaddleOCR, angle classification handles rotated/angled text - inherently, replacing the need for Tesseract PSM mode fallbacks. + PaddleOCR angle classification handles rotated/angled text + inherently, so no PSM mode fallbacks are needed. Returns: List of VIN candidates diff --git a/ocr/app/preprocessors/vin_preprocessor.py b/ocr/app/preprocessors/vin_preprocessor.py index 290fb5b..4128e68 100644 --- a/ocr/app/preprocessors/vin_preprocessor.py +++ b/ocr/app/preprocessors/vin_preprocessor.py @@ -93,7 +93,7 @@ class VinPreprocessor: gray = cv_image steps_applied.append("grayscale") - # Upscale small images for better OCR (Tesseract needs ~300 DPI) + # Upscale small images for better OCR (~300 DPI recommended) gray = self._ensure_minimum_resolution(gray) steps_applied.append("resolution_check") @@ -129,14 +129,14 @@ class VinPreprocessor: ) # Minimum width in pixels for reliable VIN OCR. - # A 17-char VIN needs ~30px per character for Tesseract accuracy. + # A 17-char VIN needs ~30px per character for reliable OCR accuracy. MIN_WIDTH_FOR_VIN = 600 def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray: """ Upscale image if too small for reliable OCR. - Tesseract works best at ~300 DPI. Mobile photos of VINs may have + OCR works best at ~300 DPI. Mobile photos of VINs may have the text occupy only a small portion of the frame, resulting in low effective resolution for the VIN characters. """ @@ -160,7 +160,7 @@ class VinPreprocessor: Colored backgrounds have a low min value (e.g. green sticker: min(130,230,150) = 130) → inverted to 125 (medium gray). - The inversion ensures Tesseract always receives dark-text-on- + The inversion ensures the OCR engine always receives dark-text-on- light-background, which is the polarity it expects. """ b_channel, g_channel, r_channel = cv2.split(bgr_image) @@ -168,8 +168,8 @@ class VinPreprocessor: min_channel = np.minimum(np.minimum(b_channel, g_channel), r_channel) # Invert so white text (min=255) becomes black (0) and colored - # backgrounds (min~130) become lighter gray (~125). Tesseract - # expects dark text on light background. + # backgrounds (min~130) become lighter gray (~125). OCR engines + # expect dark text on light background. inverted = cv2.bitwise_not(min_channel) gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY) diff --git a/ocr/app/table_extraction/detector.py b/ocr/app/table_extraction/detector.py index 362990f..9c5af54 100644 --- a/ocr/app/table_extraction/detector.py +++ b/ocr/app/table_extraction/detector.py @@ -312,7 +312,7 @@ class TableDetector: Returns: 2D list of cell contents """ - # This would use Tesseract on the cropped region + # This would use OCR on the cropped region # For now, return empty - actual OCR will be done in manual_extractor logger.debug(f"Table region: ({table.x}, {table.y}) {table.width}x{table.height}") return [] diff --git a/ocr/app/validators/vin_validator.py b/ocr/app/validators/vin_validator.py index c9c60ef..79a2062 100644 --- a/ocr/app/validators/vin_validator.py +++ b/ocr/app/validators/vin_validator.py @@ -226,7 +226,7 @@ class VinValidator: Uses two strategies: 1. Find continuous 11-20 char alphanumeric runs (handles intact VINs) 2. Concatenate adjacent short fragments separated by spaces/dashes - (handles Tesseract fragmenting VINs into multiple words) + (handles OCR fragmenting VINs into multiple words) Args: text: Raw OCR text diff --git a/ocr/requirements.txt b/ocr/requirements.txt index 9ac83ad..946f645 100644 --- a/ocr/requirements.txt +++ b/ocr/requirements.txt @@ -14,7 +14,6 @@ opencv-python-headless>=4.8.0 numpy>=1.24.0 # OCR Engines -pytesseract>=0.3.10 paddlepaddle>=2.6.0 paddleocr>=2.8.0 google-cloud-vision>=3.7.0 diff --git a/ocr/tests/test_engine_abstraction.py b/ocr/tests/test_engine_abstraction.py index 2e8c150..44c314f 100644 --- a/ocr/tests/test_engine_abstraction.py +++ b/ocr/tests/test_engine_abstraction.py @@ -1,7 +1,7 @@ """Tests for OCR engine abstraction layer. Covers: base types, exception hierarchy, PaddleOcrEngine, -TesseractEngine, CloudEngine, HybridEngine, and engine_factory. +CloudEngine, HybridEngine, and engine_factory. """ import io @@ -124,7 +124,7 @@ class TestOcrEngineResult: def test_empty_result(self) -> None: result = OcrEngineResult( - text="", confidence=0.0, word_boxes=[], engine_name="tesseract" + text="", confidence=0.0, word_boxes=[], engine_name="paddleocr" ) assert result.text == "" assert result.word_boxes == [] @@ -303,85 +303,6 @@ class TestPaddleOcrEngine: engine.recognize(_create_test_image_bytes(), OcrConfig()) -# --------------------------------------------------------------------------- -# TesseractEngine -# --------------------------------------------------------------------------- - - -class TestTesseractEngine: - """Tests for TesseractEngine using mocked pytesseract.""" - - @pytest.fixture() - def engine(self) -> "TesseractEngine": # type: ignore[name-defined] - """Create a TesseractEngine with mocked pytesseract dependency.""" - mock_pytesseract = MagicMock() - mock_pytesseract.Output.DICT = "dict" - - with patch.dict("sys.modules", {"pytesseract": mock_pytesseract}): - with patch("app.engines.tesseract_engine.settings") as mock_settings: - mock_settings.tesseract_cmd = "/usr/bin/tesseract" - from app.engines.tesseract_engine import TesseractEngine - - eng = TesseractEngine() - eng._mock_pytesseract = mock_pytesseract # type: ignore[attr-defined] - return eng - - def test_name(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined] - assert engine.name == "tesseract" - - def test_build_config_default_psm(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined] - config_str = engine._build_config(OcrConfig()) - assert "--psm 6" in config_str - - def test_build_config_single_line(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined] - config_str = engine._build_config(OcrConfig(single_line=True)) - assert "--psm 7" in config_str - - def test_build_config_single_word(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined] - config_str = engine._build_config(OcrConfig(single_word=True)) - assert "--psm 8" in config_str - - def test_build_config_whitelist(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined] - config_str = engine._build_config(OcrConfig(char_whitelist="ABC123")) - assert "-c tessedit_char_whitelist=ABC123" in config_str - - def test_build_config_psm_hint(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined] - config_str = engine._build_config(OcrConfig(hints={"psm": 11})) - assert "--psm 11" in config_str - - def test_recognize_normalizes_confidence(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined] - """Tesseract returns 0-100 confidence; engine normalizes to 0.0-1.0.""" - engine._pytesseract.image_to_data.return_value = { - "text": ["HELLO", ""], - "conf": [92, -1], - "left": [10], - "top": [20], - "width": [100], - "height": [30], - } - - result = engine.recognize(_create_test_image_bytes(), OcrConfig()) - assert result.text == "HELLO" - assert abs(result.confidence - 0.92) < 0.01 - assert result.engine_name == "tesseract" - - def test_import_error_raises_unavailable(self) -> None: - with patch.dict("sys.modules", {"pytesseract": None}): - with patch("app.engines.tesseract_engine.settings") as mock_settings: - mock_settings.tesseract_cmd = "/usr/bin/tesseract" - - def mock_import(name, *args, **kwargs): - if name == "pytesseract": - raise ImportError("No module named 'pytesseract'") - return __import__(name, *args, **kwargs) - - with patch("builtins.__import__", side_effect=mock_import): - from app.engines.tesseract_engine import TesseractEngine - - with pytest.raises(EngineUnavailableError, match="pytesseract"): - TesseractEngine() - - # --------------------------------------------------------------------------- # CloudEngine # --------------------------------------------------------------------------- @@ -637,8 +558,8 @@ class TestEngineFactory: from app.engines.engine_factory import create_engine - create_engine("tesseract") - mock_create.assert_called_once_with("tesseract") + create_engine("google_vision") + mock_create.assert_called_once_with("google_vision") @patch("app.engines.engine_factory.settings") @patch("app.engines.engine_factory._create_single_engine") diff --git a/ocr/tests/test_health.py b/ocr/tests/test_health.py index cd1e914..a127293 100644 --- a/ocr/tests/test_health.py +++ b/ocr/tests/test_health.py @@ -39,14 +39,9 @@ def test_pillow_heif_can_register(): assert "HEIF" in Image.registered_extensions().values() -def test_tesseract_available(): - """Tesseract OCR is available and can process images.""" - import pytesseract +def test_paddleocr_engine_available(): + """PaddleOCR engine can be created.""" + from app.engines.paddle_engine import PaddleOcrEngine - # Create a simple test image with text - img = Image.new("RGB", (200, 50), color="white") - - # Verify pytesseract can call tesseract (will return empty string for blank image) - result = pytesseract.image_to_string(img) - # Just verify it doesn't raise an exception - blank image returns empty/whitespace - assert isinstance(result, str) + engine = PaddleOcrEngine() + assert engine.name == "paddleocr" diff --git a/ocr/tests/test_vin_validator.py b/ocr/tests/test_vin_validator.py index 241eabd..e6c65e1 100644 --- a/ocr/tests/test_vin_validator.py +++ b/ocr/tests/test_vin_validator.py @@ -165,7 +165,7 @@ class TestVinValidator: """Test candidate extraction handles space-fragmented VINs from OCR.""" validator = VinValidator() - # Tesseract often fragments VINs into multiple words + # OCR engines sometimes fragment VINs into multiple words text = "1HGBH 41JXMN 109186" candidates = validator.extract_candidates(text)