fix: Build errors and tesseract removal
Some checks failed
Deploy to Staging / Build Images (pull_request) Failing after 4m14s
Deploy to Staging / Deploy to Staging (pull_request) Has been skipped
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 8s

This commit is contained in:
Eric Gullickson
2026-02-07 12:12:04 -06:00
parent cf114fad3c
commit b9fe222f12
16 changed files with 35 additions and 238 deletions

View File

@@ -2,7 +2,6 @@
# Uses mirrored base images from Gitea Package Registry # Uses mirrored base images from Gitea Package Registry
# #
# Primary engine: PaddleOCR PP-OCRv4 (models baked into image) # Primary engine: PaddleOCR PP-OCRv4 (models baked into image)
# Backward compat: Tesseract 5.x (optional, via TesseractEngine)
# Cloud fallback: Google Vision (optional, requires API key at runtime) # Cloud fallback: Google Vision (optional, requires API key at runtime)
# Build argument for registry (defaults to Gitea mirrors, falls back to Docker Hub) # Build argument for registry (defaults to Gitea mirrors, falls back to Docker Hub)
@@ -11,21 +10,16 @@ ARG REGISTRY_MIRRORS=git.motovaultpro.com/egullickson/mirrors
FROM ${REGISTRY_MIRRORS}/python:3.13-slim FROM ${REGISTRY_MIRRORS}/python:3.13-slim
# System dependencies # System dependencies
# - tesseract-ocr/eng: Backward-compatible OCR engine (used by TesseractEngine)
# - libgomp1: OpenMP runtime required by PaddlePaddle # - libgomp1: OpenMP runtime required by PaddlePaddle
# - libheif1/libheif-dev: HEIF image support (iPhone photos) # - libheif1/libheif-dev: HEIF image support (iPhone photos)
# - libglib2.0-0: GLib shared library (OpenCV dependency) # - libglib2.0-0: GLib shared library (OpenCV dependency)
# - libgl1-mesa-glx: OpenGL runtime (OpenCV cv2 dependency, pulled by PaddleX)
# - libmagic1: File type detection # - libmagic1: File type detection
# - curl: Health check endpoint # - curl: Health check endpoint
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
tesseract-ocr \
tesseract-ocr-eng \
libgomp1 \ libgomp1 \
libheif1 \ libheif1 \
libheif-dev \ libheif-dev \
libglib2.0-0 \ libglib2.0-0 \
libgl1-mesa-glx \
libmagic1 \ libmagic1 \
curl \ curl \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
@@ -33,7 +27,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
# Python dependencies # Python dependencies
WORKDIR /app WORKDIR /app
COPY requirements.txt . COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt # Install dependencies. PaddleX (transitive via paddleocr) pulls in the full
# opencv-python which requires libGL.so.1. Force-reinstall the headless
# variant afterwards so the container stays GUI-free.
RUN pip install --no-cache-dir -r requirements.txt \
&& pip install --no-cache-dir --force-reinstall opencv-python-headless
# Pre-download PaddleOCR PP-OCRv4 models during build (not at runtime). # Pre-download PaddleOCR PP-OCRv4 models during build (not at runtime).
# Models are baked into the image so container starts are fast and # Models are baked into the image so container starts are fast and

View File

@@ -12,7 +12,7 @@
| Directory | What | When to read | | Directory | What | When to read |
| --------- | ---- | ------------ | | --------- | ---- | ------------ |
| `engines/` | OCR engine abstraction (PaddleOCR primary, Google Vision fallback, Tesseract compat) | Engine changes, adding new engines | | `engines/` | OCR engine abstraction (PaddleOCR primary, Google Vision fallback) | Engine changes, adding new engines |
| `extractors/` | Data extraction logic | Adding new extraction types | | `extractors/` | Data extraction logic | Adding new extraction types |
| `models/` | Data models and schemas | Request/response types | | `models/` | Data models and schemas | Request/response types |
| `patterns/` | Regex and parsing patterns | Pattern matching rules | | `patterns/` | Regex and parsing patterns | Pattern matching rules |

View File

@@ -9,8 +9,6 @@ class Settings:
self.log_level: str = os.getenv("LOG_LEVEL", "info") self.log_level: str = os.getenv("LOG_LEVEL", "info")
self.host: str = os.getenv("HOST", "0.0.0.0") self.host: str = os.getenv("HOST", "0.0.0.0")
self.port: int = int(os.getenv("PORT", "8000")) self.port: int = int(os.getenv("PORT", "8000"))
self.tesseract_cmd: str = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
# OCR engine configuration # OCR engine configuration
self.ocr_primary_engine: str = os.getenv("OCR_PRIMARY_ENGINE", "paddleocr") self.ocr_primary_engine: str = os.getenv("OCR_PRIMARY_ENGINE", "paddleocr")
self.ocr_confidence_threshold: float = float( self.ocr_confidence_threshold: float = float(

View File

@@ -5,7 +5,6 @@ decoupling extractors from specific OCR libraries.
Engines: Engines:
- PaddleOcrEngine: PaddleOCR PP-OCRv4 (primary, CPU-only) - PaddleOcrEngine: PaddleOCR PP-OCRv4 (primary, CPU-only)
- TesseractEngine: pytesseract wrapper (backward compatibility)
- CloudEngine: Google Vision TEXT_DETECTION (optional cloud fallback) - CloudEngine: Google Vision TEXT_DETECTION (optional cloud fallback)
- HybridEngine: Primary + fallback with confidence threshold - HybridEngine: Primary + fallback with confidence threshold
""" """

View File

@@ -57,7 +57,7 @@ class OcrEngineResult:
text: str text: str
confidence: float # 0.0-1.0 confidence: float # 0.0-1.0
word_boxes: list[WordBox] word_boxes: list[WordBox]
engine_name: str # "paddleocr", "tesseract", "google_vision" engine_name: str # "paddleocr", "google_vision"
# --- Abstract base --- # --- Abstract base ---

View File

@@ -11,7 +11,6 @@ logger = logging.getLogger(__name__)
# Valid engine identifiers (primary engines only; hybrid is constructed separately) # Valid engine identifiers (primary engines only; hybrid is constructed separately)
_ENGINE_REGISTRY: dict[str, str] = { _ENGINE_REGISTRY: dict[str, str] = {
"paddleocr": "app.engines.paddle_engine.PaddleOcrEngine", "paddleocr": "app.engines.paddle_engine.PaddleOcrEngine",
"tesseract": "app.engines.tesseract_engine.TesseractEngine",
"google_vision": "app.engines.cloud_engine.CloudEngine", "google_vision": "app.engines.cloud_engine.CloudEngine",
} }
@@ -46,7 +45,7 @@ def create_engine(engine_name: str | None = None) -> OcrEngine:
returns a ``HybridEngine`` that wraps the primary with the fallback. returns a ``HybridEngine`` that wraps the primary with the fallback.
Args: Args:
engine_name: Engine identifier ("paddleocr", "tesseract"). engine_name: Engine identifier ("paddleocr", "google_vision").
Falls back to ``settings.ocr_primary_engine``. Falls back to ``settings.ocr_primary_engine``.
Returns: Returns:

View File

@@ -1,114 +0,0 @@
"""Tesseract engine wrapper for backward compatibility."""
import io
import logging
from app.config import settings
from app.engines.base_engine import (
EngineProcessingError,
EngineUnavailableError,
OcrConfig,
OcrEngine,
OcrEngineResult,
WordBox,
)
logger = logging.getLogger(__name__)
class TesseractEngine(OcrEngine):
    """pytesseract wrapper conforming to the OcrEngine interface.

    Words are read from ``pytesseract.image_to_data`` and reported with
    confidences normalized from Tesseract's 0-100 scale to 0.0-1.0.
    """

    def __init__(self) -> None:
        """Import pytesseract and point it at the configured binary.

        Raises:
            EngineUnavailableError: If pytesseract cannot be imported.
        """
        try:
            # Imported lazily so the module can be loaded even when the
            # optional pytesseract dependency is not installed.
            import pytesseract  # type: ignore[import-untyped]

            pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
            self._pytesseract = pytesseract
            logger.info("TesseractEngine initialized (cmd=%s)", settings.tesseract_cmd)
        except ImportError as exc:
            raise EngineUnavailableError(
                "pytesseract is not installed. "
                "Install with: pip install pytesseract"
            ) from exc

    @property
    def name(self) -> str:
        """Engine identifier reported in ``OcrEngineResult.engine_name``."""
        return "tesseract"

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run Tesseract OCR on image bytes.

        Args:
            image_bytes: Encoded image data readable by Pillow.
            config: Engine-agnostic OCR options, translated by
                ``_build_config`` into a Tesseract config string.

        Returns:
            An ``OcrEngineResult`` whose text is the space-joined recognized
            words and whose confidence is the mean word confidence
            (0.0 when no word was recognized).

        Raises:
            EngineProcessingError: If decoding the image or running
                Tesseract fails for any reason.
        """
        try:
            from PIL import Image

            # The context manager releases the decoded image as soon as
            # Tesseract has produced its output.
            with Image.open(io.BytesIO(image_bytes)) as image:
                # Build Tesseract config string from OcrConfig
                tess_config = self._build_config(config)

                # Get word-level data
                ocr_data = self._pytesseract.image_to_data(
                    image,
                    config=tess_config,
                    output_type=self._pytesseract.Output.DICT,
                )

            word_boxes: list[WordBox] = []
            texts: list[str] = []
            confidences: list[float] = []

            for i, raw_text in enumerate(ocr_data["text"]):
                # float() instead of int(): a fractional confidence such as
                # "96.54" must not raise ValueError and abort the whole page.
                conf = float(ocr_data["conf"][i])
                word = raw_text.strip()
                # Skip blank cells and rows Tesseract marks as non-words
                # (confidence -1) or as having no confidence at all.
                if not word or conf <= 0:
                    continue

                normalized_conf = conf / 100.0
                word_boxes.append(
                    WordBox(
                        text=word,
                        confidence=normalized_conf,
                        x=int(ocr_data["left"][i]),
                        y=int(ocr_data["top"][i]),
                        width=int(ocr_data["width"][i]),
                        height=int(ocr_data["height"][i]),
                    )
                )
                texts.append(word)
                confidences.append(normalized_conf)

            combined_text = " ".join(texts)
            avg_confidence = (
                sum(confidences) / len(confidences) if confidences else 0.0
            )

            return OcrEngineResult(
                text=combined_text,
                confidence=avg_confidence,
                word_boxes=word_boxes,
                engine_name=self.name,
            )
        except (EngineUnavailableError, EngineProcessingError):
            # Already one of the engine's own error types; pass through as-is.
            raise
        except Exception as exc:
            # Engine boundary: callers only ever see engine error types.
            raise EngineProcessingError(
                f"Tesseract recognition failed: {exc}"
            ) from exc

    def _build_config(self, config: OcrConfig) -> str:
        """Translate OcrConfig into a Tesseract CLI config string.

        ``single_word`` takes precedence over ``single_line``; when neither
        is set the page segmentation mode comes from ``config.hints["psm"]``
        and defaults to 6.
        """
        parts: list[str] = []

        # Page segmentation mode
        if config.single_word:
            parts.append("--psm 8")
        elif config.single_line:
            parts.append("--psm 7")
        else:
            # Default: assume uniform block of text
            psm = config.hints.get("psm", 6)
            parts.append(f"--psm {psm}")

        # Character whitelist
        if config.char_whitelist:
            parts.append(f"-c tessedit_char_whitelist={config.char_whitelist}")

        return " ".join(parts)

View File

@@ -5,9 +5,9 @@ import time
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Callable, Optional from typing import Callable, Optional
import pytesseract
from PIL import Image from PIL import Image
from app.engines import create_engine, OcrConfig
from app.preprocessors.pdf_preprocessor import pdf_preprocessor, PdfInfo from app.preprocessors.pdf_preprocessor import pdf_preprocessor, PdfInfo
from app.table_extraction.detector import table_detector, DetectedTable from app.table_extraction.detector import table_detector, DetectedTable
from app.table_extraction.parser import table_parser, ParsedScheduleRow from app.table_extraction.parser import table_parser, ParsedScheduleRow
@@ -243,8 +243,9 @@ class ManualExtractor:
# OCR the full page # OCR the full page
try: try:
image = Image.open(io.BytesIO(image_bytes)) engine = create_engine()
ocr_text = pytesseract.image_to_string(image) ocr_result = engine.recognize(image_bytes, OcrConfig())
ocr_text = ocr_result.text
# Mark tables as maintenance if page contains maintenance keywords # Mark tables as maintenance if page contains maintenance keywords
for table in detected_tables: for table in detected_tables:
@@ -358,8 +359,9 @@ class ManualExtractor:
if not text and first_page.image_bytes: if not text and first_page.image_bytes:
# OCR first page # OCR first page
image = Image.open(io.BytesIO(first_page.image_bytes)) engine = create_engine()
text = pytesseract.image_to_string(image) ocr_result = engine.recognize(first_page.image_bytes, OcrConfig())
text = ocr_result.text
if text: if text:
return self._parse_vehicle_from_text(text) return self._parse_vehicle_from_text(text)

View File

@@ -316,8 +316,8 @@ class VinExtractor(BaseExtractor):
single-line - Treat as a single text line single-line - Treat as a single text line
single-word - Treat as a single word single-word - Treat as a single word
For PaddleOCR, angle classification handles rotated/angled text PaddleOCR angle classification handles rotated/angled text
inherently, replacing the need for Tesseract PSM mode fallbacks. inherently, so no PSM mode fallbacks are needed.
Returns: Returns:
List of VIN candidates List of VIN candidates

View File

@@ -93,7 +93,7 @@ class VinPreprocessor:
gray = cv_image gray = cv_image
steps_applied.append("grayscale") steps_applied.append("grayscale")
# Upscale small images for better OCR (Tesseract needs ~300 DPI) # Upscale small images for better OCR (~300 DPI recommended)
gray = self._ensure_minimum_resolution(gray) gray = self._ensure_minimum_resolution(gray)
steps_applied.append("resolution_check") steps_applied.append("resolution_check")
@@ -129,14 +129,14 @@ class VinPreprocessor:
) )
# Minimum width in pixels for reliable VIN OCR. # Minimum width in pixels for reliable VIN OCR.
# A 17-char VIN needs ~30px per character for Tesseract accuracy. # A 17-char VIN needs ~30px per character for reliable OCR accuracy.
MIN_WIDTH_FOR_VIN = 600 MIN_WIDTH_FOR_VIN = 600
def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray: def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray:
""" """
Upscale image if too small for reliable OCR. Upscale image if too small for reliable OCR.
Tesseract works best at ~300 DPI. Mobile photos of VINs may have OCR works best at ~300 DPI. Mobile photos of VINs may have
the text occupy only a small portion of the frame, resulting in the text occupy only a small portion of the frame, resulting in
low effective resolution for the VIN characters. low effective resolution for the VIN characters.
""" """
@@ -160,7 +160,7 @@ class VinPreprocessor:
Colored backgrounds have a low min value (e.g. green sticker: Colored backgrounds have a low min value (e.g. green sticker:
min(130,230,150) = 130) → inverted to 125 (medium gray). min(130,230,150) = 130) → inverted to 125 (medium gray).
The inversion ensures Tesseract always receives dark-text-on- The inversion ensures the OCR engine always receives dark-text-on-
light-background, which is the polarity it expects. light-background, which is the polarity it expects.
""" """
b_channel, g_channel, r_channel = cv2.split(bgr_image) b_channel, g_channel, r_channel = cv2.split(bgr_image)
@@ -168,8 +168,8 @@ class VinPreprocessor:
min_channel = np.minimum(np.minimum(b_channel, g_channel), r_channel) min_channel = np.minimum(np.minimum(b_channel, g_channel), r_channel)
# Invert so white text (min=255) becomes black (0) and colored # Invert so white text (min=255) becomes black (0) and colored
# backgrounds (min~130) become lighter gray (~125). Tesseract # backgrounds (min~130) become lighter gray (~125). OCR engines
# expects dark text on light background. # expect dark text on light background.
inverted = cv2.bitwise_not(min_channel) inverted = cv2.bitwise_not(min_channel)
gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY) gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)

View File

@@ -312,7 +312,7 @@ class TableDetector:
Returns: Returns:
2D list of cell contents 2D list of cell contents
""" """
# This would use Tesseract on the cropped region # This would use OCR on the cropped region
# For now, return empty - actual OCR will be done in manual_extractor # For now, return empty - actual OCR will be done in manual_extractor
logger.debug(f"Table region: ({table.x}, {table.y}) {table.width}x{table.height}") logger.debug(f"Table region: ({table.x}, {table.y}) {table.width}x{table.height}")
return [] return []

View File

@@ -226,7 +226,7 @@ class VinValidator:
Uses two strategies: Uses two strategies:
1. Find continuous 11-20 char alphanumeric runs (handles intact VINs) 1. Find continuous 11-20 char alphanumeric runs (handles intact VINs)
2. Concatenate adjacent short fragments separated by spaces/dashes 2. Concatenate adjacent short fragments separated by spaces/dashes
(handles Tesseract fragmenting VINs into multiple words) (handles OCR fragmenting VINs into multiple words)
Args: Args:
text: Raw OCR text text: Raw OCR text

View File

@@ -14,7 +14,6 @@ opencv-python-headless>=4.8.0
numpy>=1.24.0 numpy>=1.24.0
# OCR Engines # OCR Engines
pytesseract>=0.3.10
paddlepaddle>=2.6.0 paddlepaddle>=2.6.0
paddleocr>=2.8.0 paddleocr>=2.8.0
google-cloud-vision>=3.7.0 google-cloud-vision>=3.7.0

View File

@@ -1,7 +1,7 @@
"""Tests for OCR engine abstraction layer. """Tests for OCR engine abstraction layer.
Covers: base types, exception hierarchy, PaddleOcrEngine, Covers: base types, exception hierarchy, PaddleOcrEngine,
TesseractEngine, CloudEngine, HybridEngine, and engine_factory. CloudEngine, HybridEngine, and engine_factory.
""" """
import io import io
@@ -124,7 +124,7 @@ class TestOcrEngineResult:
def test_empty_result(self) -> None: def test_empty_result(self) -> None:
result = OcrEngineResult( result = OcrEngineResult(
text="", confidence=0.0, word_boxes=[], engine_name="tesseract" text="", confidence=0.0, word_boxes=[], engine_name="paddleocr"
) )
assert result.text == "" assert result.text == ""
assert result.word_boxes == [] assert result.word_boxes == []
@@ -303,85 +303,6 @@ class TestPaddleOcrEngine:
engine.recognize(_create_test_image_bytes(), OcrConfig()) engine.recognize(_create_test_image_bytes(), OcrConfig())
# ---------------------------------------------------------------------------
# TesseractEngine
# ---------------------------------------------------------------------------
class TestTesseractEngine:
    """Tests for TesseractEngine using mocked pytesseract."""

    @pytest.fixture()
    def engine(self) -> "TesseractEngine":  # type: ignore[name-defined]
        """Create a TesseractEngine with mocked pytesseract dependency."""
        mock_pytesseract = MagicMock()
        mock_pytesseract.Output.DICT = "dict"
        with patch.dict("sys.modules", {"pytesseract": mock_pytesseract}):
            with patch("app.engines.tesseract_engine.settings") as mock_settings:
                mock_settings.tesseract_cmd = "/usr/bin/tesseract"
                from app.engines.tesseract_engine import TesseractEngine

                eng = TesseractEngine()
                # Keep a handle on the mock so tests can inspect its calls.
                eng._mock_pytesseract = mock_pytesseract  # type: ignore[attr-defined]
                return eng

    def test_name(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        assert engine.name == "tesseract"

    def test_build_config_default_psm(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        config_str = engine._build_config(OcrConfig())
        assert "--psm 6" in config_str

    def test_build_config_single_line(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        config_str = engine._build_config(OcrConfig(single_line=True))
        assert "--psm 7" in config_str

    def test_build_config_single_word(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        config_str = engine._build_config(OcrConfig(single_word=True))
        assert "--psm 8" in config_str

    def test_build_config_whitelist(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        config_str = engine._build_config(OcrConfig(char_whitelist="ABC123"))
        assert "-c tessedit_char_whitelist=ABC123" in config_str

    def test_build_config_psm_hint(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        config_str = engine._build_config(OcrConfig(hints={"psm": 11}))
        assert "--psm 11" in config_str

    def test_recognize_normalizes_confidence(self, engine: "TesseractEngine") -> None:  # type: ignore[name-defined]
        """Tesseract returns 0-100 confidence; engine normalizes to 0.0-1.0."""
        # Columns are kept the same length, one entry per row. The second
        # row (blank text, conf -1) stands for a non-word row that the
        # engine is expected to skip.
        engine._pytesseract.image_to_data.return_value = {
            "text": ["HELLO", ""],
            "conf": [92, -1],
            "left": [10, 0],
            "top": [20, 0],
            "width": [100, 0],
            "height": [30, 0],
        }

        result = engine.recognize(_create_test_image_bytes(), OcrConfig())
        assert result.text == "HELLO"
        assert abs(result.confidence - 0.92) < 0.01
        assert result.engine_name == "tesseract"

    def test_import_error_raises_unavailable(self) -> None:
        """A missing pytesseract surfaces as EngineUnavailableError."""
        import builtins

        # Bind the genuine import function BEFORE patching. Looking up
        # ``__import__`` inside the hook would resolve to the patched mock
        # itself and recurse until RecursionError on every other import.
        real_import = builtins.__import__

        def mock_import(name, *args, **kwargs):
            if name == "pytesseract":
                raise ImportError("No module named 'pytesseract'")
            return real_import(name, *args, **kwargs)

        with patch.dict("sys.modules", {"pytesseract": None}):
            with patch("app.engines.tesseract_engine.settings") as mock_settings:
                mock_settings.tesseract_cmd = "/usr/bin/tesseract"

                with patch("builtins.__import__", side_effect=mock_import):
                    from app.engines.tesseract_engine import TesseractEngine

                    with pytest.raises(EngineUnavailableError, match="pytesseract"):
                        TesseractEngine()
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# CloudEngine # CloudEngine
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -637,8 +558,8 @@ class TestEngineFactory:
from app.engines.engine_factory import create_engine from app.engines.engine_factory import create_engine
create_engine("tesseract") create_engine("google_vision")
mock_create.assert_called_once_with("tesseract") mock_create.assert_called_once_with("google_vision")
@patch("app.engines.engine_factory.settings") @patch("app.engines.engine_factory.settings")
@patch("app.engines.engine_factory._create_single_engine") @patch("app.engines.engine_factory._create_single_engine")

View File

@@ -39,14 +39,9 @@ def test_pillow_heif_can_register():
assert "HEIF" in Image.registered_extensions().values() assert "HEIF" in Image.registered_extensions().values()
def test_tesseract_available(): def test_paddleocr_engine_available():
"""Tesseract OCR is available and can process images.""" """PaddleOCR engine can be created."""
import pytesseract from app.engines.paddle_engine import PaddleOcrEngine
# Create a simple test image with text engine = PaddleOcrEngine()
img = Image.new("RGB", (200, 50), color="white") assert engine.name == "paddleocr"
# Verify pytesseract can call tesseract (will return empty string for blank image)
result = pytesseract.image_to_string(img)
# Just verify it doesn't raise an exception - blank image returns empty/whitespace
assert isinstance(result, str)

View File

@@ -165,7 +165,7 @@ class TestVinValidator:
"""Test candidate extraction handles space-fragmented VINs from OCR.""" """Test candidate extraction handles space-fragmented VINs from OCR."""
validator = VinValidator() validator = VinValidator()
# Tesseract often fragments VINs into multiple words # OCR engines sometimes fragment VINs into multiple words
text = "1HGBH 41JXMN 109186" text = "1HGBH 41JXMN 109186"
candidates = validator.extract_candidates(text) candidates = validator.extract_candidates(text)