fix: Build errors and tesseract removal
Some checks failed
Deploy to Staging / Build Images (pull_request) Failing after 4m14s
Deploy to Staging / Deploy to Staging (pull_request) Has been skipped
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 8s
Some checks failed
Deploy to Staging / Build Images (pull_request) Failing after 4m14s
Deploy to Staging / Deploy to Staging (pull_request) Has been skipped
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 8s
This commit is contained in:
@@ -2,7 +2,6 @@
|
||||
# Uses mirrored base images from Gitea Package Registry
|
||||
#
|
||||
# Primary engine: PaddleOCR PP-OCRv4 (models baked into image)
|
||||
# Backward compat: Tesseract 5.x (optional, via TesseractEngine)
|
||||
# Cloud fallback: Google Vision (optional, requires API key at runtime)
|
||||
|
||||
# Build argument for registry (defaults to Gitea mirrors, falls back to Docker Hub)
|
||||
@@ -11,21 +10,16 @@ ARG REGISTRY_MIRRORS=git.motovaultpro.com/egullickson/mirrors
|
||||
FROM ${REGISTRY_MIRRORS}/python:3.13-slim
|
||||
|
||||
# System dependencies
|
||||
# - tesseract-ocr/eng: Backward-compatible OCR engine (used by TesseractEngine)
|
||||
# - libgomp1: OpenMP runtime required by PaddlePaddle
|
||||
# - libheif1/libheif-dev: HEIF image support (iPhone photos)
|
||||
# - libglib2.0-0: GLib shared library (OpenCV dependency)
|
||||
# - libgl1-mesa-glx: OpenGL runtime (OpenCV cv2 dependency, pulled by PaddleX)
|
||||
# - libmagic1: File type detection
|
||||
# - curl: Health check endpoint
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-eng \
|
||||
libgomp1 \
|
||||
libheif1 \
|
||||
libheif-dev \
|
||||
libglib2.0-0 \
|
||||
libgl1-mesa-glx \
|
||||
libmagic1 \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
@@ -33,7 +27,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
# Python dependencies
|
||||
WORKDIR /app
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
# Install dependencies. PaddleX (transitive via paddleocr) pulls in the full
|
||||
# opencv-python which requires libGL.so.1. Force-reinstall the headless
|
||||
# variant afterwards so the container stays GUI-free.
|
||||
RUN pip install --no-cache-dir -r requirements.txt \
|
||||
&& pip install --no-cache-dir --force-reinstall opencv-python-headless
|
||||
|
||||
# Pre-download PaddleOCR PP-OCRv4 models during build (not at runtime).
|
||||
# Models are baked into the image so container starts are fast and
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
|
||||
| Directory | What | When to read |
|
||||
| --------- | ---- | ------------ |
|
||||
| `engines/` | OCR engine abstraction (PaddleOCR primary, Google Vision fallback, Tesseract compat) | Engine changes, adding new engines |
|
||||
| `engines/` | OCR engine abstraction (PaddleOCR primary, Google Vision fallback) | Engine changes, adding new engines |
|
||||
| `extractors/` | Data extraction logic | Adding new extraction types |
|
||||
| `models/` | Data models and schemas | Request/response types |
|
||||
| `patterns/` | Regex and parsing patterns | Pattern matching rules |
|
||||
|
||||
@@ -9,8 +9,6 @@ class Settings:
|
||||
self.log_level: str = os.getenv("LOG_LEVEL", "info")
|
||||
self.host: str = os.getenv("HOST", "0.0.0.0")
|
||||
self.port: int = int(os.getenv("PORT", "8000"))
|
||||
self.tesseract_cmd: str = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
|
||||
|
||||
# OCR engine configuration
|
||||
self.ocr_primary_engine: str = os.getenv("OCR_PRIMARY_ENGINE", "paddleocr")
|
||||
self.ocr_confidence_threshold: float = float(
|
||||
|
||||
@@ -5,7 +5,6 @@ decoupling extractors from specific OCR libraries.
|
||||
|
||||
Engines:
|
||||
- PaddleOcrEngine: PaddleOCR PP-OCRv4 (primary, CPU-only)
|
||||
- TesseractEngine: pytesseract wrapper (backward compatibility)
|
||||
- CloudEngine: Google Vision TEXT_DETECTION (optional cloud fallback)
|
||||
- HybridEngine: Primary + fallback with confidence threshold
|
||||
"""
|
||||
|
||||
@@ -57,7 +57,7 @@ class OcrEngineResult:
|
||||
text: str
|
||||
confidence: float # 0.0-1.0
|
||||
word_boxes: list[WordBox]
|
||||
engine_name: str # "paddleocr", "tesseract", "google_vision"
|
||||
engine_name: str # "paddleocr", "google_vision"
|
||||
|
||||
|
||||
# --- Abstract base ---
|
||||
|
||||
@@ -11,7 +11,6 @@ logger = logging.getLogger(__name__)
|
||||
# Valid engine identifiers (primary engines only; hybrid is constructed separately)
|
||||
_ENGINE_REGISTRY: dict[str, str] = {
|
||||
"paddleocr": "app.engines.paddle_engine.PaddleOcrEngine",
|
||||
"tesseract": "app.engines.tesseract_engine.TesseractEngine",
|
||||
"google_vision": "app.engines.cloud_engine.CloudEngine",
|
||||
}
|
||||
|
||||
@@ -46,7 +45,7 @@ def create_engine(engine_name: str | None = None) -> OcrEngine:
|
||||
returns a ``HybridEngine`` that wraps the primary with the fallback.
|
||||
|
||||
Args:
|
||||
engine_name: Engine identifier ("paddleocr", "tesseract").
|
||||
engine_name: Engine identifier ("paddleocr", "google_vision").
|
||||
Falls back to ``settings.ocr_primary_engine``.
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -1,114 +0,0 @@
|
||||
"""Tesseract engine wrapper for backward compatibility."""
|
||||
|
||||
import io
|
||||
import logging
|
||||
|
||||
from app.config import settings
|
||||
from app.engines.base_engine import (
|
||||
EngineProcessingError,
|
||||
EngineUnavailableError,
|
||||
OcrConfig,
|
||||
OcrEngine,
|
||||
OcrEngineResult,
|
||||
WordBox,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TesseractEngine(OcrEngine):
|
||||
"""pytesseract wrapper conforming to the OcrEngine interface."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
try:
|
||||
import pytesseract # type: ignore[import-untyped]
|
||||
|
||||
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
|
||||
self._pytesseract = pytesseract
|
||||
logger.info("TesseractEngine initialized (cmd=%s)", settings.tesseract_cmd)
|
||||
except ImportError as exc:
|
||||
raise EngineUnavailableError(
|
||||
"pytesseract is not installed. "
|
||||
"Install with: pip install pytesseract"
|
||||
) from exc
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
return "tesseract"
|
||||
|
||||
def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
|
||||
"""Run Tesseract OCR on image bytes."""
|
||||
try:
|
||||
from PIL import Image
|
||||
|
||||
image = Image.open(io.BytesIO(image_bytes))
|
||||
|
||||
# Build Tesseract config string from OcrConfig
|
||||
tess_config = self._build_config(config)
|
||||
|
||||
# Get word-level data
|
||||
ocr_data = self._pytesseract.image_to_data(
|
||||
image,
|
||||
config=tess_config,
|
||||
output_type=self._pytesseract.Output.DICT,
|
||||
)
|
||||
|
||||
word_boxes: list[WordBox] = []
|
||||
texts: list[str] = []
|
||||
confidences: list[float] = []
|
||||
|
||||
for i, text in enumerate(ocr_data["text"]):
|
||||
conf = int(ocr_data["conf"][i])
|
||||
if text.strip() and conf > 0:
|
||||
normalized_conf = conf / 100.0
|
||||
word_boxes.append(
|
||||
WordBox(
|
||||
text=text.strip(),
|
||||
confidence=normalized_conf,
|
||||
x=int(ocr_data["left"][i]),
|
||||
y=int(ocr_data["top"][i]),
|
||||
width=int(ocr_data["width"][i]),
|
||||
height=int(ocr_data["height"][i]),
|
||||
)
|
||||
)
|
||||
texts.append(text.strip())
|
||||
confidences.append(normalized_conf)
|
||||
|
||||
combined_text = " ".join(texts)
|
||||
avg_confidence = (
|
||||
sum(confidences) / len(confidences) if confidences else 0.0
|
||||
)
|
||||
|
||||
return OcrEngineResult(
|
||||
text=combined_text,
|
||||
confidence=avg_confidence,
|
||||
word_boxes=word_boxes,
|
||||
engine_name=self.name,
|
||||
)
|
||||
|
||||
except (EngineUnavailableError, EngineProcessingError):
|
||||
raise
|
||||
except Exception as exc:
|
||||
raise EngineProcessingError(
|
||||
f"Tesseract recognition failed: {exc}"
|
||||
) from exc
|
||||
|
||||
def _build_config(self, config: OcrConfig) -> str:
|
||||
"""Translate OcrConfig into a Tesseract CLI config string."""
|
||||
parts: list[str] = []
|
||||
|
||||
# Page segmentation mode
|
||||
if config.single_word:
|
||||
parts.append("--psm 8")
|
||||
elif config.single_line:
|
||||
parts.append("--psm 7")
|
||||
else:
|
||||
# Default: assume uniform block of text
|
||||
psm = config.hints.get("psm", 6)
|
||||
parts.append(f"--psm {psm}")
|
||||
|
||||
# Character whitelist
|
||||
if config.char_whitelist:
|
||||
parts.append(f"-c tessedit_char_whitelist={config.char_whitelist}")
|
||||
|
||||
return " ".join(parts)
|
||||
@@ -5,9 +5,9 @@ import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Callable, Optional
|
||||
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
|
||||
from app.engines import create_engine, OcrConfig
|
||||
from app.preprocessors.pdf_preprocessor import pdf_preprocessor, PdfInfo
|
||||
from app.table_extraction.detector import table_detector, DetectedTable
|
||||
from app.table_extraction.parser import table_parser, ParsedScheduleRow
|
||||
@@ -243,8 +243,9 @@ class ManualExtractor:
|
||||
|
||||
# OCR the full page
|
||||
try:
|
||||
image = Image.open(io.BytesIO(image_bytes))
|
||||
ocr_text = pytesseract.image_to_string(image)
|
||||
engine = create_engine()
|
||||
ocr_result = engine.recognize(image_bytes, OcrConfig())
|
||||
ocr_text = ocr_result.text
|
||||
|
||||
# Mark tables as maintenance if page contains maintenance keywords
|
||||
for table in detected_tables:
|
||||
@@ -358,8 +359,9 @@ class ManualExtractor:
|
||||
|
||||
if not text and first_page.image_bytes:
|
||||
# OCR first page
|
||||
image = Image.open(io.BytesIO(first_page.image_bytes))
|
||||
text = pytesseract.image_to_string(image)
|
||||
engine = create_engine()
|
||||
ocr_result = engine.recognize(first_page.image_bytes, OcrConfig())
|
||||
text = ocr_result.text
|
||||
|
||||
if text:
|
||||
return self._parse_vehicle_from_text(text)
|
||||
|
||||
@@ -316,8 +316,8 @@ class VinExtractor(BaseExtractor):
|
||||
single-line - Treat as a single text line
|
||||
single-word - Treat as a single word
|
||||
|
||||
For PaddleOCR, angle classification handles rotated/angled text
|
||||
inherently, replacing the need for Tesseract PSM mode fallbacks.
|
||||
PaddleOCR angle classification handles rotated/angled text
|
||||
inherently, so no PSM mode fallbacks are needed.
|
||||
|
||||
Returns:
|
||||
List of VIN candidates
|
||||
|
||||
@@ -93,7 +93,7 @@ class VinPreprocessor:
|
||||
gray = cv_image
|
||||
steps_applied.append("grayscale")
|
||||
|
||||
# Upscale small images for better OCR (Tesseract needs ~300 DPI)
|
||||
# Upscale small images for better OCR (~300 DPI recommended)
|
||||
gray = self._ensure_minimum_resolution(gray)
|
||||
steps_applied.append("resolution_check")
|
||||
|
||||
@@ -129,14 +129,14 @@ class VinPreprocessor:
|
||||
)
|
||||
|
||||
# Minimum width in pixels for reliable VIN OCR.
|
||||
# A 17-char VIN needs ~30px per character for Tesseract accuracy.
|
||||
# A 17-char VIN needs ~30px per character for reliable OCR accuracy.
|
||||
MIN_WIDTH_FOR_VIN = 600
|
||||
|
||||
def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray:
|
||||
"""
|
||||
Upscale image if too small for reliable OCR.
|
||||
|
||||
Tesseract works best at ~300 DPI. Mobile photos of VINs may have
|
||||
OCR works best at ~300 DPI. Mobile photos of VINs may have
|
||||
the text occupy only a small portion of the frame, resulting in
|
||||
low effective resolution for the VIN characters.
|
||||
"""
|
||||
@@ -160,7 +160,7 @@ class VinPreprocessor:
|
||||
Colored backgrounds have a low min value (e.g. green sticker:
|
||||
min(130,230,150) = 130) → inverted to 125 (medium gray).
|
||||
|
||||
The inversion ensures Tesseract always receives dark-text-on-
|
||||
The inversion ensures the OCR engine always receives dark-text-on-
|
||||
light-background, which is the polarity it expects.
|
||||
"""
|
||||
b_channel, g_channel, r_channel = cv2.split(bgr_image)
|
||||
@@ -168,8 +168,8 @@ class VinPreprocessor:
|
||||
min_channel = np.minimum(np.minimum(b_channel, g_channel), r_channel)
|
||||
|
||||
# Invert so white text (min=255) becomes black (0) and colored
|
||||
# backgrounds (min~130) become lighter gray (~125). Tesseract
|
||||
# expects dark text on light background.
|
||||
# backgrounds (min~130) become lighter gray (~125). OCR engines
|
||||
# expect dark text on light background.
|
||||
inverted = cv2.bitwise_not(min_channel)
|
||||
|
||||
gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
|
||||
|
||||
@@ -312,7 +312,7 @@ class TableDetector:
|
||||
Returns:
|
||||
2D list of cell contents
|
||||
"""
|
||||
# This would use Tesseract on the cropped region
|
||||
# This would use OCR on the cropped region
|
||||
# For now, return empty - actual OCR will be done in manual_extractor
|
||||
logger.debug(f"Table region: ({table.x}, {table.y}) {table.width}x{table.height}")
|
||||
return []
|
||||
|
||||
@@ -226,7 +226,7 @@ class VinValidator:
|
||||
Uses two strategies:
|
||||
1. Find continuous 11-20 char alphanumeric runs (handles intact VINs)
|
||||
2. Concatenate adjacent short fragments separated by spaces/dashes
|
||||
(handles Tesseract fragmenting VINs into multiple words)
|
||||
(handles OCR fragmenting VINs into multiple words)
|
||||
|
||||
Args:
|
||||
text: Raw OCR text
|
||||
|
||||
@@ -14,7 +14,6 @@ opencv-python-headless>=4.8.0
|
||||
numpy>=1.24.0
|
||||
|
||||
# OCR Engines
|
||||
pytesseract>=0.3.10
|
||||
paddlepaddle>=2.6.0
|
||||
paddleocr>=2.8.0
|
||||
google-cloud-vision>=3.7.0
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""Tests for OCR engine abstraction layer.
|
||||
|
||||
Covers: base types, exception hierarchy, PaddleOcrEngine,
|
||||
TesseractEngine, CloudEngine, HybridEngine, and engine_factory.
|
||||
CloudEngine, HybridEngine, and engine_factory.
|
||||
"""
|
||||
|
||||
import io
|
||||
@@ -124,7 +124,7 @@ class TestOcrEngineResult:
|
||||
|
||||
def test_empty_result(self) -> None:
|
||||
result = OcrEngineResult(
|
||||
text="", confidence=0.0, word_boxes=[], engine_name="tesseract"
|
||||
text="", confidence=0.0, word_boxes=[], engine_name="paddleocr"
|
||||
)
|
||||
assert result.text == ""
|
||||
assert result.word_boxes == []
|
||||
@@ -303,85 +303,6 @@ class TestPaddleOcrEngine:
|
||||
engine.recognize(_create_test_image_bytes(), OcrConfig())
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TesseractEngine
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestTesseractEngine:
|
||||
"""Tests for TesseractEngine using mocked pytesseract."""
|
||||
|
||||
@pytest.fixture()
|
||||
def engine(self) -> "TesseractEngine": # type: ignore[name-defined]
|
||||
"""Create a TesseractEngine with mocked pytesseract dependency."""
|
||||
mock_pytesseract = MagicMock()
|
||||
mock_pytesseract.Output.DICT = "dict"
|
||||
|
||||
with patch.dict("sys.modules", {"pytesseract": mock_pytesseract}):
|
||||
with patch("app.engines.tesseract_engine.settings") as mock_settings:
|
||||
mock_settings.tesseract_cmd = "/usr/bin/tesseract"
|
||||
from app.engines.tesseract_engine import TesseractEngine
|
||||
|
||||
eng = TesseractEngine()
|
||||
eng._mock_pytesseract = mock_pytesseract # type: ignore[attr-defined]
|
||||
return eng
|
||||
|
||||
def test_name(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined]
|
||||
assert engine.name == "tesseract"
|
||||
|
||||
def test_build_config_default_psm(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined]
|
||||
config_str = engine._build_config(OcrConfig())
|
||||
assert "--psm 6" in config_str
|
||||
|
||||
def test_build_config_single_line(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined]
|
||||
config_str = engine._build_config(OcrConfig(single_line=True))
|
||||
assert "--psm 7" in config_str
|
||||
|
||||
def test_build_config_single_word(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined]
|
||||
config_str = engine._build_config(OcrConfig(single_word=True))
|
||||
assert "--psm 8" in config_str
|
||||
|
||||
def test_build_config_whitelist(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined]
|
||||
config_str = engine._build_config(OcrConfig(char_whitelist="ABC123"))
|
||||
assert "-c tessedit_char_whitelist=ABC123" in config_str
|
||||
|
||||
def test_build_config_psm_hint(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined]
|
||||
config_str = engine._build_config(OcrConfig(hints={"psm": 11}))
|
||||
assert "--psm 11" in config_str
|
||||
|
||||
def test_recognize_normalizes_confidence(self, engine: "TesseractEngine") -> None: # type: ignore[name-defined]
|
||||
"""Tesseract returns 0-100 confidence; engine normalizes to 0.0-1.0."""
|
||||
engine._pytesseract.image_to_data.return_value = {
|
||||
"text": ["HELLO", ""],
|
||||
"conf": [92, -1],
|
||||
"left": [10],
|
||||
"top": [20],
|
||||
"width": [100],
|
||||
"height": [30],
|
||||
}
|
||||
|
||||
result = engine.recognize(_create_test_image_bytes(), OcrConfig())
|
||||
assert result.text == "HELLO"
|
||||
assert abs(result.confidence - 0.92) < 0.01
|
||||
assert result.engine_name == "tesseract"
|
||||
|
||||
def test_import_error_raises_unavailable(self) -> None:
|
||||
with patch.dict("sys.modules", {"pytesseract": None}):
|
||||
with patch("app.engines.tesseract_engine.settings") as mock_settings:
|
||||
mock_settings.tesseract_cmd = "/usr/bin/tesseract"
|
||||
|
||||
def mock_import(name, *args, **kwargs):
|
||||
if name == "pytesseract":
|
||||
raise ImportError("No module named 'pytesseract'")
|
||||
return __import__(name, *args, **kwargs)
|
||||
|
||||
with patch("builtins.__import__", side_effect=mock_import):
|
||||
from app.engines.tesseract_engine import TesseractEngine
|
||||
|
||||
with pytest.raises(EngineUnavailableError, match="pytesseract"):
|
||||
TesseractEngine()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CloudEngine
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -637,8 +558,8 @@ class TestEngineFactory:
|
||||
|
||||
from app.engines.engine_factory import create_engine
|
||||
|
||||
create_engine("tesseract")
|
||||
mock_create.assert_called_once_with("tesseract")
|
||||
create_engine("google_vision")
|
||||
mock_create.assert_called_once_with("google_vision")
|
||||
|
||||
@patch("app.engines.engine_factory.settings")
|
||||
@patch("app.engines.engine_factory._create_single_engine")
|
||||
|
||||
@@ -39,14 +39,9 @@ def test_pillow_heif_can_register():
|
||||
assert "HEIF" in Image.registered_extensions().values()
|
||||
|
||||
|
||||
def test_tesseract_available():
|
||||
"""Tesseract OCR is available and can process images."""
|
||||
import pytesseract
|
||||
def test_paddleocr_engine_available():
|
||||
"""PaddleOCR engine can be created."""
|
||||
from app.engines.paddle_engine import PaddleOcrEngine
|
||||
|
||||
# Create a simple test image with text
|
||||
img = Image.new("RGB", (200, 50), color="white")
|
||||
|
||||
# Verify pytesseract can call tesseract (will return empty string for blank image)
|
||||
result = pytesseract.image_to_string(img)
|
||||
# Just verify it doesn't raise an exception - blank image returns empty/whitespace
|
||||
assert isinstance(result, str)
|
||||
engine = PaddleOcrEngine()
|
||||
assert engine.name == "paddleocr"
|
||||
|
||||
@@ -165,7 +165,7 @@ class TestVinValidator:
|
||||
"""Test candidate extraction handles space-fragmented VINs from OCR."""
|
||||
validator = VinValidator()
|
||||
|
||||
# Tesseract often fragments VINs into multiple words
|
||||
# OCR engines sometimes fragment VINs into multiple words
|
||||
text = "1HGBH 41JXMN 109186"
|
||||
candidates = validator.extract_candidates(text)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user