fix: Build errors and tesseract removal
Some checks failed
Deploy to Staging / Build Images (pull_request) Failing after 4m14s
Deploy to Staging / Deploy to Staging (pull_request) Has been skipped
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 8s

This commit is contained in:
Eric Gullickson
2026-02-07 12:12:04 -06:00
parent cf114fad3c
commit b9fe222f12
16 changed files with 35 additions and 238 deletions

View File

@@ -12,7 +12,7 @@
| Directory | What | When to read |
| --------- | ---- | ------------ |
| `engines/` | OCR engine abstraction (PaddleOCR primary, Google Vision fallback, Tesseract compat) | Engine changes, adding new engines |
| `engines/` | OCR engine abstraction (PaddleOCR primary, Google Vision fallback) | Engine changes, adding new engines |
| `extractors/` | Data extraction logic | Adding new extraction types |
| `models/` | Data models and schemas | Request/response types |
| `patterns/` | Regex and parsing patterns | Pattern matching rules |

View File

@@ -9,8 +9,6 @@ class Settings:
self.log_level: str = os.getenv("LOG_LEVEL", "info")
self.host: str = os.getenv("HOST", "0.0.0.0")
self.port: int = int(os.getenv("PORT", "8000"))
self.tesseract_cmd: str = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
# OCR engine configuration
self.ocr_primary_engine: str = os.getenv("OCR_PRIMARY_ENGINE", "paddleocr")
self.ocr_confidence_threshold: float = float(

View File

@@ -5,7 +5,6 @@ decoupling extractors from specific OCR libraries.
Engines:
- PaddleOcrEngine: PaddleOCR PP-OCRv4 (primary, CPU-only)
- TesseractEngine: pytesseract wrapper (backward compatibility)
- CloudEngine: Google Vision TEXT_DETECTION (optional cloud fallback)
- HybridEngine: Primary + fallback with confidence threshold
"""

View File

@@ -57,7 +57,7 @@ class OcrEngineResult:
text: str
confidence: float # 0.0-1.0
word_boxes: list[WordBox]
engine_name: str # "paddleocr", "tesseract", "google_vision"
engine_name: str # "paddleocr", "google_vision"
# --- Abstract base ---

View File

@@ -11,7 +11,6 @@ logger = logging.getLogger(__name__)
# Valid engine identifiers (primary engines only; hybrid is constructed separately)
_ENGINE_REGISTRY: dict[str, str] = {
"paddleocr": "app.engines.paddle_engine.PaddleOcrEngine",
"tesseract": "app.engines.tesseract_engine.TesseractEngine",
"google_vision": "app.engines.cloud_engine.CloudEngine",
}
@@ -46,7 +45,7 @@ def create_engine(engine_name: str | None = None) -> OcrEngine:
returns a ``HybridEngine`` that wraps the primary with the fallback.
Args:
engine_name: Engine identifier ("paddleocr", "tesseract").
engine_name: Engine identifier ("paddleocr", "google_vision").
Falls back to ``settings.ocr_primary_engine``.
Returns:

View File

@@ -1,114 +0,0 @@
"""Tesseract engine wrapper for backward compatibility."""
import io
import logging
from app.config import settings
from app.engines.base_engine import (
EngineProcessingError,
EngineUnavailableError,
OcrConfig,
OcrEngine,
OcrEngineResult,
WordBox,
)
logger = logging.getLogger(__name__)
class TesseractEngine(OcrEngine):
    """OcrEngine implementation that delegates recognition to pytesseract."""

    def __init__(self) -> None:
        """Import pytesseract lazily and point it at the configured binary.

        The binary path is read from ``settings.tesseract_cmd``.

        Raises:
            EngineUnavailableError: If pytesseract cannot be imported.
        """
        try:
            import pytesseract  # type: ignore[import-untyped]

            binary_path = settings.tesseract_cmd
            pytesseract.pytesseract.tesseract_cmd = binary_path
            self._pytesseract = pytesseract
            logger.info("TesseractEngine initialized (cmd=%s)", binary_path)
        except ImportError as exc:
            raise EngineUnavailableError(
                "pytesseract is not installed. "
                "Install with: pip install pytesseract"
            ) from exc

    @property
    def name(self) -> str:
        """Identifier reported in ``OcrEngineResult.engine_name``."""
        return "tesseract"

    def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult:
        """Run Tesseract on an encoded image and collect word-level results.

        Entries whose text is blank after stripping, or whose confidence is
        not strictly positive, are dropped. Each kept confidence is divided
        by 100; the result confidence is the mean of the kept values, or 0.0
        when nothing was kept.

        Args:
            image_bytes: Encoded image data, opened via PIL.
            config: Engine-agnostic options translated by ``_build_config``.

        Returns:
            OcrEngineResult with space-joined text, mean confidence, and one
            WordBox per kept word.

        Raises:
            EngineProcessingError: On any failure other than an engine error
                that is already being propagated.
        """
        try:
            from PIL import Image

            page = Image.open(io.BytesIO(image_bytes))

            # Translate the generic OcrConfig into Tesseract CLI flags.
            cli_flags = self._build_config(config)

            # Ask for per-word data (text, confidence, bounding box) as a dict
            # of parallel lists.
            data = self._pytesseract.image_to_data(
                page,
                config=cli_flags,
                output_type=self._pytesseract.Output.DICT,
            )

            boxes: list[WordBox] = []
            words: list[str] = []
            scores: list[float] = []

            for idx, raw in enumerate(data["text"]):
                percent = int(data["conf"][idx])
                word = raw.strip()
                # Skip blank entries and anything without a positive confidence.
                if not word or percent <= 0:
                    continue

                score = percent / 100.0
                boxes.append(
                    WordBox(
                        text=word,
                        confidence=score,
                        x=int(data["left"][idx]),
                        y=int(data["top"][idx]),
                        width=int(data["width"][idx]),
                        height=int(data["height"][idx]),
                    )
                )
                words.append(word)
                scores.append(score)

            if scores:
                mean_score = sum(scores) / len(scores)
            else:
                mean_score = 0.0

            return OcrEngineResult(
                text=" ".join(words),
                confidence=mean_score,
                word_boxes=boxes,
                engine_name=self.name,
            )

        except (EngineUnavailableError, EngineProcessingError):
            # Already an engine-level error; pass it through unwrapped.
            raise
        except Exception as exc:
            raise EngineProcessingError(
                f"Tesseract recognition failed: {exc}"
            ) from exc

    def _build_config(self, config: OcrConfig) -> str:
        """Translate OcrConfig into a Tesseract CLI config string.

        ``single_word`` selects PSM 8 and takes precedence over
        ``single_line`` (PSM 7); otherwise the ``"psm"`` hint is used,
        defaulting to 6. A character whitelist, when set, is appended as a
        ``tessedit_char_whitelist`` option.
        """
        # Page segmentation mode
        if config.single_word:
            psm = 8
        elif config.single_line:
            psm = 7
        else:
            # Default: assume uniform block of text
            psm = config.hints.get("psm", 6)

        flags: list[str] = [f"--psm {psm}"]

        # Character whitelist
        if config.char_whitelist:
            flags.append(f"-c tessedit_char_whitelist={config.char_whitelist}")

        return " ".join(flags)

View File

@@ -5,9 +5,9 @@ import time
from dataclasses import dataclass, field
from typing import Callable, Optional
import pytesseract
from PIL import Image
from app.engines import create_engine, OcrConfig
from app.preprocessors.pdf_preprocessor import pdf_preprocessor, PdfInfo
from app.table_extraction.detector import table_detector, DetectedTable
from app.table_extraction.parser import table_parser, ParsedScheduleRow
@@ -243,8 +243,9 @@ class ManualExtractor:
# OCR the full page
try:
image = Image.open(io.BytesIO(image_bytes))
ocr_text = pytesseract.image_to_string(image)
engine = create_engine()
ocr_result = engine.recognize(image_bytes, OcrConfig())
ocr_text = ocr_result.text
# Mark tables as maintenance if page contains maintenance keywords
for table in detected_tables:
@@ -358,8 +359,9 @@ class ManualExtractor:
if not text and first_page.image_bytes:
# OCR first page
image = Image.open(io.BytesIO(first_page.image_bytes))
text = pytesseract.image_to_string(image)
engine = create_engine()
ocr_result = engine.recognize(first_page.image_bytes, OcrConfig())
text = ocr_result.text
if text:
return self._parse_vehicle_from_text(text)

View File

@@ -316,8 +316,8 @@ class VinExtractor(BaseExtractor):
single-line - Treat as a single text line
single-word - Treat as a single word
For PaddleOCR, angle classification handles rotated/angled text
inherently, replacing the need for Tesseract PSM mode fallbacks.
PaddleOCR angle classification handles rotated/angled text
inherently, so no PSM mode fallbacks are needed.
Returns:
List of VIN candidates

View File

@@ -93,7 +93,7 @@ class VinPreprocessor:
gray = cv_image
steps_applied.append("grayscale")
# Upscale small images for better OCR (Tesseract needs ~300 DPI)
# Upscale small images for better OCR (~300 DPI recommended)
gray = self._ensure_minimum_resolution(gray)
steps_applied.append("resolution_check")
@@ -129,14 +129,14 @@ class VinPreprocessor:
)
# Minimum width in pixels for reliable VIN OCR.
# A 17-char VIN needs ~30px per character for Tesseract accuracy.
# A 17-char VIN needs ~30px per character for reliable OCR accuracy.
MIN_WIDTH_FOR_VIN = 600
def _ensure_minimum_resolution(self, image: np.ndarray) -> np.ndarray:
"""
Upscale image if too small for reliable OCR.
Tesseract works best at ~300 DPI. Mobile photos of VINs may have
OCR works best at ~300 DPI. Mobile photos of VINs may have
the text occupy only a small portion of the frame, resulting in
low effective resolution for the VIN characters.
"""
@@ -160,7 +160,7 @@ class VinPreprocessor:
Colored backgrounds have a low min value (e.g. green sticker:
min(130,230,150) = 130) → inverted to 125 (medium gray).
The inversion ensures Tesseract always receives dark-text-on-
The inversion ensures the OCR engine always receives dark-text-on-
light-background, which is the polarity it expects.
"""
b_channel, g_channel, r_channel = cv2.split(bgr_image)
@@ -168,8 +168,8 @@ class VinPreprocessor:
min_channel = np.minimum(np.minimum(b_channel, g_channel), r_channel)
# Invert so white text (min=255) becomes black (0) and colored
# backgrounds (min~130) become lighter gray (~125). Tesseract
# expects dark text on light background.
# backgrounds (min~130) become lighter gray (~125). OCR engines
# expect dark text on light background.
inverted = cv2.bitwise_not(min_channel)
gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)

View File

@@ -312,7 +312,7 @@ class TableDetector:
Returns:
2D list of cell contents
"""
# This would use Tesseract on the cropped region
# This would use OCR on the cropped region
# For now, return empty - actual OCR will be done in manual_extractor
logger.debug(f"Table region: ({table.x}, {table.y}) {table.width}x{table.height}")
return []

View File

@@ -226,7 +226,7 @@ class VinValidator:
Uses two strategies:
1. Find continuous 11-20 char alphanumeric runs (handles intact VINs)
2. Concatenate adjacent short fragments separated by spaces/dashes
(handles Tesseract fragmenting VINs into multiple words)
(handles OCR fragmenting VINs into multiple words)
Args:
text: Raw OCR text