fix: resolve build errors and remove Tesseract dependency

2026-02-07 12:12:04 -06:00
parent cf114fad3c
commit b9fe222f12
16 changed files with 35 additions and 238 deletions
--- a/ocr/app/extractors/manual_extractor.py
+++ b/ocr/app/extractors/manual_extractor.py
@@ -5,9 +5,9 @@ import time
 from dataclasses import dataclass, field
 from typing import Callable, Optional

-import pytesseract
 from PIL import Image

+from app.engines import create_engine, OcrConfig
 from app.preprocessors.pdf_preprocessor import pdf_preprocessor, PdfInfo
 from app.table_extraction.detector import table_detector, DetectedTable
 from app.table_extraction.parser import table_parser, ParsedScheduleRow
@@ -243,8 +243,9 @@ class ManualExtractor:

        # OCR the full page
        try:
-            image = Image.open(io.BytesIO(image_bytes))
-            ocr_text = pytesseract.image_to_string(image)
+            engine = create_engine()
+            ocr_result = engine.recognize(image_bytes, OcrConfig())
+            ocr_text = ocr_result.text

            # Mark tables as maintenance if page contains maintenance keywords
            for table in detected_tables:
@@ -358,8 +359,9 @@ class ManualExtractor:

            if not text and first_page.image_bytes:
                # OCR first page
-                image = Image.open(io.BytesIO(first_page.image_bytes))
-                text = pytesseract.image_to_string(image)
+                engine = create_engine()
+                ocr_result = engine.recognize(first_page.image_bytes, OcrConfig())
+                text = ocr_result.text

            if text:
                return self._parse_vehicle_from_text(text)
--- a/ocr/app/extractors/vin_extractor.py
+++ b/ocr/app/extractors/vin_extractor.py
@@ -316,8 +316,8 @@ class VinExtractor(BaseExtractor):
            single-line - Treat as a single text line
            single-word - Treat as a single word

-        For PaddleOCR, angle classification handles rotated/angled text
-        inherently, replacing the need for Tesseract PSM mode fallbacks.
+        PaddleOCR angle classification handles rotated/angled text
+        inherently, so no PSM mode fallbacks are needed.

        Returns:
            List of VIN candidates