fix: Build errors and tesseract removal
Some checks failed
Deploy to Staging / Build Images (pull_request) Failing after 4m14s
Deploy to Staging / Deploy to Staging (pull_request) Has been skipped
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 8s
Some checks failed
Deploy to Staging / Build Images (pull_request) Failing after 4m14s
Deploy to Staging / Deploy to Staging (pull_request) Has been skipped
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 8s
This commit is contained in:
@@ -5,9 +5,9 @@ import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Callable, Optional
|
||||
|
||||
-import pytesseract
 from PIL import Image

+from app.engines import create_engine, OcrConfig
|
||||
from app.preprocessors.pdf_preprocessor import pdf_preprocessor, PdfInfo
|
||||
from app.table_extraction.detector import table_detector, DetectedTable
|
||||
from app.table_extraction.parser import table_parser, ParsedScheduleRow
|
||||
@@ -243,8 +243,9 @@ class ManualExtractor:
|
||||
|
||||
# OCR the full page
|
||||
try:
|
||||
-image = Image.open(io.BytesIO(image_bytes))
-ocr_text = pytesseract.image_to_string(image)
+engine = create_engine()
+ocr_result = engine.recognize(image_bytes, OcrConfig())
+ocr_text = ocr_result.text
|
||||
|
||||
# Mark tables as maintenance if page contains maintenance keywords
|
||||
for table in detected_tables:
|
||||
@@ -358,8 +359,9 @@ class ManualExtractor:
|
||||
|
||||
if not text and first_page.image_bytes:
|
||||
# OCR first page
|
||||
-image = Image.open(io.BytesIO(first_page.image_bytes))
-text = pytesseract.image_to_string(image)
+engine = create_engine()
+ocr_result = engine.recognize(first_page.image_bytes, OcrConfig())
+text = ocr_result.text
|
||||
|
||||
if text:
|
||||
return self._parse_vehicle_from_text(text)
|
||||
|
||||
@@ -316,8 +316,8 @@ class VinExtractor(BaseExtractor):
|
||||
single-line - Treat as a single text line
|
||||
single-word - Treat as a single word
|
||||
|
||||
-For PaddleOCR, angle classification handles rotated/angled text
-inherently, replacing the need for Tesseract PSM mode fallbacks.
+PaddleOCR angle classification handles rotated/angled text
+inherently, so no PSM mode fallbacks are needed.
|
||||
|
||||
Returns:
|
||||
List of VIN candidates
|
||||
|
||||
Reference in New Issue
Block a user