feat: migrate VIN/receipt extractors and OCR service to engine abstraction (refs #117)
Replace direct pytesseract calls with the OcrEngine interface in vin_extractor.py, receipt_extractor.py, and ocr_service.py. The PSM fallbacks are replaced with engine-agnostic single-line/single-word configs. The dead _process_ocr_data code is removed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,16 +1,13 @@
|
||||
"""Receipt-specific OCR extractor with field extraction."""
|
||||
import io
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Optional
|
||||
|
||||
import magic
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
from pillow_heif import register_heif_opener
|
||||
|
||||
from app.config import settings
|
||||
from app.engines import OcrConfig, create_engine
|
||||
from app.extractors.base import BaseExtractor
|
||||
from app.preprocessors.receipt_preprocessor import receipt_preprocessor
|
||||
from app.patterns import currency_matcher, date_matcher, fuel_matcher
|
||||
@@ -53,8 +50,8 @@ class ReceiptExtractor(BaseExtractor):
|
||||
}
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize receipt extractor."""
|
||||
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
|
||||
"""Initialize receipt extractor with engine from factory."""
|
||||
self._engine = create_engine()
|
||||
|
||||
def extract(
|
||||
self,
|
||||
@@ -150,26 +147,19 @@ class ReceiptExtractor(BaseExtractor):
|
||||
detected = mime.from_buffer(file_bytes)
|
||||
return detected or "application/octet-stream"
|
||||
|
||||
def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str:
|
||||
def _perform_ocr(self, image_bytes: bytes) -> str:
|
||||
"""
|
||||
Perform OCR on preprocessed image.
|
||||
Perform OCR on preprocessed image via engine abstraction.
|
||||
|
||||
Args:
|
||||
image_bytes: Preprocessed image bytes
|
||||
psm: Tesseract page segmentation mode
|
||||
4 = Assume single column of text
|
||||
6 = Uniform block of text
|
||||
|
||||
Returns:
|
||||
Raw OCR text
|
||||
"""
|
||||
image = Image.open(io.BytesIO(image_bytes))
|
||||
|
||||
# Configure Tesseract for receipt OCR
|
||||
# PSM 4 works well for columnar receipt text
|
||||
config = f"--psm {psm}"
|
||||
|
||||
return pytesseract.image_to_string(image, config=config)
|
||||
config = OcrConfig()
|
||||
result = self._engine.recognize(image_bytes, config)
|
||||
return result.text
|
||||
|
||||
def _detect_receipt_type(self, text: str) -> str:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user