feat: migrate VIN/receipt extractors and OCR service to engine abstraction (refs #117)
Replace direct pytesseract calls with the OcrEngine interface in vin_extractor.py, receipt_extractor.py, and ocr_service.py. Tesseract PSM fallbacks are replaced with engine-agnostic single-line/single-word configs, and the dead _process_ocr_data helper is removed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,15 +1,14 @@
|
||||
"""Core OCR service using Tesseract with HEIC support."""
|
||||
"""Core OCR service with HEIC support, using pluggable engine abstraction."""
|
||||
import io
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import magic
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
from pillow_heif import register_heif_opener
|
||||
|
||||
from app.config import settings
|
||||
from app.engines import OcrConfig, create_engine
|
||||
from app.models import DocumentType, ExtractedField, OcrResponse
|
||||
from app.services.preprocessor import preprocessor
|
||||
|
||||
@@ -32,8 +31,8 @@ class OcrService:
|
||||
}
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize OCR service."""
|
||||
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
|
||||
"""Initialize OCR service with engine from factory."""
|
||||
self._engine = create_engine()
|
||||
|
||||
def extract(
|
||||
self,
|
||||
@@ -86,14 +85,11 @@ class OcrService:
|
||||
file_bytes, deskew=True, denoise=True
|
||||
)
|
||||
|
||||
# Perform OCR
|
||||
image = Image.open(io.BytesIO(file_bytes))
|
||||
ocr_data = pytesseract.image_to_data(
|
||||
image, output_type=pytesseract.Output.DICT
|
||||
)
|
||||
|
||||
# Extract text and calculate confidence
|
||||
raw_text, confidence = self._process_ocr_data(ocr_data)
|
||||
# Perform OCR via engine abstraction
|
||||
config = OcrConfig()
|
||||
result = self._engine.recognize(file_bytes, config)
|
||||
raw_text = result.text
|
||||
confidence = result.confidence
|
||||
|
||||
# Detect document type from content
|
||||
document_type = self._detect_document_type(raw_text)
|
||||
@@ -160,26 +156,6 @@ class OcrService:
|
||||
|
||||
return b""
|
||||
|
||||
def _process_ocr_data(
|
||||
self, ocr_data: dict
|
||||
) -> tuple[str, float]:
|
||||
"""Process Tesseract output to extract text and confidence."""
|
||||
words = []
|
||||
confidences = []
|
||||
|
||||
for i, text in enumerate(ocr_data["text"]):
|
||||
# Filter out empty strings and low-confidence results
|
||||
conf = int(ocr_data["conf"][i])
|
||||
if text.strip() and conf > 0:
|
||||
words.append(text)
|
||||
confidences.append(conf)
|
||||
|
||||
raw_text = " ".join(words)
|
||||
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
|
||||
|
||||
# Normalize confidence to 0-1 range (Tesseract returns 0-100)
|
||||
return raw_text, avg_confidence / 100.0
|
||||
|
||||
def _detect_document_type(self, text: str) -> DocumentType:
|
||||
"""Detect document type from extracted text content."""
|
||||
text_lower = text.lower()
|
||||
|
||||
Reference in New Issue
Block a user