feat: migrate VIN/receipt extractors and OCR service to engine abstraction (refs #117)
Replace direct pytesseract calls with the OcrEngine interface in vin_extractor.py, receipt_extractor.py, and ocr_service.py. Replace the Tesseract PSM mode fallbacks with engine-agnostic single-line/single-word configs, and remove the dead _process_ocr_data helper.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,16 +1,13 @@
|
|||||||
"""Receipt-specific OCR extractor with field extraction."""
|
"""Receipt-specific OCR extractor with field extraction."""
|
||||||
import io
|
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
import magic
|
import magic
|
||||||
import pytesseract
|
|
||||||
from PIL import Image
|
|
||||||
from pillow_heif import register_heif_opener
|
from pillow_heif import register_heif_opener
|
||||||
|
|
||||||
from app.config import settings
|
from app.engines import OcrConfig, create_engine
|
||||||
from app.extractors.base import BaseExtractor
|
from app.extractors.base import BaseExtractor
|
||||||
from app.preprocessors.receipt_preprocessor import receipt_preprocessor
|
from app.preprocessors.receipt_preprocessor import receipt_preprocessor
|
||||||
from app.patterns import currency_matcher, date_matcher, fuel_matcher
|
from app.patterns import currency_matcher, date_matcher, fuel_matcher
|
||||||
@@ -53,8 +50,8 @@ class ReceiptExtractor(BaseExtractor):
|
|||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
"""Initialize receipt extractor."""
|
"""Initialize receipt extractor with engine from factory."""
|
||||||
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
|
self._engine = create_engine()
|
||||||
|
|
||||||
def extract(
|
def extract(
|
||||||
self,
|
self,
|
||||||
@@ -150,26 +147,19 @@ class ReceiptExtractor(BaseExtractor):
|
|||||||
detected = mime.from_buffer(file_bytes)
|
detected = mime.from_buffer(file_bytes)
|
||||||
return detected or "application/octet-stream"
|
return detected or "application/octet-stream"
|
||||||
|
|
||||||
def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str:
|
def _perform_ocr(self, image_bytes: bytes) -> str:
|
||||||
"""
|
"""
|
||||||
Perform OCR on preprocessed image.
|
Perform OCR on preprocessed image via engine abstraction.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image_bytes: Preprocessed image bytes
|
image_bytes: Preprocessed image bytes
|
||||||
psm: Tesseract page segmentation mode
|
|
||||||
4 = Assume single column of text
|
|
||||||
6 = Uniform block of text
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Raw OCR text
|
Raw OCR text
|
||||||
"""
|
"""
|
||||||
image = Image.open(io.BytesIO(image_bytes))
|
config = OcrConfig()
|
||||||
|
result = self._engine.recognize(image_bytes, config)
|
||||||
# Configure Tesseract for receipt OCR
|
return result.text
|
||||||
# PSM 4 works well for columnar receipt text
|
|
||||||
config = f"--psm {psm}"
|
|
||||||
|
|
||||||
return pytesseract.image_to_string(image, config=config)
|
|
||||||
|
|
||||||
def _detect_receipt_type(self, text: str) -> str:
|
def _detect_receipt_type(self, text: str) -> str:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
"""VIN-specific OCR extractor with preprocessing and validation."""
|
"""VIN-specific OCR extractor with preprocessing and validation."""
|
||||||
import io
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
@@ -8,11 +7,10 @@ from datetime import datetime
|
|||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import magic
|
import magic
|
||||||
import pytesseract
|
|
||||||
from PIL import Image
|
|
||||||
from pillow_heif import register_heif_opener
|
from pillow_heif import register_heif_opener
|
||||||
|
|
||||||
from app.config import settings
|
from app.config import settings
|
||||||
|
from app.engines import OcrConfig, create_engine
|
||||||
from app.extractors.base import BaseExtractor
|
from app.extractors.base import BaseExtractor
|
||||||
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
|
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
|
||||||
from app.validators.vin_validator import vin_validator
|
from app.validators.vin_validator import vin_validator
|
||||||
@@ -56,15 +54,15 @@ class VinExtractor(BaseExtractor):
|
|||||||
"image/heif",
|
"image/heif",
|
||||||
}
|
}
|
||||||
|
|
||||||
# VIN character whitelist for Tesseract
|
# VIN character whitelist (passed to engine for post-OCR filtering)
|
||||||
VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
|
VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
|
||||||
|
|
||||||
# Fixed debug output directory (inside container)
|
# Fixed debug output directory (inside container)
|
||||||
DEBUG_DIR = "/tmp/vin-debug"
|
DEBUG_DIR = "/tmp/vin-debug"
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
"""Initialize VIN extractor."""
|
"""Initialize VIN extractor with engine from factory."""
|
||||||
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
|
self._engine = create_engine()
|
||||||
self._debug = settings.log_level.upper() == "DEBUG"
|
self._debug = settings.log_level.upper() == "DEBUG"
|
||||||
|
|
||||||
def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
|
def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
|
||||||
@@ -135,21 +133,21 @@ class VinExtractor(BaseExtractor):
|
|||||||
|
|
||||||
# Perform OCR with VIN-optimized settings
|
# Perform OCR with VIN-optimized settings
|
||||||
raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
|
raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
|
||||||
logger.debug("PSM 6 raw text: '%s'", raw_text)
|
logger.debug("Primary OCR raw text: '%s'", raw_text)
|
||||||
logger.debug("PSM 6 word confidences: %s", word_confidences)
|
logger.debug("Primary OCR word confidences: %s", word_confidences)
|
||||||
|
|
||||||
# Extract VIN candidates from raw text
|
# Extract VIN candidates from raw text
|
||||||
candidates = vin_validator.extract_candidates(raw_text)
|
candidates = vin_validator.extract_candidates(raw_text)
|
||||||
logger.debug("PSM 6 candidates: %s", candidates)
|
logger.debug("Primary OCR candidates: %s", candidates)
|
||||||
|
|
||||||
if not candidates:
|
if not candidates:
|
||||||
# No VIN candidates found - try with different PSM modes
|
# No VIN candidates found - try alternate OCR configurations
|
||||||
candidates = self._try_alternate_ocr(preprocessed_bytes)
|
candidates = self._try_alternate_ocr(preprocessed_bytes)
|
||||||
|
|
||||||
if not candidates:
|
if not candidates:
|
||||||
# Try grayscale-only (no thresholding) — the Tesseract
|
# Try grayscale-only (no thresholding) — OCR engines often
|
||||||
# LSTM engine often performs better on non-binarized input
|
# perform better on non-binarized input because they do
|
||||||
# because it does its own internal preprocessing.
|
# their own internal preprocessing.
|
||||||
gray_result = vin_preprocessor.preprocess(
|
gray_result = vin_preprocessor.preprocess(
|
||||||
image_bytes, apply_threshold=False
|
image_bytes, apply_threshold=False
|
||||||
)
|
)
|
||||||
@@ -166,9 +164,9 @@ class VinExtractor(BaseExtractor):
|
|||||||
raw_text, word_confidences = self._perform_ocr(
|
raw_text, word_confidences = self._perform_ocr(
|
||||||
gray_result.image_bytes
|
gray_result.image_bytes
|
||||||
)
|
)
|
||||||
logger.debug("Gray PSM 6 raw text: '%s'", raw_text)
|
logger.debug("Gray primary raw text: '%s'", raw_text)
|
||||||
candidates = vin_validator.extract_candidates(raw_text)
|
candidates = vin_validator.extract_candidates(raw_text)
|
||||||
logger.debug("Gray PSM 6 candidates: %s", candidates)
|
logger.debug("Gray primary candidates: %s", candidates)
|
||||||
if not candidates:
|
if not candidates:
|
||||||
candidates = self._try_alternate_ocr(
|
candidates = self._try_alternate_ocr(
|
||||||
gray_result.image_bytes, prefix="Gray"
|
gray_result.image_bytes, prefix="Gray"
|
||||||
@@ -188,9 +186,9 @@ class VinExtractor(BaseExtractor):
|
|||||||
)
|
)
|
||||||
|
|
||||||
raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
|
raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
|
||||||
logger.debug("Otsu PSM 6 raw text: '%s'", raw_text)
|
logger.debug("Otsu primary raw text: '%s'", raw_text)
|
||||||
candidates = vin_validator.extract_candidates(raw_text)
|
candidates = vin_validator.extract_candidates(raw_text)
|
||||||
logger.debug("Otsu PSM 6 candidates: %s", candidates)
|
logger.debug("Otsu primary candidates: %s", candidates)
|
||||||
if not candidates:
|
if not candidates:
|
||||||
candidates = self._try_alternate_ocr(
|
candidates = self._try_alternate_ocr(
|
||||||
otsu_result.image_bytes, prefix="Otsu"
|
otsu_result.image_bytes, prefix="Otsu"
|
||||||
@@ -280,52 +278,31 @@ class VinExtractor(BaseExtractor):
|
|||||||
return detected or "application/octet-stream"
|
return detected or "application/octet-stream"
|
||||||
|
|
||||||
def _perform_ocr(
|
def _perform_ocr(
|
||||||
self, image_bytes: bytes, psm: int = 6
|
self,
|
||||||
|
image_bytes: bytes,
|
||||||
|
single_line: bool = False,
|
||||||
|
single_word: bool = False,
|
||||||
) -> tuple[str, list[float]]:
|
) -> tuple[str, list[float]]:
|
||||||
"""
|
"""
|
||||||
Perform OCR with VIN-optimized settings.
|
Perform OCR with VIN-optimized settings via engine abstraction.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image_bytes: Preprocessed image bytes
|
image_bytes: Preprocessed image bytes
|
||||||
psm: Tesseract page segmentation mode
|
single_line: Treat image as a single text line
|
||||||
6 = Uniform block of text
|
single_word: Treat image as a single word
|
||||||
7 = Single text line
|
|
||||||
8 = Single word
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple of (raw_text, word_confidences)
|
Tuple of (raw_text, word_confidences)
|
||||||
"""
|
"""
|
||||||
image = Image.open(io.BytesIO(image_bytes))
|
config = OcrConfig(
|
||||||
|
char_whitelist=self.VIN_WHITELIST,
|
||||||
# Configure Tesseract for VIN extraction
|
single_line=single_line,
|
||||||
# OEM 1 = LSTM neural network engine (best accuracy)
|
single_word=single_word,
|
||||||
# NOTE: tessedit_char_whitelist does NOT work with OEM 1 (LSTM).
|
use_angle_cls=True,
|
||||||
# Using it causes empty/erratic output. Character filtering is
|
|
||||||
# handled post-OCR by vin_validator.correct_ocr_errors() instead.
|
|
||||||
config = (
|
|
||||||
f"--psm {psm} "
|
|
||||||
f"--oem 1 "
|
|
||||||
f"-c load_system_dawg=false "
|
|
||||||
f"-c load_freq_dawg=false"
|
|
||||||
)
|
)
|
||||||
|
result = self._engine.recognize(image_bytes, config)
|
||||||
# Get detailed OCR data
|
word_confidences = [wb.confidence for wb in result.word_boxes]
|
||||||
ocr_data = pytesseract.image_to_data(
|
return result.text, word_confidences
|
||||||
image, config=config, output_type=pytesseract.Output.DICT
|
|
||||||
)
|
|
||||||
|
|
||||||
# Extract words and confidences
|
|
||||||
words = []
|
|
||||||
confidences = []
|
|
||||||
|
|
||||||
for i, text in enumerate(ocr_data["text"]):
|
|
||||||
conf = int(ocr_data["conf"][i])
|
|
||||||
if text.strip() and conf > 0:
|
|
||||||
words.append(text.strip())
|
|
||||||
confidences.append(conf / 100.0)
|
|
||||||
|
|
||||||
raw_text = " ".join(words)
|
|
||||||
return raw_text, confidences
|
|
||||||
|
|
||||||
def _try_alternate_ocr(
|
def _try_alternate_ocr(
|
||||||
self,
|
self,
|
||||||
@@ -335,21 +312,25 @@ class VinExtractor(BaseExtractor):
|
|||||||
"""
|
"""
|
||||||
Try alternate OCR configurations when initial extraction fails.
|
Try alternate OCR configurations when initial extraction fails.
|
||||||
|
|
||||||
PSM modes tried in order:
|
Modes tried:
|
||||||
7 - Single text line
|
single-line - Treat as a single text line
|
||||||
8 - Single word
|
single-word - Treat as a single word
|
||||||
11 - Sparse text (finds text in any order, good for angled photos)
|
|
||||||
13 - Raw line (no Tesseract heuristics, good for clean VIN plates)
|
For PaddleOCR, angle classification handles rotated/angled text
|
||||||
|
inherently, replacing the need for Tesseract PSM mode fallbacks.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of VIN candidates
|
List of VIN candidates
|
||||||
"""
|
"""
|
||||||
tag = f"{prefix} " if prefix else ""
|
tag = f"{prefix} " if prefix else ""
|
||||||
for psm in (7, 8, 11, 13):
|
for mode_name, kwargs in [
|
||||||
raw_text, _ = self._perform_ocr(image_bytes, psm=psm)
|
("single-line", {"single_line": True}),
|
||||||
logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text)
|
("single-word", {"single_word": True}),
|
||||||
|
]:
|
||||||
|
raw_text, _ = self._perform_ocr(image_bytes, **kwargs)
|
||||||
|
logger.debug("%s%s raw text: '%s'", tag, mode_name, raw_text)
|
||||||
candidates = vin_validator.extract_candidates(raw_text)
|
candidates = vin_validator.extract_candidates(raw_text)
|
||||||
logger.debug("%sPSM %d candidates: %s", tag, psm, candidates)
|
logger.debug("%s%s candidates: %s", tag, mode_name, candidates)
|
||||||
if candidates:
|
if candidates:
|
||||||
return candidates
|
return candidates
|
||||||
|
|
||||||
|
|||||||
@@ -1,15 +1,14 @@
|
|||||||
"""Core OCR service using Tesseract with HEIC support."""
|
"""Core OCR service with HEIC support, using pluggable engine abstraction."""
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import magic
|
import magic
|
||||||
import pytesseract
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from pillow_heif import register_heif_opener
|
from pillow_heif import register_heif_opener
|
||||||
|
|
||||||
from app.config import settings
|
from app.engines import OcrConfig, create_engine
|
||||||
from app.models import DocumentType, ExtractedField, OcrResponse
|
from app.models import DocumentType, ExtractedField, OcrResponse
|
||||||
from app.services.preprocessor import preprocessor
|
from app.services.preprocessor import preprocessor
|
||||||
|
|
||||||
@@ -32,8 +31,8 @@ class OcrService:
|
|||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
"""Initialize OCR service."""
|
"""Initialize OCR service with engine from factory."""
|
||||||
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
|
self._engine = create_engine()
|
||||||
|
|
||||||
def extract(
|
def extract(
|
||||||
self,
|
self,
|
||||||
@@ -86,14 +85,11 @@ class OcrService:
|
|||||||
file_bytes, deskew=True, denoise=True
|
file_bytes, deskew=True, denoise=True
|
||||||
)
|
)
|
||||||
|
|
||||||
# Perform OCR
|
# Perform OCR via engine abstraction
|
||||||
image = Image.open(io.BytesIO(file_bytes))
|
config = OcrConfig()
|
||||||
ocr_data = pytesseract.image_to_data(
|
result = self._engine.recognize(file_bytes, config)
|
||||||
image, output_type=pytesseract.Output.DICT
|
raw_text = result.text
|
||||||
)
|
confidence = result.confidence
|
||||||
|
|
||||||
# Extract text and calculate confidence
|
|
||||||
raw_text, confidence = self._process_ocr_data(ocr_data)
|
|
||||||
|
|
||||||
# Detect document type from content
|
# Detect document type from content
|
||||||
document_type = self._detect_document_type(raw_text)
|
document_type = self._detect_document_type(raw_text)
|
||||||
@@ -160,26 +156,6 @@ class OcrService:
|
|||||||
|
|
||||||
return b""
|
return b""
|
||||||
|
|
||||||
def _process_ocr_data(
|
|
||||||
self, ocr_data: dict
|
|
||||||
) -> tuple[str, float]:
|
|
||||||
"""Process Tesseract output to extract text and confidence."""
|
|
||||||
words = []
|
|
||||||
confidences = []
|
|
||||||
|
|
||||||
for i, text in enumerate(ocr_data["text"]):
|
|
||||||
# Filter out empty strings and low-confidence results
|
|
||||||
conf = int(ocr_data["conf"][i])
|
|
||||||
if text.strip() and conf > 0:
|
|
||||||
words.append(text)
|
|
||||||
confidences.append(conf)
|
|
||||||
|
|
||||||
raw_text = " ".join(words)
|
|
||||||
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
|
|
||||||
|
|
||||||
# Normalize confidence to 0-1 range (Tesseract returns 0-100)
|
|
||||||
return raw_text, avg_confidence / 100.0
|
|
||||||
|
|
||||||
def _detect_document_type(self, text: str) -> DocumentType:
|
def _detect_document_type(self, text: str) -> DocumentType:
|
||||||
"""Detect document type from extracted text content."""
|
"""Detect document type from extracted text content."""
|
||||||
text_lower = text.lower()
|
text_lower = text.lower()
|
||||||
|
|||||||
Reference in New Issue
Block a user