From 013fb0c67a2a2ed4ab25503e1bf8f50d784ecde2 Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Sat, 7 Feb 2026 10:56:27 -0600 Subject: [PATCH] feat: migrate VIN/receipt extractors and OCR service to engine abstraction (refs #117) Replace direct pytesseract calls with OcrEngine interface in vin_extractor.py, receipt_extractor.py, and ocr_service.py. PSM mode fallbacks replaced with engine-agnostic single-line/single-word configs. Dead _process_ocr_data removed. Co-Authored-By: Claude Opus 4.6 --- ocr/app/extractors/receipt_extractor.py | 26 ++---- ocr/app/extractors/vin_extractor.py | 105 ++++++++++-------------- ocr/app/services/ocr_service.py | 42 ++-------- 3 files changed, 60 insertions(+), 113 deletions(-) diff --git a/ocr/app/extractors/receipt_extractor.py b/ocr/app/extractors/receipt_extractor.py index 6134988..111cfb1 100644 --- a/ocr/app/extractors/receipt_extractor.py +++ b/ocr/app/extractors/receipt_extractor.py @@ -1,16 +1,13 @@ """Receipt-specific OCR extractor with field extraction.""" -import io import logging import time from dataclasses import dataclass, field from typing import Any, Optional import magic -import pytesseract -from PIL import Image from pillow_heif import register_heif_opener -from app.config import settings +from app.engines import OcrConfig, create_engine from app.extractors.base import BaseExtractor from app.preprocessors.receipt_preprocessor import receipt_preprocessor from app.patterns import currency_matcher, date_matcher, fuel_matcher @@ -53,8 +50,8 @@ class ReceiptExtractor(BaseExtractor): } def __init__(self) -> None: - """Initialize receipt extractor.""" - pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd + """Initialize receipt extractor with engine from factory.""" + self._engine = create_engine() def extract( self, @@ -150,26 +147,19 @@ class ReceiptExtractor(BaseExtractor): detected = mime.from_buffer(file_bytes) return detected or "application/octet-stream" - def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str: + def _perform_ocr(self, image_bytes: bytes) -> str: """ - Perform OCR on preprocessed image. + Perform OCR on preprocessed image via engine abstraction. Args: image_bytes: Preprocessed image bytes - psm: Tesseract page segmentation mode - 4 = Assume single column of text - 6 = Uniform block of text Returns: Raw OCR text """ - image = Image.open(io.BytesIO(image_bytes)) - - # Configure Tesseract for receipt OCR - # PSM 4 works well for columnar receipt text - config = f"--psm {psm}" - - return pytesseract.image_to_string(image, config=config) + config = OcrConfig() + result = self._engine.recognize(image_bytes, config) + return result.text def _detect_receipt_type(self, text: str) -> str: """ diff --git a/ocr/app/extractors/vin_extractor.py b/ocr/app/extractors/vin_extractor.py index 1edca3f..cce88e9 100644 --- a/ocr/app/extractors/vin_extractor.py +++ b/ocr/app/extractors/vin_extractor.py @@ -1,5 +1,4 @@ """VIN-specific OCR extractor with preprocessing and validation.""" -import io import logging import os import time @@ -8,11 +7,10 @@ from datetime import datetime from typing import Optional import magic -import pytesseract -from PIL import Image from pillow_heif import register_heif_opener from app.config import settings +from app.engines import OcrConfig, create_engine from app.extractors.base import BaseExtractor from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox from app.validators.vin_validator import vin_validator @@ -56,15 +54,15 @@ class VinExtractor(BaseExtractor): "image/heif", } - # VIN character whitelist for Tesseract + # VIN character whitelist (passed to engine for post-OCR filtering) VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789" # Fixed debug output directory (inside container) DEBUG_DIR = "/tmp/vin-debug" def __init__(self) -> None: - """Initialize VIN extractor.""" - pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd + """Initialize VIN extractor with engine from factory.""" + self._engine = create_engine() self._debug = settings.log_level.upper() == "DEBUG" def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None: @@ -135,21 +133,21 @@ class VinExtractor(BaseExtractor): # Perform OCR with VIN-optimized settings raw_text, word_confidences = self._perform_ocr(preprocessed_bytes) - logger.debug("PSM 6 raw text: '%s'", raw_text) - logger.debug("PSM 6 word confidences: %s", word_confidences) + logger.debug("Primary OCR raw text: '%s'", raw_text) + logger.debug("Primary OCR word confidences: %s", word_confidences) # Extract VIN candidates from raw text candidates = vin_validator.extract_candidates(raw_text) - logger.debug("PSM 6 candidates: %s", candidates) + logger.debug("Primary OCR candidates: %s", candidates) if not candidates: - # No VIN candidates found - try with different PSM modes + # No VIN candidates found - try alternate OCR configurations candidates = self._try_alternate_ocr(preprocessed_bytes) if not candidates: - # Try grayscale-only (no thresholding) — the Tesseract - # LSTM engine often performs better on non-binarized input - # because it does its own internal preprocessing. + # Try grayscale-only (no thresholding) — OCR engines often + # perform better on non-binarized input because they do + # their own internal preprocessing. gray_result = vin_preprocessor.preprocess( image_bytes, apply_threshold=False ) @@ -166,9 +164,9 @@ class VinExtractor(BaseExtractor): raw_text, word_confidences = self._perform_ocr( gray_result.image_bytes ) - logger.debug("Gray PSM 6 raw text: '%s'", raw_text) + logger.debug("Gray primary raw text: '%s'", raw_text) candidates = vin_validator.extract_candidates(raw_text) - logger.debug("Gray PSM 6 candidates: %s", candidates) + logger.debug("Gray primary candidates: %s", candidates) if not candidates: candidates = self._try_alternate_ocr( gray_result.image_bytes, prefix="Gray" @@ -188,9 +186,9 @@ class VinExtractor(BaseExtractor): ) raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes) - logger.debug("Otsu PSM 6 raw text: '%s'", raw_text) + logger.debug("Otsu primary raw text: '%s'", raw_text) candidates = vin_validator.extract_candidates(raw_text) - logger.debug("Otsu PSM 6 candidates: %s", candidates) + logger.debug("Otsu primary candidates: %s", candidates) if not candidates: candidates = self._try_alternate_ocr( otsu_result.image_bytes, prefix="Otsu" @@ -280,52 +278,31 @@ class VinExtractor(BaseExtractor): return detected or "application/octet-stream" def _perform_ocr( - self, image_bytes: bytes, psm: int = 6 + self, + image_bytes: bytes, + single_line: bool = False, + single_word: bool = False, ) -> tuple[str, list[float]]: """ - Perform OCR with VIN-optimized settings. + Perform OCR with VIN-optimized settings via engine abstraction. Args: image_bytes: Preprocessed image bytes - psm: Tesseract page segmentation mode - 6 = Uniform block of text - 7 = Single text line - 8 = Single word + single_line: Treat image as a single text line + single_word: Treat image as a single word Returns: Tuple of (raw_text, word_confidences) """ - image = Image.open(io.BytesIO(image_bytes)) - - # Configure Tesseract for VIN extraction - # OEM 1 = LSTM neural network engine (best accuracy) - # NOTE: tessedit_char_whitelist does NOT work with OEM 1 (LSTM). - # Using it causes empty/erratic output. Character filtering is - # handled post-OCR by vin_validator.correct_ocr_errors() instead. - config = ( - f"--psm {psm} " - f"--oem 1 " - f"-c load_system_dawg=false " - f"-c load_freq_dawg=false" + config = OcrConfig( + char_whitelist=self.VIN_WHITELIST, + single_line=single_line, + single_word=single_word, + use_angle_cls=True, ) - - # Get detailed OCR data - ocr_data = pytesseract.image_to_data( - image, config=config, output_type=pytesseract.Output.DICT - ) - - # Extract words and confidences - words = [] - confidences = [] - - for i, text in enumerate(ocr_data["text"]): - conf = int(ocr_data["conf"][i]) - if text.strip() and conf > 0: - words.append(text.strip()) - confidences.append(conf / 100.0) - - raw_text = " ".join(words) - return raw_text, confidences + result = self._engine.recognize(image_bytes, config) + word_confidences = [wb.confidence for wb in result.word_boxes] + return result.text, word_confidences def _try_alternate_ocr( self, @@ -335,21 +312,25 @@ class VinExtractor(BaseExtractor): """ Try alternate OCR configurations when initial extraction fails. - PSM modes tried in order: - 7 - Single text line - 8 - Single word - 11 - Sparse text (finds text in any order, good for angled photos) - 13 - Raw line (no Tesseract heuristics, good for clean VIN plates) + Modes tried: + single-line - Treat as a single text line + single-word - Treat as a single word + + For PaddleOCR, angle classification handles rotated/angled text + inherently, replacing the need for Tesseract PSM mode fallbacks. Returns: List of VIN candidates """ tag = f"{prefix} " if prefix else "" - for psm in (7, 8, 11, 13): - raw_text, _ = self._perform_ocr(image_bytes, psm=psm) - logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text) + for mode_name, kwargs in [ + ("single-line", {"single_line": True}), + ("single-word", {"single_word": True}), + ]: + raw_text, _ = self._perform_ocr(image_bytes, **kwargs) + logger.debug("%s%s raw text: '%s'", tag, mode_name, raw_text) candidates = vin_validator.extract_candidates(raw_text) - logger.debug("%sPSM %d candidates: %s", tag, psm, candidates) + logger.debug("%s%s candidates: %s", tag, mode_name, candidates) if candidates: return candidates diff --git a/ocr/app/services/ocr_service.py b/ocr/app/services/ocr_service.py index 4c317b3..4d06452 100644 --- a/ocr/app/services/ocr_service.py +++ b/ocr/app/services/ocr_service.py @@ -1,15 +1,14 @@ -"""Core OCR service using Tesseract with HEIC support.""" +"""Core OCR service with HEIC support, using pluggable engine abstraction.""" import io import logging import time from typing import Optional import magic -import pytesseract from PIL import Image from pillow_heif import register_heif_opener -from app.config import settings +from app.engines import OcrConfig, create_engine from app.models import DocumentType, ExtractedField, OcrResponse from app.services.preprocessor import preprocessor @@ -32,8 +31,8 @@ class OcrService: } def __init__(self) -> None: - """Initialize OCR service.""" - pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd + """Initialize OCR service with engine from factory.""" + self._engine = create_engine() def extract( self, @@ -86,14 +85,11 @@ class OcrService: file_bytes, deskew=True, denoise=True ) - # Perform OCR - image = Image.open(io.BytesIO(file_bytes)) - ocr_data = pytesseract.image_to_data( - image, output_type=pytesseract.Output.DICT - ) - - # Extract text and calculate confidence - raw_text, confidence = self._process_ocr_data(ocr_data) + # Perform OCR via engine abstraction + config = OcrConfig() + result = self._engine.recognize(file_bytes, config) + raw_text = result.text + confidence = result.confidence # Detect document type from content document_type = self._detect_document_type(raw_text) @@ -160,26 +156,6 @@ class OcrService: return b"" - def _process_ocr_data( - self, ocr_data: dict - ) -> tuple[str, float]: - """Process Tesseract output to extract text and confidence.""" - words = [] - confidences = [] - - for i, text in enumerate(ocr_data["text"]): - # Filter out empty strings and low-confidence results - conf = int(ocr_data["conf"][i]) - if text.strip() and conf > 0: - words.append(text) - confidences.append(conf) - - raw_text = " ".join(words) - avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0 - - # Normalize confidence to 0-1 range (Tesseract returns 0-100) - return raw_text, avg_confidence / 100.0 - def _detect_document_type(self, text: str) -> DocumentType: """Detect document type from extracted text content.""" text_lower = text.lower()