feat: Improve OCR process - replace Tesseract with PaddleOCR (#115) #122

Merged
egullickson merged 16 commits from issue-115-improve-ocr-paddleocr into main 2026-02-08 01:13:35 +00:00
3 changed files with 60 additions and 113 deletions
Showing only changes of commit 013fb0c67a - Show all commits

View File

@@ -1,16 +1,13 @@
"""Receipt-specific OCR extractor with field extraction."""
import io
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Optional
import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener
from app.config import settings
from app.engines import OcrConfig, create_engine
from app.extractors.base import BaseExtractor
from app.preprocessors.receipt_preprocessor import receipt_preprocessor
from app.patterns import currency_matcher, date_matcher, fuel_matcher
@@ -53,8 +50,8 @@ class ReceiptExtractor(BaseExtractor):
}
def __init__(self) -> None:
"""Initialize receipt extractor."""
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
"""Initialize receipt extractor with engine from factory."""
self._engine = create_engine()
def extract(
self,
@@ -150,26 +147,19 @@ class ReceiptExtractor(BaseExtractor):
detected = mime.from_buffer(file_bytes)
return detected or "application/octet-stream"
def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str:
def _perform_ocr(self, image_bytes: bytes) -> str:
"""
Perform OCR on preprocessed image.
Perform OCR on preprocessed image via engine abstraction.
Args:
image_bytes: Preprocessed image bytes
psm: Tesseract page segmentation mode
4 = Assume single column of text
6 = Uniform block of text
Returns:
Raw OCR text
"""
image = Image.open(io.BytesIO(image_bytes))
# Configure Tesseract for receipt OCR
# PSM 4 works well for columnar receipt text
config = f"--psm {psm}"
return pytesseract.image_to_string(image, config=config)
config = OcrConfig()
result = self._engine.recognize(image_bytes, config)
return result.text
def _detect_receipt_type(self, text: str) -> str:
"""

View File

@@ -1,5 +1,4 @@
"""VIN-specific OCR extractor with preprocessing and validation."""
import io
import logging
import os
import time
@@ -8,11 +7,10 @@ from datetime import datetime
from typing import Optional
import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener
from app.config import settings
from app.engines import OcrConfig, create_engine
from app.extractors.base import BaseExtractor
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
from app.validators.vin_validator import vin_validator
@@ -56,15 +54,15 @@ class VinExtractor(BaseExtractor):
"image/heif",
}
# VIN character whitelist for Tesseract
# VIN character whitelist (passed to engine for post-OCR filtering)
VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
# Fixed debug output directory (inside container)
DEBUG_DIR = "/tmp/vin-debug"
def __init__(self) -> None:
"""Initialize VIN extractor."""
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
"""Initialize VIN extractor with engine from factory."""
self._engine = create_engine()
self._debug = settings.log_level.upper() == "DEBUG"
def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
@@ -135,21 +133,21 @@ class VinExtractor(BaseExtractor):
# Perform OCR with VIN-optimized settings
raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
logger.debug("PSM 6 raw text: '%s'", raw_text)
logger.debug("PSM 6 word confidences: %s", word_confidences)
logger.debug("Primary OCR raw text: '%s'", raw_text)
logger.debug("Primary OCR word confidences: %s", word_confidences)
# Extract VIN candidates from raw text
candidates = vin_validator.extract_candidates(raw_text)
logger.debug("PSM 6 candidates: %s", candidates)
logger.debug("Primary OCR candidates: %s", candidates)
if not candidates:
# No VIN candidates found - try with different PSM modes
# No VIN candidates found - try alternate OCR configurations
candidates = self._try_alternate_ocr(preprocessed_bytes)
if not candidates:
# Try grayscale-only (no thresholding) — the Tesseract
# LSTM engine often performs better on non-binarized input
# because it does its own internal preprocessing.
# Try grayscale-only (no thresholding) — OCR engines often
# perform better on non-binarized input because they do
# their own internal preprocessing.
gray_result = vin_preprocessor.preprocess(
image_bytes, apply_threshold=False
)
@@ -166,9 +164,9 @@ class VinExtractor(BaseExtractor):
raw_text, word_confidences = self._perform_ocr(
gray_result.image_bytes
)
logger.debug("Gray PSM 6 raw text: '%s'", raw_text)
logger.debug("Gray primary raw text: '%s'", raw_text)
candidates = vin_validator.extract_candidates(raw_text)
logger.debug("Gray PSM 6 candidates: %s", candidates)
logger.debug("Gray primary candidates: %s", candidates)
if not candidates:
candidates = self._try_alternate_ocr(
gray_result.image_bytes, prefix="Gray"
@@ -188,9 +186,9 @@ class VinExtractor(BaseExtractor):
)
raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
logger.debug("Otsu PSM 6 raw text: '%s'", raw_text)
logger.debug("Otsu primary raw text: '%s'", raw_text)
candidates = vin_validator.extract_candidates(raw_text)
logger.debug("Otsu PSM 6 candidates: %s", candidates)
logger.debug("Otsu primary candidates: %s", candidates)
if not candidates:
candidates = self._try_alternate_ocr(
otsu_result.image_bytes, prefix="Otsu"
@@ -280,52 +278,31 @@ class VinExtractor(BaseExtractor):
return detected or "application/octet-stream"
def _perform_ocr(
self, image_bytes: bytes, psm: int = 6
self,
image_bytes: bytes,
single_line: bool = False,
single_word: bool = False,
) -> tuple[str, list[float]]:
"""
Perform OCR with VIN-optimized settings.
Perform OCR with VIN-optimized settings via engine abstraction.
Args:
image_bytes: Preprocessed image bytes
psm: Tesseract page segmentation mode
6 = Uniform block of text
7 = Single text line
8 = Single word
single_line: Treat image as a single text line
single_word: Treat image as a single word
Returns:
Tuple of (raw_text, word_confidences)
"""
image = Image.open(io.BytesIO(image_bytes))
# Configure Tesseract for VIN extraction
# OEM 1 = LSTM neural network engine (best accuracy)
# NOTE: tessedit_char_whitelist does NOT work with OEM 1 (LSTM).
# Using it causes empty/erratic output. Character filtering is
# handled post-OCR by vin_validator.correct_ocr_errors() instead.
config = (
f"--psm {psm} "
f"--oem 1 "
f"-c load_system_dawg=false "
f"-c load_freq_dawg=false"
config = OcrConfig(
char_whitelist=self.VIN_WHITELIST,
single_line=single_line,
single_word=single_word,
use_angle_cls=True,
)
# Get detailed OCR data
ocr_data = pytesseract.image_to_data(
image, config=config, output_type=pytesseract.Output.DICT
)
# Extract words and confidences
words = []
confidences = []
for i, text in enumerate(ocr_data["text"]):
conf = int(ocr_data["conf"][i])
if text.strip() and conf > 0:
words.append(text.strip())
confidences.append(conf / 100.0)
raw_text = " ".join(words)
return raw_text, confidences
result = self._engine.recognize(image_bytes, config)
word_confidences = [wb.confidence for wb in result.word_boxes]
return result.text, word_confidences
def _try_alternate_ocr(
self,
@@ -335,21 +312,25 @@ class VinExtractor(BaseExtractor):
"""
Try alternate OCR configurations when initial extraction fails.
PSM modes tried in order:
7 - Single text line
8 - Single word
11 - Sparse text (finds text in any order, good for angled photos)
13 - Raw line (no Tesseract heuristics, good for clean VIN plates)
Modes tried:
single-line - Treat as a single text line
single-word - Treat as a single word
For PaddleOCR, angle classification handles rotated/angled text
inherently, replacing the need for Tesseract PSM mode fallbacks.
Returns:
List of VIN candidates
"""
tag = f"{prefix} " if prefix else ""
for psm in (7, 8, 11, 13):
raw_text, _ = self._perform_ocr(image_bytes, psm=psm)
logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text)
for mode_name, kwargs in [
("single-line", {"single_line": True}),
("single-word", {"single_word": True}),
]:
raw_text, _ = self._perform_ocr(image_bytes, **kwargs)
logger.debug("%s%s raw text: '%s'", tag, mode_name, raw_text)
candidates = vin_validator.extract_candidates(raw_text)
logger.debug("%sPSM %d candidates: %s", tag, psm, candidates)
logger.debug("%s%s candidates: %s", tag, mode_name, candidates)
if candidates:
return candidates

View File

@@ -1,15 +1,14 @@
"""Core OCR service using Tesseract with HEIC support."""
"""Core OCR service with HEIC support, using pluggable engine abstraction."""
import io
import logging
import time
from typing import Optional
import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener
from app.config import settings
from app.engines import OcrConfig, create_engine
from app.models import DocumentType, ExtractedField, OcrResponse
from app.services.preprocessor import preprocessor
@@ -32,8 +31,8 @@ class OcrService:
}
def __init__(self) -> None:
"""Initialize OCR service."""
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
"""Initialize OCR service with engine from factory."""
self._engine = create_engine()
def extract(
self,
@@ -86,14 +85,11 @@ class OcrService:
file_bytes, deskew=True, denoise=True
)
# Perform OCR
image = Image.open(io.BytesIO(file_bytes))
ocr_data = pytesseract.image_to_data(
image, output_type=pytesseract.Output.DICT
)
# Extract text and calculate confidence
raw_text, confidence = self._process_ocr_data(ocr_data)
# Perform OCR via engine abstraction
config = OcrConfig()
result = self._engine.recognize(file_bytes, config)
raw_text = result.text
confidence = result.confidence
# Detect document type from content
document_type = self._detect_document_type(raw_text)
@@ -160,26 +156,6 @@ class OcrService:
return b""
def _process_ocr_data(
self, ocr_data: dict
) -> tuple[str, float]:
"""Process Tesseract output to extract text and confidence."""
words = []
confidences = []
for i, text in enumerate(ocr_data["text"]):
# Filter out empty strings and low-confidence results
conf = int(ocr_data["conf"][i])
if text.strip() and conf > 0:
words.append(text)
confidences.append(conf)
raw_text = " ".join(words)
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
# Normalize confidence to 0-1 range (Tesseract returns 0-100)
return raw_text, avg_confidence / 100.0
def _detect_document_type(self, text: str) -> DocumentType:
"""Detect document type from extracted text content."""
text_lower = text.lower()