feat: Improve OCR process - replace Tesseract with PaddleOCR (#115) #122
@@ -1,16 +1,13 @@
|
||||
"""Receipt-specific OCR extractor with field extraction."""
|
||||
import io
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Optional
|
||||
|
||||
import magic
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
from pillow_heif import register_heif_opener
|
||||
|
||||
from app.config import settings
|
||||
from app.engines import OcrConfig, create_engine
|
||||
from app.extractors.base import BaseExtractor
|
||||
from app.preprocessors.receipt_preprocessor import receipt_preprocessor
|
||||
from app.patterns import currency_matcher, date_matcher, fuel_matcher
|
||||
@@ -53,8 +50,8 @@ class ReceiptExtractor(BaseExtractor):
|
||||
}
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize receipt extractor."""
|
||||
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
|
||||
"""Initialize receipt extractor with engine from factory."""
|
||||
self._engine = create_engine()
|
||||
|
||||
def extract(
|
||||
self,
|
||||
@@ -150,26 +147,19 @@ class ReceiptExtractor(BaseExtractor):
|
||||
detected = mime.from_buffer(file_bytes)
|
||||
return detected or "application/octet-stream"
|
||||
|
||||
def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str:
|
||||
def _perform_ocr(self, image_bytes: bytes) -> str:
|
||||
"""
|
||||
Perform OCR on preprocessed image.
|
||||
Perform OCR on preprocessed image via engine abstraction.
|
||||
|
||||
Args:
|
||||
image_bytes: Preprocessed image bytes
|
||||
psm: Tesseract page segmentation mode
|
||||
4 = Assume single column of text
|
||||
6 = Uniform block of text
|
||||
|
||||
Returns:
|
||||
Raw OCR text
|
||||
"""
|
||||
image = Image.open(io.BytesIO(image_bytes))
|
||||
|
||||
# Configure Tesseract for receipt OCR
|
||||
# PSM 4 works well for columnar receipt text
|
||||
config = f"--psm {psm}"
|
||||
|
||||
return pytesseract.image_to_string(image, config=config)
|
||||
config = OcrConfig()
|
||||
result = self._engine.recognize(image_bytes, config)
|
||||
return result.text
|
||||
|
||||
def _detect_receipt_type(self, text: str) -> str:
|
||||
"""
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
"""VIN-specific OCR extractor with preprocessing and validation."""
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
@@ -8,11 +7,10 @@ from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import magic
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
from pillow_heif import register_heif_opener
|
||||
|
||||
from app.config import settings
|
||||
from app.engines import OcrConfig, create_engine
|
||||
from app.extractors.base import BaseExtractor
|
||||
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
|
||||
from app.validators.vin_validator import vin_validator
|
||||
@@ -56,15 +54,15 @@ class VinExtractor(BaseExtractor):
|
||||
"image/heif",
|
||||
}
|
||||
|
||||
# VIN character whitelist for Tesseract
|
||||
# VIN character whitelist (passed to engine for post-OCR filtering)
|
||||
VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
|
||||
|
||||
# Fixed debug output directory (inside container)
|
||||
DEBUG_DIR = "/tmp/vin-debug"
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize VIN extractor."""
|
||||
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
|
||||
"""Initialize VIN extractor with engine from factory."""
|
||||
self._engine = create_engine()
|
||||
self._debug = settings.log_level.upper() == "DEBUG"
|
||||
|
||||
def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
|
||||
@@ -135,21 +133,21 @@ class VinExtractor(BaseExtractor):
|
||||
|
||||
# Perform OCR with VIN-optimized settings
|
||||
raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
|
||||
logger.debug("PSM 6 raw text: '%s'", raw_text)
|
||||
logger.debug("PSM 6 word confidences: %s", word_confidences)
|
||||
logger.debug("Primary OCR raw text: '%s'", raw_text)
|
||||
logger.debug("Primary OCR word confidences: %s", word_confidences)
|
||||
|
||||
# Extract VIN candidates from raw text
|
||||
candidates = vin_validator.extract_candidates(raw_text)
|
||||
logger.debug("PSM 6 candidates: %s", candidates)
|
||||
logger.debug("Primary OCR candidates: %s", candidates)
|
||||
|
||||
if not candidates:
|
||||
# No VIN candidates found - try with different PSM modes
|
||||
# No VIN candidates found - try alternate OCR configurations
|
||||
candidates = self._try_alternate_ocr(preprocessed_bytes)
|
||||
|
||||
if not candidates:
|
||||
# Try grayscale-only (no thresholding) — the Tesseract
|
||||
# LSTM engine often performs better on non-binarized input
|
||||
# because it does its own internal preprocessing.
|
||||
# Try grayscale-only (no thresholding) — OCR engines often
|
||||
# perform better on non-binarized input because they do
|
||||
# their own internal preprocessing.
|
||||
gray_result = vin_preprocessor.preprocess(
|
||||
image_bytes, apply_threshold=False
|
||||
)
|
||||
@@ -166,9 +164,9 @@ class VinExtractor(BaseExtractor):
|
||||
raw_text, word_confidences = self._perform_ocr(
|
||||
gray_result.image_bytes
|
||||
)
|
||||
logger.debug("Gray PSM 6 raw text: '%s'", raw_text)
|
||||
logger.debug("Gray primary raw text: '%s'", raw_text)
|
||||
candidates = vin_validator.extract_candidates(raw_text)
|
||||
logger.debug("Gray PSM 6 candidates: %s", candidates)
|
||||
logger.debug("Gray primary candidates: %s", candidates)
|
||||
if not candidates:
|
||||
candidates = self._try_alternate_ocr(
|
||||
gray_result.image_bytes, prefix="Gray"
|
||||
@@ -188,9 +186,9 @@ class VinExtractor(BaseExtractor):
|
||||
)
|
||||
|
||||
raw_text, word_confidences = self._perform_ocr(otsu_result.image_bytes)
|
||||
logger.debug("Otsu PSM 6 raw text: '%s'", raw_text)
|
||||
logger.debug("Otsu primary raw text: '%s'", raw_text)
|
||||
candidates = vin_validator.extract_candidates(raw_text)
|
||||
logger.debug("Otsu PSM 6 candidates: %s", candidates)
|
||||
logger.debug("Otsu primary candidates: %s", candidates)
|
||||
if not candidates:
|
||||
candidates = self._try_alternate_ocr(
|
||||
otsu_result.image_bytes, prefix="Otsu"
|
||||
@@ -280,52 +278,31 @@ class VinExtractor(BaseExtractor):
|
||||
return detected or "application/octet-stream"
|
||||
|
||||
def _perform_ocr(
|
||||
self, image_bytes: bytes, psm: int = 6
|
||||
self,
|
||||
image_bytes: bytes,
|
||||
single_line: bool = False,
|
||||
single_word: bool = False,
|
||||
) -> tuple[str, list[float]]:
|
||||
"""
|
||||
Perform OCR with VIN-optimized settings.
|
||||
Perform OCR with VIN-optimized settings via engine abstraction.
|
||||
|
||||
Args:
|
||||
image_bytes: Preprocessed image bytes
|
||||
psm: Tesseract page segmentation mode
|
||||
6 = Uniform block of text
|
||||
7 = Single text line
|
||||
8 = Single word
|
||||
single_line: Treat image as a single text line
|
||||
single_word: Treat image as a single word
|
||||
|
||||
Returns:
|
||||
Tuple of (raw_text, word_confidences)
|
||||
"""
|
||||
image = Image.open(io.BytesIO(image_bytes))
|
||||
|
||||
# Configure Tesseract for VIN extraction
|
||||
# OEM 1 = LSTM neural network engine (best accuracy)
|
||||
# NOTE: tessedit_char_whitelist does NOT work with OEM 1 (LSTM).
|
||||
# Using it causes empty/erratic output. Character filtering is
|
||||
# handled post-OCR by vin_validator.correct_ocr_errors() instead.
|
||||
config = (
|
||||
f"--psm {psm} "
|
||||
f"--oem 1 "
|
||||
f"-c load_system_dawg=false "
|
||||
f"-c load_freq_dawg=false"
|
||||
config = OcrConfig(
|
||||
char_whitelist=self.VIN_WHITELIST,
|
||||
single_line=single_line,
|
||||
single_word=single_word,
|
||||
use_angle_cls=True,
|
||||
)
|
||||
|
||||
# Get detailed OCR data
|
||||
ocr_data = pytesseract.image_to_data(
|
||||
image, config=config, output_type=pytesseract.Output.DICT
|
||||
)
|
||||
|
||||
# Extract words and confidences
|
||||
words = []
|
||||
confidences = []
|
||||
|
||||
for i, text in enumerate(ocr_data["text"]):
|
||||
conf = int(ocr_data["conf"][i])
|
||||
if text.strip() and conf > 0:
|
||||
words.append(text.strip())
|
||||
confidences.append(conf / 100.0)
|
||||
|
||||
raw_text = " ".join(words)
|
||||
return raw_text, confidences
|
||||
result = self._engine.recognize(image_bytes, config)
|
||||
word_confidences = [wb.confidence for wb in result.word_boxes]
|
||||
return result.text, word_confidences
|
||||
|
||||
def _try_alternate_ocr(
|
||||
self,
|
||||
@@ -335,21 +312,25 @@ class VinExtractor(BaseExtractor):
|
||||
"""
|
||||
Try alternate OCR configurations when initial extraction fails.
|
||||
|
||||
PSM modes tried in order:
|
||||
7 - Single text line
|
||||
8 - Single word
|
||||
11 - Sparse text (finds text in any order, good for angled photos)
|
||||
13 - Raw line (no Tesseract heuristics, good for clean VIN plates)
|
||||
Modes tried:
|
||||
single-line - Treat as a single text line
|
||||
single-word - Treat as a single word
|
||||
|
||||
For PaddleOCR, angle classification handles rotated/angled text
|
||||
inherently, replacing the need for Tesseract PSM mode fallbacks.
|
||||
|
||||
Returns:
|
||||
List of VIN candidates
|
||||
"""
|
||||
tag = f"{prefix} " if prefix else ""
|
||||
for psm in (7, 8, 11, 13):
|
||||
raw_text, _ = self._perform_ocr(image_bytes, psm=psm)
|
||||
logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text)
|
||||
for mode_name, kwargs in [
|
||||
("single-line", {"single_line": True}),
|
||||
("single-word", {"single_word": True}),
|
||||
]:
|
||||
raw_text, _ = self._perform_ocr(image_bytes, **kwargs)
|
||||
logger.debug("%s%s raw text: '%s'", tag, mode_name, raw_text)
|
||||
candidates = vin_validator.extract_candidates(raw_text)
|
||||
logger.debug("%sPSM %d candidates: %s", tag, psm, candidates)
|
||||
logger.debug("%s%s candidates: %s", tag, mode_name, candidates)
|
||||
if candidates:
|
||||
return candidates
|
||||
|
||||
|
||||
@@ -1,15 +1,14 @@
|
||||
"""Core OCR service using Tesseract with HEIC support."""
|
||||
"""Core OCR service with HEIC support, using pluggable engine abstraction."""
|
||||
import io
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import magic
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
from pillow_heif import register_heif_opener
|
||||
|
||||
from app.config import settings
|
||||
from app.engines import OcrConfig, create_engine
|
||||
from app.models import DocumentType, ExtractedField, OcrResponse
|
||||
from app.services.preprocessor import preprocessor
|
||||
|
||||
@@ -32,8 +31,8 @@ class OcrService:
|
||||
}
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize OCR service."""
|
||||
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
|
||||
"""Initialize OCR service with engine from factory."""
|
||||
self._engine = create_engine()
|
||||
|
||||
def extract(
|
||||
self,
|
||||
@@ -86,14 +85,11 @@ class OcrService:
|
||||
file_bytes, deskew=True, denoise=True
|
||||
)
|
||||
|
||||
# Perform OCR
|
||||
image = Image.open(io.BytesIO(file_bytes))
|
||||
ocr_data = pytesseract.image_to_data(
|
||||
image, output_type=pytesseract.Output.DICT
|
||||
)
|
||||
|
||||
# Extract text and calculate confidence
|
||||
raw_text, confidence = self._process_ocr_data(ocr_data)
|
||||
# Perform OCR via engine abstraction
|
||||
config = OcrConfig()
|
||||
result = self._engine.recognize(file_bytes, config)
|
||||
raw_text = result.text
|
||||
confidence = result.confidence
|
||||
|
||||
# Detect document type from content
|
||||
document_type = self._detect_document_type(raw_text)
|
||||
@@ -160,26 +156,6 @@ class OcrService:
|
||||
|
||||
return b""
|
||||
|
||||
def _process_ocr_data(
|
||||
self, ocr_data: dict
|
||||
) -> tuple[str, float]:
|
||||
"""Process Tesseract output to extract text and confidence."""
|
||||
words = []
|
||||
confidences = []
|
||||
|
||||
for i, text in enumerate(ocr_data["text"]):
|
||||
# Filter out empty strings and low-confidence results
|
||||
conf = int(ocr_data["conf"][i])
|
||||
if text.strip() and conf > 0:
|
||||
words.append(text)
|
||||
confidences.append(conf)
|
||||
|
||||
raw_text = " ".join(words)
|
||||
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
|
||||
|
||||
# Normalize confidence to 0-1 range (Tesseract returns 0-100)
|
||||
return raw_text, avg_confidence / 100.0
|
||||
|
||||
def _detect_document_type(self, text: str) -> DocumentType:
|
||||
"""Detect document type from extracted text content."""
|
||||
text_lower = text.lower()
|
||||
|
||||
Reference in New Issue
Block a user