Files
motovaultpro/ocr/app/extractors/receipt_extractor.py
Eric Gullickson 5e4515da7c
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 37s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 52s
Deploy to Staging / Verify Staging (pull_request) Successful in 9s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
fix: use PyMuPDF instead of pdf2image for PDF-to-image conversion (refs #182)
pdf2image requires poppler-utils which is not installed in the OCR
container. PyMuPDF is already in requirements.txt and can render PDF
pages to PNG at 300 DPI natively without extra system dependencies.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 21:34:17 -06:00

368 lines
12 KiB
Python

"""Receipt-specific OCR extractor with field extraction."""
import io
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Optional
import magic
from pillow_heif import register_heif_opener
from app.engines import OcrConfig, create_engine
from app.extractors.base import BaseExtractor
from app.preprocessors.receipt_preprocessor import receipt_preprocessor
from app.patterns import currency_matcher, date_matcher, fuel_matcher
# Register the HEIF/HEIC opener imported from pillow_heif (runs once, at import time)
register_heif_opener()
# Module-level logger, named after this module
logger = logging.getLogger(__name__)
@dataclass
class ExtractedField:
    """One value pulled out of receipt text, paired with its confidence."""

    # The extracted value; its concrete type depends on which field it holds.
    value: Any
    # Confidence score attached to ``value`` by whichever extractor produced it.
    confidence: float
@dataclass
class ReceiptExtractionResult:
    """Outcome of a single receipt extraction attempt."""

    # True when text was recognized and field extraction ran to completion.
    success: bool
    # Receipt category the fields were extracted for; "unknown" by default.
    receipt_type: str = "unknown"
    # Extracted fields keyed by field name (e.g. "totalAmount").
    extracted_fields: dict[str, ExtractedField] = field(default_factory=dict)
    # Unmodified OCR text the fields were derived from.
    raw_text: str = ""
    # Elapsed processing time in milliseconds.
    processing_time_ms: int = 0
    # Human-readable failure reason; None unless set by the producer.
    error: Optional[str] = None
class ReceiptExtractor(BaseExtractor):
    """Receipt-specific OCR extractor for fuel and general receipts.

    ``extract`` is the entry point. It resolves the MIME type, renders a PDF
    to a PNG of its first page when needed, runs receipt preprocessing, OCRs
    the result through the configured engine, and finally pulls structured
    fields out of the recognized text.
    """

    # MIME types accepted by ``extract``
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
        "application/pdf",
    }

    # Substrings (matched against upper-cased OCR text) that hint at a fuel
    # receipt. NOTE(review): matching is by substring, so overlapping entries
    # count more than once (e.g. "GALLON" also contains "GAL") - confirm this
    # weighting is intended before tightening it.
    _FUEL_KEYWORDS = (
        "GALLON", "GAL", "FUEL", "GAS", "DIESEL", "UNLEADED",
        "REGULAR", "PREMIUM", "OCTANE", "PPG", "PUMP",
    )

    def __init__(self) -> None:
        """Initialize receipt extractor with engine from factory."""
        self._engine = create_engine()

    def extract(
        self,
        image_bytes: bytes,
        content_type: Optional[str] = None,
        receipt_type: Optional[str] = None,
    ) -> ReceiptExtractionResult:
        """
        Extract data from a receipt image.

        Args:
            image_bytes: Raw image or PDF bytes (HEIC, JPEG, PNG, PDF)
            content_type: MIME type (auto-detected if not provided). Case and
                any ``;``-separated parameters are ignored when validating.
            receipt_type: Hint for receipt type ("fuel" for specialized
                extraction); when omitted the type is detected from the text

        Returns:
            ReceiptExtractionResult with extracted fields. Failures during
            processing are reported via ``success=False`` and ``error``
            rather than raised.
        """
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # or jump if the system clock is adjusted mid-request.
        started = time.perf_counter()

        def elapsed_ms() -> int:
            return int((time.perf_counter() - started) * 1000)

        def failure(message: str) -> ReceiptExtractionResult:
            return ReceiptExtractionResult(
                success=False,
                error=message,
                processing_time_ms=elapsed_ms(),
            )

        # Detect content type if not provided
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)

        # MIME types are case-insensitive and may carry parameters
        # ("image/jpeg; charset=binary"); compare only the bare media type.
        media_type = content_type.split(";", 1)[0].strip().lower()
        if media_type not in self.SUPPORTED_TYPES:
            return failure(f"Unsupported file type: {content_type}")

        try:
            # Convert PDF to image (first page)
            if media_type == "application/pdf":
                image_bytes = self._extract_pdf_first_page(image_bytes)
                if not image_bytes:
                    return failure("Failed to extract image from PDF")

            # Apply receipt-optimized preprocessing, then OCR
            preprocessed = receipt_preprocessor.preprocess(image_bytes)
            raw_text = self._perform_ocr(preprocessed.image_bytes)

            if not raw_text.strip():
                # Retry with less aggressive preprocessing (no thresholding)
                preprocessed = receipt_preprocessor.preprocess(
                    image_bytes,
                    apply_threshold=False,
                )
                raw_text = self._perform_ocr(preprocessed.image_bytes)

            if not raw_text.strip():
                return failure("No text found in image")

            # An explicit hint wins; otherwise classify from the OCR text
            detected_type = receipt_type or self._detect_receipt_type(raw_text)

            if detected_type == "fuel":
                extracted_fields = self._extract_fuel_fields(raw_text)
            else:
                extracted_fields = self._extract_generic_fields(raw_text)

            processing_time_ms = elapsed_ms()
            logger.info(
                "Receipt extraction: type=%s, fields=%s, time=%sms",
                detected_type,
                len(extracted_fields),
                processing_time_ms,
            )
            return ReceiptExtractionResult(
                success=True,
                receipt_type=detected_type,
                extracted_fields=extracted_fields,
                raw_text=raw_text,
                processing_time_ms=processing_time_ms,
            )
        except Exception as e:
            # Boundary handler: log with traceback and report as a failed
            # result instead of propagating.
            logger.exception("Receipt extraction failed: %s", e)
            return failure(str(e))

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic.

        Falls back to "application/octet-stream" when detection yields an
        empty result.
        """
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes:
        """Extract first page of PDF as PNG image for OCR processing.

        Returns:
            PNG bytes of the first page rendered at 300 DPI, or ``b""`` when
            PyMuPDF is unavailable or rendering fails (the failure is logged).
        """
        try:
            import fitz  # PyMuPDF

            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                page = doc[0]
                # Render at 300 DPI (default is 72, so scale factor = 300/72)
                mat = fitz.Matrix(300 / 72, 300 / 72)
                pix = page.get_pixmap(matrix=mat)
                return pix.tobytes("png")
            finally:
                # Always release the document, including when rendering raises
                doc.close()
        except ImportError:
            logger.warning("PyMuPDF not available, PDF support limited")
        except Exception as e:
            logger.error("PDF first page extraction failed: %s", e)
        return b""

    def _perform_ocr(self, image_bytes: bytes) -> str:
        """
        Perform OCR on preprocessed image via engine abstraction.

        Args:
            image_bytes: Preprocessed image bytes

        Returns:
            Raw OCR text
        """
        config = OcrConfig()
        result = self._engine.recognize(image_bytes, config)
        return result.text

    def _detect_receipt_type(self, text: str) -> str:
        """
        Detect receipt type based on content.

        One point is scored per fuel keyword found in the upper-cased text,
        plus three when the extracted merchant name contains a known station
        name. A score of two or more classifies the receipt as fuel.

        Args:
            text: OCR text

        Returns:
            Receipt type: "fuel" or "unknown"
        """
        text_upper = text.upper()
        fuel_score = sum(1 for kw in self._FUEL_KEYWORDS if kw in text_upper)

        # Check for known gas stations (single lookup, result reused)
        merchant_result = fuel_matcher.extract_merchant_name(text)
        if merchant_result:
            merchant, _ = merchant_result
            merchant_upper = merchant.upper()
            if any(
                station in merchant_upper
                for station in fuel_matcher.STATION_NAMES
            ):
                fuel_score += 3

        if fuel_score >= 2:
            return "fuel"
        return "unknown"

    @staticmethod
    def _collect_matches(
        text: str,
        matchers: tuple[tuple[str, Any], ...],
    ) -> dict[str, ExtractedField]:
        """Run each (field name, matcher) pair in order and keep truthy matches.

        Each matcher is called with ``text``; a truthy result is expected to
        expose ``value`` and ``confidence`` attributes.
        """
        collected: dict[str, ExtractedField] = {}
        for name, matcher in matchers:
            match = matcher(text)
            if match:
                collected[name] = ExtractedField(
                    value=match.value,
                    confidence=match.confidence,
                )
        return collected

    def _extract_fuel_fields(self, text: str) -> dict[str, ExtractedField]:
        """
        Extract fuel-specific fields from receipt text.

        Args:
            text: OCR text

        Returns:
            Dictionary of extracted fields. May contain "merchantName",
            "transactionDate", "totalAmount", "fuelQuantity", "pricePerUnit"
            and "fuelGrade"; a field is omitted when nothing matched.
        """
        fields: dict[str, ExtractedField] = {}

        # Merchant lookup returns a (name, confidence) pair rather than a
        # match object, so it is handled separately.
        merchant_result = fuel_matcher.extract_merchant_name(text)
        if merchant_result:
            merchant_name, confidence = merchant_result
            fields["merchantName"] = ExtractedField(
                value=merchant_name,
                confidence=confidence,
            )

        fields.update(
            self._collect_matches(
                text,
                (
                    ("transactionDate", date_matcher.extract_best_date),
                    ("totalAmount", currency_matcher.extract_total),
                    ("fuelQuantity", fuel_matcher.extract_quantity),
                    ("pricePerUnit", fuel_matcher.extract_price_per_unit),
                    ("fuelGrade", fuel_matcher.extract_grade),
                ),
            )
        )

        # Derive price per unit from total and quantity when it was not found
        if (
            "totalAmount" in fields
            and "fuelQuantity" in fields
            and "pricePerUnit" not in fields
        ):
            total = fields["totalAmount"]
            quantity = fields["fuelQuantity"]
            # Skip a zero/empty quantity: dividing by it would raise and
            # discard every field that was extracted successfully.
            if quantity.value:
                calculated_price = total.value / quantity.value
                # Only use if reasonable
                if 1.0 <= calculated_price <= 10.0:
                    fields["pricePerUnit"] = ExtractedField(
                        value=round(calculated_price, 3),
                        # Lower confidence for calculated value
                        confidence=min(total.confidence, quantity.confidence)
                        * 0.8,
                    )
        return fields

    def _extract_generic_fields(self, text: str) -> dict[str, ExtractedField]:
        """
        Extract generic fields from receipt text.

        Args:
            text: OCR text

        Returns:
            Dictionary of extracted fields: "transactionDate" and
            "totalAmount" when matched, plus "merchantName" guessed from the
            first non-blank line.
        """
        fields = self._collect_matches(
            text,
            (
                ("transactionDate", date_matcher.extract_best_date),
                ("totalAmount", currency_matcher.extract_total),
            ),
        )

        # Try to get merchant from first non-blank line (truncated to 50
        # characters, fixed low confidence since this is only a guess)
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        if lines:
            fields["merchantName"] = ExtractedField(
                value=lines[0][:50],
                confidence=0.40,
            )
        return fields

    def validate(self, data: Any) -> bool:
        """
        Validate extracted receipt data.

        Args:
            data: Extracted data to validate

        Returns:
            True if data is a dict containing at least "totalAmount" or
            "transactionDate"
        """
        if not isinstance(data, dict):
            return False
        # Minimum: must have at least total amount or date
        return "totalAmount" in data or "transactionDate" in data
# Singleton instance, constructed at import time (ReceiptExtractor.__init__
# calls create_engine(), so importing this module builds the OCR engine)
receipt_extractor = ReceiptExtractor()