Files
motovaultpro/ocr/app/extractors/receipt_extractor.py
Eric Gullickson 6319d50fb1
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
feat: add receipt OCR pipeline (refs #69)
Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount,
fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 20:43:30 -06:00

346 lines
11 KiB
Python

"""Receipt-specific OCR extractor with field extraction."""
import io
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Optional
import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener
from app.config import settings
from app.extractors.base import BaseExtractor
from app.preprocessors.receipt_preprocessor import receipt_preprocessor
from app.patterns import currency_matcher, date_matcher, fuel_matcher
# Register the HEIF/HEIC opener from pillow_heif once, as an import-time side
# effect, before any image is opened. "image/heic" and "image/heif" are among
# the MIME types this module accepts; presumably this registration is what
# lets PIL's Image.open decode them (confirm against the pillow_heif docs).
register_heif_opener()
# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)
@dataclass
class ExtractedField:
    """One value pulled out of a receipt, paired with its confidence score.

    Attributes:
        value: The extracted value; its concrete type depends on the field.
        confidence: Confidence score reported for this value.
    """

    value: Any
    confidence: float
@dataclass
class ReceiptExtractionResult:
    """Outcome of a single receipt extraction attempt.

    Attributes:
        success: Whether extraction completed and produced a result.
        receipt_type: Receipt category; defaults to "unknown".
        extracted_fields: Extracted fields keyed by field name; a fresh empty
            dict is created for every instance.
        raw_text: OCR text the fields were extracted from.
        processing_time_ms: Elapsed processing time in milliseconds.
        error: Error message for a failed extraction, otherwise None.
    """

    success: bool
    receipt_type: str = "unknown"
    extracted_fields: dict[str, ExtractedField] = field(default_factory=dict)
    raw_text: str = ""
    processing_time_ms: int = 0
    error: Optional[str] = None
class ReceiptExtractor(BaseExtractor):
    """Receipt-specific OCR extractor for fuel and general receipts.

    The pipeline is: validate the MIME type, preprocess the image, run
    Tesseract OCR (retrying once without thresholding when no text is found),
    classify the receipt, then extract fields from the OCR text with the
    pattern matchers.
    """

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
    }

    # Substrings (matched against upper-cased OCR text) that hint at a fuel
    # receipt.
    # NOTE(review): matching is by substring, so "GAL" also matches inside
    # "GALLON" (one word scores twice) and short keywords can match inside
    # unrelated words. Kept as-is to preserve the existing classification.
    _FUEL_KEYWORDS = (
        "GALLON", "GAL", "FUEL", "GAS", "DIESEL", "UNLEADED",
        "REGULAR", "PREMIUM", "OCTANE", "PPG", "PUMP",
    )

    def __init__(self) -> None:
        """Initialize receipt extractor.

        Points pytesseract at the Tesseract binary configured in settings.
        This assigns a module-level pytesseract attribute, so it affects every
        pytesseract user in the process.
        """
        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd

    def extract(
        self,
        image_bytes: bytes,
        content_type: Optional[str] = None,
        receipt_type: Optional[str] = None,
    ) -> ReceiptExtractionResult:
        """
        Extract data from a receipt image.

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            content_type: MIME type (auto-detected if not provided)
            receipt_type: Hint for receipt type ("fuel" for specialized
                extraction); auto-detected from the OCR text if not provided

        Returns:
            ReceiptExtractionResult with extracted fields. Unsupported file
            types, images without readable text, and any exception raised
            during preprocessing, OCR or field extraction are reported as
            ``success=False`` with ``error`` set rather than raised.
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()

        def elapsed_ms() -> int:
            """Return whole milliseconds elapsed since extraction started."""
            return int((time.perf_counter() - start_time) * 1000)

        # Detect content type if not provided
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)

        # Validate content type
        if content_type not in self.SUPPORTED_TYPES:
            return ReceiptExtractionResult(
                success=False,
                error=f"Unsupported file type: {content_type}",
                processing_time_ms=elapsed_ms(),
            )

        try:
            # Apply receipt-optimized preprocessing, then OCR
            preprocessing_result = receipt_preprocessor.preprocess(image_bytes)
            raw_text = self._perform_ocr(preprocessing_result.image_bytes)

            if not raw_text.strip():
                # Try with less aggressive preprocessing
                preprocessing_result = receipt_preprocessor.preprocess(
                    image_bytes,
                    apply_threshold=False,
                )
                raw_text = self._perform_ocr(preprocessing_result.image_bytes)

            if not raw_text.strip():
                return ReceiptExtractionResult(
                    success=False,
                    error="No text found in image",
                    processing_time_ms=elapsed_ms(),
                )

            # Detect receipt type if not specified
            detected_type = receipt_type or self._detect_receipt_type(raw_text)

            # Extract fields based on receipt type
            if detected_type == "fuel":
                extracted_fields = self._extract_fuel_fields(raw_text)
            else:
                extracted_fields = self._extract_generic_fields(raw_text)

            processing_time_ms = elapsed_ms()
            logger.info(
                "Receipt extraction: type=%s, fields=%d, time=%dms",
                detected_type,
                len(extracted_fields),
                processing_time_ms,
            )
            return ReceiptExtractionResult(
                success=True,
                receipt_type=detected_type,
                extracted_fields=extracted_fields,
                raw_text=raw_text,
                processing_time_ms=processing_time_ms,
            )
        except Exception as e:
            # Boundary handler: callers get a failed result, never a raise.
            # logger.exception logs at ERROR level with the traceback.
            logger.exception("Receipt extraction failed: %s", e)
            return ReceiptExtractionResult(
                success=False,
                error=str(e),
                processing_time_ms=elapsed_ms(),
            )

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic.

        Args:
            file_bytes: Raw file content to inspect

        Returns:
            Detected MIME type, or "application/octet-stream" when detection
            yields an empty result
        """
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str:
        """
        Perform OCR on preprocessed image.

        Args:
            image_bytes: Preprocessed image bytes
            psm: Tesseract page segmentation mode (default 6)
                4 = Assume single column of text
                6 = Uniform block of text

        Returns:
            Raw OCR text
        """
        # Open the image in a context manager so its resources are released
        # as soon as OCR has finished.
        with Image.open(io.BytesIO(image_bytes)) as image:
            return pytesseract.image_to_string(image, config=f"--psm {psm}")

    def _detect_receipt_type(self, text: str) -> str:
        """
        Detect receipt type based on content.

        Each fuel keyword found in the text adds 1 to the score; a merchant
        name containing a known station name adds 3. A score of 2 or more
        classifies the receipt as fuel.

        Args:
            text: OCR text

        Returns:
            Receipt type: "fuel" or "unknown"
        """
        text_upper = text.upper()

        # Fuel receipt indicators
        fuel_score = sum(1 for kw in self._FUEL_KEYWORDS if kw in text_upper)

        # Check for known gas stations (run the matcher once, reuse result)
        merchant_result = fuel_matcher.extract_merchant_name(text)
        if merchant_result:
            merchant, _ = merchant_result
            merchant_upper = merchant.upper()
            if any(
                station in merchant_upper
                for station in fuel_matcher.STATION_NAMES
            ):
                fuel_score += 3

        if fuel_score >= 2:
            return "fuel"
        return "unknown"

    @staticmethod
    def _store_match(
        fields: dict[str, ExtractedField],
        name: str,
        match: Any,
    ) -> None:
        """Store a matcher result under ``name`` if the matcher found one.

        Args:
            fields: Field dictionary to update in place
            name: Key to store the field under
            match: Matcher result exposing ``value`` and ``confidence``, or a
                falsy value when nothing was matched
        """
        if match:
            fields[name] = ExtractedField(
                value=match.value,
                confidence=match.confidence,
            )

    def _extract_fuel_fields(self, text: str) -> dict[str, ExtractedField]:
        """
        Extract fuel-specific fields from receipt text.

        Possible keys: merchantName, transactionDate, totalAmount,
        fuelQuantity, pricePerUnit, fuelGrade. When no price per unit is
        matched but total and quantity are, the price is derived from them
        with reduced confidence.

        Args:
            text: OCR text

        Returns:
            Dictionary of extracted fields
        """
        fields: dict[str, ExtractedField] = {}

        # Extract merchant name (matcher returns a (name, confidence) pair)
        merchant_result = fuel_matcher.extract_merchant_name(text)
        if merchant_result:
            merchant_name, confidence = merchant_result
            fields["merchantName"] = ExtractedField(
                value=merchant_name,
                confidence=confidence,
            )

        # Remaining matchers share the same value/confidence result shape
        self._store_match(
            fields, "transactionDate", date_matcher.extract_best_date(text)
        )
        self._store_match(
            fields, "totalAmount", currency_matcher.extract_total(text)
        )
        self._store_match(
            fields, "fuelQuantity", fuel_matcher.extract_quantity(text)
        )
        self._store_match(
            fields, "pricePerUnit", fuel_matcher.extract_price_per_unit(text)
        )
        self._store_match(
            fields, "fuelGrade", fuel_matcher.extract_grade(text)
        )

        # Calculate derived values if we have enough data
        if (
            "pricePerUnit" not in fields
            and "totalAmount" in fields
            and "fuelQuantity" in fields
        ):
            total = fields["totalAmount"]
            quantity = fields["fuelQuantity"]
            # A zero quantity (e.g. an OCR misread) would raise on division
            # and abort the whole extraction; skip the derived price instead.
            if quantity.value:
                calculated_price = total.value / quantity.value
                # Only use if reasonable
                if 1.0 <= calculated_price <= 10.0:
                    fields["pricePerUnit"] = ExtractedField(
                        value=round(calculated_price, 3),
                        # Lower confidence for calculated value
                        confidence=min(total.confidence, quantity.confidence)
                        * 0.8,
                    )

        return fields

    def _extract_generic_fields(self, text: str) -> dict[str, ExtractedField]:
        """
        Extract generic fields from receipt text.

        Possible keys: transactionDate, totalAmount, merchantName.

        Args:
            text: OCR text

        Returns:
            Dictionary of extracted fields
        """
        fields: dict[str, ExtractedField] = {}

        # Extract date
        self._store_match(
            fields, "transactionDate", date_matcher.extract_best_date(text)
        )

        # Extract total amount
        self._store_match(
            fields, "totalAmount", currency_matcher.extract_total(text)
        )

        # Try to get merchant from first non-blank line (truncated to 50
        # characters); this is a guess, hence the fixed low confidence.
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        if lines:
            fields["merchantName"] = ExtractedField(
                value=lines[0][:50],
                confidence=0.40,
            )

        return fields

    def validate(self, data: Any) -> bool:
        """
        Validate extracted receipt data.

        Args:
            data: Extracted data to validate

        Returns:
            True if data is a dict with at least one of the minimum required
            fields ("totalAmount" or "transactionDate")
        """
        if not isinstance(data, dict):
            return False
        # Minimum: must have at least total amount or date
        return "totalAmount" in data or "transactionDate" in data
# Singleton instance, created once when this module is imported. Constructing
# it runs ReceiptExtractor.__init__, which points pytesseract at
# settings.tesseract_cmd.
receipt_extractor = ReceiptExtractor()