"""Receipt-specific OCR extractor with field extraction.""" import io import logging import time from dataclasses import dataclass, field from typing import Any, Optional import magic import pytesseract from PIL import Image from pillow_heif import register_heif_opener from app.config import settings from app.extractors.base import BaseExtractor from app.preprocessors.receipt_preprocessor import receipt_preprocessor from app.patterns import currency_matcher, date_matcher, fuel_matcher # Register HEIF/HEIC opener register_heif_opener() logger = logging.getLogger(__name__) @dataclass class ExtractedField: """A single extracted field with confidence.""" value: Any confidence: float @dataclass class ReceiptExtractionResult: """Result of receipt extraction.""" success: bool receipt_type: str = "unknown" extracted_fields: dict[str, ExtractedField] = field(default_factory=dict) raw_text: str = "" processing_time_ms: int = 0 error: Optional[str] = None class ReceiptExtractor(BaseExtractor): """Receipt-specific OCR extractor for fuel and general receipts.""" # Supported MIME types SUPPORTED_TYPES = { "image/jpeg", "image/png", "image/heic", "image/heif", } def __init__(self) -> None: """Initialize receipt extractor.""" pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd def extract( self, image_bytes: bytes, content_type: Optional[str] = None, receipt_type: Optional[str] = None, ) -> ReceiptExtractionResult: """ Extract data from a receipt image. Args: image_bytes: Raw image bytes (HEIC, JPEG, PNG) content_type: MIME type (auto-detected if not provided) receipt_type: Hint for receipt type ("fuel" for specialized extraction) Returns: ReceiptExtractionResult with extracted fields """ start_time = time.time() # Detect content type if not provided if not content_type: content_type = self._detect_mime_type(image_bytes) # Validate content type if content_type not in self.SUPPORTED_TYPES: return ReceiptExtractionResult( success=False, error=f"Unsupported file type: {content_type}", processing_time_ms=int((time.time() - start_time) * 1000), ) try: # Apply receipt-optimized preprocessing preprocessing_result = receipt_preprocessor.preprocess(image_bytes) preprocessed_bytes = preprocessing_result.image_bytes # Perform OCR raw_text = self._perform_ocr(preprocessed_bytes) if not raw_text.strip(): # Try with less aggressive preprocessing preprocessing_result = receipt_preprocessor.preprocess( image_bytes, apply_threshold=False, ) preprocessed_bytes = preprocessing_result.image_bytes raw_text = self._perform_ocr(preprocessed_bytes) if not raw_text.strip(): return ReceiptExtractionResult( success=False, error="No text found in image", processing_time_ms=int((time.time() - start_time) * 1000), ) # Detect receipt type if not specified detected_type = receipt_type or self._detect_receipt_type(raw_text) # Extract fields based on receipt type if detected_type == "fuel": extracted_fields = self._extract_fuel_fields(raw_text) else: extracted_fields = self._extract_generic_fields(raw_text) processing_time_ms = int((time.time() - start_time) * 1000) logger.info( f"Receipt extraction: type={detected_type}, " f"fields={len(extracted_fields)}, " f"time={processing_time_ms}ms" ) return ReceiptExtractionResult( success=True, receipt_type=detected_type, extracted_fields=extracted_fields, raw_text=raw_text, processing_time_ms=processing_time_ms, ) except Exception as e: logger.error(f"Receipt extraction failed: {e}", exc_info=True) return ReceiptExtractionResult( success=False, error=str(e), processing_time_ms=int((time.time() - start_time) * 1000), ) def _detect_mime_type(self, file_bytes: bytes) -> str: """Detect MIME type using python-magic.""" mime = magic.Magic(mime=True) detected = mime.from_buffer(file_bytes) return detected or "application/octet-stream" def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str: """ Perform OCR on preprocessed image. Args: image_bytes: Preprocessed image bytes psm: Tesseract page segmentation mode 4 = Assume single column of text 6 = Uniform block of text Returns: Raw OCR text """ image = Image.open(io.BytesIO(image_bytes)) # Configure Tesseract for receipt OCR # PSM 4 works well for columnar receipt text config = f"--psm {psm}" return pytesseract.image_to_string(image, config=config) def _detect_receipt_type(self, text: str) -> str: """ Detect receipt type based on content. Args: text: OCR text Returns: Receipt type: "fuel", "retail", or "unknown" """ text_upper = text.upper() # Fuel receipt indicators fuel_keywords = [ "GALLON", "GAL", "FUEL", "GAS", "DIESEL", "UNLEADED", "REGULAR", "PREMIUM", "OCTANE", "PPG", "PUMP", ] fuel_score = sum(1 for kw in fuel_keywords if kw in text_upper) # Check for known gas stations if fuel_matcher.extract_merchant_name(text): merchant, _ = fuel_matcher.extract_merchant_name(text) if any( station in merchant.upper() for station in fuel_matcher.STATION_NAMES ): fuel_score += 3 if fuel_score >= 2: return "fuel" return "unknown" def _extract_fuel_fields(self, text: str) -> dict[str, ExtractedField]: """ Extract fuel-specific fields from receipt text. Args: text: OCR text Returns: Dictionary of extracted fields """ fields: dict[str, ExtractedField] = {} # Extract merchant name merchant_result = fuel_matcher.extract_merchant_name(text) if merchant_result: merchant_name, confidence = merchant_result fields["merchantName"] = ExtractedField( value=merchant_name, confidence=confidence, ) # Extract transaction date date_match = date_matcher.extract_best_date(text) if date_match: fields["transactionDate"] = ExtractedField( value=date_match.value, confidence=date_match.confidence, ) # Extract total amount total_match = currency_matcher.extract_total(text) if total_match: fields["totalAmount"] = ExtractedField( value=total_match.value, confidence=total_match.confidence, ) # Extract fuel quantity quantity_match = fuel_matcher.extract_quantity(text) if quantity_match: fields["fuelQuantity"] = ExtractedField( value=quantity_match.value, confidence=quantity_match.confidence, ) # Extract price per unit price_match = fuel_matcher.extract_price_per_unit(text) if price_match: fields["pricePerUnit"] = ExtractedField( value=price_match.value, confidence=price_match.confidence, ) # Extract fuel grade grade_match = fuel_matcher.extract_grade(text) if grade_match: fields["fuelGrade"] = ExtractedField( value=grade_match.value, confidence=grade_match.confidence, ) # Calculate derived values if we have enough data if "totalAmount" in fields and "fuelQuantity" in fields: if "pricePerUnit" not in fields: # Calculate price per unit from total and quantity calculated_price = ( fields["totalAmount"].value / fields["fuelQuantity"].value ) # Only use if reasonable if 1.0 <= calculated_price <= 10.0: fields["pricePerUnit"] = ExtractedField( value=round(calculated_price, 3), confidence=min( fields["totalAmount"].confidence, fields["fuelQuantity"].confidence, ) * 0.8, # Lower confidence for calculated value ) return fields def _extract_generic_fields(self, text: str) -> dict[str, ExtractedField]: """ Extract generic fields from receipt text. Args: text: OCR text Returns: Dictionary of extracted fields """ fields: dict[str, ExtractedField] = {} # Extract date date_match = date_matcher.extract_best_date(text) if date_match: fields["transactionDate"] = ExtractedField( value=date_match.value, confidence=date_match.confidence, ) # Extract total amount total_match = currency_matcher.extract_total(text) if total_match: fields["totalAmount"] = ExtractedField( value=total_match.value, confidence=total_match.confidence, ) # Try to get merchant from first line lines = [l.strip() for l in text.split("\n") if l.strip()] if lines: fields["merchantName"] = ExtractedField( value=lines[0][:50], confidence=0.40, ) return fields def validate(self, data: Any) -> bool: """ Validate extracted receipt data. Args: data: Extracted data to validate Returns: True if data has minimum required fields """ if not isinstance(data, dict): return False # Minimum: must have at least total amount or date return "totalAmount" in data or "transactionDate" in data # Singleton instance receipt_extractor = ReceiptExtractor()