"""Receipt-specific OCR extractor with field extraction."""

import io
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Optional

import magic
from pillow_heif import register_heif_opener

from app.engines import OcrConfig, create_engine
from app.extractors.base import BaseExtractor
from app.preprocessors.receipt_preprocessor import receipt_preprocessor
from app.patterns import currency_matcher, date_matcher, fuel_matcher

# Register HEIF/HEIC opener
register_heif_opener()

logger = logging.getLogger(__name__)


@dataclass
class ExtractedField:
    """A single extracted field with confidence.

    Attributes:
        value: The extracted value; its type depends on the matcher that
            produced it.
        confidence: Confidence score reported by the matcher (or assigned by
            this module for heuristic/derived values).
    """

    value: Any
    confidence: float


@dataclass
class ReceiptExtractionResult:
    """Result of receipt extraction.

    Attributes:
        success: True when OCR produced text and field extraction ran.
        receipt_type: Receipt type used for extraction ("unknown" by default).
        extracted_fields: Extracted fields keyed by field name.
        raw_text: Raw OCR text the fields were extracted from.
        processing_time_ms: Elapsed processing time in milliseconds.
        error: Error message when ``success`` is False, otherwise None.
    """

    success: bool
    receipt_type: str = "unknown"
    extracted_fields: dict[str, ExtractedField] = field(default_factory=dict)
    raw_text: str = ""
    processing_time_ms: int = 0
    error: Optional[str] = None


class ReceiptExtractor(BaseExtractor):
    """Receipt-specific OCR extractor for fuel and general receipts."""

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
        "application/pdf",
    }

    def __init__(self) -> None:
        """Initialize receipt extractor with engine from factory."""
        self._engine = create_engine()

    def extract(
        self,
        image_bytes: bytes,
        content_type: Optional[str] = None,
        receipt_type: Optional[str] = None,
    ) -> ReceiptExtractionResult:
        """
        Extract data from a receipt image.

        Processing failures after MIME validation (PDF conversion,
        preprocessing, OCR, field extraction) are caught, logged, and reported
        through a failed result rather than raised. MIME auto-detection runs
        before that guard, so errors it raises are not caught here.

        Args:
            image_bytes: Raw image or PDF bytes (HEIC, JPEG, PNG, PDF)
            content_type: MIME type (auto-detected if not provided)
            receipt_type: Hint for receipt type ("fuel" for specialized extraction)

        Returns:
            ReceiptExtractionResult with extracted fields
        """
        # perf_counter is monotonic, so the reported duration cannot go
        # negative or jump when the system wall clock is adjusted.
        start_time = time.perf_counter()

        def elapsed_ms() -> int:
            return int((time.perf_counter() - start_time) * 1000)

        def failure(message: str) -> ReceiptExtractionResult:
            return ReceiptExtractionResult(
                success=False,
                error=message,
                processing_time_ms=elapsed_ms(),
            )

        # Detect content type if not provided
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)

        # Validate content type
        if content_type not in self.SUPPORTED_TYPES:
            return failure(f"Unsupported file type: {content_type}")

        try:
            # Convert PDF to image (first page)
            if content_type == "application/pdf":
                image_bytes = self._extract_pdf_first_page(image_bytes)
                if not image_bytes:
                    return failure("Failed to extract image from PDF")

            # Apply receipt-optimized preprocessing
            preprocessing_result = receipt_preprocessor.preprocess(image_bytes)
            raw_text = self._perform_ocr(preprocessing_result.image_bytes)

            if not raw_text.strip():
                # Try with less aggressive preprocessing
                preprocessing_result = receipt_preprocessor.preprocess(
                    image_bytes,
                    apply_threshold=False,
                )
                raw_text = self._perform_ocr(preprocessing_result.image_bytes)

            if not raw_text.strip():
                return failure("No text found in image")

            # Detect receipt type if not specified
            detected_type = receipt_type or self._detect_receipt_type(raw_text)

            # Extract fields based on receipt type
            if detected_type == "fuel":
                extracted_fields = self._extract_fuel_fields(raw_text)
            else:
                extracted_fields = self._extract_generic_fields(raw_text)

            processing_time_ms = elapsed_ms()

            logger.info(
                "Receipt extraction: type=%s, fields=%s, time=%sms",
                detected_type,
                len(extracted_fields),
                processing_time_ms,
            )

            return ReceiptExtractionResult(
                success=True,
                receipt_type=detected_type,
                extracted_fields=extracted_fields,
                raw_text=raw_text,
                processing_time_ms=processing_time_ms,
            )

        except Exception as e:
            # Top-level boundary: report the failure to the caller as a result
            # object, keeping the traceback in the log.
            logger.exception("Receipt extraction failed: %s", e)
            return failure(str(e))

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic.

        Falls back to "application/octet-stream" when detection returns an
        empty value.
        """
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes:
        """Extract first page of PDF as PNG image for OCR processing.

        Returns:
            PNG bytes of the first page, or ``b""`` when pdf2image is not
            installed, conversion fails, or no page image is produced.
        """
        try:
            # Imported lazily so a missing pdf2image only disables PDF support.
            from pdf2image import convert_from_bytes

            images = convert_from_bytes(
                pdf_bytes, first_page=1, last_page=1, dpi=300
            )
            if images:
                buffer = io.BytesIO()
                images[0].save(buffer, format="PNG")
                return buffer.getvalue()
        except ImportError:
            logger.warning("pdf2image not available, PDF support limited")
        except Exception as e:
            logger.error("PDF first page extraction failed: %s", e)

        return b""

    def _perform_ocr(self, image_bytes: bytes) -> str:
        """
        Perform OCR on preprocessed image via engine abstraction.

        Args:
            image_bytes: Preprocessed image bytes

        Returns:
            Raw OCR text
        """
        config = OcrConfig()
        result = self._engine.recognize(image_bytes, config)
        return result.text

    def _detect_receipt_type(self, text: str) -> str:
        """
        Detect receipt type based on content.

        Each fuel keyword found in the upper-cased text adds 1 to a score, a
        recognized station name in the merchant adds 3, and a score of 2 or
        more classifies the receipt as fuel.

        Args:
            text: OCR text

        Returns:
            Receipt type: "fuel" or "unknown"
        """
        text_upper = text.upper()

        # Fuel receipt indicators
        # NOTE(review): matching is by substring, so "GAL" also matches inside
        # "GALLON" (one word scores 2) and "GAS" inside unrelated words.
        fuel_keywords = [
            "GALLON",
            "GAL",
            "FUEL",
            "GAS",
            "DIESEL",
            "UNLEADED",
            "REGULAR",
            "PREMIUM",
            "OCTANE",
            "PPG",
            "PUMP",
        ]

        fuel_score = sum(1 for kw in fuel_keywords if kw in text_upper)

        # Check for known gas stations (matcher is queried once and reused)
        merchant_result = fuel_matcher.extract_merchant_name(text)
        if merchant_result:
            merchant, _ = merchant_result
            merchant_upper = merchant.upper()
            # Upper-case both sides so the comparison is case-insensitive
            # regardless of how STATION_NAMES is cased.
            if any(
                station.upper() in merchant_upper
                for station in fuel_matcher.STATION_NAMES
            ):
                fuel_score += 3

        if fuel_score >= 2:
            return "fuel"

        return "unknown"

    @staticmethod
    def _store_match(
        fields: dict[str, ExtractedField], key: str, match: Any
    ) -> None:
        """Store ``match`` under ``key`` as an ExtractedField if it is truthy."""
        if match:
            fields[key] = ExtractedField(
                value=match.value,
                confidence=match.confidence,
            )

    def _extract_fuel_fields(self, text: str) -> dict[str, ExtractedField]:
        """
        Extract fuel-specific fields from receipt text.

        When total and quantity are found but no price per unit is, the price
        is derived as total / quantity and kept only if it falls within
        1.0-10.0, at reduced confidence.

        Args:
            text: OCR text

        Returns:
            Dictionary of extracted fields
        """
        fields: dict[str, ExtractedField] = {}

        # Extract merchant name
        merchant_result = fuel_matcher.extract_merchant_name(text)
        if merchant_result:
            merchant_name, confidence = merchant_result
            fields["merchantName"] = ExtractedField(
                value=merchant_name,
                confidence=confidence,
            )

        # Extract the matcher-backed fields (order fixes dict key order)
        self._store_match(
            fields, "transactionDate", date_matcher.extract_best_date(text)
        )
        self._store_match(
            fields, "totalAmount", currency_matcher.extract_total(text)
        )
        self._store_match(
            fields, "fuelQuantity", fuel_matcher.extract_quantity(text)
        )
        self._store_match(
            fields, "pricePerUnit", fuel_matcher.extract_price_per_unit(text)
        )
        self._store_match(fields, "fuelGrade", fuel_matcher.extract_grade(text))

        # Calculate derived values if we have enough data
        if (
            "totalAmount" in fields
            and "fuelQuantity" in fields
            and "pricePerUnit" not in fields
        ):
            total = fields["totalAmount"]
            quantity = fields["fuelQuantity"]
            try:
                # Calculate price per unit from total and quantity
                calculated_price = total.value / quantity.value
            except ZeroDivisionError:
                # A zero quantity (e.g. an OCR misread) must not abort the
                # whole extraction; the derived price is simply omitted.
                return fields

            # Only use if reasonable
            if 1.0 <= calculated_price <= 10.0:
                fields["pricePerUnit"] = ExtractedField(
                    value=round(calculated_price, 3),
                    # Lower confidence for calculated value
                    confidence=min(total.confidence, quantity.confidence) * 0.8,
                )

        return fields

    def _extract_generic_fields(self, text: str) -> dict[str, ExtractedField]:
        """
        Extract generic fields from receipt text.

        The merchant name is a heuristic: the first non-blank line of the
        text, truncated to 50 characters, with a fixed confidence of 0.40.

        Args:
            text: OCR text

        Returns:
            Dictionary of extracted fields
        """
        fields: dict[str, ExtractedField] = {}

        # Extract date
        self._store_match(
            fields, "transactionDate", date_matcher.extract_best_date(text)
        )

        # Extract total amount
        self._store_match(
            fields, "totalAmount", currency_matcher.extract_total(text)
        )

        # Try to get merchant from first line
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        if lines:
            fields["merchantName"] = ExtractedField(
                value=lines[0][:50],
                confidence=0.40,
            )

        return fields

    def validate(self, data: Any) -> bool:
        """
        Validate extracted receipt data.

        Args:
            data: Extracted data to validate

        Returns:
            True if data has minimum required fields
        """
        if not isinstance(data, dict):
            return False

        # Minimum: must have at least total amount or date
        return "totalAmount" in data or "transactionDate" in data


# Singleton instance
receipt_extractor = ReceiptExtractor()