"""Currency and amount pattern matching for receipt extraction.""" import re from dataclasses import dataclass from decimal import Decimal, InvalidOperation from typing import Optional @dataclass class AmountMatch: """Result of currency/amount pattern matching.""" value: float raw_match: str confidence: float pattern_name: str label: Optional[str] = None # e.g., "TOTAL", "SUBTOTAL" class CurrencyPatternMatcher: """Extract and normalize currency amounts from receipt text.""" # Total amount patterns (prioritized) TOTAL_PATTERNS = [ # TOTAL $XX.XX or TOTAL: $XX.XX ( r"(?:^|\s)TOTAL[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)", "total_explicit", 0.98, ), # AMOUNT DUE $XX.XX ( r"AMOUNT\s*DUE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})", "amount_due", 0.95, ), # SALE $XX.XX ( r"(?:^|\s)SALE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)", "sale_explicit", 0.92, ), # GRAND TOTAL $XX.XX ( r"GRAND\s*TOTAL[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})", "grand_total", 0.97, ), # TOTAL SALE $XX.XX ( r"TOTAL\s*SALE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})", "total_sale", 0.96, ), # BALANCE DUE $XX.XX ( r"BALANCE\s*DUE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})", "balance_due", 0.94, ), # PURCHASE $XX.XX ( r"(?:^|\s)PURCHASE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)", "purchase", 0.88, ), ] # Generic amount patterns (lower priority) AMOUNT_PATTERNS = [ # $XX.XX (standalone dollar amount) ( r"\$\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})", "dollar_amount", 0.60, ), # XX.XX (standalone decimal amount) ( r"(? Optional[AmountMatch]: """ Extract the total amount from receipt text. Prioritizes explicit total patterns over generic amounts. Args: text: Receipt text to search Returns: AmountMatch for total or None if not found """ text_upper = text.upper() # Try total-specific patterns first for pattern, name, confidence in self.TOTAL_PATTERNS: match = re.search(pattern, text_upper, re.MULTILINE) if match: amount = self._parse_amount(match.group(1)) if amount is not None and self._is_reasonable_total(amount): return AmountMatch( value=amount, raw_match=match.group(0).strip(), confidence=confidence, pattern_name=name, label=self._extract_label(name), ) # Fall back to finding the largest reasonable amount all_amounts = self.extract_all_amounts(text) reasonable = [a for a in all_amounts if self._is_reasonable_total(a.value)] if reasonable: # Assume largest amount is the total reasonable.sort(key=lambda x: x.value, reverse=True) best = reasonable[0] # Lower confidence since we're guessing return AmountMatch( value=best.value, raw_match=best.raw_match, confidence=min(0.60, best.confidence), pattern_name="inferred_total", label="TOTAL", ) return None def extract_all_amounts(self, text: str) -> list[AmountMatch]: """ Extract all currency amounts from text. Args: text: Receipt text to search Returns: List of AmountMatch objects """ matches = [] text_upper = text.upper() # Check total patterns for pattern, name, confidence in self.TOTAL_PATTERNS: for match in re.finditer(pattern, text_upper, re.MULTILINE): amount = self._parse_amount(match.group(1)) if amount is not None: matches.append( AmountMatch( value=amount, raw_match=match.group(0).strip(), confidence=confidence, pattern_name=name, label=self._extract_label(name), ) ) # Check generic amount patterns for pattern, name, confidence in self.AMOUNT_PATTERNS: for match in re.finditer(pattern, text_upper): amount = self._parse_amount(match.group(1)) if amount is not None: # Skip if already found by a more specific pattern if not any(abs(m.value - amount) < 0.01 for m in matches): matches.append( AmountMatch( value=amount, raw_match=match.group(0).strip(), confidence=confidence, pattern_name=name, ) ) return matches def _parse_amount(self, amount_str: str) -> Optional[float]: """Parse amount string to float, handling various formats.""" # Remove any spaces cleaned = amount_str.strip().replace(" ", "") # Handle European format (1.234,56) vs US format (1,234.56) # For US receipts, assume comma is thousands separator if "," in cleaned and "." in cleaned: # Determine which is decimal separator (last one) if cleaned.rfind(",") > cleaned.rfind("."): # European format cleaned = cleaned.replace(".", "").replace(",", ".") else: # US format cleaned = cleaned.replace(",", "") elif "," in cleaned: # Could be thousands separator or decimal parts = cleaned.split(",") if len(parts) == 2 and len(parts[1]) == 2: # Likely decimal separator cleaned = cleaned.replace(",", ".") else: # Likely thousands separator cleaned = cleaned.replace(",", "") try: amount = float(Decimal(cleaned)) return amount if amount >= 0 else None except (InvalidOperation, ValueError): return None def _is_reasonable_total(self, amount: float) -> bool: """Check if amount is a reasonable total for a fuel receipt.""" # Reasonable range: $1 to $500 for typical fuel purchases return 1.0 <= amount <= 500.0 def _extract_label(self, pattern_name: str) -> str: """Extract display label from pattern name.""" labels = { "total_explicit": "TOTAL", "amount_due": "AMOUNT DUE", "sale_explicit": "SALE", "grand_total": "GRAND TOTAL", "total_sale": "TOTAL SALE", "balance_due": "BALANCE DUE", "purchase": "PURCHASE", } return labels.get(pattern_name, "TOTAL") # Singleton instance currency_matcher = CurrencyPatternMatcher()