# motovaultpro/ocr/app/patterns/currency_patterns.py

"""Currency and amount pattern matching for receipt extraction."""
import re
from dataclasses import dataclass
from decimal import Decimal, InvalidOperation
from typing import Optional


@dataclass
class AmountMatch:
    """Result of currency/amount pattern matching.

    One instance describes a single amount found in receipt text, together
    with the pattern that found it and the confidence attached to that pattern.
    """

    # Parsed numeric amount.
    value: float
    # Text matched by the pattern (taken from the upper-cased input, stripped).
    raw_match: str
    # Confidence score assigned to the pattern that produced this match.
    confidence: float
    # Identifier of the pattern that matched, e.g. "total_explicit".
    pattern_name: str
    # Keyword label for labeled matches; left as None for generic amounts.
    label: Optional[str] = None  # e.g., "TOTAL", "AMOUNT DUE"


class CurrencyPatternMatcher:
    """Extract and normalize currency amounts from receipt text.

    Two pattern tables drive the matcher. Every entry is a
    ``(regex, pattern_name, confidence)`` tuple whose first capture group is
    the amount text:

    * ``TOTAL_PATTERNS`` - an amount that follows a keyword (TOTAL, AMOUNT
      DUE, ...). These produce labeled matches.
    * ``AMOUNT_PATTERNS`` - any bare amount. These produce unlabeled matches.

    All matching is done against an upper-cased copy of the input.
    """

    # Inclusive bounds of what counts as a plausible fuel-receipt total.
    MIN_REASONABLE_TOTAL = 1.0
    MAX_REASONABLE_TOTAL = 500.0

    # Total amount patterns. extract_total() tries them in list order and
    # returns the first reasonable hit, so list order is the priority order.
    TOTAL_PATTERNS = [
        # TOTAL $XX.XX or TOTAL: $XX.XX
        (
            r"(?:^|\s)TOTAL[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
            "total_explicit",
            0.98,
        ),
        # AMOUNT DUE $XX.XX
        (
            r"AMOUNT\s*DUE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "amount_due",
            0.95,
        ),
        # SALE $XX.XX
        (
            r"(?:^|\s)SALE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
            "sale_explicit",
            0.92,
        ),
        # GRAND TOTAL $XX.XX
        (
            r"GRAND\s*TOTAL[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "grand_total",
            0.97,
        ),
        # TOTAL SALE $XX.XX
        (
            r"TOTAL\s*SALE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "total_sale",
            0.96,
        ),
        # BALANCE DUE $XX.XX
        (
            r"BALANCE\s*DUE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "balance_due",
            0.94,
        ),
        # PURCHASE $XX.XX
        (
            r"(?:^|\s)PURCHASE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
            "purchase",
            0.88,
        ),
    ]

    # Generic amount patterns (lower priority)
    AMOUNT_PATTERNS = [
        # $XX.XX (standalone dollar amount)
        (
            r"\$\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "dollar_amount",
            0.60,
        ),
        # XX.XX (standalone decimal amount)
        (
            r"(?<![.$\d])(\d{1,6}[.,]\d{2})(?![.\d])",
            "decimal_amount",
            0.40,
        ),
    ]

    # Display label for each labeled pattern name.
    _LABELS = {
        "total_explicit": "TOTAL",
        "amount_due": "AMOUNT DUE",
        "sale_explicit": "SALE",
        "grand_total": "GRAND TOTAL",
        "total_sale": "TOTAL SALE",
        "balance_due": "BALANCE DUE",
        "purchase": "PURCHASE",
    }

    def extract_total(self, text: str) -> Optional[AmountMatch]:
        """
        Extract the total amount from receipt text.

        Labeled total patterns are tried in list order. Within one pattern
        every occurrence is examined, so an out-of-range first hit (for
        example a "TOTAL 0.00" line) does not hide a later valid one. If no
        labeled pattern yields a reasonable amount, the largest reasonable
        amount found anywhere in the text is returned as an inferred total
        with confidence capped at 0.60.

        Args:
            text: Receipt text to search

        Returns:
            AmountMatch for total or None if not found
        """
        if not text:
            return None

        text_upper = text.upper()

        # Try total-specific patterns first
        for pattern, name, confidence in self.TOTAL_PATTERNS:
            for match in re.finditer(pattern, text_upper, re.MULTILINE):
                amount = self._parse_amount(match.group(1))
                if amount is not None and self._is_reasonable_total(amount):
                    return AmountMatch(
                        value=amount,
                        raw_match=match.group(0).strip(),
                        confidence=confidence,
                        pattern_name=name,
                        label=self._extract_label(name),
                    )

        # Fall back to finding the largest reasonable amount
        reasonable = [
            candidate
            for candidate in self.extract_all_amounts(text)
            if self._is_reasonable_total(candidate.value)
        ]
        if not reasonable:
            return None

        # Assume largest amount is the total (first one wins on ties)
        best = max(reasonable, key=lambda candidate: candidate.value)
        # Lower confidence since we're guessing
        return AmountMatch(
            value=best.value,
            raw_match=best.raw_match,
            confidence=min(0.60, best.confidence),
            pattern_name="inferred_total",
            label="TOTAL",
        )

    def extract_all_amounts(self, text: str) -> list[AmountMatch]:
        """
        Extract all currency amounts from text.

        Labeled matches come first (one per pattern occurrence), followed by
        generic matches. A generic match is dropped when it overlaps amount
        text that was already captured, or when its value is within 0.01 of
        an amount already in the result.

        Args:
            text: Receipt text to search

        Returns:
            List of AmountMatch objects
        """
        if not text:
            return []

        matches: list[AmountMatch] = []
        # (start, end) spans of amount text already consumed by some pattern.
        claimed: list[tuple[int, int]] = []
        text_upper = text.upper()

        # Check total patterns
        for pattern, name, confidence in self.TOTAL_PATTERNS:
            for match in re.finditer(pattern, text_upper, re.MULTILINE):
                amount = self._parse_amount(match.group(1))
                if amount is None:
                    continue
                claimed.append(match.span(1))
                matches.append(
                    AmountMatch(
                        value=amount,
                        raw_match=match.group(0).strip(),
                        confidence=confidence,
                        pattern_name=name,
                        label=self._extract_label(name),
                    )
                )

        # Check generic amount patterns
        for pattern, name, confidence in self.AMOUNT_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                amount = self._parse_amount(match.group(1))
                if amount is None:
                    continue

                # Without this check the decimal pattern re-matches the tail
                # of a grouped amount ("234.56" inside "$1,234.56") and
                # reports it as a separate, wrong amount.
                start, end = match.span(1)
                if any(start < c_end and c_start < end for c_start, c_end in claimed):
                    continue
                claimed.append((start, end))

                # Skip if already found by a more specific pattern
                if any(abs(m.value - amount) < 0.01 for m in matches):
                    continue

                matches.append(
                    AmountMatch(
                        value=amount,
                        raw_match=match.group(0).strip(),
                        confidence=confidence,
                        pattern_name=name,
                    )
                )

        return matches

    def _parse_amount(self, amount_str: str) -> Optional[float]:
        """Parse amount string to float, handling various formats.

        Separator rules:

        * Both "," and "." present: whichever comes last is the decimal
          point ("1,234.56" and "1.234,56" both give 1234.56).
        * Only one kind of separator, followed by exactly two trailing
          digits: that last separator is the decimal point and earlier ones
          are grouping ("12,34" -> 12.34, "1,234,56" -> 1234.56,
          "1.234.56" -> 1234.56). This mirrors what the amount regexes
          capture: optional grouping separator, then separator + 2 digits.
        * Only commas otherwise: treated as grouping ("1,234" -> 1234.0).

        Args:
            amount_str: Amount text as captured by a pattern

        Returns:
            The non-negative amount, or None if the text is not a finite,
            non-negative number.
        """
        # Remove any spaces
        cleaned = amount_str.strip().replace(" ", "")

        # Handle European format (1.234,56) vs US format (1,234.56)
        if "," in cleaned and "." in cleaned:
            # Determine which is decimal separator (last one)
            if cleaned.rfind(",") > cleaned.rfind("."):
                # European format
                cleaned = cleaned.replace(".", "").replace(",", ".")
            else:
                # US format
                cleaned = cleaned.replace(",", "")
        elif "," in cleaned:
            head, _, tail = cleaned.rpartition(",")
            if len(tail) == 2:
                # Last comma is the decimal separator; earlier ones group.
                cleaned = f"{head.replace(',', '')}.{tail}"
            else:
                # Likely thousands separator
                cleaned = cleaned.replace(",", "")
        elif cleaned.count(".") > 1:
            head, _, tail = cleaned.rpartition(".")
            if len(tail) == 2:
                # e.g. "1.234.56": comma misread as a period by OCR.
                cleaned = f"{head.replace('.', '')}.{tail}"

        try:
            value = Decimal(cleaned)
        except (InvalidOperation, ValueError):
            return None

        # Decimal accepts "NaN"/"Infinity" literals; neither is an amount.
        if not value.is_finite() or value < 0:
            return None
        return float(value)

    def _is_reasonable_total(self, amount: float) -> bool:
        """Check if amount is a reasonable total for a fuel receipt.

        The accepted range is inclusive and defined by the class attributes
        ``MIN_REASONABLE_TOTAL`` and ``MAX_REASONABLE_TOTAL`` ($1 to $500).
        """
        return self.MIN_REASONABLE_TOTAL <= amount <= self.MAX_REASONABLE_TOTAL

    def _extract_label(self, pattern_name: str) -> str:
        """Extract display label from pattern name.

        Unknown pattern names fall back to "TOTAL".
        """
        return self._LABELS.get(pattern_name, "TOTAL")


# Shared module-level instance, created once at import time, for callers that
# do not need to construct their own matcher.
currency_matcher = CurrencyPatternMatcher()