All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement receipt-specific OCR extraction for fuel receipts:
- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount, fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
228 lines
7.5 KiB
Python
228 lines
7.5 KiB
Python
"""Currency and amount pattern matching for receipt extraction."""
|
|
import re
|
|
from dataclasses import dataclass
|
|
from decimal import Decimal, InvalidOperation
|
|
from typing import Optional
|
|
|
|
|
|
@dataclass
class AmountMatch:
    """Result of currency/amount pattern matching.

    Attributes:
        value: Parsed amount as a non-negative float.
        raw_match: Stripped text of the regex match that produced the value.
            Matching runs on upper-cased text, so this is upper-cased too.
        confidence: Confidence score attached to the pattern that matched.
        pattern_name: Identifier of the matching pattern, or
            "inferred_total" when the total was guessed from generic amounts.
        label: Display label for total-style matches; None for generic
            amounts, which carry no label.
    """

    value: float
    raw_match: str
    confidence: float
    pattern_name: str
    label: Optional[str] = None  # e.g., "TOTAL", "SUBTOTAL"


class CurrencyPatternMatcher:
    """Extract and normalize currency amounts from receipt text."""

    # Inclusive bounds for what counts as a plausible fuel-receipt total.
    MIN_REASONABLE_TOTAL = 1.0
    MAX_REASONABLE_TOTAL = 500.0

    # Total amount patterns, tried in order: (regex, pattern name, confidence).
    # Compound labels come first. The plain TOTAL and SALE patterns only
    # require whitespace before the keyword, so they also match
    # "GRAND TOTAL ..." and "TOTAL SALE ..." lines and would otherwise win
    # before the more specific patterns are ever tried.
    TOTAL_PATTERNS = [
        # GRAND TOTAL $XX.XX
        (
            r"GRAND\s*TOTAL[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "grand_total",
            0.97,
        ),
        # TOTAL SALE $XX.XX
        (
            r"TOTAL\s*SALE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "total_sale",
            0.96,
        ),
        # TOTAL $XX.XX or TOTAL: $XX.XX
        (
            r"(?:^|\s)TOTAL[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
            "total_explicit",
            0.98,
        ),
        # AMOUNT DUE $XX.XX
        (
            r"AMOUNT\s*DUE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "amount_due",
            0.95,
        ),
        # SALE $XX.XX
        (
            r"(?:^|\s)SALE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
            "sale_explicit",
            0.92,
        ),
        # BALANCE DUE $XX.XX
        (
            r"BALANCE\s*DUE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "balance_due",
            0.94,
        ),
        # PURCHASE $XX.XX
        (
            r"(?:^|\s)PURCHASE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
            "purchase",
            0.88,
        ),
    ]

    # Generic amount patterns (lower priority)
    AMOUNT_PATTERNS = [
        # $XX.XX (standalone dollar amount)
        (
            r"\$\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "dollar_amount",
            0.60,
        ),
        # XX.XX (standalone decimal amount). The second lookbehind rejects
        # the tail of a thousands-separated number, so "1,234.56" does not
        # also produce a bogus "234.56".
        (
            r"(?<![.$\d])(?<!\d,)(\d{1,6}[.,]\d{2})(?![.\d])",
            "decimal_amount",
            0.40,
        ),
    ]

    # Display label for each total pattern name.
    _LABELS = {
        "total_explicit": "TOTAL",
        "amount_due": "AMOUNT DUE",
        "sale_explicit": "SALE",
        "grand_total": "GRAND TOTAL",
        "total_sale": "TOTAL SALE",
        "balance_due": "BALANCE DUE",
        "purchase": "PURCHASE",
    }

    def extract_total(self, text: str) -> Optional[AmountMatch]:
        """
        Extract the total amount from receipt text.

        Prioritizes explicit total patterns over generic amounts. Every
        occurrence of each total pattern is checked, so an implausible first
        hit (for example "TOTAL 0.00") does not hide a later valid one.

        Args:
            text: Receipt text to search

        Returns:
            AmountMatch for total or None if not found
        """
        if not text:
            return None

        text_upper = text.upper()

        # Try total-specific patterns first, in priority order.
        for pattern, name, confidence in self.TOTAL_PATTERNS:
            for match in re.finditer(pattern, text_upper, re.MULTILINE):
                amount = self._parse_amount(match.group(1))
                if amount is not None and self._is_reasonable_total(amount):
                    return AmountMatch(
                        value=amount,
                        raw_match=match.group(0).strip(),
                        confidence=confidence,
                        pattern_name=name,
                        label=self._extract_label(name),
                    )

        # Fall back to the largest reasonable amount found anywhere.
        candidates = [
            found
            for found in self.extract_all_amounts(text)
            if self._is_reasonable_total(found.value)
        ]
        if not candidates:
            return None

        # max() keeps the first of several equal maxima, matching a stable
        # descending sort followed by taking element 0.
        best = max(candidates, key=lambda found: found.value)
        # Cap the confidence since this is only a guess.
        return AmountMatch(
            value=best.value,
            raw_match=best.raw_match,
            confidence=min(0.60, best.confidence),
            pattern_name="inferred_total",
            label="TOTAL",
        )

    def extract_all_amounts(self, text: str) -> list[AmountMatch]:
        """
        Extract all currency amounts from text.

        Total-pattern matches are listed first and are not de-duplicated
        against each other, so one printed amount can appear under several
        pattern names. Generic amounts follow, and a generic amount is dropped
        when an already-collected match has the same value (within 0.01).

        Args:
            text: Receipt text to search

        Returns:
            List of AmountMatch objects
        """
        matches: list[AmountMatch] = []
        if not text:
            return matches

        text_upper = text.upper()

        # Check total patterns
        for pattern, name, confidence in self.TOTAL_PATTERNS:
            for match in re.finditer(pattern, text_upper, re.MULTILINE):
                amount = self._parse_amount(match.group(1))
                if amount is None:
                    continue
                matches.append(
                    AmountMatch(
                        value=amount,
                        raw_match=match.group(0).strip(),
                        confidence=confidence,
                        pattern_name=name,
                        label=self._extract_label(name),
                    )
                )

        # Check generic amount patterns
        for pattern, name, confidence in self.AMOUNT_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                amount = self._parse_amount(match.group(1))
                if amount is None:
                    continue
                # Skip if already found by a more specific pattern
                if any(abs(seen.value - amount) < 0.01 for seen in matches):
                    continue
                matches.append(
                    AmountMatch(
                        value=amount,
                        raw_match=match.group(0).strip(),
                        confidence=confidence,
                        pattern_name=name,
                    )
                )

        return matches

    def _parse_amount(self, amount_str: str) -> Optional[float]:
        """Parse amount string to float, handling various formats.

        Args:
            amount_str: Numeric text such as "45.67", "1,234.56" or "1.234,56"

        Returns:
            The amount as a float, or None when the text is not a finite,
            non-negative number.
        """
        # Remove any spaces
        cleaned = amount_str.strip().replace(" ", "")

        # Handle European format (1.234,56) vs US format (1,234.56)
        if "," in cleaned and "." in cleaned:
            # Whichever separator comes last is the decimal separator.
            if cleaned.rfind(",") > cleaned.rfind("."):
                # European format
                cleaned = cleaned.replace(".", "").replace(",", ".")
            else:
                # US format
                cleaned = cleaned.replace(",", "")
        elif "," in cleaned:
            # Could be thousands separator or decimal
            parts = cleaned.split(",")
            if len(parts) == 2 and len(parts[1]) == 2:
                # Likely decimal separator
                cleaned = cleaned.replace(",", ".")
            else:
                # Likely thousands separator
                cleaned = cleaned.replace(",", "")

        try:
            parsed = Decimal(cleaned)
        except (InvalidOperation, ValueError):
            return None

        # Decimal accepts "Infinity" and "NaN"; neither is a usable amount.
        if not parsed.is_finite() or parsed < 0:
            return None
        return float(parsed)

    def _is_reasonable_total(self, amount: float) -> bool:
        """Check if amount is a reasonable total for a fuel receipt.

        The inclusive range comes from MIN_REASONABLE_TOTAL and
        MAX_REASONABLE_TOTAL ($1 to $500 by default).
        """
        return self.MIN_REASONABLE_TOTAL <= amount <= self.MAX_REASONABLE_TOTAL

    def _extract_label(self, pattern_name: str) -> str:
        """Extract display label from pattern name.

        Unknown pattern names fall back to "TOTAL".
        """
        return self._LABELS.get(pattern_name, "TOTAL")
|
|
|
|
|
|
# Singleton instance: one shared matcher, constructed when this module is
# imported.
currency_matcher = CurrencyPatternMatcher()
|