Files
motovaultpro/ocr/app/patterns/currency_patterns.py
Eric Gullickson 6319d50fb1
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
feat: add receipt OCR pipeline (refs #69)
Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount,
fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 20:43:30 -06:00

228 lines
7.5 KiB
Python

"""Currency and amount pattern matching for receipt extraction."""
import re
from dataclasses import dataclass
from decimal import Decimal, InvalidOperation
from typing import Optional
@dataclass
class AmountMatch:
    """Result of currency/amount pattern matching.

    Attributes:
        value: Parsed numeric amount.
        raw_match: Matched text, stripped of surrounding whitespace. Matching
            runs on the upper-cased input, so this text is upper-case.
        confidence: Heuristic score attached to the pattern that matched.
        pattern_name: Identifier of the pattern that matched, or
            "inferred_total" when the total was guessed from generic amounts.
        label: Display label for labelled matches; None for generic amounts.
    """

    value: float
    raw_match: str
    confidence: float
    pattern_name: str
    label: Optional[str] = None  # e.g., "TOTAL", "SUBTOTAL"


class CurrencyPatternMatcher:
    """Extract and normalize currency amounts from receipt text."""

    # Digits of one amount: optional thousands group, then a separator and
    # exactly two decimals. Either "," or "." is accepted in both positions.
    _AMOUNT = r"\d{1,6}[,.]?\d{0,3}[.,]\d{2}"

    # Inclusive bounds of a plausible total for a fuel purchase, in dollars.
    MIN_REASONABLE_TOTAL = 1.0
    MAX_REASONABLE_TOTAL = 500.0

    # Total amount patterns (prioritized). Compound labels come before the
    # single-word labels they contain; otherwise "TOTAL SALE" is reported as
    # "SALE" and "GRAND TOTAL" as "TOTAL".
    TOTAL_PATTERNS = [
        # GRAND TOTAL $XX.XX
        (
            rf"GRAND\s*TOTAL[:\s]*\$?\s*({_AMOUNT})(?!\d)",
            "grand_total",
            0.97,
        ),
        # TOTAL SALE $XX.XX
        (
            rf"TOTAL\s*SALE[:\s]*\$?\s*({_AMOUNT})(?!\d)",
            "total_sale",
            0.96,
        ),
        # TOTAL $XX.XX or TOTAL: $XX.XX (but not "SUB TOTAL")
        (
            rf"(?<!\S)(?<!SUB\s)TOTAL[:\s]*\$?\s*({_AMOUNT})(?!\S)",
            "total_explicit",
            0.98,
        ),
        # AMOUNT DUE $XX.XX
        (
            rf"AMOUNT\s*DUE[:\s]*\$?\s*({_AMOUNT})(?!\d)",
            "amount_due",
            0.95,
        ),
        # BALANCE DUE $XX.XX
        (
            rf"BALANCE\s*DUE[:\s]*\$?\s*({_AMOUNT})(?!\d)",
            "balance_due",
            0.94,
        ),
        # SALE $XX.XX
        (
            rf"(?<!\S)SALE[:\s]*\$?\s*({_AMOUNT})(?!\S)",
            "sale_explicit",
            0.92,
        ),
        # PURCHASE $XX.XX
        (
            rf"(?<!\S)PURCHASE[:\s]*\$?\s*({_AMOUNT})(?!\S)",
            "purchase",
            0.88,
        ),
    ]

    # Generic amount patterns (lower priority)
    AMOUNT_PATTERNS = [
        # $XX.XX (standalone dollar amount); the trailing guard stops
        # "$3.459" from being truncated to 3.45.
        (
            rf"\$\s*({_AMOUNT})(?!\d)",
            "dollar_amount",
            0.60,
        ),
        # XX.XX (standalone decimal amount)
        (
            r"(?<![.$\d])(\d{1,6}[.,]\d{2})(?![.\d])",
            "decimal_amount",
            0.40,
        ),
    ]

    # Display label for each total pattern name.
    _LABELS = {
        "total_explicit": "TOTAL",
        "amount_due": "AMOUNT DUE",
        "sale_explicit": "SALE",
        "grand_total": "GRAND TOTAL",
        "total_sale": "TOTAL SALE",
        "balance_due": "BALANCE DUE",
        "purchase": "PURCHASE",
    }

    def extract_total(self, text: str) -> Optional[AmountMatch]:
        """
        Extract the total amount from receipt text.

        Prioritizes explicit total patterns over generic amounts. Every
        occurrence of a pattern is examined, so an out-of-range hit (for
        example "TOTAL 0.00") does not hide a later valid one.

        Args:
            text: Receipt text to search

        Returns:
            AmountMatch for total or None if not found
        """
        if not text:
            return None
        text_upper = text.upper()

        # Try total-specific patterns first
        for pattern, name, confidence in self.TOTAL_PATTERNS:
            # MULTILINE is kept so patterns that use ^/$ behave as before.
            for match in re.finditer(pattern, text_upper, re.MULTILINE):
                amount = self._parse_amount(match.group(1))
                if amount is not None and self._is_reasonable_total(amount):
                    return AmountMatch(
                        value=amount,
                        raw_match=match.group(0).strip(),
                        confidence=confidence,
                        pattern_name=name,
                        label=self._extract_label(name),
                    )

        # Fall back to finding the largest reasonable amount
        reasonable = [
            candidate
            for candidate in self.extract_all_amounts(text)
            if self._is_reasonable_total(candidate.value)
        ]
        if not reasonable:
            return None

        # Assume largest amount is the total (first one wins on ties)
        best = max(reasonable, key=lambda candidate: candidate.value)
        # Lower confidence since we're guessing
        return AmountMatch(
            value=best.value,
            raw_match=best.raw_match,
            confidence=min(0.60, best.confidence),
            pattern_name="inferred_total",
            label="TOTAL",
        )

    def extract_all_amounts(self, text: str) -> list[AmountMatch]:
        """
        Extract all currency amounts from text.

        Labelled (total) patterns are applied first, then the generic ones.
        A candidate whose digits overlap an amount that was already reported
        is dropped, so one number is never reported twice or in fragments
        (for example 234.56 out of "$1,234.56").

        Args:
            text: Receipt text to search

        Returns:
            List of AmountMatch objects
        """
        if not text:
            return []

        matches: list[AmountMatch] = []
        claimed: list[tuple[int, int]] = []  # digit spans already reported
        text_upper = text.upper()

        # Check total patterns
        for pattern, name, confidence in self.TOTAL_PATTERNS:
            for match in re.finditer(pattern, text_upper, re.MULTILINE):
                span = match.span(1)
                if self._overlaps(span, claimed):
                    continue
                amount = self._parse_amount(match.group(1))
                if amount is None:
                    continue
                claimed.append(span)
                matches.append(
                    AmountMatch(
                        value=amount,
                        raw_match=match.group(0).strip(),
                        confidence=confidence,
                        pattern_name=name,
                        label=self._extract_label(name),
                    )
                )

        # Check generic amount patterns
        for pattern, name, confidence in self.AMOUNT_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                span = match.span(1)
                if self._overlaps(span, claimed):
                    continue
                amount = self._parse_amount(match.group(1))
                if amount is None:
                    continue
                claimed.append(span)
                # Skip if already found by a more specific pattern. Generic
                # hits are also collapsed by value, as they always were, so
                # callers keep seeing each distinct generic amount once.
                if any(abs(found.value - amount) < 0.01 for found in matches):
                    continue
                matches.append(
                    AmountMatch(
                        value=amount,
                        raw_match=match.group(0).strip(),
                        confidence=confidence,
                        pattern_name=name,
                    )
                )

        return matches

    @staticmethod
    def _overlaps(span: tuple[int, int], claimed: list[tuple[int, int]]) -> bool:
        """Return True when span shares at least one character with a claimed span."""
        start, end = span
        return any(start < other_end and other_start < end
                   for other_start, other_end in claimed)

    def _parse_amount(self, amount_str: str) -> Optional[float]:
        """Parse amount string to float, handling various formats.

        Args:
            amount_str: Amount text such as "45.00", "1,234.56" or "1.234,56"

        Returns:
            The non-negative, finite amount, or None if it cannot be parsed
        """
        # Remove any spaces
        cleaned = amount_str.strip().replace(" ", "")

        last_sep = max(cleaned.rfind(","), cleaned.rfind("."))
        if last_sep != -1 and len(cleaned) - last_sep == 3:
            # Exactly two characters follow the last separator, so it is the
            # decimal point; every earlier "," or "." is a grouping mark.
            # This covers 1,234.56, 1.234,56, 45,00 and OCR mix-ups such as
            # 1.234.56 or 1,234,56.
            whole = cleaned[:last_sep].replace(",", "").replace(".", "")
            cleaned = f"{whole}.{cleaned[last_sep + 1:]}"
        elif "," in cleaned and "." in cleaned:
            # Determine which is decimal separator (last one)
            if cleaned.rfind(",") > cleaned.rfind("."):
                # European format
                cleaned = cleaned.replace(".", "").replace(",", ".")
            else:
                # US format
                cleaned = cleaned.replace(",", "")
        elif "," in cleaned:
            # Commas without a two-digit tail are thousands separators
            cleaned = cleaned.replace(",", "")

        try:
            value = Decimal(cleaned)
        except (InvalidOperation, ValueError):
            return None

        # Decimal also accepts "NaN" and "Infinity"; neither is an amount.
        if not value.is_finite() or value < 0:
            return None
        return float(value)

    def _is_reasonable_total(self, amount: float) -> bool:
        """Check if amount is a reasonable total for a fuel receipt."""
        # Reasonable range: $1 to $500 for typical fuel purchases
        return self.MIN_REASONABLE_TOTAL <= amount <= self.MAX_REASONABLE_TOTAL

    def _extract_label(self, pattern_name: str) -> str:
        """Extract display label from pattern name ("TOTAL" when unknown)."""
        return self._LABELS.get(pattern_name, "TOTAL")
# Singleton instance, created once at import time. CurrencyPatternMatcher
# defines no instance attributes, so a single shared object can serve every call.
currency_matcher = CurrencyPatternMatcher()