feat: add receipt OCR pipeline (refs #69)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped

Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount,
fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-01 20:43:30 -06:00
parent a2f0abb14c
commit 6319d50fb1
16 changed files with 2845 additions and 2 deletions

View File

@@ -0,0 +1,13 @@
"""Pattern matching modules for receipt field extraction."""
from app.patterns.date_patterns import DatePatternMatcher, date_matcher
from app.patterns.currency_patterns import CurrencyPatternMatcher, currency_matcher
from app.patterns.fuel_patterns import FuelPatternMatcher, fuel_matcher
# Public API of the package: each matcher class followed by the ready-made
# module-level instance imported alongside it above.
__all__ = [
"DatePatternMatcher",
"date_matcher",
"CurrencyPatternMatcher",
"currency_matcher",
"FuelPatternMatcher",
"fuel_matcher",
]

View File

@@ -0,0 +1,227 @@
"""Currency and amount pattern matching for receipt extraction."""
import re
from dataclasses import dataclass
from decimal import Decimal, InvalidOperation
from typing import Optional
@dataclass
class AmountMatch:
    """Result of currency/amount pattern matching."""

    value: float  # Parsed amount as a plain float
    raw_match: str  # Matched text with surrounding whitespace stripped
    confidence: float  # Confidence weight of the pattern that matched
    pattern_name: str  # Name of the pattern that matched
    label: Optional[str] = None  # e.g., "TOTAL", "SUBTOTAL"


class CurrencyPatternMatcher:
    """Extract and normalize currency amounts from receipt text."""

    # Total amount patterns as (regex, pattern name, confidence).
    # All of them are applied to upper-cased text with re.MULTILINE.
    TOTAL_PATTERNS = [
        # TOTAL $XX.XX or TOTAL: $XX.XX
        (
            r"(?:^|\s)TOTAL[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
            "total_explicit",
            0.98,
        ),
        # AMOUNT DUE $XX.XX
        (
            r"AMOUNT\s*DUE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "amount_due",
            0.95,
        ),
        # SALE $XX.XX
        (
            r"(?:^|\s)SALE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
            "sale_explicit",
            0.92,
        ),
        # GRAND TOTAL $XX.XX
        (
            r"GRAND\s*TOTAL[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "grand_total",
            0.97,
        ),
        # TOTAL SALE $XX.XX
        (
            r"TOTAL\s*SALE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "total_sale",
            0.96,
        ),
        # BALANCE DUE $XX.XX
        (
            r"BALANCE\s*DUE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "balance_due",
            0.94,
        ),
        # PURCHASE $XX.XX
        (
            r"(?:^|\s)PURCHASE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
            "purchase",
            0.88,
        ),
    ]

    # Generic amount patterns (lower priority)
    AMOUNT_PATTERNS = [
        # $XX.XX (standalone dollar amount)
        (
            r"\$\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "dollar_amount",
            0.60,
        ),
        # XX.XX (standalone decimal amount)
        (
            r"(?<![.$\d])(\d{1,6}[.,]\d{2})(?![.\d])",
            "decimal_amount",
            0.40,
        ),
    ]

    # Display labels keyed by pattern name; unknown names fall back to "TOTAL".
    _LABELS = {
        "total_explicit": "TOTAL",
        "amount_due": "AMOUNT DUE",
        "sale_explicit": "SALE",
        "grand_total": "GRAND TOTAL",
        "total_sale": "TOTAL SALE",
        "balance_due": "BALANCE DUE",
        "purchase": "PURCHASE",
    }

    def extract_total(self, text: str) -> Optional[AmountMatch]:
        """
        Extract the total amount from receipt text.

        Explicit total labels are tried from the highest declared confidence
        to the lowest, and every occurrence of a label is examined, so an
        implausible first hit (e.g. "TOTAL 0.00") no longer hides a later
        valid one. When no labelled amount qualifies, the largest reasonable
        amount found anywhere in the text is returned as a guess.

        Args:
            text: Receipt text to search

        Returns:
            AmountMatch for total or None if not found
        """
        text_upper = text.upper()

        # sorted() is stable: patterns with equal confidence keep list order.
        ranked = sorted(
            self.TOTAL_PATTERNS, key=lambda entry: entry[2], reverse=True
        )
        for pattern, name, confidence in ranked:
            for match in re.finditer(pattern, text_upper, re.MULTILINE):
                amount = self._parse_amount(match.group(1))
                if amount is not None and self._is_reasonable_total(amount):
                    return self._labeled_match(match, amount, name, confidence)

        # Fall back to the largest reasonable amount on the receipt.
        candidates = [
            found
            for found in self.extract_all_amounts(text)
            if self._is_reasonable_total(found.value)
        ]
        if not candidates:
            return None

        best = max(candidates, key=lambda found: found.value)
        return AmountMatch(
            value=best.value,
            raw_match=best.raw_match,
            # Capped because the total is inferred, not labelled.
            confidence=min(0.60, best.confidence),
            pattern_name="inferred_total",
            label="TOTAL",
        )

    def extract_all_amounts(self, text: str) -> list[AmountMatch]:
        """
        Extract all currency amounts from text.

        Labelled (total-style) amounts are collected first. Generic amounts
        are appended only when no already-collected amount has the same
        value (within one cent).

        Args:
            text: Receipt text to search

        Returns:
            List of AmountMatch objects
        """
        matches: list[AmountMatch] = []
        text_upper = text.upper()

        # Check total patterns
        for pattern, name, confidence in self.TOTAL_PATTERNS:
            for match in re.finditer(pattern, text_upper, re.MULTILINE):
                amount = self._parse_amount(match.group(1))
                if amount is not None:
                    matches.append(
                        self._labeled_match(match, amount, name, confidence)
                    )

        # Check generic amount patterns
        for pattern, name, confidence in self.AMOUNT_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                amount = self._parse_amount(match.group(1))
                if amount is None:
                    continue
                # Skip values already reported by an earlier pattern.
                if any(abs(seen.value - amount) < 0.01 for seen in matches):
                    continue
                matches.append(
                    AmountMatch(
                        value=amount,
                        raw_match=match.group(0).strip(),
                        confidence=confidence,
                        pattern_name=name,
                    )
                )
        return matches

    def _labeled_match(
        self, match: re.Match, amount: float, name: str, confidence: float
    ) -> AmountMatch:
        """Build an AmountMatch for a hit produced by one of TOTAL_PATTERNS."""
        return AmountMatch(
            value=amount,
            raw_match=match.group(0).strip(),
            confidence=confidence,
            pattern_name=name,
            label=self._extract_label(name),
        )

    def _parse_amount(self, amount_str: str) -> Optional[float]:
        """
        Parse amount string to float, handling various formats.

        Supported shapes: US ("1,234.56"), European ("1.234,56"), a lone
        decimal comma ("12,34") and a lone thousands comma ("1,234"). When
        the same separator repeats ("1.234.56", "1,234,56" - typical OCR
        confusions) the last one is the decimal point if exactly two digits
        follow it; otherwise every separator is a thousands mark.

        Returns:
            The non-negative, finite amount, or None if the string cannot
            be parsed as one.
        """
        # Remove any spaces
        cleaned = amount_str.strip().replace(" ", "")

        has_comma = "," in cleaned
        dot_count = cleaned.count(".")

        if has_comma and dot_count:
            # Whichever separator comes last is the decimal separator.
            if cleaned.rfind(",") > cleaned.rfind("."):
                # European format
                cleaned = cleaned.replace(".", "").replace(",", ".")
            else:
                # US format
                cleaned = cleaned.replace(",", "")
        elif has_comma:
            head, _, tail = cleaned.rpartition(",")
            if len(tail) == 2:
                # Two trailing digits: the last comma is a decimal separator.
                cleaned = f"{head.replace(',', '')}.{tail}"
            else:
                # Likely thousands separator
                cleaned = cleaned.replace(",", "")
        elif dot_count > 1:
            head, _, tail = cleaned.rpartition(".")
            if len(tail) == 2:
                cleaned = f"{head.replace('.', '')}.{tail}"
            else:
                cleaned = cleaned.replace(".", "")

        try:
            parsed = Decimal(cleaned)
        except (InvalidOperation, ValueError):
            return None

        # Decimal also accepts "NaN" and "Infinity"; neither is an amount.
        if not parsed.is_finite() or parsed < 0:
            return None
        return float(parsed)

    def _is_reasonable_total(self, amount: float) -> bool:
        """Check if amount is a reasonable total for a fuel receipt."""
        # Reasonable range: $1 to $500 for typical fuel purchases
        return 1.0 <= amount <= 500.0

    def _extract_label(self, pattern_name: str) -> str:
        """Extract display label from pattern name."""
        return self._LABELS.get(pattern_name, "TOTAL")


# Singleton instance
currency_matcher = CurrencyPatternMatcher()

View File

@@ -0,0 +1,186 @@
"""Date pattern matching for receipt extraction."""
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
@dataclass
class DateMatch:
    """Result of date pattern matching."""

    value: str  # ISO format YYYY-MM-DD
    raw_match: str  # Original text matched
    confidence: float  # Pattern confidence, boosted when a date keyword is nearby
    pattern_name: str  # Name of the pattern that matched


class DatePatternMatcher:
    """Extract and normalize dates from receipt text."""

    # Pattern definitions with named groups and confidence weights.
    # Numeric patterns start with (?<!\d) so they cannot begin inside a
    # longer digit run: without it "2024-01-15" also matched as "4-01-15"
    # (2015-04-01). Years are four digits, or two digits not followed by
    # another digit, which rules out three-digit "years".
    PATTERNS = [
        # MM/DD/YYYY or MM/DD/YY (most common US format)
        (
            r"(?<!\d)(?P<month>\d{1,2})/(?P<day>\d{1,2})/(?P<year>\d{4}|\d{2}(?!\d))",
            "mm_dd_yyyy",
            0.95,
        ),
        # MM-DD-YYYY or MM-DD-YY
        (
            r"(?<!\d)(?P<month>\d{1,2})-(?P<day>\d{1,2})-(?P<year>\d{4}|\d{2}(?!\d))",
            "mm_dd_yyyy_dash",
            0.90,
        ),
        # YYYY-MM-DD (ISO format)
        (
            r"(?<!\d)(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})",
            "iso_date",
            0.98,
        ),
        # Mon DD, YYYY (e.g., Jan 15, 2024; also January 15, 2024 / Jan. 15 2024)
        (
            r"(?P<month_name>[A-Za-z]{3,9})\.?\s+(?P<day>\d{1,2}),?\s+(?P<year>\d{4})",
            "month_name_long",
            0.85,
        ),
        # DD Mon YYYY (e.g., 15 Jan 2024; also 15 September 2024)
        (
            r"(?<!\d)(?P<day>\d{1,2})\s+(?P<month_name>[A-Za-z]{3,9})\.?\s+(?P<year>\d{4})",
            "day_month_year",
            0.85,
        ),
        # MMDDYYYY or MMDDYY (no separators, common in some POS systems)
        (
            r"(?<!\d)(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{2,4})(?!\d)",
            "compact_date",
            0.70,
        ),
    ]

    MONTH_NAMES = {
        "jan": 1, "january": 1,
        "feb": 2, "february": 2,
        "mar": 3, "march": 3,
        "apr": 4, "april": 4,
        "may": 5,
        "jun": 6, "june": 6,
        "jul": 7, "july": 7,
        "aug": 8, "august": 8,
        "sep": 9, "sept": 9, "september": 9,
        "oct": 10, "october": 10,
        "nov": 11, "november": 11,
        "dec": 12, "december": 12,
    }

    # Upper-case keywords whose presence near a date raises its confidence.
    _DATE_KEYWORDS = ("DATE", "TIME", "TRANS", "SALE")

    def extract_dates(self, text: str) -> list[DateMatch]:
        """
        Extract all date patterns from text.

        Args:
            text: Receipt text to search

        Returns:
            List of DateMatch objects sorted by confidence (highest first),
            with at most one entry per distinct ISO date value
        """
        matches: list[DateMatch] = []
        for pattern, name, base_confidence in self.PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                parsed = self._parse_match(match, name)
                if parsed is None:
                    continue
                year, month, day = parsed
                if not self._is_valid_date(year, month, day):
                    continue
                # The original text is passed (not an upper-cased copy) so
                # match.start() indexes the string it was computed on.
                confidence = self._adjust_confidence(
                    base_confidence, text, match.start()
                )
                matches.append(
                    DateMatch(
                        value=f"{year:04d}-{month:02d}-{day:02d}",
                        raw_match=match.group(0),
                        confidence=confidence,
                        pattern_name=name,
                    )
                )

        # Sort by confidence, then keep the best entry for each date value.
        matches.sort(key=lambda found: found.confidence, reverse=True)
        seen: set[str] = set()
        unique_matches: list[DateMatch] = []
        for found in matches:
            if found.value not in seen:
                seen.add(found.value)
                unique_matches.append(found)
        return unique_matches

    def extract_best_date(self, text: str) -> Optional[DateMatch]:
        """
        Extract the most likely transaction date.

        Args:
            text: Receipt text to search

        Returns:
            Best DateMatch or None if no date found
        """
        matches = self.extract_dates(text)
        return matches[0] if matches else None

    def _parse_match(
        self, match: re.Match, pattern_name: str
    ) -> Optional[tuple[int, int, int]]:
        """
        Parse regex match into year, month, day tuple.

        Returns None when a month-name pattern captured a word that is not
        a known month. Two-digit years below 50 map to 20xx, the rest to 19xx.
        """
        groups = match.groupdict()

        # Handle month name patterns
        if "month_name" in groups:
            month = self._lookup_month(groups["month_name"])
            if month is None:
                return None
        else:
            month = int(groups["month"])

        day = int(groups["day"])
        year = int(groups["year"])

        # Normalize 2-digit years
        if year < 100:
            year = 2000 + year if year < 50 else 1900 + year
        return year, month, day

    def _lookup_month(self, word: str) -> Optional[int]:
        """
        Resolve a captured word to a month number.

        The longest suffix of the word that is a known month name wins, so
        "January", "Sept" and OCR-glued text such as "DATEJAN" all resolve.
        """
        lowered = word.lower()
        for start in range(len(lowered) - 2):
            month = self.MONTH_NAMES.get(lowered[start:])
            if month is not None:
                return month
        return None

    def _is_valid_date(self, year: int, month: int, day: int) -> bool:
        """Check if date components form a valid date."""
        try:
            datetime(year=year, month=month, day=day)
        except ValueError:
            return False
        # Reasonable year range for receipts
        return 2000 <= year <= 2100

    def _adjust_confidence(
        self, base_confidence: float, text: str, position: int
    ) -> float:
        """
        Adjust confidence based on context clues.

        Boost confidence by 0.05 (capped at 1.0) if a date-related keyword
        appears within 50 characters on either side of ``position``.

        Args:
            base_confidence: Confidence weight of the pattern that matched
            text: Text the match was found in (any letter case)
            position: Start offset of the match within ``text``
        """
        context_start = max(0, position - 50)
        # Upper-case only the slice: upper-casing the whole text first can
        # change its length and shift the offsets.
        context = text[context_start:position + 50].upper()
        if any(keyword in context for keyword in self._DATE_KEYWORDS):
            return min(1.0, base_confidence + 0.05)
        return base_confidence


# Singleton instance
date_matcher = DatePatternMatcher()

View File

@@ -0,0 +1,364 @@
"""Fuel-specific pattern matching for receipt extraction."""
import re
from dataclasses import dataclass
from typing import Optional
@dataclass
class FuelQuantityMatch:
    """Result of fuel quantity pattern matching."""

    value: float  # Gallons or liters
    unit: str  # "GAL" or "L"
    raw_match: str  # Text span the pattern matched
    confidence: float  # Confidence weight of the pattern that matched
    pattern_name: str  # Name of the pattern that matched


@dataclass
class FuelPriceMatch:
    """Result of fuel price per unit pattern matching."""

    value: float  # Price per unit
    unit: str  # "GAL" or "L"
    raw_match: str  # Text span the pattern matched
    confidence: float  # Confidence weight of the pattern that matched
    pattern_name: str  # Name of the pattern that matched


@dataclass
class FuelGradeMatch:
    """Result of fuel grade pattern matching."""

    value: str  # e.g., "87", "89", "93", "DIESEL"
    display_name: str  # e.g., "Regular 87", "Premium 93"
    raw_match: str  # Text span the pattern matched
    confidence: float  # Confidence weight of the pattern that matched


class FuelPatternMatcher:
    """Extract fuel-specific data from receipt text."""

    # Gallons patterns (applied to upper-cased text)
    GALLONS_PATTERNS = [
        # XX.XXX GAL or XX.XXX GALLONS. Number and unit must share a line,
        # and the unit may not run on into a longer word.
        (
            r"(?<![\d.])(\d{1,3}\.\d{1,3})[ \t]*GAL(?:LON)?S?(?![A-Z])",
            "gallons_suffix",
            0.95,
        ),
        # GALLONS: XX.XXX or GAL: XX.XXX. The lookbehinds reject per-unit
        # price labels such as "PRICE/GAL", "$ / GAL" and "PER GAL".
        (
            r"(?<![A-Z/])(?<!/\s)(?<!PER\s)GAL(?:LON)?S?[:\s]+(\d{1,3}\.\d{1,3})(?!\d)",
            "gallons_prefix",
            0.93,
        ),
        # VOLUME XX.XXX
        (
            r"VOLUME[:\s]+(\d{1,3}\.\d{1,3})",
            "volume",
            0.85,
        ),
        # QTY XX.XXX (near fuel context)
        (
            r"QTY[:\s]+(\d{1,3}\.\d{1,3})",
            "qty",
            0.70,
        ),
    ]

    # Liters patterns (for international receipts). The unit is L, LTR(S),
    # LITER(S) or LITRE(S) and must stand alone: previously any word
    # starting with "L" ("12.50 LOYALTY") or ending in "L" ("TOTAL 45.00")
    # was read as a liter quantity.
    LITERS_PATTERNS = [
        # XX.XX L or XX.XX LITERS
        (
            r"(?<![\d.])(\d{1,3}\.\d{1,3})[ \t]*(?:LIT(?:ER|RE)S?|LTRS?|L)(?![A-Z])",
            "liters_suffix",
            0.95,
        ),
        # LITERS: XX.XX
        (
            r"(?<![A-Z/])(?<!/\s)(?<!PER\s)(?:LIT(?:ER|RE)S?|LTRS?|L)[:\s]+(\d{1,3}\.\d{1,3})(?!\d)",
            "liters_prefix",
            0.93,
        ),
    ]

    # Price per gallon patterns
    PRICE_PER_UNIT_PATTERNS = [
        # $X.XXX/GAL or $X.XX/GAL
        (
            r"\$?\s*(\d{1,2}\.\d{2,3})\s*/\s*GAL",
            "price_per_gal",
            0.98,
        ),
        # PRICE/GAL $X.XXX
        (
            r"PRICE\s*/\s*GAL[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "labeled_price_gal",
            0.96,
        ),
        # UNIT PRICE $X.XXX
        (
            r"UNIT\s*PRICE[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "unit_price",
            0.90,
        ),
        # @ $X.XXX (per unit implied)
        (
            r"@\s*\$?\s*(\d{1,2}\.\d{2,3})",
            "at_price",
            0.85,
        ),
        # PPG $X.XXX (price per gallon)
        (
            r"PPG[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "ppg",
            0.92,
        ),
    ]

    # Fuel grade patterns, tried in this order. Labelled grades come first,
    # then diesel and E85, and the bare octane number last: it is the
    # weakest signal and used to pre-empt "DIESEL" whenever the receipt
    # contained 87/89/91/93 anywhere (e.g. in "$45.87").
    GRADE_PATTERNS = [
        # REGULAR 87, REG 87
        (r"(?:REGULAR|REG)\s*(\d{2})(?![.,]?\d)", "regular", 0.95),
        # UNLEADED 87
        (r"UNLEADED\s*(\d{2})(?![.,]?\d)", "unleaded", 0.93),
        # PLUS 89, MID 89, MIDGRADE 89
        (r"(?:PLUS|MID(?:GRADE)?)\s*(\d{2})(?![.,]?\d)", "plus", 0.95),
        # PREMIUM 91/93, PREM 91/93, SUPER 91/93
        (r"(?:PREMIUM|PREM|SUPER)\s*(\d{2})(?![.,]?\d)", "premium", 0.95),
        # DIESEL (no octane)
        (r"DIESEL(?:\s*#?\d)?", "diesel", 0.98),
        # E85 (ethanol blend); must not be the tail of a word ("SALE 85.00")
        (r"(?<![A-Z0-9])E[\s-]?85(?![.,]?\d)", "e85", 0.95),
        # Just the octane number (87, 89, 91, 93), not embedded in an
        # amount, date, time or longer number
        (
            r"(?<![\d.,$/:#-])(87|89|91|93)(?![.,]?\d)(?![/:-])\s*(?:OCT(?:ANE)?)?",
            "octane_only",
            0.75,
        ),
    ]

    # Common gas station names
    STATION_NAMES = [
        "SHELL", "CHEVRON", "EXXON", "MOBIL", "BP", "SUNOCO", "76",
        "CIRCLE K", "SPEEDWAY", "WAWA", "SHEETZ", "CASEY", "PILOT",
        "FLYING J", "LOVES", "TA", "PETRO", "MARATHON", "CITGO",
        "VALERO", "MURPHY", "COSTCO", "SAMS CLUB", "SAM'S CLUB",
        "KROGER", "QT", "QUIKTRIP", "RACETRAC", "KUM & GO",
        "KWIK TRIP", "HOLIDAY", "SINCLAIR", "CONOCO", "PHILLIPS 66",
        "ARCO", "AMPM", "AM/PM", "7-ELEVEN", "7 ELEVEN", "GETTY",
        "GULF", "HESS", "TEXACO", "TURKEY HILL", "CUMBERLAND FARMS",
    ]

    def extract_gallons(self, text: str) -> Optional[FuelQuantityMatch]:
        """
        Extract fuel quantity in gallons.

        Args:
            text: Receipt text to search

        Returns:
            FuelQuantityMatch or None
        """
        hit = self._first_reasonable(
            self.GALLONS_PATTERNS, text.upper(), self._is_reasonable_quantity
        )
        if hit is None:
            return None
        match, quantity, name, confidence = hit
        return FuelQuantityMatch(
            value=quantity,
            unit="GAL",
            raw_match=match.group(0),
            confidence=confidence,
            pattern_name=name,
        )

    def extract_liters(self, text: str) -> Optional[FuelQuantityMatch]:
        """
        Extract fuel quantity in liters.

        Args:
            text: Receipt text to search

        Returns:
            FuelQuantityMatch or None
        """
        hit = self._first_reasonable(
            self.LITERS_PATTERNS,
            text.upper(),
            lambda quantity: self._is_reasonable_quantity(
                quantity, is_liters=True
            ),
        )
        if hit is None:
            return None
        match, quantity, name, confidence = hit
        return FuelQuantityMatch(
            value=quantity,
            unit="L",
            raw_match=match.group(0),
            confidence=confidence,
            pattern_name=name,
        )

    def extract_quantity(self, text: str) -> Optional[FuelQuantityMatch]:
        """
        Extract fuel quantity (gallons or liters).

        Prefers gallons for US receipts.

        Args:
            text: Receipt text to search

        Returns:
            FuelQuantityMatch or None
        """
        # Try gallons first (more common in US), then fall back to liters.
        return self.extract_gallons(text) or self.extract_liters(text)

    def extract_price_per_unit(self, text: str) -> Optional[FuelPriceMatch]:
        """
        Extract price per gallon/liter.

        Args:
            text: Receipt text to search

        Returns:
            FuelPriceMatch or None
        """
        hit = self._first_reasonable(
            self.PRICE_PER_UNIT_PATTERNS, text.upper(), self._is_reasonable_price
        )
        if hit is None:
            return None
        match, price, name, confidence = hit
        return FuelPriceMatch(
            value=price,
            unit="GAL",  # Default to gallons for US
            raw_match=match.group(0),
            confidence=confidence,
            pattern_name=name,
        )

    def extract_grade(self, text: str) -> Optional[FuelGradeMatch]:
        """
        Extract fuel grade (octane rating or diesel).

        Patterns are tried in GRADE_PATTERNS order. For octane patterns,
        occurrences whose number is not a plausible octane rating (such as
        the register number in "REG 01") are skipped.

        Args:
            text: Receipt text to search

        Returns:
            FuelGradeMatch or None
        """
        text_upper = text.upper()
        for pattern, name, confidence in self.GRADE_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                if name == "diesel":
                    return FuelGradeMatch(
                        value="DIESEL",
                        display_name="Diesel",
                        raw_match=match.group(0),
                        confidence=confidence,
                    )
                if name == "e85":
                    return FuelGradeMatch(
                        value="E85",
                        display_name="E85 Ethanol",
                        raw_match=match.group(0),
                        confidence=confidence,
                    )
                octane = match.group(1)
                if not self._is_reasonable_octane(octane):
                    continue
                return FuelGradeMatch(
                    value=octane,
                    display_name=self._get_grade_display_name(octane, name),
                    raw_match=match.group(0),
                    confidence=confidence,
                )
        return None

    def extract_merchant_name(self, text: str) -> Optional[tuple[str, float]]:
        """
        Extract gas station/merchant name.

        Known station names are matched as whole tokens, so "TA" no longer
        fires on "TOTAL" and "76" no longer fires on "$45.76". The first
        line of at most 50 characters containing the station is returned
        with confidence 0.90; if every such line is longer, the bare
        station name is returned with 0.85. Otherwise the first non-empty
        line is returned with confidence 0.50, unless it starts like a
        date or number.

        Args:
            text: Receipt text to search

        Returns:
            Tuple of (merchant_name, confidence) or None
        """
        lines = text.split("\n")

        # Check for known station names
        for station in self.STATION_NAMES:
            station_re = self._station_pattern(station)
            hits = [
                line.strip()
                for line in lines
                if re.search(station_re, line.upper())
            ]
            if not hits:
                continue
            for cleaned in hits:
                if len(cleaned) <= 50:  # Reasonable length
                    return (cleaned, 0.90)
            return (station.title(), 0.85)

        # Fall back to first non-empty line (often the merchant)
        non_empty = [line.strip() for line in lines if line.strip()]
        if non_empty:
            first_line = non_empty[0]
            # Skip if it looks like a date or number
            if not re.match(r"^\d+[/\-.]", first_line):
                return (first_line[:50], 0.50)  # Low confidence
        return None

    def _first_reasonable(self, patterns, text_upper: str, is_reasonable):
        """
        Find the first plausible numeric hit among ``patterns``.

        Patterns are tried in order and every occurrence of each pattern is
        examined, so an implausible first occurrence does not hide a later
        valid one.

        Args:
            patterns: Sequence of (regex, pattern name, confidence) tuples
                whose first capture group is a decimal number
            text_upper: Upper-cased receipt text
            is_reasonable: Callable taking the parsed float and returning
                True when the value is acceptable

        Returns:
            Tuple (match, value, pattern name, confidence) or None
        """
        for pattern, name, confidence in patterns:
            for match in re.finditer(pattern, text_upper):
                value = float(match.group(1))
                if is_reasonable(value):
                    return match, value, name, confidence
        return None

    @staticmethod
    def _station_pattern(station: str) -> str:
        """Build a regex matching ``station`` only as a standalone token."""
        return (
            # Not glued to a preceding word/number, and not the tail of an
            # amount, date, time or store number ("45.76", "1/76", "#76").
            r"(?<![A-Z0-9$#/:])(?<!\d[.,])"
            + re.escape(station)
            # Not the start of a longer word/number or of a decimal amount.
            + r"(?![A-Z0-9])(?![.,]\d)"
        )

    def _is_reasonable_quantity(
        self, quantity: float, is_liters: bool = False
    ) -> bool:
        """Check if fuel quantity is reasonable."""
        if is_liters:
            # Typical fill: 20-100 liters
            return 0.5 <= quantity <= 150.0
        # Typical fill: 5-30 gallons
        return 0.1 <= quantity <= 50.0

    def _is_reasonable_price(self, price: float) -> bool:
        """Check if price per unit is reasonable."""
        # US gas prices: $1.50 - $8.00 per gallon (allowing for fluctuation)
        return 1.00 <= price <= 10.00

    def _is_reasonable_octane(self, octane: str) -> bool:
        """Check if a two-digit string is a plausible octane rating."""
        # Deliberately wide window; its job is to reject numbers such as
        # "01" or "12" that follow a grade-like word for another reason.
        return 80 <= int(octane) <= 99

    def _get_grade_display_name(self, octane: str, pattern_name: str) -> str:
        """Get display name for fuel grade."""
        grade_names = {
            "87": "Regular 87",
            "89": "Plus 89",
            "91": "Premium 91",
            "93": "Premium 93",
        }
        if octane in grade_names:
            return grade_names[octane]
        # Use pattern hint
        if pattern_name == "premium":
            return f"Premium {octane}"
        if pattern_name == "plus":
            return f"Plus {octane}"
        return f"Unleaded {octane}"


# Singleton instance
fuel_matcher = FuelPatternMatcher()