feat: add receipt OCR pipeline (refs #69)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped

Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount,
fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-01 20:43:30 -06:00
parent a2f0abb14c
commit 6319d50fb1
16 changed files with 2845 additions and 2 deletions

View File

@@ -0,0 +1,364 @@
"""Fuel-specific pattern matching for receipt extraction."""
import re
from dataclasses import dataclass
from typing import Optional
@dataclass
class FuelQuantityMatch:
    """A fuel volume recognised in receipt text.

    Attributes:
        value: Amount of fuel, expressed in ``unit``.
        unit: Unit of ``value``: ``"GAL"`` or ``"L"``.
        raw_match: The exact text span the pattern matched.
        confidence: Confidence score attached to the matching pattern.
        pattern_name: Name of the pattern that produced the match.
    """

    value: float
    unit: str
    raw_match: str
    confidence: float
    pattern_name: str
@dataclass
class FuelPriceMatch:
    """A per-unit fuel price recognised in receipt text.

    Attributes:
        value: Price for one ``unit`` of fuel.
        unit: Unit the price refers to: ``"GAL"`` or ``"L"``.
        raw_match: The exact text span the pattern matched.
        confidence: Confidence score attached to the matching pattern.
        pattern_name: Name of the pattern that produced the match.
    """

    value: float
    unit: str
    raw_match: str
    confidence: float
    pattern_name: str
@dataclass
class FuelGradeMatch:
    """A fuel grade recognised in receipt text.

    Attributes:
        value: Grade code, e.g. ``"87"``, ``"89"``, ``"93"`` or ``"DIESEL"``.
        display_name: Human-readable label, e.g. ``"Regular 87"`` or
            ``"Premium 93"``.
        raw_match: The exact text span the pattern matched.
        confidence: Confidence score attached to the matching pattern.
    """

    value: str
    display_name: str
    raw_match: str
    confidence: float
class FuelPatternMatcher:
    """Extract fuel-specific data from receipt text.

    Each ``*_PATTERNS`` table holds ``(regex, pattern_name, confidence)``
    tuples.  Receipt text is upper-cased before matching, so the expressions
    only deal with upper-case labels.  Tables are scanned top to bottom and
    the first plausible hit wins, so more specific patterns are listed ahead
    of looser ones.
    """

    # Guard placed in front of a unit label that precedes a number.  The label
    # must start a word and must not be the tail of a price label such as
    # "PRICE/GAL" or "PRICE PER GAL".  Without it "TOTAL 45.00" reads as
    # 45 liters and "PRICE/GAL 3.459" reads as 3.459 gallons.
    _UNIT_LABEL_START = r"(?<![A-Z/])(?<!/\s)(?<!\bPER\s)"

    # Spellings accepted for the liter unit: L, LT, LTR(S), LITER(S), LITRE(S).
    _LITER_UNIT = r"L(?:ITERS?|ITRES?|TRS?|TS?)?"

    # Gallons patterns
    GALLONS_PATTERNS = [
        # XX.XXX GAL or XX.XXX GALLONS
        (
            r"(\d{1,3}\.\d{1,3})\s*(?:GAL(?:LON)?S?)",
            "gallons_suffix",
            0.95,
        ),
        # GALLONS: XX.XXX or GAL: XX.XXX (but not PRICE/GAL X.XXX)
        (
            _UNIT_LABEL_START + r"(?:GAL(?:LON)?S?)[:\s]+(\d{1,3}\.\d{1,3})",
            "gallons_prefix",
            0.93,
        ),
        # VOLUME XX.XXX
        (
            r"VOLUME[:\s]+(\d{1,3}\.\d{1,3})",
            "volume",
            0.85,
        ),
        # QTY XX.XXX (near fuel context)
        (
            r"QTY[:\s]+(\d{1,3}\.\d{1,3})",
            "qty",
            0.70,
        ),
    ]

    # Liters patterns (for international receipts)
    LITERS_PATTERNS = [
        # XX.XX L or XX.XX LITERS; the trailing \b keeps words that merely
        # start with "L" (LOYALTY, LANE, ...) from counting as the unit.
        (
            r"(\d{1,3}\.\d{1,3})\s*(?:" + _LITER_UNIT + r")\b",
            "liters_suffix",
            0.95,
        ),
        # LITERS: XX.XX; the guard keeps words that merely end with "L"
        # (TOTAL, FUEL, DIESEL, ...) from counting as the unit.
        (
            _UNIT_LABEL_START + r"(?:" + _LITER_UNIT + r")[:\s]+(\d{1,3}\.\d{1,3})",
            "liters_prefix",
            0.93,
        ),
    ]

    # Price per gallon patterns, most specific label first
    PRICE_PER_UNIT_PATTERNS = [
        # $X.XXX/GAL or $X.XX/GAL
        (
            r"\$?\s*(\d{1,2}\.\d{2,3})\s*/\s*GAL",
            "price_per_gal",
            0.98,
        ),
        # PRICE/GAL $X.XXX
        (
            r"PRICE\s*/\s*GAL[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "labeled_price_gal",
            0.96,
        ),
        # PPG $X.XXX (price per gallon)
        (
            r"PPG[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "ppg",
            0.92,
        ),
        # UNIT PRICE $X.XXX
        (
            r"UNIT\s*PRICE[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "unit_price",
            0.90,
        ),
        # @ $X.XXX (per unit implied)
        (
            r"@\s*\$?\s*(\d{1,2}\.\d{2,3})",
            "at_price",
            0.85,
        ),
    ]

    # Fuel grade patterns.  The bare-octane pattern is deliberately last:
    # it is the loosest one and must not pre-empt DIESEL or E85.
    GRADE_PATTERNS = [
        # REGULAR 87, REG 87
        (r"(?:REGULAR|REG)\s*(\d{2})", "regular", 0.95),
        # UNLEADED 87
        (r"UNLEADED\s*(\d{2})", "unleaded", 0.93),
        # PLUS 89, MID 89, MIDGRADE 89
        (r"(?:PLUS|MID(?:GRADE)?)\s*(\d{2})", "plus", 0.95),
        # PREMIUM 91/93, PREM 91/93, SUPER 91/93
        (r"(?:PREMIUM|PREM|SUPER)\s*(\d{2})", "premium", 0.95),
        # DIESEL (no octane)
        (r"DIESEL(?:\s*#?\d)?", "diesel", 0.98),
        # E85 / E-85 (ethanol blend); must not be the tail of a word such as
        # "SAVE 85" or the head of a longer number.
        (r"(?<![A-Z])E[\s-]*85(?!\d)", "e85", 0.95),
        # Just the octane number (87, 89, 91, 93).  The guards reject digits
        # that belong to an amount, date or time ("$45.87", "3.879",
        # "01/02/87", "87.00").
        (
            r"(?<![\d$])(?<!\d[.,/:-])(87|89|91|93)(?![.,/:-]?\d)"
            r"\s*(?:OCT(?:ANE)?)?",
            "octane_only",
            0.75,
        ),
    ]

    # Common gas station names
    STATION_NAMES = [
        "SHELL", "CHEVRON", "EXXON", "MOBIL", "BP", "SUNOCO", "76",
        "CIRCLE K", "SPEEDWAY", "WAWA", "SHEETZ", "CASEY", "PILOT",
        "FLYING J", "LOVES", "TA", "PETRO", "MARATHON", "CITGO",
        "VALERO", "MURPHY", "COSTCO", "SAMS CLUB", "SAM'S CLUB",
        "KROGER", "QT", "QUIKTRIP", "RACETRAC", "KUM & GO",
        "KWIK TRIP", "HOLIDAY", "SINCLAIR", "CONOCO", "PHILLIPS 66",
        "ARCO", "AMPM", "AM/PM", "7-ELEVEN", "7 ELEVEN", "GETTY",
        "GULF", "HESS", "TEXACO", "TURKEY HILL", "CUMBERLAND FARMS",
    ]

    def extract_gallons(self, text: str) -> Optional["FuelQuantityMatch"]:
        """Extract fuel quantity in gallons.

        Args:
            text: Receipt text to search.

        Returns:
            A FuelQuantityMatch with unit "GAL" for the first plausible hit,
            or None when no pattern yields a plausible quantity.
        """
        hit = self._first_accepted(
            self.GALLONS_PATTERNS, text.upper(), self._is_reasonable_quantity
        )
        if hit is None:
            return None
        match, name, confidence = hit
        return FuelQuantityMatch(
            value=float(match.group(1)),
            unit="GAL",
            raw_match=match.group(0),
            confidence=confidence,
            pattern_name=name,
        )

    def extract_liters(self, text: str) -> Optional["FuelQuantityMatch"]:
        """Extract fuel quantity in liters.

        Args:
            text: Receipt text to search.

        Returns:
            A FuelQuantityMatch with unit "L" for the first plausible hit,
            or None when no pattern yields a plausible quantity.
        """
        hit = self._first_accepted(
            self.LITERS_PATTERNS,
            text.upper(),
            lambda quantity: self._is_reasonable_quantity(
                quantity, is_liters=True
            ),
        )
        if hit is None:
            return None
        match, name, confidence = hit
        return FuelQuantityMatch(
            value=float(match.group(1)),
            unit="L",
            raw_match=match.group(0),
            confidence=confidence,
            pattern_name=name,
        )

    def extract_quantity(self, text: str) -> Optional["FuelQuantityMatch"]:
        """Extract fuel quantity (gallons or liters).

        Gallons are tried first; liters are only considered when no gallons
        reading is found.

        Args:
            text: Receipt text to search.

        Returns:
            FuelQuantityMatch or None.
        """
        gallons = self.extract_gallons(text)
        if gallons:
            return gallons
        return self.extract_liters(text)

    def extract_price_per_unit(self, text: str) -> Optional["FuelPriceMatch"]:
        """Extract the price per unit of fuel.

        Args:
            text: Receipt text to search.

        Returns:
            A FuelPriceMatch for the first plausible hit, or None.  The unit
            is always reported as "GAL"; no per-liter patterns are defined.
        """
        hit = self._first_accepted(
            self.PRICE_PER_UNIT_PATTERNS, text.upper(), self._is_reasonable_price
        )
        if hit is None:
            return None
        match, name, confidence = hit
        return FuelPriceMatch(
            value=float(match.group(1)),
            unit="GAL",  # Default to gallons for US
            raw_match=match.group(0),
            confidence=confidence,
            pattern_name=name,
        )

    def extract_grade(self, text: str) -> Optional["FuelGradeMatch"]:
        """Extract fuel grade (octane rating, diesel or E85).

        Args:
            text: Receipt text to search.

        Returns:
            FuelGradeMatch or None.  Labelled hits whose two digits are not a
            plausible octane rating (e.g. "REG 02", a register number) are
            skipped rather than returned.
        """
        text_upper = text.upper()
        for pattern, name, confidence in self.GRADE_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                if name == "diesel":
                    return FuelGradeMatch(
                        value="DIESEL",
                        display_name="Diesel",
                        raw_match=match.group(0),
                        confidence=confidence,
                    )
                if name == "e85":
                    return FuelGradeMatch(
                        value="E85",
                        display_name="E85 Ethanol",
                        raw_match=match.group(0),
                        confidence=confidence,
                    )
                octane = match.group(1)
                if not self._is_reasonable_octane(octane):
                    continue
                return FuelGradeMatch(
                    value=octane,
                    display_name=self._get_grade_display_name(octane, name),
                    raw_match=match.group(0),
                    confidence=confidence,
                )
        return None

    def extract_merchant_name(self, text: str) -> Optional[tuple[str, float]]:
        """Extract gas station/merchant name.

        Known station names are matched as whole tokens, in STATION_NAMES
        order, so that "TA" does not fire on "TOTAL"/"TAX" and "76" does not
        fire on "$45.76".

        Args:
            text: Receipt text to search.

        Returns:
            ``(line, 0.90)`` for the first line of at most 50 characters that
            contains a known station; ``(station, 0.85)`` when a station is
            present only on longer lines; ``(first_line[:50], 0.50)`` when no
            station is known and the first non-empty line does not look like
            a date or number; otherwise None.
        """
        text_upper = text.upper()
        raw_lines = text.split("\n")

        for station in self.STATION_NAMES:
            pattern = self._station_pattern(station)
            if not re.search(pattern, text_upper):
                continue
            # Prefer the full line for context, as long as it is short
            # enough to plausibly be a header line.
            for line in raw_lines:
                if re.search(pattern, line.upper()):
                    cleaned = line.strip()
                    if len(cleaned) <= 50:
                        return (cleaned, 0.90)
            return (self._title_case(station), 0.85)

        # Fall back to the first non-empty line (often the merchant).
        non_empty = [line.strip() for line in raw_lines if line.strip()]
        if non_empty:
            first_line = non_empty[0]
            # Skip if it looks like a date or number.
            if not re.match(r"^\d+[/\-.]", first_line):
                return (first_line[:50], 0.50)  # Low confidence
        return None

    @staticmethod
    def _first_accepted(patterns, text_upper, accept):
        """Return ``(match, name, confidence)`` for the first accepted hit.

        Patterns are tried in table order and, within one pattern, matches in
        text order.  Trying every match means an implausible first hit does
        not hide a later plausible one.  Returns None when nothing is
        accepted.
        """
        for pattern, name, confidence in patterns:
            for match in re.finditer(pattern, text_upper):
                if accept(float(match.group(1))):
                    return match, name, confidence
        return None

    @staticmethod
    def _station_pattern(station: str) -> str:
        """Build a regex matching ``station`` only as a standalone token.

        The name may not touch a letter or digit on either side, may not
        follow "$" or "#", and may not sit next to a digit across a numeric
        separator.
        """
        return (
            r"(?<![A-Z0-9$#])(?<!\d[.,:/-])"
            + re.escape(station)
            + r"(?![A-Z0-9])(?![.,:/-]\d)"
        )

    @staticmethod
    def _title_case(name: str) -> str:
        """Title-case ``name`` without capitalising after an apostrophe.

        ``str.title()`` turns "SAM'S CLUB" into "Sam'S Club"; this keeps the
        apostrophe suffix lower-case.
        """
        return re.sub(
            r"[A-Za-z]+('[A-Za-z]+)?",
            lambda word: word.group(0).capitalize(),
            name,
        )

    def _is_reasonable_quantity(
        self, quantity: float, is_liters: bool = False
    ) -> bool:
        """Check if fuel quantity is reasonable."""
        if is_liters:
            # Accepted range 0.5-150 L (a typical fill is 20-100 L).
            return 0.5 <= quantity <= 150.0
        # Accepted range 0.1-50 gal (a typical fill is 5-30 gal).
        return 0.1 <= quantity <= 50.0

    def _is_reasonable_price(self, price: float) -> bool:
        """Check if price per unit is reasonable."""
        # Accepted range $1.00-$10.00 per unit.
        return 1.00 <= price <= 10.00

    def _is_reasonable_octane(self, octane: str) -> bool:
        """Check if a two-digit string is a plausible octane rating."""
        # Two digits in the 80s or 90s; rejects register/lane numbers that
        # follow labels like "REG".
        return 80 <= int(octane) <= 99

    def _get_grade_display_name(self, octane: str, pattern_name: str) -> str:
        """Get display name for fuel grade.

        Known octane values map to fixed labels; other values are labelled
        from the name of the pattern that matched.
        """
        grade_names = {
            "87": "Regular 87",
            "89": "Plus 89",
            "91": "Premium 91",
            "93": "Premium 93",
        }
        if octane in grade_names:
            return grade_names[octane]
        # Use pattern hint
        if pattern_name == "premium":
            return f"Premium {octane}"
        if pattern_name == "plus":
            return f"Plus {octane}"
        return f"Unleaded {octane}"
# Shared module-level instance, created once when this module is imported.
fuel_matcher = FuelPatternMatcher()