All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement receipt-specific OCR extraction for fuel receipts:
- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount, fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
365 lines
11 KiB
Python
365 lines
11 KiB
Python
"""Fuel-specific pattern matching for receipt extraction."""
|
|
import re
|
|
from dataclasses import dataclass
|
|
from typing import Optional
|
|
|
|
|
|
@dataclass
class FuelQuantityMatch:
    """A fuel volume found in receipt text.

    Attributes:
        value: Parsed volume, in gallons or liters depending on ``unit``.
        unit: Unit tag for ``value``: "GAL" or "L".
        raw_match: The exact text span the pattern matched.
        confidence: Score attached to the pattern that produced the match.
        pattern_name: Identifier of the pattern that produced the match.
    """

    value: float
    unit: str
    raw_match: str
    confidence: float
    pattern_name: str
|
|
|
|
|
|
@dataclass
class FuelPriceMatch:
    """A price-per-unit figure found in receipt text.

    Attributes:
        value: Parsed price for one unit of fuel.
        unit: Unit the price refers to: "GAL" or "L".
        raw_match: The exact text span the pattern matched.
        confidence: Score attached to the pattern that produced the match.
        pattern_name: Identifier of the pattern that produced the match.
    """

    value: float
    unit: str
    raw_match: str
    confidence: float
    pattern_name: str
|
|
|
|
|
|
@dataclass
class FuelGradeMatch:
    """A fuel grade found in receipt text.

    Attributes:
        value: Grade code, e.g. "87", "89", "93" or "DIESEL".
        display_name: Human-readable label, e.g. "Regular 87", "Premium 93".
        raw_match: The exact text span the pattern matched.
        confidence: Score attached to the pattern that produced the match.
    """

    value: str
    display_name: str
    raw_match: str
    confidence: float
|
|
|
|
|
|
class FuelPatternMatcher:
    """Extract fuel-specific data from receipt text.

    Each ``*_PATTERNS`` table holds ``(regex, pattern_name, confidence)``
    tuples.  Tables are scanned top to bottom against an upper-cased copy of
    the receipt text, so earlier entries take priority over later ones.
    """

    # Gallons patterns
    GALLONS_PATTERNS = [
        # XX.XXX GAL or XX.XXX GALLONS.  Only spaces/tabs may separate the
        # number from the unit, so a total on one line is not paired with a
        # "GALLONS" label on the next line.
        (
            r"(?<![\d.])(\d{1,3}\.\d{1,3})[ \t]*GAL(?:LON)?S?\b",
            "gallons_suffix",
            0.95,
        ),
        # GALLONS: XX.XXX or GAL: XX.XXX.  The lookbehinds reject price
        # labels such as PRICE/GAL and PRICE PER GALLON.
        (
            r"(?<![A-Z/])(?<!PER\s)GAL(?:LON)?S?[:\s]+(\d{1,3}\.\d{1,3})",
            "gallons_prefix",
            0.93,
        ),
        # VOLUME XX.XXX
        (
            r"VOLUME[:\s]+(\d{1,3}\.\d{1,3})",
            "volume",
            0.85,
        ),
        # QTY XX.XXX (near fuel context)
        (
            r"QTY[:\s]+(\d{1,3}\.\d{1,3})",
            "qty",
            0.70,
        ),
    ]

    # Liters patterns (for international receipts)
    LITERS_PATTERNS = [
        # XX.XX L, XX.XX LITERS / LITRES / LTR.  The trailing \b keeps a bare
        # "L" from matching the first letter of an unrelated word.
        (
            r"(?<![\d.])(\d{1,3}\.\d{1,3})[ \t]*(?:LIT(?:ER|RE)S?|LTRS?|L)\b",
            "liters_suffix",
            0.95,
        ),
        # LITERS: XX.XX.  The lookbehind keeps a bare "L" from matching the
        # last letter of words such as TOTAL or FUEL, or a "/L" price label.
        (
            r"(?<![A-Z/])(?:LIT(?:ER|RE)S?|LTRS?|L)[:\s]+(\d{1,3}\.\d{1,3})",
            "liters_prefix",
            0.93,
        ),
    ]

    # Price per gallon patterns
    PRICE_PER_UNIT_PATTERNS = [
        # $X.XXX/GAL or $X.XX/GAL
        (
            r"\$?\s*(\d{1,2}\.\d{2,3})\s*/\s*GAL",
            "price_per_gal",
            0.98,
        ),
        # PRICE/GAL $X.XXX
        (
            r"PRICE\s*/\s*GAL[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "labeled_price_gal",
            0.96,
        ),
        # UNIT PRICE $X.XXX
        (
            r"UNIT\s*PRICE[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "unit_price",
            0.90,
        ),
        # @ $X.XXX (per unit implied)
        (
            r"@\s*\$?\s*(\d{1,2}\.\d{2,3})",
            "at_price",
            0.85,
        ),
        # PPG $X.XXX (price per gallon)
        (
            r"PPG[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "ppg",
            0.92,
        ),
    ]

    # Fuel grade patterns.  Labelled grades come first, then the explicit
    # DIESEL / E85 keywords, and the bare octane number is tried last: it is
    # the weakest signal and must not shadow a more specific match.
    GRADE_PATTERNS = [
        # REGULAR 87, REG 87 (but not "REGULAR 12.345 GAL")
        (r"(?:REGULAR|REG)\s*(\d{2})(?!\d|\.\d)", "regular", 0.95),
        # UNLEADED 87
        (r"UNLEADED\s*(\d{2})(?!\d|\.\d)", "unleaded", 0.93),
        # PLUS 89, MID 89, MIDGRADE 89
        (r"(?:PLUS|MID(?:GRADE)?)\s*(\d{2})(?!\d|\.\d)", "plus", 0.95),
        # PREMIUM 91/93, PREM 91/93, SUPER 91/93
        (r"(?:PREMIUM|PREM|SUPER)\s*(\d{2})(?!\d|\.\d)", "premium", 0.95),
        # DIESEL (no octane)
        (r"DIESEL(?:\s*#?\d)?", "diesel", 0.98),
        # E85 (ethanol blend); \b rejects e.g. "PRICE 85"
        (r"\bE[ -]?85(?!\d|\.\d)", "e85", 0.95),
        # Just the octane number (87, 89, 91, 93), never as part of a larger
        # number or a money amount such as $87.45
        (
            r"(?<![\d.$])(87|89|91|93)(?!\d|\.\d)\s*(?:OCT(?:ANE)?)?",
            "octane_only",
            0.75,
        ),
    ]

    # Common gas station names
    STATION_NAMES = [
        "SHELL", "CHEVRON", "EXXON", "MOBIL", "BP", "SUNOCO", "76",
        "CIRCLE K", "SPEEDWAY", "WAWA", "SHEETZ", "CASEY", "PILOT",
        "FLYING J", "LOVES", "TA", "PETRO", "MARATHON", "CITGO",
        "VALERO", "MURPHY", "COSTCO", "SAMS CLUB", "SAM'S CLUB",
        "KROGER", "QT", "QUIKTRIP", "RACETRAC", "KUM & GO",
        "KWIK TRIP", "HOLIDAY", "SINCLAIR", "CONOCO", "PHILLIPS 66",
        "ARCO", "AMPM", "AM/PM", "7-ELEVEN", "7 ELEVEN", "GETTY",
        "GULF", "HESS", "TEXACO", "TURKEY HILL", "CUMBERLAND FARMS",
    ]

    # Grades that carry no octane number: pattern name -> (value, display).
    _FIXED_GRADES = {
        "diesel": ("DIESEL", "Diesel"),
        "e85": ("E85", "E85 Ethanol"),
    }

    def extract_gallons(self, text: str) -> Optional[FuelQuantityMatch]:
        """
        Extract fuel quantity in gallons.

        Args:
            text: Receipt text to search

        Returns:
            FuelQuantityMatch for the first in-range quantity found by the
            highest-priority pattern, or None
        """
        found = self._first_valid_match(
            self.GALLONS_PATTERNS, text.upper(), self._is_reasonable_quantity
        )
        if found is None:
            return None

        match, quantity, name, confidence = found
        return FuelQuantityMatch(
            value=quantity,
            unit="GAL",
            raw_match=match.group(0),
            confidence=confidence,
            pattern_name=name,
        )

    def extract_liters(self, text: str) -> Optional[FuelQuantityMatch]:
        """
        Extract fuel quantity in liters.

        Args:
            text: Receipt text to search

        Returns:
            FuelQuantityMatch for the first in-range quantity found by the
            highest-priority pattern, or None
        """
        found = self._first_valid_match(
            self.LITERS_PATTERNS,
            text.upper(),
            lambda qty: self._is_reasonable_quantity(qty, is_liters=True),
        )
        if found is None:
            return None

        match, quantity, name, confidence = found
        return FuelQuantityMatch(
            value=quantity,
            unit="L",
            raw_match=match.group(0),
            confidence=confidence,
            pattern_name=name,
        )

    def extract_quantity(self, text: str) -> Optional[FuelQuantityMatch]:
        """
        Extract fuel quantity (gallons or liters).

        Prefers gallons for US receipts.

        Args:
            text: Receipt text to search

        Returns:
            FuelQuantityMatch or None
        """
        # Try gallons first (more common in US)
        gallons = self.extract_gallons(text)
        if gallons:
            return gallons

        # Fall back to liters
        return self.extract_liters(text)

    def extract_price_per_unit(self, text: str) -> Optional[FuelPriceMatch]:
        """
        Extract price per gallon/liter.

        The result is always tagged ``unit="GAL"``: none of the price
        patterns identifies a per-liter price.

        Args:
            text: Receipt text to search

        Returns:
            FuelPriceMatch or None
        """
        found = self._first_valid_match(
            self.PRICE_PER_UNIT_PATTERNS, text.upper(), self._is_reasonable_price
        )
        if found is None:
            return None

        match, price, name, confidence = found
        return FuelPriceMatch(
            value=price,
            unit="GAL",  # Default to gallons for US
            raw_match=match.group(0),
            confidence=confidence,
            pattern_name=name,
        )

    def extract_grade(self, text: str) -> Optional[FuelGradeMatch]:
        """
        Extract fuel grade (octane rating or diesel).

        Args:
            text: Receipt text to search

        Returns:
            FuelGradeMatch or None
        """
        text_upper = text.upper()

        for pattern, name, confidence in self.GRADE_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                fixed = self._FIXED_GRADES.get(name)
                if fixed is not None:
                    value, display = fixed
                else:
                    value = match.group(1)
                    # "REG 01" is a register number, not an octane rating.
                    if not self._is_plausible_octane(value):
                        continue
                    display = self._get_grade_display_name(value, name)

                return FuelGradeMatch(
                    value=value,
                    display_name=display,
                    raw_match=match.group(0),
                    confidence=confidence,
                )

        return None

    def extract_merchant_name(self, text: str) -> Optional[tuple[str, float]]:
        """
        Extract gas station/merchant name.

        Known station names are matched as whole tokens, so short names such
        as "TA", "QT" or "76" do not fire on TOTAL, QTY or 3.76.

        Args:
            text: Receipt text to search

        Returns:
            Tuple of (merchant_name, confidence) or None.  Confidence is
            0.90 when a short line containing a known station is returned,
            0.85 when only the station name itself is returned, and 0.50
            for the first-line fallback.
        """
        # Pair each stripped line with its upper-cased form once, rather than
        # re-uppercasing every line for every station name.
        lines = [(line.strip(), line.upper()) for line in text.split("\n")]

        # Check for known station names
        for station in self.STATION_NAMES:
            station_re = self._station_regex(station)
            hits = [
                cleaned for cleaned, upper in lines if re.search(station_re, upper)
            ]
            if not hits:
                continue
            # Prefer the full line for context when it has a reasonable length
            for cleaned in hits:
                if len(cleaned) <= 50:
                    return (cleaned, 0.90)
            # NOTE: str.title() yields e.g. "Bp" / "Sam'S Club" for some names.
            return (station.title(), 0.85)

        # Fall back to first non-empty line (often the merchant)
        first_line = next((cleaned for cleaned, _ in lines if cleaned), None)
        # Skip if it looks like a date or number
        if first_line is not None and not re.match(r"^\d+[/\-.]", first_line):
            return (first_line[:50], 0.50)  # Low confidence

        return None

    def _first_valid_match(
        self, patterns: list, text_upper: str, is_valid
    ) -> Optional[tuple[re.Match, float, str, float]]:
        """Return the first pattern hit whose numeric value passes ``is_valid``.

        Every occurrence of a pattern is examined before moving on to the next
        pattern, so an out-of-range first hit (e.g. "0.000 GAL") does not hide
        a valid later hit of the same, higher-confidence pattern.

        Args:
            patterns: Table of (regex, pattern_name, confidence) tuples whose
                regex captures the number in group 1
            text_upper: Upper-cased receipt text
            is_valid: Callable taking the parsed float and returning a bool

        Returns:
            (match, value, pattern_name, confidence) or None
        """
        for pattern, name, confidence in patterns:
            for match in re.finditer(pattern, text_upper):
                value = float(match.group(1))
                if is_valid(value):
                    return match, value, name, confidence
        return None

    @staticmethod
    def _station_regex(station: str) -> str:
        """Build a regex matching ``station`` only as a standalone token."""
        # Not preceded by a letter, digit, "." or "$" and not followed by a
        # letter, digit or decimal fraction.
        return rf"(?<![A-Z0-9.$]){re.escape(station)}(?![A-Z0-9]|\.\d)"

    def _is_plausible_octane(self, octane: str) -> bool:
        """Check that a captured two-digit number can be an octane rating."""
        # Pump grades sit in the mid-80s to mid-90s; the upper bound leaves
        # room for higher-numbered premium labels.
        return 85 <= int(octane) <= 99

    def _is_reasonable_quantity(
        self, quantity: float, is_liters: bool = False
    ) -> bool:
        """Check if fuel quantity is reasonable."""
        if is_liters:
            # Accept 0.5-150 L (a typical fill is 20-100 L)
            return 0.5 <= quantity <= 150.0
        # Accept 0.1-50 gal (a typical fill is 5-30 gal)
        return 0.1 <= quantity <= 50.0

    def _is_reasonable_price(self, price: float) -> bool:
        """Check if price per unit is reasonable."""
        # Accept $1.00-$10.00 (US pump prices usually fall in $1.50-$8.00)
        return 1.00 <= price <= 10.00

    def _get_grade_display_name(self, octane: str, pattern_name: str) -> str:
        """Get display name for fuel grade.

        Args:
            octane: Octane number as captured from the receipt
            pattern_name: Name of the grade pattern that matched; used as a
                hint when the octane number has no fixed label

        Returns:
            Label such as "Regular 87" or "Premium 93"
        """
        grade_names = {
            "87": "Regular 87",
            "89": "Plus 89",
            "91": "Premium 91",
            "93": "Premium 93",
        }
        if octane in grade_names:
            return grade_names[octane]

        # Use pattern hint
        if pattern_name == "premium":
            return f"Premium {octane}"
        if pattern_name == "plus":
            return f"Plus {octane}"
        return f"Unleaded {octane}"
|
|
|
|
|
|
# Singleton instance
|
|
fuel_matcher = FuelPatternMatcher()  # shared module-level instance; the class defines only class-level pattern tables and no per-instance state
|