feat: add receipt OCR pipeline (refs #69)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement receipt-specific OCR extraction for fuel receipts:
- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount, fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
364
ocr/app/patterns/fuel_patterns.py
Normal file
364
ocr/app/patterns/fuel_patterns.py
Normal file
@@ -0,0 +1,364 @@
|
||||
"""Fuel-specific pattern matching for receipt extraction."""
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class FuelQuantityMatch:
    """Result of fuel quantity pattern matching."""

    # Parsed quantity, in gallons or liters depending on ``unit``.
    value: float
    # Unit of ``value``: "GAL" or "L".
    unit: str
    # Exact substring of the (upper-cased) receipt text that matched.
    raw_match: str
    # Confidence score attached to the pattern that produced the match.
    confidence: float
    # Name of the pattern that produced the match (e.g. "gallons_suffix").
    pattern_name: str
|
||||
|
||||
|
||||
@dataclass
class FuelPriceMatch:
    """Result of fuel price per unit pattern matching."""

    # Parsed price for one unit of fuel.
    value: float
    # Unit the price refers to: "GAL" or "L".
    unit: str
    # Exact substring of the (upper-cased) receipt text that matched.
    raw_match: str
    # Confidence score attached to the pattern that produced the match.
    confidence: float
    # Name of the pattern that produced the match (e.g. "price_per_gal").
    pattern_name: str
|
||||
|
||||
|
||||
@dataclass
class FuelGradeMatch:
    """Result of fuel grade pattern matching."""

    # Canonical grade value, e.g. "87", "89", "93", "DIESEL".
    value: str
    # Human-readable label, e.g. "Regular 87", "Premium 93".
    display_name: str
    # Exact substring of the (upper-cased) receipt text that matched.
    raw_match: str
    # Confidence score attached to the pattern that produced the match.
    confidence: float
|
||||
|
||||
|
||||
class FuelPatternMatcher:
    """Extract fuel-specific data from receipt text.

    Each ``*_PATTERNS`` table holds ``(regex, pattern_name, confidence)``
    tuples. Tables are scanned top to bottom against the upper-cased
    receipt text and the first acceptable hit wins, so the order of the
    entries is their priority.
    """

    # Gallons patterns
    GALLONS_PATTERNS = [
        # XX.XXX GAL or XX.XXX GALLONS. The lookbehind stops the match from
        # starting in the middle of a longer number; the trailing \b stops
        # "GAL" from matching the start of an unrelated word.
        (
            r"(?<![\d.])(\d{1,3}\.\d{1,3})\s*GAL(?:LON)?S?\b",
            "gallons_suffix",
            0.95,
        ),
        # GALLONS: XX.XXX or GAL: XX.XXX. The lookbehind keeps a price label
        # such as "PRICE/GAL 3.459" from being read as a quantity.
        (
            r"(?<![A-Z/])GAL(?:LON)?S?[:\s]+(\d{1,3}\.\d{1,3})",
            "gallons_prefix",
            0.93,
        ),
        # VOLUME XX.XXX
        (
            r"VOLUME[:\s]+(\d{1,3}\.\d{1,3})",
            "volume",
            0.85,
        ),
        # QTY XX.XXX (near fuel context)
        (
            r"QTY[:\s]+(\d{1,3}\.\d{1,3})",
            "qty",
            0.70,
        ),
    ]

    # Liters patterns (for international receipts)
    LITERS_PATTERNS = [
        # XX.XX L, XX.XX LITERS or XX.XX LITRES. The trailing \b is required:
        # without it any word starting with "L" after a number would match.
        (
            r"(?<![\d.])(\d{1,3}\.\d{1,3})\s*L(?:IT(?:ER|RE)S?)?\b",
            "liters_suffix",
            0.95,
        ),
        # LITERS: XX.XX. The lookbehind is required: without it the final
        # "L" of words like TOTAL or FUEL followed by an amount would match.
        (
            r"(?<![A-Z/])L(?:IT(?:ER|RE)S?)?[:\s]+(\d{1,3}\.\d{1,3})",
            "liters_prefix",
            0.93,
        ),
    ]

    # Price per gallon patterns
    PRICE_PER_UNIT_PATTERNS = [
        # $X.XXX/GAL or $X.XX/GAL
        (
            r"\$?\s*(\d{1,2}\.\d{2,3})\s*/\s*GAL",
            "price_per_gal",
            0.98,
        ),
        # PRICE/GAL $X.XXX
        (
            r"PRICE\s*/\s*GAL[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "labeled_price_gal",
            0.96,
        ),
        # UNIT PRICE $X.XXX
        (
            r"UNIT\s*PRICE[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "unit_price",
            0.90,
        ),
        # @ $X.XXX (per unit implied)
        (
            r"@\s*\$?\s*(\d{1,2}\.\d{2,3})",
            "at_price",
            0.85,
        ),
        # PPG $X.XXX (price per gallon)
        (
            r"PPG[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "ppg",
            0.92,
        ),
    ]

    # Fuel grade patterns. Labelled grades only accept a two-digit octane in
    # the 80-99 range so that text such as "REG 01" (a register number) is
    # not reported as a grade. The bare-octane fallback is deliberately last:
    # it is the weakest signal and must not pre-empt DIESEL or E85.
    GRADE_PATTERNS = [
        # REGULAR 87, REG 87
        (r"(?:REGULAR|REG)\s*([89]\d)(?!\d)", "regular", 0.95),
        # UNLEADED 87
        (r"UNLEADED\s*([89]\d)(?!\d)", "unleaded", 0.93),
        # PLUS 89, MID 89, MIDGRADE 89
        (r"(?:PLUS|MID(?:GRADE)?)\s*([89]\d)(?!\d)", "plus", 0.95),
        # PREMIUM 91/93, PREM 91/93, SUPER 91/93
        (r"(?:PREMIUM|PREM|SUPER)\s*([89]\d)(?!\d)", "premium", 0.95),
        # DIESEL (no octane)
        (r"DIESEL(?:\s*#?\d)?", "diesel", 0.98),
        # E85 / E-85 / E 85 (ethanol blend); must start a word so that
        # "SALE 85.12" is not mistaken for it.
        (r"\bE[ \t-]?85(?!\d|\.\d)", "e85", 0.95),
        # Just the octane number (87, 89, 91, 93), optionally followed by
        # OCT/OCTANE. The lookarounds reject digits that are part of a
        # larger number, amount or date (e.g. "$45.87", "87.50", "12/87").
        (
            r"(?<![\d.,$#:/\-])(87|89|91|93)(?!\d|[.,:/\-]\d)"
            r"(?:\s*OCT(?:ANE)?)?",
            "octane_only",
            0.75,
        ),
    ]

    # Grades whose value does not come from a captured octane number:
    # pattern_name -> (value, display_name).
    _FIXED_GRADES = {
        "diesel": ("DIESEL", "Diesel"),
        "e85": ("E85", "E85 Ethanol"),
    }

    # Common gas station names
    STATION_NAMES = [
        "SHELL", "CHEVRON", "EXXON", "MOBIL", "BP", "SUNOCO", "76",
        "CIRCLE K", "SPEEDWAY", "WAWA", "SHEETZ", "CASEY", "PILOT",
        "FLYING J", "LOVES", "TA", "PETRO", "MARATHON", "CITGO",
        "VALERO", "MURPHY", "COSTCO", "SAMS CLUB", "SAM'S CLUB",
        "KROGER", "QT", "QUIKTRIP", "RACETRAC", "KUM & GO",
        "KWIK TRIP", "HOLIDAY", "SINCLAIR", "CONOCO", "PHILLIPS 66",
        "ARCO", "AMPM", "AM/PM", "7-ELEVEN", "7 ELEVEN", "GETTY",
        "GULF", "HESS", "TEXACO", "TURKEY HILL", "CUMBERLAND FARMS",
    ]

    @staticmethod
    def _first_valid_match(
        text_upper: str, patterns, is_valid
    ) -> Optional[tuple[float, str, float, str]]:
        """Find the first numeric hit in a pattern table that passes a check.

        Patterns are tried in table order. Within one pattern every
        occurrence is examined, so an implausible early hit does not hide a
        plausible later one.

        Args:
            text_upper: Upper-cased receipt text to search
            patterns: Iterable of (regex, pattern_name, confidence) tuples
                whose regex captures the number in group 1
            is_valid: Callable taking the parsed float, returning bool

        Returns:
            Tuple of (value, raw_match, confidence, pattern_name) or None
        """
        for pattern, name, confidence in patterns:
            for match in re.finditer(pattern, text_upper):
                value = float(match.group(1))
                if is_valid(value):
                    return (value, match.group(0), confidence, name)
        return None

    @staticmethod
    def _station_regex(station: str) -> str:
        """Build a regex matching ``station`` only as a whole token.

        Short names such as "TA", "QT" or "76" otherwise match inside
        ordinary receipt words and amounts (TOTAL, QTY, $3.76).
        """
        return rf"(?<![A-Z0-9.$]){re.escape(station)}(?![A-Z0-9]|\.\d)"

    def extract_gallons(self, text: str) -> "Optional[FuelQuantityMatch]":
        """
        Extract fuel quantity in gallons.

        Args:
            text: Receipt text to search

        Returns:
            FuelQuantityMatch or None
        """
        if not text:
            return None

        hit = self._first_valid_match(
            text.upper(), self.GALLONS_PATTERNS, self._is_reasonable_quantity
        )
        if hit is None:
            return None

        value, raw, confidence, name = hit
        return FuelQuantityMatch(
            value=value,
            unit="GAL",
            raw_match=raw,
            confidence=confidence,
            pattern_name=name,
        )

    def extract_liters(self, text: str) -> "Optional[FuelQuantityMatch]":
        """
        Extract fuel quantity in liters.

        Args:
            text: Receipt text to search

        Returns:
            FuelQuantityMatch or None
        """
        if not text:
            return None

        hit = self._first_valid_match(
            text.upper(),
            self.LITERS_PATTERNS,
            lambda qty: self._is_reasonable_quantity(qty, is_liters=True),
        )
        if hit is None:
            return None

        value, raw, confidence, name = hit
        return FuelQuantityMatch(
            value=value,
            unit="L",
            raw_match=raw,
            confidence=confidence,
            pattern_name=name,
        )

    def extract_quantity(self, text: str) -> "Optional[FuelQuantityMatch]":
        """
        Extract fuel quantity (gallons or liters).

        Prefers gallons for US receipts.

        Args:
            text: Receipt text to search

        Returns:
            FuelQuantityMatch or None
        """
        # Try gallons first (more common in US), then fall back to liters.
        return self.extract_gallons(text) or self.extract_liters(text)

    def extract_price_per_unit(self, text: str) -> "Optional[FuelPriceMatch]":
        """
        Extract price per gallon/liter.

        The returned unit is always "GAL"; the patterns do not distinguish
        per-liter prices.

        Args:
            text: Receipt text to search

        Returns:
            FuelPriceMatch or None
        """
        if not text:
            return None

        hit = self._first_valid_match(
            text.upper(), self.PRICE_PER_UNIT_PATTERNS, self._is_reasonable_price
        )
        if hit is None:
            return None

        value, raw, confidence, name = hit
        return FuelPriceMatch(
            value=value,
            unit="GAL",  # Default to gallons for US
            raw_match=raw,
            confidence=confidence,
            pattern_name=name,
        )

    def extract_grade(self, text: str) -> "Optional[FuelGradeMatch]":
        """
        Extract fuel grade (octane rating, diesel or E85).

        Args:
            text: Receipt text to search

        Returns:
            FuelGradeMatch or None
        """
        if not text:
            return None

        text_upper = text.upper()

        for pattern, name, confidence in self.GRADE_PATTERNS:
            match = re.search(pattern, text_upper)
            if not match:
                continue

            if name in self._FIXED_GRADES:
                value, display = self._FIXED_GRADES[name]
            else:
                # Octane-based grades capture the number in group 1.
                value = match.group(1)
                display = self._get_grade_display_name(value, name)

            return FuelGradeMatch(
                value=value,
                display_name=display,
                raw_match=match.group(0),
                confidence=confidence,
            )

        return None

    def extract_merchant_name(self, text: str) -> Optional[tuple[str, float]]:
        """
        Extract gas station/merchant name.

        Known station names are matched as whole tokens, in STATION_NAMES
        order. The first line of at most 50 characters that contains the
        station is returned with confidence 0.90; if every such line is
        longer, the title-cased station name is returned with 0.85. If no
        station is found, the first non-empty line is returned (truncated to
        50 characters) with confidence 0.50, unless it starts like a date or
        number.

        Args:
            text: Receipt text to search

        Returns:
            Tuple of (merchant_name, confidence) or None
        """
        if not text:
            return None

        stripped_lines = [line.strip() for line in text.split("\n")]
        upper_lines = [line.upper() for line in stripped_lines]

        # Check for known station names
        for station in self.STATION_NAMES:
            station_re = self._station_regex(station)
            hits = [
                line
                for line, upper in zip(stripped_lines, upper_lines)
                if re.search(station_re, upper)
            ]
            if not hits:
                continue

            # Prefer the full line for context when it is a reasonable length
            for line in hits:
                if len(line) <= 50:
                    return (line, 0.90)
            return (station.title(), 0.85)

        # Fall back to first non-empty line (often the merchant)
        non_empty = [line for line in stripped_lines if line]
        if non_empty:
            first_line = non_empty[0]
            # Skip if it looks like a date or number
            if not re.match(r"^\d+[/\-.]", first_line):
                return (first_line[:50], 0.50)  # Low confidence

        return None

    def _is_reasonable_quantity(
        self, quantity: float, is_liters: bool = False
    ) -> bool:
        """Check if fuel quantity is reasonable.

        Accepts 0.5-150 liters or 0.1-50 gallons (inclusive).
        """
        if is_liters:
            # Typical fill: 20-100 liters
            return 0.5 <= quantity <= 150.0
        # Typical fill: 5-30 gallons
        return 0.1 <= quantity <= 50.0

    def _is_reasonable_price(self, price: float) -> bool:
        """Check if price per unit is reasonable (1.00-10.00 inclusive)."""
        # US gas prices: $1.50 - $8.00 per gallon (allowing for fluctuation)
        return 1.00 <= price <= 10.00

    def _get_grade_display_name(self, octane: str, pattern_name: str) -> str:
        """Get display name for fuel grade.

        Well-known octane numbers map to a fixed label regardless of the
        pattern; other numbers are labelled from the pattern that matched.

        Args:
            octane: Octane number as a string, e.g. "87"
            pattern_name: Name of the grade pattern that matched

        Returns:
            Display label such as "Regular 87" or "Premium 92"
        """
        grade_names = {
            "87": "Regular 87",
            "89": "Plus 89",
            "91": "Premium 91",
            "93": "Premium 93",
        }

        if octane in grade_names:
            return grade_names[octane]

        # Use pattern hint
        if pattern_name == "premium":
            return f"Premium {octane}"
        if pattern_name == "plus":
            return f"Plus {octane}"
        return f"Unleaded {octane}"
|
||||
|
||||
|
||||
# Singleton instance: module-level FuelPatternMatcher created once at import.
fuel_matcher = FuelPatternMatcher()
|
||||
Reference in New Issue
Block a user