feat: add receipt OCR pipeline (refs #69)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount, fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
13
ocr/app/patterns/__init__.py
Normal file
13
ocr/app/patterns/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
||||
"""Pattern matching modules for receipt field extraction."""
|
||||
from app.patterns.date_patterns import DatePatternMatcher, date_matcher
|
||||
from app.patterns.currency_patterns import CurrencyPatternMatcher, currency_matcher
|
||||
from app.patterns.fuel_patterns import FuelPatternMatcher, fuel_matcher
|
||||
|
||||
__all__ = [
|
||||
"DatePatternMatcher",
|
||||
"date_matcher",
|
||||
"CurrencyPatternMatcher",
|
||||
"currency_matcher",
|
||||
"FuelPatternMatcher",
|
||||
"fuel_matcher",
|
||||
]
|
||||
227
ocr/app/patterns/currency_patterns.py
Normal file
227
ocr/app/patterns/currency_patterns.py
Normal file
@@ -0,0 +1,227 @@
|
||||
"""Currency and amount pattern matching for receipt extraction."""
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class AmountMatch:
    """Result of currency/amount pattern matching."""

    value: float  # Parsed amount
    raw_match: str  # Matched span of the upper-cased receipt text, stripped
    confidence: float  # Confidence weight of the pattern that matched
    pattern_name: str  # Name of the pattern that produced this match
    label: Optional[str] = None  # e.g., "TOTAL", "SUBTOTAL"


class CurrencyPatternMatcher:
    """Extract and normalize currency amounts from receipt text.

    Patterns are ``(regex, pattern_name, confidence)`` tuples and are applied
    to the upper-cased receipt text.
    """

    # Total amount patterns (prioritized)
    TOTAL_PATTERNS = [
        # TOTAL $XX.XX or TOTAL: $XX.XX
        (
            r"(?:^|\s)TOTAL[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
            "total_explicit",
            0.98,
        ),
        # AMOUNT DUE $XX.XX
        (
            r"AMOUNT\s*DUE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "amount_due",
            0.95,
        ),
        # SALE $XX.XX
        (
            r"(?:^|\s)SALE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
            "sale_explicit",
            0.92,
        ),
        # GRAND TOTAL $XX.XX
        (
            r"GRAND\s*TOTAL[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "grand_total",
            0.97,
        ),
        # TOTAL SALE $XX.XX
        (
            r"TOTAL\s*SALE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "total_sale",
            0.96,
        ),
        # BALANCE DUE $XX.XX
        (
            r"BALANCE\s*DUE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "balance_due",
            0.94,
        ),
        # PURCHASE $XX.XX
        (
            r"(?:^|\s)PURCHASE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
            "purchase",
            0.88,
        ),
    ]

    # Generic amount patterns (lower priority)
    AMOUNT_PATTERNS = [
        # $XX.XX (standalone dollar amount)
        (
            r"\$\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "dollar_amount",
            0.60,
        ),
        # XX.XX (standalone decimal amount)
        (
            r"(?<![.$\d])(\d{1,6}[.,]\d{2})(?![.\d])",
            "decimal_amount",
            0.40,
        ),
    ]

    def extract_total(self, text: str) -> Optional[AmountMatch]:
        """
        Extract the total amount from receipt text.

        Prioritizes explicit total patterns over generic amounts. Each total
        pattern is tried in list order and only its first hit is considered;
        if that hit is not a reasonable total the next pattern is tried.
        When no labelled total qualifies, the largest reasonable amount found
        anywhere in the text is returned with confidence capped at 0.60.

        Args:
            text: Receipt text to search

        Returns:
            AmountMatch for total or None if not found
        """
        text_upper = text.upper()

        # Try total-specific patterns first
        for pattern, name, confidence in self.TOTAL_PATTERNS:
            match = re.search(pattern, text_upper, re.MULTILINE)
            if not match:
                continue
            amount = self._parse_amount(match.group(1))
            if amount is not None and self._is_reasonable_total(amount):
                return AmountMatch(
                    value=amount,
                    raw_match=match.group(0).strip(),
                    confidence=confidence,
                    pattern_name=name,
                    label=self._extract_label(name),
                )

        # Fall back to finding the largest reasonable amount
        reasonable = [
            candidate
            for candidate in self.extract_all_amounts(text)
            if self._is_reasonable_total(candidate.value)
        ]
        if not reasonable:
            return None

        # Assume largest amount is the total (first one wins on ties)
        best = max(reasonable, key=lambda candidate: candidate.value)
        # Lower confidence since we're guessing
        return AmountMatch(
            value=best.value,
            raw_match=best.raw_match,
            confidence=min(0.60, best.confidence),
            pattern_name="inferred_total",
            label="TOTAL",
        )

    def extract_all_amounts(self, text: str) -> list[AmountMatch]:
        """
        Extract all currency amounts from text.

        Every hit of a total pattern is reported. A generic amount is
        reported only if no earlier match has the same value; values are
        compared in whole cents so that amounts one cent apart
        (e.g. 10.00 and 10.01) are never merged by float rounding error.

        Args:
            text: Receipt text to search

        Returns:
            List of AmountMatch objects
        """
        matches: list[AmountMatch] = []
        text_upper = text.upper()

        # Check total patterns
        for pattern, name, confidence in self.TOTAL_PATTERNS:
            for match in re.finditer(pattern, text_upper, re.MULTILINE):
                amount = self._parse_amount(match.group(1))
                if amount is not None:
                    matches.append(
                        AmountMatch(
                            value=amount,
                            raw_match=match.group(0).strip(),
                            confidence=confidence,
                            pattern_name=name,
                            label=self._extract_label(name),
                        )
                    )

        # Values already reported, keyed in integer cents for exact comparison
        seen_cents = {self._to_cents(found.value) for found in matches}

        # Check generic amount patterns
        for pattern, name, confidence in self.AMOUNT_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                amount = self._parse_amount(match.group(1))
                if amount is None:
                    continue
                cents = self._to_cents(amount)
                # Skip if already found by a more specific pattern
                if cents in seen_cents:
                    continue
                seen_cents.add(cents)
                matches.append(
                    AmountMatch(
                        value=amount,
                        raw_match=match.group(0).strip(),
                        confidence=confidence,
                        pattern_name=name,
                    )
                )

        return matches

    @staticmethod
    def _to_cents(amount: float) -> int:
        """Convert an amount to integer cents, rounding to the nearest cent."""
        return round(amount * 100)

    def _parse_amount(self, amount_str: str) -> Optional[float]:
        """Parse amount string to float, handling various formats.

        Handles US ("1,234.56") and European ("1.234,56") grouping. A lone
        comma followed by exactly two digits is read as a decimal separator;
        any other comma is read as a thousands separator.

        Returns:
            The parsed non-negative amount, or None if the string is not a
            valid number or is negative.
        """
        # Remove any spaces
        cleaned = amount_str.strip().replace(" ", "")

        if "," in cleaned and "." in cleaned:
            # Whichever separator comes last is the decimal separator
            if cleaned.rfind(",") > cleaned.rfind("."):
                # European format
                cleaned = cleaned.replace(".", "").replace(",", ".")
            else:
                # US format
                cleaned = cleaned.replace(",", "")
        elif "," in cleaned:
            # Could be thousands separator or decimal
            parts = cleaned.split(",")
            if len(parts) == 2 and len(parts[1]) == 2:
                # Likely decimal separator
                cleaned = cleaned.replace(",", ".")
            else:
                # Likely thousands separator
                cleaned = cleaned.replace(",", "")

        try:
            amount = float(Decimal(cleaned))
        except (InvalidOperation, ValueError):
            return None
        return amount if amount >= 0 else None

    def _is_reasonable_total(self, amount: float) -> bool:
        """Check if amount is a reasonable total for a fuel receipt."""
        # Reasonable range: $1 to $500 for typical fuel purchases
        return 1.0 <= amount <= 500.0

    def _extract_label(self, pattern_name: str) -> str:
        """Extract display label from pattern name ("TOTAL" if unknown)."""
        labels = {
            "total_explicit": "TOTAL",
            "amount_due": "AMOUNT DUE",
            "sale_explicit": "SALE",
            "grand_total": "GRAND TOTAL",
            "total_sale": "TOTAL SALE",
            "balance_due": "BALANCE DUE",
            "purchase": "PURCHASE",
        }
        return labels.get(pattern_name, "TOTAL")


# Singleton instance
currency_matcher = CurrencyPatternMatcher()
|
||||
186
ocr/app/patterns/date_patterns.py
Normal file
186
ocr/app/patterns/date_patterns.py
Normal file
@@ -0,0 +1,186 @@
|
||||
"""Date pattern matching for receipt extraction."""
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class DateMatch:
    """Result of date pattern matching."""

    value: str  # ISO format YYYY-MM-DD
    raw_match: str  # Original text matched
    confidence: float  # Pattern weight, possibly boosted by nearby keywords
    pattern_name: str  # Name of the pattern that produced this match


# Month names accepted by the textual date patterns: three-letter
# abbreviations, "Sept", and full names. Every alternative lower-cases to a
# key of DatePatternMatcher.MONTH_NAMES.
_MONTH_NAME_RE = (
    r"JAN(?:UARY)?|FEB(?:RUARY)?|MAR(?:CH)?|APR(?:IL)?|MAY|JUNE?|JULY?"
    r"|AUG(?:UST)?|SEP(?:T(?:EMBER)?)?|OCT(?:OBER)?|NOV(?:EMBER)?"
    r"|DEC(?:EMBER)?"
)


class DatePatternMatcher:
    """Extract and normalize dates from receipt text."""

    # Pattern definitions with named groups and confidence weights.
    # The separated numeric patterns carry a (?<!\d) guard so they cannot
    # start in the middle of a longer number (e.g. the "12-01-15" inside
    # the ISO date "2012-01-15").
    PATTERNS = [
        # MM/DD/YYYY or MM/DD/YY (most common US format)
        (
            r"(?<!\d)(?P<month>\d{1,2})/(?P<day>\d{1,2})/(?P<year>\d{2,4})",
            "mm_dd_yyyy",
            0.95,
        ),
        # MM-DD-YYYY or MM-DD-YY
        (
            r"(?<!\d)(?P<month>\d{1,2})-(?P<day>\d{1,2})-(?P<year>\d{2,4})",
            "mm_dd_yyyy_dash",
            0.90,
        ),
        # YYYY-MM-DD (ISO format)
        (
            r"(?<!\d)(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})",
            "iso_date",
            0.98,
        ),
        # Mon DD, YYYY (e.g., Jan 15, 2024 or January 15, 2024)
        (
            r"(?P<month_name>" + _MONTH_NAME_RE + r")"
            r"\s+(?P<day>\d{1,2}),?\s+(?P<year>\d{4})",
            "month_name_long",
            0.85,
        ),
        # DD Mon YYYY (e.g., 15 Jan 2024 or 15 January 2024)
        (
            r"(?P<day>\d{1,2})\s+(?P<month_name>" + _MONTH_NAME_RE + r")"
            r"\s+(?P<year>\d{4})",
            "day_month_year",
            0.85,
        ),
        # MMDDYYYY or MMDDYY (no separators, common in some POS systems)
        (
            r"(?<!\d)(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{2,4})(?!\d)",
            "compact_date",
            0.70,
        ),
    ]

    MONTH_NAMES = {
        "jan": 1, "january": 1,
        "feb": 2, "february": 2,
        "mar": 3, "march": 3,
        "apr": 4, "april": 4,
        "may": 5,
        "jun": 6, "june": 6,
        "jul": 7, "july": 7,
        "aug": 8, "august": 8,
        "sep": 9, "sept": 9, "september": 9,
        "oct": 10, "october": 10,
        "nov": 11, "november": 11,
        "dec": 12, "december": 12,
    }

    def extract_dates(self, text: str) -> list[DateMatch]:
        """
        Extract all date patterns from text.

        Patterns are matched case-insensitively. Hits that do not form a
        valid calendar date in the accepted year range are discarded, and
        when several hits normalize to the same date only the one with the
        highest confidence is kept.

        Args:
            text: Receipt text to search

        Returns:
            List of DateMatch objects sorted by confidence
        """
        matches: list[DateMatch] = []

        for pattern, name, base_confidence in self.PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                parsed = self._parse_match(match, name)
                if not parsed:
                    continue
                year, month, day = parsed
                if not self._is_valid_date(year, month, day):
                    continue
                # Adjust confidence based on context. The original text is
                # passed so match offsets always line up with the window.
                confidence = self._adjust_confidence(
                    base_confidence, text, match.start()
                )
                matches.append(
                    DateMatch(
                        value=f"{year:04d}-{month:02d}-{day:02d}",
                        raw_match=match.group(0),
                        confidence=confidence,
                        pattern_name=name,
                    )
                )

        # Sort by confidence, deduplicate by value
        matches.sort(key=lambda found: found.confidence, reverse=True)
        seen: set[str] = set()
        unique_matches: list[DateMatch] = []
        for found in matches:
            if found.value not in seen:
                seen.add(found.value)
                unique_matches.append(found)

        return unique_matches

    def extract_best_date(self, text: str) -> Optional[DateMatch]:
        """
        Extract the most likely transaction date.

        Args:
            text: Receipt text to search

        Returns:
            Best DateMatch or None if no date found
        """
        matches = self.extract_dates(text)
        return matches[0] if matches else None

    def _parse_match(
        self, match: re.Match, pattern_name: str
    ) -> Optional[tuple[int, int, int]]:
        """Parse regex match into year, month, day tuple.

        Two-digit years below 50 map to 20xx, the rest to 19xx.

        Returns:
            (year, month, day), or None when a month name is not recognized.
        """
        groups = match.groupdict()

        # Handle month name patterns
        if "month_name" in groups:
            month = self.MONTH_NAMES.get(groups["month_name"].lower())
            if not month:
                return None
        else:
            month = int(groups["month"])

        day = int(groups["day"])
        year = int(groups["year"])

        # Normalize 2-digit years
        if year < 100:
            year = 2000 + year if year < 50 else 1900 + year

        return year, month, day

    def _is_valid_date(self, year: int, month: int, day: int) -> bool:
        """Check if date components form a valid date in 2000-2100."""
        try:
            datetime(year=year, month=month, day=day)
        except ValueError:
            return False
        # Reasonable year range for receipts
        return 2000 <= year <= 2100

    def _adjust_confidence(
        self, base_confidence: float, text: str, position: int
    ) -> float:
        """
        Adjust confidence based on context clues.

        Boost confidence by 0.05 (capped at 1.0) if a date-related keyword
        appears within 50 characters either side of ``position``. The window
        is upper-cased here, so ``text`` may be passed in any case.
        """
        # Look for nearby date keywords
        context_start = max(0, position - 50)
        context = text[context_start:position + 50].upper()

        date_keywords = ("DATE", "TIME", "TRANS", "SALE")
        if any(keyword in context for keyword in date_keywords):
            return min(1.0, base_confidence + 0.05)

        return base_confidence


# Singleton instance
date_matcher = DatePatternMatcher()
|
||||
364
ocr/app/patterns/fuel_patterns.py
Normal file
364
ocr/app/patterns/fuel_patterns.py
Normal file
@@ -0,0 +1,364 @@
|
||||
"""Fuel-specific pattern matching for receipt extraction."""
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class FuelQuantityMatch:
    """Result of fuel quantity pattern matching."""

    value: float  # Gallons or liters
    unit: str  # "GAL" or "L"
    raw_match: str  # Matched span of the upper-cased receipt text
    confidence: float  # Confidence weight of the pattern that matched
    pattern_name: str  # Name of the pattern that produced this match


@dataclass
class FuelPriceMatch:
    """Result of fuel price per unit pattern matching."""

    value: float  # Price per unit
    unit: str  # "GAL" or "L"
    raw_match: str  # Matched span of the upper-cased receipt text
    confidence: float  # Confidence weight of the pattern that matched
    pattern_name: str  # Name of the pattern that produced this match


@dataclass
class FuelGradeMatch:
    """Result of fuel grade pattern matching."""

    value: str  # e.g., "87", "89", "93", "DIESEL"
    display_name: str  # e.g., "Regular 87", "Premium 93"
    raw_match: str  # Matched span of the upper-cased receipt text
    confidence: float  # Confidence weight of the pattern that matched


class FuelPatternMatcher:
    """Extract fuel-specific data from receipt text.

    Patterns are ``(regex, pattern_name, confidence)`` tuples and are applied
    to the upper-cased receipt text in list order.
    """

    # Gallons patterns
    GALLONS_PATTERNS = [
        # XX.XXX GAL or XX.XXX GALLONS
        (
            r"(\d{1,3}\.\d{1,3})\s*(?:GAL(?:LON)?S?)",
            "gallons_suffix",
            0.95,
        ),
        # GALLONS: XX.XXX or GAL: XX.XXX
        # The lookbehind keeps "PRICE/GAL: 3.459" from being read as a
        # quantity.
        (
            r"(?<![A-Z/])(?:GAL(?:LON)?S?)[:\s]+(\d{1,3}\.\d{1,3})",
            "gallons_prefix",
            0.93,
        ),
        # VOLUME XX.XXX
        (
            r"VOLUME[:\s]+(\d{1,3}\.\d{1,3})",
            "volume",
            0.85,
        ),
        # QTY XX.XXX (near fuel context)
        (
            r"QTY[:\s]+(\d{1,3}\.\d{1,3})",
            "qty",
            0.70,
        ),
    ]

    # Liters patterns (for international receipts).
    # The unit must be a whole token (L, LT, LTR(S), LITER(S), LITRE(S)) so
    # that words merely containing an "L" (TOTAL, LOYALTY, ...) do not match.
    LITERS_PATTERNS = [
        # XX.XX L or XX.XX LITERS
        (
            r"(\d{1,3}\.\d{1,3})\s*(?:L(?:ITERS?|ITRES?|TRS?|T)?)(?![A-Z])",
            "liters_suffix",
            0.95,
        ),
        # LITERS: XX.XX
        (
            r"(?<![A-Z])(?:L(?:ITERS?|ITRES?|TRS?|T)?)[:\s]+(\d{1,3}\.\d{1,3})",
            "liters_prefix",
            0.93,
        ),
    ]

    # Price per gallon patterns
    PRICE_PER_UNIT_PATTERNS = [
        # $X.XXX/GAL or $X.XX/GAL
        (
            r"\$?\s*(\d{1,2}\.\d{2,3})\s*/\s*GAL",
            "price_per_gal",
            0.98,
        ),
        # PRICE/GAL $X.XXX
        (
            r"PRICE\s*/\s*GAL[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "labeled_price_gal",
            0.96,
        ),
        # UNIT PRICE $X.XXX
        (
            r"UNIT\s*PRICE[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "unit_price",
            0.90,
        ),
        # @ $X.XXX (per unit implied)
        (
            r"@\s*\$?\s*(\d{1,2}\.\d{2,3})",
            "at_price",
            0.85,
        ),
        # PPG $X.XXX (price per gallon)
        (
            r"PPG[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "ppg",
            0.92,
        ),
    ]

    # Fuel grade patterns. Labelled grades come first; the bare octane number
    # is the weakest signal and is therefore tried last, after DIESEL/E85.
    GRADE_PATTERNS = [
        # REGULAR 87, REG 87
        (r"(?:REGULAR|REG)\s*(\d{2})", "regular", 0.95),
        # UNLEADED 87
        (r"UNLEADED\s*(\d{2})", "unleaded", 0.93),
        # PLUS 89, MID 89, MIDGRADE 89
        (r"(?:PLUS|MID(?:GRADE)?)\s*(\d{2})", "plus", 0.95),
        # PREMIUM 91/93, PREM 91/93, SUPER 91/93
        (r"(?:PREMIUM|PREM|SUPER)\s*(\d{2})", "premium", 0.95),
        # DIESEL (no octane)
        (r"DIESEL(?:\s*#?\d)?", "diesel", 0.98),
        # E85 (ethanol blend); must not be the tail of a word ("SALE 85.00")
        # or the head of a longer number.
        (r"(?<![A-Z\d])E[\s-]*85(?!\d|[.,]\d)", "e85", 0.95),
        # Just the octane number (87, 89, 91, 93) as a standalone token, i.e.
        # not part of a price, date, time or longer number.
        (
            r"(?<![\d.,$#/:-])(87|89|91|93)(?!\d|[.,/:-]\d)(?:\s*OCT(?:ANE)?)?",
            "octane_only",
            0.75,
        ),
    ]

    # Common gas station names
    STATION_NAMES = [
        "SHELL", "CHEVRON", "EXXON", "MOBIL", "BP", "SUNOCO", "76",
        "CIRCLE K", "SPEEDWAY", "WAWA", "SHEETZ", "CASEY", "PILOT",
        "FLYING J", "LOVES", "TA", "PETRO", "MARATHON", "CITGO",
        "VALERO", "MURPHY", "COSTCO", "SAMS CLUB", "SAM'S CLUB",
        "KROGER", "QT", "QUIKTRIP", "RACETRAC", "KUM & GO",
        "KWIK TRIP", "HOLIDAY", "SINCLAIR", "CONOCO", "PHILLIPS 66",
        "ARCO", "AMPM", "AM/PM", "7-ELEVEN", "7 ELEVEN", "GETTY",
        "GULF", "HESS", "TEXACO", "TURKEY HILL", "CUMBERLAND FARMS",
    ]

    def extract_gallons(self, text: str) -> Optional[FuelQuantityMatch]:
        """
        Extract fuel quantity in gallons.

        Only the first hit of each pattern is considered; a hit outside the
        plausible range is skipped and the next pattern is tried.

        Args:
            text: Receipt text to search

        Returns:
            FuelQuantityMatch or None
        """
        return self._first_quantity(
            text, self.GALLONS_PATTERNS, unit="GAL", is_liters=False
        )

    def extract_liters(self, text: str) -> Optional[FuelQuantityMatch]:
        """
        Extract fuel quantity in liters.

        Only the first hit of each pattern is considered; a hit outside the
        plausible range is skipped and the next pattern is tried.

        Args:
            text: Receipt text to search

        Returns:
            FuelQuantityMatch or None
        """
        return self._first_quantity(
            text, self.LITERS_PATTERNS, unit="L", is_liters=True
        )

    def _first_quantity(
        self, text: str, patterns: list, unit: str, is_liters: bool
    ) -> Optional[FuelQuantityMatch]:
        """Return the first plausible quantity found by ``patterns``."""
        text_upper = text.upper()

        for pattern, name, confidence in patterns:
            match = re.search(pattern, text_upper)
            if not match:
                continue
            quantity = float(match.group(1))
            if self._is_reasonable_quantity(quantity, is_liters=is_liters):
                return FuelQuantityMatch(
                    value=quantity,
                    unit=unit,
                    raw_match=match.group(0),
                    confidence=confidence,
                    pattern_name=name,
                )

        return None

    def extract_quantity(self, text: str) -> Optional[FuelQuantityMatch]:
        """
        Extract fuel quantity (gallons or liters).

        Prefers gallons for US receipts.

        Args:
            text: Receipt text to search

        Returns:
            FuelQuantityMatch or None
        """
        # Try gallons first (more common in US)
        gallons = self.extract_gallons(text)
        if gallons:
            return gallons

        # Fall back to liters
        return self.extract_liters(text)

    def extract_price_per_unit(self, text: str) -> Optional[FuelPriceMatch]:
        """
        Extract price per gallon/liter.

        The returned unit is always "GAL"; the patterns do not distinguish
        per-liter prices.

        Args:
            text: Receipt text to search

        Returns:
            FuelPriceMatch or None
        """
        text_upper = text.upper()

        for pattern, name, confidence in self.PRICE_PER_UNIT_PATTERNS:
            match = re.search(pattern, text_upper)
            if not match:
                continue
            price = float(match.group(1))
            if self._is_reasonable_price(price):
                return FuelPriceMatch(
                    value=price,
                    unit="GAL",  # Default to gallons for US
                    raw_match=match.group(0),
                    confidence=confidence,
                    pattern_name=name,
                )

        return None

    def extract_grade(self, text: str) -> Optional[FuelGradeMatch]:
        """
        Extract fuel grade (octane rating or diesel).

        Patterns are tried in GRADE_PATTERNS order and the first hit wins.

        Args:
            text: Receipt text to search

        Returns:
            FuelGradeMatch or None
        """
        text_upper = text.upper()
        # Grades that carry no octane number: pattern name -> (value, display)
        fixed_grades = {
            "diesel": ("DIESEL", "Diesel"),
            "e85": ("E85", "E85 Ethanol"),
        }

        for pattern, name, confidence in self.GRADE_PATTERNS:
            match = re.search(pattern, text_upper)
            if not match:
                continue
            if name in fixed_grades:
                value, display = fixed_grades[name]
            else:
                value = match.group(1)
                display = self._get_grade_display_name(value, name)
            return FuelGradeMatch(
                value=value,
                display_name=display,
                raw_match=match.group(0),
                confidence=confidence,
            )

        return None

    def extract_merchant_name(self, text: str) -> Optional[tuple[str, float]]:
        """
        Extract gas station/merchant name.

        Known station names are matched as whole tokens, so short names such
        as "76", "TA" or "QT" do not fire on prices ("$45.76") or on longer
        words ("TAX", "QTY"). The first line containing the station is
        returned when it is at most 50 characters long; otherwise the
        title-cased station name is returned. If no known station is found,
        the first non-empty line is returned with low confidence unless it
        starts like a date or number.

        Args:
            text: Receipt text to search

        Returns:
            Tuple of (merchant_name, confidence) or None
        """
        lines = text.split("\n")
        upper_lines = [line.upper() for line in lines]

        # Check for known station names
        for station in self.STATION_NAMES:
            station_re = re.compile(
                r"(?<![A-Z0-9.,$#])"
                + re.escape(station)
                + r"(?![A-Z0-9]|[.,]\d)"
            )
            found = False
            for line, upper_line in zip(lines, upper_lines):
                if not station_re.search(upper_line):
                    continue
                found = True
                # Try to use the full line for context
                cleaned = line.strip()
                if len(cleaned) <= 50:  # Reasonable length
                    return (cleaned, 0.90)
            if found:
                # Every line naming the station was too long to use verbatim
                return (station.title(), 0.85)

        # Fall back to first non-empty line (often the merchant)
        non_empty = [line.strip() for line in lines if line.strip()]
        if non_empty:
            first_line = non_empty[0]
            # Skip if it looks like a date or number
            if not re.match(r"^\d+[/\-.]", first_line):
                return (first_line[:50], 0.50)  # Low confidence

        return None

    def _is_reasonable_quantity(
        self, quantity: float, is_liters: bool = False
    ) -> bool:
        """Check if fuel quantity is reasonable."""
        if is_liters:
            # Typical fill: 20-100 liters
            return 0.5 <= quantity <= 150.0
        # Typical fill: 5-30 gallons
        return 0.1 <= quantity <= 50.0

    def _is_reasonable_price(self, price: float) -> bool:
        """Check if price per unit is reasonable."""
        # US gas prices: $1.50 - $8.00 per gallon (allowing for fluctuation)
        return 1.00 <= price <= 10.00

    def _get_grade_display_name(self, octane: str, pattern_name: str) -> str:
        """Get display name for fuel grade.

        Well-known octane numbers map to a fixed name; any other number is
        named after the pattern that matched it.
        """
        grade_names = {
            "87": "Regular 87",
            "89": "Plus 89",
            "91": "Premium 91",
            "93": "Premium 93",
        }
        if octane in grade_names:
            return grade_names[octane]

        # Use pattern hint
        if pattern_name == "premium":
            return f"Premium {octane}"
        if pattern_name == "plus":
            return f"Plus {octane}"
        return f"Unleaded {octane}"


# Singleton instance
fuel_matcher = FuelPatternMatcher()
|
||||
Reference in New Issue
Block a user