feat: add receipt OCR pipeline (refs #69)

Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount, fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 20:43:30 -06:00
parent a2f0abb14c
commit 6319d50fb1
16 changed files with 2845 additions and 2 deletions
--- a/ocr/app/patterns/__init__.py
+++ b/ocr/app/patterns/__init__.py
@@ -0,0 +1,13 @@
+"""Pattern matching modules for receipt field extraction."""
+from app.patterns.date_patterns import DatePatternMatcher, date_matcher
+from app.patterns.currency_patterns import CurrencyPatternMatcher, currency_matcher
+from app.patterns.fuel_patterns import FuelPatternMatcher, fuel_matcher
+
+__all__ = [  # Public API: each matcher class plus its shared module-level instance
+    "DatePatternMatcher",
+    "date_matcher",
+    "CurrencyPatternMatcher",
+    "currency_matcher",
+    "FuelPatternMatcher",
+    "fuel_matcher",
+]
--- a/ocr/app/patterns/currency_patterns.py
+++ b/ocr/app/patterns/currency_patterns.py
@@ -0,0 +1,227 @@
+"""Currency and amount pattern matching for receipt extraction."""
+import re
+from dataclasses import dataclass
+from decimal import Decimal, InvalidOperation
+from typing import Optional
+
+
+@dataclass
+class AmountMatch:
+    """Result of currency/amount pattern matching (one matched amount)."""
+
+    value: float  # Parsed numeric amount
+    raw_match: str  # Matched text with surrounding whitespace stripped
+    confidence: float  # Confidence weight of the pattern that produced the match
+    pattern_name: str  # Name of the matching pattern, or "inferred_total"
+    label: Optional[str] = None  # e.g., "TOTAL", "AMOUNT DUE"; None for generic amounts
+
+
+class CurrencyPatternMatcher:
+    """Extract and normalize currency amounts from receipt text."""
+
+    # Labelled total patterns: (regex, pattern_name, confidence)
+    TOTAL_PATTERNS = [
+        # TOTAL $XX.XX or TOTAL: $XX.XX
+        (
+            r"(?:^|\s)TOTAL[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
+            "total_explicit",
+            0.98,
+        ),
+        # AMOUNT DUE $XX.XX
+        (
+            r"AMOUNT\s*DUE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
+            "amount_due",
+            0.95,
+        ),
+        # SALE $XX.XX
+        (
+            r"(?:^|\s)SALE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
+            "sale_explicit",
+            0.92,
+        ),
+        # GRAND TOTAL $XX.XX
+        (
+            r"GRAND\s*TOTAL[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
+            "grand_total",
+            0.97,
+        ),
+        # TOTAL SALE $XX.XX
+        (
+            r"TOTAL\s*SALE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
+            "total_sale",
+            0.96,
+        ),
+        # BALANCE DUE $XX.XX
+        (
+            r"BALANCE\s*DUE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
+            "balance_due",
+            0.94,
+        ),
+        # PURCHASE $XX.XX
+        (
+            r"(?:^|\s)PURCHASE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
+            "purchase",
+            0.88,
+        ),
+    ]
+
+    # Generic amount patterns (lower priority)
+    AMOUNT_PATTERNS = [
+        # $XX.XX (standalone dollar amount)
+        (
+            r"\$\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
+            "dollar_amount",
+            0.60,
+        ),
+        # XX.XX (standalone decimal amount)
+        (
+            r"(?<![.$\d])(\d{1,6}[.,]\d{2})(?![.\d])",
+            "decimal_amount",
+            0.40,
+        ),
+    ]
+
+    def extract_total(self, text: str) -> Optional[AmountMatch]:
+        """
+        Extract the total amount from receipt text.
+
+        The highest-confidence labelled total wins; ties keep table order.
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            AmountMatch for total or None if not found
+        """
+        text_upper = text.upper()
+
+        # Scan every occurrence of every label so an unreasonable first hit
+        # ("TOTAL 0.00") or a looser label ("SALE" in "TOTAL SALE") cannot win
+        best: Optional[AmountMatch] = None
+        for pattern, name, confidence in self.TOTAL_PATTERNS:
+            if best is not None and confidence <= best.confidence:
+                continue
+            for match in re.finditer(pattern, text_upper, re.MULTILINE):
+                amount = self._parse_amount(match.group(1))
+                if amount is not None and self._is_reasonable_total(amount):
+                    best = AmountMatch(
+                        value=amount,
+                        raw_match=match.group(0).strip(),
+                        confidence=confidence,
+                        pattern_name=name,
+                        label=self._extract_label(name),
+                    )
+                    break
+        if best is not None:
+            return best
+
+        # Fall back to the largest reasonable amount; confidence is capped
+        amounts = self.extract_all_amounts(text)
+        candidates = [a for a in amounts if self._is_reasonable_total(a.value)]
+        if not candidates:
+            return None
+        largest = max(candidates, key=lambda a: a.value)
+        return AmountMatch(
+            value=largest.value, raw_match=largest.raw_match,
+            confidence=min(0.60, largest.confidence),
+            pattern_name="inferred_total", label="TOTAL",
+        )
+
+    def extract_all_amounts(self, text: str) -> list[AmountMatch]:
+        """
+        Extract all currency amounts from text.
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            List of AmountMatch objects
+        """
+        matches = []
+        text_upper = text.upper()
+
+        # Check total patterns
+        for pattern, name, confidence in self.TOTAL_PATTERNS:
+            for match in re.finditer(pattern, text_upper, re.MULTILINE):
+                amount = self._parse_amount(match.group(1))
+                if amount is not None:
+                    matches.append(
+                        AmountMatch(
+                            value=amount,
+                            raw_match=match.group(0).strip(),
+                            confidence=confidence,
+                            pattern_name=name,
+                            label=self._extract_label(name),
+                        )
+                    )
+
+        # Check generic amount patterns
+        for pattern, name, confidence in self.AMOUNT_PATTERNS:
+            for match in re.finditer(pattern, text_upper):
+                amount = self._parse_amount(match.group(1))
+                if amount is not None:
+                    # Skip if the same value was already recorded (within one cent)
+                    if not any(abs(m.value - amount) < 0.01 for m in matches):
+                        matches.append(
+                            AmountMatch(
+                                value=amount,
+                                raw_match=match.group(0).strip(),
+                                confidence=confidence,
+                                pattern_name=name,
+                            )
+                        )
+
+        return matches
+
+    def _parse_amount(self, amount_str: str) -> Optional[float]:
+        """Parse amount string to a non-negative float, or None if unparseable."""
+        # Remove any spaces
+        cleaned = amount_str.strip().replace(" ", "")
+
+        # Handle European format (1.234,56) vs US format (1,234.56)
+        # When both separators appear, the right-most one is the decimal mark
+        if "," in cleaned and "." in cleaned:
+            # Determine which is decimal separator (last one)
+            if cleaned.rfind(",") > cleaned.rfind("."):
+                # European format
+                cleaned = cleaned.replace(".", "").replace(",", ".")
+            else:
+                # US format
+                cleaned = cleaned.replace(",", "")
+        elif "," in cleaned:
+            # Could be thousands separator or decimal
+            parts = cleaned.split(",")
+            if len(parts) == 2 and len(parts[1]) == 2:
+                # Likely decimal separator
+                cleaned = cleaned.replace(",", ".")
+            else:
+                # Likely thousands separator
+                cleaned = cleaned.replace(",", "")
+
+        try:
+            amount = float(Decimal(cleaned))
+            return amount if amount >= 0 else None
+        except (InvalidOperation, ValueError):
+            return None
+
+    def _is_reasonable_total(self, amount: float) -> bool:
+        """Check if amount is a reasonable total for a fuel receipt."""
+        # Reasonable range: $1 to $500 for typical fuel purchases
+        return 1.0 <= amount <= 500.0
+
+    def _extract_label(self, pattern_name: str) -> str:
+        """Map a pattern name to its display label ("TOTAL" if unknown)."""
+        labels = {
+            "total_explicit": "TOTAL",
+            "amount_due": "AMOUNT DUE",
+            "sale_explicit": "SALE",
+            "grand_total": "GRAND TOTAL",
+            "total_sale": "TOTAL SALE",
+            "balance_due": "BALANCE DUE",
+            "purchase": "PURCHASE",
+        }
+        return labels.get(pattern_name, "TOTAL")
+
+
+# Shared module-level instance (re-exported by app.patterns)
+currency_matcher = CurrencyPatternMatcher()
--- a/ocr/app/patterns/date_patterns.py
+++ b/ocr/app/patterns/date_patterns.py
@@ -0,0 +1,186 @@
+"""Date pattern matching for receipt extraction."""
+import re
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Optional
+
+
+@dataclass
+class DateMatch:
+    """Result of date pattern matching."""
+
+    value: str  # ISO format YYYY-MM-DD
+    raw_match: str  # Original text matched
+    confidence: float  # Pattern weight, raised by 0.05 (max 1.0) near a date keyword
+    pattern_name: str  # Name of the PATTERNS entry that matched, e.g. "iso_date"
+
+
+class DatePatternMatcher:
+    """Extract and normalize dates from receipt text."""
+
+    MONTH_NAMES = {
+        "jan": 1, "january": 1,
+        "feb": 2, "february": 2,
+        "mar": 3, "march": 3,
+        "apr": 4, "april": 4,
+        "may": 5,
+        "jun": 6, "june": 6,
+        "jul": 7, "july": 7,
+        "aug": 8, "august": 8,
+        "sep": 9, "sept": 9, "september": 9,
+        "oct": 10, "october": 10,
+        "nov": 11, "november": 11,
+        "dec": 12, "december": 12,
+    }
+
+    # Regex alternation over every MONTH_NAMES key (abbreviated and full names),
+    # longest first; relies on the re.IGNORECASE flag used by extract_dates
+    _MONTH_RE = "(?P<month_name>%s)" % "|".join(
+        sorted(MONTH_NAMES, key=len, reverse=True)
+    )
+
+    # Pattern definitions with named groups and confidence weights
+    PATTERNS = [
+        # MM/DD/YYYY or MM/DD/YY (most common US format); never starts mid-number
+        (
+            r"(?<!\d)(?P<month>\d{1,2})/(?P<day>\d{1,2})/(?P<year>\d{2,4})",
+            "mm_dd_yyyy", 0.95,
+        ),
+        # MM-DD-YYYY or MM-DD-YY; lookbehind keeps "2012-01-15" from reading as 12-01-15
+        (
+            r"(?<!\d)(?P<month>\d{1,2})-(?P<day>\d{1,2})-(?P<year>\d{2,4})",
+            "mm_dd_yyyy_dash", 0.90,
+        ),
+        # YYYY-MM-DD (ISO format)
+        (
+            r"(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})",
+            "iso_date", 0.98,
+        ),
+        # Mon DD, YYYY or Month DD, YYYY (e.g., Jan 15, 2024 / January 15, 2024)
+        (
+            _MONTH_RE + r"\.?\s+(?P<day>\d{1,2}),?\s+(?P<year>\d{4})",
+            "month_name_long", 0.85,
+        ),
+        # DD Mon YYYY or DD Month YYYY (e.g., 15 Jan 2024 / 15 September 2024)
+        (
+            r"(?P<day>\d{1,2})\s+" + _MONTH_RE + r"\.?\s+(?P<year>\d{4})",
+            "day_month_year", 0.85,
+        ),
+        # MMDDYYYY or MMDDYY (no separators, common in some POS systems)
+        (
+            r"(?<!\d)(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{2,4})(?!\d)",
+            "compact_date", 0.70,
+        ),
+    ]
+
+    def extract_dates(self, text: str) -> list[DateMatch]:
+        """
+        Extract all date patterns from text.
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            List of DateMatch objects sorted by confidence
+        """
+        matches = []
+
+        for pattern, name, base_confidence in self.PATTERNS:
+            for match in re.finditer(pattern, text, re.IGNORECASE):
+                parsed = self._parse_match(match, name)
+                if parsed:
+                    year, month, day = parsed
+                    if self._is_valid_date(year, month, day):
+                        # Adjust confidence based on context; pass the searched
+                        # text itself so match offsets index the right window
+                        confidence = self._adjust_confidence(
+                            base_confidence, text, match.start()
+                        )
+                        matches.append(
+                            DateMatch(
+                                value=f"{year:04d}-{month:02d}-{day:02d}",
+                                raw_match=match.group(0),
+                                confidence=confidence,
+                                pattern_name=name,
+                            )
+                        )
+
+        # Sort by confidence, deduplicate by value
+        matches.sort(key=lambda x: x.confidence, reverse=True)
+        seen = set()
+        unique_matches = []
+        for match in matches:
+            if match.value not in seen:
+                seen.add(match.value)
+                unique_matches.append(match)
+
+        return unique_matches
+
+    def extract_best_date(self, text: str) -> Optional[DateMatch]:
+        """
+        Extract the most likely transaction date.
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            Best DateMatch or None if no date found
+        """
+        matches = self.extract_dates(text)
+        return matches[0] if matches else None
+
+    def _parse_match(
+        self, match: re.Match, pattern_name: str
+    ) -> Optional[tuple[int, int, int]]:
+        """Parse regex match into year, month, day tuple (None if month unknown)."""
+        groups = match.groupdict()
+
+        # Handle month name patterns
+        if "month_name" in groups:
+            month_str = groups["month_name"].lower()
+            month = self.MONTH_NAMES.get(month_str)
+            if not month:
+                return None
+        else:
+            month = int(groups["month"])
+
+        day = int(groups["day"])
+        year = int(groups["year"])
+
+        # Normalize 2-digit years: 00-49 -> 20xx, 50-99 -> 19xx
+        if year < 100:
+            year = 2000 + year if year < 50 else 1900 + year
+
+        return year, month, day
+
+    def _is_valid_date(self, year: int, month: int, day: int) -> bool:
+        """Check if date components form a valid date."""
+        try:
+            datetime(year=year, month=month, day=day)
+            # Reasonable year range for receipts
+            return 2000 <= year <= 2100
+        except ValueError:
+            return False
+
+    def _adjust_confidence(
+        self, base_confidence: float, text: str, position: int
+    ) -> float:
+        """
+        Adjust confidence based on context clues.
+
+        Boost confidence if date appears near date-related keywords.
+        """
+        # Look for nearby date keywords; upper-case only the window so offsets stay valid
+        context_start = max(0, position - 50)
+        context = text[context_start:position + 50].upper()
+
+        date_keywords = ["DATE", "TIME", "TRANS", "SALE"]
+        for keyword in date_keywords:
+            if keyword in context:
+                return min(1.0, base_confidence + 0.05)
+
+        return base_confidence
+
+
+# Shared module-level instance (re-exported by app.patterns)
+date_matcher = DatePatternMatcher()
--- a/ocr/app/patterns/fuel_patterns.py
+++ b/ocr/app/patterns/fuel_patterns.py
@@ -0,0 +1,364 @@
+"""Fuel-specific pattern matching for receipt extraction."""
+import re
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class FuelQuantityMatch:
+    """Result of fuel quantity pattern matching."""
+
+    value: float  # Gallons or liters
+    unit: str  # "GAL" or "L"
+    raw_match: str  # Full regex match, taken from the upper-cased receipt text
+    confidence: float  # Confidence weight of the pattern that matched
+    pattern_name: str  # e.g. "gallons_suffix", "liters_prefix"
+
+
+@dataclass
+class FuelPriceMatch:
+    """Result of fuel price per unit pattern matching."""
+
+    value: float  # Price per unit as printed on the receipt
+    unit: str  # "GAL" or "L" (extract_price_per_unit currently always sets "GAL")
+    raw_match: str  # Full regex match, taken from the upper-cased receipt text
+    confidence: float  # Confidence weight of the pattern that matched
+    pattern_name: str  # e.g. "price_per_gal", "unit_price"
+
+
+@dataclass
+class FuelGradeMatch:
+    """Result of fuel grade pattern matching."""
+
+    value: str  # e.g., "87", "89", "93", "DIESEL"
+    display_name: str  # e.g., "Regular 87", "Premium 93"
+    raw_match: str  # Full regex match, taken from the upper-cased receipt text
+    confidence: float  # Confidence weight of the pattern that matched
+
+
+class FuelPatternMatcher:
+    """Extract fuel-specific data from receipt text."""
+
+    # Gallons patterns
+    GALLONS_PATTERNS = [
+        # XX.XXX GAL or XX.XXX GALLONS (same line, so a price above "GALLONS" is not used)
+        (
+            r"(\d{1,3}\.\d{1,3})[ \t]*(?:GAL(?:LON)?S?)",
+            "gallons_suffix",
+            0.95,
+        ),
+        # GALLONS: XX.XXX or GAL: XX.XXX (but not the unit in "PRICE/GAL 3.459")
+        (
+            r"(?<![/A-Z])(?:GAL(?:LON)?S?)[:\s]+(\d{1,3}\.\d{1,3})",
+            "gallons_prefix",
+            0.93,
+        ),
+        # VOLUME XX.XXX
+        (
+            r"VOLUME[:\s]+(\d{1,3}\.\d{1,3})",
+            "volume",
+            0.85,
+        ),
+        # QTY XX.XXX (near fuel context)
+        (
+            r"QTY[:\s]+(\d{1,3}\.\d{1,3})",
+            "qty",
+            0.70,
+        ),
+    ]
+
+    # Liters patterns (for international receipts)
+    LITERS_PATTERNS = [
+        # XX.XX L or XX.XX LITERS / LITRES (whole word, same line)
+        (
+            r"(\d{1,3}\.\d{1,3})[ \t]*L(?:ITERS?|ITRES?)?\b",
+            "liters_suffix",
+            0.95,
+        ),
+        # LITERS: XX.XX (whole word, so the "L" of "TOTAL 45.00" does not count)
+        (
+            r"\bL(?:ITERS?|ITRES?)?[:\s]+(\d{1,3}\.\d{1,3})",
+            "liters_prefix",
+            0.93,
+        ),
+    ]
+
+    # Price per gallon patterns
+    PRICE_PER_UNIT_PATTERNS = [
+        # $X.XXX/GAL or $X.XX/GAL
+        (
+            r"\$?\s*(\d{1,2}\.\d{2,3})\s*/\s*GAL",
+            "price_per_gal",
+            0.98,
+        ),
+        # PRICE/GAL $X.XXX
+        (
+            r"PRICE\s*/\s*GAL[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
+            "labeled_price_gal",
+            0.96,
+        ),
+        # UNIT PRICE $X.XXX
+        (
+            r"UNIT\s*PRICE[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
+            "unit_price",
+            0.90,
+        ),
+        # @ $X.XXX (per unit implied)
+        (
+            r"@\s*\$?\s*(\d{1,2}\.\d{2,3})",
+            "at_price",
+            0.85,
+        ),
+        # PPG $X.XXX (price per gallon)
+        (
+            r"PPG[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
+            "ppg",
+            0.92,
+        ),
+    ]
+
+    # Fuel grade patterns (first match wins, so unambiguous ones come first)
+    GRADE_PATTERNS = [
+        # REGULAR 87, REG 87 (octane limited to 80-99 so "REG 01" register ids are skipped)
+        (r"(?:REGULAR|REG)\s*([89]\d)(?!\d)", "regular", 0.95),
+        # UNLEADED 87
+        (r"UNLEADED\s*([89]\d)(?!\d)", "unleaded", 0.93),
+        # PLUS 89, MID 89, MIDGRADE 89
+        (r"(?:PLUS|MID(?:GRADE)?)\s*([89]\d)(?!\d)", "plus", 0.95),
+        # PREMIUM 91/93, PREM 91/93, SUPER 91/93
+        (r"(?:PREMIUM|PREM|SUPER)\s*([89]\d)(?!\d)", "premium", 0.95),
+        # DIESEL (no octane)
+        (r"DIESEL(?:\s*#?\d)?", "diesel", 0.98),
+        # E85 (ethanol blend); must not start mid-word ("SALE 85.20")
+        (r"(?<![A-Z0-9])E[\s-]?85(?![\d.,])", "e85", 0.95),
+        # Bare octane number (87, 89, 91, 93) that is not part of a price, date, time or id
+        (r"(?<![\d.,:/$#])(?<!\d-)(87|89|91|93)(?![\d.,:/%]|-\d)(?:\s*OCT(?:ANE)?)?", "octane_only", 0.75),
+    ]
+
+    # Common gas station names
+    STATION_NAMES = [
+        "SHELL", "CHEVRON", "EXXON", "MOBIL", "BP", "SUNOCO", "76",
+        "CIRCLE K", "SPEEDWAY", "WAWA", "SHEETZ", "CASEY", "PILOT",
+        "FLYING J", "LOVES", "TA", "PETRO", "MARATHON", "CITGO",
+        "VALERO", "MURPHY", "COSTCO", "SAMS CLUB", "SAM'S CLUB",
+        "KROGER", "QT", "QUIKTRIP", "RACETRAC", "KUM & GO",
+        "KWIK TRIP", "HOLIDAY", "SINCLAIR", "CONOCO", "PHILLIPS 66",
+        "ARCO", "AMPM", "AM/PM", "7-ELEVEN", "7 ELEVEN", "GETTY",
+        "GULF", "HESS", "TEXACO", "TURKEY HILL", "CUMBERLAND FARMS",
+    ]
+
+    def extract_gallons(self, text: str) -> Optional[FuelQuantityMatch]:
+        """
+        Extract fuel quantity in gallons.
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            FuelQuantityMatch or None
+        """
+        text_upper = text.upper()
+
+        for pattern, name, confidence in self.GALLONS_PATTERNS:
+            match = re.search(pattern, text_upper)
+            if match:
+                quantity = float(match.group(1))
+                if self._is_reasonable_quantity(quantity):
+                    return FuelQuantityMatch(
+                        value=quantity,
+                        unit="GAL",
+                        raw_match=match.group(0),
+                        confidence=confidence,
+                        pattern_name=name,
+                    )
+
+        return None
+
+    def extract_liters(self, text: str) -> Optional[FuelQuantityMatch]:
+        """
+        Extract fuel quantity in liters.
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            FuelQuantityMatch or None
+        """
+        text_upper = text.upper()
+
+        for pattern, name, confidence in self.LITERS_PATTERNS:
+            match = re.search(pattern, text_upper)
+            if match:
+                quantity = float(match.group(1))
+                if self._is_reasonable_quantity(quantity, is_liters=True):
+                    return FuelQuantityMatch(
+                        value=quantity,
+                        unit="L",
+                        raw_match=match.group(0),
+                        confidence=confidence,
+                        pattern_name=name,
+                    )
+
+        return None
+
+    def extract_quantity(self, text: str) -> Optional[FuelQuantityMatch]:
+        """
+        Extract fuel quantity (gallons or liters).
+
+        Prefers gallons for US receipts.
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            FuelQuantityMatch or None
+        """
+        # Try gallons first (more common in US)
+        gallons = self.extract_gallons(text)
+        if gallons:
+            return gallons
+
+        # Fall back to liters
+        return self.extract_liters(text)
+
+    def extract_price_per_unit(self, text: str) -> Optional[FuelPriceMatch]:
+        """
+        Extract price per gallon/liter.
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            FuelPriceMatch or None
+        """
+        text_upper = text.upper()
+
+        for pattern, name, confidence in self.PRICE_PER_UNIT_PATTERNS:
+            match = re.search(pattern, text_upper)
+            if match:
+                price = float(match.group(1))
+                if self._is_reasonable_price(price):
+                    return FuelPriceMatch(
+                        value=price,
+                        unit="GAL",  # Default to gallons for US
+                        raw_match=match.group(0),
+                        confidence=confidence,
+                        pattern_name=name,
+                    )
+
+        return None
+
+    def extract_grade(self, text: str) -> Optional[FuelGradeMatch]:
+        """
+        Extract fuel grade (octane rating or diesel).
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            FuelGradeMatch or None
+        """
+        text_upper = text.upper()
+
+        for pattern, name, confidence in self.GRADE_PATTERNS:
+            match = re.search(pattern, text_upper)
+            if match:
+                if name == "diesel":
+                    return FuelGradeMatch(
+                        value="DIESEL",
+                        display_name="Diesel",
+                        raw_match=match.group(0),
+                        confidence=confidence,
+                    )
+                elif name == "e85":
+                    return FuelGradeMatch(
+                        value="E85",
+                        display_name="E85 Ethanol",
+                        raw_match=match.group(0),
+                        confidence=confidence,
+                    )
+                else:
+                    octane = match.group(1)
+                    display = self._get_grade_display_name(octane, name)
+                    return FuelGradeMatch(
+                        value=octane,
+                        display_name=display,
+                        raw_match=match.group(0),
+                        confidence=confidence,
+                    )
+
+        return None
+
+    def extract_merchant_name(self, text: str) -> Optional[tuple[str, float]]:
+        """
+        Extract gas station/merchant name.
+
+        Known station names must match as whole tokens, so short names such
+        as "TA", "QT" or "76" do not fire on "TOTAL", "QTY" or "$45.76".
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            Tuple of (merchant_name, confidence) or None
+        """
+        lines = [line.strip() for line in text.split("\n") if line.strip()]
+
+        # Check for known station names (table order decides priority)
+        for station in self.STATION_NAMES:
+            token = re.compile(
+                r"(?<![A-Z0-9.,:/$#-])" + re.escape(station)
+                + r"(?![A-Z0-9]|[.,:/-]\d)"
+            )
+            for line in lines:
+                if token.search(line.upper()):
+                    # Prefer the full line for context when it is short enough
+                    if len(line) <= 50:
+                        return (line, 0.90)
+                    return (station.title(), 0.85)
+
+        # Fall back to first non-empty line unless it looks like a date or number
+        if lines and not re.match(r"^\d+[/\-.]", lines[0]):
+            return (lines[0][:50], 0.50)  # Low confidence
+
+        return None
+
+    def _is_reasonable_quantity(
+        self, quantity: float, is_liters: bool = False
+    ) -> bool:
+        """Check if fuel quantity is reasonable."""
+        if is_liters:
+            # Typical fill: 20-100 liters (accepted range is wider: 0.5-150)
+            return 0.5 <= quantity <= 150.0
+        else:
+            # Typical fill: 5-30 gallons (accepted range is wider: 0.1-50)
+            return 0.1 <= quantity <= 50.0
+
+    def _is_reasonable_price(self, price: float) -> bool:
+        """Check if price per unit is reasonable."""
+        # Accepts 1.00-10.00 inclusive, wider than typical US per-gallon prices
+        return 1.00 <= price <= 10.00
+
+    def _get_grade_display_name(self, octane: str, pattern_name: str) -> str:
+        """Get display name for fuel grade."""
+        grade_names = {
+            "87": "Regular 87",
+            "89": "Plus 89",
+            "91": "Premium 91",
+            "93": "Premium 93",
+        }
+
+        if octane in grade_names:
+            return grade_names[octane]
+
+        # Use pattern hint
+        if pattern_name == "premium":
+            return f"Premium {octane}"
+        elif pattern_name == "plus":
+            return f"Plus {octane}"
+        else:
+            return f"Unleaded {octane}"
+
+
+# Shared module-level instance (re-exported by app.patterns)
+fuel_matcher = FuelPatternMatcher()