feat: add receipt OCR pipeline (refs #69)

Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount, fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 20:43:30 -06:00
parent a2f0abb14c
commit 6319d50fb1
16 changed files with 2845 additions and 2 deletions
--- a/ocr/app/patterns/fuel_patterns.py
+++ b/ocr/app/patterns/fuel_patterns.py
@@ -0,0 +1,364 @@
+"""Fuel-specific pattern matching for receipt extraction."""
+import re
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class FuelQuantityMatch:
+    """Result of fuel quantity pattern matching.
+
+    Built by the FuelPatternMatcher quantity extractors from a regex match
+    against the upper-cased receipt text.
+    """
+
+    value: float  # Parsed quantity, expressed in the unit named by `unit`
+    unit: str  # "GAL" or "L"
+    raw_match: str  # Full text of the regex match (match.group(0))
+    confidence: float  # Confidence constant attached to the matching pattern
+    pattern_name: str  # Name of the matching pattern, e.g. "gallons_suffix"
+
+@dataclass
+class FuelPriceMatch:
+    """Result of fuel price per unit pattern matching.
+
+    Built by FuelPatternMatcher.extract_price_per_unit from a regex match
+    against the upper-cased receipt text.
+    """
+
+    value: float  # Parsed price for one unit of fuel
+    # "GAL" or "L"; NOTE(review): the extractor in this file always sets "GAL"
+    unit: str
+    raw_match: str  # Full text of the regex match (match.group(0))
+    confidence: float  # Confidence constant attached to the matching pattern
+    pattern_name: str  # Name of the matching pattern, e.g. "price_per_gal"
+
+@dataclass
+class FuelGradeMatch:
+    """Result of fuel grade pattern matching.
+
+    Built by FuelPatternMatcher.extract_grade from a regex match against the
+    upper-cased receipt text.
+    """
+
+    value: str  # Octane digits as text (e.g. "87", "93"), "DIESEL" or "E85"
+    display_name: str  # e.g., "Regular 87", "Premium 93", "Diesel"
+    raw_match: str  # Full text of the regex match (match.group(0))
+    confidence: float  # Confidence constant attached to the matching pattern
+
+class FuelPatternMatcher:
+    """Extract fuel-specific data from receipt text.
+
+    Each ``*_PATTERNS`` table is a list of ``(regex, pattern_name,
+    confidence)`` tuples. Input text is upper-cased before matching, so the
+    regexes only need to handle upper-case input. Tables are scanned in
+    order and the first acceptable match wins, so list order is priority
+    order.
+    """
+
+    # Gallons patterns
+    GALLONS_PATTERNS = [
+        # XX.XXX GAL or XX.XXX GALLONS. The lookbehind stops a match from
+        # starting in the middle of a longer number (e.g. "1012.345 GAL").
+        (
+            r"(?<![\d.])(\d{1,3}\.\d{1,3})\s*(?:GAL(?:LON)?S?)",
+            "gallons_suffix",
+            0.95,
+        ),
+        # GALLONS: XX.XXX or GAL: XX.XXX. "PRICE/GAL 3.459" and
+        # "PER GAL 3.459" are unit prices, not quantities, so GAL must not
+        # directly follow "/", a letter, or "PER ".
+        (
+            r"(?<![/A-Z])(?<!PER\s)(?:GAL(?:LON)?S?)[:\s]+(\d{1,3}\.\d{1,3})",
+            "gallons_prefix",
+            0.93,
+        ),
+        # VOLUME XX.XXX
+        (
+            r"VOLUME[:\s]+(\d{1,3}\.\d{1,3})",
+            "volume",
+            0.85,
+        ),
+        # QTY XX.XXX (near fuel context)
+        (
+            r"QTY[:\s]+(\d{1,3}\.\d{1,3})",
+            "qty",
+            0.70,
+        ),
+    ]
+
+    # Liters patterns (for international receipts). A bare "L" is only
+    # accepted as a standalone token: without the word boundaries
+    # "TOTAL 45.00" and "12.50 LOYALTY" would both parse as liters.
+    LITERS_PATTERNS = [
+        # XX.XX L, XX.XX LITERS or XX.XX LITRES
+        (
+            r"(?<![\d.])(\d{1,3}\.\d{1,3})\s*L(?:IT(?:ER|RE)S?)?\b",
+            "liters_suffix",
+            0.95,
+        ),
+        # LITERS: XX.XX (also L: / LITRES:)
+        (
+            r"\bL(?:IT(?:ER|RE)S?)?[:\s]+(\d{1,3}\.\d{1,3})",
+            "liters_prefix",
+            0.93,
+        ),
+    ]
+
+    # Price per gallon patterns
+    PRICE_PER_UNIT_PATTERNS = [
+        # $X.XXX/GAL or $X.XX/GAL
+        (
+            r"\$?\s*(\d{1,2}\.\d{2,3})\s*/\s*GAL",
+            "price_per_gal",
+            0.98,
+        ),
+        # PRICE/GAL $X.XXX
+        (
+            r"PRICE\s*/\s*GAL[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
+            "labeled_price_gal",
+            0.96,
+        ),
+        # UNIT PRICE $X.XXX
+        (
+            r"UNIT\s*PRICE[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
+            "unit_price",
+            0.90,
+        ),
+        # @ $X.XXX (per unit implied)
+        (
+            r"@\s*\$?\s*(\d{1,2}\.\d{2,3})",
+            "at_price",
+            0.85,
+        ),
+        # PPG $X.XXX (price per gallon)
+        (
+            r"PPG[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
+            "ppg",
+            0.92,
+        ),
+    ]
+
+    # Fuel grade patterns. The explicit DIESEL / E85 patterns are listed
+    # before the bare-octane fallback so that a stray "87" elsewhere on a
+    # diesel receipt cannot win over the explicit grade.
+    GRADE_PATTERNS = [
+        # REGULAR 87, REG 87
+        (r"(?:REGULAR|REG)\s*(\d{2})(?!\d)", "regular", 0.95),
+        # UNLEADED 87
+        (r"UNLEADED\s*(\d{2})(?!\d)", "unleaded", 0.93),
+        # PLUS 89, MID 89, MIDGRADE 89
+        (r"(?:PLUS|MID(?:GRADE)?)\s*(\d{2})(?!\d)", "plus", 0.95),
+        # PREMIUM 91/93, PREM 91/93, SUPER 91/93
+        (r"(?:PREMIUM|PREM|SUPER)\s*(\d{2})(?!\d)", "premium", 0.95),
+        # DIESEL (no octane)
+        (r"DIESEL(?:\s*#?\d)?", "diesel", 0.98),
+        # E85 / E 85 / E-85 (ethanol blend); "E" must start a word so that
+        # "PRICE 85" does not match
+        (r"\bE[\s-]*85(?!\d)", "e85", 0.95),
+        # Just the octane number near fuel context (87, 89, 91, 93). The
+        # guards reject digits that are part of an amount, date, time or
+        # identifier such as "$45.87", "#87" or "10/29/93".
+        (
+            r"(?<![\d.,$#/:-])(87|89|91|93)(?![\d.,/:])(?:\s*OCT(?:ANE)?)?",
+            "octane_only",
+            0.75,
+        ),
+    ]
+
+    # Common gas station names
+    STATION_NAMES = [
+        "SHELL", "CHEVRON", "EXXON", "MOBIL", "BP", "SUNOCO", "76",
+        "CIRCLE K", "SPEEDWAY", "WAWA", "SHEETZ", "CASEY", "PILOT",
+        "FLYING J", "LOVES", "LOVE'S", "TA", "PETRO", "MARATHON", "CITGO",
+        "VALERO", "MURPHY", "COSTCO", "SAMS CLUB", "SAM'S CLUB",
+        "KROGER", "QT", "QUIKTRIP", "RACETRAC", "KUM & GO",
+        "KWIK TRIP", "HOLIDAY", "SINCLAIR", "CONOCO", "PHILLIPS 66",
+        "ARCO", "AMPM", "AM/PM", "7-ELEVEN", "7 ELEVEN", "GETTY",
+        "GULF", "HESS", "TEXACO", "TURKEY HILL", "CUMBERLAND FARMS",
+    ]
+
+    def extract_gallons(self, text: str) -> Optional[FuelQuantityMatch]:
+        """
+        Extract fuel quantity in gallons.
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            FuelQuantityMatch or None
+        """
+        return self._extract_quantity_with(self.GALLONS_PATTERNS, text, "GAL")
+
+    def extract_liters(self, text: str) -> Optional[FuelQuantityMatch]:
+        """
+        Extract fuel quantity in liters.
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            FuelQuantityMatch or None
+        """
+        return self._extract_quantity_with(self.LITERS_PATTERNS, text, "L")
+
+    def extract_quantity(self, text: str) -> Optional[FuelQuantityMatch]:
+        """
+        Extract fuel quantity (gallons or liters).
+
+        Prefers gallons for US receipts.
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            FuelQuantityMatch or None
+        """
+        # Try gallons first (more common in US)
+        gallons = self.extract_gallons(text)
+        if gallons:
+            return gallons
+
+        # Fall back to liters
+        return self.extract_liters(text)
+
+    def extract_price_per_unit(self, text: str) -> Optional[FuelPriceMatch]:
+        """
+        Extract price per gallon/liter.
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            FuelPriceMatch or None
+        """
+        if not text:
+            return None
+
+        hit = self._first_valid_match(
+            self.PRICE_PER_UNIT_PATTERNS,
+            text.upper(),
+            lambda m, _name: self._is_reasonable_price(float(m.group(1))),
+        )
+        if hit is None:
+            return None
+
+        match, name, confidence = hit
+        return FuelPriceMatch(
+            value=float(match.group(1)),
+            unit="GAL",  # Default to gallons for US
+            raw_match=match.group(0),
+            confidence=confidence,
+            pattern_name=name,
+        )
+
+    def extract_grade(self, text: str) -> Optional[FuelGradeMatch]:
+        """
+        Extract fuel grade (octane rating, diesel or E85).
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            FuelGradeMatch or None
+        """
+        if not text:
+            return None
+
+        hit = self._first_valid_match(
+            self.GRADE_PATTERNS, text.upper(), self._is_plausible_grade
+        )
+        if hit is None:
+            return None
+
+        match, name, confidence = hit
+        if name == "diesel":
+            value, display = "DIESEL", "Diesel"
+        elif name == "e85":
+            value, display = "E85", "E85 Ethanol"
+        else:
+            value = match.group(1)
+            display = self._get_grade_display_name(value, name)
+
+        return FuelGradeMatch(
+            value=value,
+            display_name=display,
+            raw_match=match.group(0),
+            confidence=confidence,
+        )
+
+    def extract_merchant_name(self, text: str) -> Optional[tuple[str, float]]:
+        """
+        Extract gas station/merchant name.
+
+        Known station names are tried in STATION_NAMES order and must start
+        at a word boundary; see _station_regex for the exact rules. If no
+        known name is found, the first non-empty line is returned with low
+        confidence unless it looks like a date or number.
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            Tuple of (merchant_name, confidence) or None
+        """
+        if not text:
+            return None
+
+        lines = text.split("\n")
+        upper_lines = [line.upper() for line in lines]
+
+        # Check for known station names
+        for station in self.STATION_NAMES:
+            station_re = self._station_regex(station)
+            for line, upper_line in zip(lines, upper_lines):
+                if not station_re.search(upper_line):
+                    continue
+                # Prefer the full line for context when it is short enough
+                cleaned = line.strip()
+                if len(cleaned) <= 50:  # Reasonable length
+                    return (cleaned, 0.90)
+                return (station.title(), 0.85)
+
+        # Fall back to first non-empty line (often the merchant)
+        for line in lines:
+            candidate = line.strip()
+            if not candidate:
+                continue
+            # Give up if it looks like a date or number
+            if re.match(r"^\d+[/\-.]", candidate):
+                return None
+            return (candidate[:50], 0.50)  # Low confidence
+
+        return None
+
+    @staticmethod
+    def _station_regex(station: str) -> "re.Pattern[str]":
+        """Build the regex used to find one station name in an upper-cased line.
+
+        Plain substring search is too loose: "TA" occurs in "TOTAL", "QT" in
+        "QTY" and "76" in almost any amount. Every name must therefore start
+        at a word boundary and must not be the tail of an amount such as
+        "$45.76". Names of three characters or fewer must also end at a word
+        boundary; longer names keep a loose right edge so that OCR variants
+        such as "CASEYS" or "EXXONMOBIL" still match.
+        """
+        left_guard = r"(?<![A-Z0-9$#])(?<!\d[.,/:-])"
+        right_guard = r"(?![A-Z0-9])(?![.,/:-]\d)" if len(station) <= 3 else ""
+        return re.compile(left_guard + re.escape(station) + right_guard)
+
+    def _first_valid_match(
+        self, patterns: list, text: str, is_valid
+    ) -> Optional[tuple]:
+        """Return the first acceptable match from a pattern table.
+
+        Patterns are tried in table order. Within one pattern every
+        occurrence is examined, not only the first, so an implausible early
+        hit does not hide a plausible later one.
+
+        Args:
+            patterns: List of (regex, pattern_name, confidence) tuples
+            text: Upper-cased receipt text to search
+            is_valid: Callable taking (match, pattern_name), returning bool
+
+        Returns:
+            Tuple of (match, pattern_name, confidence) or None
+        """
+        for pattern, name, confidence in patterns:
+            for match in re.finditer(pattern, text):
+                if is_valid(match, name):
+                    return (match, name, confidence)
+        return None
+
+    def _extract_quantity_with(
+        self, patterns: list, text: str, unit: str
+    ) -> Optional[FuelQuantityMatch]:
+        """Shared implementation of extract_gallons and extract_liters.
+
+        Args:
+            patterns: Pattern table to scan
+            text: Receipt text to search (any case; None/empty yields None)
+            unit: "GAL" or "L"; selects the plausibility range and is stored
+                on the result
+
+        Returns:
+            FuelQuantityMatch or None
+        """
+        if not text:
+            return None
+
+        is_liters = unit == "L"
+        hit = self._first_valid_match(
+            patterns,
+            text.upper(),
+            lambda m, _name: self._is_reasonable_quantity(
+                float(m.group(1)), is_liters=is_liters
+            ),
+        )
+        if hit is None:
+            return None
+
+        match, name, confidence = hit
+        return FuelQuantityMatch(
+            value=float(match.group(1)),
+            unit=unit,
+            raw_match=match.group(0),
+            confidence=confidence,
+            pattern_name=name,
+        )
+
+    def _is_plausible_grade(self, match, pattern_name: str) -> bool:
+        """Check whether a GRADE_PATTERNS match is a believable fuel grade.
+
+        DIESEL and E85 carry no octane number and are always accepted. For
+        the octane patterns the two captured digits must lie in 80-99, which
+        rejects hits such as "REG 12" (a register number) or "PLUS 10".
+        """
+        if pattern_name in ("diesel", "e85"):
+            return True
+        return 80 <= int(match.group(1)) <= 99
+
+    def _is_reasonable_quantity(
+        self, quantity: float, is_liters: bool = False
+    ) -> bool:
+        """Check if fuel quantity is reasonable."""
+        if is_liters:
+            # Typical fill is 20-100 liters; the accepted range is wider
+            return 0.5 <= quantity <= 150.0
+        # Typical fill is 5-30 gallons; the accepted range is wider
+        return 0.1 <= quantity <= 50.0
+
+    def _is_reasonable_price(self, price: float) -> bool:
+        """Check if price per unit is reasonable."""
+        # Accepts $1.00 - $10.00 per unit, wider than typical US gas prices
+        # ($1.50 - $8.00 per gallon) to allow for fluctuation
+        return 1.00 <= price <= 10.00
+
+    def _get_grade_display_name(self, octane: str, pattern_name: str) -> str:
+        """Get display name for fuel grade.
+
+        Known octane numbers map to a fixed name; any other number is
+        labelled from the name of the pattern that matched it.
+        """
+        grade_names = {
+            "87": "Regular 87",
+            "89": "Plus 89",
+            "91": "Premium 91",
+            "93": "Premium 93",
+        }
+
+        if octane in grade_names:
+            return grade_names[octane]
+
+        # Use pattern hint
+        if pattern_name == "premium":
+            return f"Premium {octane}"
+        if pattern_name == "plus":
+            return f"Plus {octane}"
+        return f"Unleaded {octane}"
+
+
+# Shared module-level instance. FuelPatternMatcher defines no __init__ and
+# its methods only read class-level constants, so one matcher can be reused.
+fuel_matcher = FuelPatternMatcher()