feat: add receipt OCR pipeline (refs #69)

Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount, fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 20:43:30 -06:00
parent a2f0abb14c
commit 6319d50fb1
16 changed files with 2845 additions and 2 deletions
--- a/ocr/app/patterns/date_patterns.py
+++ b/ocr/app/patterns/date_patterns.py
@@ -0,0 +1,186 @@
+"""Date pattern matching for receipt extraction."""
+import re
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Optional
+
+
+@dataclass
+class DateMatch:
+    """One date found in receipt text, with its normalized value."""
+
+    value: str  # Normalized date, formatted YYYY-MM-DD
+    raw_match: str  # Exact substring of the input that the pattern matched
+    confidence: float  # Pattern base weight, +0.05 near a date keyword (max 1.0)
+    pattern_name: str  # Name of the PATTERNS entry that produced this match
+
+
+class DatePatternMatcher:
+    """Extract and normalize dates from receipt text."""
+
+    MONTH_NAMES = {
+        "jan": 1, "january": 1,
+        "feb": 2, "february": 2,
+        "mar": 3, "march": 3,
+        "apr": 4, "april": 4,
+        "may": 5,
+        "jun": 6, "june": 6,
+        "jul": 7, "july": 7,
+        "aug": 8, "august": 8,
+        "sep": 9, "sept": 9, "september": 9,
+        "oct": 10, "october": 10,
+        "nov": 11, "november": 11,
+        "dec": 12, "december": 12,
+    }
+    # Month-name alternation, longest first so "september" wins over "sep"
+    _MONTH_RE = "|".join(sorted(MONTH_NAMES, key=len, reverse=True))
+
+    # (regex, pattern name, base confidence). The (?<!\d) / (?!\d) guards
+    # stop a pattern from matching inside a longer digit run, e.g. reading
+    # the tail of ISO "2012-10-05" as the MM-DD-YY date "12-10-05".
+    PATTERNS = [
+        # MM/DD/YYYY or MM/DD/YY (most common US format)
+        (
+            r"(?<!\d)(?P<month>\d{1,2})/(?P<day>\d{1,2})/(?P<year>\d{4}|\d{2})(?!\d)",
+            "mm_dd_yyyy",
+            0.95,
+        ),
+        # MM-DD-YYYY or MM-DD-YY
+        (
+            r"(?<!\d)(?P<month>\d{1,2})-(?P<day>\d{1,2})-(?P<year>\d{4}|\d{2})(?!\d)",
+            "mm_dd_yyyy_dash",
+            0.90,
+        ),
+        # YYYY-MM-DD (ISO format)
+        (
+            r"(?<!\d)(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})(?!\d)",
+            "iso_date",
+            0.98,
+        ),
+        # Mon DD, YYYY or Month DD, YYYY (e.g., Jan 15, 2024)
+        (
+            r"(?P<month_name>" + _MONTH_RE + r")\.?\s+"
+            r"(?P<day>\d{1,2}),?\s+(?P<year>\d{4})(?!\d)",
+            "month_name_long",
+            0.85,
+        ),
+        # DD Mon YYYY or DD Month YYYY (e.g., 15 Jan 2024)
+        (
+            r"(?<!\d)(?P<day>\d{1,2})\s+(?P<month_name>" + _MONTH_RE + r")\.?\s+"
+            r"(?P<year>\d{4})(?!\d)",
+            "day_month_year",
+            0.85,
+        ),
+        # MMDDYYYY or MMDDYY (no separators, common in some POS systems)
+        (
+            r"(?<!\d)(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{4}|\d{2})(?!\d)",
+            "compact_date",
+            0.70,
+        ),
+    ]
+
+    def extract_dates(self, text: str) -> list[DateMatch]:
+        """
+        Extract every recognizable date from text.
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            DateMatch objects, highest confidence first, one per distinct
+            ISO date value (the highest-confidence match for it is kept)
+        """
+        matches = []
+        for pattern, name, base_confidence in self.PATTERNS:
+            for match in re.finditer(pattern, text, re.IGNORECASE):
+                parsed = self._parse_match(match, name)
+                if not parsed or not self._is_valid_date(*parsed):
+                    continue
+                year, month, day = parsed
+                matches.append(
+                    DateMatch(
+                        value=f"{year:04d}-{month:02d}-{day:02d}",
+                        raw_match=match.group(0),
+                        # Nearby date keywords nudge the confidence upward
+                        confidence=self._adjust_confidence(
+                            base_confidence, text, match.start()
+                        ),
+                        pattern_name=name,
+                    )
+                )
+
+        # Stable sort by confidence, then keep the first match per value
+        matches.sort(key=lambda x: x.confidence, reverse=True)
+        seen = set()
+        unique_matches = []
+        for match in matches:
+            if match.value not in seen:
+                seen.add(match.value)
+                unique_matches.append(match)
+
+        return unique_matches
+
+    def extract_best_date(self, text: str) -> Optional[DateMatch]:
+        """
+        Extract the most likely transaction date.
+
+        Args:
+            text: Receipt text to search
+
+        Returns:
+            Highest-confidence DateMatch, or None if no date was found
+        """
+        matches = self.extract_dates(text)
+        return matches[0] if matches else None
+
+    def _parse_match(
+        self, match: re.Match, pattern_name: str
+    ) -> Optional[tuple[int, int, int]]:
+        """
+        Convert a regex match into a (year, month, day) tuple.
+
+        Returns None for a month name missing from MONTH_NAMES. Years
+        captured as two digits map 00-49 to 20xx and 50-99 to 19xx.
+        """
+        groups = match.groupdict()
+        # Month-name patterns carry "month_name" instead of a numeric "month"
+        if "month_name" in groups:
+            month = self.MONTH_NAMES.get(groups["month_name"].lower())
+            if not month:
+                return None
+        else:
+            month = int(groups["month"])
+
+        year = int(groups["year"])
+        # Expand only real two-digit captures; "0024" must not become 2024
+        if len(groups["year"]) == 2:
+            year = 2000 + year if year < 50 else 1900 + year
+        return year, month, int(groups["day"])
+
+    def _is_valid_date(self, year: int, month: int, day: int) -> bool:
+        """Return True for a real calendar date in the years 2000-2100."""
+        try:
+            datetime(year=year, month=month, day=day)
+        except ValueError:
+            return False
+        # Reasonable year range for receipts
+        return 2000 <= year <= 2100
+
+    def _adjust_confidence(
+        self, base_confidence: float, text: str, position: int
+    ) -> float:
+        """
+        Add 0.05 (capped at 1.0) when a date keyword is near the match.
+
+        The window is 50 characters either side of position. It is sliced
+        before upper-casing because str.upper() can change a string's
+        length, which would shift position in an upper-cased copy.
+        """
+        window = text[max(0, position - 50):position + 50].upper()
+        if any(keyword in window for keyword in ("DATE", "TIME", "TRANS", "SALE")):
+            return min(1.0, base_confidence + 0.05)
+        return base_confidence
+
+
+# Shared module-level instance; DatePatternMatcher keeps no per-call state
+date_matcher = DatePatternMatcher()