feat: add receipt OCR pipeline (refs #69)

Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount, fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 20:43:30 -06:00
parent a2f0abb14c
commit 6319d50fb1
16 changed files with 2845 additions and 2 deletions
--- a/ocr/tests/test_currency_patterns.py
+++ b/ocr/tests/test_currency_patterns.py
@@ -0,0 +1,198 @@
+"""Tests for currency pattern matching."""
+import pytest
+
+from app.patterns.currency_patterns import CurrencyPatternMatcher, currency_matcher
+
+
+class TestCurrencyPatternMatcher:
+    """Exercise total and amount extraction on receipt text."""
+
+    def test_total_explicit(self) -> None:
+        """A bare 'TOTAL $XX.XX' line is matched with high confidence."""
+        receipt = "TOTAL $45.67"
+        found = currency_matcher.extract_total(receipt)
+
+        assert found is not None
+        assert found.value == 45.67
+        assert found.confidence > 0.9
+        assert found.label == "TOTAL"
+
+    def test_total_with_colon(self) -> None:
+        """A colon after the label ('TOTAL: $XX.XX') still yields the amount."""
+        receipt = "TOTAL: $45.67"
+        found = currency_matcher.extract_total(receipt)
+
+        assert found is not None
+        assert found.value == 45.67
+
+    def test_total_without_dollar_sign(self) -> None:
+        """The amount is found even when no '$' precedes it ('TOTAL 45.67')."""
+        receipt = "TOTAL 45.67"
+        found = currency_matcher.extract_total(receipt)
+
+        assert found is not None
+        assert found.value == 45.67
+
+    def test_amount_due(self) -> None:
+        """'AMOUNT DUE' is accepted as a total label and reported as such."""
+        receipt = "AMOUNT DUE: $45.67"
+        found = currency_matcher.extract_total(receipt)
+
+        assert found is not None
+        assert found.value == 45.67
+        assert found.label == "AMOUNT DUE"
+
+    def test_sale_pattern(self) -> None:
+        """'SALE $XX.XX' is treated as a total line."""
+        receipt = "SALE $45.67"
+        found = currency_matcher.extract_total(receipt)
+
+        assert found is not None
+        assert found.value == 45.67
+
+    def test_grand_total(self) -> None:
+        """'GRAND TOTAL' is matched and reported as the label."""
+        receipt = "GRAND TOTAL $45.67"
+        found = currency_matcher.extract_total(receipt)
+
+        assert found is not None
+        assert found.value == 45.67
+        assert found.label == "GRAND TOTAL"
+
+    def test_total_sale(self) -> None:
+        """'TOTAL SALE: $XX.XX' yields the amount."""
+        receipt = "TOTAL SALE: $45.67"
+        found = currency_matcher.extract_total(receipt)
+
+        assert found is not None
+        assert found.value == 45.67
+
+    def test_balance_due(self) -> None:
+        """'BALANCE DUE $XX.XX' yields the amount."""
+        receipt = "BALANCE DUE $45.67"
+        found = currency_matcher.extract_total(receipt)
+
+        assert found is not None
+        assert found.value == 45.67
+
+    def test_multiple_amounts_picks_total(self) -> None:
+        """With several amounts present, the explicit TOTAL line is chosen."""
+        receipt = """
+        REGULAR 87
+        10.500 GAL @ $3.67
+        SUBTOTAL $38.54
+        TAX $0.00
+        TOTAL $38.54
+        """
+        found = currency_matcher.extract_total(receipt)
+
+        assert found is not None
+        assert found.value == 38.54
+        assert found.pattern_name == "total_explicit"
+
+    def test_all_amounts(self) -> None:
+        """extract_all_amounts returns at least the TOTAL amount."""
+        receipt = """
+        SUBTOTAL $35.00
+        TAX $3.54
+        TOTAL $38.54
+        """
+        amounts = currency_matcher.extract_all_amounts(receipt)
+
+        # TOTAL must be present; other amounts may or may not be returned
+        assert len(amounts) >= 1
+        assert any(amount.value == 38.54 for amount in amounts)
+
+    def test_comma_thousand_separator(self) -> None:
+        """A comma thousands separator ('$1,234.56') parses as 1234.56."""
+        receipt = "TOTAL $1,234.56"
+        found = currency_matcher.extract_total(receipt)
+
+        assert found is not None
+        assert found.value == 1234.56
+
+    def test_reasonable_total_range(self) -> None:
+        """Implausibly small totals are rejected; ordinary ones are kept."""
+        # A total of only a few cents
+        tiny_receipt = "TOTAL $0.05"
+        rejected = currency_matcher.extract_total(tiny_receipt)
+        assert rejected is None  # Five cents is too small for a fuel receipt
+
+        # An ordinary fuel purchase
+        normal_receipt = "TOTAL $45.67"
+        accepted = currency_matcher.extract_total(normal_receipt)
+        assert accepted is not None
+
+    def test_receipt_context_extraction(self) -> None:
+        """The total is picked out of a full, realistic receipt body."""
+        receipt = """
+        SHELL
+        123 MAIN ST
+        DATE: 01/15/2024
+
+        UNLEADED 87
+        10.500 GAL
+        @ $3.679/GAL
+
+        FUEL TOTAL    $38.63
+        TAX           $0.00
+        TOTAL         $38.63
+
+        DEBIT CARD
+        ************1234
+        """
+        found = currency_matcher.extract_total(receipt)
+
+        assert found is not None
+        assert found.value == 38.63
+
+    def test_no_total_returns_largest(self) -> None:
+        """Unlabeled amounts fall back to the largest one, at lower confidence."""
+        receipt = """
+        $10.50
+        $5.00
+        $45.67
+        """
+        found = currency_matcher.extract_total(receipt)
+
+        # The largest amount is inferred to be the total
+        assert found is not None
+        assert found.value == 45.67
+        assert found.confidence < 0.7  # Inferred totals score lower
+
+    def test_no_amounts_returns_none(self) -> None:
+        """Text containing no amounts produces no result."""
+        receipt = "SHELL STATION\nPUMP 5"
+        found = currency_matcher.extract_total(receipt)
+
+        assert found is None
+
+
+class TestEdgeCases:
+    """Currency parsing corner cases."""
+
+    def test_european_format(self) -> None:
+        """A comma decimal separator ('45,67') is read as 45.67."""
+        # European notation writes 45.67 as 45,67
+        receipt = "TOTAL 45,67"
+        found = currency_matcher.extract_total(receipt)
+
+        assert found is not None
+        assert found.value == 45.67
+
+    def test_spaces_in_amount(self) -> None:
+        """A space between '$' and the digits does not prevent a match."""
+        receipt = "TOTAL $ 45.67"
+        found = currency_matcher.extract_total(receipt)
+
+        assert found is not None
+        assert found.value == 45.67
+
+    def test_case_insensitive(self) -> None:
+        """The TOTAL label matches regardless of letter case."""
+        for label in ("TOTAL", "Total", "total"):
+            receipt = f"{label} $45.67"
+            found = currency_matcher.extract_total(receipt)
+
+            assert found is not None, f"Failed for {label}"
+            assert found.value == 45.67