feat: add receipt OCR pipeline (refs #69)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped

Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount,
fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-01 20:43:30 -06:00
parent a2f0abb14c
commit 6319d50fb1
16 changed files with 2845 additions and 2 deletions

View File

@@ -0,0 +1,198 @@
"""Tests for currency pattern matching."""
import pytest
from app.patterns.currency_patterns import CurrencyPatternMatcher, currency_matcher
class TestCurrencyPatternMatcher:
    """Check total/amount extraction by the shared ``currency_matcher``.

    Each test feeds receipt-like text to ``extract_total`` (or
    ``extract_all_amounts``) and checks the returned match. Monetary values
    are compared with ``pytest.approx`` rather than ``==`` so the assertions
    do not depend on how the matcher assembles its float (direct ``float()``
    parse, dollars-plus-cents arithmetic, etc.).
    """

    def test_total_explicit(self) -> None:
        """'TOTAL $XX.XX' yields the amount, high confidence, and its label."""
        text = "TOTAL $45.67"
        result = currency_matcher.extract_total(text)

        assert result is not None
        assert result.value == pytest.approx(45.67)
        assert result.confidence > 0.9
        assert result.label == "TOTAL"

    def test_total_with_colon(self) -> None:
        """'TOTAL: $XX.XX' (colon after the label) is recognized."""
        text = "TOTAL: $45.67"
        result = currency_matcher.extract_total(text)

        assert result is not None
        assert result.value == pytest.approx(45.67)

    def test_total_without_dollar_sign(self) -> None:
        """'TOTAL 45.67' (no currency symbol) is recognized."""
        text = "TOTAL 45.67"
        result = currency_matcher.extract_total(text)

        assert result is not None
        assert result.value == pytest.approx(45.67)

    def test_amount_due(self) -> None:
        """'AMOUNT DUE' is accepted as a total label and reported as such."""
        text = "AMOUNT DUE: $45.67"
        result = currency_matcher.extract_total(text)

        assert result is not None
        assert result.value == pytest.approx(45.67)
        assert result.label == "AMOUNT DUE"

    def test_sale_pattern(self) -> None:
        """'SALE $XX.XX' is accepted as a total."""
        text = "SALE $45.67"
        result = currency_matcher.extract_total(text)

        assert result is not None
        assert result.value == pytest.approx(45.67)

    def test_grand_total(self) -> None:
        """'GRAND TOTAL' is reported with its full two-word label."""
        text = "GRAND TOTAL $45.67"
        result = currency_matcher.extract_total(text)

        assert result is not None
        assert result.value == pytest.approx(45.67)
        assert result.label == "GRAND TOTAL"

    def test_total_sale(self) -> None:
        """'TOTAL SALE: $XX.XX' is accepted as a total."""
        text = "TOTAL SALE: $45.67"
        result = currency_matcher.extract_total(text)

        assert result is not None
        assert result.value == pytest.approx(45.67)

    def test_balance_due(self) -> None:
        """'BALANCE DUE $XX.XX' is accepted as a total."""
        text = "BALANCE DUE $45.67"
        result = currency_matcher.extract_total(text)

        assert result is not None
        assert result.value == pytest.approx(45.67)

    def test_multiple_amounts_picks_total(self) -> None:
        """A labeled total is preferred over the other amounts on the receipt."""
        text = """
REGULAR 87
10.500 GAL @ $3.67
SUBTOTAL $38.54
TAX $0.00
TOTAL $38.54
"""
        result = currency_matcher.extract_total(text)

        assert result is not None
        assert result.value == pytest.approx(38.54)
        # SUBTOTAL carries the same value, so the pattern name is what shows
        # that the explicit TOTAL line was the one matched.
        assert result.pattern_name == "total_explicit"

    def test_all_amounts(self) -> None:
        """``extract_all_amounts`` returns at least the TOTAL amount."""
        text = """
SUBTOTAL $35.00
TAX $3.54
TOTAL $38.54
"""
        results = currency_matcher.extract_all_amounts(text)

        # Should find TOTAL and possibly others
        assert len(results) >= 1
        assert any(r.value == pytest.approx(38.54) for r in results)

    def test_comma_thousand_separator(self) -> None:
        """A comma thousands separator is parsed as part of the number."""
        text = "TOTAL $1,234.56"
        result = currency_matcher.extract_total(text)

        assert result is not None
        assert result.value == pytest.approx(1234.56)

    def test_reasonable_total_range(self) -> None:
        """Implausibly small totals are rejected; ordinary ones are kept."""
        # Very small amount
        text = "TOTAL $0.05"
        result = currency_matcher.extract_total(text)
        assert result is None  # Too small for fuel receipt

        # Reasonable amount
        text = "TOTAL $45.67"
        result = currency_matcher.extract_total(text)
        assert result is not None

    def test_receipt_context_extraction(self) -> None:
        """The total is found inside a realistic multi-line fuel receipt."""
        text = """
SHELL
123 MAIN ST
DATE: 01/15/2024
UNLEADED 87
10.500 GAL
@ $3.679/GAL
FUEL TOTAL $38.63
TAX $0.00
TOTAL $38.63
DEBIT CARD
************1234
"""
        result = currency_matcher.extract_total(text)

        assert result is not None
        assert result.value == pytest.approx(38.63)

    def test_no_total_returns_largest(self) -> None:
        """Without a labeled total, the largest amount is returned as a guess."""
        text = """
$10.50
$5.00
$45.67
"""
        result = currency_matcher.extract_total(text)

        # Should infer largest reasonable amount as total
        assert result is not None
        assert result.value == pytest.approx(45.67)
        assert result.confidence < 0.7  # Lower confidence for inferred

    def test_no_amounts_returns_none(self) -> None:
        """Text containing no monetary amounts yields ``None``."""
        text = "SHELL STATION\nPUMP 5"
        result = currency_matcher.extract_total(text)

        assert result is None
class TestEdgeCases:
    """Check ``currency_matcher.extract_total`` on less regular inputs.

    Amounts are compared with ``pytest.approx`` rather than ``==`` so the
    assertions do not hinge on the exact float the matcher constructs.
    """

    def test_european_format(self) -> None:
        """A comma used as the decimal separator is read as a decimal point."""
        # European: 45,67 means 45.67
        text = "TOTAL 45,67"
        result = currency_matcher.extract_total(text)

        assert result is not None
        assert result.value == pytest.approx(45.67)

    def test_spaces_in_amount(self) -> None:
        """A space between the currency symbol and the digits is tolerated."""
        text = "TOTAL $ 45.67"
        result = currency_matcher.extract_total(text)

        assert result is not None
        assert result.value == pytest.approx(45.67)

    def test_case_insensitive(self) -> None:
        """The total label matches in upper, title, and lower case."""
        for label in ["TOTAL", "Total", "total"]:
            text = f"{label} $45.67"
            result = currency_matcher.extract_total(text)

            # The message names the failing spelling, since all three
            # variants share this one test.
            assert result is not None, f"Failed for {label}"
            assert result.value == pytest.approx(45.67)