feat: add receipt OCR pipeline (refs #69)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement receipt-specific OCR extraction for fuel receipts:
- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount, fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
198
ocr/tests/test_currency_patterns.py
Normal file
198
ocr/tests/test_currency_patterns.py
Normal file
@@ -0,0 +1,198 @@
|
||||
"""Tests for currency pattern matching."""
|
||||
import pytest
|
||||
|
||||
from app.patterns.currency_patterns import CurrencyPatternMatcher, currency_matcher
|
||||
|
||||
|
||||
class TestCurrencyPatternMatcher:
    """Exercise total-amount extraction via the shared ``currency_matcher``."""

    def test_total_explicit(self) -> None:
        """'TOTAL $XX.XX' yields a labelled total with confidence above 0.9."""
        total = currency_matcher.extract_total("TOTAL $45.67")

        assert total is not None
        assert total.value == 45.67
        assert total.confidence > 0.9
        assert total.label == "TOTAL"

    def test_total_with_colon(self) -> None:
        """'TOTAL: $XX.XX' (colon after the label) is still recognised."""
        total = currency_matcher.extract_total("TOTAL: $45.67")

        assert total is not None
        assert total.value == 45.67

    def test_total_without_dollar_sign(self) -> None:
        """'TOTAL 45.67' is recognised even without a currency symbol."""
        total = currency_matcher.extract_total("TOTAL 45.67")

        assert total is not None
        assert total.value == 45.67

    def test_amount_due(self) -> None:
        """'AMOUNT DUE' is accepted as a total label and reported as such."""
        total = currency_matcher.extract_total("AMOUNT DUE: $45.67")

        assert total is not None
        assert total.value == 45.67
        assert total.label == "AMOUNT DUE"

    def test_sale_pattern(self) -> None:
        """'SALE $XX.XX' is accepted as a total."""
        total = currency_matcher.extract_total("SALE $45.67")

        assert total is not None
        assert total.value == 45.67

    def test_grand_total(self) -> None:
        """'GRAND TOTAL' is matched and reported with its full label."""
        total = currency_matcher.extract_total("GRAND TOTAL $45.67")

        assert total is not None
        assert total.value == 45.67
        assert total.label == "GRAND TOTAL"

    def test_total_sale(self) -> None:
        """'TOTAL SALE: $XX.XX' is accepted as a total."""
        total = currency_matcher.extract_total("TOTAL SALE: $45.67")

        assert total is not None
        assert total.value == 45.67

    def test_balance_due(self) -> None:
        """'BALANCE DUE $XX.XX' is accepted as a total."""
        total = currency_matcher.extract_total("BALANCE DUE $45.67")

        assert total is not None
        assert total.value == 45.67

    def test_multiple_amounts_picks_total(self) -> None:
        """With several amounts present, the explicit TOTAL line is chosen."""
        receipt_text = (
            "\n"
            "REGULAR 87\n"
            "10.500 GAL @ $3.67\n"
            "SUBTOTAL $38.54\n"
            "TAX $0.00\n"
            "TOTAL $38.54\n"
        )

        total = currency_matcher.extract_total(receipt_text)

        assert total is not None
        assert total.value == 38.54
        assert total.pattern_name == "total_explicit"

    def test_all_amounts(self) -> None:
        """``extract_all_amounts`` returns at least the TOTAL amount."""
        receipt_text = (
            "\n"
            "SUBTOTAL $35.00\n"
            "TAX $3.54\n"
            "TOTAL $38.54\n"
        )

        amounts = currency_matcher.extract_all_amounts(receipt_text)

        # Only the TOTAL is required; other labelled amounts are optional.
        assert len(amounts) > 0
        extracted_values = [amount.value for amount in amounts]
        assert 38.54 in extracted_values

    def test_comma_thousand_separator(self) -> None:
        """A comma thousands separator is parsed as part of the number."""
        total = currency_matcher.extract_total("TOTAL $1,234.56")

        assert total is not None
        assert total.value == 1234.56

    def test_reasonable_total_range(self) -> None:
        """Implausibly small totals are rejected; ordinary ones are kept."""
        # Five cents is too small to be a fuel receipt total.
        too_small = currency_matcher.extract_total("TOTAL $0.05")
        assert too_small is None

        # An ordinary fill-up amount must still be accepted.
        plausible = currency_matcher.extract_total("TOTAL $45.67")
        assert plausible is not None

    def test_receipt_context_extraction(self) -> None:
        """The total is extracted from a realistic multi-line receipt."""
        receipt_text = (
            "\n"
            "SHELL\n"
            "123 MAIN ST\n"
            "DATE: 01/15/2024\n"
            "\n"
            "UNLEADED 87\n"
            "10.500 GAL\n"
            "@ $3.679/GAL\n"
            "\n"
            "FUEL TOTAL $38.63\n"
            "TAX $0.00\n"
            "TOTAL $38.63\n"
            "\n"
            "DEBIT CARD\n"
            "************1234\n"
        )

        total = currency_matcher.extract_total(receipt_text)

        assert total is not None
        assert total.value == 38.63

    def test_no_total_returns_largest(self) -> None:
        """Without a labelled total, the largest amount is used as fallback."""
        receipt_text = (
            "\n"
            "$10.50\n"
            "$5.00\n"
            "$45.67\n"
        )

        total = currency_matcher.extract_total(receipt_text)

        # The inferred total is the largest amount, with reduced confidence.
        assert total is not None
        assert total.value == 45.67
        assert total.confidence < 0.7

    def test_no_amounts_returns_none(self) -> None:
        """Text containing no monetary amounts produces no total."""
        total = currency_matcher.extract_total("SHELL STATION\nPUMP 5")

        assert total is None
|
||||
|
||||
|
||||
class TestEdgeCases:
    """Formatting edge cases for currency parsing."""

    def test_european_format(self) -> None:
        """A comma used as the decimal separator ('45,67') parses as 45.67."""
        total = currency_matcher.extract_total("TOTAL 45,67")

        assert total is not None
        assert total.value == 45.67

    def test_spaces_in_amount(self) -> None:
        """A space between the currency symbol and the digits is tolerated."""
        total = currency_matcher.extract_total("TOTAL $ 45.67")

        assert total is not None
        assert total.value == 45.67

    def test_case_insensitive(self) -> None:
        """The TOTAL label matches regardless of letter case."""
        for spelling in ("TOTAL", "Total", "total"):
            total = currency_matcher.extract_total(f"{spelling} $45.67")

            assert total is not None, f"Failed for {spelling}"
            assert total.value == 45.67
|
||||
Reference in New Issue
Block a user