All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount, fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
164 lines
5.2 KiB
Python
164 lines
5.2 KiB
Python
"""Tests for date pattern matching."""
|
|
import pytest
|
|
|
|
from app.patterns.date_patterns import DatePatternMatcher, date_matcher
|
|
|
|
|
|
class TestDatePatternMatcher:
    """Test date pattern extraction.

    Every test feeds a text snippet to the module-level ``date_matcher`` and
    checks the result it hands back: ``value`` is compared against an ISO
    ``YYYY-MM-DD`` string and ``confidence`` against numeric thresholds.
    """

    def test_mm_dd_yyyy_slash(self) -> None:
        """Test MM/DD/YYYY format."""
        text = "DATE: 01/15/2024"
        result = date_matcher.extract_best_date(text)

        assert result is not None
        assert result.value == "2024-01-15"
        assert result.confidence > 0.9

    def test_mm_dd_yy_slash(self) -> None:
        """Test MM/DD/YY format with 2-digit year."""
        text = "01/15/24"
        result = date_matcher.extract_best_date(text)

        assert result is not None
        assert result.value == "2024-01-15"

    def test_mm_dd_yyyy_dash(self) -> None:
        """Test MM-DD-YYYY format."""
        text = "01-15-2024"
        result = date_matcher.extract_best_date(text)

        assert result is not None
        assert result.value == "2024-01-15"

    def test_iso_format(self) -> None:
        """Test ISO YYYY-MM-DD format."""
        text = "2024-01-15"
        result = date_matcher.extract_best_date(text)

        assert result is not None
        assert result.value == "2024-01-15"
        assert result.confidence > 0.95

    def test_month_name_format(self) -> None:
        """Test 'Jan 15, 2024' format."""
        text = "Jan 15, 2024"
        result = date_matcher.extract_best_date(text)

        assert result is not None
        assert result.value == "2024-01-15"

    def test_month_name_no_comma(self) -> None:
        """Test 'Jan 15 2024' format without comma."""
        text = "Jan 15 2024"
        result = date_matcher.extract_best_date(text)

        assert result is not None
        assert result.value == "2024-01-15"

    def test_day_month_year_format(self) -> None:
        """Test '15 Jan 2024' format."""
        text = "15 Jan 2024"
        result = date_matcher.extract_best_date(text)

        assert result is not None
        assert result.value == "2024-01-15"

    def test_full_month_name(self) -> None:
        """Test full month name like 'January'."""
        text = "January 15, 2024"
        result = date_matcher.extract_best_date(text)

        assert result is not None
        assert result.value == "2024-01-15"

    def test_multiple_dates_returns_best(self) -> None:
        """Test that multiple dates returns highest confidence."""
        text = "Date: 01/15/2024\nExpires: 01/15/2025"
        results = date_matcher.extract_dates(text)

        assert len(results) == 2
        # Both should be valid
        assert all(r.confidence > 0.5 for r in results)

        # The test name promises a "best" check, so also exercise
        # extract_best_date: it must pick one of the extracted candidates
        # and that candidate must carry the top confidence.
        best = date_matcher.extract_best_date(text)
        assert best is not None
        assert best.value in {r.value for r in results}
        assert best.confidence == max(r.confidence for r in results)

    def test_invalid_date_rejected(self) -> None:
        """Test that invalid dates are rejected."""
        text = "13/45/2024"  # Invalid month/day
        result = date_matcher.extract_best_date(text)

        assert result is None

    def test_receipt_context_text(self) -> None:
        """Test date extraction from realistic receipt text."""
        text = """
        SHELL STATION
        123 MAIN ST
        DATE: 01/15/2024
        TIME: 14:32
        PUMP #5
        REGULAR 87
        10.500 GAL
        TOTAL $38.50
        """
        result = date_matcher.extract_best_date(text)

        assert result is not None
        assert result.value == "2024-01-15"

    def test_no_date_returns_none(self) -> None:
        """Test that text without dates returns None."""
        text = "SHELL STATION\nTOTAL $38.50"
        result = date_matcher.extract_best_date(text)

        assert result is None

    def test_confidence_boost_near_keyword(self) -> None:
        """Test confidence boost when date is near DATE keyword."""
        text_with_keyword = "DATE: 01/15/2024"
        text_without = "01/15/2024"

        result_with = date_matcher.extract_best_date(text_with_keyword)
        result_without = date_matcher.extract_best_date(text_without)

        assert result_with is not None
        assert result_without is not None
        # Keyword proximity should boost confidence; >= (not >) so the test
        # still passes when both scores come out equal.
        assert result_with.confidence >= result_without.confidence
|
|
|
|
|
|
class TestEdgeCases:
    """Test edge cases in date parsing.

    Covers 2-digit year ``00``, Feb 29 on leap and non-leap years, and the
    different spellings of September, all through the module-level
    ``date_matcher``.
    """

    def test_year_2000(self) -> None:
        """Test 2-digit year 00 is parsed as 2000."""
        text = "01/15/00"
        result = date_matcher.extract_best_date(text)

        assert result is not None
        assert result.value == "2000-01-15"

    def test_leap_year_date(self) -> None:
        """Test Feb 29 on leap year."""
        text = "02/29/2024"  # 2024 is a leap year
        result = date_matcher.extract_best_date(text)

        assert result is not None
        assert result.value == "2024-02-29"

    def test_leap_year_invalid(self) -> None:
        """Test Feb 29 on non-leap year is rejected."""
        text = "02/29/2023"  # 2023 is not a leap year
        result = date_matcher.extract_best_date(text)

        assert result is None

    def test_september_abbrev(self) -> None:
        """Test September abbreviation (Sept vs Sep)."""
        for abbrev in ["Sep", "Sept", "September"]:
            text = f"{abbrev} 15, 2024"
            result = date_matcher.extract_best_date(text)

            # Both assertions name the spelling under test, so a failure
            # inside the loop identifies which variant broke.
            assert result is not None, f"Failed for {abbrev}"
            assert result.value == "2024-09-15", (
                f"Wrong date for {abbrev}: {result.value}"
            )
|