Files
motovaultpro/ocr/tests/test_date_patterns.py
Eric Gullickson 6319d50fb1
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
feat: add receipt OCR pipeline (refs #69)
Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount,
fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 20:43:30 -06:00

164 lines
5.2 KiB
Python

"""Tests for date pattern matching."""
import pytest
from app.patterns.date_patterns import DatePatternMatcher, date_matcher
class TestDatePatternMatcher:
    """Checks that ``date_matcher`` extracts dates from common text layouts."""

    @staticmethod
    def _assert_best_date(text: str, expected: str):
        """Extract the best date from *text* and check its ``value``.

        Asserts that a match exists and that its ``value`` equals
        *expected*, then returns the match so the caller can make
        further assertions (for example on ``confidence``).
        """
        found = date_matcher.extract_best_date(text)
        assert found is not None
        assert found.value == expected
        return found

    def test_mm_dd_yyyy_slash(self) -> None:
        """Labelled MM/DD/YYYY input is matched with confidence above 0.9."""
        found = self._assert_best_date("DATE: 01/15/2024", "2024-01-15")
        assert found.confidence > 0.9

    def test_mm_dd_yy_slash(self) -> None:
        """MM/DD/YY input with a two-digit year yields the 2024 date."""
        self._assert_best_date("01/15/24", "2024-01-15")

    def test_mm_dd_yyyy_dash(self) -> None:
        """MM-DD-YYYY input separated by dashes is matched."""
        self._assert_best_date("01-15-2024", "2024-01-15")

    def test_iso_format(self) -> None:
        """ISO YYYY-MM-DD input is matched with confidence above 0.95."""
        found = self._assert_best_date("2024-01-15", "2024-01-15")
        assert found.confidence > 0.95

    def test_month_name_format(self) -> None:
        """Abbreviated month, day, comma, year ('Jan 15, 2024') is matched."""
        self._assert_best_date("Jan 15, 2024", "2024-01-15")

    def test_month_name_no_comma(self) -> None:
        """Abbreviated month, day, year without a comma is matched."""
        self._assert_best_date("Jan 15 2024", "2024-01-15")

    def test_day_month_year_format(self) -> None:
        """Day-first layout ('15 Jan 2024') is matched."""
        self._assert_best_date("15 Jan 2024", "2024-01-15")

    def test_full_month_name(self) -> None:
        """A spelled-out month name ('January') is matched."""
        self._assert_best_date("January 15, 2024", "2024-01-15")

    def test_multiple_dates_returns_best(self) -> None:
        """Two dates in one text are both returned with confidence above 0.5."""
        candidates = date_matcher.extract_dates(
            "Date: 01/15/2024\nExpires: 01/15/2025"
        )
        assert len(candidates) == 2
        # Every extracted candidate must clear the confidence floor.
        for candidate in candidates:
            assert candidate.confidence > 0.5

    def test_invalid_date_rejected(self) -> None:
        """A date with month 13 and day 45 produces no match."""
        found = date_matcher.extract_best_date("13/45/2024")
        assert found is None

    def test_receipt_context_text(self) -> None:
        """The date is found inside realistic multi-line receipt text."""
        # Built line by line so the string value does not depend on how
        # this method happens to be indented.
        receipt = (
            "\n"
            "SHELL STATION\n"
            "123 MAIN ST\n"
            "DATE: 01/15/2024\n"
            "TIME: 14:32\n"
            "PUMP #5\n"
            "REGULAR 87\n"
            "10.500 GAL\n"
            "TOTAL $38.50\n"
        )
        self._assert_best_date(receipt, "2024-01-15")

    def test_no_date_returns_none(self) -> None:
        """Text containing no date produces no match."""
        found = date_matcher.extract_best_date("SHELL STATION\nTOTAL $38.50")
        assert found is None

    def test_confidence_boost_near_keyword(self) -> None:
        """A date next to the DATE keyword scores at least as high as a bare one."""
        labelled = date_matcher.extract_best_date("DATE: 01/15/2024")
        bare = date_matcher.extract_best_date("01/15/2024")
        assert labelled is not None
        assert bare is not None
        # Keyword proximity must never lower the confidence.
        assert labelled.confidence >= bare.confidence
class TestEdgeCases:
    """Boundary conditions for date parsing via ``date_matcher``."""

    def test_year_2000(self) -> None:
        """A two-digit year of 00 is read as the year 2000."""
        match = date_matcher.extract_best_date("01/15/00")
        assert match is not None
        assert match.value == "2000-01-15"

    def test_leap_year_date(self) -> None:
        """Feb 29 is accepted in 2024, which is a leap year."""
        match = date_matcher.extract_best_date("02/29/2024")
        assert match is not None
        assert match.value == "2024-02-29"

    def test_leap_year_invalid(self) -> None:
        """Feb 29 is rejected in 2023, which is not a leap year."""
        match = date_matcher.extract_best_date("02/29/2023")
        assert match is None

    def test_september_abbrev(self) -> None:
        """'Sep', 'Sept' and 'September' all resolve to month 09."""
        for abbrev in ("Sep", "Sept", "September"):
            match = date_matcher.extract_best_date(f"{abbrev} 15, 2024")
            # The message names the spelling that failed to parse.
            assert match is not None, f"Failed for {abbrev}"
            assert match.value == "2024-09-15"