feat: add receipt OCR pipeline (refs #69)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount, fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
163
ocr/tests/test_date_patterns.py
Normal file
163
ocr/tests/test_date_patterns.py
Normal file
@@ -0,0 +1,163 @@
|
||||
"""Tests for date pattern matching."""
|
||||
import pytest
|
||||
|
||||
from app.patterns.date_patterns import DatePatternMatcher, date_matcher
|
||||
|
||||
|
||||
class TestDatePatternMatcher:
    """Exercise ``date_matcher`` against the date layouts these tests cover.

    A successful extraction is expected to expose the date as an ISO
    ``YYYY-MM-DD`` string in ``value`` together with a numeric ``confidence``.
    """

    def test_mm_dd_yyyy_slash(self) -> None:
        """A labelled MM/DD/YYYY date is extracted with confidence above 0.9."""
        found = date_matcher.extract_best_date("DATE: 01/15/2024")

        assert found is not None
        assert found.value == "2024-01-15"
        assert found.confidence > 0.9

    def test_mm_dd_yy_slash(self) -> None:
        """MM/DD/YY with a two-digit year is expanded to a four-digit year."""
        found = date_matcher.extract_best_date("01/15/24")

        assert found is not None
        assert found.value == "2024-01-15"

    def test_mm_dd_yyyy_dash(self) -> None:
        """MM-DD-YYYY (dash separated) is normalised to ISO."""
        found = date_matcher.extract_best_date("01-15-2024")

        assert found is not None
        assert found.value == "2024-01-15"

    def test_iso_format(self) -> None:
        """An ISO YYYY-MM-DD date is returned as-is with confidence above 0.95."""
        found = date_matcher.extract_best_date("2024-01-15")

        assert found is not None
        assert found.value == "2024-01-15"
        assert found.confidence > 0.95

    def test_month_name_format(self) -> None:
        """Abbreviated month name with a comma ('Jan 15, 2024') is parsed."""
        found = date_matcher.extract_best_date("Jan 15, 2024")

        assert found is not None
        assert found.value == "2024-01-15"

    def test_month_name_no_comma(self) -> None:
        """Abbreviated month name without a comma ('Jan 15 2024') is parsed."""
        found = date_matcher.extract_best_date("Jan 15 2024")

        assert found is not None
        assert found.value == "2024-01-15"

    def test_day_month_year_format(self) -> None:
        """Day-first layout ('15 Jan 2024') is parsed."""
        found = date_matcher.extract_best_date("15 Jan 2024")

        assert found is not None
        assert found.value == "2024-01-15"

    def test_full_month_name(self) -> None:
        """A fully spelled-out month ('January 15, 2024') is parsed."""
        found = date_matcher.extract_best_date("January 15, 2024")

        assert found is not None
        assert found.value == "2024-01-15"

    def test_multiple_dates_returns_best(self) -> None:
        """Two dates in one text yield two candidates above 0.5 confidence."""
        # NOTE(review): despite the test name, only extract_dates() is
        # exercised here; which candidate extract_best_date() would pick is
        # not asserted.
        candidates = date_matcher.extract_dates(
            "Date: 01/15/2024\nExpires: 01/15/2025"
        )

        assert len(candidates) == 2
        # Both should be valid
        for candidate in candidates:
            assert candidate.confidence > 0.5

    def test_invalid_date_rejected(self) -> None:
        """A numerically impossible date produces no result."""
        impossible = "13/45/2024"  # Invalid month/day

        assert date_matcher.extract_best_date(impossible) is None

    def test_receipt_context_text(self) -> None:
        """The date is found inside a realistic multi-line receipt."""
        receipt = """
        SHELL STATION
        123 MAIN ST
        DATE: 01/15/2024
        TIME: 14:32
        PUMP #5
        REGULAR 87
        10.500 GAL
        TOTAL $38.50
        """
        found = date_matcher.extract_best_date(receipt)

        assert found is not None
        assert found.value == "2024-01-15"

    def test_no_date_returns_none(self) -> None:
        """Text that contains no date at all produces no result."""
        dateless = "SHELL STATION\nTOTAL $38.50"

        assert date_matcher.extract_best_date(dateless) is None

    def test_confidence_boost_near_keyword(self) -> None:
        """A date next to a DATE label scores at least as high as a bare one."""
        labelled = date_matcher.extract_best_date("DATE: 01/15/2024")
        bare = date_matcher.extract_best_date("01/15/2024")

        assert labelled is not None
        assert bare is not None
        # Keyword proximity should boost confidence
        assert labelled.confidence >= bare.confidence
|
||||
|
||||
|
||||
class TestEdgeCases:
    """Test edge cases in date parsing."""

    def test_year_2000(self) -> None:
        """Test 2-digit year 00 is parsed as 2000."""
        text = "01/15/00"
        result = date_matcher.extract_best_date(text)

        assert result is not None
        assert result.value == "2000-01-15"

    def test_leap_year_date(self) -> None:
        """Test Feb 29 on leap year."""
        text = "02/29/2024"  # 2024 is a leap year
        result = date_matcher.extract_best_date(text)

        assert result is not None
        assert result.value == "2024-02-29"

    def test_leap_year_invalid(self) -> None:
        """Test Feb 29 on non-leap year is rejected."""
        text = "02/29/2023"  # 2023 is not a leap year
        result = date_matcher.extract_best_date(text)

        assert result is None

    # Parametrized instead of looping inside the test body: each spelling is
    # collected and reported as its own case, so one failing abbreviation no
    # longer hides the outcome of the ones after it.
    @pytest.mark.parametrize("abbrev", ["Sep", "Sept", "September"])
    def test_september_abbrev(self, abbrev: str) -> None:
        """Test September abbreviation (Sept vs Sep).

        Args:
            abbrev: Spelling of September placed in front of "15, 2024".
        """
        text = f"{abbrev} 15, 2024"
        result = date_matcher.extract_best_date(text)

        assert result is not None, f"Failed for {abbrev}"
        # Carry the spelling in the message here too, so a wrong value (as
        # opposed to a missing one) is just as easy to attribute.
        assert result.value == "2024-09-15", f"Failed for {abbrev}"
|
||||
Reference in New Issue
Block a user