All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount, fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
187 lines
5.6 KiB
Python
187 lines
5.6 KiB
Python
"""Date pattern matching for receipt extraction."""
|
|
import re
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
|
|
|
|
@dataclass
class DateMatch:
    """One date located in receipt text, together with how it was found."""

    # Normalized date string in ISO format (YYYY-MM-DD).
    value: str
    # The original text span that was matched.
    raw_match: str
    # Score assigned to this match (presumably in [0.0, 1.0] -- confirm
    # against the code that produces it).
    confidence: float
    # Name of the pattern that produced this match.
    pattern_name: str
|
|
|
|
|
|
class DatePatternMatcher:
    """Extract and normalize dates from receipt text.

    Each entry of ``PATTERNS`` is a ``(regex, pattern_name, base_confidence)``
    tuple. Every regex match is parsed into ``(year, month, day)``, validated
    as a real calendar date, and reported as a ``DateMatch`` whose ``value``
    is the ISO ``YYYY-MM-DD`` string.
    """

    # Month spellings (lower-case) mapped to their month number.
    MONTH_NAMES = {
        "jan": 1, "january": 1,
        "feb": 2, "february": 2,
        "mar": 3, "march": 3,
        "apr": 4, "april": 4,
        "may": 5,
        "jun": 6, "june": 6,
        "jul": 7, "july": 7,
        "aug": 8, "august": 8,
        "sep": 9, "sept": 9, "september": 9,
        "oct": 10, "october": 10,
        "nov": 11, "november": 11,
        "dec": 12, "december": 12,
    }

    # Regex alternation of every spelling in MONTH_NAMES, longest first so
    # that "january" is preferred over its prefix "jan". Building it from the
    # table keeps the patterns and the lookup in sync (a fixed 3-letter
    # capture could never match the full names or "sept").
    _MONTH_ALTERNATION = "|".join(sorted(MONTH_NAMES, key=len, reverse=True))

    # Pattern definitions with named groups and confidence weights.
    # The numeric patterns start with (?<!\d) so they cannot begin in the
    # middle of a longer digit run; without it "2012-01-15" would also match
    # the MM-DD-YY pattern as "12-01-15" and yield a bogus 2015-12-01.
    PATTERNS = [
        # MM/DD/YYYY or MM/DD/YY (most common US format)
        (
            r"(?<!\d)(?P<month>\d{1,2})/(?P<day>\d{1,2})/(?P<year>\d{2,4})",
            "mm_dd_yyyy",
            0.95,
        ),
        # MM-DD-YYYY or MM-DD-YY
        (
            r"(?<!\d)(?P<month>\d{1,2})-(?P<day>\d{1,2})-(?P<year>\d{2,4})",
            "mm_dd_yyyy_dash",
            0.90,
        ),
        # YYYY-MM-DD (ISO format)
        (
            r"(?<!\d)(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})",
            "iso_date",
            0.98,
        ),
        # Mon DD, YYYY (e.g., Jan 15, 2024 or January 15, 2024)
        (
            r"(?P<month_name>" + _MONTH_ALTERNATION + r")"
            r"\s+(?P<day>\d{1,2}),?\s+(?P<year>\d{4})",
            "month_name_long",
            0.85,
        ),
        # DD Mon YYYY (e.g., 15 Jan 2024 or 15 January 2024)
        (
            r"(?<!\d)(?P<day>\d{1,2})\s+"
            r"(?P<month_name>" + _MONTH_ALTERNATION + r")"
            r"\s+(?P<year>\d{4})",
            "day_month_year",
            0.85,
        ),
        # MMDDYYYY or MMDDYY (no separators, common in some POS systems)
        (
            r"(?<!\d)(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{2,4})(?!\d)",
            "compact_date",
            0.70,
        ),
    ]

    def extract_dates(self, text: str) -> "list[DateMatch]":
        """
        Extract all date patterns from text.

        Args:
            text: Receipt text to search. ``None`` or an empty string yields
                an empty list.

        Returns:
            List of DateMatch objects sorted by confidence (highest first),
            with at most one entry per distinct ISO date value.
        """
        if not text:
            return []

        matches = []
        for pattern, name, base_confidence in self.PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                parsed = self._parse_match(match, name)
                if parsed is None:
                    continue

                year, month, day = parsed
                if not self._is_valid_date(year, month, day):
                    continue

                # Pass the original text: match offsets refer to it, and the
                # context window is upper-cased inside _adjust_confidence.
                confidence = self._adjust_confidence(
                    base_confidence, text, match.start()
                )
                matches.append(
                    DateMatch(
                        value=f"{year:04d}-{month:02d}-{day:02d}",
                        raw_match=match.group(0),
                        confidence=confidence,
                        pattern_name=name,
                    )
                )

        # Sort by confidence (stable), then keep the first -- i.e. the most
        # confident -- match for each distinct date value.
        matches.sort(key=lambda m: m.confidence, reverse=True)
        seen = set()
        unique_matches = []
        for candidate in matches:
            if candidate.value not in seen:
                seen.add(candidate.value)
                unique_matches.append(candidate)

        return unique_matches

    def extract_best_date(self, text: str) -> "Optional[DateMatch]":
        """
        Extract the most likely transaction date.

        Args:
            text: Receipt text to search

        Returns:
            The highest-confidence DateMatch, or None if no date was found
        """
        matches = self.extract_dates(text)
        return matches[0] if matches else None

    def _parse_match(
        self, match: re.Match, pattern_name: str
    ) -> Optional[tuple[int, int, int]]:
        """Parse a regex match into a (year, month, day) tuple.

        Args:
            match: Match produced by one of the PATTERNS regexes; it must
                define ``day``, ``year`` and either ``month`` or
                ``month_name`` named groups.
            pattern_name: Name of the pattern that matched (currently unused;
                kept for interface compatibility).

        Returns:
            ``(year, month, day)`` as integers, or None when the month name
            is not a known spelling. The values are not range-checked here.
        """
        groups = match.groupdict()

        # Handle month name patterns
        month_name = groups.get("month_name")
        if month_name is not None:
            month = self.MONTH_NAMES.get(month_name.lower())
            if not month:
                return None
        else:
            month = int(groups["month"])

        day = int(groups["day"])
        year = int(groups["year"])

        # Normalize 2-digit years: 00-49 -> 2000s, 50-99 -> 1900s
        if year < 100:
            year = 2000 + year if year < 50 else 1900 + year

        return year, month, day

    def _is_valid_date(self, year: int, month: int, day: int) -> bool:
        """Return True if the components form a real date in 2000-2100."""
        # Reasonable year range for receipts
        if not 2000 <= year <= 2100:
            return False
        try:
            datetime(year=year, month=month, day=day)
        except ValueError:
            return False
        return True

    def _adjust_confidence(
        self, base_confidence: float, text: str, position: int
    ) -> float:
        """
        Adjust confidence based on context clues.

        Adds 0.05 (capped at 1.0) when a date-related keyword appears within
        50 characters on either side of ``position``.

        Args:
            base_confidence: Confidence weight of the pattern that matched.
            text: Text the match was found in; ``position`` must be an offset
                into this same string. Case does not matter.
            position: Start offset of the match within ``text``.

        Returns:
            The possibly boosted confidence.
        """
        # Upper-case only the sliced window: upper-casing the whole text
        # first can change its length for some characters, which would
        # shift the window relative to ``position``.
        context_start = max(0, position - 50)
        context = text[context_start:position + 50].upper()

        date_keywords = ("DATE", "TIME", "TRANS", "SALE")
        if any(keyword in context for keyword in date_keywords):
            return min(1.0, base_confidence + 0.05)

        return base_confidence
|
|
|
|
|
|
# Module-level DatePatternMatcher instance, created once when this module
# is imported.
date_matcher: DatePatternMatcher = DatePatternMatcher()
|