"""Date pattern matching for receipt extraction."""

import re
from dataclasses import dataclass
from datetime import datetime
from typing import Optional


@dataclass
class DateMatch:
    """Result of date pattern matching."""

    value: str  # ISO format YYYY-MM-DD
    raw_match: str  # Original text matched
    confidence: float
    pattern_name: str


class DatePatternMatcher:
    """Extract and normalize dates from receipt text.

    Each entry of ``PATTERNS`` is a ``(regex, pattern_name, base_confidence)``
    tuple.  Every regex exposes the named groups ``day`` and ``year`` plus
    either ``month`` (numeric) or ``month_name`` (textual); ``_parse_match``
    relies on exactly those group names.
    """

    MONTH_NAMES = {
        "jan": 1, "january": 1,
        "feb": 2, "february": 2,
        "mar": 3, "march": 3,
        "apr": 4, "april": 4,
        "may": 5,
        "jun": 6, "june": 6,
        "jul": 7, "july": 7,
        "aug": 8, "august": 8,
        "sep": 9, "sept": 9, "september": 9,
        "oct": 10, "october": 10,
        "nov": 11, "november": 11,
        "dec": 12, "december": 12,
    }

    # Longest names first so that "september" wins over "sept" and "sep".
    # Built once at class-creation time; PATTERNS below embeds it.
    _MONTH_ALTERNATION = "|".join(sorted(MONTH_NAMES, key=len, reverse=True))

    # Keywords (upper-case) that raise confidence when found near a date.
    _DATE_KEYWORDS = ("DATE", "TIME", "TRANS", "SALE")
    # Characters inspected on each side of the match start.
    _CONTEXT_WINDOW = 50
    _KEYWORD_BOOST = 0.05

    # Pattern definitions with named groups and confidence weights.
    # Numeric patterns are fenced with (?<!\d) / (?!\d) so they never match
    # a slice of a longer digit run (e.g. "12-01-15" inside "2012-01-15").
    # Years are exactly 4 or 2 digits; the 4-digit alternative is tried first.
    PATTERNS = [
        # MM/DD/YYYY or MM/DD/YY (most common US format)
        (
            r"(?<!\d)(?P<month>\d{1,2})/(?P<day>\d{1,2})/"
            r"(?P<year>\d{4}|\d{2})(?!\d)",
            "mm_dd_yyyy",
            0.95,
        ),
        # MM-DD-YYYY or MM-DD-YY
        (
            r"(?<!\d)(?P<month>\d{1,2})-(?P<day>\d{1,2})-"
            r"(?P<year>\d{4}|\d{2})(?!\d)",
            "mm_dd_yyyy_dash",
            0.90,
        ),
        # YYYY-MM-DD (ISO format)
        (
            r"(?<!\d)(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})(?!\d)",
            "iso_date",
            0.98,
        ),
        # Mon DD, YYYY (e.g., Jan 15, 2024 / January 15, 2024 / Sept. 15 2024)
        (
            r"(?P<month_name>" + _MONTH_ALTERNATION + r")\.?\s+"
            r"(?P<day>\d{1,2}),?\s+(?P<year>\d{4})(?!\d)",
            "month_name_long",
            0.85,
        ),
        # DD Mon YYYY (e.g., 15 Jan 2024 / 15 January 2024)
        (
            r"(?<!\d)(?P<day>\d{1,2})\s+"
            r"(?P<month_name>" + _MONTH_ALTERNATION + r")\.?\s+"
            r"(?P<year>\d{4})(?!\d)",
            "day_month_year",
            0.85,
        ),
        # MMDDYYYY or MMDDYY (no separators, common in some POS systems)
        (
            r"(?<!\d)(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{4}|\d{2})(?!\d)",
            "compact_date",
            0.70,
        ),
    ]

    def extract_dates(self, text: str) -> list[DateMatch]:
        """
        Extract all date patterns from text.

        Args:
            text: Receipt text to search

        Returns:
            List of DateMatch objects sorted by confidence (highest first),
            with at most one entry per distinct ISO date value.
        """
        matches = []

        for pattern, name, base_confidence in self.PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                parsed = self._parse_match(match, name)
                if not parsed:
                    continue
                year, month, day = parsed
                if not self._is_valid_date(year, month, day):
                    continue
                # Adjust confidence based on context
                confidence = self._adjust_confidence(
                    base_confidence, text, match.start()
                )
                matches.append(
                    DateMatch(
                        value=f"{year:04d}-{month:02d}-{day:02d}",
                        raw_match=match.group(0),
                        confidence=confidence,
                        pattern_name=name,
                    )
                )

        # Sort by confidence, deduplicate by value.  The sort is stable, so
        # ties keep PATTERNS order and the first (best) hit per value wins.
        matches.sort(key=lambda m: m.confidence, reverse=True)
        seen = set()
        unique_matches = []
        for match in matches:
            if match.value not in seen:
                seen.add(match.value)
                unique_matches.append(match)

        return unique_matches

    def extract_best_date(self, text: str) -> Optional[DateMatch]:
        """
        Extract the most likely transaction date.

        Args:
            text: Receipt text to search

        Returns:
            Best DateMatch or None if no date found
        """
        matches = self.extract_dates(text)
        return matches[0] if matches else None

    def _parse_match(
        self, match: re.Match, pattern_name: str
    ) -> Optional[tuple[int, int, int]]:
        """Parse regex match into year, month, day tuple.

        Args:
            match: A match produced by one of the ``PATTERNS`` regexes.
            pattern_name: Name of the pattern that matched (currently unused,
                kept for interface stability).

        Returns:
            ``(year, month, day)`` or None when a textual month is not a key
            of ``MONTH_NAMES``.  The tuple is not yet validated as a real
            calendar date.
        """
        groups = match.groupdict()

        # Handle month name patterns
        month_name = groups.get("month_name")
        if month_name is not None:
            month = self.MONTH_NAMES.get(month_name.lower())
            if not month:
                return None
        else:
            month = int(groups["month"])

        day = int(groups["day"])
        year = int(groups["year"])

        # Normalize 2-digit years: 00-49 -> 2000-2049, 50-99 -> 1950-1999.
        if year < 100:
            year = 2000 + year if year < 50 else 1900 + year

        return year, month, day

    def _is_valid_date(self, year: int, month: int, day: int) -> bool:
        """Check if date components form a valid date.

        Returns True only when the components form a real calendar date and
        the year lies in 2000..2100 inclusive.
        """
        try:
            datetime(year=year, month=month, day=day)
        except ValueError:
            return False
        # Reasonable year range for receipts
        return 2000 <= year <= 2100

    def _adjust_confidence(
        self, base_confidence: float, text: str, position: int
    ) -> float:
        """
        Adjust confidence based on context clues.

        Boost confidence if date appears near date-related keywords.

        Args:
            base_confidence: Confidence weight of the pattern that matched.
            text: Text the match was found in (any letter case).
            position: Start offset of the match within ``text``.

        Returns:
            ``base_confidence`` raised by 0.05 (capped at 1.0) when a keyword
            occurs within 50 characters either side of ``position``;
            otherwise ``base_confidence`` unchanged.
        """
        # Slice first, then upper-case: str.upper() can change the string
        # length (e.g. "ß" -> "SS"), so offsets taken from the original text
        # are only valid against the original text.
        context_start = max(0, position - self._CONTEXT_WINDOW)
        context = text[context_start:position + self._CONTEXT_WINDOW].upper()

        if any(keyword in context for keyword in self._DATE_KEYWORDS):
            return min(1.0, base_confidence + self._KEYWORD_BOOST)

        return base_confidence


# Singleton instance
date_matcher = DatePatternMatcher()