# motovaultpro/ocr/app/patterns/date_patterns.py

"""Date pattern matching for receipt extraction."""
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Optional


@dataclass
class DateMatch:
    """A single date found in receipt text, together with its provenance."""

    # Normalized date in ISO format (YYYY-MM-DD).
    value: str
    # The exact substring of the input that the pattern matched.
    raw_match: str
    # Score used to rank candidate dates; higher means more trustworthy.
    confidence: float
    # Name of the pattern that produced this match.
    pattern_name: str


class DatePatternMatcher:
    """Extract and normalize dates from receipt text.

    Each entry in ``PATTERNS`` is tried against the text (case-insensitively).
    Every hit is parsed into year/month/day, validated as a real calendar date
    within the accepted year range, scored, and returned as a ``DateMatch``.
    """

    # Lower-case month names and abbreviations mapped to month numbers.
    MONTH_NAMES = {
        "jan": 1, "january": 1,
        "feb": 2, "february": 2,
        "mar": 3, "march": 3,
        "apr": 4, "april": 4,
        "may": 5,
        "jun": 6, "june": 6,
        "jul": 7, "july": 7,
        "aug": 8, "august": 8,
        "sep": 9, "sept": 9, "september": 9,
        "oct": 10, "october": 10,
        "nov": 11, "november": 11,
        "dec": 12, "december": 12,
    }

    # Regex alternation of every MONTH_NAMES key, longest first so a full name
    # ("september") is preferred over its abbreviations ("sept", "sep").
    # Building it from MONTH_NAMES guarantees that whatever the month-name
    # patterns capture can always be looked up in MONTH_NAMES.
    _MONTH_NAME_RE = "|".join(sorted(MONTH_NAMES, key=len, reverse=True))

    # Pattern definitions: (regex, pattern name, base confidence).
    #
    # The numeric patterns are fenced with digit lookarounds so they cannot
    # match in the middle of a longer digit run. Without the fences, the tail
    # "4-01-15" of the ISO date "2024-01-15" is read as April 1st, 2015.
    # Years are exactly four or two digits; three-digit years are meaningless.
    # The patterns must be applied with re.IGNORECASE (month names are listed
    # in lower case).
    PATTERNS = [
        # MM/DD/YYYY or MM/DD/YY (most common US format)
        (
            r"(?<!\d)(?P<month>\d{1,2})/(?P<day>\d{1,2})/(?P<year>\d{4}|\d{2})(?!\d)",
            "mm_dd_yyyy",
            0.95,
        ),
        # MM-DD-YYYY or MM-DD-YY
        (
            r"(?<!\d)(?P<month>\d{1,2})-(?P<day>\d{1,2})-(?P<year>\d{4}|\d{2})(?!\d)",
            "mm_dd_yyyy_dash",
            0.90,
        ),
        # YYYY-MM-DD (ISO format)
        (
            r"(?<!\d)(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})(?!\d)",
            "iso_date",
            0.98,
        ),
        # Mon DD, YYYY or Month DD, YYYY (e.g., Jan 15, 2024 / January 15, 2024)
        (
            r"(?P<month_name>" + _MONTH_NAME_RE + r")\.?\s+"
            r"(?P<day>\d{1,2}),?\s+(?P<year>\d{4})(?!\d)",
            "month_name_long",
            0.85,
        ),
        # DD Mon YYYY or DD Month YYYY (e.g., 15 Jan 2024 / 15 January 2024)
        (
            r"(?<!\d)(?P<day>\d{1,2})\s+"
            r"(?P<month_name>" + _MONTH_NAME_RE + r")\.?,?\s+(?P<year>\d{4})(?!\d)",
            "day_month_year",
            0.85,
        ),
        # MMDDYYYY or MMDDYY (no separators, common in some POS systems)
        (
            r"(?<!\d)(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{4}|\d{2})(?!\d)",
            "compact_date",
            0.70,
        ),
    ]

    # Upper-case keywords whose presence near a match boosts its confidence.
    DATE_KEYWORDS = ("DATE", "TIME", "TRANS", "SALE")
    # Number of characters inspected on each side of the match start.
    CONTEXT_WINDOW = 50
    # Amount added to the base confidence when a keyword is nearby.
    KEYWORD_BOOST = 0.05

    # Inclusive range of years accepted as plausible receipt dates.
    MIN_YEAR = 2000
    MAX_YEAR = 2100

    def extract_dates(self, text: str) -> list[DateMatch]:
        """
        Extract all date patterns from text.

        Args:
            text: Receipt text to search

        Returns:
            List of DateMatch objects sorted by confidence (highest first),
            with at most one entry per distinct date value. Empty when the
            text is empty or contains no valid date.
        """
        if not text:
            return []

        matches = []
        for pattern, name, base_confidence in self.PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                parsed = self._parse_match(match, name)
                if parsed is None or not self._is_valid_date(*parsed):
                    continue

                year, month, day = parsed
                # Pass the original text: match offsets refer to it, and
                # upper-casing the whole text could shift them.
                confidence = self._adjust_confidence(
                    base_confidence, text, match.start()
                )
                matches.append(
                    DateMatch(
                        value=f"{year:04d}-{month:02d}-{day:02d}",
                        raw_match=match.group(0),
                        confidence=confidence,
                        pattern_name=name,
                    )
                )

        # Sort by confidence (stable, so pattern order breaks ties), then keep
        # only the highest-confidence match for each distinct date value.
        matches.sort(key=lambda x: x.confidence, reverse=True)
        seen = set()
        unique_matches = []
        for match in matches:
            if match.value not in seen:
                seen.add(match.value)
                unique_matches.append(match)

        return unique_matches

    def extract_best_date(self, text: str) -> Optional[DateMatch]:
        """
        Extract the most likely transaction date.

        Args:
            text: Receipt text to search

        Returns:
            Best (highest-confidence) DateMatch or None if no date found
        """
        matches = self.extract_dates(text)
        return matches[0] if matches else None

    def _parse_match(
        self, match: re.Match, pattern_name: str
    ) -> Optional[tuple[int, int, int]]:
        """
        Parse regex match into a (year, month, day) tuple.

        Args:
            match: Match produced by one of PATTERNS; must expose ``day`` and
                ``year`` groups plus either ``month`` or ``month_name``
            pattern_name: Name of the pattern that produced the match
                (currently not used by the parsing logic)

        Returns:
            (year, month, day) as integers, or None when the month name is
            not a key of MONTH_NAMES. The tuple is not validated here.
        """
        groups = match.groupdict()

        # Month is either spelled out (month_name group) or numeric.
        if "month_name" in groups:
            month = self.MONTH_NAMES.get(groups["month_name"].lower())
            if not month:
                return None
        else:
            month = int(groups["month"])

        day = int(groups["day"])
        year = int(groups["year"])

        # Normalize 2-digit years: 00-49 -> 2000-2049, 50-99 -> 1950-1999.
        if year < 100:
            year = 2000 + year if year < 50 else 1900 + year

        return year, month, day

    def _is_valid_date(self, year: int, month: int, day: int) -> bool:
        """
        Check if date components form a valid, plausible receipt date.

        Returns:
            True only when the year lies within MIN_YEAR..MAX_YEAR (inclusive)
            and the components form a real calendar date.
        """
        if not self.MIN_YEAR <= year <= self.MAX_YEAR:
            return False
        try:
            datetime(year=year, month=month, day=day)
        except ValueError:
            # e.g. month 13 or February 30th
            return False
        return True

    def _adjust_confidence(
        self, base_confidence: float, text: str, position: int
    ) -> float:
        """
        Adjust confidence based on context clues.

        Boost confidence if the date appears near a date-related keyword.

        Args:
            base_confidence: Confidence assigned to the matching pattern
            text: Text the match was found in (any letter case)
            position: Start offset of the match within ``text``

        Returns:
            ``base_confidence`` plus KEYWORD_BOOST (capped at 1.0) when one of
            DATE_KEYWORDS occurs within CONTEXT_WINDOW characters on either
            side of ``position``; otherwise ``base_confidence`` unchanged.
        """
        window_start = max(0, position - self.CONTEXT_WINDOW)
        # Upper-case only the sliced window so offsets stay aligned with the
        # original text even when upper-casing changes a string's length.
        context = text[window_start:position + self.CONTEXT_WINDOW].upper()

        if any(keyword in context for keyword in self.DATE_KEYWORDS):
            return min(1.0, base_confidence + self.KEYWORD_BOOST)

        return base_confidence


# Shared module-level instance, created once when this module is imported.
# Presumably meant to be imported and reused by callers rather than each
# constructing its own DatePatternMatcher.
date_matcher = DatePatternMatcher()