Files
motovaultpro/ocr/app/patterns/date_patterns.py
Eric Gullickson 6319d50fb1
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
feat: add receipt OCR pipeline (refs #69)
Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount,
fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 20:43:30 -06:00

187 lines
5.6 KiB
Python

"""Date pattern matching for receipt extraction."""
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
@dataclass
class DateMatch:
    """One date found in receipt text by a date pattern.

    Attributes:
        value: The date normalized to ISO format (YYYY-MM-DD).
        raw_match: The original text span that matched the pattern.
        confidence: Score attached to the match; higher sorts first.
        pattern_name: Name of the pattern that produced the match.
    """

    value: str
    raw_match: str
    confidence: float
    pattern_name: str
class DatePatternMatcher:
    """Extract and normalize dates from receipt text.

    Each entry of ``PATTERNS`` is a ``(regex, pattern_name, base_confidence)``
    tuple. Regexes are applied case-insensitively and must expose ``day`` and
    ``year`` groups plus either a numeric ``month`` group or a ``month_name``
    group that is looked up in ``MONTH_NAMES``. Purely numeric slash/dash
    patterns are read month-first (US order).
    """

    # Alternation covering every key of MONTH_NAMES (abbreviated and full
    # names). A fixed three-letter class cannot match "January" or "Sept".
    _MONTH_NAME_RE = (
        r"jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?"
        r"|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?"
        r"|dec(?:ember)?"
    )

    # Number of characters inspected on each side of a match start when
    # looking for date-related keywords.
    _CONTEXT_WINDOW = 50

    # Pattern definitions with named groups and confidence weights.
    # Numeric patterns are fenced with (?<!\d) / (?!\d) so they cannot match
    # inside a longer digit run (e.g. "12-01-05" inside ISO "2012-01-05").
    PATTERNS = [
        # MM/DD/YYYY or MM/DD/YY (most common US format)
        (
            r"(?<!\d)(?P<month>\d{1,2})/(?P<day>\d{1,2})/(?P<year>\d{4}|\d{2})(?!\d)",
            "mm_dd_yyyy",
            0.95,
        ),
        # MM-DD-YYYY or MM-DD-YY
        (
            r"(?<!\d)(?P<month>\d{1,2})-(?P<day>\d{1,2})-(?P<year>\d{4}|\d{2})(?!\d)",
            "mm_dd_yyyy_dash",
            0.90,
        ),
        # YYYY-MM-DD (ISO format)
        (
            r"(?<!\d)(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})(?!\d)",
            "iso_date",
            0.98,
        ),
        # Mon DD, YYYY or Month DD, YYYY (e.g., Jan 15, 2024)
        (
            r"(?P<month_name>" + _MONTH_NAME_RE + r")"
            r"\s+(?P<day>\d{1,2}),?\s+(?P<year>\d{4})(?!\d)",
            "month_name_long",
            0.85,
        ),
        # DD Mon YYYY or DD Month YYYY (e.g., 15 Jan 2024)
        (
            r"(?<!\d)(?P<day>\d{1,2})\s+"
            r"(?P<month_name>" + _MONTH_NAME_RE + r")"
            r"\s+(?P<year>\d{4})(?!\d)",
            "day_month_year",
            0.85,
        ),
        # MMDDYYYY or MMDDYY (no separators, common in some POS systems)
        (
            r"(?<!\d)(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{2,4})(?!\d)",
            "compact_date",
            0.70,
        ),
    ]

    MONTH_NAMES = {
        "jan": 1, "january": 1,
        "feb": 2, "february": 2,
        "mar": 3, "march": 3,
        "apr": 4, "april": 4,
        "may": 5,
        "jun": 6, "june": 6,
        "jul": 7, "july": 7,
        "aug": 8, "august": 8,
        "sep": 9, "sept": 9, "september": 9,
        "oct": 10, "october": 10,
        "nov": 11, "november": 11,
        "dec": 12, "december": 12,
    }

    def extract_dates(self, text: str) -> "list[DateMatch]":
        """
        Extract all date patterns from text.

        Args:
            text: Receipt text to search. Empty or None yields no matches.

        Returns:
            List of DateMatch objects sorted by descending confidence, with
            at most one entry per normalized date value.
        """
        if not text:
            return []

        found = []
        for pattern, name, base_confidence in self.PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                parsed = self._parse_match(match, name)
                if parsed is None:
                    continue
                year, month, day = parsed
                if not self._is_valid_date(year, month, day):
                    continue
                # Pass the original text: match offsets refer to it, and the
                # helper upper-cases only the window it inspects.
                confidence = self._adjust_confidence(
                    base_confidence, text, match.start()
                )
                found.append(
                    DateMatch(
                        value=f"{year:04d}-{month:02d}-{day:02d}",
                        raw_match=match.group(0),
                        confidence=confidence,
                        pattern_name=name,
                    )
                )

        # The sort is stable, so equal-confidence matches keep PATTERNS order;
        # deduplication then keeps the highest-confidence match per value.
        found.sort(key=lambda m: m.confidence, reverse=True)
        seen = set()
        unique_matches = []
        for candidate in found:
            if candidate.value not in seen:
                seen.add(candidate.value)
                unique_matches.append(candidate)
        return unique_matches

    def extract_best_date(self, text: str) -> "Optional[DateMatch]":
        """
        Extract the most likely transaction date.

        Args:
            text: Receipt text to search

        Returns:
            The highest-confidence DateMatch, or None if no date was found
        """
        matches = self.extract_dates(text)
        return matches[0] if matches else None

    def _parse_match(
        self, match: re.Match, pattern_name: str
    ) -> Optional[tuple[int, int, int]]:
        """
        Parse a regex match into a (year, month, day) tuple.

        Args:
            match: Match produced by one of the PATTERNS regexes
            pattern_name: Name of the pattern that matched (not used here)

        Returns:
            (year, month, day) as integers, or None when the month name is
            not a key of MONTH_NAMES. The result is not validated as a real
            calendar date.
        """
        groups = match.groupdict()

        # Month-name patterns carry the month as text; the rest are numeric.
        if "month_name" in groups:
            month = self.MONTH_NAMES.get(groups["month_name"].lower())
            if not month:
                return None
        else:
            month = int(groups["month"])

        day = int(groups["day"])
        year = int(groups["year"])

        # Normalize 2-digit years with a pivot at 50. Years that land in the
        # 1900s are rejected later by _is_valid_date.
        if year < 100:
            year = 2000 + year if year < 50 else 1900 + year

        return year, month, day

    def _is_valid_date(self, year: int, month: int, day: int) -> bool:
        """
        Check if date components form a valid date.

        Returns:
            True only when the components are a real calendar date and the
            year lies in the 2000-2100 range accepted for receipts.
        """
        try:
            datetime(year=year, month=month, day=day)
        except ValueError:
            return False
        # Reasonable year range for receipts
        return 2000 <= year <= 2100

    def _adjust_confidence(
        self, base_confidence: float, text: str, position: int
    ) -> float:
        """
        Adjust confidence based on context clues.

        Boost confidence if the date appears near date-related keywords.

        Args:
            base_confidence: Confidence weight of the pattern that matched
            text: Receipt text; raw or already upper-cased, since the
                inspected window is upper-cased here
            position: Start offset of the match within ``text``

        Returns:
            ``base_confidence + 0.05`` capped at 1.0 when a keyword is found
            within the window around ``position``, else ``base_confidence``
        """
        window_start = max(0, position - self._CONTEXT_WINDOW)
        # Slice first, then upper-case: str.upper() can change the string
        # length (e.g. "ß" -> "SS"), which would shift offsets if the whole
        # text were upper-cased before slicing.
        context = text[window_start:position + self._CONTEXT_WINDOW].upper()

        date_keywords = ("DATE", "TIME", "TRANS", "SALE")
        if any(keyword in context for keyword in date_keywords):
            return min(1.0, base_confidence + 0.05)
        return base_confidence
# Shared module-level instance, created at import time. Nothing enforces a
# single instance; DatePatternMatcher can still be instantiated directly.
date_matcher = DatePatternMatcher()