feat: add receipt OCR pipeline (refs #69)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount, fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
186
ocr/app/patterns/date_patterns.py
Normal file
186
ocr/app/patterns/date_patterns.py
Normal file
@@ -0,0 +1,186 @@
|
||||
"""Date pattern matching for receipt extraction."""
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class DateMatch:
    """One date recognised by a date pattern.

    Attributes:
        value: The date in ISO format (YYYY-MM-DD).
        raw_match: The original text that was matched.
        confidence: Confidence score assigned to this match.
        pattern_name: Name of the pattern that produced this match.
    """

    value: str
    raw_match: str
    confidence: float
    pattern_name: str
|
||||
|
||||
|
||||
class DatePatternMatcher:
    """Extract and normalize dates from receipt text.

    Each entry in ``PATTERNS`` is tried against the whole text. Every
    match that parses to a real calendar date in the accepted year range
    becomes a ``DateMatch`` whose confidence is the pattern's base weight,
    slightly boosted when a date-related keyword appears nearby.
    """

    # Lower-case month name / abbreviation -> month number.
    MONTH_NAMES = {
        "jan": 1, "january": 1,
        "feb": 2, "february": 2,
        "mar": 3, "march": 3,
        "apr": 4, "april": 4,
        "may": 5,
        "jun": 6, "june": 6,
        "jul": 7, "july": 7,
        "aug": 8, "august": 8,
        "sep": 9, "sept": 9, "september": 9,
        "oct": 10, "october": 10,
        "nov": 11, "november": 11,
        "dec": 12, "december": 12,
    }

    # Regex alternation of every known month spelling, longest first so the
    # full name is preferred over its abbreviation. Using the real names
    # (instead of "any three letters") lets "March" and "Sept" match.
    _MONTH_NAME_RE = "|".join(sorted(MONTH_NAMES, key=len, reverse=True))

    # Pattern definitions: (regex, pattern name, base confidence).
    # Numeric patterns are wrapped in (?<!\d) ... (?!\d) so a date is never
    # carved out of a longer digit run (e.g. "12-10-05" out of "2012-10-05").
    PATTERNS = [
        # MM/DD/YYYY or MM/DD/YY (most common US format)
        (
            r"(?<!\d)(?P<month>\d{1,2})/(?P<day>\d{1,2})/(?P<year>\d{2,4})(?!\d)",
            "mm_dd_yyyy",
            0.95,
        ),
        # MM-DD-YYYY or MM-DD-YY
        (
            r"(?<!\d)(?P<month>\d{1,2})-(?P<day>\d{1,2})-(?P<year>\d{2,4})(?!\d)",
            "mm_dd_yyyy_dash",
            0.90,
        ),
        # YYYY-MM-DD (ISO format)
        (
            r"(?<!\d)(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})(?!\d)",
            "iso_date",
            0.98,
        ),
        # Mon DD, YYYY (e.g., Jan 15, 2024 / January 15 2024 / Sept. 3, 2024)
        (
            r"(?P<month_name>" + _MONTH_NAME_RE + r")\.?\s+"
            r"(?P<day>\d{1,2}),?\s+(?P<year>\d{4})(?!\d)",
            "month_name_long",
            0.85,
        ),
        # DD Mon YYYY (e.g., 15 Jan 2024 / 15 January 2024)
        (
            r"(?<!\d)(?P<day>\d{1,2})\s+"
            r"(?P<month_name>" + _MONTH_NAME_RE + r")\.?\s+"
            r"(?P<year>\d{4})(?!\d)",
            "day_month_year",
            0.85,
        ),
        # MMDDYYYY or MMDDYY (no separators, common in some POS systems)
        (
            r"(?<!\d)(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{2,4})(?!\d)",
            "compact_date",
            0.70,
        ),
    ]

    # Keywords that raise confidence when found near a match (upper-case).
    _CONTEXT_KEYWORDS = ("DATE", "TIME", "TRANS", "SALE")
    # Number of characters inspected on each side of the match start.
    _CONTEXT_RADIUS = 50

    def extract_dates(self, text: str) -> "list[DateMatch]":
        """
        Extract all date patterns from text.

        Args:
            text: Receipt text to search. ``None`` or an empty string
                yields an empty list.

        Returns:
            List of DateMatch objects sorted by confidence (highest first),
            with at most one entry per distinct ISO date value.
        """
        if not text:
            return []

        candidates = []
        for pattern, name, base_confidence in self.PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                parsed = self._parse_match(match, name)
                if parsed is None:
                    continue

                year, month, day = parsed
                if not self._is_valid_date(year, month, day):
                    continue

                # The original text is passed (not an upper-cased copy) so
                # that match.start() indexes the string the regex ran on.
                confidence = self._adjust_confidence(
                    base_confidence, text, match.start()
                )
                candidates.append(
                    DateMatch(
                        value=f"{year:04d}-{month:02d}-{day:02d}",
                        raw_match=match.group(0),
                        confidence=confidence,
                        pattern_name=name,
                    )
                )

        # Stable sort: ties keep PATTERNS order. The first (highest
        # confidence) entry for each ISO value is the one that is kept.
        candidates.sort(key=lambda item: item.confidence, reverse=True)
        seen = set()
        unique_matches = []
        for candidate in candidates:
            if candidate.value not in seen:
                seen.add(candidate.value)
                unique_matches.append(candidate)

        return unique_matches

    def extract_best_date(self, text: str) -> "Optional[DateMatch]":
        """
        Extract the most likely transaction date.

        Args:
            text: Receipt text to search

        Returns:
            Best DateMatch (highest confidence) or None if no date found
        """
        matches = self.extract_dates(text)
        return matches[0] if matches else None

    def _parse_match(
        self, match: re.Match, pattern_name: str
    ) -> Optional[tuple[int, int, int]]:
        """
        Parse regex match into year, month, day tuple.

        Args:
            match: Match produced by one of the PATTERNS regexes; it must
                define ``day``, ``year`` and either ``month`` or
                ``month_name`` groups.
            pattern_name: Name of the pattern that matched (currently
                unused, kept for interface stability).

        Returns:
            ``(year, month, day)`` as integers, or None when the month name
            is not a known month. The values are not range-checked here.
        """
        groups = match.groupdict()

        month_name = groups.get("month_name")
        if month_name is not None:
            month = self.MONTH_NAMES.get(month_name.lower())
            if month is None:
                return None
        else:
            month = int(groups["month"])

        day = int(groups["day"])
        year_text = groups["year"]
        year = int(year_text)

        # Expand only genuine two-digit years. Testing the digit count
        # (not the value) keeps "024" or "0024" from turning into 2024.
        if len(year_text) == 2:
            year = 2000 + year if year < 50 else 1900 + year

        return year, month, day

    def _is_valid_date(self, year: int, month: int, day: int) -> bool:
        """
        Check if date components form a valid date.

        Returns:
            True when the year is within 2000-2100 (the range accepted for
            receipts) and the components form a real calendar date.
        """
        if not 2000 <= year <= 2100:
            return False
        try:
            datetime(year=year, month=month, day=day)
        except ValueError:
            return False
        return True

    def _adjust_confidence(
        self, base_confidence: float, text: str, position: int
    ) -> float:
        """
        Adjust confidence based on context clues.

        Boost confidence if date appears near date-related keywords.

        Args:
            base_confidence: Confidence weight of the matching pattern.
            text: Text the match was found in (any letter case).
            position: Index in ``text`` where the match starts.

        Returns:
            ``base_confidence + 0.05`` capped at 1.0 when a keyword occurs
            within the context window, otherwise ``base_confidence``.
        """
        start = max(0, position - self._CONTEXT_RADIUS)
        # Upper-case only the window: upper-casing the whole text first can
        # change its length for some characters and shift ``position``.
        context = text[start:position + self._CONTEXT_RADIUS].upper()

        if any(keyword in context for keyword in self._CONTEXT_KEYWORDS):
            return min(1.0, base_confidence + 0.05)

        return base_confidence
|
||||
|
||||
|
||||
# Singleton instance: a single module-level DatePatternMatcher, created once
# when this module is imported.
date_matcher = DatePatternMatcher()
|
||||
Reference in New Issue
Block a user