2026-02-02 02:47:52 +00:00
16 changed files with 2845 additions and 2 deletions
--- a/ocr/app/extractors/init.py
+++ b/ocr/app/extractors/init.py
@@ -1,10 +1,23 @@
 """Extractors package for domain-specific OCR extraction."""
 from app.extractors.base import BaseExtractor, ExtractionResult
 from app.extractors.vin_extractor import VinExtractor, vin_extractor
 from app.extractors.receipt_extractor import (
    ReceiptExtractor,
    receipt_extractor,
    ReceiptExtractionResult,
    ExtractedField,
 )
 from app.extractors.fuel_receipt import FuelReceiptExtractor, fuel_receipt_extractor
 __all__ = [
    "BaseExtractor",
    "ExtractionResult",
    "VinExtractor",
    "vin_extractor",
    "ReceiptExtractor",
    "receipt_extractor",
    "ReceiptExtractionResult",
    "ExtractedField",
    "FuelReceiptExtractor",
    "fuel_receipt_extractor",
 ]
--- a/ocr/app/extractors/fuel_receipt.py
+++ b/ocr/app/extractors/fuel_receipt.py
@@ -0,0 +1,193 @@
 """Fuel receipt specialization with validation and cross-checking."""
 import logging
 from dataclasses import dataclass
 from typing import Optional
 from app.extractors.receipt_extractor import (
    ExtractedField,
    ReceiptExtractionResult,
    receipt_extractor,
 )
 logger = logging.getLogger(__name__)
@dataclass
 class FuelReceiptValidation:
    """Validation result for fuel receipt extraction."""
    is_valid: bool
    issues: list[str]
    confidence_score: float
 class FuelReceiptExtractor:
    """Specialized fuel receipt extractor with cross-validation.
    Provides additional validation and confidence scoring specific
    to fuel receipts by cross-checking extracted values.
    """
    # Expected fields for a complete fuel receipt
    REQUIRED_FIELDS = ["totalAmount"]
    OPTIONAL_FIELDS = [
        "merchantName",
        "transactionDate",
        "fuelQuantity",
        "pricePerUnit",
        "fuelGrade",
    ]
    def extract(
        self,
        image_bytes: bytes,
        content_type: Optional[str] = None,
    ) -> ReceiptExtractionResult:
        """
        Extract fuel receipt data with validation.
        Args:
            image_bytes: Raw image bytes
            content_type: MIME type
        Returns:
            ReceiptExtractionResult with fuel-specific extraction
        """
        # Use base receipt extractor with fuel hint
        result = receipt_extractor.extract(
            image_bytes=image_bytes,
            content_type=content_type,
            receipt_type="fuel",
        )
        if not result.success:
            return result
        # Validate and cross-check fuel fields
        validation = self._validate_fuel_receipt(result.extracted_fields)
        if validation.issues:
            logger.warning(
                f"Fuel receipt validation issues: {validation.issues}"
            )
        # Update overall confidence based on validation
        result.extracted_fields = self._adjust_confidences(
            result.extracted_fields, validation
        )
        return result
    def _validate_fuel_receipt(
        self, fields: dict[str, ExtractedField]
    ) -> FuelReceiptValidation:
        """
        Validate extracted fuel receipt fields.
        Cross-checks:
        - total = quantity * price per unit (within tolerance)
        - quantity is reasonable for a single fill-up
        - price per unit is within market range
        Args:
            fields: Extracted fields
        Returns:
            FuelReceiptValidation with issues and confidence
        """
        issues = []
        confidence_score = 1.0
        # Check required fields
        for field_name in self.REQUIRED_FIELDS:
            if field_name not in fields:
                issues.append(f"Missing required field: {field_name}")
                confidence_score *= 0.5
        # Cross-validate total = quantity * price
        if all(
            f in fields for f in ["totalAmount", "fuelQuantity", "pricePerUnit"]
        ):
            total = fields["totalAmount"].value
            quantity = fields["fuelQuantity"].value
            price = fields["pricePerUnit"].value
            calculated_total = quantity * price
            tolerance = 0.10  # Allow 10% tolerance for rounding
            if abs(total - calculated_total) > total * tolerance:
                issues.append(
                    f"Total ({total}) doesn't match quantity ({quantity}) * "
                    f"price ({price}) = {calculated_total:.2f}"
                )
                confidence_score *= 0.7
        # Validate quantity is reasonable
        if "fuelQuantity" in fields:
            quantity = fields["fuelQuantity"].value
            if quantity < 0.5:
                issues.append(f"Fuel quantity too small: {quantity}")
                confidence_score *= 0.6
            elif quantity > 40:  # 40 gallons is very large tank
                issues.append(f"Fuel quantity unusually large: {quantity}")
                confidence_score *= 0.8
        # Validate price is reasonable (current US market range)
        if "pricePerUnit" in fields:
            price = fields["pricePerUnit"].value
            if price < 1.50:
                issues.append(f"Price per unit too low: ${price}")
                confidence_score *= 0.7
            elif price > 7.00:
                issues.append(f"Price per unit unusually high: ${price}")
                confidence_score *= 0.8
        # Validate fuel grade
        if "fuelGrade" in fields:
            grade = fields["fuelGrade"].value
            valid_grades = ["87", "89", "91", "93", "DIESEL", "E85"]
            if grade not in valid_grades:
                issues.append(f"Unknown fuel grade: {grade}")
                confidence_score *= 0.9
        is_valid = len(issues) == 0
        return FuelReceiptValidation(
            is_valid=is_valid,
            issues=issues,
            confidence_score=confidence_score,
        )
    def _adjust_confidences(
        self,
        fields: dict[str, ExtractedField],
        validation: FuelReceiptValidation,
    ) -> dict[str, ExtractedField]:
        """
        Adjust field confidences based on validation.
        Args:
            fields: Extracted fields
            validation: Validation result
        Returns:
            Fields with adjusted confidences
        """
        if validation.is_valid:
            # Boost confidences when cross-validation passes
            boost = 1.1
        else:
            # Reduce confidences when there are issues
            boost = validation.confidence_score
        adjusted = {}
        for name, field in fields.items():
            adjusted[name] = ExtractedField(
                value=field.value,
                confidence=min(1.0, field.confidence * boost),
            )
        return adjusted
 # Singleton instance
 fuel_receipt_extractor = FuelReceiptExtractor()
--- a/ocr/app/extractors/receipt_extractor.py
+++ b/ocr/app/extractors/receipt_extractor.py
@@ -0,0 +1,345 @@
 """Receipt-specific OCR extractor with field extraction."""
 import io
 import logging
 import time
 from dataclasses import dataclass, field
 from typing import Any, Optional
 import magic
 import pytesseract
 from PIL import Image
 from pillow_heif import register_heif_opener
 from app.config import settings
 from app.extractors.base import BaseExtractor
 from app.preprocessors.receipt_preprocessor import receipt_preprocessor
 from app.patterns import currency_matcher, date_matcher, fuel_matcher
 # Register HEIF/HEIC opener
 register_heif_opener()
 logger = logging.getLogger(__name__)
@dataclass
 class ExtractedField:
    """A single extracted field with confidence."""
    value: Any
    confidence: float
@dataclass
 class ReceiptExtractionResult:
    """Result of receipt extraction."""
    success: bool
    receipt_type: str = "unknown"
    extracted_fields: dict[str, ExtractedField] = field(default_factory=dict)
    raw_text: str = ""
    processing_time_ms: int = 0
    error: Optional[str] = None
 class ReceiptExtractor(BaseExtractor):
    """Receipt-specific OCR extractor for fuel and general receipts."""
    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
    }
    def __init__(self) -> None:
        """Initialize receipt extractor."""
        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
    def extract(
        self,
        image_bytes: bytes,
        content_type: Optional[str] = None,
        receipt_type: Optional[str] = None,
    ) -> ReceiptExtractionResult:
        """
        Extract data from a receipt image.
        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            content_type: MIME type (auto-detected if not provided)
            receipt_type: Hint for receipt type ("fuel" for specialized extraction)
        Returns:
            ReceiptExtractionResult with extracted fields
        """
        start_time = time.time()
        # Detect content type if not provided
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)
        # Validate content type
        if content_type not in self.SUPPORTED_TYPES:
            return ReceiptExtractionResult(
                success=False,
                error=f"Unsupported file type: {content_type}",
                processing_time_ms=int((time.time() - start_time) * 1000),
            )
        try:
            # Apply receipt-optimized preprocessing
            preprocessing_result = receipt_preprocessor.preprocess(image_bytes)
            preprocessed_bytes = preprocessing_result.image_bytes
            # Perform OCR
            raw_text = self._perform_ocr(preprocessed_bytes)
            if not raw_text.strip():
                # Try with less aggressive preprocessing
                preprocessing_result = receipt_preprocessor.preprocess(
                    image_bytes,
                    apply_threshold=False,
                )
                preprocessed_bytes = preprocessing_result.image_bytes
                raw_text = self._perform_ocr(preprocessed_bytes)
            if not raw_text.strip():
                return ReceiptExtractionResult(
                    success=False,
                    error="No text found in image",
                    processing_time_ms=int((time.time() - start_time) * 1000),
                )
            # Detect receipt type if not specified
            detected_type = receipt_type or self._detect_receipt_type(raw_text)
            # Extract fields based on receipt type
            if detected_type == "fuel":
                extracted_fields = self._extract_fuel_fields(raw_text)
            else:
                extracted_fields = self._extract_generic_fields(raw_text)
            processing_time_ms = int((time.time() - start_time) * 1000)
            logger.info(
                f"Receipt extraction: type={detected_type}, "
                f"fields={len(extracted_fields)}, "
                f"time={processing_time_ms}ms"
            )
            return ReceiptExtractionResult(
                success=True,
                receipt_type=detected_type,
                extracted_fields=extracted_fields,
                raw_text=raw_text,
                processing_time_ms=processing_time_ms,
            )
        except Exception as e:
            logger.error(f"Receipt extraction failed: {e}", exc_info=True)
            return ReceiptExtractionResult(
                success=False,
                error=str(e),
                processing_time_ms=int((time.time() - start_time) * 1000),
            )
    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic."""
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"
    def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str:
        """
        Perform OCR on preprocessed image.
        Args:
            image_bytes: Preprocessed image bytes
            psm: Tesseract page segmentation mode
                 4 = Assume single column of text
                 6 = Uniform block of text
        Returns:
            Raw OCR text
        """
        image = Image.open(io.BytesIO(image_bytes))
        # Configure Tesseract for receipt OCR
        # PSM 4 works well for columnar receipt text
        config = f"--psm {psm}"
        return pytesseract.image_to_string(image, config=config)
    def _detect_receipt_type(self, text: str) -> str:
        """
        Detect receipt type based on content.
        Args:
            text: OCR text
        Returns:
            Receipt type: "fuel", "retail", or "unknown"
        """
        text_upper = text.upper()
        # Fuel receipt indicators
        fuel_keywords = [
            "GALLON", "GAL", "FUEL", "GAS", "DIESEL", "UNLEADED",
            "REGULAR", "PREMIUM", "OCTANE", "PPG", "PUMP",
        ]
        fuel_score = sum(1 for kw in fuel_keywords if kw in text_upper)
        # Check for known gas stations
        if fuel_matcher.extract_merchant_name(text):
            merchant, _ = fuel_matcher.extract_merchant_name(text)
            if any(
                station in merchant.upper()
                for station in fuel_matcher.STATION_NAMES
            ):
                fuel_score += 3
        if fuel_score >= 2:
            return "fuel"
        return "unknown"
    def _extract_fuel_fields(self, text: str) -> dict[str, ExtractedField]:
        """
        Extract fuel-specific fields from receipt text.
        Args:
            text: OCR text
        Returns:
            Dictionary of extracted fields
        """
        fields: dict[str, ExtractedField] = {}
        # Extract merchant name
        merchant_result = fuel_matcher.extract_merchant_name(text)
        if merchant_result:
            merchant_name, confidence = merchant_result
            fields["merchantName"] = ExtractedField(
                value=merchant_name,
                confidence=confidence,
            )
        # Extract transaction date
        date_match = date_matcher.extract_best_date(text)
        if date_match:
            fields["transactionDate"] = ExtractedField(
                value=date_match.value,
                confidence=date_match.confidence,
            )
        # Extract total amount
        total_match = currency_matcher.extract_total(text)
        if total_match:
            fields["totalAmount"] = ExtractedField(
                value=total_match.value,
                confidence=total_match.confidence,
            )
        # Extract fuel quantity
        quantity_match = fuel_matcher.extract_quantity(text)
        if quantity_match:
            fields["fuelQuantity"] = ExtractedField(
                value=quantity_match.value,
                confidence=quantity_match.confidence,
            )
        # Extract price per unit
        price_match = fuel_matcher.extract_price_per_unit(text)
        if price_match:
            fields["pricePerUnit"] = ExtractedField(
                value=price_match.value,
                confidence=price_match.confidence,
            )
        # Extract fuel grade
        grade_match = fuel_matcher.extract_grade(text)
        if grade_match:
            fields["fuelGrade"] = ExtractedField(
                value=grade_match.value,
                confidence=grade_match.confidence,
            )
        # Calculate derived values if we have enough data
        if "totalAmount" in fields and "fuelQuantity" in fields:
            if "pricePerUnit" not in fields:
                # Calculate price per unit from total and quantity
                calculated_price = (
                    fields["totalAmount"].value / fields["fuelQuantity"].value
                )
                # Only use if reasonable
                if 1.0 <= calculated_price <= 10.0:
                    fields["pricePerUnit"] = ExtractedField(
                        value=round(calculated_price, 3),
                        confidence=min(
                            fields["totalAmount"].confidence,
                            fields["fuelQuantity"].confidence,
                        )
                        * 0.8,  # Lower confidence for calculated value
                    )
        return fields
    def _extract_generic_fields(self, text: str) -> dict[str, ExtractedField]:
        """
        Extract generic fields from receipt text.
        Args:
            text: OCR text
        Returns:
            Dictionary of extracted fields
        """
        fields: dict[str, ExtractedField] = {}
        # Extract date
        date_match = date_matcher.extract_best_date(text)
        if date_match:
            fields["transactionDate"] = ExtractedField(
                value=date_match.value,
                confidence=date_match.confidence,
            )
        # Extract total amount
        total_match = currency_matcher.extract_total(text)
        if total_match:
            fields["totalAmount"] = ExtractedField(
                value=total_match.value,
                confidence=total_match.confidence,
            )
        # Try to get merchant from first line
        lines = [l.strip() for l in text.split("\n") if l.strip()]
        if lines:
            fields["merchantName"] = ExtractedField(
                value=lines[0][:50],
                confidence=0.40,
            )
        return fields
    def validate(self, data: Any) -> bool:
        """
        Validate extracted receipt data.
        Args:
            data: Extracted data to validate
        Returns:
            True if data has minimum required fields
        """
        if not isinstance(data, dict):
            return False
        # Minimum: must have at least total amount or date
        return "totalAmount" in data or "transactionDate" in data
 # Singleton instance
 receipt_extractor = ReceiptExtractor()
--- a/ocr/app/models/init.py
+++ b/ocr/app/models/init.py
@@ -7,6 +7,8 @@ from .schemas import (
    JobStatus,
    JobSubmitRequest,
    OcrResponse,
    ReceiptExtractedField,
    ReceiptExtractionResponse,
    VinAlternative,
    VinExtractionResponse,
 )
@@ -19,6 +21,8 @@ __all__ = [
    "JobStatus",
    "JobSubmitRequest",
    "OcrResponse",
    "ReceiptExtractedField",
    "ReceiptExtractionResponse",
    "VinAlternative",
    "VinExtractionResponse",
 ]
--- a/ocr/app/models/schemas.py
+++ b/ocr/app/models/schemas.py
@@ -93,3 +93,25 @@ class JobSubmitRequest(BaseModel):
    callback_url: Optional[str] = Field(default=None, alias="callbackUrl")
    model_config = {"populate_by_name": True}
 class ReceiptExtractedField(BaseModel):
    """A single extracted field from a receipt with confidence."""
    value: str | float
    confidence: float = Field(ge=0.0, le=1.0)
 class ReceiptExtractionResponse(BaseModel):
    """Response from receipt extraction endpoint."""
    success: bool
    receipt_type: str = Field(alias="receiptType")
    extracted_fields: dict[str, ReceiptExtractedField] = Field(
        default_factory=dict, alias="extractedFields"
    )
    raw_text: str = Field(alias="rawText")
    processing_time_ms: int = Field(alias="processingTimeMs")
    error: Optional[str] = None
    model_config = {"populate_by_name": True}
--- a/ocr/app/patterns/init.py
+++ b/ocr/app/patterns/init.py
@@ -0,0 +1,13 @@
 """Pattern matching modules for receipt field extraction."""
 from app.patterns.date_patterns import DatePatternMatcher, date_matcher
 from app.patterns.currency_patterns import CurrencyPatternMatcher, currency_matcher
 from app.patterns.fuel_patterns import FuelPatternMatcher, fuel_matcher
 __all__ = [
    "DatePatternMatcher",
    "date_matcher",
    "CurrencyPatternMatcher",
    "currency_matcher",
    "FuelPatternMatcher",
    "fuel_matcher",
 ]
--- a/ocr/app/patterns/currency_patterns.py
+++ b/ocr/app/patterns/currency_patterns.py
@@ -0,0 +1,227 @@
 """Currency and amount pattern matching for receipt extraction."""
 import re
 from dataclasses import dataclass
 from decimal import Decimal, InvalidOperation
 from typing import Optional
@dataclass
 class AmountMatch:
    """Result of currency/amount pattern matching."""
    value: float
    raw_match: str
    confidence: float
    pattern_name: str
    label: Optional[str] = None  # e.g., "TOTAL", "SUBTOTAL"
 class CurrencyPatternMatcher:
    """Extract and normalize currency amounts from receipt text."""
    # Total amount patterns (prioritized)
    TOTAL_PATTERNS = [
        # TOTAL $XX.XX or TOTAL: $XX.XX
        (
            r"(?:^|\s)TOTAL[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
            "total_explicit",
            0.98,
        ),
        # AMOUNT DUE $XX.XX
        (
            r"AMOUNT\s*DUE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "amount_due",
            0.95,
        ),
        # SALE $XX.XX
        (
            r"(?:^|\s)SALE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
            "sale_explicit",
            0.92,
        ),
        # GRAND TOTAL $XX.XX
        (
            r"GRAND\s*TOTAL[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "grand_total",
            0.97,
        ),
        # TOTAL SALE $XX.XX
        (
            r"TOTAL\s*SALE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "total_sale",
            0.96,
        ),
        # BALANCE DUE $XX.XX
        (
            r"BALANCE\s*DUE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "balance_due",
            0.94,
        ),
        # PURCHASE $XX.XX
        (
            r"(?:^|\s)PURCHASE[:\s]*\$?\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})(?:\s|$)",
            "purchase",
            0.88,
        ),
    ]
    # Generic amount patterns (lower priority)
    AMOUNT_PATTERNS = [
        # $XX.XX (standalone dollar amount)
        (
            r"\$\s*(\d{1,6}[,.]?\d{0,3}[.,]\d{2})",
            "dollar_amount",
            0.60,
        ),
        # XX.XX (standalone decimal amount)
        (
            r"(?<![.$\d])(\d{1,6}[.,]\d{2})(?![.\d])",
            "decimal_amount",
            0.40,
        ),
    ]
    def extract_total(self, text: str) -> Optional[AmountMatch]:
        """
        Extract the total amount from receipt text.
        Prioritizes explicit total patterns over generic amounts.
        Args:
            text: Receipt text to search
        Returns:
            AmountMatch for total or None if not found
        """
        text_upper = text.upper()
        # Try total-specific patterns first
        for pattern, name, confidence in self.TOTAL_PATTERNS:
            match = re.search(pattern, text_upper, re.MULTILINE)
            if match:
                amount = self._parse_amount(match.group(1))
                if amount is not None and self._is_reasonable_total(amount):
                    return AmountMatch(
                        value=amount,
                        raw_match=match.group(0).strip(),
                        confidence=confidence,
                        pattern_name=name,
                        label=self._extract_label(name),
                    )
        # Fall back to finding the largest reasonable amount
        all_amounts = self.extract_all_amounts(text)
        reasonable = [a for a in all_amounts if self._is_reasonable_total(a.value)]
        if reasonable:
            # Assume largest amount is the total
            reasonable.sort(key=lambda x: x.value, reverse=True)
            best = reasonable[0]
            # Lower confidence since we're guessing
            return AmountMatch(
                value=best.value,
                raw_match=best.raw_match,
                confidence=min(0.60, best.confidence),
                pattern_name="inferred_total",
                label="TOTAL",
            )
        return None
    def extract_all_amounts(self, text: str) -> list[AmountMatch]:
        """
        Extract all currency amounts from text.
        Args:
            text: Receipt text to search
        Returns:
            List of AmountMatch objects
        """
        matches = []
        text_upper = text.upper()
        # Check total patterns
        for pattern, name, confidence in self.TOTAL_PATTERNS:
            for match in re.finditer(pattern, text_upper, re.MULTILINE):
                amount = self._parse_amount(match.group(1))
                if amount is not None:
                    matches.append(
                        AmountMatch(
                            value=amount,
                            raw_match=match.group(0).strip(),
                            confidence=confidence,
                            pattern_name=name,
                            label=self._extract_label(name),
                        )
                    )
        # Check generic amount patterns
        for pattern, name, confidence in self.AMOUNT_PATTERNS:
            for match in re.finditer(pattern, text_upper):
                amount = self._parse_amount(match.group(1))
                if amount is not None:
                    # Skip if already found by a more specific pattern
                    if not any(abs(m.value - amount) < 0.01 for m in matches):
                        matches.append(
                            AmountMatch(
                                value=amount,
                                raw_match=match.group(0).strip(),
                                confidence=confidence,
                                pattern_name=name,
                            )
                        )
        return matches
    def _parse_amount(self, amount_str: str) -> Optional[float]:
        """Parse amount string to float, handling various formats."""
        # Remove any spaces
        cleaned = amount_str.strip().replace(" ", "")
        # Handle European format (1.234,56) vs US format (1,234.56)
        # For US receipts, assume comma is thousands separator
        if "," in cleaned and "." in cleaned:
            # Determine which is decimal separator (last one)
            if cleaned.rfind(",") > cleaned.rfind("."):
                # European format
                cleaned = cleaned.replace(".", "").replace(",", ".")
            else:
                # US format
                cleaned = cleaned.replace(",", "")
        elif "," in cleaned:
            # Could be thousands separator or decimal
            parts = cleaned.split(",")
            if len(parts) == 2 and len(parts[1]) == 2:
                # Likely decimal separator
                cleaned = cleaned.replace(",", ".")
            else:
                # Likely thousands separator
                cleaned = cleaned.replace(",", "")
        try:
            amount = float(Decimal(cleaned))
            return amount if amount >= 0 else None
        except (InvalidOperation, ValueError):
            return None
    def _is_reasonable_total(self, amount: float) -> bool:
        """Check if amount is a reasonable total for a fuel receipt."""
        # Reasonable range: $1 to $500 for typical fuel purchases
        return 1.0 <= amount <= 500.0
    def _extract_label(self, pattern_name: str) -> str:
        """Extract display label from pattern name."""
        labels = {
            "total_explicit": "TOTAL",
            "amount_due": "AMOUNT DUE",
            "sale_explicit": "SALE",
            "grand_total": "GRAND TOTAL",
            "total_sale": "TOTAL SALE",
            "balance_due": "BALANCE DUE",
            "purchase": "PURCHASE",
        }
        return labels.get(pattern_name, "TOTAL")
 # Singleton instance
 currency_matcher = CurrencyPatternMatcher()
--- a/ocr/app/patterns/date_patterns.py
+++ b/ocr/app/patterns/date_patterns.py
@@ -0,0 +1,186 @@
 """Date pattern matching for receipt extraction."""
 import re
 from dataclasses import dataclass
 from datetime import datetime
 from typing import Optional
@dataclass
 class DateMatch:
    """Result of date pattern matching."""
    value: str  # ISO format YYYY-MM-DD
    raw_match: str  # Original text matched
    confidence: float
    pattern_name: str
 class DatePatternMatcher:
    """Extract and normalize dates from receipt text."""
    # Pattern definitions with named groups and confidence weights
    PATTERNS = [
        # MM/DD/YYYY or MM/DD/YY (most common US format)
        (
            r"(?P<month>\d{1,2})/(?P<day>\d{1,2})/(?P<year>\d{2,4})",
            "mm_dd_yyyy",
            0.95,
        ),
        # MM-DD-YYYY or MM-DD-YY
        (
            r"(?P<month>\d{1,2})-(?P<day>\d{1,2})-(?P<year>\d{2,4})",
            "mm_dd_yyyy_dash",
            0.90,
        ),
        # YYYY-MM-DD (ISO format)
        (
            r"(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})",
            "iso_date",
            0.98,
        ),
        # Mon DD, YYYY (e.g., Jan 15, 2024)
        (
            r"(?P<month_name>[A-Za-z]{3})\s+(?P<day>\d{1,2}),?\s+(?P<year>\d{4})",
            "month_name_long",
            0.85,
        ),
        # DD Mon YYYY (e.g., 15 Jan 2024)
        (
            r"(?P<day>\d{1,2})\s+(?P<month_name>[A-Za-z]{3})\s+(?P<year>\d{4})",
            "day_month_year",
            0.85,
        ),
        # MMDDYYYY or MMDDYY (no separators, common in some POS systems)
        (
            r"(?<!\d)(?P<month>\d{2})(?P<day>\d{2})(?P<year>\d{2,4})(?!\d)",
            "compact_date",
            0.70,
        ),
    ]
    MONTH_NAMES = {
        "jan": 1, "january": 1,
        "feb": 2, "february": 2,
        "mar": 3, "march": 3,
        "apr": 4, "april": 4,
        "may": 5,
        "jun": 6, "june": 6,
        "jul": 7, "july": 7,
        "aug": 8, "august": 8,
        "sep": 9, "sept": 9, "september": 9,
        "oct": 10, "october": 10,
        "nov": 11, "november": 11,
        "dec": 12, "december": 12,
    }
    def extract_dates(self, text: str) -> list[DateMatch]:
        """
        Extract all date patterns from text.
        Args:
            text: Receipt text to search
        Returns:
            List of DateMatch objects sorted by confidence
        """
        matches = []
        text_upper = text.upper()
        for pattern, name, base_confidence in self.PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                parsed = self._parse_match(match, name)
                if parsed:
                    year, month, day = parsed
                    if self._is_valid_date(year, month, day):
                        # Adjust confidence based on context
                        confidence = self._adjust_confidence(
                            base_confidence, text_upper, match.start()
                        )
                        matches.append(
                            DateMatch(
                                value=f"{year:04d}-{month:02d}-{day:02d}",
                                raw_match=match.group(0),
                                confidence=confidence,
                                pattern_name=name,
                            )
                        )
        # Sort by confidence, deduplicate by value
        matches.sort(key=lambda x: x.confidence, reverse=True)
        seen = set()
        unique_matches = []
        for match in matches:
            if match.value not in seen:
                seen.add(match.value)
                unique_matches.append(match)
        return unique_matches
    def extract_best_date(self, text: str) -> Optional[DateMatch]:
        """
        Extract the most likely transaction date.
        Args:
            text: Receipt text to search
        Returns:
            Best DateMatch or None if no date found
        """
        matches = self.extract_dates(text)
        return matches[0] if matches else None
    def _parse_match(
        self, match: re.Match, pattern_name: str
    ) -> Optional[tuple[int, int, int]]:
        """Parse regex match into year, month, day tuple."""
        groups = match.groupdict()
        # Handle month name patterns
        if "month_name" in groups:
            month_str = groups["month_name"].lower()
            month = self.MONTH_NAMES.get(month_str)
            if not month:
                return None
        else:
            month = int(groups["month"])
        day = int(groups["day"])
        year = int(groups["year"])
        # Normalize 2-digit years
        if year < 100:
            year = 2000 + year if year < 50 else 1900 + year
        return year, month, day
    def _is_valid_date(self, year: int, month: int, day: int) -> bool:
        """Check if date components form a valid date."""
        try:
            datetime(year=year, month=month, day=day)
            # Reasonable year range for receipts
            return 2000 <= year <= 2100
        except ValueError:
            return False
    def _adjust_confidence(
        self, base_confidence: float, text: str, position: int
    ) -> float:
        """
        Adjust confidence based on context clues.
        Boost confidence if date appears near date-related keywords.
        """
        # Look for nearby date keywords
        context_start = max(0, position - 50)
        context = text[context_start:position + 50]
        date_keywords = ["DATE", "TIME", "TRANS", "SALE"]
        for keyword in date_keywords:
            if keyword in context:
                return min(1.0, base_confidence + 0.05)
        return base_confidence
 # Singleton instance
 date_matcher = DatePatternMatcher()
--- a/ocr/app/patterns/fuel_patterns.py
+++ b/ocr/app/patterns/fuel_patterns.py
@@ -0,0 +1,364 @@
 """Fuel-specific pattern matching for receipt extraction."""
 import re
 from dataclasses import dataclass
 from typing import Optional
@dataclass
 class FuelQuantityMatch:
    """Result of fuel quantity pattern matching."""
    value: float  # Gallons or liters
    unit: str  # "GAL" or "L"
    raw_match: str
    confidence: float
    pattern_name: str
@dataclass
 class FuelPriceMatch:
    """Result of fuel price per unit pattern matching."""
    value: float
    unit: str  # "GAL" or "L"
    raw_match: str
    confidence: float
    pattern_name: str
@dataclass
 class FuelGradeMatch:
    """Result of fuel grade pattern matching."""
    value: str  # e.g., "87", "89", "93", "DIESEL"
    display_name: str  # e.g., "Regular 87", "Premium 93"
    raw_match: str
    confidence: float
 class FuelPatternMatcher:
    """Extract fuel-specific data from receipt text."""
    # Gallons patterns
    GALLONS_PATTERNS = [
        # XX.XXX GAL or XX.XXX GALLONS
        (
            r"(\d{1,3}\.\d{1,3})\s*(?:GAL(?:LON)?S?)",
            "gallons_suffix",
            0.95,
        ),
        # GALLONS: XX.XXX or GAL: XX.XXX
        (
            r"(?:GAL(?:LON)?S?)[:\s]+(\d{1,3}\.\d{1,3})",
            "gallons_prefix",
            0.93,
        ),
        # VOLUME XX.XXX
        (
            r"VOLUME[:\s]+(\d{1,3}\.\d{1,3})",
            "volume",
            0.85,
        ),
        # QTY XX.XXX (near fuel context)
        (
            r"QTY[:\s]+(\d{1,3}\.\d{1,3})",
            "qty",
            0.70,
        ),
    ]
    # Liters patterns (for international receipts)
    LITERS_PATTERNS = [
        # XX.XX L or XX.XX LITERS
        (
            r"(\d{1,3}\.\d{1,3})\s*(?:L(?:ITERS?)?)",
            "liters_suffix",
            0.95,
        ),
        # LITERS: XX.XX
        (
            r"(?:L(?:ITERS?)?)[:\s]+(\d{1,3}\.\d{1,3})",
            "liters_prefix",
            0.93,
        ),
    ]
    # Price per gallon patterns
    PRICE_PER_UNIT_PATTERNS = [
        # $X.XXX/GAL or $X.XX/GAL
        (
            r"\$?\s*(\d{1,2}\.\d{2,3})\s*/\s*GAL",
            "price_per_gal",
            0.98,
        ),
        # PRICE/GAL $X.XXX
        (
            r"PRICE\s*/\s*GAL[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "labeled_price_gal",
            0.96,
        ),
        # UNIT PRICE $X.XXX
        (
            r"UNIT\s*PRICE[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "unit_price",
            0.90,
        ),
        # @ $X.XXX (per unit implied)
        (
            r"@\s*\$?\s*(\d{1,2}\.\d{2,3})",
            "at_price",
            0.85,
        ),
        # PPG $X.XXX (price per gallon)
        (
            r"PPG[:\s]*\$?\s*(\d{1,2}\.\d{2,3})",
            "ppg",
            0.92,
        ),
    ]
    # Fuel grade patterns
    GRADE_PATTERNS = [
        # REGULAR 87, REG 87
        (r"(?:REGULAR|REG)\s*(\d{2})", "regular", 0.95),
        # UNLEADED 87
        (r"UNLEADED\s*(\d{2})", "unleaded", 0.93),
        # PLUS 89, MID 89, MIDGRADE 89
        (r"(?:PLUS|MID(?:GRADE)?)\s*(\d{2})", "plus", 0.95),
        # PREMIUM 91/93, PREM 91/93, SUPER 91/93
        (r"(?:PREMIUM|PREM|SUPER)\s*(\d{2})", "premium", 0.95),
        # Just the octane number near fuel context (87, 89, 91, 93)
        (r"(?<!\d)\s*(87|89|91|93)\s*(?:OCT(?:ANE)?)?", "octane_only", 0.75),
        # DIESEL (no octane)
        (r"DIESEL(?:\s*#?\d)?", "diesel", 0.98),
        # E85 (ethanol blend)
        (r"E\s*85", "e85", 0.95),
    ]
    # Common gas station names
    STATION_NAMES = [
        "SHELL", "CHEVRON", "EXXON", "MOBIL", "BP", "SUNOCO", "76",
        "CIRCLE K", "SPEEDWAY", "WAWA", "SHEETZ", "CASEY", "PILOT",
        "FLYING J", "LOVES", "TA", "PETRO", "MARATHON", "CITGO",
        "VALERO", "MURPHY", "COSTCO", "SAMS CLUB", "SAM'S CLUB",
        "KROGER", "QT", "QUIKTRIP", "RACETRAC", "KUM & GO",
        "KWIK TRIP", "HOLIDAY", "SINCLAIR", "CONOCO", "PHILLIPS 66",
        "ARCO", "AMPM", "AM/PM", "7-ELEVEN", "7 ELEVEN", "GETTY",
        "GULF", "HESS", "TEXACO", "TURKEY HILL", "CUMBERLAND FARMS",
    ]
    def extract_gallons(self, text: str) -> Optional[FuelQuantityMatch]:
        """
        Extract fuel quantity in gallons.
        Args:
            text: Receipt text to search
        Returns:
            FuelQuantityMatch or None
        """
        text_upper = text.upper()
        for pattern, name, confidence in self.GALLONS_PATTERNS:
            match = re.search(pattern, text_upper)
            if match:
                quantity = float(match.group(1))
                if self._is_reasonable_quantity(quantity):
                    return FuelQuantityMatch(
                        value=quantity,
                        unit="GAL",
                        raw_match=match.group(0),
                        confidence=confidence,
                        pattern_name=name,
                    )
        return None
    def extract_liters(self, text: str) -> Optional[FuelQuantityMatch]:
        """
        Extract fuel quantity in liters.
        Args:
            text: Receipt text to search
        Returns:
            FuelQuantityMatch or None
        """
        text_upper = text.upper()
        for pattern, name, confidence in self.LITERS_PATTERNS:
            match = re.search(pattern, text_upper)
            if match:
                quantity = float(match.group(1))
                if self._is_reasonable_quantity(quantity, is_liters=True):
                    return FuelQuantityMatch(
                        value=quantity,
                        unit="L",
                        raw_match=match.group(0),
                        confidence=confidence,
                        pattern_name=name,
                    )
        return None
    def extract_quantity(self, text: str) -> Optional[FuelQuantityMatch]:
        """
        Extract fuel quantity (gallons or liters).
        Prefers gallons for US receipts.
        Args:
            text: Receipt text to search
        Returns:
            FuelQuantityMatch or None
        """
        # Try gallons first (more common in US)
        gallons = self.extract_gallons(text)
        if gallons:
            return gallons
        # Fall back to liters
        return self.extract_liters(text)
    def extract_price_per_unit(self, text: str) -> Optional[FuelPriceMatch]:
        """
        Extract price per gallon/liter.
        Args:
            text: Receipt text to search
        Returns:
            FuelPriceMatch or None
        """
        text_upper = text.upper()
        for pattern, name, confidence in self.PRICE_PER_UNIT_PATTERNS:
            match = re.search(pattern, text_upper)
            if match:
                price = float(match.group(1))
                if self._is_reasonable_price(price):
                    return FuelPriceMatch(
                        value=price,
                        unit="GAL",  # Default to gallons for US
                        raw_match=match.group(0),
                        confidence=confidence,
                        pattern_name=name,
                    )
        return None
    def extract_grade(self, text: str) -> Optional[FuelGradeMatch]:
        """
        Extract fuel grade (octane rating or diesel).
        Args:
            text: Receipt text to search
        Returns:
            FuelGradeMatch or None
        """
        text_upper = text.upper()
        for pattern, name, confidence in self.GRADE_PATTERNS:
            match = re.search(pattern, text_upper)
            if match:
                if name == "diesel":
                    return FuelGradeMatch(
                        value="DIESEL",
                        display_name="Diesel",
                        raw_match=match.group(0),
                        confidence=confidence,
                    )
                elif name == "e85":
                    return FuelGradeMatch(
                        value="E85",
                        display_name="E85 Ethanol",
                        raw_match=match.group(0),
                        confidence=confidence,
                    )
                else:
                    octane = match.group(1)
                    display = self._get_grade_display_name(octane, name)
                    return FuelGradeMatch(
                        value=octane,
                        display_name=display,
                        raw_match=match.group(0),
                        confidence=confidence,
                    )
        return None
    def extract_merchant_name(self, text: str) -> Optional[tuple[str, float]]:
        """
        Extract gas station/merchant name.
        Args:
            text: Receipt text to search
        Returns:
            Tuple of (merchant_name, confidence) or None
        """
        text_upper = text.upper()
        # Check for known station names
        for station in self.STATION_NAMES:
            if station in text_upper:
                # Try to get the full line for context
                for line in text.split("\n"):
                    if station in line.upper():
                        # Clean up the line
                        cleaned = line.strip()
                        if len(cleaned) <= 50:  # Reasonable length
                            return (cleaned, 0.90)
                        return (station.title(), 0.85)
        # Fall back to first non-empty line (often the merchant)
        lines = [l.strip() for l in text.split("\n") if l.strip()]
        if lines:
            first_line = lines[0]
            # Skip if it looks like a date or number
            if not re.match(r"^\d+[/\-.]", first_line):
                return (first_line[:50], 0.50)  # Low confidence
        return None
    def _is_reasonable_quantity(
        self, quantity: float, is_liters: bool = False
    ) -> bool:
        """Check if fuel quantity is reasonable."""
        if is_liters:
            # Typical fill: 20-100 liters
            return 0.5 <= quantity <= 150.0
        else:
            # Typical fill: 5-30 gallons
            return 0.1 <= quantity <= 50.0
    def _is_reasonable_price(self, price: float) -> bool:
        """Check if price per unit is reasonable."""
        # US gas prices: $1.50 - $8.00 per gallon (allowing for fluctuation)
        return 1.00 <= price <= 10.00
    def _get_grade_display_name(self, octane: str, pattern_name: str) -> str:
        """Get display name for fuel grade."""
        grade_names = {
            "87": "Regular 87",
            "89": "Plus 89",
            "91": "Premium 91",
            "93": "Premium 93",
        }
        if octane in grade_names:
            return grade_names[octane]
        # Use pattern hint
        if pattern_name == "premium":
            return f"Premium {octane}"
        elif pattern_name == "plus":
            return f"Plus {octane}"
        else:
            return f"Unleaded {octane}"
 # Singleton instance
 fuel_matcher = FuelPatternMatcher()
--- a/ocr/app/preprocessors/init.py
+++ b/ocr/app/preprocessors/init.py
@@ -1,10 +1,16 @@
 """Image preprocessors for OCR optimization."""
 from app.services.preprocessor import ImagePreprocessor, preprocessor
 from app.preprocessors.vin_preprocessor import VinPreprocessor, vin_preprocessor
 from app.preprocessors.receipt_preprocessor import (
    ReceiptPreprocessor,
    receipt_preprocessor,
 )
 __all__ = [
    "ImagePreprocessor",
    "preprocessor",
    "VinPreprocessor",
    "vin_preprocessor",
    "ReceiptPreprocessor",
    "receipt_preprocessor",
 ]
--- a/ocr/app/preprocessors/receipt_preprocessor.py
+++ b/ocr/app/preprocessors/receipt_preprocessor.py
@@ -0,0 +1,340 @@
 """Receipt-optimized image preprocessing pipeline."""
 import io
 import logging
 from dataclasses import dataclass
 from typing import Optional
 import cv2
 import numpy as np
 from PIL import Image
 from pillow_heif import register_heif_opener
 # Register HEIF/HEIC opener
 register_heif_opener()
 logger = logging.getLogger(__name__)
@dataclass
 class ReceiptPreprocessingResult:
    """Result of receipt preprocessing."""
    image_bytes: bytes
    preprocessing_applied: list[str]
    original_width: int
    original_height: int
 class ReceiptPreprocessor:
    """Receipt-optimized image preprocessing for improved OCR accuracy.
    Thermal receipts typically have:
    - Low contrast (faded ink)
    - Uneven illumination
    - Paper curl/skew
    - Variable font weights
    This preprocessor addresses these issues with targeted enhancements.
    """
    # Optimal width for receipt OCR (narrow receipts work better)
    TARGET_WIDTH = 800
    def preprocess(
        self,
        image_bytes: bytes,
        apply_contrast: bool = True,
        apply_deskew: bool = True,
        apply_denoise: bool = True,
        apply_threshold: bool = True,
        apply_sharpen: bool = True,
    ) -> ReceiptPreprocessingResult:
        """
        Apply receipt-optimized preprocessing pipeline.
        Pipeline optimized for thermal receipts:
        1. HEIC conversion (if needed)
        2. Grayscale conversion
        3. Resize to optimal width
        4. Deskew (correct rotation)
        5. High contrast enhancement (CLAHE + histogram stretch)
        6. Adaptive sharpening
        7. Noise reduction
        8. Adaptive thresholding (receipt-optimized)
        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            apply_contrast: Apply contrast enhancement
            apply_deskew: Apply deskew correction
            apply_denoise: Apply noise reduction
            apply_threshold: Apply adaptive thresholding
            apply_sharpen: Apply sharpening
        Returns:
            ReceiptPreprocessingResult with processed image bytes
        """
        steps_applied = []
        # Load image with PIL (handles HEIC via pillow-heif)
        pil_image = Image.open(io.BytesIO(image_bytes))
        original_width, original_height = pil_image.size
        steps_applied.append("loaded")
        # Handle EXIF rotation
        pil_image = self._fix_orientation(pil_image)
        # Convert to RGB if needed
        if pil_image.mode not in ("RGB", "L"):
            pil_image = pil_image.convert("RGB")
            steps_applied.append("convert_rgb")
        # Convert to OpenCV format
        cv_image = np.array(pil_image)
        if len(cv_image.shape) == 3:
            cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
        # Convert to grayscale
        if len(cv_image.shape) == 3:
            gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
        else:
            gray = cv_image
        steps_applied.append("grayscale")
        # Resize to optimal width while maintaining aspect ratio
        gray = self._resize_optimal(gray)
        steps_applied.append("resize")
        # Apply deskew
        if apply_deskew:
            gray = self._deskew(gray)
            steps_applied.append("deskew")
        # Apply high contrast enhancement (critical for thermal receipts)
        if apply_contrast:
            gray = self._enhance_contrast(gray)
            steps_applied.append("contrast")
        # Apply sharpening
        if apply_sharpen:
            gray = self._sharpen(gray)
            steps_applied.append("sharpen")
        # Apply denoising
        if apply_denoise:
            gray = self._denoise(gray)
            steps_applied.append("denoise")
        # Apply adaptive thresholding (receipt-optimized parameters)
        if apply_threshold:
            gray = self._adaptive_threshold_receipt(gray)
            steps_applied.append("threshold")
        # Convert back to PNG bytes
        result_image = Image.fromarray(gray)
        buffer = io.BytesIO()
        result_image.save(buffer, format="PNG")
        logger.debug(f"Receipt preprocessing applied: {steps_applied}")
        return ReceiptPreprocessingResult(
            image_bytes=buffer.getvalue(),
            preprocessing_applied=steps_applied,
            original_width=original_width,
            original_height=original_height,
        )
    def _fix_orientation(self, image: Image.Image) -> Image.Image:
        """Fix image orientation based on EXIF data."""
        try:
            exif = image.getexif()
            if exif:
                orientation = exif.get(274)  # Orientation tag
                if orientation:
                    rotate_values = {
                        3: 180,
                        6: 270,
                        8: 90,
                    }
                    if orientation in rotate_values:
                        return image.rotate(
                            rotate_values[orientation], expand=True
                        )
        except Exception as e:
            logger.debug(f"Could not read EXIF orientation: {e}")
        return image
    def _resize_optimal(self, image: np.ndarray) -> np.ndarray:
        """Resize image to optimal width for OCR."""
        height, width = image.shape[:2]
        if width <= self.TARGET_WIDTH:
            return image
        scale = self.TARGET_WIDTH / width
        new_height = int(height * scale)
        return cv2.resize(
            image,
            (self.TARGET_WIDTH, new_height),
            interpolation=cv2.INTER_AREA,
        )
    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """
        Correct image rotation using projection profile.
        Receipts often have slight rotation from scanning/photography.
        Uses projection profile method optimized for text documents.
        """
        try:
            # Create binary image for angle detection
            _, binary = cv2.threshold(
                image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
            )
            # Find all non-zero points
            coords = np.column_stack(np.where(binary > 0))
            if len(coords) < 100:
                return image
            # Use minimum area rectangle to find angle
            rect = cv2.minAreaRect(coords)
            angle = rect[-1]
            # Normalize angle
            if angle < -45:
                angle = 90 + angle
            elif angle > 45:
                angle = angle - 90
            # Only correct if angle is significant but not extreme
            if abs(angle) < 0.5 or abs(angle) > 15:
                return image
            # Rotate to correct skew
            height, width = image.shape[:2]
            center = (width // 2, height // 2)
            rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0)
            rotated = cv2.warpAffine(
                image,
                rotation_matrix,
                (width, height),
                borderMode=cv2.BORDER_REPLICATE,
            )
            logger.debug(f"Receipt deskewed by {angle:.2f} degrees")
            return rotated
        except Exception as e:
            logger.warning(f"Deskew failed: {e}")
            return image
    def _enhance_contrast(self, image: np.ndarray) -> np.ndarray:
        """
        Apply aggressive contrast enhancement for faded receipts.
        Combines:
        1. Histogram stretching
        2. CLAHE (Contrast Limited Adaptive Histogram Equalization)
        """
        try:
            # First, stretch histogram to use full dynamic range
            p2, p98 = np.percentile(image, (2, 98))
            stretched = np.clip(
                (image - p2) * 255.0 / (p98 - p2), 0, 255
            ).astype(np.uint8)
            # Apply CLAHE with parameters optimized for receipts
            # Higher clipLimit for faded thermal receipts
            clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
            enhanced = clahe.apply(stretched)
            return enhanced
        except Exception as e:
            logger.warning(f"Contrast enhancement failed: {e}")
            return image
    def _sharpen(self, image: np.ndarray) -> np.ndarray:
        """
        Apply unsharp masking for clearer text edges.
        Light sharpening improves OCR on slightly blurry images.
        """
        try:
            # Gaussian blur for unsharp mask
            blurred = cv2.GaussianBlur(image, (0, 0), 2.0)
            # Unsharp mask: original + alpha * (original - blurred)
            sharpened = cv2.addWeighted(image, 1.5, blurred, -0.5, 0)
            return sharpened
        except Exception as e:
            logger.warning(f"Sharpening failed: {e}")
            return image
    def _denoise(self, image: np.ndarray) -> np.ndarray:
        """
        Apply light denoising optimized for text.
        Uses bilateral filter to preserve edges while reducing noise.
        """
        try:
            # Bilateral filter preserves edges better than Gaussian
            # Light denoising - don't want to blur text
            return cv2.bilateralFilter(image, 5, 50, 50)
        except Exception as e:
            logger.warning(f"Denoising failed: {e}")
            return image
    def _adaptive_threshold_receipt(self, image: np.ndarray) -> np.ndarray:
        """
        Apply adaptive thresholding optimized for receipt text.
        Uses parameters tuned for:
        - Variable font sizes (small print + headers)
        - Faded thermal printing
        - Uneven paper illumination
        """
        try:
            # Use Gaussian adaptive threshold
            # Larger block size (31) handles uneven illumination
            # Moderate C value (8) for faded receipts
            binary = cv2.adaptiveThreshold(
                image,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                blockSize=31,
                C=8,
            )
            return binary
        except Exception as e:
            logger.warning(f"Adaptive threshold failed: {e}")
            return image
    def preprocess_for_low_quality(
        self, image_bytes: bytes
    ) -> ReceiptPreprocessingResult:
        """
        Apply aggressive preprocessing for very low quality receipts.
        Use this when standard preprocessing fails to produce readable text.
        """
        return self.preprocess(
            image_bytes,
            apply_contrast=True,
            apply_deskew=True,
            apply_denoise=True,
            apply_threshold=True,
            apply_sharpen=True,
        )
 # Singleton instance
 receipt_preprocessor = ReceiptPreprocessor()
--- a/ocr/app/routers/extract.py
+++ b/ocr/app/routers/extract.py
@@ -1,10 +1,19 @@
 """OCR extraction endpoints."""
 import logging
 from typing import Optional
-from fastapi import APIRouter, File, HTTPException, Query, UploadFile
+from fastapi import APIRouter, File, Form, HTTPException, Query, UploadFile
 from app.extractors.vin_extractor import vin_extractor
-from app.models import BoundingBox, OcrResponse, VinAlternative, VinExtractionResponse
+from app.extractors.receipt_extractor import receipt_extractor
 from app.models import (
    BoundingBox,
    OcrResponse,
    ReceiptExtractedField,
    ReceiptExtractionResponse,
    VinAlternative,
    VinExtractionResponse,
 )
 from app.services import ocr_service
 logger = logging.getLogger(__name__)
@@ -154,3 +163,97 @@ async def extract_vin(
        processingTimeMs=result.processing_time_ms,
        error=result.error,
    )
@router.post("/receipt", response_model=ReceiptExtractionResponse)
 async def extract_receipt(
    file: UploadFile = File(..., description="Receipt image file"),
    receipt_type: Optional[str] = Form(
        default=None,
        description="Receipt type hint: 'fuel' for specialized extraction",
    ),
 ) -> ReceiptExtractionResponse:
    """
    Extract data from a receipt image using OCR.
    Optimized for fuel receipts with pattern-based field extraction:
    - HEIC conversion (if needed)
    - Grayscale conversion
    - High contrast enhancement (for thermal receipts)
    - Adaptive thresholding
    - Pattern matching for dates, amounts, fuel quantities
    Supports HEIC, JPEG, PNG formats.
    Processing time target: <3 seconds.
    - **file**: Receipt image file (max 10MB)
    - **receipt_type**: Optional hint ("fuel" for gas station receipts)
    Returns:
    - **receiptType**: Detected type ("fuel" or "unknown")
    - **extractedFields**: Dictionary of extracted fields with confidence scores
      - merchantName: Gas station or store name
      - transactionDate: Date in YYYY-MM-DD format
      - totalAmount: Total purchase amount
      - fuelQuantity: Gallons/liters purchased (fuel receipts)
      - pricePerUnit: Price per gallon/liter (fuel receipts)
      - fuelGrade: Octane rating or fuel type (fuel receipts)
    - **rawText**: Full OCR text
    - **processingTimeMs**: Processing time in milliseconds
    """
    # Validate file presence
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")
    # Read file content
    content = await file.read()
    file_size = len(content)
    # Validate file size
    if file_size > MAX_SYNC_SIZE:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Max: {MAX_SYNC_SIZE // (1024*1024)}MB",
        )
    if file_size == 0:
        raise HTTPException(status_code=400, detail="Empty file provided")
    logger.info(
        f"Receipt extraction: {file.filename}, "
        f"size: {file_size} bytes, "
        f"content_type: {file.content_type}, "
        f"receipt_type: {receipt_type}"
    )
    # Perform receipt extraction
    result = receipt_extractor.extract(
        image_bytes=content,
        content_type=file.content_type,
        receipt_type=receipt_type,
    )
    if not result.success:
        logger.warning(f"Receipt extraction failed for {file.filename}: {result.error}")
        raise HTTPException(
            status_code=422,
            detail=result.error or "Failed to extract data from receipt image",
        )
    # Convert internal fields to API response format
    extracted_fields = {
        name: ReceiptExtractedField(
            value=field.value,
            confidence=field.confidence,
        )
        for name, field in result.extracted_fields.items()
    }
    return ReceiptExtractionResponse(
        success=result.success,
        receiptType=result.receipt_type,
        extractedFields=extracted_fields,
        rawText=result.raw_text,
        processingTimeMs=result.processing_time_ms,
        error=result.error,
    )
--- a/ocr/tests/test_currency_patterns.py
+++ b/ocr/tests/test_currency_patterns.py
@@ -0,0 +1,198 @@
 """Tests for currency pattern matching."""
 import pytest
 from app.patterns.currency_patterns import CurrencyPatternMatcher, currency_matcher
 class TestCurrencyPatternMatcher:
    """Test currency and amount extraction."""
    def test_total_explicit(self) -> None:
        """Test 'TOTAL $XX.XX' pattern."""
        text = "TOTAL $45.67"
        result = currency_matcher.extract_total(text)
        assert result is not None
        assert result.value == 45.67
        assert result.confidence > 0.9
        assert result.label == "TOTAL"
    def test_total_with_colon(self) -> None:
        """Test 'TOTAL: $XX.XX' pattern."""
        text = "TOTAL: $45.67"
        result = currency_matcher.extract_total(text)
        assert result is not None
        assert result.value == 45.67
    def test_total_without_dollar_sign(self) -> None:
        """Test 'TOTAL 45.67' pattern."""
        text = "TOTAL 45.67"
        result = currency_matcher.extract_total(text)
        assert result is not None
        assert result.value == 45.67
    def test_amount_due(self) -> None:
        """Test 'AMOUNT DUE' pattern."""
        text = "AMOUNT DUE: $45.67"
        result = currency_matcher.extract_total(text)
        assert result is not None
        assert result.value == 45.67
        assert result.label == "AMOUNT DUE"
    def test_sale_pattern(self) -> None:
        """Test 'SALE $XX.XX' pattern."""
        text = "SALE $45.67"
        result = currency_matcher.extract_total(text)
        assert result is not None
        assert result.value == 45.67
    def test_grand_total(self) -> None:
        """Test 'GRAND TOTAL' pattern."""
        text = "GRAND TOTAL $45.67"
        result = currency_matcher.extract_total(text)
        assert result is not None
        assert result.value == 45.67
        assert result.label == "GRAND TOTAL"
    def test_total_sale(self) -> None:
        """Test 'TOTAL SALE' pattern."""
        text = "TOTAL SALE: $45.67"
        result = currency_matcher.extract_total(text)
        assert result is not None
        assert result.value == 45.67
    def test_balance_due(self) -> None:
        """Test 'BALANCE DUE' pattern."""
        text = "BALANCE DUE $45.67"
        result = currency_matcher.extract_total(text)
        assert result is not None
        assert result.value == 45.67
    def test_multiple_amounts_picks_total(self) -> None:
        """Test that labeled total is preferred over generic amounts."""
        text = """
        REGULAR 87
        10.500 GAL @ $3.67
        SUBTOTAL $38.54
        TAX $0.00
        TOTAL $38.54
        """
        result = currency_matcher.extract_total(text)
        assert result is not None
        assert result.value == 38.54
        assert result.pattern_name == "total_explicit"
    def test_all_amounts(self) -> None:
        """Test extracting all amounts from receipt."""
        text = """
        SUBTOTAL $35.00
        TAX $3.54
        TOTAL $38.54
        """
        results = currency_matcher.extract_all_amounts(text)
        # Should find TOTAL and possibly others
        assert len(results) >= 1
        assert any(r.value == 38.54 for r in results)
    def test_comma_thousand_separator(self) -> None:
        """Test amounts with thousand separators."""
        text = "TOTAL $1,234.56"
        result = currency_matcher.extract_total(text)
        assert result is not None
        assert result.value == 1234.56
    def test_reasonable_total_range(self) -> None:
        """Test that unreasonable totals are filtered."""
        # Very small amount
        text = "TOTAL $0.05"
        result = currency_matcher.extract_total(text)
        assert result is None  # Too small for fuel receipt
        # Reasonable amount
        text = "TOTAL $45.67"
        result = currency_matcher.extract_total(text)
        assert result is not None
    def test_receipt_context_extraction(self) -> None:
        """Test extraction from realistic receipt text."""
        text = """
        SHELL
        123 MAIN ST
        DATE: 01/15/2024
        UNLEADED 87
        10.500 GAL
        @ $3.679/GAL
        FUEL TOTAL    $38.63
        TAX           $0.00
        TOTAL         $38.63
        DEBIT CARD
        ************1234
        """
        result = currency_matcher.extract_total(text)
        assert result is not None
        assert result.value == 38.63
    def test_no_total_returns_largest(self) -> None:
        """Test fallback to largest amount when no labeled total."""
        text = """
        $10.50
        $5.00
        $45.67
        """
        result = currency_matcher.extract_total(text)
        # Should infer largest reasonable amount as total
        assert result is not None
        assert result.value == 45.67
        assert result.confidence < 0.7  # Lower confidence for inferred
    def test_no_amounts_returns_none(self) -> None:
        """Test that text without amounts returns None."""
        text = "SHELL STATION\nPUMP 5"
        result = currency_matcher.extract_total(text)
        assert result is None
 class TestEdgeCases:
    """Test edge cases in currency parsing."""
    def test_european_format(self) -> None:
        """Test European format (comma as decimal)."""
        # European: 45,67 means 45.67
        text = "TOTAL 45,67"
        result = currency_matcher.extract_total(text)
        assert result is not None
        assert result.value == 45.67
    def test_spaces_in_amount(self) -> None:
        """Test handling of spaces around amounts."""
        text = "TOTAL $ 45.67"
        result = currency_matcher.extract_total(text)
        assert result is not None
        assert result.value == 45.67
    def test_case_insensitive(self) -> None:
        """Test case insensitive matching."""
        for label in ["TOTAL", "Total", "total"]:
            text = f"{label} $45.67"
            result = currency_matcher.extract_total(text)
            assert result is not None, f"Failed for {label}"
            assert result.value == 45.67
--- a/ocr/tests/test_date_patterns.py
+++ b/ocr/tests/test_date_patterns.py
@@ -0,0 +1,163 @@
 """Tests for date pattern matching."""
 import pytest
 from app.patterns.date_patterns import DatePatternMatcher, date_matcher
 class TestDatePatternMatcher:
    """Test date pattern extraction."""
    def test_mm_dd_yyyy_slash(self) -> None:
        """Test MM/DD/YYYY format."""
        text = "DATE: 01/15/2024"
        result = date_matcher.extract_best_date(text)
        assert result is not None
        assert result.value == "2024-01-15"
        assert result.confidence > 0.9
    def test_mm_dd_yy_slash(self) -> None:
        """Test MM/DD/YY format with 2-digit year."""
        text = "01/15/24"
        result = date_matcher.extract_best_date(text)
        assert result is not None
        assert result.value == "2024-01-15"
    def test_mm_dd_yyyy_dash(self) -> None:
        """Test MM-DD-YYYY format."""
        text = "01-15-2024"
        result = date_matcher.extract_best_date(text)
        assert result is not None
        assert result.value == "2024-01-15"
    def test_iso_format(self) -> None:
        """Test ISO YYYY-MM-DD format."""
        text = "2024-01-15"
        result = date_matcher.extract_best_date(text)
        assert result is not None
        assert result.value == "2024-01-15"
        assert result.confidence > 0.95
    def test_month_name_format(self) -> None:
        """Test 'Jan 15, 2024' format."""
        text = "Jan 15, 2024"
        result = date_matcher.extract_best_date(text)
        assert result is not None
        assert result.value == "2024-01-15"
    def test_month_name_no_comma(self) -> None:
        """Test 'Jan 15 2024' format without comma."""
        text = "Jan 15 2024"
        result = date_matcher.extract_best_date(text)
        assert result is not None
        assert result.value == "2024-01-15"
    def test_day_month_year_format(self) -> None:
        """Test '15 Jan 2024' format."""
        text = "15 Jan 2024"
        result = date_matcher.extract_best_date(text)
        assert result is not None
        assert result.value == "2024-01-15"
    def test_full_month_name(self) -> None:
        """Test full month name like 'January'."""
        text = "January 15, 2024"
        result = date_matcher.extract_best_date(text)
        assert result is not None
        assert result.value == "2024-01-15"
    def test_multiple_dates_returns_best(self) -> None:
        """Test that multiple dates returns highest confidence."""
        text = "Date: 01/15/2024\nExpires: 01/15/2025"
        results = date_matcher.extract_dates(text)
        assert len(results) == 2
        # Both should be valid
        assert all(r.confidence > 0.5 for r in results)
    def test_invalid_date_rejected(self) -> None:
        """Test that invalid dates are rejected."""
        text = "13/45/2024"  # Invalid month/day
        result = date_matcher.extract_best_date(text)
        assert result is None
    def test_receipt_context_text(self) -> None:
        """Test date extraction from realistic receipt text."""
        text = """
        SHELL STATION
        123 MAIN ST
        DATE: 01/15/2024
        TIME: 14:32
        PUMP #5
        REGULAR 87
        10.500 GAL
        TOTAL $38.50
        """
        result = date_matcher.extract_best_date(text)
        assert result is not None
        assert result.value == "2024-01-15"
    def test_no_date_returns_none(self) -> None:
        """Test that text without dates returns None."""
        text = "SHELL STATION\nTOTAL $38.50"
        result = date_matcher.extract_best_date(text)
        assert result is None
    def test_confidence_boost_near_keyword(self) -> None:
        """Test confidence boost when date is near DATE keyword."""
        text_with_keyword = "DATE: 01/15/2024"
        text_without = "01/15/2024"
        result_with = date_matcher.extract_best_date(text_with_keyword)
        result_without = date_matcher.extract_best_date(text_without)
        assert result_with is not None
        assert result_without is not None
        # Keyword proximity should boost confidence
        assert result_with.confidence >= result_without.confidence
 class TestEdgeCases:
    """Test edge cases in date parsing."""
    def test_year_2000(self) -> None:
        """Test 2-digit year 00 is parsed as 2000."""
        text = "01/15/00"
        result = date_matcher.extract_best_date(text)
        assert result is not None
        assert result.value == "2000-01-15"
    def test_leap_year_date(self) -> None:
        """Test Feb 29 on leap year."""
        text = "02/29/2024"  # 2024 is a leap year
        result = date_matcher.extract_best_date(text)
        assert result is not None
        assert result.value == "2024-02-29"
    def test_leap_year_invalid(self) -> None:
        """Test Feb 29 on non-leap year is rejected."""
        text = "02/29/2023"  # 2023 is not a leap year
        result = date_matcher.extract_best_date(text)
        assert result is None
    def test_september_abbrev(self) -> None:
        """Test September abbreviation (Sept vs Sep)."""
        for abbrev in ["Sep", "Sept", "September"]:
            text = f"{abbrev} 15, 2024"
            result = date_matcher.extract_best_date(text)
            assert result is not None, f"Failed for {abbrev}"
            assert result.value == "2024-09-15"
--- a/ocr/tests/test_fuel_patterns.py
+++ b/ocr/tests/test_fuel_patterns.py
@@ -0,0 +1,327 @@
 """Tests for fuel-specific pattern matching."""
 import pytest
 from app.patterns.fuel_patterns import FuelPatternMatcher, fuel_matcher
 class TestFuelQuantityExtraction:
    """Test fuel quantity (gallons/liters) extraction."""
    def test_gallons_suffix(self) -> None:
        """Test 'XX.XXX GAL' pattern."""
        text = "10.500 GAL"
        result = fuel_matcher.extract_gallons(text)
        assert result is not None
        assert result.value == 10.5
        assert result.unit == "GAL"
        assert result.confidence > 0.9
    def test_gallons_full_word(self) -> None:
        """Test 'XX.XXX GALLONS' pattern."""
        text = "10.500 GALLONS"
        result = fuel_matcher.extract_gallons(text)
        assert result is not None
        assert result.value == 10.5
    def test_gallons_prefix(self) -> None:
        """Test 'GALLONS: XX.XXX' pattern."""
        text = "GALLONS: 10.500"
        result = fuel_matcher.extract_gallons(text)
        assert result is not None
        assert result.value == 10.5
    def test_gal_prefix(self) -> None:
        """Test 'GAL: XX.XXX' pattern."""
        text = "GAL: 10.500"
        result = fuel_matcher.extract_gallons(text)
        assert result is not None
        assert result.value == 10.5
    def test_volume_label(self) -> None:
        """Test 'VOLUME XX.XXX' pattern."""
        text = "VOLUME: 10.500"
        result = fuel_matcher.extract_gallons(text)
        assert result is not None
        assert result.value == 10.5
    def test_liters_suffix(self) -> None:
        """Test 'XX.XX L' pattern."""
        text = "40.5 L"
        result = fuel_matcher.extract_liters(text)
        assert result is not None
        assert result.value == 40.5
        assert result.unit == "L"
    def test_liters_full_word(self) -> None:
        """Test 'XX.XX LITERS' pattern."""
        text = "40.5 LITERS"
        result = fuel_matcher.extract_liters(text)
        assert result is not None
        assert result.value == 40.5
    def test_quantity_prefers_gallons(self) -> None:
        """Test extract_quantity prefers gallons for US receipts."""
        text = "10.500 GAL"
        result = fuel_matcher.extract_quantity(text)
        assert result is not None
        assert result.unit == "GAL"
    def test_reasonable_quantity_filter(self) -> None:
        """Test unreasonable quantities are filtered."""
        # Too small
        text = "0.001 GAL"
        result = fuel_matcher.extract_gallons(text)
        assert result is None
        # Too large
        text = "100.0 GAL"
        result = fuel_matcher.extract_gallons(text)
        assert result is None
 class TestFuelPriceExtraction:
    """Test price per unit extraction."""
    def test_price_per_gal_dollar_sign(self) -> None:
        """Test '$X.XXX/GAL' pattern."""
        text = "$3.679/GAL"
        result = fuel_matcher.extract_price_per_unit(text)
        assert result is not None
        assert result.value == 3.679
        assert result.unit == "GAL"
        assert result.confidence > 0.95
    def test_price_per_gal_no_dollar(self) -> None:
        """Test 'X.XXX/GAL' pattern."""
        text = "3.679/GAL"
        result = fuel_matcher.extract_price_per_unit(text)
        assert result is not None
        assert result.value == 3.679
    def test_labeled_price_gal(self) -> None:
        """Test 'PRICE/GAL $X.XXX' pattern."""
        text = "PRICE/GAL $3.679"
        result = fuel_matcher.extract_price_per_unit(text)
        assert result is not None
        assert result.value == 3.679
    def test_unit_price(self) -> None:
        """Test 'UNIT PRICE $X.XXX' pattern."""
        text = "UNIT PRICE: $3.679"
        result = fuel_matcher.extract_price_per_unit(text)
        assert result is not None
        assert result.value == 3.679
    def test_at_price(self) -> None:
        """Test '@ $X.XXX' pattern."""
        text = "10.500 GAL @ $3.679"
        result = fuel_matcher.extract_price_per_unit(text)
        assert result is not None
        assert result.value == 3.679
    def test_ppg_pattern(self) -> None:
        """Test 'PPG $X.XXX' pattern."""
        text = "PPG: $3.679"
        result = fuel_matcher.extract_price_per_unit(text)
        assert result is not None
        assert result.value == 3.679
    def test_reasonable_price_filter(self) -> None:
        """Test unreasonable prices are filtered."""
        # Too low
        text = "$0.50/GAL"
        result = fuel_matcher.extract_price_per_unit(text)
        assert result is None
        # Too high
        text = "$15.00/GAL"
        result = fuel_matcher.extract_price_per_unit(text)
        assert result is None
 class TestFuelGradeExtraction:
    """Test fuel grade/octane extraction."""
    def test_regular_87(self) -> None:
        """Test 'REGULAR 87' pattern."""
        text = "REGULAR 87"
        result = fuel_matcher.extract_grade(text)
        assert result is not None
        assert result.value == "87"
        assert "Regular" in result.display_name
    def test_reg_87(self) -> None:
        """Test 'REG 87' pattern."""
        text = "REG 87"
        result = fuel_matcher.extract_grade(text)
        assert result is not None
        assert result.value == "87"
    def test_unleaded_87(self) -> None:
        """Test 'UNLEADED 87' pattern."""
        text = "UNLEADED 87"
        result = fuel_matcher.extract_grade(text)
        assert result is not None
        assert result.value == "87"
    def test_plus_89(self) -> None:
        """Test 'PLUS 89' pattern."""
        text = "PLUS 89"
        result = fuel_matcher.extract_grade(text)
        assert result is not None
        assert result.value == "89"
        assert "Plus" in result.display_name
    def test_midgrade_89(self) -> None:
        """Test 'MIDGRADE 89' pattern."""
        text = "MIDGRADE 89"
        result = fuel_matcher.extract_grade(text)
        assert result is not None
        assert result.value == "89"
    def test_premium_93(self) -> None:
        """Test 'PREMIUM 93' pattern."""
        text = "PREMIUM 93"
        result = fuel_matcher.extract_grade(text)
        assert result is not None
        assert result.value == "93"
        assert "Premium" in result.display_name
    def test_super_93(self) -> None:
        """Test 'SUPER 93' pattern."""
        text = "SUPER 93"
        result = fuel_matcher.extract_grade(text)
        assert result is not None
        assert result.value == "93"
    def test_diesel(self) -> None:
        """Test 'DIESEL' pattern."""
        text = "DIESEL #2"
        result = fuel_matcher.extract_grade(text)
        assert result is not None
        assert result.value == "DIESEL"
        assert "Diesel" in result.display_name
    def test_e85(self) -> None:
        """Test 'E85' ethanol pattern."""
        text = "E85"
        result = fuel_matcher.extract_grade(text)
        assert result is not None
        assert result.value == "E85"
    def test_octane_only(self) -> None:
        """Test standalone octane number."""
        text = "87 OCTANE"
        result = fuel_matcher.extract_grade(text)
        assert result is not None
        assert result.value == "87"
 class TestMerchantExtraction:
    """Test gas station name extraction."""
    def test_shell_station(self) -> None:
        """Test Shell station detection."""
        text = "SHELL\n123 MAIN ST"
        result = fuel_matcher.extract_merchant_name(text)
        assert result is not None
        merchant, confidence = result
        assert "SHELL" in merchant.upper()
        assert confidence > 0.8
    def test_chevron_station(self) -> None:
        """Test Chevron station detection."""
        text = "CHEVRON #12345\n456 OAK AVE"
        result = fuel_matcher.extract_merchant_name(text)
        assert result is not None
        merchant, confidence = result
        assert "CHEVRON" in merchant.upper()
    def test_costco_gas(self) -> None:
        """Test Costco gas detection."""
        text = "COSTCO GASOLINE\n789 WAREHOUSE BLVD"
        result = fuel_matcher.extract_merchant_name(text)
        assert result is not None
        merchant, confidence = result
        assert "COSTCO" in merchant.upper()
    def test_unknown_station_fallback(self) -> None:
        """Test fallback to first line for unknown stations."""
        text = "JOE'S GAS\n123 MAIN ST"
        result = fuel_matcher.extract_merchant_name(text)
        assert result is not None
        merchant, confidence = result
        assert "JOE'S GAS" in merchant
        assert confidence < 0.7  # Lower confidence for unknown
 class TestReceiptContextExtraction:
    """Test extraction from realistic receipt text."""
    def test_full_receipt_extraction(self) -> None:
        """Test all fields from complete receipt text."""
        text = """
        SHELL
        123 MAIN STREET
        ANYTOWN, USA 12345
        DATE: 01/15/2024
        TIME: 14:32
        PUMP #5
        REGULAR 87
        10.500 GAL @ $3.679/GAL
        FUEL TOTAL    $38.63
        TAX           $0.00
        TOTAL         $38.63
        DEBIT CARD
        ************1234
        APPROVED
        """
        # Test all extractors on this text
        quantity = fuel_matcher.extract_quantity(text)
        assert quantity is not None
        assert quantity.value == 10.5
        price = fuel_matcher.extract_price_per_unit(text)
        assert price is not None
        assert price.value == 3.679
        grade = fuel_matcher.extract_grade(text)
        assert grade is not None
        assert grade.value == "87"
        merchant = fuel_matcher.extract_merchant_name(text)
        assert merchant is not None
        assert "SHELL" in merchant[0].upper()
--- a/ocr/tests/test_receipt_extraction.py
+++ b/ocr/tests/test_receipt_extraction.py
@@ -0,0 +1,339 @@
 """Tests for receipt extraction pipeline."""
 import io
 import pytest
 from unittest.mock import MagicMock, patch
 from app.extractors.receipt_extractor import (
    ReceiptExtractor,
    ReceiptExtractionResult,
    receipt_extractor,
 )
 from app.extractors.fuel_receipt import (
    FuelReceiptExtractor,
    FuelReceiptValidation,
    fuel_receipt_extractor,
 )
 class TestReceiptExtractor:
    """Test the receipt extraction pipeline."""
    def test_detect_receipt_type_fuel(self) -> None:
        """Test fuel receipt type detection."""
        text = """
        SHELL STATION
        REGULAR 87
        10.500 GAL
        TOTAL $38.50
        """
        extractor = ReceiptExtractor()
        receipt_type = extractor._detect_receipt_type(text)
        assert receipt_type == "fuel"
    def test_detect_receipt_type_unknown(self) -> None:
        """Test unknown receipt type detection."""
        text = """
        WALMART
        GROCERIES
        MILK $3.99
        BREAD $2.50
        TOTAL $6.49
        """
        extractor = ReceiptExtractor()
        receipt_type = extractor._detect_receipt_type(text)
        assert receipt_type == "unknown"
    def test_extract_fuel_fields(self) -> None:
        """Test fuel field extraction from OCR text."""
        text = """
        SHELL
        123 MAIN ST
        DATE: 01/15/2024
        REGULAR 87
        10.500 GAL @ $3.679
        TOTAL $38.63
        """
        extractor = ReceiptExtractor()
        fields = extractor._extract_fuel_fields(text)
        assert "merchantName" in fields
        assert "transactionDate" in fields
        assert "totalAmount" in fields
        assert "fuelQuantity" in fields
        assert "pricePerUnit" in fields
        assert "fuelGrade" in fields
        assert fields["totalAmount"].value == 38.63
        assert fields["fuelQuantity"].value == 10.5
        assert fields["fuelGrade"].value == "87"
    def test_extract_generic_fields(self) -> None:
        """Test generic field extraction."""
        text = """
        WALMART
        01/15/2024
        TOTAL $25.99
        """
        extractor = ReceiptExtractor()
        fields = extractor._extract_generic_fields(text)
        assert "transactionDate" in fields
        assert "totalAmount" in fields
        assert fields["totalAmount"].value == 25.99
    def test_calculated_price_per_unit(self) -> None:
        """Test price per unit calculation when not explicitly stated."""
        text = """
        SHELL
        DATE: 01/15/2024
        10.000 GAL
        TOTAL $35.00
        """
        extractor = ReceiptExtractor()
        fields = extractor._extract_fuel_fields(text)
        assert "pricePerUnit" in fields
        # 35.00 / 10.000 = 3.50
        assert abs(fields["pricePerUnit"].value - 3.50) < 0.01
        # Calculated values should have lower confidence
        assert fields["pricePerUnit"].confidence < 0.9
    def test_validate_valid_data(self) -> None:
        """Test validation of valid receipt data."""
        extractor = ReceiptExtractor()
        data = {"totalAmount": 38.63, "transactionDate": "2024-01-15"}
        assert extractor.validate(data) is True
    def test_validate_invalid_data(self) -> None:
        """Test validation of invalid receipt data."""
        extractor = ReceiptExtractor()
        # Empty dict
        assert extractor.validate({}) is False
        # Not a dict
        assert extractor.validate("invalid") is False
    def test_unsupported_file_type(self) -> None:
        """Test handling of unsupported file types."""
        extractor = ReceiptExtractor()
        with patch.object(extractor, "_detect_mime_type", return_value="application/pdf"):
            result = extractor.extract(b"fake pdf content")
            assert result.success is False
            assert "Unsupported file type" in result.error
 class TestFuelReceiptExtractor:
    """Test fuel receipt specialized extractor."""
    def test_validation_success(self) -> None:
        """Test validation passes for consistent data."""
        from app.extractors.receipt_extractor import ExtractedField
        extractor = FuelReceiptExtractor()
        fields = {
            "totalAmount": ExtractedField(value=38.63, confidence=0.95),
            "fuelQuantity": ExtractedField(value=10.5, confidence=0.90),
            "pricePerUnit": ExtractedField(value=3.679, confidence=0.92),
            "fuelGrade": ExtractedField(value="87", confidence=0.88),
        }
        validation = extractor._validate_fuel_receipt(fields)
        assert validation.is_valid is True
        assert len(validation.issues) == 0
        assert validation.confidence_score == 1.0
    def test_validation_math_mismatch(self) -> None:
        """Test validation catches total != quantity * price."""
        from app.extractors.receipt_extractor import ExtractedField
        extractor = FuelReceiptExtractor()
        fields = {
            "totalAmount": ExtractedField(value=50.00, confidence=0.95),
            "fuelQuantity": ExtractedField(value=10.0, confidence=0.90),
            "pricePerUnit": ExtractedField(value=3.00, confidence=0.92),  # Should be $30
        }
        validation = extractor._validate_fuel_receipt(fields)
        assert validation.is_valid is False
        assert any("doesn't match" in issue for issue in validation.issues)
    def test_validation_quantity_too_small(self) -> None:
        """Test validation catches too-small quantity."""
        from app.extractors.receipt_extractor import ExtractedField
        extractor = FuelReceiptExtractor()
        fields = {
            "totalAmount": ExtractedField(value=1.00, confidence=0.95),
            "fuelQuantity": ExtractedField(value=0.1, confidence=0.90),
        }
        validation = extractor._validate_fuel_receipt(fields)
        assert any("too small" in issue for issue in validation.issues)
    def test_validation_quantity_too_large(self) -> None:
        """Test validation warns on very large quantity."""
        from app.extractors.receipt_extractor import ExtractedField
        extractor = FuelReceiptExtractor()
        fields = {
            "totalAmount": ExtractedField(value=150.00, confidence=0.95),
            "fuelQuantity": ExtractedField(value=45.0, confidence=0.90),
        }
        validation = extractor._validate_fuel_receipt(fields)
        assert any("unusually large" in issue for issue in validation.issues)
    def test_validation_price_too_low(self) -> None:
        """Test validation catches too-low price."""
        from app.extractors.receipt_extractor import ExtractedField
        extractor = FuelReceiptExtractor()
        fields = {
            "totalAmount": ExtractedField(value=10.00, confidence=0.95),
            "pricePerUnit": ExtractedField(value=1.00, confidence=0.90),
        }
        validation = extractor._validate_fuel_receipt(fields)
        assert any("too low" in issue for issue in validation.issues)
    def test_validation_unknown_grade(self) -> None:
        """Test validation catches unknown fuel grade."""
        from app.extractors.receipt_extractor import ExtractedField
        extractor = FuelReceiptExtractor()
        fields = {
            "totalAmount": ExtractedField(value=38.00, confidence=0.95),
            "fuelGrade": ExtractedField(value="95", confidence=0.70),  # Not valid US grade
        }
        validation = extractor._validate_fuel_receipt(fields)
        assert any("Unknown fuel grade" in issue for issue in validation.issues)
    def test_confidence_adjustment_boost(self) -> None:
        """Test confidence boost when validation passes."""
        from app.extractors.receipt_extractor import ExtractedField
        extractor = FuelReceiptExtractor()
        fields = {
            "totalAmount": ExtractedField(value=38.63, confidence=0.80),
        }
        validation = FuelReceiptValidation(
            is_valid=True, issues=[], confidence_score=1.0
        )
        adjusted = extractor._adjust_confidences(fields, validation)
        # Should be boosted by 1.1
        assert adjusted["totalAmount"].confidence == min(1.0, 0.80 * 1.1)
    def test_confidence_adjustment_reduce(self) -> None:
        """Test confidence reduction when validation fails."""
        from app.extractors.receipt_extractor import ExtractedField
        extractor = FuelReceiptExtractor()
        fields = {
            "totalAmount": ExtractedField(value=50.00, confidence=0.90),
        }
        validation = FuelReceiptValidation(
            is_valid=False, issues=["Math mismatch"], confidence_score=0.7
        )
        adjusted = extractor._adjust_confidences(fields, validation)
        # Should be reduced by 0.7
        assert adjusted["totalAmount"].confidence == 0.90 * 0.7
 class TestReceiptPreprocessing:
    """Test receipt preprocessing integration."""
    def test_preprocessing_result_structure(self) -> None:
        """Test preprocessing returns expected structure."""
        from app.preprocessors.receipt_preprocessor import (
            receipt_preprocessor,
            ReceiptPreprocessingResult,
        )
        # Create a simple test image (1x1 white pixel PNG)
        from PIL import Image
        img = Image.new("RGB", (100, 100), color="white")
        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        image_bytes = buffer.getvalue()
        result = receipt_preprocessor.preprocess(image_bytes)
        assert isinstance(result, ReceiptPreprocessingResult)
        assert len(result.image_bytes) > 0
        assert "loaded" in result.preprocessing_applied
        assert "grayscale" in result.preprocessing_applied
        assert result.original_width == 100
        assert result.original_height == 100
    def test_preprocessing_steps_applied(self) -> None:
        """Test all preprocessing steps are applied."""
        from app.preprocessors.receipt_preprocessor import receipt_preprocessor
        from PIL import Image
        img = Image.new("RGB", (100, 100), color="white")
        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        image_bytes = buffer.getvalue()
        result = receipt_preprocessor.preprocess(
            image_bytes,
            apply_contrast=True,
            apply_deskew=True,
            apply_denoise=True,
            apply_threshold=True,
            apply_sharpen=True,
        )
        # Check that expected steps are in the applied list
        assert "contrast" in result.preprocessing_applied
        assert "denoise" in result.preprocessing_applied
        assert "threshold" in result.preprocessing_applied
 class TestEndpointIntegration:
    """Test receipt extraction endpoint integration."""
    @pytest.fixture
    def test_client(self):
        """Create test client for FastAPI app."""
        from fastapi.testclient import TestClient
        from app.main import app
        return TestClient(app)
    def test_receipt_endpoint_exists(self, test_client) -> None:
        """Test that /extract/receipt endpoint exists."""
        # Should get 422 (no file) not 404 (not found)
        response = test_client.post("/extract/receipt")
        assert response.status_code == 422  # Unprocessable Entity (missing file)
    def test_receipt_endpoint_no_file(self, test_client) -> None:
        """Test endpoint returns error when no file provided."""
        response = test_client.post("/extract/receipt")
        assert response.status_code == 422
    def test_receipt_endpoint_empty_file(self, test_client) -> None:
        """Test endpoint returns error for empty file."""
        response = test_client.post(
            "/extract/receipt",
            files={"file": ("receipt.jpg", b"", "image/jpeg")},
        )
        assert response.status_code == 400
        assert "Empty file" in response.json()["detail"]