# motovaultpro/ocr/app/extractors/receipt_extractor.py

"""Receipt-specific OCR extractor with field extraction."""
import io
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Optional

import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener

from app.config import settings
from app.extractors.base import BaseExtractor
from app.preprocessors.receipt_preprocessor import receipt_preprocessor
from app.patterns import currency_matcher, date_matcher, fuel_matcher

# Register pillow_heif's HEIF/HEIC opener with Pillow at import time so that
# PIL's Image.open() can read HEIC/HEIF data.
register_heif_opener()

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


@dataclass
class ExtractedField:
    """A single extracted field with confidence.

    Instances are stored as the values of the ``extracted_fields`` mapping on
    ``ReceiptExtractionResult``.
    """

    # Extracted value; its concrete type depends on the matcher that produced
    # it (hence ``Any``).
    value: Any
    # Confidence score attached to ``value``.
    # NOTE(review): presumably in the range 0.0-1.0 (0.40 is used for the
    # first-line merchant guess) - confirm against the pattern matchers.
    confidence: float


@dataclass
class ReceiptExtractionResult:
    """Result of receipt extraction.

    Returned by ``ReceiptExtractor.extract`` for both successful and failed
    runs. On failure only ``success``, ``error`` and ``processing_time_ms``
    are set; the remaining attributes keep their defaults.
    """

    # True when OCR text was found and field extraction completed.
    success: bool
    # "fuel" or "unknown" when auto-detected; otherwise the caller's hint.
    receipt_type: str = "unknown"
    # Extracted fields keyed by camelCase name (e.g. "totalAmount").
    extracted_fields: dict[str, ExtractedField] = field(default_factory=dict)
    # OCR text the fields were extracted from.
    raw_text: str = ""
    # Elapsed time of the extract() call, in whole milliseconds.
    processing_time_ms: int = 0
    # Failure description; None on success.
    error: Optional[str] = None


class ReceiptExtractor(BaseExtractor):
    """Receipt-specific OCR extractor for fuel and general receipts."""

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
    }

    def __init__(self) -> None:
        """Point pytesseract at the Tesseract command taken from settings.

        Note that this assigns a module-level attribute of ``pytesseract``,
        so it affects every user of that module in the process.
        """
        tesseract_path = settings.tesseract_cmd
        pytesseract.pytesseract.tesseract_cmd = tesseract_path

    def extract(
        self,
        image_bytes: bytes,
        content_type: Optional[str] = None,
        receipt_type: Optional[str] = None,
    ) -> ReceiptExtractionResult:
        """
        Extract data from a receipt image.

        The image is preprocessed and passed to OCR. If that yields no text,
        OCR is retried once on an image preprocessed with
        ``apply_threshold=False``. The resulting text is then parsed with the
        fuel-specific extractor when the receipt type is "fuel", and with the
        generic extractor otherwise.

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            content_type: MIME type (auto-detected if not provided). Matching
                against SUPPORTED_TYPES is case-insensitive and ignores
                parameters such as "; charset=binary".
            receipt_type: Hint for receipt type ("fuel" for specialized
                extraction); detected from the OCR text when omitted

        Returns:
            ReceiptExtractionResult with extracted fields. Unsupported file
            types, images without any text and exceptions raised during
            preprocessing, OCR or field extraction are reported through
            ``success=False`` and ``error`` rather than raised.
        """
        # perf_counter() is monotonic, so the reported duration cannot be
        # skewed (or go negative) when the wall clock is adjusted mid-call.
        start_time = time.perf_counter()

        def elapsed_ms() -> int:
            """Whole milliseconds elapsed since extraction started."""
            return int((time.perf_counter() - start_time) * 1000)

        # Detect content type if not provided
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)

        # MIME types are case-insensitive and may carry parameters
        # ("image/jpeg; charset=binary"); compare only the bare type/subtype.
        normalized_type = content_type.split(";", 1)[0].strip().lower()
        if normalized_type not in self.SUPPORTED_TYPES:
            return ReceiptExtractionResult(
                success=False,
                error=f"Unsupported file type: {content_type}",
                processing_time_ms=elapsed_ms(),
            )

        try:
            # Apply receipt-optimized preprocessing, then OCR
            preprocessing_result = receipt_preprocessor.preprocess(image_bytes)
            raw_text = self._perform_ocr(preprocessing_result.image_bytes)

            if not raw_text.strip():
                # Try with less aggressive preprocessing
                preprocessing_result = receipt_preprocessor.preprocess(
                    image_bytes,
                    apply_threshold=False,
                )
                raw_text = self._perform_ocr(preprocessing_result.image_bytes)

            if not raw_text.strip():
                return ReceiptExtractionResult(
                    success=False,
                    error="No text found in image",
                    processing_time_ms=elapsed_ms(),
                )

            # An explicit hint wins; otherwise classify from the OCR text
            detected_type = receipt_type or self._detect_receipt_type(raw_text)

            # Extract fields based on receipt type
            if detected_type == "fuel":
                extracted_fields = self._extract_fuel_fields(raw_text)
            else:
                extracted_fields = self._extract_generic_fields(raw_text)

            processing_time_ms = elapsed_ms()

            logger.info(
                "Receipt extraction: type=%s, fields=%d, time=%dms",
                detected_type,
                len(extracted_fields),
                processing_time_ms,
            )

            return ReceiptExtractionResult(
                success=True,
                receipt_type=detected_type,
                extracted_fields=extracted_fields,
                raw_text=raw_text,
                processing_time_ms=processing_time_ms,
            )

        except Exception as e:
            # Boundary handler: callers receive a failure result, never an
            # exception from the preprocessing/OCR/extraction pipeline.
            logger.exception("Receipt extraction failed: %s", e)
            return ReceiptExtractionResult(
                success=False,
                error=str(e),
                processing_time_ms=elapsed_ms(),
            )

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Sniff the MIME type of *file_bytes* using python-magic.

        Falls back to ``"application/octet-stream"`` when the detector
        returns an empty result.
        """
        sniffer = magic.Magic(mime=True)
        mime_type = sniffer.from_buffer(file_bytes)
        if mime_type:
            return mime_type
        return "application/octet-stream"

    def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str:
        """
        Perform OCR on preprocessed image.

        Args:
            image_bytes: Preprocessed image bytes
            psm: Tesseract page segmentation mode (default 6)
                 4 = Assume single column of text
                 6 = Uniform block of text

        Returns:
            Raw OCR text
        """
        # Only the page segmentation mode is passed on to Tesseract.
        config = f"--psm {psm}"

        # Pillow keeps the opened image (and its source buffer) alive until it
        # is closed; the context manager releases it once OCR has finished.
        with Image.open(io.BytesIO(image_bytes)) as image:
            return pytesseract.image_to_string(image, config=config)

    def _detect_receipt_type(self, text: str) -> str:
        """
        Detect receipt type based on content.

        The text is scored: one point for every fuel keyword that occurs in
        it, plus three points when the merchant name found by
        ``fuel_matcher`` contains a known station name. A score of two or
        more classifies the receipt as fuel.

        Args:
            text: OCR text

        Returns:
            Receipt type: "fuel" or "unknown"
        """
        text_upper = text.upper()

        # Fuel receipt indicators. Matching is a case-insensitive substring
        # test, so overlapping keywords both count (e.g. "GALLON" also
        # satisfies "GAL").
        fuel_keywords = [
            "GALLON", "GAL", "FUEL", "GAS", "DIESEL", "UNLEADED",
            "REGULAR", "PREMIUM", "OCTANE", "PPG", "PUMP",
        ]

        fuel_score = sum(1 for kw in fuel_keywords if kw in text_upper)

        # Check for known gas stations (merchant lookup is done only once)
        merchant_result = fuel_matcher.extract_merchant_name(text)
        if merchant_result:
            merchant, _ = merchant_result
            merchant_upper = merchant.upper()
            if any(
                station in merchant_upper
                for station in fuel_matcher.STATION_NAMES
            ):
                fuel_score += 3

        return "fuel" if fuel_score >= 2 else "unknown"

    def _extract_fuel_fields(self, text: str) -> dict[str, ExtractedField]:
        """
        Extract fuel-specific fields from receipt text.

        Possible keys in the result: "merchantName", "transactionDate",
        "totalAmount", "fuelQuantity", "pricePerUnit" and "fuelGrade". A key
        is present only when its matcher found something. When no price per
        unit was matched but total and quantity were, the price is derived
        as total / quantity and stored with reduced confidence, provided the
        result lies between 1.0 and 10.0.

        Args:
            text: OCR text

        Returns:
            Dictionary of extracted fields
        """
        fields: dict[str, ExtractedField] = {}

        # The merchant matcher returns a (name, confidence) pair rather than
        # a match object, so it is handled separately.
        merchant_result = fuel_matcher.extract_merchant_name(text)
        if merchant_result:
            merchant_name, confidence = merchant_result
            fields["merchantName"] = ExtractedField(
                value=merchant_name,
                confidence=confidence,
            )

        # Remaining matchers all return an object exposing .value and
        # .confidence (or a falsy result when nothing matched).
        match_extractors = (
            ("transactionDate", date_matcher.extract_best_date),
            ("totalAmount", currency_matcher.extract_total),
            ("fuelQuantity", fuel_matcher.extract_quantity),
            ("pricePerUnit", fuel_matcher.extract_price_per_unit),
            ("fuelGrade", fuel_matcher.extract_grade),
        )
        for field_name, extractor in match_extractors:
            match = extractor(text)
            if match:
                fields[field_name] = ExtractedField(
                    value=match.value,
                    confidence=match.confidence,
                )

        # Calculate derived values if we have enough data
        if (
            "pricePerUnit" not in fields
            and "totalAmount" in fields
            and "fuelQuantity" in fields
        ):
            total = fields["totalAmount"]
            quantity = fields["fuelQuantity"]
            # Guard the division: a quantity of zero (e.g. an OCR misread)
            # would raise ZeroDivisionError and fail the whole extraction.
            if quantity.value > 0:
                calculated_price = total.value / quantity.value
                # Only use if reasonable
                if 1.0 <= calculated_price <= 10.0:
                    fields["pricePerUnit"] = ExtractedField(
                        value=round(calculated_price, 3),
                        confidence=min(total.confidence, quantity.confidence)
                        * 0.8,  # Lower confidence for calculated value
                    )

        return fields

    def _extract_generic_fields(self, text: str) -> dict[str, ExtractedField]:
        """
        Pull the type-agnostic fields out of receipt text.

        Looks for a transaction date and a total amount via the shared
        matchers, and takes the first non-blank line (truncated to 50
        characters) as a low-confidence merchant name.

        Args:
            text: OCR text

        Returns:
            Dictionary of extracted fields
        """
        extracted: dict[str, ExtractedField] = {}

        # Transaction date
        best_date = date_matcher.extract_best_date(text)
        if best_date:
            extracted["transactionDate"] = ExtractedField(
                value=best_date.value,
                confidence=best_date.confidence,
            )

        # Total amount
        total = currency_matcher.extract_total(text)
        if total:
            extracted["totalAmount"] = ExtractedField(
                value=total.value,
                confidence=total.confidence,
            )

        # Merchant guess: first line that is not empty after stripping
        stripped_lines = (raw_line.strip() for raw_line in text.split("\n"))
        first_line = next((line for line in stripped_lines if line), None)
        if first_line is not None:
            extracted["merchantName"] = ExtractedField(
                value=first_line[:50],
                confidence=0.40,
            )

        return extracted

    def validate(self, data: Any) -> bool:
        """
        Validate extracted receipt data.

        Args:
            data: Extracted data to validate

        Returns:
            True if data has minimum required fields
        """
        if not isinstance(data, dict):
            return False

        # Minimum: must have at least total amount or date
        return "totalAmount" in data or "transactionDate" in data


# Module-level singleton instance, created at import time. Constructing it
# runs ReceiptExtractor.__init__, which sets pytesseract's tesseract_cmd
# from settings.
receipt_extractor = ReceiptExtractor()