# motovaultpro/ocr/app/extractors/receipt_extractor.py

"""Receipt-specific OCR extractor with field extraction."""
import io
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Optional

import magic
from pillow_heif import register_heif_opener

from app.engines import OcrConfig, create_engine
from app.extractors.base import BaseExtractor
from app.preprocessors.receipt_preprocessor import receipt_preprocessor
from app.patterns import currency_matcher, date_matcher, fuel_matcher

# Register the pillow_heif opener once at import time (module-level side
# effect). Presumably this lets Pillow-based code decode the HEIC/HEIF uploads
# that ReceiptExtractor.SUPPORTED_TYPES advertises — confirm against the
# preprocessor.
register_heif_opener()

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


@dataclass
class ExtractedField:
    """A single extracted field with confidence.

    Instances are the values of the ``extracted_fields`` mapping on
    ``ReceiptExtractionResult``, keyed by field name (for example
    ``"totalAmount"`` or ``"transactionDate"``).
    """

    # The extracted value. Annotated ``Any`` because the type varies by field:
    # matcher results are passed through as-is, the generic merchant name is a
    # string, and a derived price per unit is a rounded number.
    value: Any
    # Confidence score attached to ``value``.
    # NOTE(review): presumably in the range 0.0-1.0 — confirm against matchers.
    confidence: float


@dataclass
class ReceiptExtractionResult:
    """Result of receipt extraction.

    Returned by ``ReceiptExtractor.extract`` for both successful and failed
    runs; only ``success`` is required, every other attribute has a default.
    """

    # True when OCR produced text and field extraction ran; False for an
    # unsupported file type, a PDF that could not be rasterized, an image with
    # no recognizable text, or an exception during processing.
    success: bool
    # Receipt type used for field extraction: the caller's hint when given,
    # otherwise the detected type ("fuel" or "unknown").
    receipt_type: str = "unknown"
    # Extracted fields keyed by field name (e.g. "totalAmount").
    extracted_fields: dict[str, ExtractedField] = field(default_factory=dict)
    # Raw OCR text the fields were extracted from (empty on failure).
    raw_text: str = ""
    # Elapsed processing time in whole milliseconds.
    processing_time_ms: int = 0
    # Human-readable failure reason; None when the extraction succeeded.
    error: Optional[str] = None


class ReceiptExtractor(BaseExtractor):
    """Receipt-specific OCR extractor for fuel and general receipts.

    The pipeline implemented by :meth:`extract` is: resolve the MIME type,
    rasterize the first page of a PDF when needed, run receipt-optimized
    preprocessing and OCR (retrying once without thresholding when no text is
    found), classify the receipt, then extract structured fields from the OCR
    text with the pattern matchers.
    """

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
        "application/pdf",
    }

    # Upper-case substrings that each add one point to the fuel score.
    # NOTE(review): matching is by substring, so "GALLON" also counts as "GAL"
    # and words such as "LEGAL" match "GAL". Left unchanged on purpose because
    # tightening it would alter classification results — confirm before fixing.
    _FUEL_KEYWORDS = (
        "GALLON", "GAL", "FUEL", "GAS", "DIESEL", "UNLEADED",
        "REGULAR", "PREMIUM", "OCTANE", "PPG", "PUMP",
    )

    def __init__(self) -> None:
        """Initialize receipt extractor with engine from factory."""
        self._engine = create_engine()

    def extract(
        self,
        image_bytes: bytes,
        content_type: Optional[str] = None,
        receipt_type: Optional[str] = None,
    ) -> ReceiptExtractionResult:
        """
        Extract data from a receipt image.

        Args:
            image_bytes: Raw image or PDF bytes (HEIC, JPEG, PNG, PDF)
            content_type: MIME type (auto-detected if not provided). Case and
                any ``;``-separated parameters are ignored when matching.
            receipt_type: Hint for receipt type ("fuel" for specialized extraction)

        Returns:
            ReceiptExtractionResult with extracted fields. Failures during
            processing are reported through ``success=False`` and ``error``
            rather than raised.
        """
        # Monotonic clock: elapsed time must not be affected by wall-clock jumps.
        started = time.perf_counter()

        # Use the caller's type when it is usable, otherwise sniff the bytes.
        content_type = self._normalize_content_type(content_type)
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)

        # Validate content type
        if content_type not in self.SUPPORTED_TYPES:
            return self._failure_result(
                f"Unsupported file type: {content_type}", started
            )

        try:
            # Convert PDF to image (first page)
            if content_type == "application/pdf":
                image_bytes = self._extract_pdf_first_page(image_bytes)
                if not image_bytes:
                    return self._failure_result(
                        "Failed to extract image from PDF", started
                    )

            # Apply receipt-optimized preprocessing, then OCR
            preprocessed = receipt_preprocessor.preprocess(image_bytes)
            raw_text = self._perform_ocr(preprocessed.image_bytes)

            if not raw_text.strip():
                # Try with less aggressive preprocessing
                preprocessed = receipt_preprocessor.preprocess(
                    image_bytes,
                    apply_threshold=False,
                )
                raw_text = self._perform_ocr(preprocessed.image_bytes)

            if not raw_text.strip():
                return self._failure_result("No text found in image", started)

            # Detect receipt type if not specified
            detected_type = receipt_type or self._detect_receipt_type(raw_text)

            # Extract fields based on receipt type
            if detected_type == "fuel":
                extracted_fields = self._extract_fuel_fields(raw_text)
            else:
                extracted_fields = self._extract_generic_fields(raw_text)

            processing_time_ms = self._elapsed_ms(started)

            logger.info(
                "Receipt extraction: type=%s, fields=%s, time=%sms",
                detected_type,
                len(extracted_fields),
                processing_time_ms,
            )

            return ReceiptExtractionResult(
                success=True,
                receipt_type=detected_type,
                extracted_fields=extracted_fields,
                raw_text=raw_text,
                processing_time_ms=processing_time_ms,
            )

        except Exception as e:
            # Boundary handler: callers get a failure result, never a raise.
            logger.error("Receipt extraction failed: %s", e, exc_info=True)
            return self._failure_result(str(e), started)

    @staticmethod
    def _elapsed_ms(started: float) -> int:
        """Return whole milliseconds elapsed since ``started`` (perf_counter)."""
        return int((time.perf_counter() - started) * 1000)

    def _failure_result(self, message: str, started: float) -> ReceiptExtractionResult:
        """Build a failed result carrying ``message`` and the elapsed time."""
        return ReceiptExtractionResult(
            success=False,
            error=message,
            processing_time_ms=self._elapsed_ms(started),
        )

    @staticmethod
    def _normalize_content_type(content_type: Optional[str]) -> str:
        """
        Reduce a MIME type to lower-case ``type/subtype`` form.

        Media types are case-insensitive and may carry parameters
        (``image/jpeg; charset=binary``); both are irrelevant for the
        membership test against ``SUPPORTED_TYPES``.

        Returns:
            The normalized type, or "" when nothing usable was supplied.
        """
        if not content_type:
            return ""
        return content_type.split(";", 1)[0].strip().lower()

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic."""
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes:
        """
        Extract first page of PDF as PNG image for OCR processing.

        Args:
            pdf_bytes: Raw PDF bytes

        Returns:
            PNG bytes of the first page, or b"" when PyMuPDF is unavailable or
            the page cannot be rendered (the reason is logged).
        """
        try:
            import fitz  # PyMuPDF
        except ImportError:
            logger.warning("PyMuPDF not available, PDF support limited")
            return b""

        try:
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                page = doc[0]
                # Render at 300 DPI (default is 72, so scale factor = 300/72)
                mat = fitz.Matrix(300 / 72, 300 / 72)
                pix = page.get_pixmap(matrix=mat)
                return pix.tobytes("png")
            finally:
                # Close even when the page lookup or rendering raises
                # (e.g. a PDF with no pages) so the document is not leaked.
                doc.close()
        except Exception as e:
            logger.error("PDF first page extraction failed: %s", e)
            return b""

    def _perform_ocr(self, image_bytes: bytes) -> str:
        """
        Perform OCR on preprocessed image via engine abstraction.

        Args:
            image_bytes: Preprocessed image bytes

        Returns:
            Raw OCR text
        """
        config = OcrConfig()
        result = self._engine.recognize(image_bytes, config)
        return result.text

    def _detect_receipt_type(self, text: str) -> str:
        """
        Detect receipt type based on content.

        Each fuel keyword found in the text adds one point; a merchant name
        containing a known station name adds three. A score of two or more
        classifies the receipt as fuel.

        Args:
            text: OCR text

        Returns:
            Receipt type: "fuel" or "unknown"
        """
        text_upper = text.upper()
        fuel_score = sum(1 for kw in self._FUEL_KEYWORDS if kw in text_upper)

        # Check for known gas stations (single matcher call, result reused)
        merchant_result = fuel_matcher.extract_merchant_name(text)
        if merchant_result:
            merchant, _ = merchant_result
            merchant_upper = merchant.upper()
            if any(
                station in merchant_upper
                for station in fuel_matcher.STATION_NAMES
            ):
                fuel_score += 3

        if fuel_score >= 2:
            return "fuel"

        return "unknown"

    @staticmethod
    def _store_match(
        fields: dict[str, ExtractedField], name: str, match: Any
    ) -> None:
        """Copy a matcher result's value/confidence into ``fields[name]`` if truthy."""
        if match:
            fields[name] = ExtractedField(
                value=match.value,
                confidence=match.confidence,
            )

    @staticmethod
    def _derive_price_per_unit(
        total: ExtractedField, quantity: ExtractedField
    ) -> Optional[ExtractedField]:
        """
        Derive a price per unit from total amount and fuel quantity.

        Returns:
            The derived field, or None when the division is impossible (zero
            or non-numeric operands) or the result is outside 1.0-10.0.
        """
        try:
            unit_price = total.value / quantity.value
            # Only use if reasonable
            plausible = 1.0 <= unit_price <= 10.0
        except (ArithmeticError, TypeError):
            # A zero or unusable OCR value must not fail the whole extraction.
            return None

        if not plausible:
            return None

        return ExtractedField(
            value=round(unit_price, 3),
            # Lower confidence for calculated value
            confidence=min(total.confidence, quantity.confidence) * 0.8,
        )

    def _extract_fuel_fields(self, text: str) -> dict[str, ExtractedField]:
        """
        Extract fuel-specific fields from receipt text.

        Args:
            text: OCR text

        Returns:
            Dictionary of extracted fields. Possible keys: "merchantName",
            "transactionDate", "totalAmount", "fuelQuantity", "pricePerUnit",
            "fuelGrade". "pricePerUnit" is derived from total and quantity
            when it was not found directly and the result is plausible.
        """
        fields: dict[str, ExtractedField] = {}

        # Extract merchant name (matcher returns a (name, confidence) pair)
        merchant_result = fuel_matcher.extract_merchant_name(text)
        if merchant_result:
            merchant_name, confidence = merchant_result
            fields["merchantName"] = ExtractedField(
                value=merchant_name,
                confidence=confidence,
            )

        self._store_match(
            fields, "transactionDate", date_matcher.extract_best_date(text)
        )
        self._store_match(
            fields, "totalAmount", currency_matcher.extract_total(text)
        )
        self._store_match(
            fields, "fuelQuantity", fuel_matcher.extract_quantity(text)
        )
        self._store_match(
            fields, "pricePerUnit", fuel_matcher.extract_price_per_unit(text)
        )
        self._store_match(fields, "fuelGrade", fuel_matcher.extract_grade(text))

        # Calculate derived values if we have enough data
        if (
            "pricePerUnit" not in fields
            and "totalAmount" in fields
            and "fuelQuantity" in fields
        ):
            derived = self._derive_price_per_unit(
                fields["totalAmount"], fields["fuelQuantity"]
            )
            if derived is not None:
                fields["pricePerUnit"] = derived

        return fields

    def _extract_generic_fields(self, text: str) -> dict[str, ExtractedField]:
        """
        Extract generic fields from receipt text.

        Args:
            text: OCR text

        Returns:
            Dictionary of extracted fields. Possible keys: "transactionDate",
            "totalAmount", "merchantName".
        """
        fields: dict[str, ExtractedField] = {}

        self._store_match(
            fields, "transactionDate", date_matcher.extract_best_date(text)
        )
        self._store_match(
            fields, "totalAmount", currency_matcher.extract_total(text)
        )

        # Try to get merchant from first non-blank line (low, fixed confidence
        # because this is only a positional guess, truncated to 50 characters)
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        if lines:
            fields["merchantName"] = ExtractedField(
                value=lines[0][:50],
                confidence=0.40,
            )

        return fields

    def validate(self, data: Any) -> bool:
        """
        Validate extracted receipt data.

        Args:
            data: Extracted data to validate

        Returns:
            True if data has minimum required fields
        """
        if not isinstance(data, dict):
            return False

        # Minimum: must have at least total amount or date
        return "totalAmount" in data or "transactionDate" in data


# Singleton instance shared by importers of this module. It is constructed at
# import time, so ReceiptExtractor.__init__ (and therefore create_engine())
# runs as a side effect of importing the module.
receipt_extractor = ReceiptExtractor()