# motovaultpro/ocr/app/services/ocr_service.py

"""Core OCR service using Tesseract with HEIC support."""
import io
import logging
import re
import time
from typing import Optional

import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener

from app.config import settings
from app.models import DocumentType, ExtractedField, OcrResponse
from app.services.preprocessor import preprocessor

# Register the HEIF/HEIC opener with Pillow at import time so that
# Image.open() can read HEIC/HEIF data (OcrService._convert_heic relies on it).
register_heif_opener()

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


class OcrService:
    """Core OCR processing service.

    Takes raw file bytes (JPEG, PNG, HEIC/HEIF or PDF), converts them to an
    image Tesseract can read, runs OCR, and derives a document type plus a
    few structured fields from the recognised text.
    """

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
        "application/pdf",
    }

    # Keywords (matched as lower-case substrings) hinting at each document type.
    _VIN_INDICATORS = (
        "vin",
        "vehicle identification",
        "title",
        "registration",
        "certificate of title",
    )
    _RECEIPT_INDICATORS = (
        "receipt",
        "total",
        "subtotal",
        "tax",
        "payment",
        "invoice",
        "amount due",
        "gallons",
        "price/gallon",
    )
    _MANUAL_INDICATORS = (
        "owner's manual",
        "maintenance schedule",
        "service interval",
        "chapter",
        "table of contents",
        "specifications",
    )

    # 17 characters from the VIN alphabet (the letters I, O and Q are excluded).
    _VIN_RE = re.compile(r"\b([A-HJ-NPR-Z0-9]{17})\b")

    # One alternation over all receipt indicators, longest first, so that a
    # scan yields non-overlapping hits: "subtotal" is reported once instead
    # of also being counted as "total".
    _RECEIPT_RE = re.compile(
        "|".join(map(re.escape, sorted(_RECEIPT_INDICATORS, key=len, reverse=True)))
    )

    # Dollar amounts: either comma-grouped ("1,234.56") or plain digits
    # ("1234.56"). The plain-digits branch keeps ungrouped amounts of 1000
    # or more from being cut off after three digits.
    _AMOUNT_RE = re.compile(
        r"\$\s*(\d{1,3}(?:,\d{3})+(?:\.\d{2})?|\d+(?:\.\d{2})?)"
    )
    _DATE_RE = re.compile(r"(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})")
    # The trailing \b stops words that merely start with "gal"
    # (e.g. "galvanized") from being read as a fuel quantity.
    _GALLONS_RE = re.compile(r"(\d+\.?\d*)\s*gal(?:s|lons?)?\b")

    # Fixed heuristic confidence assigned to each extracted receipt field.
    _RECEIPT_FIELD_CONFIDENCE = {"total": 0.7, "date": 0.8, "gallons": 0.85}

    def __init__(self) -> None:
        """Point pytesseract at the Tesseract binary named in the settings."""
        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd

    def extract(
        self,
        file_bytes: bytes,
        content_type: Optional[str] = None,
        preprocess: bool = True,
    ) -> OcrResponse:
        """
        Extract text from an image or the first page of a PDF.

        Args:
            file_bytes: Raw file bytes
            content_type: MIME type (optional, will be detected if not provided).
                Case and any ";parameter" suffix are ignored.
            preprocess: Whether to apply preprocessing (deskew + denoise)

        Returns:
            OcrResponse with extracted text and metadata. Unsupported types
            and processing errors yield a response with success=False
            rather than raising.
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments.
        started = time.perf_counter()

        if content_type:
            # MIME types are case-insensitive and may carry parameters.
            content_type = content_type.split(";", 1)[0].strip().lower()

        # Detect file type if not provided
        if not content_type:
            content_type = self._detect_mime_type(file_bytes)

        # Validate file type
        if content_type not in self.SUPPORTED_TYPES:
            logger.warning("Unsupported content type for OCR: %s", content_type)
            return self._failure_response(started)

        try:
            if content_type in ("image/heic", "image/heif"):
                # Convert HEIC/HEIF to standard format
                file_bytes = self._convert_heic(file_bytes)
            elif content_type == "application/pdf":
                # Handle PDF (extract first page as image)
                file_bytes = self._extract_pdf_first_page(file_bytes)
                if not file_bytes:
                    # The helper already logged the cause; do not feed empty
                    # bytes to the preprocessor / Pillow.
                    return self._failure_response(started)

            # Apply preprocessing if enabled
            if preprocess:
                file_bytes = preprocessor.preprocess(
                    file_bytes, deskew=True, denoise=True
                )

            # Perform OCR; the context manager releases the image afterwards.
            with Image.open(io.BytesIO(file_bytes)) as image:
                ocr_data = pytesseract.image_to_data(
                    image, output_type=pytesseract.Output.DICT
                )

            raw_text, confidence = self._process_ocr_data(ocr_data)
            document_type = self._detect_document_type(raw_text)
            extracted_fields = self._extract_fields(raw_text, document_type)

            processing_time_ms = int((time.perf_counter() - started) * 1000)

            logger.info(
                "OCR completed: %d chars, %.2f%% confidence, %dms",
                len(raw_text),
                confidence * 100,
                processing_time_ms,
            )

            return OcrResponse(
                success=True,
                documentType=document_type,
                rawText=raw_text,
                confidence=confidence,
                extractedFields=extracted_fields,
                processingTimeMs=processing_time_ms,
            )

        except Exception as e:
            # Boundary handler: callers get a failure response, never an
            # exception from the OCR pipeline itself.
            logger.exception("OCR extraction failed: %s", e)
            return self._failure_response(started)

    @staticmethod
    def _failure_response(started: float) -> OcrResponse:
        """Build the empty success=False response, timed from *started*."""
        return OcrResponse(
            success=False,
            documentType=DocumentType.UNKNOWN,
            rawText="",
            confidence=0.0,
            extractedFields={},
            processingTimeMs=int((time.perf_counter() - started) * 1000),
        )

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic.

        Falls back to "application/octet-stream" when nothing is detected.
        """
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _convert_heic(self, heic_bytes: bytes) -> bytes:
        """Convert HEIC/HEIF to PNG format."""
        buffer = io.BytesIO()
        # pillow-heif registers itself with PIL, so we can open HEIC directly
        with Image.open(io.BytesIO(heic_bytes)) as image:
            image.save(buffer, format="PNG")
        return buffer.getvalue()

    def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes:
        """Extract first page of PDF as PNG image.

        Returns:
            PNG bytes of the first page rendered at 300 dpi, or b"" when
            pdf2image is not installed, rendering fails, or no page is
            produced.
        """
        try:
            # pdf2image is an optional dependency, hence the local import.
            from pdf2image import convert_from_bytes

            images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=300)
            if images:
                buffer = io.BytesIO()
                images[0].save(buffer, format="PNG")
                return buffer.getvalue()
        except ImportError:
            logger.warning("pdf2image not available, PDF support limited")
        except Exception as e:
            logger.exception("PDF extraction failed: %s", e)

        return b""

    def _process_ocr_data(
        self, ocr_data: dict
    ) -> tuple[str, float]:
        """Process Tesseract output to extract text and confidence.

        Args:
            ocr_data: Mapping with parallel "text" and "conf" lists, as
                produced by pytesseract.image_to_data(..., Output.DICT).

        Returns:
            (raw_text, confidence): the kept words joined by single spaces
            and their mean confidence scaled to 0-1. ("", 0.0) when no word
            qualifies.
        """
        words = []
        confidences = []

        texts = ocr_data.get("text") or []
        raw_confs = ocr_data.get("conf") or []
        for text, raw_conf in zip(texts, raw_confs):
            # Confidences may arrive as ints, floats or numeric strings such
            # as "96.5"; float() accepts all of them, int() does not.
            try:
                conf = float(raw_conf)
            except (TypeError, ValueError):
                continue
            # Filter out empty strings and non-positive confidences
            # (Tesseract reports -1 for non-word rows).
            if text.strip() and conf > 0:
                words.append(text)
                confidences.append(conf)

        raw_text = " ".join(words)
        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0

        # Normalize confidence to 0-1 range (Tesseract returns 0-100)
        return raw_text, avg_confidence / 100.0

    @classmethod
    def _find_vin(cls, text: str) -> Optional[str]:
        """Return the first 17-character VIN-like token in *text*, upper-cased, or None."""
        match = cls._VIN_RE.search(text.upper())
        return match.group(1) if match else None

    @classmethod
    def _count_receipt_indicators(cls, text_lower: str) -> int:
        """Count the distinct receipt indicators found in lower-cased text.

        Hits are non-overlapping, so an indicator contained in a longer one
        (e.g. "total" inside "subtotal") is not counted a second time.
        """
        return len(set(cls._RECEIPT_RE.findall(text_lower)))

    @classmethod
    def _parse_receipt_values(cls, text: str) -> dict[str, str]:
        """Pull the raw "total", "date" and "gallons" strings out of receipt text.

        Only the keys that were found are present in the result.
        """
        values: dict[str, str] = {}

        amounts = cls._AMOUNT_RE.findall(text)
        if amounts:
            # Last amount is often the total
            values["total"] = f"${amounts[-1]}"

        date_match = cls._DATE_RE.search(text)
        if date_match:
            values["date"] = date_match.group(1)

        # Gallons (for fuel receipts); matched case-insensitively.
        gallon_match = cls._GALLONS_RE.search(text.lower())
        if gallon_match:
            values["gallons"] = gallon_match.group(1)

        return values

    def _detect_document_type(self, text: str) -> DocumentType:
        """Detect document type from extracted text content.

        Checks, in order: VIN document (a VIN keyword plus a 17-character
        VIN-like token), receipt (at least two distinct receipt keywords),
        manual (any manual keyword); otherwise UNKNOWN.
        """
        text_lower = text.lower()

        if any(indicator in text_lower for indicator in self._VIN_INDICATORS):
            # Additional check: look for 17-character alphanumeric sequences
            if self._find_vin(text) is not None:
                return DocumentType.VIN

        if self._count_receipt_indicators(text_lower) >= 2:
            return DocumentType.RECEIPT

        if any(indicator in text_lower for indicator in self._MANUAL_INDICATORS):
            return DocumentType.MANUAL

        return DocumentType.UNKNOWN

    def _extract_fields(
        self, text: str, document_type: DocumentType
    ) -> dict[str, ExtractedField]:
        """Extract specific fields based on document type.

        VIN documents yield a "vin" field; receipts yield whichever of
        "total", "date" and "gallons" could be found. Other document types
        yield an empty dict. Confidences are fixed heuristic values.
        """
        fields: dict[str, ExtractedField] = {}

        if document_type == DocumentType.VIN:
            vin = self._find_vin(text)
            if vin is not None:
                fields["vin"] = ExtractedField(value=vin, confidence=0.9)

        elif document_type == DocumentType.RECEIPT:
            for name, value in self._parse_receipt_values(text).items():
                fields[name] = ExtractedField(
                    value=value, confidence=self._RECEIPT_FIELD_CONFIDENCE[name]
                )

        return fields


# Module-level singleton instance, created at import time. Constructing it
# runs OcrService.__init__, which sets pytesseract's tesseract_cmd from settings.
ocr_service = OcrService()