"""Core OCR service with HEIC support, using pluggable engine abstraction.""" import io import logging import time from typing import Optional import magic from PIL import Image from pillow_heif import register_heif_opener from app.engines import OcrConfig, create_engine from app.models import DocumentType, ExtractedField, OcrResponse from app.services.preprocessor import preprocessor # Register HEIF/HEIC opener with Pillow register_heif_opener() logger = logging.getLogger(__name__) class OcrService: """Core OCR processing service.""" # Supported MIME types SUPPORTED_TYPES = { "image/jpeg", "image/png", "image/heic", "image/heif", "application/pdf", } def __init__(self) -> None: """Initialize OCR service with engine from factory.""" self._engine = create_engine() def extract( self, file_bytes: bytes, content_type: Optional[str] = None, preprocess: bool = True, ) -> OcrResponse: """ Extract text from an image file. Args: file_bytes: Raw file bytes content_type: MIME type (optional, will be detected if not provided) preprocess: Whether to apply preprocessing Returns: OcrResponse with extracted text and metadata """ start_time = time.time() # Detect file type if not provided if not content_type: content_type = self._detect_mime_type(file_bytes) # Validate file type if content_type not in self.SUPPORTED_TYPES: return OcrResponse( success=False, documentType=DocumentType.UNKNOWN, rawText="", confidence=0.0, extractedFields={}, processingTimeMs=int((time.time() - start_time) * 1000), ) try: # Convert HEIC/HEIF to standard format if content_type in ("image/heic", "image/heif"): file_bytes = self._convert_heic(file_bytes) content_type = "image/png" # Handle PDF (extract first page as image) if content_type == "application/pdf": file_bytes = self._extract_pdf_first_page(file_bytes) content_type = "image/png" # Apply preprocessing if enabled if preprocess: file_bytes = preprocessor.preprocess( file_bytes, deskew=True, denoise=True ) # Perform OCR via engine abstraction config = OcrConfig() result = self._engine.recognize(file_bytes, config) raw_text = result.text confidence = result.confidence # Detect document type from content document_type = self._detect_document_type(raw_text) # Extract fields based on document type extracted_fields = self._extract_fields(raw_text, document_type) processing_time_ms = int((time.time() - start_time) * 1000) logger.info( f"OCR completed: {len(raw_text)} chars, " f"{confidence:.2%} confidence, {processing_time_ms}ms" ) return OcrResponse( success=True, documentType=document_type, rawText=raw_text, confidence=confidence, extractedFields=extracted_fields, processingTimeMs=processing_time_ms, ) except Exception as e: logger.error(f"OCR extraction failed: {e}", exc_info=True) return OcrResponse( success=False, documentType=DocumentType.UNKNOWN, rawText="", confidence=0.0, extractedFields={}, processingTimeMs=int((time.time() - start_time) * 1000), ) def _detect_mime_type(self, file_bytes: bytes) -> str: """Detect MIME type using python-magic.""" mime = magic.Magic(mime=True) detected = mime.from_buffer(file_bytes) return detected or "application/octet-stream" def _convert_heic(self, heic_bytes: bytes) -> bytes: """Convert HEIC/HEIF to PNG format.""" # pillow-heif registers itself with PIL, so we can open HEIC directly image = Image.open(io.BytesIO(heic_bytes)) buffer = io.BytesIO() image.save(buffer, format="PNG") return buffer.getvalue() def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes: """Extract first page of PDF as PNG image.""" try: import fitz # PyMuPDF doc = fitz.open(stream=pdf_bytes, filetype="pdf") page = doc[0] mat = fitz.Matrix(300 / 72, 300 / 72) pix = page.get_pixmap(matrix=mat) png_bytes = pix.tobytes("png") doc.close() return png_bytes except ImportError: logger.warning("PyMuPDF not available, PDF support limited") except Exception as e: logger.error(f"PDF extraction failed: {e}") return b"" def _detect_document_type(self, text: str) -> DocumentType: """Detect document type from extracted text content.""" text_lower = text.lower() # VIN document indicators vin_indicators = [ "vin", "vehicle identification", "title", "registration", "certificate of title", ] if any(indicator in text_lower for indicator in vin_indicators): # Additional check: look for 17-character alphanumeric sequences import re vin_pattern = r"\b[A-HJ-NPR-Z0-9]{17}\b" if re.search(vin_pattern, text.upper()): return DocumentType.VIN # Receipt indicators receipt_indicators = [ "receipt", "total", "subtotal", "tax", "payment", "invoice", "amount due", "gallons", "price/gallon", ] if sum(1 for ind in receipt_indicators if ind in text_lower) >= 2: return DocumentType.RECEIPT # Manual indicators manual_indicators = [ "owner's manual", "maintenance schedule", "service interval", "chapter", "table of contents", "specifications", ] if any(indicator in text_lower for indicator in manual_indicators): return DocumentType.MANUAL return DocumentType.UNKNOWN def _extract_fields( self, text: str, document_type: DocumentType ) -> dict[str, ExtractedField]: """Extract specific fields based on document type.""" import re fields: dict[str, ExtractedField] = {} if document_type == DocumentType.VIN: # Extract VIN (17 alphanumeric characters, excluding I, O, Q) vin_pattern = r"\b([A-HJ-NPR-Z0-9]{17})\b" match = re.search(vin_pattern, text.upper()) if match: fields["vin"] = ExtractedField(value=match.group(1), confidence=0.9) elif document_type == DocumentType.RECEIPT: # Extract amounts (currency patterns) amount_pattern = r"\$\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)" amounts = re.findall(amount_pattern, text) if amounts: # Last amount is often the total fields["total"] = ExtractedField( value=f"${amounts[-1]}", confidence=0.7 ) # Extract date date_pattern = r"(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})" date_match = re.search(date_pattern, text) if date_match: fields["date"] = ExtractedField(value=date_match.group(1), confidence=0.8) # Extract gallons (for fuel receipts) gallon_pattern = r"(\d+\.?\d*)\s*(?:gal|gallons)" gallon_match = re.search(gallon_pattern, text.lower()) if gallon_match: fields["gallons"] = ExtractedField( value=gallon_match.group(1), confidence=0.85 ) return fields # Singleton instance ocr_service = OcrService()