Files
motovaultpro/ocr/app/services/ocr_service.py
Eric Gullickson 5e4515da7c
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 37s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 52s
Deploy to Staging / Verify Staging (pull_request) Successful in 9s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
fix: use PyMuPDF instead of pdf2image for PDF-to-image conversion (refs #182)
pdf2image requires poppler-utils which is not installed in the OCR
container. PyMuPDF is already in requirements.txt and can render PDF
pages to PNG at 300 DPI natively without extra system dependencies.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 21:34:17 -06:00

253 lines
8.2 KiB
Python

"""Core OCR service with HEIC support, using pluggable engine abstraction."""
import io
import logging
import re
import time
from typing import Optional

import magic
from PIL import Image
from pillow_heif import register_heif_opener

from app.engines import OcrConfig, create_engine
from app.models import DocumentType, ExtractedField, OcrResponse
from app.services.preprocessor import preprocessor
# Register the HEIF/HEIC opener with Pillow at import time so that
# Image.open() can read HEIC/HEIF input (relied on by the HEIC-to-PNG
# conversion in OcrService).
register_heif_opener()
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
class OcrService:
    """Core OCR processing service.

    Takes raw file bytes, converts HEIC/HEIF images and PDFs to PNG,
    optionally runs the shared preprocessor, delegates text recognition to
    the engine returned by ``create_engine()``, and then derives a document
    type and a handful of structured fields from the recognized text using
    keyword and regex heuristics.
    """

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
        "application/pdf",
    }

    # Resolution used when rasterizing a PDF page. PDF user space is defined
    # in points (72 per inch), so the zoom factor is DPI / 72.
    PDF_RENDER_DPI = 300
    _PDF_POINTS_PER_INCH = 72

    # 17 alphanumeric characters, excluding I, O and Q.
    _VIN_PATTERN = re.compile(r"\b([A-HJ-NPR-Z0-9]{17})\b")
    # Dollar amounts: either comma-grouped thousands ("1,234.56") or a plain
    # run of digits ("1234.56"). The plain alternative matters: with only the
    # grouped form, "$1234.56" would be captured as "123".
    _AMOUNT_PATTERN = re.compile(
        r"\$\s*(\d{1,3}(?:,\d{3})+(?:\.\d{2})?|\d+(?:\.\d{2})?)"
    )
    _DATE_PATTERN = re.compile(r"(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})")
    _GALLON_PATTERN = re.compile(r"(\d+\.?\d*)\s*(?:gal|gallons)")

    # Keyword lists (matched as lowercase substrings of the OCR text).
    _VIN_INDICATORS = (
        "vin",
        "vehicle identification",
        "title",
        "registration",
        "certificate of title",
    )
    _RECEIPT_INDICATORS = (
        "receipt",
        "total",
        "subtotal",
        "tax",
        "payment",
        "invoice",
        "amount due",
        "gallons",
        "price/gallon",
    )
    _MANUAL_INDICATORS = (
        "owner's manual",
        "maintenance schedule",
        "service interval",
        "chapter",
        "table of contents",
        "specifications",
    )

    def __init__(self) -> None:
        """Initialize OCR service with engine from factory."""
        self._engine = create_engine()

    def extract(
        self,
        file_bytes: bytes,
        content_type: Optional[str] = None,
        preprocess: bool = True,
    ) -> OcrResponse:
        """
        Extract text from an image or PDF file.

        Args:
            file_bytes: Raw file bytes
            content_type: MIME type (optional, will be detected if not
                provided). Parameters such as ``; charset=...`` and letter
                case are ignored.
            preprocess: Whether to apply preprocessing

        Returns:
            OcrResponse with extracted text and metadata. Unsupported types,
            empty input, and any error raised during conversion or OCR yield
            a response with ``success=False`` rather than an exception.
        """
        # perf_counter is monotonic, so the reported duration cannot go
        # negative if the wall clock is adjusted mid-request.
        start_time = time.perf_counter()

        if not file_bytes:
            logger.warning("OCR extraction skipped: empty input")
            return self._failure_response(start_time)

        if content_type:
            # MIME types are case-insensitive and may carry parameters.
            content_type = content_type.split(";", 1)[0].strip().lower()
        # Detect file type if not provided
        if not content_type:
            content_type = self._detect_mime_type(file_bytes)

        # Validate file type
        if content_type not in self.SUPPORTED_TYPES:
            logger.warning("Unsupported content type: %s", content_type)
            return self._failure_response(start_time)

        try:
            if content_type in ("image/heic", "image/heif"):
                # Convert HEIC/HEIF to standard format
                file_bytes = self._convert_heic(file_bytes)
            elif content_type == "application/pdf":
                # Handle PDF (extract first page as image)
                file_bytes = self._extract_pdf_first_page(file_bytes)
                if not file_bytes:
                    # The helper already logged the reason; do not hand an
                    # empty buffer to the preprocessor or the engine.
                    return self._failure_response(start_time)

            # Apply preprocessing if enabled
            if preprocess:
                file_bytes = preprocessor.preprocess(
                    file_bytes, deskew=True, denoise=True
                )

            # Perform OCR via engine abstraction
            result = self._engine.recognize(file_bytes, OcrConfig())
            raw_text = result.text
            confidence = result.confidence

            # Detect document type from content, then pull out its fields
            document_type = self._detect_document_type(raw_text)
            extracted_fields = self._extract_fields(raw_text, document_type)

            processing_time_ms = self._elapsed_ms(start_time)
            logger.info(
                f"OCR completed: {len(raw_text)} chars, "
                f"{confidence:.2%} confidence, {processing_time_ms}ms"
            )
            return OcrResponse(
                success=True,
                documentType=document_type,
                rawText=raw_text,
                confidence=confidence,
                extractedFields=extracted_fields,
                processingTimeMs=processing_time_ms,
            )
        except Exception as e:
            # Top-level boundary: log with traceback and report failure
            # instead of propagating to the caller.
            logger.error("OCR extraction failed: %s", e, exc_info=True)
            return self._failure_response(start_time)

    @staticmethod
    def _elapsed_ms(start_time: float) -> int:
        """Return whole milliseconds elapsed since *start_time* (perf_counter)."""
        return int((time.perf_counter() - start_time) * 1000)

    @classmethod
    def _failure_response(cls, start_time: float) -> OcrResponse:
        """Build the uniform ``success=False`` response used on every failure path."""
        return OcrResponse(
            success=False,
            documentType=DocumentType.UNKNOWN,
            rawText="",
            confidence=0.0,
            extractedFields={},
            processingTimeMs=cls._elapsed_ms(start_time),
        )

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic.

        Falls back to ``application/octet-stream`` when nothing is detected.
        """
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _convert_heic(self, heic_bytes: bytes) -> bytes:
        """Convert HEIC/HEIF to PNG format."""
        buffer = io.BytesIO()
        # pillow-heif registers itself with PIL, so we can open HEIC directly.
        # The context manager releases the decoded image once it is saved.
        with Image.open(io.BytesIO(heic_bytes)) as image:
            image.save(buffer, format="PNG")
        return buffer.getvalue()

    def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes:
        """Extract first page of PDF as PNG image.

        Returns:
            PNG bytes of the first page rendered at ``PDF_RENDER_DPI``, or
            ``b""`` when PyMuPDF is unavailable, the document has no pages,
            or rendering fails (the reason is logged).
        """
        try:
            import fitz  # PyMuPDF

            zoom = self.PDF_RENDER_DPI / self._PDF_POINTS_PER_INCH
            # The context manager closes the document even if rendering raises.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
                if doc.page_count == 0:
                    logger.error("PDF extraction failed: document has no pages")
                    return b""
                pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                return pix.tobytes("png")
        except ImportError:
            logger.warning("PyMuPDF not available, PDF support limited")
        except Exception as e:
            logger.error("PDF extraction failed: %s", e, exc_info=True)
        return b""

    def _detect_document_type(self, text: str) -> DocumentType:
        """Detect document type from extracted text content.

        Checks are applied in order, first match wins:
        VIN (a VIN keyword AND a 17-character VIN-shaped token), then
        RECEIPT (at least two receipt keywords), then MANUAL (any manual
        keyword); otherwise UNKNOWN.
        """
        text_lower = text.lower()

        # VIN documents need both a keyword and a VIN-shaped sequence.
        has_vin_keyword = any(ind in text_lower for ind in self._VIN_INDICATORS)
        if has_vin_keyword and self._VIN_PATTERN.search(text.upper()):
            return DocumentType.VIN

        # Substring matching: note that "subtotal" also contains "total",
        # so that single word counts as two hits.
        receipt_hits = sum(
            1 for ind in self._RECEIPT_INDICATORS if ind in text_lower
        )
        if receipt_hits >= 2:
            return DocumentType.RECEIPT

        if any(ind in text_lower for ind in self._MANUAL_INDICATORS):
            return DocumentType.MANUAL

        return DocumentType.UNKNOWN

    def _extract_fields(
        self, text: str, document_type: DocumentType
    ) -> dict[str, ExtractedField]:
        """Extract specific fields based on document type.

        VIN documents may yield ``"vin"``; receipts may yield ``"total"``,
        ``"date"`` and ``"gallons"``. Other document types yield no fields.
        Confidence values are fixed per-field constants, not engine output.
        """
        fields: dict[str, ExtractedField] = {}

        if document_type == DocumentType.VIN:
            # Extract VIN (17 alphanumeric characters, excluding I, O, Q)
            vin_match = self._VIN_PATTERN.search(text.upper())
            if vin_match:
                fields["vin"] = ExtractedField(
                    value=vin_match.group(1), confidence=0.9
                )
        elif document_type == DocumentType.RECEIPT:
            # Extract amounts (currency patterns)
            amounts = self._AMOUNT_PATTERN.findall(text)
            if amounts:
                # Last amount is often the total
                fields["total"] = ExtractedField(
                    value=f"${amounts[-1]}", confidence=0.7
                )

            # Extract date
            date_match = self._DATE_PATTERN.search(text)
            if date_match:
                fields["date"] = ExtractedField(
                    value=date_match.group(1), confidence=0.8
                )

            # Extract gallons (for fuel receipts)
            gallon_match = self._GALLON_PATTERN.search(text.lower())
            if gallon_match:
                fields["gallons"] = ExtractedField(
                    value=gallon_match.group(1), confidence=0.85
                )

        return fields
# Singleton instance shared by importers of this module. Constructing it
# runs OcrService.__init__, which calls create_engine() at import time.
ocr_service = OcrService()