All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 37s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 52s
Deploy to Staging / Verify Staging (pull_request) Successful in 9s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
pdf2image requires poppler-utils, which is not installed in the OCR container. PyMuPDF is already in requirements.txt and can render PDF pages to PNG at 300 DPI natively, without extra system dependencies.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
253 lines
8.2 KiB
Python
253 lines
8.2 KiB
Python
"""Core OCR service with HEIC support, using pluggable engine abstraction."""
|
|
import io
import logging
import re
import time
from typing import Optional

import magic
from PIL import Image
from pillow_heif import register_heif_opener

from app.engines import OcrConfig, create_engine
from app.models import DocumentType, ExtractedField, OcrResponse
from app.services.preprocessor import preprocessor
|
|
|
|
# Register the HEIF/HEIC opener with Pillow. This runs once at import time so
# that Image.open() can decode HEIC/HEIF input later on.
register_heif_opener()

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class OcrService:
    """Core OCR processing service.

    Turns an uploaded file (JPEG, PNG, HEIC/HEIF, or the first page of a PDF)
    into image bytes, optionally runs the shared ``preprocessor`` over it,
    hands it to the engine returned by ``create_engine()``, and then
    classifies the recognized text and pulls out a few well-known fields.
    """

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
        "application/pdf",
    }

    # Resolution used when rasterizing a PDF page (PDF user space is 72 DPI).
    _PDF_RENDER_DPI = 300

    # 17 characters from the VIN alphabet (the letters I, O and Q are excluded).
    _VIN_RE = re.compile(r"\b([A-HJ-NPR-Z0-9]{17})\b")

    # Dollar amount. The first alternative handles thousands separators
    # ("1,234.56"); the second handles plain digit runs ("1234.56"), which a
    # bare ``\d{1,3}`` prefix would truncate to "123".
    _AMOUNT_RE = re.compile(r"\$\s*((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?)")
    _DATE_RE = re.compile(r"(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})")
    _GALLONS_RE = re.compile(r"(\d+\.?\d*)\s*(?:gal|gallons)")

    # "total" that is not merely the tail of "subtotal".
    _STANDALONE_TOTAL_RE = re.compile(r"(?<!sub)total")

    _VIN_INDICATORS = (
        "vin",
        "vehicle identification",
        "title",
        "registration",
        "certificate of title",
    )

    # "total" is scored separately through _STANDALONE_TOTAL_RE.
    _RECEIPT_INDICATORS = (
        "receipt",
        "subtotal",
        "tax",
        "payment",
        "invoice",
        "amount due",
        "gallons",
        "price/gallon",
    )

    _MANUAL_INDICATORS = (
        "owner's manual",
        "maintenance schedule",
        "service interval",
        "chapter",
        "table of contents",
        "specifications",
    )

    def __init__(self) -> None:
        """Initialize OCR service with engine from factory."""
        self._engine = create_engine()

    def extract(
        self,
        file_bytes: bytes,
        content_type: Optional[str] = None,
        preprocess: bool = True,
    ) -> OcrResponse:
        """
        Extract text from an image or PDF file.

        Args:
            file_bytes: Raw file bytes
            content_type: MIME type (optional, will be detected if not
                provided). Parameters such as ``; charset=binary`` and
                letter case are ignored.
            preprocess: Whether to apply preprocessing

        Returns:
            OcrResponse with extracted text and metadata. Unsupported or
            empty input, and any exception raised while converting,
            preprocessing or recognizing, produce a response with
            ``success=False`` instead of raising.
        """
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # when the wall clock is adjusted.
        started = time.perf_counter()

        if not file_bytes:
            logger.warning("OCR extraction skipped: empty input")
            return self._failure_response(started)

        # Normalize a caller-supplied type; detect it when absent.
        if content_type:
            content_type = content_type.split(";", 1)[0].strip().lower()
        if not content_type:
            content_type = self._detect_mime_type(file_bytes)

        # Validate file type
        if content_type not in self.SUPPORTED_TYPES:
            logger.warning("Unsupported content type: %s", content_type)
            return self._failure_response(started)

        try:
            if content_type in ("image/heic", "image/heif"):
                # Convert HEIC/HEIF to standard format
                file_bytes = self._convert_heic(file_bytes)
            elif content_type == "application/pdf":
                # Handle PDF (extract first page as image)
                file_bytes = self._extract_pdf_first_page(file_bytes)
                if not file_bytes:
                    # The helper signals failure with b""; stop here rather
                    # than feeding empty bytes to the preprocessor/engine.
                    raise ValueError("PDF first page could not be rendered")

            # Apply preprocessing if enabled
            if preprocess:
                file_bytes = preprocessor.preprocess(
                    file_bytes, deskew=True, denoise=True
                )

            # Perform OCR via engine abstraction
            result = self._engine.recognize(file_bytes, OcrConfig())
            raw_text = result.text
            confidence = result.confidence

            # Classify the text, then extract the fields for that class
            document_type = self._detect_document_type(raw_text)
            extracted_fields = self._extract_fields(raw_text, document_type)

            processing_time_ms = self._elapsed_ms(started)

            logger.info(
                f"OCR completed: {len(raw_text)} chars, "
                f"{confidence:.2%} confidence, {processing_time_ms}ms"
            )

            return OcrResponse(
                success=True,
                documentType=document_type,
                rawText=raw_text,
                confidence=confidence,
                extractedFields=extracted_fields,
                processingTimeMs=processing_time_ms,
            )

        except Exception as e:
            # Top-level boundary: log with traceback and report failure.
            logger.exception("OCR extraction failed: %s", e)
            return self._failure_response(started)

    @staticmethod
    def _elapsed_ms(started: float) -> int:
        """Return whole milliseconds elapsed since ``started`` (perf_counter)."""
        return int((time.perf_counter() - started) * 1000)

    def _failure_response(self, started: float) -> OcrResponse:
        """Build the ``success=False`` response shared by all failure paths."""
        return OcrResponse(
            success=False,
            documentType=DocumentType.UNKNOWN,
            rawText="",
            confidence=0.0,
            extractedFields={},
            processingTimeMs=self._elapsed_ms(started),
        )

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic.

        Falls back to ``application/octet-stream`` when detection returns an
        empty value.
        """
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _convert_heic(self, heic_bytes: bytes) -> bytes:
        """Convert HEIC/HEIF to PNG format."""
        # pillow-heif registers itself with PIL, so we can open HEIC directly.
        # The context manager releases the decoded image once it is saved.
        with Image.open(io.BytesIO(heic_bytes)) as image:
            buffer = io.BytesIO()
            image.save(buffer, format="PNG")
        return buffer.getvalue()

    def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes:
        """Extract first page of PDF as PNG image.

        Returns:
            PNG bytes of the first page rendered at ``_PDF_RENDER_DPI``, or
            ``b""`` when PyMuPDF is missing or rendering fails (the failure
            is logged).
        """
        try:
            import fitz  # PyMuPDF
        except ImportError:
            logger.warning("PyMuPDF not available, PDF support limited")
            return b""

        try:
            # The context manager closes the document even if rendering raises.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
                page = doc[0]
                zoom = self._PDF_RENDER_DPI / 72
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                return pix.tobytes("png")
        except Exception as e:
            logger.error("PDF extraction failed: %s", e)
            return b""

    def _detect_document_type(self, text: str) -> DocumentType:
        """Detect document type from extracted text content.

        Checks run in order and the first hit wins:

        1. VIN: any VIN indicator phrase AND a 17-character VIN-alphabet
           token somewhere in the text.
        2. RECEIPT: at least two distinct receipt indicators.
        3. MANUAL: any manual indicator phrase.

        Anything else is ``DocumentType.UNKNOWN``.
        """
        text_lower = text.lower()

        # VIN documents: the indicator words are loose substrings, so also
        # require an actual 17-character candidate.
        if any(indicator in text_lower for indicator in self._VIN_INDICATORS):
            if self._VIN_RE.search(text.upper()):
                return DocumentType.VIN

        # Receipts: "total" is counted only when it stands apart from
        # "subtotal", otherwise a lone "subtotal" would score as two hits.
        receipt_hits = sum(
            1 for indicator in self._RECEIPT_INDICATORS if indicator in text_lower
        )
        if self._STANDALONE_TOTAL_RE.search(text_lower):
            receipt_hits += 1
        if receipt_hits >= 2:
            return DocumentType.RECEIPT

        # Manuals
        if any(indicator in text_lower for indicator in self._MANUAL_INDICATORS):
            return DocumentType.MANUAL

        return DocumentType.UNKNOWN

    def _extract_fields(
        self, text: str, document_type: DocumentType
    ) -> dict[str, ExtractedField]:
        """Extract specific fields based on document type.

        VIN documents yield ``"vin"``; receipts yield any of ``"total"``,
        ``"date"`` and ``"gallons"`` that can be found. Every field carries a
        hard-coded confidence value. Other document types yield an empty dict.
        """
        fields: dict[str, ExtractedField] = {}

        if document_type == DocumentType.VIN:
            # Extract VIN (17 alphanumeric characters, excluding I, O, Q)
            match = self._VIN_RE.search(text.upper())
            if match:
                fields["vin"] = ExtractedField(value=match.group(1), confidence=0.9)

        elif document_type == DocumentType.RECEIPT:
            # Extract amounts (currency patterns)
            amounts = self._AMOUNT_RE.findall(text)
            if amounts:
                # Last amount is often the total
                fields["total"] = ExtractedField(
                    value=f"${amounts[-1]}", confidence=0.7
                )

            # Extract date
            date_match = self._DATE_RE.search(text)
            if date_match:
                fields["date"] = ExtractedField(
                    value=date_match.group(1), confidence=0.8
                )

            # Extract gallons (for fuel receipts)
            gallon_match = self._GALLONS_RE.search(text.lower())
            if gallon_match:
                fields["gallons"] = ExtractedField(
                    value=gallon_match.group(1), confidence=0.85
                )

        return fields
|
|
|
|
|
|
# Singleton instance shared by importers of this module. Constructing it calls
# create_engine() (see OcrService.__init__), so the engine is built at import time.
ocr_service = OcrService()
|