feat: add core OCR API integration (refs #65)

OCR Service (Python/FastAPI): - POST /extract for synchronous OCR extraction - POST /jobs and GET /jobs/{job_id} for async processing - Image preprocessing (deskew, denoise) for accuracy - HEIC conversion via pillow-heif - Redis job queue for async processing Backend (Fastify): - POST /api/ocr/extract - authenticated proxy to OCR - POST /api/ocr/jobs - async job submission - GET /api/ocr/jobs/:jobId - job polling - Multipart file upload handling - JWT authentication required File size limits: 10MB sync, 200MB async Processing time target: <3 seconds for typical photos Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 16:02:11 -06:00
parent 94e49306dc
commit 852c9013b5
25 changed files with 1931 additions and 3 deletions
--- a/ocr/app/services/ocr_service.py
+++ b/ocr/app/services/ocr_service.py
@@ -0,0 +1,275 @@
+"""Core OCR service using Tesseract with HEIC support."""
+import io
+import logging
+import time
+from typing import Optional
+
+import magic
+import pytesseract
+from PIL import Image
+from pillow_heif import register_heif_opener
+
+from app.config import settings
+from app.models import DocumentType, ExtractedField, OcrResponse
+from app.services.preprocessor import preprocessor
+
+# Register the pillow-heif opener with Pillow at import time so that
+# Image.open() can read HEIC/HEIF input (OcrService._convert_heic relies on it).
+register_heif_opener()
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+
+class OcrService:
+    """Core OCR processing service."""
+
+    # Supported MIME types
+    SUPPORTED_TYPES = {
+        "image/jpeg",
+        "image/png",
+        "image/heic",
+        "image/heif",
+        "application/pdf",
+    }
+
+    def __init__(self) -> None:
+        """Configure pytesseract's ``tesseract_cmd`` from application settings."""
+        tesseract_module = pytesseract.pytesseract
+        tesseract_module.tesseract_cmd = settings.tesseract_cmd
+
+    def extract(
+        self,
+        file_bytes: bytes,
+        content_type: Optional[str] = None,
+        preprocess: bool = True,
+    ) -> OcrResponse:
+        """
+        Extract text from an image, HEIC/HEIF or PDF file.
+
+        This method never raises for processing errors: every failure path
+        (empty input, unsupported type, unrenderable PDF, any exception during
+        conversion/preprocessing/OCR) is logged and reported as an
+        ``OcrResponse`` with ``success=False``.
+
+        Args:
+            file_bytes: Raw file bytes
+            content_type: MIME type (optional). Parameters such as
+                ``; charset=...`` are stripped and the value is lower-cased.
+                When missing or the generic ``application/octet-stream``,
+                the type is detected from the bytes instead.
+            preprocess: Whether to apply preprocessing (deskew + denoise)
+
+        Returns:
+            OcrResponse with extracted text and metadata
+        """
+        # perf_counter is monotonic, so the reported duration cannot go
+        # negative or jump if the wall clock is adjusted mid-request.
+        start_time = time.perf_counter()
+
+        if not file_bytes:
+            logger.warning("OCR extraction rejected: empty file")
+            return self._failure_response(start_time)
+
+        # Normalize the declared type; fall back to sniffing when the caller
+        # gave nothing useful (upload clients often send octet-stream).
+        content_type = (content_type or "").split(";", 1)[0].strip().lower()
+        if not content_type or content_type == "application/octet-stream":
+            content_type = self._detect_mime_type(file_bytes)
+
+        # Validate file type
+        if content_type not in self.SUPPORTED_TYPES:
+            logger.warning(
+                "OCR extraction rejected: unsupported content type %r",
+                content_type,
+            )
+            return self._failure_response(start_time)
+
+        try:
+            if content_type in ("image/heic", "image/heif"):
+                # Convert HEIC/HEIF to a standard format (PNG)
+                file_bytes = self._convert_heic(file_bytes)
+            elif content_type == "application/pdf":
+                # Handle PDF (first page rendered as a PNG image)
+                file_bytes = self._extract_pdf_first_page(file_bytes)
+                if not file_bytes:
+                    # The PDF helper is best-effort and signals failure with
+                    # empty bytes; stop here instead of feeding them to OCR.
+                    logger.warning(
+                        "OCR extraction failed: PDF could not be rendered"
+                    )
+                    return self._failure_response(start_time)
+
+            # Apply preprocessing if enabled
+            if preprocess:
+                file_bytes = preprocessor.preprocess(
+                    file_bytes, deskew=True, denoise=True
+                )
+
+            # Perform OCR; the context manager releases the image afterwards.
+            with Image.open(io.BytesIO(file_bytes)) as image:
+                ocr_data = pytesseract.image_to_data(
+                    image, output_type=pytesseract.Output.DICT
+                )
+
+            # Extract text and calculate confidence
+            raw_text, confidence = self._process_ocr_data(ocr_data)
+
+            # Detect document type from content, then pull type-specific fields
+            document_type = self._detect_document_type(raw_text)
+            extracted_fields = self._extract_fields(raw_text, document_type)
+
+            processing_time_ms = int((time.perf_counter() - start_time) * 1000)
+
+            logger.info(
+                "OCR completed: %d chars, %s confidence, %dms",
+                len(raw_text),
+                f"{confidence:.2%}",
+                processing_time_ms,
+            )
+
+            return OcrResponse(
+                success=True,
+                documentType=document_type,
+                rawText=raw_text,
+                confidence=confidence,
+                extractedFields=extracted_fields,
+                processingTimeMs=processing_time_ms,
+            )
+
+        except Exception as e:
+            # Deliberate catch-all at the service boundary: callers receive a
+            # failure response rather than an exception. Traceback is logged.
+            logger.exception("OCR extraction failed: %s", e)
+            return self._failure_response(start_time)
+
+    def _failure_response(self, start_time: float) -> OcrResponse:
+        """Build the empty ``success=False`` response used by all failure paths.
+
+        Args:
+            start_time: ``time.perf_counter()`` value captured when the
+                request started; used to fill ``processingTimeMs``.
+        """
+        return OcrResponse(
+            success=False,
+            documentType=DocumentType.UNKNOWN,
+            rawText="",
+            confidence=0.0,
+            extractedFields={},
+            processingTimeMs=int((time.perf_counter() - start_time) * 1000),
+        )
+
+    def _detect_mime_type(self, file_bytes: bytes) -> str:
+        """Detect the MIME type of *file_bytes* using python-magic.
+
+        Args:
+            file_bytes: Raw file bytes; only a leading sample is inspected.
+
+        Returns:
+            The detected MIME type, or ``"application/octet-stream"`` when
+            python-magic returns an empty result.
+        """
+        # Only a leading sample is handed to libmagic (python-magic's docs
+        # recommend at least the first 2048 bytes), so large uploads are not
+        # scanned in full. The module-level helper is used rather than
+        # constructing a new magic.Magic object on every call.
+        sample = file_bytes[:8192]
+        detected = magic.from_buffer(sample, mime=True)
+        return detected or "application/octet-stream"
+
+    def _convert_heic(self, heic_bytes: bytes) -> bytes:
+        """Convert HEIC/HEIF image bytes to PNG bytes.
+
+        Args:
+            heic_bytes: Raw HEIC/HEIF file contents.
+
+        Returns:
+            The same image re-encoded as PNG.
+
+        Errors raised by Pillow while opening or saving propagate to the
+        caller.
+        """
+        buffer = io.BytesIO()
+        # pillow-heif registers itself with PIL at module import, so HEIC can
+        # be opened directly. The context manager closes the decoded image
+        # once the PNG has been written instead of leaving it to the GC.
+        with Image.open(io.BytesIO(heic_bytes)) as image:
+            image.save(buffer, format="PNG")
+        return buffer.getvalue()
+
+    def _extract_pdf_first_page(self, pdf_bytes: bytes, dpi: int = 300) -> bytes:
+        """Render the first page of a PDF as a PNG image.
+
+        Best-effort: this method does not raise. Every failure is logged and
+        reported to the caller as empty bytes.
+
+        Args:
+            pdf_bytes: Raw PDF file contents.
+            dpi: Rendering resolution passed to pdf2image (default 300).
+
+        Returns:
+            PNG bytes of page 1, or ``b""`` when pdf2image is not installed,
+            rendering fails, or no page is produced.
+        """
+        try:
+            # Imported here so the module still loads when pdf2image is absent.
+            from pdf2image import convert_from_bytes
+        except ImportError:
+            logger.warning("pdf2image not available, PDF support limited")
+            return b""
+
+        try:
+            pages = convert_from_bytes(
+                pdf_bytes, first_page=1, last_page=1, dpi=dpi
+            )
+            if not pages:
+                logger.warning("PDF extraction produced no pages")
+                return b""
+
+            buffer = io.BytesIO()
+            pages[0].save(buffer, format="PNG")
+            return buffer.getvalue()
+        except Exception as e:
+            # Keep the best-effort contract, but record the traceback so
+            # rendering problems can be diagnosed.
+            logger.exception("PDF extraction failed: %s", e)
+            return b""
+
+    def _process_ocr_data(
+        self, ocr_data: dict
+    ) -> tuple[str, float]:
+        """Process Tesseract output to extract text and confidence.
+
+        Args:
+            ocr_data: Mapping with parallel ``"text"`` and ``"conf"``
+                sequences, as produced by ``pytesseract.image_to_data`` with
+                ``Output.DICT``.
+
+        Returns:
+            ``(raw_text, confidence)``: the kept words joined by single
+            spaces, and their mean confidence scaled to the 0-1 range
+            (``0.0`` when no word is kept).
+        """
+        words = []
+        confidences = []
+
+        for text, raw_conf in zip(ocr_data["text"], ocr_data["conf"]):
+            # Confidence may arrive as an int, a float, or a numeric string
+            # such as "96.5" (presumably depending on the Tesseract /
+            # pytesseract version), so parse it as a float; int("96.5")
+            # would raise ValueError.
+            try:
+                conf = float(raw_conf)
+            except (TypeError, ValueError):
+                continue
+
+            # Keep only non-blank entries with a positive confidence.
+            if conf > 0 and text.strip():
+                words.append(text)
+                confidences.append(conf)
+
+        raw_text = " ".join(words)
+        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
+
+        # Normalize confidence to 0-1 range (Tesseract returns 0-100)
+        return raw_text, avg_confidence / 100.0
+
+    def _detect_document_type(self, text: str) -> DocumentType:
+        """Detect document type from extracted text content.
+
+        Checks run in priority order and the first match wins:
+
+        1. VIN: any VIN indicator term AND a 17-character sequence using the
+           VIN alphabet (letters and digits without I, O, Q).
+        2. RECEIPT: at least two distinct receipt indicator terms.
+        3. MANUAL: any manual indicator term.
+
+        Indicator terms are matched case-insensitively as whole terms (not
+        preceded or followed by a letter), so "subtotal" does not also count
+        as "total" and "saving" does not count as "vin".
+
+        Args:
+            text: Raw OCR text.
+
+        Returns:
+            The detected DocumentType, or ``DocumentType.UNKNOWN``.
+        """
+        import re
+
+        text_lower = text.lower()
+
+        def has_term(term: str) -> bool:
+            # Letter-only lookarounds (rather than \b) still match a term
+            # that touches a digit or punctuation, e.g. "total12.00".
+            pattern = rf"(?<![a-z]){re.escape(term)}(?![a-z])"
+            return re.search(pattern, text_lower) is not None
+
+        # VIN document indicators
+        vin_indicators = (
+            "vin",
+            "vehicle identification",
+            "title",
+            "registration",
+            "certificate of title",
+        )
+        if any(has_term(term) for term in vin_indicators):
+            # Additional check: look for 17-character alphanumeric sequences
+            vin_pattern = r"\b[A-HJ-NPR-Z0-9]{17}\b"
+            if re.search(vin_pattern, text.upper()):
+                return DocumentType.VIN
+
+        # Receipt indicators
+        receipt_indicators = (
+            "receipt",
+            "total",
+            "subtotal",
+            "tax",
+            "payment",
+            "invoice",
+            "amount due",
+            "gallons",
+            "price/gallon",
+        )
+        if sum(1 for term in receipt_indicators if has_term(term)) >= 2:
+            return DocumentType.RECEIPT
+
+        # Manual indicators
+        manual_indicators = (
+            "owner's manual",
+            "maintenance schedule",
+            "service interval",
+            "chapter",
+            "table of contents",
+            "specifications",
+        )
+        if any(has_term(term) for term in manual_indicators):
+            return DocumentType.MANUAL
+
+        return DocumentType.UNKNOWN
+
+    def _extract_fields(
+        self, text: str, document_type: DocumentType
+    ) -> dict[str, ExtractedField]:
+        """Extract specific fields based on document type.
+
+        Args:
+            text: Raw OCR text.
+            document_type: Type previously detected for this text.
+
+        Returns:
+            Mapping of field name to ExtractedField. For VIN documents:
+            ``"vin"``. For receipts: any of ``"total"``, ``"date"``,
+            ``"gallons"`` that could be found. Empty for other types or when
+            nothing matches. Field confidences are fixed heuristic values.
+        """
+        import re
+
+        fields: dict[str, ExtractedField] = {}
+
+        if document_type == DocumentType.VIN:
+            # Extract VIN (17 alphanumeric characters, excluding I, O, Q)
+            vin_pattern = r"\b([A-HJ-NPR-Z0-9]{17})\b"
+            match = re.search(vin_pattern, text.upper())
+            if match:
+                fields["vin"] = ExtractedField(value=match.group(1), confidence=0.9)
+
+        elif document_type == DocumentType.RECEIPT:
+            # Extract amounts (currency patterns). Two shapes are accepted:
+            # comma-grouped ("1,234.56") or plain digits ("1234.56"). The
+            # plain form is needed so "$1234.56" is not truncated to "123".
+            amount_pattern = (
+                r"\$\s*(\d{1,3}(?:,\d{3})+(?:\.\d{2})?|\d+(?:\.\d{2})?)"
+            )
+            amounts = re.findall(amount_pattern, text)
+            if amounts:
+                # Last amount is often the total
+                fields["total"] = ExtractedField(
+                    value=f"${amounts[-1]}", confidence=0.7
+                )
+
+            # Extract date: ISO "YYYY-M-D", or "M/D/YY(YY)" / "M-D-YY(YY)"
+            # with one consistent separator. The digit lookarounds stop the
+            # pattern from matching inside a longer run of digits (e.g.
+            # taking "24-01-15" out of "2024-01-15").
+            date_pattern = (
+                r"(?<!\d)"
+                r"(\d{4}-\d{1,2}-\d{1,2}|\d{1,2}([/-])\d{1,2}\2(?:\d{4}|\d{2}))"
+                r"(?!\d)"
+            )
+            date_match = re.search(date_pattern, text)
+            if date_match:
+                fields["date"] = ExtractedField(value=date_match.group(1), confidence=0.8)
+
+            # Extract gallons (for fuel receipts). The trailing \b keeps the
+            # unit from matching inside other words such as "galaxy".
+            gallon_pattern = r"(\d+\.?\d*)\s*(?:gallons?|gals?)\b"
+            gallon_match = re.search(gallon_pattern, text.lower())
+            if gallon_match:
+                fields["gallons"] = ExtractedField(
+                    value=gallon_match.group(1), confidence=0.85
+                )
+
+        return fields
+
+
+# Module-level singleton: constructed once when this module is imported
+# (running OcrService.__init__) and intended to be shared by importers.
+ocr_service = OcrService()