feat: add core OCR API integration (refs #65)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 5m59s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped

OCR Service (Python/FastAPI):
- POST /extract for synchronous OCR extraction
- POST /jobs and GET /jobs/{job_id} for async processing
- Image preprocessing (deskew, denoise) for accuracy
- HEIC conversion via pillow-heif
- Redis job queue for async processing

Backend (Fastify):
- POST /api/ocr/extract - authenticated proxy to OCR
- POST /api/ocr/jobs - async job submission
- GET /api/ocr/jobs/:jobId - job polling
- Multipart file upload handling
- JWT authentication required

File size limits: 10MB sync, 200MB async
Processing time target: <3 seconds for typical photos

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-01 16:02:11 -06:00
parent 94e49306dc
commit 852c9013b5
25 changed files with 1931 additions and 3 deletions

View File

@@ -0,0 +1,6 @@
"""OCR service layer."""
from .job_queue import job_queue
from .ocr_service import ocr_service
from .preprocessor import preprocessor
__all__ = ["job_queue", "ocr_service", "preprocessor"]

View File

@@ -0,0 +1,233 @@
"""Redis-based job queue for async OCR processing."""
import asyncio
import json
import logging
import uuid
from typing import Optional
import redis.asyncio as redis
from app.config import settings
from app.models import JobResponse, JobStatus, OcrResponse
logger = logging.getLogger(__name__)
# Job TTL in seconds (1 hour)
JOB_TTL = 3600
# Key prefixes
JOB_PREFIX = "ocr:job:"
JOB_DATA_PREFIX = "ocr:job:data:"
JOB_RESULT_PREFIX = "ocr:job:result:"
class JobQueue:
    """Manages async OCR jobs using Redis.

    Job state lives under three key families, all expiring after ``JOB_TTL``
    seconds:

    - ``ocr:job:<id>``        -- hash of status / progress / metadata
    - ``ocr:job:data:<id>``   -- raw uploaded file bytes (deleted once processed)
    - ``ocr:job:result:<id>`` -- JSON-serialized ``OcrResponse``
    """

    def __init__(self) -> None:
        """Initialize job queue."""
        # Connection is created lazily on first use; see get_redis().
        self._redis: Optional[redis.Redis] = None

    async def get_redis(self) -> redis.Redis:
        """Get or create Redis connection."""
        if self._redis is None:
            # decode_responses=True: this client yields str values, which is
            # what the metadata/result accessors expect. Binary file payloads
            # are read through a separate non-decoding client in get_job_data().
            self._redis = redis.Redis(
                host=settings.redis_host,
                port=settings.redis_port,
                db=settings.redis_db,
                decode_responses=True,
            )
        return self._redis

    async def close(self) -> None:
        """Close Redis connection."""
        # NOTE(review): redis-py >= 5 deprecates close() in favor of
        # aclose() -- confirm against the pinned redis version.
        if self._redis:
            await self._redis.close()
            self._redis = None

    async def submit_job(
        self,
        file_bytes: bytes,
        content_type: str,
        callback_url: Optional[str] = None,
    ) -> str:
        """
        Submit a new OCR job.

        Args:
            file_bytes: Raw file bytes to process
            content_type: MIME type of the file
            callback_url: Optional URL to call when job completes

        Returns:
            Job ID (a random UUID4 string)
        """
        r = await self.get_redis()
        job_id = str(uuid.uuid4())
        # Job metadata hash; empty string stands in for "no callback" because
        # Redis hashes cannot store None.
        job_meta = {
            "status": JobStatus.PENDING.value,
            "progress": 0,
            "content_type": content_type,
            "callback_url": callback_url or "",
        }
        # File bytes are stored under a separate key so the metadata hash
        # stays small and the payload can be deleted independently.
        data_key = f"{JOB_DATA_PREFIX}{job_id}"
        meta_key = f"{JOB_PREFIX}{job_id}"
        # Pipeline so metadata + payload + TTLs land in one round trip.
        # NOTE(review): awaiting buffered pipeline commands relies on
        # version-specific redis.asyncio behavior (commands queue and return
        # the pipeline); confirm against the pinned redis-py version.
        async with r.pipeline() as pipe:
            await pipe.hset(meta_key, mapping=job_meta)  # type: ignore
            await pipe.expire(meta_key, JOB_TTL)
            await pipe.set(data_key, file_bytes)
            await pipe.expire(data_key, JOB_TTL)
            await pipe.execute()
        logger.info(f"Job {job_id} submitted")
        return job_id

    async def get_job_status(self, job_id: str) -> Optional[JobResponse]:
        """
        Get the status of a job.

        Args:
            job_id: Job ID to check

        Returns:
            JobResponse or None if job doesn't exist (never submitted or
            expired past JOB_TTL)
        """
        r = await self.get_redis()
        meta_key = f"{JOB_PREFIX}{job_id}"
        result_key = f"{JOB_RESULT_PREFIX}{job_id}"
        # An empty hash means the key is missing -- unknown/expired job.
        meta = await r.hgetall(meta_key)  # type: ignore
        if not meta:
            return None
        status = JobStatus(meta.get("status", JobStatus.PENDING.value))
        progress = int(meta.get("progress", 0))
        error = meta.get("error")
        # The stored OcrResponse JSON is only fetched once the job completed.
        result = None
        if status == JobStatus.COMPLETED:
            result_json = await r.get(result_key)
            if result_json:
                result_dict = json.loads(result_json)
                result = OcrResponse(**result_dict)
        # progress/error are surfaced only in the states where they are
        # meaningful (PROCESSING / FAILED respectively).
        return JobResponse(
            jobId=job_id,
            status=status,
            progress=progress if status == JobStatus.PROCESSING else None,
            result=result,
            error=error if status == JobStatus.FAILED else None,
        )

    async def update_job_progress(self, job_id: str, progress: int) -> None:
        """Update job progress percentage and mark the job as PROCESSING."""
        r = await self.get_redis()
        meta_key = f"{JOB_PREFIX}{job_id}"
        await r.hset(meta_key, mapping={  # type: ignore
            "status": JobStatus.PROCESSING.value,
            "progress": progress,
        })

    async def complete_job(self, job_id: str, result: OcrResponse) -> None:
        """Mark job as completed, store the result, and fire any callback."""
        r = await self.get_redis()
        meta_key = f"{JOB_PREFIX}{job_id}"
        result_key = f"{JOB_RESULT_PREFIX}{job_id}"
        data_key = f"{JOB_DATA_PREFIX}{job_id}"
        # by_alias=True keeps the stored JSON keys consistent with the API's
        # camelCase schema, matching what get_job_status re-hydrates.
        result_dict = result.model_dump(by_alias=True)
        result_json = json.dumps(result_dict)
        async with r.pipeline() as pipe:
            await pipe.hset(meta_key, mapping={  # type: ignore
                "status": JobStatus.COMPLETED.value,
                "progress": 100,
            })
            await pipe.set(result_key, result_json)
            await pipe.expire(result_key, JOB_TTL)
            # The uploaded file is no longer needed once a result exists.
            await pipe.delete(data_key)
            await pipe.execute()
        logger.info(f"Job {job_id} completed")
        # Trigger the optional completion callback configured at submit time.
        meta = await r.hgetall(meta_key)  # type: ignore
        callback_url = meta.get("callback_url")
        if callback_url:
            # Fire-and-forget callback (don't block job completion).
            # NOTE(review): the task reference is not retained, so asyncio may
            # garbage-collect it before it runs -- consider keeping a reference.
            asyncio.create_task(self._send_callback(callback_url, job_id, result_dict))

    async def fail_job(self, job_id: str, error: str) -> None:
        """Mark job as failed with error message and drop the file payload."""
        r = await self.get_redis()
        meta_key = f"{JOB_PREFIX}{job_id}"
        data_key = f"{JOB_DATA_PREFIX}{job_id}"
        async with r.pipeline() as pipe:
            await pipe.hset(meta_key, mapping={  # type: ignore
                "status": JobStatus.FAILED.value,
                "error": error,
            })
            # Failed jobs will not be retried from stored bytes, so free them.
            await pipe.delete(data_key)
            await pipe.execute()
        logger.error(f"Job {job_id} failed: {error}")

    async def get_job_data(self, job_id: str) -> Optional[bytes]:
        """Get the raw file bytes for a job, or None if absent/expired."""
        r = await self.get_redis()
        data_key = f"{JOB_DATA_PREFIX}{job_id}"
        # A dedicated non-decoding connection is opened per call because the
        # shared client has decode_responses=True and would corrupt binary
        # data. NOTE(review): this adds a connect/teardown per fetch --
        # consider caching a second raw client if this path gets hot.
        raw_redis = redis.Redis(
            host=settings.redis_host,
            port=settings.redis_port,
            db=settings.redis_db,
            decode_responses=False,
        )
        try:
            data = await raw_redis.get(data_key)
            return data  # type: ignore
        finally:
            await raw_redis.close()

    async def _send_callback(
        self, url: str, job_id: str, result: dict
    ) -> None:
        """POST a completion notification to the job's callback URL.

        Errors are logged and swallowed: a broken callback endpoint must not
        affect the completed job.
        """
        try:
            # Imported lazily so httpx is only required when callbacks are used.
            import httpx
            async with httpx.AsyncClient(timeout=10.0) as client:
                await client.post(
                    url,
                    json={"jobId": job_id, "result": result},
                )
            logger.info(f"Callback sent for job {job_id}")
        except Exception as e:
            logger.error(f"Callback failed for job {job_id}: {e}")
# Module-level singleton shared by the API routes and the worker loop.
job_queue = JobQueue()

View File

@@ -0,0 +1,275 @@
"""Core OCR service using Tesseract with HEIC support."""
import io
import logging
import re
import time
from typing import Optional

import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener

from app.config import settings
from app.models import DocumentType, ExtractedField, OcrResponse
from app.services.preprocessor import preprocessor
# Register HEIF/HEIC opener with Pillow
register_heif_opener()
logger = logging.getLogger(__name__)
class OcrService:
"""Core OCR processing service."""
# Supported MIME types
SUPPORTED_TYPES = {
"image/jpeg",
"image/png",
"image/heic",
"image/heif",
"application/pdf",
}
def __init__(self) -> None:
"""Initialize OCR service."""
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
def extract(
self,
file_bytes: bytes,
content_type: Optional[str] = None,
preprocess: bool = True,
) -> OcrResponse:
"""
Extract text from an image file.
Args:
file_bytes: Raw file bytes
content_type: MIME type (optional, will be detected if not provided)
preprocess: Whether to apply preprocessing
Returns:
OcrResponse with extracted text and metadata
"""
start_time = time.time()
# Detect file type if not provided
if not content_type:
content_type = self._detect_mime_type(file_bytes)
# Validate file type
if content_type not in self.SUPPORTED_TYPES:
return OcrResponse(
success=False,
documentType=DocumentType.UNKNOWN,
rawText="",
confidence=0.0,
extractedFields={},
processingTimeMs=int((time.time() - start_time) * 1000),
)
try:
# Convert HEIC/HEIF to standard format
if content_type in ("image/heic", "image/heif"):
file_bytes = self._convert_heic(file_bytes)
content_type = "image/png"
# Handle PDF (extract first page as image)
if content_type == "application/pdf":
file_bytes = self._extract_pdf_first_page(file_bytes)
content_type = "image/png"
# Apply preprocessing if enabled
if preprocess:
file_bytes = preprocessor.preprocess(
file_bytes, deskew=True, denoise=True
)
# Perform OCR
image = Image.open(io.BytesIO(file_bytes))
ocr_data = pytesseract.image_to_data(
image, output_type=pytesseract.Output.DICT
)
# Extract text and calculate confidence
raw_text, confidence = self._process_ocr_data(ocr_data)
# Detect document type from content
document_type = self._detect_document_type(raw_text)
# Extract fields based on document type
extracted_fields = self._extract_fields(raw_text, document_type)
processing_time_ms = int((time.time() - start_time) * 1000)
logger.info(
f"OCR completed: {len(raw_text)} chars, "
f"{confidence:.2%} confidence, {processing_time_ms}ms"
)
return OcrResponse(
success=True,
documentType=document_type,
rawText=raw_text,
confidence=confidence,
extractedFields=extracted_fields,
processingTimeMs=processing_time_ms,
)
except Exception as e:
logger.error(f"OCR extraction failed: {e}", exc_info=True)
return OcrResponse(
success=False,
documentType=DocumentType.UNKNOWN,
rawText="",
confidence=0.0,
extractedFields={},
processingTimeMs=int((time.time() - start_time) * 1000),
)
def _detect_mime_type(self, file_bytes: bytes) -> str:
"""Detect MIME type using python-magic."""
mime = magic.Magic(mime=True)
detected = mime.from_buffer(file_bytes)
return detected or "application/octet-stream"
def _convert_heic(self, heic_bytes: bytes) -> bytes:
"""Convert HEIC/HEIF to PNG format."""
# pillow-heif registers itself with PIL, so we can open HEIC directly
image = Image.open(io.BytesIO(heic_bytes))
buffer = io.BytesIO()
image.save(buffer, format="PNG")
return buffer.getvalue()
def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes:
"""Extract first page of PDF as PNG image."""
try:
# Use pdf2image if available, otherwise return empty
from pdf2image import convert_from_bytes
images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=300)
if images:
buffer = io.BytesIO()
images[0].save(buffer, format="PNG")
return buffer.getvalue()
except ImportError:
logger.warning("pdf2image not available, PDF support limited")
except Exception as e:
logger.error(f"PDF extraction failed: {e}")
return b""
def _process_ocr_data(
self, ocr_data: dict
) -> tuple[str, float]:
"""Process Tesseract output to extract text and confidence."""
words = []
confidences = []
for i, text in enumerate(ocr_data["text"]):
# Filter out empty strings and low-confidence results
conf = int(ocr_data["conf"][i])
if text.strip() and conf > 0:
words.append(text)
confidences.append(conf)
raw_text = " ".join(words)
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
# Normalize confidence to 0-1 range (Tesseract returns 0-100)
return raw_text, avg_confidence / 100.0
def _detect_document_type(self, text: str) -> DocumentType:
"""Detect document type from extracted text content."""
text_lower = text.lower()
# VIN document indicators
vin_indicators = [
"vin",
"vehicle identification",
"title",
"registration",
"certificate of title",
]
if any(indicator in text_lower for indicator in vin_indicators):
# Additional check: look for 17-character alphanumeric sequences
import re
vin_pattern = r"\b[A-HJ-NPR-Z0-9]{17}\b"
if re.search(vin_pattern, text.upper()):
return DocumentType.VIN
# Receipt indicators
receipt_indicators = [
"receipt",
"total",
"subtotal",
"tax",
"payment",
"invoice",
"amount due",
"gallons",
"price/gallon",
]
if sum(1 for ind in receipt_indicators if ind in text_lower) >= 2:
return DocumentType.RECEIPT
# Manual indicators
manual_indicators = [
"owner's manual",
"maintenance schedule",
"service interval",
"chapter",
"table of contents",
"specifications",
]
if any(indicator in text_lower for indicator in manual_indicators):
return DocumentType.MANUAL
return DocumentType.UNKNOWN
def _extract_fields(
self, text: str, document_type: DocumentType
) -> dict[str, ExtractedField]:
"""Extract specific fields based on document type."""
import re
fields: dict[str, ExtractedField] = {}
if document_type == DocumentType.VIN:
# Extract VIN (17 alphanumeric characters, excluding I, O, Q)
vin_pattern = r"\b([A-HJ-NPR-Z0-9]{17})\b"
match = re.search(vin_pattern, text.upper())
if match:
fields["vin"] = ExtractedField(value=match.group(1), confidence=0.9)
elif document_type == DocumentType.RECEIPT:
# Extract amounts (currency patterns)
amount_pattern = r"\$\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)"
amounts = re.findall(amount_pattern, text)
if amounts:
# Last amount is often the total
fields["total"] = ExtractedField(
value=f"${amounts[-1]}", confidence=0.7
)
# Extract date
date_pattern = r"(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})"
date_match = re.search(date_pattern, text)
if date_match:
fields["date"] = ExtractedField(value=date_match.group(1), confidence=0.8)
# Extract gallons (for fuel receipts)
gallon_pattern = r"(\d+\.?\d*)\s*(?:gal|gallons)"
gallon_match = re.search(gallon_pattern, text.lower())
if gallon_match:
fields["gallons"] = ExtractedField(
value=gallon_match.group(1), confidence=0.85
)
return fields
# Module-level singleton used by the API route handlers and the job worker.
ocr_service = OcrService()

View File

@@ -0,0 +1,176 @@
"""Image preprocessing service for OCR accuracy improvement."""
import io
import logging
from typing import Optional
import cv2
import numpy as np
from PIL import Image
logger = logging.getLogger(__name__)
class ImagePreprocessor:
    """Handles image preprocessing for improved OCR accuracy.

    The pipeline converts the input to grayscale, then optionally denoises,
    deskews, and binarizes. Every step degrades gracefully: on an OpenCV
    error the unmodified image is passed through.
    """

    def preprocess(
        self,
        image_bytes: bytes,
        deskew: bool = True,
        denoise: bool = True,
        binarize: bool = False,
    ) -> bytes:
        """
        Apply preprocessing to an image for better OCR results.

        Args:
            image_bytes: Raw image bytes
            deskew: Whether to correct image rotation
            denoise: Whether to apply noise reduction
            binarize: Whether to convert to black and white

        Returns:
            Preprocessed grayscale image encoded as PNG bytes
        """
        # Decode via PIL, then hand off to OpenCV as a numpy array.
        # NOTE(review): EXIF orientation is not applied here -- confirm
        # whether rotated phone photos are normalized upstream.
        pil_image = Image.open(io.BytesIO(image_bytes))
        # Convert to RGB if necessary (handles RGBA, palette, CMYK, etc.);
        # "L" (grayscale) is already fine as-is.
        if pil_image.mode not in ("RGB", "L"):
            pil_image = pil_image.convert("RGB")
        cv_image = np.array(pil_image)
        # RGB -> BGR: OpenCV's channel order (only for 3-channel images).
        if len(cv_image.shape) == 3:
            cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
        # All downstream steps operate on a single-channel image.
        if len(cv_image.shape) == 3:
            gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
        else:
            gray = cv_image
        # Denoise before deskewing so edge detection sees cleaner input.
        if denoise:
            gray = self._denoise(gray)
        if deskew:
            gray = self._deskew(gray)
        # Binarization is opt-in: it helps some documents but can hurt photos.
        if binarize:
            gray = self._binarize(gray)
        # Re-encode the processed grayscale array as PNG.
        result_image = Image.fromarray(gray)
        buffer = io.BytesIO()
        result_image.save(buffer, format="PNG")
        return buffer.getvalue()

    def _denoise(self, image: np.ndarray) -> np.ndarray:
        """Apply noise reduction using non-local means denoising.

        Falls back to the unmodified image if OpenCV rejects the input.
        """
        try:
            # fastNlMeansDenoising is effective for grayscale images;
            # h=10 is a moderate filter strength.
            return cv2.fastNlMeansDenoising(image, h=10, templateWindowSize=7, searchWindowSize=21)
        except cv2.error as e:
            logger.warning(f"Denoising failed: {e}")
            return image

    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """Correct image rotation using Hough transform line detection.

        Estimates the skew as the median angle of near-horizontal detected
        lines and rotates to compensate. Returns the input unchanged when no
        usable lines are found, the skew is negligible (< 0.5 deg), or it is
        implausibly large (> 15 deg, likely a mis-detection).
        """
        try:
            # Edge map feeds the probabilistic Hough transform.
            edges = cv2.Canny(image, 50, 150, apertureSize=3)
            lines = cv2.HoughLinesP(
                edges,
                rho=1,
                theta=np.pi / 180,
                threshold=100,
                minLineLength=100,
                maxLineGap=10,
            )
            if lines is None:
                return image
            # Collect the angle of each detected segment.
            angles = []
            for line in lines:
                x1, y1, x2, y2 = line[0]
                if x2 - x1 != 0:  # Avoid division by zero
                    angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
                    # Only consider nearly horizontal lines (within 45 degrees)
                    # -- text baselines, not vertical rules.
                    if -45 < angle < 45:
                        angles.append(angle)
            if not angles:
                return image
            # Median is robust against outlier segments.
            median_angle = np.median(angles)
            # Only correct if skew is significant but not too extreme.
            if abs(median_angle) < 0.5 or abs(median_angle) > 15:
                return image
            # Rotate about the center to correct the skew.
            # NOTE(review): rotation sign assumes image-coordinate angles
            # (y down) from HoughLinesP -- verify with a known-skewed sample.
            height, width = image.shape[:2]
            center = (width // 2, height // 2)
            rotation_matrix = cv2.getRotationMatrix2D(center, median_angle, 1.0)
            # Expand the canvas so the rotated image is not cropped, and
            # shift the transform to re-center it.
            cos_val = abs(rotation_matrix[0, 0])
            sin_val = abs(rotation_matrix[0, 1])
            new_width = int(height * sin_val + width * cos_val)
            new_height = int(height * cos_val + width * sin_val)
            rotation_matrix[0, 2] += (new_width - width) / 2
            rotation_matrix[1, 2] += (new_height - height) / 2
            rotated = cv2.warpAffine(
                image,
                rotation_matrix,
                (new_width, new_height),
                # Replicate edge pixels instead of introducing black borders
                # that could confuse OCR.
                borderMode=cv2.BORDER_REPLICATE,
            )
            logger.debug(f"Deskewed image by {median_angle:.2f} degrees")
            return rotated
        except Exception as e:
            logger.warning(f"Deskewing failed: {e}")
            return image

    def _binarize(self, image: np.ndarray) -> np.ndarray:
        """Convert to binary (black and white) using adaptive thresholding.

        Adaptive (per-neighborhood) thresholding tolerates uneven lighting
        better than a single global threshold. Falls back to the input on
        OpenCV errors.
        """
        try:
            return cv2.adaptiveThreshold(
                image,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                blockSize=11,
                C=2,
            )
        except cv2.error as e:
            logger.warning(f"Binarization failed: {e}")
            return image

    def get_image_info(self, image_bytes: bytes) -> dict:
        """Get basic information (dimensions, mode, format) about an image."""
        pil_image = Image.open(io.BytesIO(image_bytes))
        return {
            "width": pil_image.width,
            "height": pil_image.height,
            "mode": pil_image.mode,
            "format": pil_image.format,
        }
# Module-level singleton consumed by OcrService.extract().
preprocessor = ImagePreprocessor()