# motovaultpro/ocr/app/routers/extract.py

"""OCR extraction endpoints."""
import logging

from fastapi import APIRouter, File, HTTPException, Query, UploadFile

from app.models import OcrResponse
from app.services import ocr_service

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Every route declared on this router is served under the /extract prefix
# and carries the "extract" tag.
router = APIRouter(prefix="/extract", tags=["extract"])

# Maximum upload size, in bytes, accepted for synchronous processing
# (10 * 1024 * 1024 bytes). Larger uploads are rejected with HTTP 413.
MAX_SYNC_SIZE = 10 * 1024 * 1024


@router.post("", response_model=OcrResponse)
async def extract_text(
    file: UploadFile = File(..., description="Image file to process"),
    preprocess: bool = Query(True, description="Apply image preprocessing"),
) -> OcrResponse:
    """
    Extract text from an uploaded image using OCR.

    Supports HEIC, JPEG, PNG, and PDF (first page only) formats.
    Processing time target: <3 seconds for typical photos.

    - **file**: Image file (max 10MB for sync processing)
    - **preprocess**: Whether to apply deskew/denoise preprocessing (default: true)

    Error responses:

    - **400**: No filename was supplied, or the uploaded file is empty
    - **413**: The file exceeds the synchronous size limit (use /jobs instead)
    - **422**: The OCR service reported that extraction did not succeed
    """
    # Validate file presence
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    # Read at most one byte past the limit: that is enough to detect an
    # oversized upload without copying the whole thing into memory first.
    content = await file.read(MAX_SYNC_SIZE + 1)
    file_size = len(content)

    # Validate file size
    if file_size > MAX_SYNC_SIZE:
        raise HTTPException(
            status_code=413,
            detail=f"File too large for sync processing. Max: {MAX_SYNC_SIZE // (1024*1024)}MB. Use /jobs for larger files.",
        )

    if file_size == 0:
        raise HTTPException(status_code=400, detail="Empty file provided")

    # Lazy %-style arguments: the message is only formatted when the record
    # is actually emitted.
    logger.info(
        "Processing file: %s, size: %d bytes, content_type: %s",
        file.filename,
        file_size,
        file.content_type,
    )

    # Perform OCR extraction.
    # NOTE(review): this is a plain synchronous call inside an async handler,
    # so it occupies the event loop thread until it returns. Consider
    # offloading it to a worker thread once the OCR service's thread-safety
    # has been confirmed.
    result = ocr_service.extract(
        file_bytes=content,
        content_type=file.content_type,
        preprocess=preprocess,
    )

    if not result.success:
        logger.warning("OCR extraction failed for %s", file.filename)
        raise HTTPException(
            status_code=422,
            detail="Failed to extract text from image. Ensure the file is a valid image format.",
        )

    return result