feat: add core OCR API integration (refs #65)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 5m59s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped

OCR Service (Python/FastAPI):
- POST /extract for synchronous OCR extraction
- POST /jobs and GET /jobs/{job_id} for async processing
- Image preprocessing (deskew, denoise) for accuracy
- HEIC conversion via pillow-heif
- Redis job queue for async processing

Backend (Fastify):
- POST /api/ocr/extract - authenticated proxy to OCR
- POST /api/ocr/jobs - async job submission
- GET /api/ocr/jobs/:jobId - job polling
- Multipart file upload handling
- JWT authentication required

File size limits: 10MB sync, 200MB async
Processing time target: <3 seconds for typical photos

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-01 16:02:11 -06:00
parent 94e49306dc
commit 852c9013b5
25 changed files with 1931 additions and 3 deletions

View File

@@ -11,5 +11,10 @@ class Settings:
self.port: int = int(os.getenv("PORT", "8000"))
self.tesseract_cmd: str = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
# Redis configuration for job queue
self.redis_host: str = os.getenv("REDIS_HOST", "mvp-redis")
self.redis_port: int = int(os.getenv("REDIS_PORT", "6379"))
self.redis_db: int = int(os.getenv("REDIS_DB", "1"))
settings = Settings()

View File

@@ -1,14 +1,44 @@
"""OCR Service FastAPI Application."""
import logging
from contextlib import asynccontextmanager
from typing import AsyncIterator
from fastapi import FastAPI
from app.config import settings
from app.routers import extract_router, jobs_router
from app.services import job_queue
# Configure logging once at import time so every module-level logger
# created elsewhere in the service inherits the same level and format.
logging.basicConfig(
    # Invalid LOG_LEVEL values silently fall back to INFO via getattr default.
    level=getattr(logging, settings.log_level.upper(), logging.INFO),
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
    """Application lifespan handler for startup/shutdown.

    Code before ``yield`` runs once at startup; code after it runs at
    shutdown, where the shared Redis connection held by the job queue
    is released.
    """
    # Startup
    logger.info("OCR service starting up")
    yield
    # Shutdown: close the job queue's Redis connection cleanly.
    logger.info("OCR service shutting down")
    await job_queue.close()
# FastAPI application instance; `lifespan` wires the startup/shutdown hooks above.
app = FastAPI(
    title="MotoVaultPro OCR Service",
    description="OCR processing service for vehicle documents",
    version="1.0.0",
    lifespan=lifespan,
)

# Include routers
app.include_router(extract_router)  # POST /extract (synchronous OCR)
app.include_router(jobs_router)  # POST /jobs, GET /jobs/{job_id} (async OCR)
@app.get("/health")
async def health_check() -> dict:
@@ -23,4 +53,9 @@ async def root() -> dict:
"service": "mvp-ocr",
"version": "1.0.0",
"log_level": settings.log_level,
"endpoints": [
"POST /extract - Synchronous OCR extraction",
"POST /jobs - Submit async OCR job",
"GET /jobs/{job_id} - Get async job status",
],
}

View File

@@ -0,0 +1,18 @@
"""Pydantic models for OCR service."""
from .schemas import (
DocumentType,
ExtractedField,
JobResponse,
JobStatus,
JobSubmitRequest,
OcrResponse,
)
__all__ = [
"DocumentType",
"ExtractedField",
"JobResponse",
"JobStatus",
"JobSubmitRequest",
"OcrResponse",
]

65
ocr/app/models/schemas.py Normal file
View File

@@ -0,0 +1,65 @@
"""Pydantic models for OCR API request/response validation."""
from enum import Enum
from typing import Optional
from pydantic import BaseModel, Field
class DocumentType(str, Enum):
    """Types of documents that can be processed.

    str-subclassed so values serialize as plain strings in JSON responses.
    """

    VIN = "vin"  # vehicle title/registration containing a VIN
    RECEIPT = "receipt"  # purchase/fuel receipt
    MANUAL = "manual"  # owner's/service manual page
    UNKNOWN = "unknown"  # classification failed or unsupported input
class ExtractedField(BaseModel):
    """A single extracted field with confidence score."""

    value: str  # raw extracted text for the field
    confidence: float = Field(ge=0.0, le=1.0)  # normalized 0.0-1.0
class OcrResponse(BaseModel):
    """Response from OCR extraction.

    Field aliases expose camelCase names to the JS backend; with
    populate_by_name enabled, instances may be constructed using either
    the snake_case attribute names or the aliases.
    """

    success: bool  # False when the file type is unsupported or OCR failed
    document_type: DocumentType = Field(alias="documentType")
    raw_text: str = Field(alias="rawText")  # whitespace-joined OCR text
    confidence: float = Field(ge=0.0, le=1.0)  # mean word confidence, 0-1
    extracted_fields: dict[str, ExtractedField] = Field(
        default_factory=dict, alias="extractedFields"
    )
    processing_time_ms: int = Field(alias="processingTimeMs")

    model_config = {"populate_by_name": True}
class JobStatus(str, Enum):
    """Status of an async OCR job."""

    PENDING = "pending"  # queued, not yet picked up
    PROCESSING = "processing"  # worker is running OCR
    COMPLETED = "completed"  # finished successfully; result available
    FAILED = "failed"  # finished with an error message
class JobResponse(BaseModel):
    """Response for async job status."""

    job_id: str = Field(alias="jobId")
    status: JobStatus
    # Only populated while status == PROCESSING (percentage 0-100).
    progress: Optional[int] = Field(default=None, ge=0, le=100)
    result: Optional[OcrResponse] = None  # set when status == COMPLETED
    error: Optional[str] = None  # set when status == FAILED

    model_config = {"populate_by_name": True}
class JobSubmitRequest(BaseModel):
    """Request to submit an async OCR job."""

    # Optional webhook invoked when the job completes.
    callback_url: Optional[str] = Field(default=None, alias="callbackUrl")

    model_config = {"populate_by_name": True}

View File

@@ -0,0 +1,5 @@
"""OCR API routers."""
from .extract import router as extract_router
from .jobs import router as jobs_router
__all__ = ["extract_router", "jobs_router"]

View File

@@ -0,0 +1,69 @@
"""OCR extraction endpoints."""
import logging
from fastapi import APIRouter, File, HTTPException, Query, UploadFile
from app.models import OcrResponse
from app.services import ocr_service
logger = logging.getLogger(__name__)

router = APIRouter(prefix="/extract", tags=["extract"])

# Maximum file size for synchronous processing (10MB);
# larger uploads must go through the async /jobs endpoint.
MAX_SYNC_SIZE = 10 * 1024 * 1024
@router.post("", response_model=OcrResponse)
async def extract_text(
file: UploadFile = File(..., description="Image file to process"),
preprocess: bool = Query(True, description="Apply image preprocessing"),
) -> OcrResponse:
"""
Extract text from an uploaded image using OCR.
Supports HEIC, JPEG, PNG, and PDF (first page only) formats.
Processing time target: <3 seconds for typical photos.
- **file**: Image file (max 10MB for sync processing)
- **preprocess**: Whether to apply deskew/denoise preprocessing (default: true)
"""
# Validate file presence
if not file.filename:
raise HTTPException(status_code=400, detail="No file provided")
# Read file content
content = await file.read()
file_size = len(content)
# Validate file size
if file_size > MAX_SYNC_SIZE:
raise HTTPException(
status_code=413,
detail=f"File too large for sync processing. Max: {MAX_SYNC_SIZE // (1024*1024)}MB. Use /jobs for larger files.",
)
if file_size == 0:
raise HTTPException(status_code=400, detail="Empty file provided")
logger.info(
f"Processing file: {file.filename}, "
f"size: {file_size} bytes, "
f"content_type: {file.content_type}"
)
# Perform OCR extraction
result = ocr_service.extract(
file_bytes=content,
content_type=file.content_type,
preprocess=preprocess,
)
if not result.success:
logger.warning(f"OCR extraction failed for {file.filename}")
raise HTTPException(
status_code=422,
detail="Failed to extract text from image. Ensure the file is a valid image format.",
)
return result

142
ocr/app/routers/jobs.py Normal file
View File

@@ -0,0 +1,142 @@
"""Async OCR job endpoints."""
import asyncio
import logging
from typing import Optional
from fastapi import APIRouter, BackgroundTasks, File, Form, HTTPException, UploadFile
from app.models import JobResponse, JobSubmitRequest
from app.services import job_queue, ocr_service
logger = logging.getLogger(__name__)

router = APIRouter(prefix="/jobs", tags=["jobs"])

# Maximum file size for async processing (200MB)
MAX_ASYNC_SIZE = 200 * 1024 * 1024
@router.post("", response_model=JobResponse)
async def submit_job(
background_tasks: BackgroundTasks,
file: UploadFile = File(..., description="Image file to process"),
callback_url: Optional[str] = Form(None, description="URL to call when job completes"),
) -> JobResponse:
"""
Submit an async OCR job for large files.
Use this endpoint for files larger than 10MB or when you don't want to wait
for processing to complete. Poll GET /jobs/{job_id} for status.
- **file**: Image file (max 200MB)
- **callback_url**: Optional webhook URL to receive job completion notification
"""
# Validate file presence
if not file.filename:
raise HTTPException(status_code=400, detail="No file provided")
# Read file content
content = await file.read()
file_size = len(content)
# Validate file size
if file_size > MAX_ASYNC_SIZE:
raise HTTPException(
status_code=413,
detail=f"File too large. Max: {MAX_ASYNC_SIZE // (1024*1024)}MB.",
)
if file_size == 0:
raise HTTPException(status_code=400, detail="Empty file provided")
logger.info(
f"Submitting async job: {file.filename}, "
f"size: {file_size} bytes, "
f"content_type: {file.content_type}"
)
# Submit job to queue
job_id = await job_queue.submit_job(
file_bytes=content,
content_type=file.content_type or "application/octet-stream",
callback_url=callback_url,
)
# Schedule background processing
background_tasks.add_task(process_job, job_id)
# Return initial status
return JobResponse(
jobId=job_id,
status="pending",
progress=0,
)
@router.get("/{job_id}", response_model=JobResponse)
async def get_job_status(job_id: str) -> JobResponse:
"""
Get the status of an async OCR job.
Poll this endpoint to check job progress and retrieve results.
Returns:
- **pending**: Job is queued
- **processing**: Job is being processed (includes progress %)
- **completed**: Job finished successfully (includes result)
- **failed**: Job failed (includes error message)
"""
result = await job_queue.get_job_status(job_id)
if result is None:
raise HTTPException(
status_code=404,
detail=f"Job {job_id} not found. Jobs expire after 1 hour.",
)
return result
async def process_job(job_id: str) -> None:
    """Background task that runs OCR for a queued job.

    Pulls the stored file bytes from Redis, runs Tesseract in a thread-pool
    executor (CPU-bound work must not block the event loop), and records
    completion or failure back on the job. Any unexpected exception marks
    the job failed with the exception message.
    """
    logger.info(f"Starting job {job_id}")
    try:
        # Update status to processing
        await job_queue.update_job_progress(job_id, 10)

        # Get job data
        file_bytes = await job_queue.get_job_data(job_id)
        if not file_bytes:
            await job_queue.fail_job(job_id, "Job data not found")
            return

        await job_queue.update_job_progress(job_id, 30)

        # Existence check: bail out if the job metadata expired meanwhile.
        # (The stored content_type is not forwarded; ocr_service detects the
        # MIME type from the bytes when none is supplied.)
        status = await job_queue.get_job_status(job_id)
        if not status:
            return

        # Perform OCR in thread pool (CPU-bound operation).
        await job_queue.update_job_progress(job_id, 50)
        # get_running_loop() is the supported API inside a coroutine;
        # get_event_loop() is deprecated here since Python 3.10.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,
            lambda: ocr_service.extract(
                file_bytes=file_bytes,
                preprocess=True,
            ),
        )

        await job_queue.update_job_progress(job_id, 90)

        if result.success:
            await job_queue.complete_job(job_id, result)
        else:
            await job_queue.fail_job(job_id, "OCR extraction failed")
    except Exception as e:
        logger.error(f"Job {job_id} failed: {e}", exc_info=True)
        await job_queue.fail_job(job_id, str(e))

View File

@@ -0,0 +1,6 @@
"""OCR service layer."""
from .job_queue import job_queue
from .ocr_service import ocr_service
from .preprocessor import preprocessor
__all__ = ["job_queue", "ocr_service", "preprocessor"]

View File

@@ -0,0 +1,233 @@
"""Redis-based job queue for async OCR processing."""
import asyncio
import json
import logging
import uuid
from typing import Optional
import redis.asyncio as redis
from app.config import settings
from app.models import JobResponse, JobStatus, OcrResponse
logger = logging.getLogger(__name__)

# Job TTL in seconds (1 hour) — metadata, payload, and result keys all
# expire on this schedule, which is why 404 responses mention the 1h limit.
JOB_TTL = 3600

# Key prefixes
JOB_PREFIX = "ocr:job:"  # hash: status/progress/content_type/callback_url/error
JOB_DATA_PREFIX = "ocr:job:data:"  # raw uploaded bytes (binary)
JOB_RESULT_PREFIX = "ocr:job:result:"  # JSON-serialized OcrResponse
class JobQueue:
    """Manages async OCR jobs using Redis.

    Per-job key layout (all keys expire after JOB_TTL seconds):
      - ``ocr:job:<id>``        hash: status/progress/content_type/callback_url/error
      - ``ocr:job:data:<id>``   raw file bytes, deleted once the job finishes
      - ``ocr:job:result:<id>`` JSON-serialized OcrResponse for completed jobs
    """

    def __init__(self) -> None:
        """Initialize job queue."""
        # Lazily-created shared connection. It uses decode_responses=True,
        # so values come back as str; binary payload reads go through a
        # separate raw client in get_job_data().
        self._redis: Optional[redis.Redis] = None

    async def get_redis(self) -> redis.Redis:
        """Get or create Redis connection."""
        if self._redis is None:
            self._redis = redis.Redis(
                host=settings.redis_host,
                port=settings.redis_port,
                db=settings.redis_db,
                decode_responses=True,
            )
        return self._redis

    async def close(self) -> None:
        """Close Redis connection."""
        # NOTE(review): redis-py 5+ prefers aclose(); close() still works
        # but is a deprecated alias — confirm against the pinned version.
        if self._redis:
            await self._redis.close()
            self._redis = None

    async def submit_job(
        self,
        file_bytes: bytes,
        content_type: str,
        callback_url: Optional[str] = None,
    ) -> str:
        """
        Submit a new OCR job.

        Args:
            file_bytes: Raw file bytes to process
            content_type: MIME type of the file
            callback_url: Optional URL to call when job completes

        Returns:
            Job ID
        """
        r = await self.get_redis()
        job_id = str(uuid.uuid4())

        # Store job metadata
        job_meta = {
            "status": JobStatus.PENDING.value,
            "progress": 0,
            "content_type": content_type,
            # Empty string stands in for "no callback" (Redis hashes
            # cannot store None).
            "callback_url": callback_url or "",
        }

        # Store file data separately (binary)
        data_key = f"{JOB_DATA_PREFIX}{job_id}"
        meta_key = f"{JOB_PREFIX}{job_id}"

        # Use pipeline for atomic operation
        async with r.pipeline() as pipe:
            # Store metadata as hash
            await pipe.hset(meta_key, mapping=job_meta)  # type: ignore
            await pipe.expire(meta_key, JOB_TTL)
            # Store binary data
            await pipe.set(data_key, file_bytes)
            await pipe.expire(data_key, JOB_TTL)
            await pipe.execute()

        logger.info(f"Job {job_id} submitted")
        return job_id

    async def get_job_status(self, job_id: str) -> Optional[JobResponse]:
        """
        Get the status of a job.

        Args:
            job_id: Job ID to check

        Returns:
            JobResponse or None if job doesn't exist
        """
        r = await self.get_redis()
        meta_key = f"{JOB_PREFIX}{job_id}"
        result_key = f"{JOB_RESULT_PREFIX}{job_id}"

        # Get job metadata; an empty hash means unknown or expired job.
        meta = await r.hgetall(meta_key)  # type: ignore
        if not meta:
            return None

        status = JobStatus(meta.get("status", JobStatus.PENDING.value))
        progress = int(meta.get("progress", 0))
        error = meta.get("error")

        # Get result if completed
        result = None
        if status == JobStatus.COMPLETED:
            result_json = await r.get(result_key)
            if result_json:
                result_dict = json.loads(result_json)
                result = OcrResponse(**result_dict)

        # progress/error only surface in the states where they are meaningful.
        return JobResponse(
            jobId=job_id,
            status=status,
            progress=progress if status == JobStatus.PROCESSING else None,
            result=result,
            error=error if status == JobStatus.FAILED else None,
        )

    async def update_job_progress(self, job_id: str, progress: int) -> None:
        """Update job progress percentage (also forces status to PROCESSING)."""
        r = await self.get_redis()
        meta_key = f"{JOB_PREFIX}{job_id}"
        await r.hset(meta_key, mapping={  # type: ignore
            "status": JobStatus.PROCESSING.value,
            "progress": progress,
        })

    async def complete_job(self, job_id: str, result: OcrResponse) -> None:
        """Mark job as completed with result."""
        r = await self.get_redis()
        meta_key = f"{JOB_PREFIX}{job_id}"
        result_key = f"{JOB_RESULT_PREFIX}{job_id}"
        data_key = f"{JOB_DATA_PREFIX}{job_id}"

        # Store result serialized with camelCase aliases (matches API output).
        result_dict = result.model_dump(by_alias=True)
        result_json = json.dumps(result_dict)

        async with r.pipeline() as pipe:
            # Update status
            await pipe.hset(meta_key, mapping={  # type: ignore
                "status": JobStatus.COMPLETED.value,
                "progress": 100,
            })
            # Store result
            await pipe.set(result_key, result_json)
            await pipe.expire(result_key, JOB_TTL)
            # Delete file data (no longer needed)
            await pipe.delete(data_key)
            await pipe.execute()

        logger.info(f"Job {job_id} completed")

        # Trigger the webhook callback if one was configured at submission.
        meta = await r.hgetall(meta_key)  # type: ignore
        callback_url = meta.get("callback_url")
        if callback_url:
            # Fire-and-forget callback (don't block)
            # NOTE(review): the task reference is not retained — asyncio may
            # garbage-collect it before completion; consider keeping a handle.
            asyncio.create_task(self._send_callback(callback_url, job_id, result_dict))

    async def fail_job(self, job_id: str, error: str) -> None:
        """Mark job as failed with error message."""
        r = await self.get_redis()
        meta_key = f"{JOB_PREFIX}{job_id}"
        data_key = f"{JOB_DATA_PREFIX}{job_id}"

        async with r.pipeline() as pipe:
            await pipe.hset(meta_key, mapping={  # type: ignore
                "status": JobStatus.FAILED.value,
                "error": error,
            })
            # Delete file data
            await pipe.delete(data_key)
            await pipe.execute()

        logger.error(f"Job {job_id} failed: {error}")

    async def get_job_data(self, job_id: str) -> Optional[bytes]:
        """Get the file data for a job.

        Uses a dedicated client with decode_responses=False because the
        shared connection would decode the binary payload to str.
        """
        r = await self.get_redis()
        data_key = f"{JOB_DATA_PREFIX}{job_id}"

        # Get raw bytes (not decoded)
        # NOTE(review): this opens a fresh connection per call — acceptable
        # for per-job reads, but a cached raw client would avoid the churn.
        raw_redis = redis.Redis(
            host=settings.redis_host,
            port=settings.redis_port,
            db=settings.redis_db,
            decode_responses=False,
        )
        try:
            data = await raw_redis.get(data_key)
            return data  # type: ignore
        finally:
            await raw_redis.close()

    async def _send_callback(
        self, url: str, job_id: str, result: dict
    ) -> None:
        """Send callback notification when job completes.

        Best-effort by design: delivery failures are logged, never raised.
        """
        try:
            # Imported lazily so the queue works even without httpx installed.
            import httpx

            async with httpx.AsyncClient(timeout=10.0) as client:
                await client.post(
                    url,
                    json={"jobId": job_id, "result": result},
                )
            logger.info(f"Callback sent for job {job_id}")
        except Exception as e:
            logger.error(f"Callback failed for job {job_id}: {e}")


# Singleton instance
job_queue = JobQueue()

View File

@@ -0,0 +1,275 @@
"""Core OCR service using Tesseract with HEIC support."""
import io
import logging
import time
from typing import Optional
import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener
from app.config import settings
from app.models import DocumentType, ExtractedField, OcrResponse
from app.services.preprocessor import preprocessor
# Register HEIF/HEIC opener with Pillow so Image.open() accepts HEIC/HEIF
# files (e.g. iPhone photos) without explicit conversion code at call sites.
register_heif_opener()

logger = logging.getLogger(__name__)
class OcrService:
    """Core OCR processing service.

    Wraps Tesseract (via pytesseract) with input normalization: HEIC/HEIF
    and PDF inputs are converted to PNG first, and an optional
    deskew/denoise preprocessing pass is applied before recognition.
    Failures are reported via OcrResponse(success=False), never raised.
    """

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
        "application/pdf",
    }

    def __init__(self) -> None:
        """Initialize OCR service."""
        # Point pytesseract at the configured tesseract binary.
        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd

    def extract(
        self,
        file_bytes: bytes,
        content_type: Optional[str] = None,
        preprocess: bool = True,
    ) -> OcrResponse:
        """
        Extract text from an image file.

        Args:
            file_bytes: Raw file bytes
            content_type: MIME type (optional, will be detected if not provided)
            preprocess: Whether to apply preprocessing

        Returns:
            OcrResponse with extracted text and metadata
        """
        start_time = time.time()

        # Detect file type if not provided
        if not content_type:
            content_type = self._detect_mime_type(file_bytes)

        # Validate file type: unsupported types yield a failed response so
        # callers always get a uniform OcrResponse.
        if content_type not in self.SUPPORTED_TYPES:
            return OcrResponse(
                success=False,
                documentType=DocumentType.UNKNOWN,
                rawText="",
                confidence=0.0,
                extractedFields={},
                processingTimeMs=int((time.time() - start_time) * 1000),
            )

        try:
            # Convert HEIC/HEIF to standard format
            if content_type in ("image/heic", "image/heif"):
                file_bytes = self._convert_heic(file_bytes)
                content_type = "image/png"

            # Handle PDF (extract first page as image)
            if content_type == "application/pdf":
                file_bytes = self._extract_pdf_first_page(file_bytes)
                content_type = "image/png"

            # Apply preprocessing if enabled
            if preprocess:
                file_bytes = preprocessor.preprocess(
                    file_bytes, deskew=True, denoise=True
                )

            # Perform OCR; image_to_data returns per-word text + confidence.
            image = Image.open(io.BytesIO(file_bytes))
            ocr_data = pytesseract.image_to_data(
                image, output_type=pytesseract.Output.DICT
            )

            # Extract text and calculate confidence
            raw_text, confidence = self._process_ocr_data(ocr_data)

            # Detect document type from content
            document_type = self._detect_document_type(raw_text)

            # Extract fields based on document type
            extracted_fields = self._extract_fields(raw_text, document_type)

            processing_time_ms = int((time.time() - start_time) * 1000)

            logger.info(
                f"OCR completed: {len(raw_text)} chars, "
                f"{confidence:.2%} confidence, {processing_time_ms}ms"
            )

            return OcrResponse(
                success=True,
                documentType=document_type,
                rawText=raw_text,
                confidence=confidence,
                extractedFields=extracted_fields,
                processingTimeMs=processing_time_ms,
            )
        except Exception as e:
            # Any conversion/decoding/OCR failure becomes a failed response.
            logger.error(f"OCR extraction failed: {e}", exc_info=True)
            return OcrResponse(
                success=False,
                documentType=DocumentType.UNKNOWN,
                rawText="",
                confidence=0.0,
                extractedFields={},
                processingTimeMs=int((time.time() - start_time) * 1000),
            )

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic (content sniffing, not filename)."""
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _convert_heic(self, heic_bytes: bytes) -> bytes:
        """Convert HEIC/HEIF to PNG format."""
        # pillow-heif registers itself with PIL, so we can open HEIC directly
        image = Image.open(io.BytesIO(heic_bytes))
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        return buffer.getvalue()

    def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes:
        """Extract first page of PDF as PNG image.

        Returns empty bytes on any failure; the caller's Image.open() then
        raises and extract() converts that into a failed OcrResponse.
        """
        try:
            # Use pdf2image if available, otherwise return empty
            from pdf2image import convert_from_bytes

            images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=300)
            if images:
                buffer = io.BytesIO()
                images[0].save(buffer, format="PNG")
                return buffer.getvalue()
        except ImportError:
            logger.warning("pdf2image not available, PDF support limited")
        except Exception as e:
            logger.error(f"PDF extraction failed: {e}")
        return b""

    def _process_ocr_data(
        self, ocr_data: dict
    ) -> tuple[str, float]:
        """Process Tesseract output to extract text and confidence.

        Returns:
            (space-joined text of kept words, mean confidence scaled to 0-1)
        """
        words = []
        confidences = []

        for i, text in enumerate(ocr_data["text"]):
            # Filter out empty strings and low-confidence results
            # (conf <= 0 also drops Tesseract's -1 structural/non-text boxes).
            conf = int(ocr_data["conf"][i])
            if text.strip() and conf > 0:
                words.append(text)
                confidences.append(conf)

        raw_text = " ".join(words)
        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0

        # Normalize confidence to 0-1 range (Tesseract returns 0-100)
        return raw_text, avg_confidence / 100.0

    def _detect_document_type(self, text: str) -> DocumentType:
        """Detect document type from extracted text content.

        Heuristic keyword matching; order matters (VIN checked first,
        then receipts, then manuals, else UNKNOWN).
        """
        text_lower = text.lower()

        # VIN document indicators
        vin_indicators = [
            "vin",
            "vehicle identification",
            "title",
            "registration",
            "certificate of title",
        ]
        if any(indicator in text_lower for indicator in vin_indicators):
            # Additional check: look for 17-character alphanumeric sequences
            # (valid VINs exclude the letters I, O, and Q).
            import re

            vin_pattern = r"\b[A-HJ-NPR-Z0-9]{17}\b"
            if re.search(vin_pattern, text.upper()):
                return DocumentType.VIN

        # Receipt indicators — require at least two hits to reduce false
        # positives from a single common word like "total".
        receipt_indicators = [
            "receipt",
            "total",
            "subtotal",
            "tax",
            "payment",
            "invoice",
            "amount due",
            "gallons",
            "price/gallon",
        ]
        if sum(1 for ind in receipt_indicators if ind in text_lower) >= 2:
            return DocumentType.RECEIPT

        # Manual indicators
        manual_indicators = [
            "owner's manual",
            "maintenance schedule",
            "service interval",
            "chapter",
            "table of contents",
            "specifications",
        ]
        if any(indicator in text_lower for indicator in manual_indicators):
            return DocumentType.MANUAL

        return DocumentType.UNKNOWN

    def _extract_fields(
        self, text: str, document_type: DocumentType
    ) -> dict[str, ExtractedField]:
        """Extract specific fields based on document type.

        Confidence values attached to fields are fixed heuristics per
        pattern, not derived from OCR confidence.
        """
        import re

        fields: dict[str, ExtractedField] = {}

        if document_type == DocumentType.VIN:
            # Extract VIN (17 alphanumeric characters, excluding I, O, Q)
            vin_pattern = r"\b([A-HJ-NPR-Z0-9]{17})\b"
            match = re.search(vin_pattern, text.upper())
            if match:
                fields["vin"] = ExtractedField(value=match.group(1), confidence=0.9)
        elif document_type == DocumentType.RECEIPT:
            # Extract amounts (currency patterns)
            amount_pattern = r"\$\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)"
            amounts = re.findall(amount_pattern, text)
            if amounts:
                # Last amount is often the total
                fields["total"] = ExtractedField(
                    value=f"${amounts[-1]}", confidence=0.7
                )

            # Extract date (loose numeric M/D/Y or M-D-Y forms)
            date_pattern = r"(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})"
            date_match = re.search(date_pattern, text)
            if date_match:
                fields["date"] = ExtractedField(value=date_match.group(1), confidence=0.8)

            # Extract gallons (for fuel receipts)
            gallon_pattern = r"(\d+\.?\d*)\s*(?:gal|gallons)"
            gallon_match = re.search(gallon_pattern, text.lower())
            if gallon_match:
                fields["gallons"] = ExtractedField(
                    value=gallon_match.group(1), confidence=0.85
                )

        return fields


# Singleton instance
ocr_service = OcrService()

View File

@@ -0,0 +1,176 @@
"""Image preprocessing service for OCR accuracy improvement."""
import io
import logging
from typing import Optional
import cv2
import numpy as np
from PIL import Image
logger = logging.getLogger(__name__)
class ImagePreprocessor:
    """Handles image preprocessing for improved OCR accuracy.

    All steps operate on a grayscale copy and the pipeline returns
    PNG-encoded bytes. Each individual step degrades gracefully: on an
    OpenCV error it logs a warning and returns its input unchanged.
    """

    def preprocess(
        self,
        image_bytes: bytes,
        deskew: bool = True,
        denoise: bool = True,
        binarize: bool = False,
    ) -> bytes:
        """
        Apply preprocessing to an image for better OCR results.

        Args:
            image_bytes: Raw image bytes
            deskew: Whether to correct image rotation
            denoise: Whether to apply noise reduction
            binarize: Whether to convert to black and white

        Returns:
            Preprocessed image as PNG bytes (always grayscale output)
        """
        # Convert bytes to numpy array via PIL
        pil_image = Image.open(io.BytesIO(image_bytes))

        # Convert to RGB if necessary (handles RGBA, grayscale, etc.)
        if pil_image.mode not in ("RGB", "L"):
            pil_image = pil_image.convert("RGB")

        # Convert PIL to OpenCV format
        cv_image = np.array(pil_image)

        # Convert RGB to BGR for OpenCV (if color image)
        if len(cv_image.shape) == 3:
            cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)

        # Convert to grayscale for processing ("L"-mode input is already 2-D)
        if len(cv_image.shape) == 3:
            gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
        else:
            gray = cv_image

        # Apply denoising
        if denoise:
            gray = self._denoise(gray)

        # Apply deskewing
        if deskew:
            gray = self._deskew(gray)

        # Apply binarization (optional - can help with some documents)
        if binarize:
            gray = self._binarize(gray)

        # Convert back to PIL and return as PNG bytes
        result_image = Image.fromarray(gray)
        buffer = io.BytesIO()
        result_image.save(buffer, format="PNG")
        return buffer.getvalue()

    def _denoise(self, image: np.ndarray) -> np.ndarray:
        """Apply noise reduction using non-local means denoising."""
        try:
            # fastNlMeansDenoising is effective for grayscale images
            return cv2.fastNlMeansDenoising(image, h=10, templateWindowSize=7, searchWindowSize=21)
        except cv2.error as e:
            # Best-effort: fall back to the unmodified image.
            logger.warning(f"Denoising failed: {e}")
            return image

    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """Correct image rotation using Hough transform.

        Estimates skew from the median angle of near-horizontal line
        segments and rotates the image to compensate. Skips correction
        when the estimate is negligible (<0.5°) or implausible (>15°).
        """
        try:
            # Detect edges
            edges = cv2.Canny(image, 50, 150, apertureSize=3)

            # Detect lines using Hough transform
            lines = cv2.HoughLinesP(
                edges,
                rho=1,
                theta=np.pi / 180,
                threshold=100,
                minLineLength=100,
                maxLineGap=10,
            )

            if lines is None:
                return image

            # Calculate the average angle of detected lines
            angles = []
            for line in lines:
                x1, y1, x2, y2 = line[0]
                if x2 - x1 != 0:  # Avoid division by zero
                    angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
                    # Only consider nearly horizontal lines (within 45 degrees)
                    if -45 < angle < 45:
                        angles.append(angle)

            if not angles:
                return image

            # Use median angle to avoid outliers
            median_angle = np.median(angles)

            # Only correct if skew is significant but not too extreme
            if abs(median_angle) < 0.5 or abs(median_angle) > 15:
                return image

            # Rotate the image to correct skew
            # NOTE(review): getRotationMatrix2D rotates counter-clockwise for
            # positive angles — verify the sign convention against sample scans.
            height, width = image.shape[:2]
            center = (width // 2, height // 2)
            rotation_matrix = cv2.getRotationMatrix2D(center, median_angle, 1.0)

            # Calculate new image bounds to avoid cropping
            cos_val = abs(rotation_matrix[0, 0])
            sin_val = abs(rotation_matrix[0, 1])
            new_width = int(height * sin_val + width * cos_val)
            new_height = int(height * cos_val + width * sin_val)
            # Shift the rotation center so the enlarged canvas stays centered.
            rotation_matrix[0, 2] += (new_width - width) / 2
            rotation_matrix[1, 2] += (new_height - height) / 2

            rotated = cv2.warpAffine(
                image,
                rotation_matrix,
                (new_width, new_height),
                borderMode=cv2.BORDER_REPLICATE,
            )

            logger.debug(f"Deskewed image by {median_angle:.2f} degrees")
            return rotated
        except Exception as e:
            logger.warning(f"Deskewing failed: {e}")
            return image

    def _binarize(self, image: np.ndarray) -> np.ndarray:
        """Convert to binary (black and white) using adaptive thresholding."""
        try:
            return cv2.adaptiveThreshold(
                image,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                blockSize=11,
                C=2,
            )
        except cv2.error as e:
            logger.warning(f"Binarization failed: {e}")
            return image

    def get_image_info(self, image_bytes: bytes) -> dict:
        """Get basic information about an image (dimensions, mode, format)."""
        pil_image = Image.open(io.BytesIO(image_bytes))
        return {
            "width": pil_image.width,
            "height": pil_image.height,
            "mode": pil_image.mode,
            "format": pil_image.format,
        }


# Singleton instance
preprocessor = ImagePreprocessor()

View File

@@ -2,6 +2,7 @@
fastapi>=0.100.0
uvicorn[standard]>=0.23.0
python-multipart>=0.0.6
pydantic>=2.0.0
# File Detection & Handling
python-magic>=0.4.27
@@ -15,6 +16,12 @@ numpy>=1.24.0
# OCR Engines
pytesseract>=0.3.10
# Redis for job queue
redis>=5.0.0
# HTTP client for callbacks
httpx>=0.24.0
# Testing (httpx is already listed above for callbacks)
pytest>=7.4.0
pytest-asyncio>=0.21.0