feat: add core OCR API integration (refs #65)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 5m59s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 5m59s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
OCR Service (Python/FastAPI):
- POST /extract for synchronous OCR extraction
- POST /jobs and GET /jobs/{job_id} for async processing
- Image preprocessing (deskew, denoise) for accuracy
- HEIC conversion via pillow-heif
- Redis job queue for async processing
Backend (Fastify):
- POST /api/ocr/extract - authenticated proxy to OCR
- POST /api/ocr/jobs - async job submission
- GET /api/ocr/jobs/:jobId - job polling
- Multipart file upload handling
- JWT authentication required
File size limits: 10MB sync, 200MB async
Processing time target: <3 seconds for typical photos
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
233
ocr/app/services/job_queue.py
Normal file
233
ocr/app/services/job_queue.py
Normal file
@@ -0,0 +1,233 @@
|
||||
"""Redis-based job queue for async OCR processing."""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
import redis.asyncio as redis
|
||||
|
||||
from app.config import settings
|
||||
from app.models import JobResponse, JobStatus, OcrResponse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Job TTL in seconds (1 hour) — applied to the metadata hash, the uploaded
# file bytes, and the stored result, so abandoned jobs expire on their own.
JOB_TTL = 3600

# Key prefixes: metadata hash, raw uploaded payload, serialized OCR result.
JOB_PREFIX = "ocr:job:"
JOB_DATA_PREFIX = "ocr:job:data:"
JOB_RESULT_PREFIX = "ocr:job:result:"
|
||||
|
||||
|
||||
class JobQueue:
    """Manages async OCR jobs using Redis.

    Storage layout (every key expires after JOB_TTL seconds):
      - ``ocr:job:<id>``         hash: status / progress / content_type / callback_url
      - ``ocr:job:data:<id>``    raw uploaded file bytes (deleted once processed)
      - ``ocr:job:result:<id>``  JSON-serialized OcrResponse for completed jobs
    """

    def __init__(self) -> None:
        """Initialize the queue; Redis connections are created lazily."""
        # Text-mode client (decode_responses=True) for metadata and results.
        self._redis: Optional[redis.Redis] = None
        # Binary-mode client for the raw file payload, which must NOT be
        # decoded; reused across get_job_data() calls instead of opening a
        # fresh connection per call.
        self._redis_raw: Optional[redis.Redis] = None
        # Strong references to in-flight callback tasks. The event loop keeps
        # only weak references to tasks, so a fire-and-forget task created
        # with asyncio.create_task() can be garbage-collected before it runs
        # to completion unless we hold a reference here.
        self._callback_tasks: set = set()

    async def get_redis(self) -> redis.Redis:
        """Get or lazily create the shared text-mode Redis connection."""
        if self._redis is None:
            self._redis = redis.Redis(
                host=settings.redis_host,
                port=settings.redis_port,
                db=settings.redis_db,
                decode_responses=True,
            )
        return self._redis

    async def _get_raw_redis(self) -> redis.Redis:
        """Get or lazily create the binary-mode Redis connection."""
        if self._redis_raw is None:
            self._redis_raw = redis.Redis(
                host=settings.redis_host,
                port=settings.redis_port,
                db=settings.redis_db,
                decode_responses=False,
            )
        return self._redis_raw

    async def close(self) -> None:
        """Close any open Redis connections."""
        if self._redis:
            await self._redis.close()
            self._redis = None
        if self._redis_raw:
            await self._redis_raw.close()
            self._redis_raw = None

    async def submit_job(
        self,
        file_bytes: bytes,
        content_type: str,
        callback_url: Optional[str] = None,
    ) -> str:
        """
        Submit a new OCR job.

        Args:
            file_bytes: Raw file bytes to process
            content_type: MIME type of the file
            callback_url: Optional URL to call when job completes

        Returns:
            Job ID
        """
        r = await self.get_redis()
        job_id = str(uuid.uuid4())

        # Job metadata stored as a Redis hash.
        job_meta = {
            "status": JobStatus.PENDING.value,
            "progress": 0,
            "content_type": content_type,
            "callback_url": callback_url or "",
        }

        meta_key = f"{JOB_PREFIX}{job_id}"
        # File bytes stored under a separate key so the hash stays text-only.
        data_key = f"{JOB_DATA_PREFIX}{job_id}"

        # Pipeline so metadata and payload land atomically. NOTE: queued
        # pipeline commands in redis.asyncio buffer synchronously and return
        # the (non-awaitable) Pipeline object — awaiting them raises
        # TypeError. Only execute() is awaited.
        async with r.pipeline() as pipe:
            pipe.hset(meta_key, mapping=job_meta)  # type: ignore
            pipe.expire(meta_key, JOB_TTL)
            pipe.set(data_key, file_bytes)
            pipe.expire(data_key, JOB_TTL)
            await pipe.execute()

        logger.info(f"Job {job_id} submitted")
        return job_id

    async def get_job_status(self, job_id: str) -> Optional[JobResponse]:
        """
        Get the status of a job.

        Args:
            job_id: Job ID to check

        Returns:
            JobResponse or None if job doesn't exist
        """
        r = await self.get_redis()
        meta_key = f"{JOB_PREFIX}{job_id}"
        result_key = f"{JOB_RESULT_PREFIX}{job_id}"

        # Missing/expired metadata means the job is unknown.
        meta = await r.hgetall(meta_key)  # type: ignore
        if not meta:
            return None

        status = JobStatus(meta.get("status", JobStatus.PENDING.value))
        progress = int(meta.get("progress", 0))
        error = meta.get("error")

        # The serialized result exists only for completed jobs.
        result = None
        if status == JobStatus.COMPLETED:
            result_json = await r.get(result_key)
            if result_json:
                result_dict = json.loads(result_json)
                result = OcrResponse(**result_dict)

        # progress/error are only meaningful for their respective states.
        return JobResponse(
            jobId=job_id,
            status=status,
            progress=progress if status == JobStatus.PROCESSING else None,
            result=result,
            error=error if status == JobStatus.FAILED else None,
        )

    async def update_job_progress(self, job_id: str, progress: int) -> None:
        """Update job progress percentage (also flips status to PROCESSING)."""
        r = await self.get_redis()
        meta_key = f"{JOB_PREFIX}{job_id}"

        await r.hset(meta_key, mapping={  # type: ignore
            "status": JobStatus.PROCESSING.value,
            "progress": progress,
        })

    async def complete_job(self, job_id: str, result: OcrResponse) -> None:
        """Mark job as completed, store its result, and fire any callback."""
        r = await self.get_redis()
        meta_key = f"{JOB_PREFIX}{job_id}"
        result_key = f"{JOB_RESULT_PREFIX}{job_id}"
        data_key = f"{JOB_DATA_PREFIX}{job_id}"

        # Serialize with field aliases so stored JSON matches the API shape.
        result_dict = result.model_dump(by_alias=True)
        result_json = json.dumps(result_dict)

        # Queued pipeline commands buffer synchronously (not awaitable);
        # only execute() is awaited.
        async with r.pipeline() as pipe:
            pipe.hset(meta_key, mapping={  # type: ignore
                "status": JobStatus.COMPLETED.value,
                "progress": 100,
            })
            pipe.set(result_key, result_json)
            pipe.expire(result_key, JOB_TTL)
            # Uploaded file bytes are no longer needed once processed.
            pipe.delete(data_key)
            await pipe.execute()

        logger.info(f"Job {job_id} completed")

        # Fire-and-forget callback (don't block the worker). Hold a strong
        # reference to the task so the event loop cannot garbage-collect it
        # mid-flight; drop the reference once it finishes.
        meta = await r.hgetall(meta_key)  # type: ignore
        callback_url = meta.get("callback_url")
        if callback_url:
            task = asyncio.create_task(
                self._send_callback(callback_url, job_id, result_dict)
            )
            self._callback_tasks.add(task)
            task.add_done_callback(self._callback_tasks.discard)

    async def fail_job(self, job_id: str, error: str) -> None:
        """Mark job as failed with error message."""
        r = await self.get_redis()
        meta_key = f"{JOB_PREFIX}{job_id}"
        data_key = f"{JOB_DATA_PREFIX}{job_id}"

        # Queued pipeline commands buffer synchronously (not awaitable);
        # only execute() is awaited.
        async with r.pipeline() as pipe:
            pipe.hset(meta_key, mapping={  # type: ignore
                "status": JobStatus.FAILED.value,
                "error": error,
            })
            # Delete the file data — it will never be processed.
            pipe.delete(data_key)
            await pipe.execute()

        logger.error(f"Job {job_id} failed: {error}")

    async def get_job_data(self, job_id: str) -> Optional[bytes]:
        """Get the raw file bytes for a job, or None if missing/expired."""
        # Use the cached binary-mode client so the payload is not decoded.
        r = await self._get_raw_redis()
        data = await r.get(f"{JOB_DATA_PREFIX}{job_id}")
        return data  # type: ignore

    async def _send_callback(
        self, url: str, job_id: str, result: dict
    ) -> None:
        """Send callback notification when job completes (best-effort)."""
        try:
            # Imported lazily so the queue works even when callbacks are
            # never configured.
            import httpx

            async with httpx.AsyncClient(timeout=10.0) as client:
                await client.post(
                    url,
                    json={"jobId": job_id, "result": result},
                )
            logger.info(f"Callback sent for job {job_id}")
        except Exception as e:
            # Best-effort: a failed callback must not fail the job itself.
            logger.error(f"Callback failed for job {job_id}: {e}")
|
||||
|
||||
|
||||
# Singleton instance shared by the API routes and the background worker.
job_queue = JobQueue()
|
||||
Reference in New Issue
Block a user