feat: add core OCR API integration (refs #65)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 5m59s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 5m59s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
OCR Service (Python/FastAPI):
- POST /extract for synchronous OCR extraction
- POST /jobs and GET /jobs/{job_id} for async processing
- Image preprocessing (deskew, denoise) for accuracy
- HEIC conversion via pillow-heif
- Redis job queue for async processing
Backend (Fastify):
- POST /api/ocr/extract - authenticated proxy to OCR
- POST /api/ocr/jobs - async job submission
- GET /api/ocr/jobs/:jobId - job polling
- Multipart file upload handling
- JWT authentication required
File size limits: 10MB sync, 200MB async
Processing time target: <3 seconds for typical photos
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
208
backend/src/features/ocr/domain/ocr.service.ts
Normal file
208
backend/src/features/ocr/domain/ocr.service.ts
Normal file
@@ -0,0 +1,208 @@
|
||||
/**
 * @ai-summary Domain service for OCR operations
 */
|
||||
import { logger } from '../../../core/logging/logger';
|
||||
import { ocrClient, JobNotFoundError } from '../external/ocr-client';
|
||||
import type {
|
||||
JobResponse,
|
||||
OcrExtractRequest,
|
||||
OcrJobSubmitRequest,
|
||||
OcrResponse,
|
||||
} from './ocr.types';
|
||||
|
||||
/** Maximum file size, in bytes, accepted for synchronous processing (10MB). */
const MAX_SYNC_SIZE = 10 * 1024 * 1024;

/** Maximum file size, in bytes, accepted for asynchronous processing (200MB). */
const MAX_ASYNC_SIZE = 200 * 1024 * 1024;

/**
 * MIME types accepted for OCR.
 * Insertion order matters: the set is joined verbatim into the
 * "Unsupported file type" error message.
 */
const SUPPORTED_TYPES: ReadonlySet<string> = new Set([
  'image/jpeg',
  'image/png',
  'image/heic',
  'image/heif',
  'application/pdf',
]);
|
||||
|
||||
/** Standard Error augmented with a numeric `statusCode` property. */
type HttpStatusError = Error & { statusCode: number };

/**
 * Create an Error tagged with a `statusCode` property, without resorting to `any`.
 *
 * NOTE(review): presumably the HTTP layer reads `statusCode` to pick the
 * response status — confirm against the route handlers.
 *
 * @param statusCode - Status code to attach to the error
 * @param message - Error message
 * @returns The tagged error (not thrown here; callers throw it)
 */
function createHttpError(statusCode: number, message: string): HttpStatusError {
  return Object.assign(new Error(message), { statusCode });
}

/**
 * Reduce a caught value of unknown type to a message suitable for logging.
 *
 * @param error - Value received by a `catch` clause
 * @returns The Error's message, or 'Unknown error' for non-Error values
 */
function toErrorMessage(error: unknown): string {
  return error instanceof Error ? error.message : 'Unknown error';
}

/**
 * Reject content types that are not in SUPPORTED_TYPES.
 *
 * @param contentType - MIME type supplied with the request
 * @throws Error with `statusCode` 415 when the type is not supported
 */
function assertSupportedContentType(contentType: string): void {
  if (!SUPPORTED_TYPES.has(contentType)) {
    throw createHttpError(
      415,
      `Unsupported file type: ${contentType}. Supported: ${[...SUPPORTED_TYPES].join(', ')}`
    );
  }
}

/**
 * Domain service for OCR operations.
 * Validates requests (size, content type), delegates to the OCR client,
 * and logs the outcome of each operation.
 */
export class OcrService {
  /**
   * Extract text from an image using synchronous OCR.
   *
   * @param userId - User ID, used for logging only
   * @param request - OCR extraction request; `preprocess` defaults to true when omitted
   * @returns OCR extraction result as returned by the OCR client
   * @throws Error with `statusCode` 413 when the file exceeds MAX_SYNC_SIZE
   * @throws Error with `statusCode` 415 when the content type is unsupported
   * @throws Any error raised by the OCR client (logged, then rethrown unchanged)
   */
  async extract(userId: string, request: OcrExtractRequest): Promise<OcrResponse> {
    // Size is checked before content type, so an oversized file of an
    // unsupported type reports 413 rather than 415.
    if (request.fileBuffer.length > MAX_SYNC_SIZE) {
      throw createHttpError(
        413,
        `File too large for sync processing. Max: ${MAX_SYNC_SIZE / (1024 * 1024)}MB. Use async job submission for larger files.`
      );
    }

    assertSupportedContentType(request.contentType);

    // Resolve the default once so the logged value and the value sent to the client agree.
    const preprocess = request.preprocess ?? true;

    logger.info('OCR extract requested', {
      operation: 'ocr.service.extract',
      userId,
      contentType: request.contentType,
      fileSize: request.fileBuffer.length,
      preprocess,
    });

    try {
      const result = await ocrClient.extract(
        request.fileBuffer,
        request.contentType,
        preprocess
      );

      logger.info('OCR extract completed', {
        operation: 'ocr.service.extract.success',
        userId,
        success: result.success,
        documentType: result.documentType,
        confidence: result.confidence,
        processingTimeMs: result.processingTimeMs,
        textLength: result.rawText.length,
      });

      return result;
    } catch (error: unknown) {
      logger.error('OCR extract failed', {
        operation: 'ocr.service.extract.error',
        userId,
        error: toErrorMessage(error),
      });
      throw error;
    }
  }

  /**
   * Submit an async OCR job for large files.
   *
   * @param userId - User ID, used for logging only
   * @param request - Job submission request (file, content type, optional callback URL)
   * @returns Job response as returned by the OCR client
   * @throws Error with `statusCode` 413 when the file exceeds MAX_ASYNC_SIZE
   * @throws Error with `statusCode` 415 when the content type is unsupported
   * @throws Any error raised by the OCR client (logged, then rethrown unchanged)
   */
  async submitJob(userId: string, request: OcrJobSubmitRequest): Promise<JobResponse> {
    if (request.fileBuffer.length > MAX_ASYNC_SIZE) {
      throw createHttpError(
        413,
        `File too large. Max: ${MAX_ASYNC_SIZE / (1024 * 1024)}MB.`
      );
    }

    assertSupportedContentType(request.contentType);

    logger.info('OCR job submit requested', {
      operation: 'ocr.service.submitJob',
      userId,
      contentType: request.contentType,
      fileSize: request.fileBuffer.length,
      // Only whether a callback was supplied is logged, not the URL itself.
      hasCallback: !!request.callbackUrl,
    });

    try {
      const result = await ocrClient.submitJob(
        request.fileBuffer,
        request.contentType,
        request.callbackUrl
      );

      logger.info('OCR job submitted', {
        operation: 'ocr.service.submitJob.success',
        userId,
        jobId: result.jobId,
        status: result.status,
      });

      return result;
    } catch (error: unknown) {
      logger.error('OCR job submit failed', {
        operation: 'ocr.service.submitJob.error',
        userId,
        error: toErrorMessage(error),
      });
      throw error;
    }
  }

  /**
   * Get the status of an async OCR job.
   *
   * @param userId - User ID, used for logging only
   * @param jobId - Job ID to check
   * @returns Job status response as returned by the OCR client
   * @throws Error with `statusCode` 404 when the client reports JobNotFoundError
   * @throws Any other error raised by the OCR client (logged, then rethrown unchanged)
   */
  async getJobStatus(userId: string, jobId: string): Promise<JobResponse> {
    logger.debug('OCR job status requested', {
      operation: 'ocr.service.getJobStatus',
      userId,
      jobId,
    });

    try {
      const result = await ocrClient.getJobStatus(jobId);

      logger.debug('OCR job status retrieved', {
        operation: 'ocr.service.getJobStatus.success',
        userId,
        jobId,
        status: result.status,
        progress: result.progress,
      });

      return result;
    } catch (error: unknown) {
      // A missing job is translated to a 404 and is not logged as an error.
      if (error instanceof JobNotFoundError) {
        throw createHttpError(404, `Job ${jobId} not found. Jobs expire after 1 hour.`);
      }

      logger.error('OCR job status failed', {
        operation: 'ocr.service.getJobStatus.error',
        userId,
        jobId,
        error: toErrorMessage(error),
      });
      throw error;
    }
  }

  /**
   * Check if the OCR service is available.
   *
   * @returns The result of the OCR client's health check
   */
  async isServiceHealthy(): Promise<boolean> {
    return ocrClient.isHealthy();
  }
}
|
||||
|
||||
/** Singleton instance of OcrService, created once at module load and exported for reuse. */
export const ocrService = new OcrService();
|
||||
Reference in New Issue
Block a user