Add filename .pdf extension fallback and %PDF magic bytes validation to extractManual controller. Update getJobStatus to return 410 Gone for expired jobs. Add 16 unit tests covering all acceptance criteria. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
398 lines
11 KiB
TypeScript
398 lines
11 KiB
TypeScript
/**
|
|
* @ai-summary Domain service for OCR operations
|
|
*/
|
|
import { logger } from '../../../core/logging/logger';
|
|
import { ocrClient, JobNotFoundError } from '../external/ocr-client';
|
|
import type {
|
|
JobResponse,
|
|
ManualJobResponse,
|
|
ManualJobSubmitRequest,
|
|
OcrExtractRequest,
|
|
OcrJobSubmitRequest,
|
|
OcrResponse,
|
|
ReceiptExtractRequest,
|
|
ReceiptExtractionResponse,
|
|
VinExtractionResponse,
|
|
} from './ocr.types';
|
|
|
|
/** Number of bytes in one megabyte, as used by the size limits below. */
const BYTES_PER_MB = 1024 * 1024;

/** Largest file, in bytes, accepted for synchronous processing (10MB). */
const MAX_SYNC_SIZE = 10 * BYTES_PER_MB;

/** Largest file, in bytes, accepted for asynchronous processing (200MB). */
const MAX_ASYNC_SIZE = 200 * BYTES_PER_MB;

/** Image-only MIME types; used for receipt extraction, which does not accept PDF. */
const SUPPORTED_IMAGE_TYPES = new Set(['image/jpeg', 'image/png', 'image/heic', 'image/heif']);

/** Every supported MIME type: the image types above, followed by PDF. */
const SUPPORTED_TYPES = new Set([...SUPPORTED_IMAGE_TYPES, 'application/pdf']);
|
|
|
|
/**
 * Domain service for OCR operations.
 *
 * Each public method validates the request (file size, content type), logs
 * the request, delegates to `ocrClient`, and logs the outcome. Validation
 * failures are thrown as `Error` objects that carry a numeric `statusCode`
 * property (413 for oversized files, 415 or 400 for unsupported types, 410
 * for expired jobs). Errors raised by `ocrClient` are logged and rethrown
 * unchanged, except `JobNotFoundError` in {@link OcrService.getJobStatus}.
 */
export class OcrService {
  /**
   * Build an `Error` that carries an HTTP status code in `statusCode`.
   *
   * @param statusCode - HTTP status code to attach to the error
   * @param message - Error message
   * @returns The error with an own `statusCode` property
   */
  private static httpError(statusCode: number, message: string): Error & { statusCode: number } {
    return Object.assign(new Error(message), { statusCode });
  }

  /**
   * Throw a 413 error when the file is larger than `maxBytes`.
   *
   * @param file - Anything exposing a byte `length` (the request's file buffer)
   * @param maxBytes - Largest accepted size in bytes (inclusive)
   * @param message - Message for the thrown error
   * @throws Error with `statusCode` 413 when `file.length > maxBytes`
   */
  private static assertMaxSize(
    file: { readonly length: number },
    maxBytes: number,
    message: string
  ): void {
    if (file.length > maxBytes) {
      throw OcrService.httpError(413, message);
    }
  }

  /**
   * Throw a 415 error when `contentType` is not in `supported`.
   *
   * @param contentType - MIME type supplied with the request
   * @param supported - Accepted MIME types; listed in the error message in iteration order
   * @throws Error with `statusCode` 415 when the type is not accepted
   */
  private static assertSupportedType(contentType: string, supported: ReadonlySet<string>): void {
    if (!supported.has(contentType)) {
      throw OcrService.httpError(
        415,
        `Unsupported file type: ${contentType}. Supported: ${[...supported].join(', ')}`
      );
    }
  }

  /**
   * Turn a caught value into a loggable message.
   *
   * @param error - Value caught by a `catch` clause
   * @returns `error.message` for `Error` instances, otherwise `'Unknown error'`
   */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : 'Unknown error';
  }

  /**
   * Extract text from an image using synchronous OCR.
   *
   * @param userId - User ID for logging
   * @param request - OCR extraction request
   * @returns OCR extraction result
   * @throws Error with `statusCode` 413 when the file exceeds the sync size limit
   * @throws Error with `statusCode` 415 when the content type is not supported
   */
  async extract(userId: string, request: OcrExtractRequest): Promise<OcrResponse> {
    // Size is checked before content type, so an oversized file of an
    // unsupported type is reported as 413.
    OcrService.assertMaxSize(
      request.fileBuffer,
      MAX_SYNC_SIZE,
      `File too large for sync processing. Max: ${MAX_SYNC_SIZE / (1024 * 1024)}MB. Use async job submission for larger files.`
    );
    OcrService.assertSupportedType(request.contentType, SUPPORTED_TYPES);

    // Preprocessing is on unless the caller explicitly disables it.
    const preprocess = request.preprocess ?? true;

    logger.info('OCR extract requested', {
      operation: 'ocr.service.extract',
      userId,
      contentType: request.contentType,
      fileSize: request.fileBuffer.length,
      preprocess,
    });

    try {
      const result = await ocrClient.extract(request.fileBuffer, request.contentType, preprocess);

      logger.info('OCR extract completed', {
        operation: 'ocr.service.extract.success',
        userId,
        success: result.success,
        documentType: result.documentType,
        confidence: result.confidence,
        processingTimeMs: result.processingTimeMs,
        // Guarded so a response lacking `rawText` cannot make this log
        // statement throw and turn a completed extraction into a failure.
        textLength: result.rawText?.length ?? 0,
      });

      return result;
    } catch (error: unknown) {
      logger.error('OCR extract failed', {
        operation: 'ocr.service.extract.error',
        userId,
        error: OcrService.describeError(error),
      });
      throw error;
    }
  }

  /**
   * Extract VIN from an image using VIN-specific OCR.
   *
   * @param userId - User ID for logging
   * @param request - OCR extraction request
   * @returns VIN extraction result
   * @throws Error with `statusCode` 413 when the file exceeds the sync size limit
   * @throws Error with `statusCode` 415 when the content type is not supported
   */
  async extractVin(userId: string, request: OcrExtractRequest): Promise<VinExtractionResponse> {
    OcrService.assertMaxSize(
      request.fileBuffer,
      MAX_SYNC_SIZE,
      `File too large. Max: ${MAX_SYNC_SIZE / (1024 * 1024)}MB.`
    );
    OcrService.assertSupportedType(request.contentType, SUPPORTED_TYPES);

    logger.info('VIN extract requested', {
      operation: 'ocr.service.extractVin',
      userId,
      contentType: request.contentType,
      fileSize: request.fileBuffer.length,
    });

    try {
      const result = await ocrClient.extractVin(request.fileBuffer, request.contentType);

      logger.info('VIN extract completed', {
        operation: 'ocr.service.extractVin.success',
        userId,
        success: result.success,
        vin: result.vin,
        confidence: result.confidence,
        processingTimeMs: result.processingTimeMs,
      });

      return result;
    } catch (error: unknown) {
      logger.error('VIN extract failed', {
        operation: 'ocr.service.extractVin.error',
        userId,
        error: OcrService.describeError(error),
      });
      throw error;
    }
  }

  /**
   * Extract data from a receipt image using receipt-specific OCR.
   *
   * Only image types are accepted here; PDF is rejected.
   *
   * @param userId - User ID for logging
   * @param request - Receipt extraction request
   * @returns Receipt extraction result
   * @throws Error with `statusCode` 413 when the file exceeds the sync size limit
   * @throws Error with `statusCode` 415 when the content type is not a supported image type
   */
  async extractReceipt(
    userId: string,
    request: ReceiptExtractRequest
  ): Promise<ReceiptExtractionResponse> {
    OcrService.assertMaxSize(
      request.fileBuffer,
      MAX_SYNC_SIZE,
      `File too large. Max: ${MAX_SYNC_SIZE / (1024 * 1024)}MB.`
    );
    OcrService.assertSupportedType(request.contentType, SUPPORTED_IMAGE_TYPES);

    logger.info('Receipt extract requested', {
      operation: 'ocr.service.extractReceipt',
      userId,
      contentType: request.contentType,
      fileSize: request.fileBuffer.length,
      receiptType: request.receiptType,
    });

    try {
      const result = await ocrClient.extractReceipt(
        request.fileBuffer,
        request.contentType,
        request.receiptType
      );

      logger.info('Receipt extract completed', {
        operation: 'ocr.service.extractReceipt.success',
        userId,
        success: result.success,
        receiptType: result.receiptType,
        // Guarded so a response lacking `extractedFields` cannot make this
        // log statement throw and turn a completed extraction into a failure.
        fieldCount: Object.keys(result.extractedFields ?? {}).length,
        processingTimeMs: result.processingTimeMs,
      });

      return result;
    } catch (error: unknown) {
      logger.error('Receipt extract failed', {
        operation: 'ocr.service.extractReceipt.error',
        userId,
        error: OcrService.describeError(error),
      });
      throw error;
    }
  }

  /**
   * Submit an async OCR job for large files.
   *
   * @param userId - User ID for logging
   * @param request - Job submission request
   * @returns Job response with job ID
   * @throws Error with `statusCode` 413 when the file exceeds the async size limit
   * @throws Error with `statusCode` 415 when the content type is not supported
   */
  async submitJob(userId: string, request: OcrJobSubmitRequest): Promise<JobResponse> {
    OcrService.assertMaxSize(
      request.fileBuffer,
      MAX_ASYNC_SIZE,
      `File too large. Max: ${MAX_ASYNC_SIZE / (1024 * 1024)}MB.`
    );
    OcrService.assertSupportedType(request.contentType, SUPPORTED_TYPES);

    logger.info('OCR job submit requested', {
      operation: 'ocr.service.submitJob',
      userId,
      contentType: request.contentType,
      fileSize: request.fileBuffer.length,
      // Only the presence of a callback URL is logged, not its value.
      hasCallback: !!request.callbackUrl,
    });

    try {
      const result = await ocrClient.submitJob(
        request.fileBuffer,
        request.contentType,
        request.callbackUrl
      );

      logger.info('OCR job submitted', {
        operation: 'ocr.service.submitJob.success',
        userId,
        jobId: result.jobId,
        status: result.status,
      });

      return result;
    } catch (error: unknown) {
      logger.error('OCR job submit failed', {
        operation: 'ocr.service.submitJob.error',
        userId,
        error: OcrService.describeError(error),
      });
      throw error;
    }
  }

  /**
   * Submit an async manual extraction job for PDF owner's manuals.
   *
   * @param userId - User ID for logging
   * @param request - Manual job submission request
   * @returns Manual job response with job ID
   * @throws Error with `statusCode` 413 when the file exceeds the async size limit
   * @throws Error with `statusCode` 400 when the content type is not `application/pdf`
   */
  async submitManualJob(
    userId: string,
    request: ManualJobSubmitRequest
  ): Promise<ManualJobResponse> {
    OcrService.assertMaxSize(
      request.fileBuffer,
      MAX_ASYNC_SIZE,
      `File too large. Max: ${MAX_ASYNC_SIZE / (1024 * 1024)}MB.`
    );

    // Manual extraction only supports PDF.
    // NOTE(review): this path answers 400 while the other methods answer 415
    // for an unsupported type; kept as-is because callers may depend on it.
    if (request.contentType !== 'application/pdf') {
      throw OcrService.httpError(
        400,
        `Unsupported file type: ${request.contentType}. Manual extraction requires PDF files.`
      );
    }

    logger.info('Manual job submit requested', {
      operation: 'ocr.service.submitManualJob',
      userId,
      contentType: request.contentType,
      fileSize: request.fileBuffer.length,
      hasVehicleId: !!request.vehicleId,
    });

    try {
      const result = await ocrClient.submitManualJob(
        request.fileBuffer,
        request.contentType,
        request.vehicleId
      );

      logger.info('Manual job submitted', {
        operation: 'ocr.service.submitManualJob.success',
        userId,
        jobId: result.jobId,
        status: result.status,
        estimatedSeconds: result.estimatedSeconds,
      });

      return result;
    } catch (error: unknown) {
      logger.error('Manual job submit failed', {
        operation: 'ocr.service.submitManualJob.error',
        userId,
        error: OcrService.describeError(error),
      });
      throw error;
    }
  }

  /**
   * Get the status of an async OCR job.
   *
   * @param userId - User ID for logging
   * @param jobId - Job ID to check
   * @returns Job status response
   * @throws Error with `statusCode` 410 when the client reports `JobNotFoundError`
   */
  async getJobStatus(userId: string, jobId: string): Promise<JobResponse> {
    logger.debug('OCR job status requested', {
      operation: 'ocr.service.getJobStatus',
      userId,
      jobId,
    });

    try {
      const result = await ocrClient.getJobStatus(jobId);

      logger.debug('OCR job status retrieved', {
        operation: 'ocr.service.getJobStatus.success',
        userId,
        jobId,
        status: result.status,
        progress: result.progress,
      });

      return result;
    } catch (error: unknown) {
      // A job the client cannot find is reported to the caller as expired
      // (410 Gone). This branch runs before the error log below, so an
      // expired job is not logged as a failure.
      if (error instanceof JobNotFoundError) {
        throw OcrService.httpError(410, 'Job expired (max 2 hours). Please resubmit.');
      }

      logger.error('OCR job status failed', {
        operation: 'ocr.service.getJobStatus.error',
        userId,
        jobId,
        error: OcrService.describeError(error),
      });
      throw error;
    }
  }

  /**
   * Check if the OCR service is available.
   *
   * @returns The result of `ocrClient.isHealthy()`
   */
  async isServiceHealthy(): Promise<boolean> {
    return ocrClient.isHealthy();
  }
}
|
|
|
|
/** Shared module-level `OcrService` instance, created once when this module is loaded. */
export const ocrService: OcrService = new OcrService();
|