Add filename .pdf extension fallback and %PDF magic bytes validation to extractManual controller. Update getJobStatus to return 410 Gone for expired jobs. Add 16 unit tests covering all acceptance criteria. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
620 lines
17 KiB
TypeScript
620 lines
17 KiB
TypeScript
/**
 * @ai-summary Controller for OCR API endpoints
 */
|
|
import { FastifyReply, FastifyRequest } from 'fastify';
|
|
import { logger } from '../../../core/logging/logger';
|
|
import { ocrService } from '../domain/ocr.service';
|
|
import type { ExtractQuery, JobIdParams, JobSubmitBody } from './ocr.validation';
|
|
|
|
/**
 * MIME types accepted by the general-purpose OCR endpoints:
 * JPEG, PNG, HEIC/HEIF images and PDF documents.
 */
const SUPPORTED_TYPES = new Set<string>(
  ['image/jpeg', 'image/png', 'image/heic', 'image/heif', 'application/pdf'],
);
|
|
|
|
/**
 * MIME types accepted for receipt extraction.
 * Images only (JPEG, PNG, HEIC/HEIF); PDF is deliberately absent.
 */
const SUPPORTED_IMAGE_TYPES = new Set<string>(
  ['image/jpeg', 'image/png', 'image/heic', 'image/heif'],
);
|
|
|
|
export class OcrController {
|
|
/**
 * POST /api/ocr/extract
 *
 * Runs synchronous OCR on a single uploaded file and replies with the
 * service result.
 *
 * Responses sent by this handler:
 * - 200: the value returned by `ocrService.extract`
 * - 400: no file part was uploaded, or the uploaded file has zero bytes
 * - 415: the upload's MIME type is not in `SUPPORTED_TYPES`, or the service
 *        threw an error carrying `statusCode === 415`
 * - 413: the service threw an error carrying `statusCode === 413`
 * - 500: any other service failure (logged at error level)
 *
 * @param request Fastify request; `query.preprocess` disables preprocessing
 *   only when it is exactly `false`. The user id is read from `request.user.sub`.
 * @param reply Fastify reply used to send the response.
 */
async extract(
  request: FastifyRequest<{ Querystring: ExtractQuery }>,
  reply: FastifyReply
) {
  const userId = (request as any).user?.sub as string;
  // Preprocessing is on by default; only an explicit `false` switches it off.
  const preprocess = request.query.preprocess !== false;

  logger.info('OCR extract requested', {
    operation: 'ocr.controller.extract',
    userId,
    preprocess,
  });

  // Pull the single multipart file part off the request.
  const upload = await (request as any).file({ limits: { files: 1 } });
  if (!upload) {
    logger.warn('No file provided for OCR', {
      operation: 'ocr.controller.extract.no_file',
      userId,
    });
    return reply.code(400).send({ error: 'Bad Request', message: 'No file provided' });
  }

  // Check the declared MIME type against the allow-list before reading the stream.
  const mimeType = upload.mimetype as string;
  const isSupported = SUPPORTED_TYPES.has(mimeType);
  if (!isSupported) {
    logger.warn('Unsupported file type for OCR', {
      operation: 'ocr.controller.extract.unsupported_type',
      userId,
      contentType: mimeType,
      fileName: upload.filename,
    });
    return reply.code(415).send({
      error: 'Unsupported Media Type',
      message: `Unsupported file type: ${mimeType}. Supported: JPEG, PNG, HEIC, PDF`,
    });
  }

  // Drain the upload stream into one in-memory buffer.
  const parts: Buffer[] = [];
  for await (const part of upload.file) {
    parts.push(part);
  }
  const payload = Buffer.concat(parts);

  if (payload.length === 0) {
    logger.warn('Empty file provided for OCR', {
      operation: 'ocr.controller.extract.empty_file',
      userId,
      fileName: upload.filename,
    });
    return reply.code(400).send({ error: 'Bad Request', message: 'Empty file provided' });
  }

  try {
    const outcome = await ocrService.extract(userId, {
      fileBuffer: payload,
      contentType: mimeType,
      preprocess,
    });

    logger.info('OCR extract completed', {
      operation: 'ocr.controller.extract.success',
      userId,
      success: outcome.success,
      documentType: outcome.documentType,
      processingTimeMs: outcome.processingTimeMs,
    });

    return reply.code(200).send(outcome);
  } catch (err: any) {
    // Errors tagged with a known HTTP status are relayed with their own message.
    switch (err.statusCode) {
      case 413:
        return reply.code(413).send({
          error: 'Payload Too Large',
          message: err.message,
        });
      case 415:
        return reply.code(415).send({
          error: 'Unsupported Media Type',
          message: err.message,
        });
      default:
        break;
    }

    // Everything else is logged and reported as a generic 500.
    logger.error('OCR extract failed', {
      operation: 'ocr.controller.extract.error',
      userId,
      error: err.message,
    });

    return reply.code(500).send({
      error: 'Internal Server Error',
      message: 'OCR processing failed',
    });
  }
}
|
|
|
|
/**
 * POST /api/ocr/extract/vin
 *
 * Runs VIN-specific OCR on a single uploaded file.
 *
 * Responses sent by this handler:
 * - 200: the value returned by `ocrService.extractVin`
 * - 400: missing file part or zero-byte file
 * - 415: MIME type outside `SUPPORTED_TYPES`, or a service error with
 *        `statusCode === 415`
 * - 413: a service error with `statusCode === 413`
 * - 500: any other service failure (logged at error level)
 *
 * @param request Fastify request; the user id is read from `request.user.sub`.
 * @param reply Fastify reply used to send the response.
 */
async extractVin(
  request: FastifyRequest,
  reply: FastifyReply
) {
  const userId = (request as any).user?.sub as string;

  logger.info('VIN extract requested', {
    operation: 'ocr.controller.extractVin',
    userId,
  });

  const upload = await (request as any).file({ limits: { files: 1 } });
  if (!upload) {
    logger.warn('No file provided for VIN extraction', {
      operation: 'ocr.controller.extractVin.no_file',
      userId,
    });
    const body = { error: 'Bad Request', message: 'No file provided' };
    return reply.code(400).send(body);
  }

  // Reject unsupported MIME types before touching the file stream.
  const mimeType = upload.mimetype as string;
  if (SUPPORTED_TYPES.has(mimeType) === false) {
    logger.warn('Unsupported file type for VIN extraction', {
      operation: 'ocr.controller.extractVin.unsupported_type',
      userId,
      contentType: mimeType,
      fileName: upload.filename,
    });
    const body = {
      error: 'Unsupported Media Type',
      message: `Unsupported file type: ${mimeType}. Supported: JPEG, PNG, HEIC, PDF`,
    };
    return reply.code(415).send(body);
  }

  // Read the whole upload into memory.
  const pieces: Buffer[] = [];
  for await (const piece of upload.file) {
    pieces.push(piece);
  }
  const imageBytes = Buffer.concat(pieces);

  if (imageBytes.length === 0) {
    logger.warn('Empty file provided for VIN extraction', {
      operation: 'ocr.controller.extractVin.empty_file',
      userId,
      fileName: upload.filename,
    });
    const body = { error: 'Bad Request', message: 'Empty file provided' };
    return reply.code(400).send(body);
  }

  try {
    const vinResult = await ocrService.extractVin(userId, {
      fileBuffer: imageBytes,
      contentType: mimeType,
    });

    logger.info('VIN extract completed', {
      operation: 'ocr.controller.extractVin.success',
      userId,
      success: vinResult.success,
      processingTimeMs: vinResult.processingTimeMs,
    });

    return reply.code(200).send(vinResult);
  } catch (failure: any) {
    const status = failure.statusCode;

    // Size and media-type errors from the service keep their status and message.
    if (status === 413) {
      return reply.code(413).send({
        error: 'Payload Too Large',
        message: failure.message,
      });
    }
    if (status === 415) {
      return reply.code(415).send({
        error: 'Unsupported Media Type',
        message: failure.message,
      });
    }

    logger.error('VIN extract failed', {
      operation: 'ocr.controller.extractVin.error',
      userId,
      error: failure.message,
    });

    return reply.code(500).send({
      error: 'Internal Server Error',
      message: 'VIN extraction failed',
    });
  }
}
|
|
|
|
/**
 * POST /api/ocr/extract/receipt
 *
 * Runs receipt-specific OCR on a single uploaded image. PDFs are not accepted
 * here: the MIME type is checked against `SUPPORTED_IMAGE_TYPES`.
 *
 * An optional `receipt_type` multipart field is forwarded to the service.
 *
 * Responses sent by this handler:
 * - 200: the value returned by `ocrService.extractReceipt`
 * - 400: missing file part or zero-byte file
 * - 415: MIME type outside `SUPPORTED_IMAGE_TYPES`, or a service error with
 *        `statusCode === 415`
 * - 413 / 422: service errors carrying those status codes
 * - 500: any other service failure (logged at error level)
 *
 * @param request Fastify request; the user id is read from `request.user.sub`.
 * @param reply Fastify reply used to send the response.
 */
async extractReceipt(
  request: FastifyRequest,
  reply: FastifyReply
) {
  const userId = (request as any).user?.sub as string;

  logger.info('Receipt extract requested', {
    operation: 'ocr.controller.extractReceipt',
    userId,
  });

  const upload = await (request as any).file({ limits: { files: 1 } });
  if (!upload) {
    logger.warn('No file provided for receipt extraction', {
      operation: 'ocr.controller.extractReceipt.no_file',
      userId,
    });
    return reply.code(400).send({ error: 'Bad Request', message: 'No file provided' });
  }

  // Only image MIME types are allowed for receipts.
  const mimeType = upload.mimetype as string;
  const isImage = SUPPORTED_IMAGE_TYPES.has(mimeType);
  if (!isImage) {
    logger.warn('Unsupported file type for receipt extraction', {
      operation: 'ocr.controller.extractReceipt.unsupported_type',
      userId,
      contentType: mimeType,
      fileName: upload.filename,
    });
    return reply.code(415).send({
      error: 'Unsupported Media Type',
      message: `Unsupported file type: ${mimeType}. Supported: JPEG, PNG, HEIC`,
    });
  }

  // Buffer the full upload in memory.
  const segments: Buffer[] = [];
  for await (const segment of upload.file) {
    segments.push(segment);
  }
  const receiptBytes = Buffer.concat(segments);

  if (receiptBytes.length === 0) {
    logger.warn('Empty file provided for receipt extraction', {
      operation: 'ocr.controller.extractReceipt.empty_file',
      userId,
      fileName: upload.filename,
    });
    return reply.code(400).send({ error: 'Bad Request', message: 'Empty file provided' });
  }

  // Optional form field that accompanies the file part.
  const receiptType = upload.fields?.receipt_type?.value as string | undefined;

  try {
    const receipt = await ocrService.extractReceipt(userId, {
      fileBuffer: receiptBytes,
      contentType: mimeType,
      receiptType,
    });

    logger.info('Receipt extract completed', {
      operation: 'ocr.controller.extractReceipt.success',
      userId,
      success: receipt.success,
      receiptType: receipt.receiptType,
      processingTimeMs: receipt.processingTimeMs,
    });

    return reply.code(200).send(receipt);
  } catch (err: any) {
    // Relay the service's own status and message for the recognised codes.
    switch (err.statusCode) {
      case 413:
        return reply.code(413).send({
          error: 'Payload Too Large',
          message: err.message,
        });
      case 415:
        return reply.code(415).send({
          error: 'Unsupported Media Type',
          message: err.message,
        });
      case 422:
        return reply.code(422).send({
          error: 'Unprocessable Entity',
          message: err.message,
        });
      default:
        break;
    }

    logger.error('Receipt extract failed', {
      operation: 'ocr.controller.extractReceipt.error',
      userId,
      error: err.message,
    });

    return reply.code(500).send({
      error: 'Internal Server Error',
      message: 'Receipt extraction failed',
    });
  }
}
|
|
|
|
/**
 * POST /api/ocr/extract/manual
 *
 * Submits an asynchronous extraction job for a PDF owner's manual.
 * Route note carried over from the original header: requires Pro tier
 * (document.scanMaintenanceSchedule). That gate is not enforced inside this
 * method.
 *
 * Validation, in order:
 *  1. A file part must be present (400).
 *  2. The MIME type is `application/pdf` OR the filename ends in `.pdf`,
 *     case-insensitive (400 otherwise). The filename fallback lets through
 *     uploads whose declared MIME type is not `application/pdf`.
 *  3. The file must not be empty (400).
 *  4. The first four bytes must be `%PDF` (415 otherwise).
 *
 * Once the magic bytes confirm the payload is a PDF, the job is submitted with
 * the canonical `application/pdf` content type rather than the MIME type the
 * client declared. Otherwise a file accepted through the filename fallback
 * would be forwarded with a content type that contradicts the validation done
 * here.
 *
 * An optional `vehicle_id` multipart field is forwarded to the service.
 *
 * Responses sent by this handler:
 * - 202: the value returned by `ocrService.submitManualJob`
 * - 400 / 415: validation failures listed above, or a service error with
 *              `statusCode === 400`
 * - 413: a service error with `statusCode === 413`
 * - 500: any other service failure (logged at error level)
 *
 * @param request Fastify request; the user id is read from `request.user.sub`.
 * @param reply Fastify reply used to send the response.
 */
async extractManual(
  request: FastifyRequest,
  reply: FastifyReply
) {
  const PDF_CONTENT_TYPE = 'application/pdf';
  const userId = (request as any).user?.sub as string;

  logger.info('Manual extract requested', {
    operation: 'ocr.controller.extractManual',
    userId,
  });

  const file = await (request as any).file({ limits: { files: 1 } });
  if (!file) {
    logger.warn('No file provided for manual extraction', {
      operation: 'ocr.controller.extractManual.no_file',
      userId,
    });
    return reply.code(400).send({
      error: 'Bad Request',
      message: 'No file provided',
    });
  }

  // Accept on either signal: declared MIME type or a ".pdf" filename.
  const contentType = file.mimetype as string;
  const fileName = file.filename as string | undefined;
  const isPdfMime = contentType === PDF_CONTENT_TYPE;
  const isPdfExtension = fileName?.toLowerCase().endsWith('.pdf') ?? false;

  if (!isPdfMime && !isPdfExtension) {
    logger.warn('Non-PDF file provided for manual extraction', {
      operation: 'ocr.controller.extractManual.not_pdf',
      userId,
      contentType,
      fileName,
    });
    return reply.code(400).send({
      error: 'Bad Request',
      message: `Manual extraction requires PDF files. Received: ${contentType}`,
    });
  }

  // Read the whole upload into memory so the header bytes can be inspected.
  const chunks: Buffer[] = [];
  for await (const chunk of file.file) {
    chunks.push(chunk);
  }
  const fileBuffer = Buffer.concat(chunks);

  if (fileBuffer.length === 0) {
    logger.warn('Empty file provided for manual extraction', {
      operation: 'ocr.controller.extractManual.empty_file',
      userId,
      fileName,
    });
    return reply.code(400).send({
      error: 'Bad Request',
      message: 'Empty file provided',
    });
  }

  // Validate PDF magic bytes (%PDF). MIME type and filename are client-supplied,
  // so the content itself is the deciding check.
  const PDF_MAGIC = Buffer.from('%PDF');
  if (fileBuffer.length < 4 || !fileBuffer.subarray(0, 4).equals(PDF_MAGIC)) {
    logger.warn('File lacks PDF magic bytes', {
      operation: 'ocr.controller.extractManual.invalid_magic',
      userId,
      fileName,
      firstBytes: fileBuffer.subarray(0, 4).toString('hex'),
    });
    return reply.code(415).send({
      error: 'Unsupported Media Type',
      message: 'File does not appear to be a valid PDF (missing %PDF header)',
    });
  }

  // Get optional vehicle_id from form fields
  const vehicleId = file.fields?.vehicle_id?.value as string | undefined;

  try {
    const result = await ocrService.submitManualJob(userId, {
      fileBuffer,
      // The payload has been verified as a PDF above, so submit it under the
      // canonical type even when the client declared something else.
      contentType: PDF_CONTENT_TYPE,
      vehicleId,
    });

    logger.info('Manual extract job submitted', {
      operation: 'ocr.controller.extractManual.success',
      userId,
      jobId: result.jobId,
      status: result.status,
      estimatedSeconds: result.estimatedSeconds,
    });

    return reply.code(202).send(result);
  } catch (error: any) {
    // Service errors tagged 413 or 400 are relayed with their own message.
    if (error.statusCode === 413) {
      return reply.code(413).send({
        error: 'Payload Too Large',
        message: error.message,
      });
    }
    if (error.statusCode === 400) {
      return reply.code(400).send({
        error: 'Bad Request',
        message: error.message,
      });
    }

    logger.error('Manual extract failed', {
      operation: 'ocr.controller.extractManual.error',
      userId,
      error: error.message,
    });

    return reply.code(500).send({
      error: 'Internal Server Error',
      message: 'Manual extraction submission failed',
    });
  }
}
|
|
|
|
/**
 * POST /api/ocr/jobs
 *
 * Submits an asynchronous OCR job for a single uploaded file (the original
 * header describes this route as intended for large files).
 *
 * An optional `callbackUrl` multipart field is forwarded to the service.
 *
 * Responses sent by this handler:
 * - 202: the value returned by `ocrService.submitJob`
 * - 400: missing file part or zero-byte file
 * - 415: MIME type outside `SUPPORTED_TYPES`, or a service error with
 *        `statusCode === 415`
 * - 413: a service error with `statusCode === 413`
 * - 500: any other service failure (logged at error level)
 *
 * @param request Fastify request; the user id is read from `request.user.sub`.
 * @param reply Fastify reply used to send the response.
 */
async submitJob(
  request: FastifyRequest<{ Body: JobSubmitBody }>,
  reply: FastifyReply
) {
  const userId = (request as any).user?.sub as string;

  logger.info('OCR job submit requested', {
    operation: 'ocr.controller.submitJob',
    userId,
  });

  // Fetch the single multipart file part.
  const upload = await (request as any).file({ limits: { files: 1 } });
  if (!upload) {
    logger.warn('No file provided for OCR job', {
      operation: 'ocr.controller.submitJob.no_file',
      userId,
    });
    return reply.code(400).send({ error: 'Bad Request', message: 'No file provided' });
  }

  // Allow-list check on the declared MIME type.
  const mimeType = upload.mimetype as string;
  if (SUPPORTED_TYPES.has(mimeType) === false) {
    logger.warn('Unsupported file type for OCR job', {
      operation: 'ocr.controller.submitJob.unsupported_type',
      userId,
      contentType: mimeType,
      fileName: upload.filename,
    });
    return reply.code(415).send({
      error: 'Unsupported Media Type',
      message: `Unsupported file type: ${mimeType}. Supported: JPEG, PNG, HEIC, PDF`,
    });
  }

  // Collect the upload stream into a single buffer.
  const blocks: Buffer[] = [];
  for await (const block of upload.file) {
    blocks.push(block);
  }
  const jobPayload = Buffer.concat(blocks);

  if (jobPayload.length === 0) {
    logger.warn('Empty file provided for OCR job', {
      operation: 'ocr.controller.submitJob.empty_file',
      userId,
      fileName: upload.filename,
    });
    return reply.code(400).send({ error: 'Bad Request', message: 'Empty file provided' });
  }

  // Optional callback URL supplied as a form field alongside the file.
  const callbackUrl = upload.fields?.callbackUrl?.value as string | undefined;

  try {
    const job = await ocrService.submitJob(userId, {
      fileBuffer: jobPayload,
      contentType: mimeType,
      callbackUrl,
    });

    logger.info('OCR job submitted', {
      operation: 'ocr.controller.submitJob.success',
      userId,
      jobId: job.jobId,
      status: job.status,
    });

    return reply.code(202).send(job);
  } catch (failure: any) {
    const status = failure.statusCode;

    if (status === 413) {
      return reply.code(413).send({
        error: 'Payload Too Large',
        message: failure.message,
      });
    }
    if (status === 415) {
      return reply.code(415).send({
        error: 'Unsupported Media Type',
        message: failure.message,
      });
    }

    // Unrecognised failures are logged and reported as a generic 500.
    logger.error('OCR job submit failed', {
      operation: 'ocr.controller.submitJob.error',
      userId,
      error: failure.message,
    });

    return reply.code(500).send({
      error: 'Internal Server Error',
      message: 'Job submission failed',
    });
  }
}
|
|
|
|
/**
 * GET /api/ocr/jobs/:jobId
 *
 * Looks up the status of an asynchronous OCR job for the calling user.
 *
 * Responses sent by this handler:
 * - 200: the value returned by `ocrService.getJobStatus`
 * - 410: the service threw an error carrying `statusCode === 410`
 *        (relayed with the service's message; not logged as an error)
 * - 500: any other service failure (logged at error level)
 *
 * @param request Fastify request; `params.jobId` identifies the job and the
 *   user id is read from `request.user.sub`.
 * @param reply Fastify reply used to send the response.
 */
async getJobStatus(
  request: FastifyRequest<{ Params: JobIdParams }>,
  reply: FastifyReply
) {
  const userId = (request as any).user?.sub as string;
  const jobId = request.params.jobId;

  logger.debug('OCR job status requested', {
    operation: 'ocr.controller.getJobStatus',
    userId,
    jobId,
  });

  try {
    const jobStatus = await ocrService.getJobStatus(userId, jobId);
    return reply.code(200).send(jobStatus);
  } catch (err: any) {
    const isGone = err.statusCode === 410;

    if (!isGone) {
      logger.error('OCR job status failed', {
        operation: 'ocr.controller.getJobStatus.error',
        userId,
        jobId,
        error: err.message,
      });

      return reply.code(500).send({
        error: 'Internal Server Error',
        message: 'Failed to retrieve job status',
      });
    }

    // 410 from the service is passed straight through to the client.
    return reply.code(410).send({
      error: 'Gone',
      message: err.message,
    });
  }
}
|
|
}
|