From a281cea9c5673bb2fdc91b343da4b62a4a1ea067 Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Wed, 11 Feb 2026 10:37:18 -0600 Subject: [PATCH] feat: add backend OCR manual proxy endpoint (refs #135) Add POST /api/ocr/extract/manual endpoint that proxies to the Python OCR service's manual extraction pipeline. Includes Pro tier gating via document.scanMaintenanceSchedule, PDF-only validation, 200MB file size limit, and async 202 job response for polling via existing job status endpoint. Co-Authored-By: Claude Opus 4.6 --- .../src/features/ocr/api/ocr.controller.ts | 106 +++++++++ backend/src/features/ocr/api/ocr.routes.ts | 6 + .../src/features/ocr/domain/ocr.service.ts | 62 +++++ backend/src/features/ocr/domain/ocr.types.ts | 46 ++++ .../src/features/ocr/external/ocr-client.ts | 57 ++++- .../ocr/tests/unit/ocr-manual.test.ts | 213 ++++++++++++++++++ 6 files changed, 489 insertions(+), 1 deletion(-) create mode 100644 backend/src/features/ocr/tests/unit/ocr-manual.test.ts diff --git a/backend/src/features/ocr/api/ocr.controller.ts b/backend/src/features/ocr/api/ocr.controller.ts index c7c61af..fa1053b 100644 --- a/backend/src/features/ocr/api/ocr.controller.ts +++ b/backend/src/features/ocr/api/ocr.controller.ts @@ -336,6 +336,112 @@ export class OcrController { } } + /** + * POST /api/ocr/extract/manual + * Submit an async manual extraction job for PDF owner's manuals. + * Requires Pro tier (document.scanMaintenanceSchedule). 
+ */ + async extractManual( + request: FastifyRequest, + reply: FastifyReply + ) { + const userId = (request as any).user?.sub as string; + + logger.info('Manual extract requested', { + operation: 'ocr.controller.extractManual', + userId, + }); + + const file = await (request as any).file({ limits: { files: 1 } }); + if (!file) { + logger.warn('No file provided for manual extraction', { + operation: 'ocr.controller.extractManual.no_file', + userId, + }); + return reply.code(400).send({ + error: 'Bad Request', + message: 'No file provided', + }); + } + + const contentType = file.mimetype as string; + if (contentType !== 'application/pdf') { + logger.warn('Non-PDF file provided for manual extraction', { + operation: 'ocr.controller.extractManual.not_pdf', + userId, + contentType, + fileName: file.filename, + }); + return reply.code(400).send({ + error: 'Bad Request', + message: `Manual extraction requires PDF files. Received: ${contentType}`, + }); + } + + const chunks: Buffer[] = []; + for await (const chunk of file.file) { + chunks.push(chunk); + } + const fileBuffer = Buffer.concat(chunks); + + if (fileBuffer.length === 0) { + logger.warn('Empty file provided for manual extraction', { + operation: 'ocr.controller.extractManual.empty_file', + userId, + fileName: file.filename, + }); + return reply.code(400).send({ + error: 'Bad Request', + message: 'Empty file provided', + }); + } + + // Get optional vehicle_id from form fields + const vehicleId = file.fields?.vehicle_id?.value as string | undefined; + + try { + const result = await ocrService.submitManualJob(userId, { + fileBuffer, + contentType, + vehicleId, + }); + + logger.info('Manual extract job submitted', { + operation: 'ocr.controller.extractManual.success', + userId, + jobId: result.jobId, + status: result.status, + estimatedSeconds: result.estimatedSeconds, + }); + + return reply.code(202).send(result); + } catch (error: any) { + if (error.statusCode === 413) { + return reply.code(413).send({ + error: 
'Payload Too Large', + message: error.message, + }); + } + if (error.statusCode === 400) { + return reply.code(400).send({ + error: 'Bad Request', + message: error.message, + }); + } + + logger.error('Manual extract failed', { + operation: 'ocr.controller.extractManual.error', + userId, + error: error.message, + }); + + return reply.code(500).send({ + error: 'Internal Server Error', + message: 'Manual extraction submission failed', + }); + } + } + /** * POST /api/ocr/jobs * Submit an async OCR job for large files. diff --git a/backend/src/features/ocr/api/ocr.routes.ts b/backend/src/features/ocr/api/ocr.routes.ts index 67b25d7..f64685b 100644 --- a/backend/src/features/ocr/api/ocr.routes.ts +++ b/backend/src/features/ocr/api/ocr.routes.ts @@ -29,6 +29,12 @@ export const ocrRoutes: FastifyPluginAsync = async ( handler: ctrl.extractReceipt.bind(ctrl), }); + // POST /api/ocr/extract/manual - Manual extraction (Pro tier required) + fastify.post('/ocr/extract/manual', { + preHandler: [requireAuth, fastify.requireTier({ featureKey: 'document.scanMaintenanceSchedule' })], + handler: ctrl.extractManual.bind(ctrl), + }); + // POST /api/ocr/jobs - Submit async OCR job fastify.post('/ocr/jobs', { preHandler: [requireAuth], diff --git a/backend/src/features/ocr/domain/ocr.service.ts b/backend/src/features/ocr/domain/ocr.service.ts index fd95d36..5c2af9f 100644 --- a/backend/src/features/ocr/domain/ocr.service.ts +++ b/backend/src/features/ocr/domain/ocr.service.ts @@ -5,6 +5,8 @@ import { logger } from '../../../core/logging/logger'; import { ocrClient, JobNotFoundError } from '../external/ocr-client'; import type { JobResponse, + ManualJobResponse, + ManualJobSubmitRequest, OcrExtractRequest, OcrJobSubmitRequest, OcrResponse, @@ -278,6 +280,66 @@ export class OcrService { } } + /** + * Submit an async manual extraction job for PDF owner's manuals. 
+ * + * @param userId - User ID for logging + * @param request - Manual job submission request + * @returns Manual job response with job ID + */ + async submitManualJob(userId: string, request: ManualJobSubmitRequest): Promise<ManualJobResponse> { + // Validate file size for async processing (200MB max) + if (request.fileBuffer.length > MAX_ASYNC_SIZE) { + const err: any = new Error( + `File too large. Max: ${MAX_ASYNC_SIZE / (1024 * 1024)}MB.` + ); + err.statusCode = 413; + throw err; + } + + // Manual extraction only supports PDF + if (request.contentType !== 'application/pdf') { + const err: any = new Error( + `Unsupported file type: ${request.contentType}. Manual extraction requires PDF files.` + ); + err.statusCode = 400; + throw err; + } + + logger.info('Manual job submit requested', { + operation: 'ocr.service.submitManualJob', + userId, + contentType: request.contentType, + fileSize: request.fileBuffer.length, + hasVehicleId: !!request.vehicleId, + }); + + try { + const result = await ocrClient.submitManualJob( + request.fileBuffer, + request.contentType, + request.vehicleId + ); + + logger.info('Manual job submitted', { + operation: 'ocr.service.submitManualJob.success', + userId, + jobId: result.jobId, + status: result.status, + estimatedSeconds: result.estimatedSeconds, + }); + + return result; + } catch (error) { + logger.error('Manual job submit failed', { + operation: 'ocr.service.submitManualJob.error', + userId, + error: error instanceof Error ? error.message : 'Unknown error', + }); + throw error; + } + } + /** * Get the status of an async OCR job. 
* diff --git a/backend/src/features/ocr/domain/ocr.types.ts b/backend/src/features/ocr/domain/ocr.types.ts index 7ec5c15..9209962 100644 --- a/backend/src/features/ocr/domain/ocr.types.ts +++ b/backend/src/features/ocr/domain/ocr.types.ts @@ -79,3 +79,49 @@ export interface OcrJobSubmitRequest { contentType: string; callbackUrl?: string; } + +/** Request to submit a manual extraction job */ +export interface ManualJobSubmitRequest { + fileBuffer: Buffer; + contentType: string; + vehicleId?: string; +} + +/** Vehicle info extracted from a manual */ +export interface ManualVehicleInfo { + make: string | null; + model: string | null; + year: number | null; +} + +/** A single maintenance schedule item extracted from a manual */ +export interface MaintenanceScheduleItem { + service: string; + intervalMiles: number | null; + intervalMonths: number | null; + details: string | null; + confidence: number; + subtypes: string[]; +} + +/** Result of manual extraction (nested in ManualJobResponse.result) */ +export interface ManualExtractionResult { + success: boolean; + vehicleInfo: ManualVehicleInfo; + maintenanceSchedules: MaintenanceScheduleItem[]; + rawTables: unknown[]; + processingTimeMs: number; + totalPages: number; + pagesProcessed: number; + error: string | null; +} + +/** Response for async manual extraction job */ +export interface ManualJobResponse { + jobId: string; + status: JobStatus; + progress?: number; + estimatedSeconds?: number; + result?: ManualExtractionResult; + error?: string; +} diff --git a/backend/src/features/ocr/external/ocr-client.ts b/backend/src/features/ocr/external/ocr-client.ts index 8388c1c..a4b453a 100644 --- a/backend/src/features/ocr/external/ocr-client.ts +++ b/backend/src/features/ocr/external/ocr-client.ts @@ -2,7 +2,7 @@ * @ai-summary HTTP client for OCR service communication */ import { logger } from '../../../core/logging/logger'; -import type { JobResponse, OcrResponse, ReceiptExtractionResponse, VinExtractionResponse } from 
'../domain/ocr.types'; +import type { JobResponse, ManualJobResponse, OcrResponse, ReceiptExtractionResponse, VinExtractionResponse } from '../domain/ocr.types'; /** OCR service configuration */ const OCR_SERVICE_URL = process.env.OCR_SERVICE_URL || 'http://mvp-ocr:8000'; @@ -265,6 +265,61 @@ export class OcrClient { return (await response.json()) as JobResponse; } + /** + * Submit an async manual extraction job for PDF owner's manuals. + * + * @param fileBuffer - PDF file buffer + * @param contentType - MIME type of the file (must be application/pdf) + * @param vehicleId - Optional vehicle ID for context + * @returns Manual job submission response + */ + async submitManualJob( + fileBuffer: Buffer, + contentType: string, + vehicleId?: string + ): Promise<ManualJobResponse> { + const formData = this.buildFormData(fileBuffer, contentType); + if (vehicleId) { + formData.append('vehicle_id', vehicleId); + } + + const url = `${this.baseUrl}/extract/manual`; + + logger.info('OCR manual job submit request', { + operation: 'ocr.client.submitManualJob', + url, + contentType, + fileSize: fileBuffer.length, + hasVehicleId: !!vehicleId, + }); + + const response = await this.fetchWithTimeout(url, { + method: 'POST', + body: formData, + }); + + if (!response.ok) { + const errorText = await response.text(); + logger.error('OCR manual job submit failed', { + operation: 'ocr.client.submitManualJob.error', + status: response.status, + error: errorText, + }); + throw new Error(`OCR service error: ${response.status} - ${errorText}`); + } + + const result = (await response.json()) as ManualJobResponse; + + logger.info('OCR manual job submitted', { + operation: 'ocr.client.submitManualJob.success', + jobId: result.jobId, + status: result.status, + estimatedSeconds: result.estimatedSeconds, + }); + + return result; + } + /** * Check if the OCR service is healthy. 
* diff --git a/backend/src/features/ocr/tests/unit/ocr-manual.test.ts b/backend/src/features/ocr/tests/unit/ocr-manual.test.ts new file mode 100644 index 0000000..10b497d --- /dev/null +++ b/backend/src/features/ocr/tests/unit/ocr-manual.test.ts @@ -0,0 +1,213 @@ +/** + * @ai-summary Unit tests for OCR manual extraction endpoint + */ + +import { OcrService } from '../../domain/ocr.service'; +import { ocrClient } from '../../external/ocr-client'; +import type { ManualJobResponse } from '../../domain/ocr.types'; + +jest.mock('../../external/ocr-client'); +jest.mock('../../../../core/logging/logger'); + +const mockSubmitManualJob = ocrClient.submitManualJob as jest.MockedFunction< + typeof ocrClient.submitManualJob +>; + +describe('OcrService.submitManualJob', () => { + let service: OcrService; + + const userId = 'test-user-id'; + + const mockManualJobResponse: ManualJobResponse = { + jobId: 'manual-job-123', + status: 'pending', + progress: 0, + estimatedSeconds: 45, + result: undefined, + error: undefined, + }; + + const mockCompletedJobResponse: ManualJobResponse = { + jobId: 'manual-job-123', + status: 'completed', + progress: 100, + result: { + success: true, + vehicleInfo: { + make: 'Honda', + model: 'Civic', + year: 2023, + }, + maintenanceSchedules: [ + { + service: 'Engine Oil Change', + intervalMiles: 5000, + intervalMonths: 6, + details: 'Use 0W-20 full synthetic oil', + confidence: 0.95, + subtypes: ['oil_change'], + }, + { + service: 'Tire Rotation', + intervalMiles: 7500, + intervalMonths: 6, + details: null, + confidence: 0.90, + subtypes: ['tire_rotation'], + }, + ], + rawTables: [], + processingTimeMs: 45000, + totalPages: 120, + pagesProcessed: 120, + error: null, + }, + error: undefined, + }; + + beforeEach(() => { + jest.clearAllMocks(); + service = new OcrService(); + }); + + describe('valid manual job submission', () => { + it('should return 202-style response with jobId for PDF submission', async () => { + 
mockSubmitManualJob.mockResolvedValue(mockManualJobResponse); + + const result = await service.submitManualJob(userId, { + fileBuffer: Buffer.from('fake-pdf-data'), + contentType: 'application/pdf', + }); + + expect(result.jobId).toBe('manual-job-123'); + expect(result.status).toBe('pending'); + expect(result.progress).toBe(0); + expect(result.estimatedSeconds).toBe(45); + expect(result.result).toBeUndefined(); + }); + + it('should pass vehicleId to client when provided', async () => { + mockSubmitManualJob.mockResolvedValue(mockManualJobResponse); + + await service.submitManualJob(userId, { + fileBuffer: Buffer.from('fake-pdf-data'), + contentType: 'application/pdf', + vehicleId: 'vehicle-abc', + }); + + expect(mockSubmitManualJob).toHaveBeenCalledWith( + expect.any(Buffer), + 'application/pdf', + 'vehicle-abc' + ); + }); + + it('should call client without vehicleId when not provided', async () => { + mockSubmitManualJob.mockResolvedValue(mockManualJobResponse); + + await service.submitManualJob(userId, { + fileBuffer: Buffer.from('fake-pdf-data'), + contentType: 'application/pdf', + }); + + expect(mockSubmitManualJob).toHaveBeenCalledWith( + expect.any(Buffer), + 'application/pdf', + undefined + ); + }); + }); + + describe('completed job result', () => { + it('should return completed result with maintenanceSchedules', async () => { + mockSubmitManualJob.mockResolvedValue(mockCompletedJobResponse); + + const result = await service.submitManualJob(userId, { + fileBuffer: Buffer.from('fake-pdf-data'), + contentType: 'application/pdf', + }); + + expect(result.status).toBe('completed'); + expect(result.result).toBeDefined(); + expect(result.result!.success).toBe(true); + expect(result.result!.maintenanceSchedules).toHaveLength(2); + expect(result.result!.maintenanceSchedules[0].service).toBe('Engine Oil Change'); + expect(result.result!.maintenanceSchedules[0].intervalMiles).toBe(5000); + expect(result.result!.maintenanceSchedules[0].subtypes).toEqual(['oil_change']); 
+ expect(result.result!.vehicleInfo.make).toBe('Honda'); + }); + }); + + describe('error handling', () => { + it('should throw 400 for non-PDF file (JPEG)', async () => { + await expect( + service.submitManualJob(userId, { + fileBuffer: Buffer.from('fake-image-data'), + contentType: 'image/jpeg', + }) + ).rejects.toMatchObject({ + statusCode: 400, + }); + }); + + it('should throw 400 for non-PDF file (PNG)', async () => { + await expect( + service.submitManualJob(userId, { + fileBuffer: Buffer.from('fake-image-data'), + contentType: 'image/png', + }) + ).rejects.toMatchObject({ + statusCode: 400, + }); + }); + + it('should throw 400 for text/plain', async () => { + await expect( + service.submitManualJob(userId, { + fileBuffer: Buffer.from('not a pdf'), + contentType: 'text/plain', + }) + ).rejects.toMatchObject({ + statusCode: 400, + }); + }); + + it('should throw 413 for oversized file', async () => { + const largeBuffer = Buffer.alloc(201 * 1024 * 1024); // 201MB + + await expect( + service.submitManualJob(userId, { + fileBuffer: largeBuffer, + contentType: 'application/pdf', + }) + ).rejects.toMatchObject({ + statusCode: 413, + }); + }); + + it('should accept file at 200MB boundary', async () => { + mockSubmitManualJob.mockResolvedValue(mockManualJobResponse); + const exactBuffer = Buffer.alloc(200 * 1024 * 1024); // exactly 200MB + + const result = await service.submitManualJob(userId, { + fileBuffer: exactBuffer, + contentType: 'application/pdf', + }); + + expect(result.jobId).toBe('manual-job-123'); + }); + + it('should propagate OCR service errors', async () => { + mockSubmitManualJob.mockRejectedValue( + new Error('OCR service error: 500 - Internal error') + ); + + await expect( + service.submitManualJob(userId, { + fileBuffer: Buffer.from('fake-pdf-data'), + contentType: 'application/pdf', + }) + ).rejects.toThrow('OCR service error: 500 - Internal error'); + }); + }); +});