feat: Expand OCR with fuel receipt scanning and maintenance extraction (#129) #147

Merged
egullickson merged 26 commits from issue-129-expand-ocr-fuel-receipt-maintenance into main 2026-02-13 02:25:55 +00:00
6 changed files with 489 additions and 1 deletion
Showing only changes of commit a281cea9c5 - Show all commits

View File

@@ -336,6 +336,112 @@ export class OcrController {
} }
} }
/**
 * POST /api/ocr/extract/manual
 * Submit an async manual extraction job for PDF owner's manuals.
 * Requires Pro tier (document.scanMaintenanceSchedule).
 *
 * Responses sent by this handler:
 * - 202 with the job submission result when the job is accepted
 * - 400 when no file is attached, the file is not a PDF, or the file is empty
 * - 413 when the upload was truncated or the service rejects the file size
 * - 500 for any other failure while submitting the job
 *
 * @param request - Multipart request; the first file part is treated as the
 *   PDF, and an optional `vehicle_id` form field is read from the fields
 *   attached to that part
 * @param reply - Fastify reply used to send the JSON response
 */
async extractManual(
  request: FastifyRequest,
  reply: FastifyReply
) {
  const userId = (request as any).user?.sub as string;

  logger.info('Manual extract requested', {
    operation: 'ocr.controller.extractManual',
    userId,
  });

  const file = await (request as any).file({ limits: { files: 1 } });
  if (!file) {
    logger.warn('No file provided for manual extraction', {
      operation: 'ocr.controller.extractManual.no_file',
      userId,
    });
    return reply.code(400).send({
      error: 'Bad Request',
      message: 'No file provided',
    });
  }

  const contentType = file.mimetype as string;
  if (contentType !== 'application/pdf') {
    logger.warn('Non-PDF file provided for manual extraction', {
      operation: 'ocr.controller.extractManual.not_pdf',
      userId,
      contentType,
      fileName: file.filename,
    });
    return reply.code(400).send({
      error: 'Bad Request',
      message: `Manual extraction requires PDF files. Received: ${contentType}`,
    });
  }

  // Accumulate the whole upload in memory; the service needs a single Buffer.
  const chunks: Buffer[] = [];
  for await (const chunk of file.file) {
    chunks.push(chunk);
  }
  const fileBuffer = Buffer.concat(chunks);

  // The multipart parser flags the stream as `truncated` when it stops reading
  // at the configured file-size limit. A partial PDF must not be forwarded to
  // OCR. NOTE(review): when the plugin is configured to throw on the limit the
  // read loop above rejects instead and this branch is never reached.
  if (file.file.truncated) {
    logger.warn('Truncated file provided for manual extraction', {
      operation: 'ocr.controller.extractManual.truncated_file',
      userId,
      fileName: file.filename,
      receivedBytes: fileBuffer.length,
    });
    return reply.code(413).send({
      error: 'Payload Too Large',
      message: 'File exceeds the maximum upload size',
    });
  }

  if (fileBuffer.length === 0) {
    logger.warn('Empty file provided for manual extraction', {
      operation: 'ocr.controller.extractManual.empty_file',
      userId,
      fileName: file.filename,
    });
    return reply.code(400).send({
      error: 'Bad Request',
      message: 'Empty file provided',
    });
  }

  // Get optional vehicle_id from form fields
  const vehicleId = file.fields?.vehicle_id?.value as string | undefined;

  try {
    const result = await ocrService.submitManualJob(userId, {
      fileBuffer,
      contentType,
      vehicleId,
    });

    logger.info('Manual extract job submitted', {
      operation: 'ocr.controller.extractManual.success',
      userId,
      jobId: result.jobId,
      status: result.status,
      estimatedSeconds: result.estimatedSeconds,
    });

    return reply.code(202).send(result);
  } catch (error: unknown) {
    // Service validation errors carry an HTTP `statusCode`. Narrow defensively
    // so that a non-object throwable cannot raise a second error in here.
    const failure = (typeof error === 'object' && error !== null ? error : {}) as {
      statusCode?: unknown;
      message?: unknown;
    };
    const message = typeof failure.message === 'string' ? failure.message : String(error);

    if (failure.statusCode === 413) {
      return reply.code(413).send({
        error: 'Payload Too Large',
        message,
      });
    }

    if (failure.statusCode === 400) {
      return reply.code(400).send({
        error: 'Bad Request',
        message,
      });
    }

    logger.error('Manual extract failed', {
      operation: 'ocr.controller.extractManual.error',
      userId,
      error: message,
    });

    // Internal details stay in the log; the client gets a generic message.
    return reply.code(500).send({
      error: 'Internal Server Error',
      message: 'Manual extraction submission failed',
    });
  }
}
/** /**
* POST /api/ocr/jobs * POST /api/ocr/jobs
* Submit an async OCR job for large files. * Submit an async OCR job for large files.

View File

@@ -29,6 +29,12 @@ export const ocrRoutes: FastifyPluginAsync = async (
handler: ctrl.extractReceipt.bind(ctrl), handler: ctrl.extractReceipt.bind(ctrl),
}); });
// Manual (owner's manual PDF) extraction: POST /api/ocr/extract/manual.
// Runs the auth check first, then the tier gate for the
// 'document.scanMaintenanceSchedule' feature (Pro tier required).
fastify.post('/ocr/extract/manual', {
  preHandler: [
    requireAuth,
    fastify.requireTier({ featureKey: 'document.scanMaintenanceSchedule' }),
  ],
  handler: ctrl.extractManual.bind(ctrl),
});
// POST /api/ocr/jobs - Submit async OCR job // POST /api/ocr/jobs - Submit async OCR job
fastify.post('/ocr/jobs', { fastify.post('/ocr/jobs', {
preHandler: [requireAuth], preHandler: [requireAuth],

View File

@@ -5,6 +5,8 @@ import { logger } from '../../../core/logging/logger';
import { ocrClient, JobNotFoundError } from '../external/ocr-client'; import { ocrClient, JobNotFoundError } from '../external/ocr-client';
import type { import type {
JobResponse, JobResponse,
ManualJobResponse,
ManualJobSubmitRequest,
OcrExtractRequest, OcrExtractRequest,
OcrJobSubmitRequest, OcrJobSubmitRequest,
OcrResponse, OcrResponse,
@@ -278,6 +280,66 @@ export class OcrService {
} }
} }
/**
 * Submit an async manual extraction job for PDF owner's manuals.
 *
 * The request is validated before the OCR client is called: the size limit is
 * checked first, then the content type. Failures from the OCR client are
 * logged and rethrown unchanged.
 *
 * @param userId - User ID for logging
 * @param request - Manual job submission request
 * @returns Manual job response with job ID
 * @throws Error with `statusCode` 413 when the buffer is larger than MAX_ASYNC_SIZE
 * @throws Error with `statusCode` 400 when the content type is not `application/pdf`
 */
async submitManualJob(userId: string, request: ManualJobSubmitRequest): Promise<ManualJobResponse> {
  const { fileBuffer, contentType, vehicleId } = request;

  // Validate file size against the async-processing limit.
  if (fileBuffer.length > MAX_ASYNC_SIZE) {
    // `statusCode` is attached so the HTTP layer can map this to a response.
    throw Object.assign(
      new Error(`File too large. Max: ${MAX_ASYNC_SIZE / (1024 * 1024)}MB.`),
      { statusCode: 413 }
    );
  }

  // Manual extraction only supports PDF
  if (contentType !== 'application/pdf') {
    throw Object.assign(
      new Error(
        `Unsupported file type: ${contentType}. Manual extraction requires PDF files.`
      ),
      { statusCode: 400 }
    );
  }

  logger.info('Manual job submit requested', {
    operation: 'ocr.service.submitManualJob',
    userId,
    contentType,
    fileSize: fileBuffer.length,
    hasVehicleId: !!vehicleId,
  });

  try {
    const result = await ocrClient.submitManualJob(fileBuffer, contentType, vehicleId);

    logger.info('Manual job submitted', {
      operation: 'ocr.service.submitManualJob.success',
      userId,
      jobId: result.jobId,
      status: result.status,
      estimatedSeconds: result.estimatedSeconds,
    });

    return result;
  } catch (error) {
    logger.error('Manual job submit failed', {
      operation: 'ocr.service.submitManualJob.error',
      userId,
      error: error instanceof Error ? error.message : 'Unknown error',
    });
    // Rethrow untouched so callers see the original failure.
    throw error;
  }
}
/** /**
* Get the status of an async OCR job. * Get the status of an async OCR job.
* *

View File

@@ -79,3 +79,49 @@ export interface OcrJobSubmitRequest {
contentType: string; contentType: string;
callbackUrl?: string; callbackUrl?: string;
} }
/**
 * Request to submit a manual extraction job.
 *
 * Passed from the HTTP layer to the OCR service layer for one uploaded file.
 */
export interface ManualJobSubmitRequest {
/** Complete contents of the uploaded file. */
fileBuffer: Buffer;
/** MIME type reported for the upload; manual extraction accepts only `application/pdf`. */
contentType: string;
/** Optional vehicle ID sent along with the file as context. */
vehicleId?: string;
}
/**
 * Vehicle info extracted from a manual.
 *
 * Every field is nullable; presumably `null` means the value could not be
 * determined from the document (TODO confirm against the OCR service).
 */
export interface ManualVehicleInfo {
/** Vehicle make, or null. */
make: string | null;
/** Vehicle model, or null. */
model: string | null;
/** Model year as a number, or null. */
year: number | null;
}
/** A single maintenance schedule item extracted from a manual */
export interface MaintenanceScheduleItem {
/** Name of the maintenance service. */
service: string;
/** Service interval in miles, or null when none is given. */
intervalMiles: number | null;
/** Service interval in months, or null when none is given. */
intervalMonths: number | null;
/** Free-text details for the service, or null. */
details: string | null;
/** Extraction confidence score. NOTE(review): presumably in the range 0-1 — confirm. */
confidence: number;
/** Subtype identifiers for this item. NOTE(review): the set of valid values is not defined here. */
subtypes: string[];
}
/** Result of manual extraction (nested in ManualJobResponse.result) */
export interface ManualExtractionResult {
/** Whether the extraction succeeded. */
success: boolean;
/** Vehicle identification read from the manual. */
vehicleInfo: ManualVehicleInfo;
/** Maintenance schedule items found in the manual. */
maintenanceSchedules: MaintenanceScheduleItem[];
/** Raw table data; the element shape is intentionally untyped here. */
rawTables: unknown[];
/** Processing time in milliseconds. */
processingTimeMs: number;
/** Total number of pages in the document. */
totalPages: number;
/** Number of pages that were processed. */
pagesProcessed: number;
/** Error message, or null. NOTE(review): presumably null whenever `success` is true — confirm. */
error: string | null;
}
/**
 * Response for async manual extraction job.
 *
 * Only `jobId` and `status` are always present; the remaining fields are optional.
 */
export interface ManualJobResponse {
/** Identifier of the submitted job. */
jobId: string;
/** Current job state. */
status: JobStatus;
/** Job progress. NOTE(review): presumably a 0-100 percentage — confirm. */
progress?: number;
/** Estimated time for the job, in seconds. */
estimatedSeconds?: number;
/** Extraction output. NOTE(review): presumably set only once the job has completed — confirm. */
result?: ManualExtractionResult;
/** Job-level error message, if any. */
error?: string;
}

View File

@@ -2,7 +2,7 @@
* @ai-summary HTTP client for OCR service communication * @ai-summary HTTP client for OCR service communication
*/ */
import { logger } from '../../../core/logging/logger'; import { logger } from '../../../core/logging/logger';
import type { JobResponse, OcrResponse, ReceiptExtractionResponse, VinExtractionResponse } from '../domain/ocr.types'; import type { JobResponse, ManualJobResponse, OcrResponse, ReceiptExtractionResponse, VinExtractionResponse } from '../domain/ocr.types';
/** OCR service configuration */ /** OCR service configuration */
const OCR_SERVICE_URL = process.env.OCR_SERVICE_URL || 'http://mvp-ocr:8000'; const OCR_SERVICE_URL = process.env.OCR_SERVICE_URL || 'http://mvp-ocr:8000';
@@ -265,6 +265,61 @@ export class OcrClient {
return (await response.json()) as JobResponse; return (await response.json()) as JobResponse;
} }
/**
 * Send a PDF owner's manual to the OCR service for asynchronous extraction.
 *
 * @param fileBuffer - PDF file buffer
 * @param contentType - MIME type of the file (must be application/pdf)
 * @param vehicleId - Optional vehicle ID; added as the `vehicle_id` form field when non-empty
 * @returns Manual job submission response parsed from the JSON body
 * @throws Error containing the status and response text when `response.ok` is false
 */
async submitManualJob(
  fileBuffer: Buffer,
  contentType: string,
  vehicleId?: string
): Promise<ManualJobResponse> {
  const payload = this.buildFormData(fileBuffer, contentType);
  if (vehicleId) {
    payload.append('vehicle_id', vehicleId);
  }

  const endpoint = `${this.baseUrl}/extract/manual`;

  logger.info('OCR manual job submit request', {
    operation: 'ocr.client.submitManualJob',
    url: endpoint,
    contentType,
    fileSize: fileBuffer.length,
    hasVehicleId: !!vehicleId,
  });

  const response = await this.fetchWithTimeout(endpoint, {
    method: 'POST',
    body: payload,
  });

  // Success path first: parse, log and hand back the job descriptor.
  if (response.ok) {
    const job = (await response.json()) as ManualJobResponse;

    logger.info('OCR manual job submitted', {
      operation: 'ocr.client.submitManualJob.success',
      jobId: job.jobId,
      status: job.status,
      estimatedSeconds: job.estimatedSeconds,
    });

    return job;
  }

  // Failure path: surface the upstream status and body text to the caller.
  const errorText = await response.text();
  logger.error('OCR manual job submit failed', {
    operation: 'ocr.client.submitManualJob.error',
    status: response.status,
    error: errorText,
  });
  throw new Error(`OCR service error: ${response.status} - ${errorText}`);
}
/** /**
* Check if the OCR service is healthy. * Check if the OCR service is healthy.
* *

View File

@@ -0,0 +1,213 @@
/**
* @ai-summary Unit tests for OCR manual extraction endpoint
*/
import { OcrService } from '../../domain/ocr.service';
import { ocrClient } from '../../external/ocr-client';
import type { ManualJobResponse } from '../../domain/ocr.types';
// Replace the OCR client module with Jest's automock so no test calls the
// real OCR service.
jest.mock('../../external/ocr-client');
// Automock the logger as well, so logging calls made by the service are
// mocked functions.
jest.mock('../../../../core/logging/logger');
// Typed handle on the mocked client method, used to program return values
// and to assert on the arguments the service passes through.
const mockSubmitManualJob = ocrClient.submitManualJob as jest.MockedFunction<
typeof ocrClient.submitManualJob
>;
describe('OcrService.submitManualJob', () => {
  const userId = 'test-user-id';
  const pdfMime = 'application/pdf';
  const bytesPerMb = 1024 * 1024;

  let service: OcrService;

  // Response for a freshly queued job: no result and no error yet.
  const pendingJob: ManualJobResponse = {
    jobId: 'manual-job-123',
    status: 'pending',
    progress: 0,
    estimatedSeconds: 45,
    result: undefined,
    error: undefined,
  };

  // Response for a finished job carrying two extracted schedule items.
  const completedJob: ManualJobResponse = {
    jobId: 'manual-job-123',
    status: 'completed',
    progress: 100,
    result: {
      success: true,
      vehicleInfo: {
        make: 'Honda',
        model: 'Civic',
        year: 2023,
      },
      maintenanceSchedules: [
        {
          service: 'Engine Oil Change',
          intervalMiles: 5000,
          intervalMonths: 6,
          details: 'Use 0W-20 full synthetic oil',
          confidence: 0.95,
          subtypes: ['oil_change'],
        },
        {
          service: 'Tire Rotation',
          intervalMiles: 7500,
          intervalMonths: 6,
          details: null,
          confidence: 0.90,
          subtypes: ['tire_rotation'],
        },
      ],
      rawTables: [],
      processingTimeMs: 45000,
      totalPages: 120,
      pagesProcessed: 120,
      error: null,
    },
    error: undefined,
  };

  // Small stand-in payload used wherever the file contents are irrelevant.
  const fakePdf = () => Buffer.from('fake-pdf-data');

  // Runs one submission through the service under test (PDF unless overridden).
  const submit = (fileBuffer: Buffer, contentType: string = pdfMime) =>
    service.submitManualJob(userId, { fileBuffer, contentType });

  // Asserts that a submission rejects with an error carrying the given HTTP status.
  const expectRejectedWithStatus = (submission: Promise<unknown>, statusCode: number) =>
    expect(submission).rejects.toMatchObject({ statusCode });

  beforeEach(() => {
    jest.clearAllMocks();
    service = new OcrService();
  });

  describe('valid manual job submission', () => {
    it('should return 202-style response with jobId for PDF submission', async () => {
      mockSubmitManualJob.mockResolvedValue(pendingJob);

      const job = await submit(fakePdf());

      expect(job.jobId).toBe('manual-job-123');
      expect(job.status).toBe('pending');
      expect(job.progress).toBe(0);
      expect(job.estimatedSeconds).toBe(45);
      expect(job.result).toBeUndefined();
    });

    it('should pass vehicleId to client when provided', async () => {
      mockSubmitManualJob.mockResolvedValue(pendingJob);

      await service.submitManualJob(userId, {
        fileBuffer: fakePdf(),
        contentType: pdfMime,
        vehicleId: 'vehicle-abc',
      });

      expect(mockSubmitManualJob).toHaveBeenCalledWith(
        expect.any(Buffer),
        'application/pdf',
        'vehicle-abc'
      );
    });

    it('should call client without vehicleId when not provided', async () => {
      mockSubmitManualJob.mockResolvedValue(pendingJob);

      await submit(fakePdf());

      expect(mockSubmitManualJob).toHaveBeenCalledWith(
        expect.any(Buffer),
        'application/pdf',
        undefined
      );
    });
  });

  describe('completed job result', () => {
    it('should return completed result with maintenanceSchedules', async () => {
      mockSubmitManualJob.mockResolvedValue(completedJob);

      const job = await submit(fakePdf());

      expect(job.status).toBe('completed');
      expect(job.result).toBeDefined();

      const extraction = job.result!;
      const firstSchedule = extraction.maintenanceSchedules[0];
      expect(extraction.success).toBe(true);
      expect(extraction.maintenanceSchedules).toHaveLength(2);
      expect(firstSchedule.service).toBe('Engine Oil Change');
      expect(firstSchedule.intervalMiles).toBe(5000);
      expect(firstSchedule.subtypes).toEqual(['oil_change']);
      expect(extraction.vehicleInfo.make).toBe('Honda');
    });
  });

  describe('error handling', () => {
    it('should throw 400 for non-PDF file (JPEG)', async () => {
      await expectRejectedWithStatus(
        submit(Buffer.from('fake-image-data'), 'image/jpeg'),
        400
      );
    });

    it('should throw 400 for non-PDF file (PNG)', async () => {
      await expectRejectedWithStatus(
        submit(Buffer.from('fake-image-data'), 'image/png'),
        400
      );
    });

    it('should throw 400 for text/plain', async () => {
      await expectRejectedWithStatus(
        submit(Buffer.from('not a pdf'), 'text/plain'),
        400
      );
    });

    it('should throw 413 for oversized file', async () => {
      const oversized = Buffer.alloc(201 * bytesPerMb); // 201MB

      await expectRejectedWithStatus(submit(oversized), 413);
    });

    it('should accept file at 200MB boundary', async () => {
      mockSubmitManualJob.mockResolvedValue(pendingJob);
      const atLimit = Buffer.alloc(200 * bytesPerMb); // exactly 200MB

      const job = await submit(atLimit);

      expect(job.jobId).toBe('manual-job-123');
    });

    it('should propagate OCR service errors', async () => {
      mockSubmitManualJob.mockRejectedValue(
        new Error('OCR service error: 500 - Internal error')
      );

      await expect(submit(fakePdf())).rejects.toThrow(
        'OCR service error: 500 - Internal error'
      );
    });
  });
});