From d9a40f7d379b7587ff72808ec8ffdc359475aa74 Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Fri, 13 Feb 2026 08:44:03 -0600 Subject: [PATCH] feat: add receipt classifier and OCR integration (refs #157) - New ReceiptClassifier module with keyword-based classification for fuel vs maintenance receipts from email text and OCR raw text - Classifier-first pipeline: classify from email subject/body keywords before falling back to OCR-based classification - Fuel keywords: gas, fuel, gallons, octane, pump, diesel, unleaded, shell, chevron, exxon, bp - Maintenance keywords: oil change, brake, alignment, tire, rotation, inspection, labor, parts, service, repair, transmission, coolant - Confident classification (>= 2 keyword matches) routes to specific OCR endpoint; unclassified falls back to both endpoints + rawText classification + field-count heuristic Co-Authored-By: Claude Opus 4.6 --- .../domain/email-ingestion.service.ts | 122 +++++++++++----- .../domain/email-ingestion.types.ts | 11 ++ .../domain/receipt-classifier.ts | 130 ++++++++++++++++++ backend/src/features/email-ingestion/index.ts | 3 + 4 files changed, 234 insertions(+), 32 deletions(-) create mode 100644 backend/src/features/email-ingestion/domain/receipt-classifier.ts diff --git a/backend/src/features/email-ingestion/domain/email-ingestion.service.ts b/backend/src/features/email-ingestion/domain/email-ingestion.service.ts index 05e4ed9..17d70dd 100644 --- a/backend/src/features/email-ingestion/domain/email-ingestion.service.ts +++ b/backend/src/features/email-ingestion/domain/email-ingestion.service.ts @@ -16,6 +16,7 @@ import { TemplateService } from '../../notifications/domain/template.service'; import { EmailService } from '../../notifications/domain/email.service'; import { ocrService } from '../../ocr/domain/ocr.service'; import type { ReceiptExtractionResponse } from '../../ocr/domain/ocr.types'; +import { ReceiptClassifier } from './receipt-classifier'; import type { ResendWebhookEvent, EmailProcessingResult, @@ -51,6 +52,7 @@ export class EmailIngestionService { private notificationsRepository: NotificationsRepository; private templateService: TemplateService; private emailService: EmailService; + private classifier: ReceiptClassifier; constructor(dbPool?: Pool) { const p = dbPool || pool; @@ -61,6 +63,7 @@ export class EmailIngestionService { this.notificationsRepository = new NotificationsRepository(p); this.templateService = new TemplateService(); this.emailService = new EmailService(); + this.classifier = new ReceiptClassifier(); } // ======================== @@ -102,28 +105,38 @@ export class EmailIngestionService { return; } - // 5. Process first valid image attachment through OCR - const ocrResult = await this.processAttachmentsWithOcr(userId, validAttachments); + // 5. Classify receipt from email text first + const emailClassification = this.classifier.classifyFromText(subject, event.data.text); + logger.info('Email text classification result', { + emailId, + type: emailClassification.type, + confidence: emailClassification.confidence, + }); + + // 6. Process attachments through OCR using classification + const ocrResult = await this.processAttachmentsWithClassification( + userId, validAttachments, emailClassification, emailId + ); if (!ocrResult) { await this.handleOcrFailure(emailId, senderEmail, userName, subject, 'No receipt data could be extracted from attachments'); return; } - // 6. Build extracted data from OCR result + // 7. Build extracted data from OCR result const extractedData = this.mapOcrToExtractedData(ocrResult.response); const recordType = ocrResult.recordType; - // 7. Handle vehicle association + // 8. Handle vehicle association const processingResult = await this.handleVehicleAssociation( userId, userName, senderEmail, recordType, extractedData ); - // 8. Mark as completed + // 9. Mark as completed await this.repository.updateQueueStatus(emailId, 'completed', { processingResult, }); - // 9. Send confirmation email + // 10. Send confirmation email await this.sendConfirmationEmail(senderEmail, userName, processingResult); logger.info('Email processing completed successfully', { @@ -239,51 +252,100 @@ export class EmailIngestionService { // ======================== /** - * Process attachments through OCR, trying fuel then maintenance receipt extraction. - * Returns the first successful result. + * Process attachments using classifier-driven OCR extraction. + * If email text classification is confident, calls the specific OCR endpoint. + * If not, performs general OCR and classifies from rawText. + * Returns null if no usable result or receipt is unclassified. */ - private async processAttachmentsWithOcr( + private async processAttachmentsWithClassification( userId: string, - attachments: ParsedEmailAttachment[] + attachments: ParsedEmailAttachment[], + emailClassification: { type: string; confidence: number }, + emailId: string ): Promise<{ response: ReceiptExtractionResponse; recordType: EmailRecordType } | null> { - // Process only image attachments that the receipt OCR supports const imageAttachments = attachments.filter(att => OCR_RECEIPT_IMAGE_TYPES.has(att.contentType)); for (const attachment of imageAttachments) { - const result = await this.classifyAndExtract(userId, attachment); - if (result) return result; + // If email text gave a confident classification, call the specific OCR endpoint first + if (emailClassification.type === 'fuel') { + const result = await this.extractFuelReceipt(userId, attachment); + if (result?.success) return { response: result, recordType: 'fuel_log' }; + // Fuel OCR failed, try maintenance as fallback + const fallbackResult = await this.extractMaintenanceReceipt(userId, attachment); + if (fallbackResult?.success) return { response: fallbackResult, recordType: 'maintenance_record' }; + continue; + } + + if (emailClassification.type === 'maintenance') { + const result = await this.extractMaintenanceReceipt(userId, attachment); + if (result?.success) return { response: result, recordType: 'maintenance_record' }; + // Maintenance OCR failed, try fuel as fallback + const fallbackResult = await this.extractFuelReceipt(userId, attachment); + if (fallbackResult?.success) return { response: fallbackResult, recordType: 'fuel_log' }; + continue; + } + + // Email text was not confident - try both OCR endpoints and classify from rawText + const fuelResult = await this.extractFuelReceipt(userId, attachment); + const maintenanceResult = await this.extractMaintenanceReceipt(userId, attachment); + + // Use rawText from whichever succeeded for secondary classification + const rawText = fuelResult?.rawText || maintenanceResult?.rawText || ''; + if (rawText) { + const ocrClassification = this.classifier.classifyFromOcrRawText(rawText); + logger.info('OCR rawText classification result', { + emailId, + type: ocrClassification.type, + confidence: ocrClassification.confidence, + }); + + if (ocrClassification.type === 'fuel' && fuelResult?.success) { + return { response: fuelResult, recordType: 'fuel_log' }; + } + if (ocrClassification.type === 'maintenance' && maintenanceResult?.success) { + return { response: maintenanceResult, recordType: 'maintenance_record' }; + } + } + + // Both classifiers failed - fall back to field-count heuristic + const fallback = this.selectBestResultByFields(fuelResult, maintenanceResult); + if (fallback) return fallback; } return null; } /** - * Try both fuel and maintenance OCR extractors, return the better result + * Extract fuel receipt via OCR. Returns null on failure. */ - private async classifyAndExtract( + private async extractFuelReceipt( userId: string, attachment: ParsedEmailAttachment - ): Promise<{ response: ReceiptExtractionResponse; recordType: EmailRecordType } | null> { - let fuelResult: ReceiptExtractionResponse | null = null; - let maintenanceResult: ReceiptExtractionResponse | null = null; - - // Try fuel receipt extraction + ): Promise { try { - fuelResult = await ocrService.extractReceipt(userId, { + return await ocrService.extractReceipt(userId, { fileBuffer: attachment.content, contentType: attachment.contentType, receiptType: 'fuel', }); } catch (error) { - logger.info('Fuel receipt extraction failed, trying maintenance', { + logger.info('Fuel receipt extraction failed', { filename: attachment.filename, error: error instanceof Error ? error.message : String(error), }); + return null; } + } - // Try maintenance receipt extraction + /** + * Extract maintenance receipt via OCR. Returns null on failure. + */ + private async extractMaintenanceReceipt( + userId: string, + attachment: ParsedEmailAttachment + ): Promise { try { - maintenanceResult = await ocrService.extractMaintenanceReceipt(userId, { + return await ocrService.extractMaintenanceReceipt(userId, { fileBuffer: attachment.content, contentType: attachment.contentType, }); @@ -292,16 +354,15 @@ export class EmailIngestionService { filename: attachment.filename, error: error instanceof Error ? error.message : String(error), }); + return null; } - - // Compare results and pick the best one - return this.selectBestResult(fuelResult, maintenanceResult); } /** - * Select the better OCR result based on extracted field count and success + * Last-resort fallback: select the better OCR result based on domain-specific + * fields and field count when keyword classifiers could not decide. */ - private selectBestResult( + private selectBestResultByFields( fuelResult: ReceiptExtractionResponse | null, maintenanceResult: ReceiptExtractionResponse | null ): { response: ReceiptExtractionResponse; recordType: EmailRecordType } | null { @@ -316,7 +377,6 @@ export class EmailIngestionService { return null; } - // Check for fuel-specific fields to improve classification const hasFuelFields = fuelResult?.extractedFields['gallons'] || fuelResult?.extractedFields['price_per_gallon'] || fuelResult?.extractedFields['fuel_type']; @@ -325,7 +385,6 @@ export class EmailIngestionService { maintenanceResult?.extractedFields['shop_name'] || maintenanceResult?.extractedFields['description']; - // Prefer the result with domain-specific fields if (hasFuelFields && !hasMaintenanceFields) { return { response: fuelResult!, recordType: 'fuel_log' }; } @@ -333,7 +392,6 @@ export class EmailIngestionService { return { response: maintenanceResult!, recordType: 'maintenance_record' }; } - // Fall back to field count comparison if (fuelFieldCount >= maintenanceFieldCount && fuelResult?.success) { return { response: fuelResult, recordType: 'fuel_log' }; } diff --git a/backend/src/features/email-ingestion/domain/email-ingestion.types.ts b/backend/src/features/email-ingestion/domain/email-ingestion.types.ts index 30792ad..dc267ca 100644 --- a/backend/src/features/email-ingestion/domain/email-ingestion.types.ts +++ b/backend/src/features/email-ingestion/domain/email-ingestion.types.ts @@ -13,6 +13,17 @@ export type PendingAssociationStatus = 'pending' | 'resolved' | 'expired'; export type EmailRecordType = 'fuel_log' | 'maintenance_record'; +// ======================== +// Receipt Classification +// ======================== + +export type ReceiptClassificationType = 'fuel' | 'maintenance' | 'unclassified'; + +export interface ClassificationResult { + type: ReceiptClassificationType; + confidence: number; +} + // ======================== // Database Records // ======================== diff --git a/backend/src/features/email-ingestion/domain/receipt-classifier.ts b/backend/src/features/email-ingestion/domain/receipt-classifier.ts new file mode 100644 index 0000000..417f495 --- /dev/null +++ b/backend/src/features/email-ingestion/domain/receipt-classifier.ts @@ -0,0 +1,130 @@ +/** + * @ai-summary Classifies receipt type from email text or OCR raw text + * @ai-context Uses keyword matching to determine fuel vs maintenance receipts + * before falling back to OCR-based classification. Returns confidence score. + */ + +import { logger } from '../../../core/logging/logger'; +import type { ClassificationResult, ReceiptClassificationType } from './email-ingestion.types'; + +/** Fuel-related keywords (case-insensitive matching) */ +const FUEL_KEYWORDS: string[] = [ + 'gas', + 'fuel', + 'gallons', + 'octane', + 'pump', + 'diesel', + 'unleaded', + 'shell', + 'chevron', + 'exxon', + 'bp', +]; + +/** Maintenance-related keywords (case-insensitive matching). Multi-word entries matched as phrases. */ +const MAINTENANCE_KEYWORDS: string[] = [ + 'oil change', + 'brake', + 'alignment', + 'tire', + 'rotation', + 'inspection', + 'labor', + 'parts', + 'service', + 'repair', + 'transmission', + 'coolant', +]; + +/** Minimum keyword matches required for a confident classification */ +const CONFIDENCE_THRESHOLD = 2; + +export class ReceiptClassifier { + /** + * Classify receipt type from email subject and body text. + * Returns a confident result if >= 2 keyword matches for one type. + */ + classifyFromText(subject: string | null, body: string | null): ClassificationResult { + const text = [subject || '', body || ''].join(' '); + return this.classifyText(text, 'email'); + } + + /** + * Classify receipt type from OCR raw text output. + * Uses same keyword matching as email text classification. + */ + classifyFromOcrRawText(rawText: string): ClassificationResult { + return this.classifyText(rawText, 'ocr'); + } + + /** + * Core keyword matching logic shared by email and OCR classification. + */ + private classifyText(text: string, source: 'email' | 'ocr'): ClassificationResult { + const normalizedText = text.toLowerCase(); + + const fuelMatches = this.countKeywordMatches(normalizedText, FUEL_KEYWORDS); + const maintenanceMatches = this.countKeywordMatches(normalizedText, MAINTENANCE_KEYWORDS); + + logger.info('Receipt classification keyword analysis', { + source, + fuelMatches, + maintenanceMatches, + textLength: text.length, + }); + + // Both below threshold - unclassified + if (fuelMatches < CONFIDENCE_THRESHOLD && maintenanceMatches < CONFIDENCE_THRESHOLD) { + return { type: 'unclassified', confidence: 0 }; + } + + // Clear winner with threshold met + if (fuelMatches >= CONFIDENCE_THRESHOLD && fuelMatches > maintenanceMatches) { + return { + type: 'fuel', + confidence: Math.min(fuelMatches / (fuelMatches + maintenanceMatches), 1), + }; + } + + if (maintenanceMatches >= CONFIDENCE_THRESHOLD && maintenanceMatches > fuelMatches) { + return { + type: 'maintenance', + confidence: Math.min(maintenanceMatches / (fuelMatches + maintenanceMatches), 1), + }; + } + + // Tie with both meeting threshold - unclassified (ambiguous) + if (fuelMatches >= CONFIDENCE_THRESHOLD && maintenanceMatches >= CONFIDENCE_THRESHOLD) { + return { type: 'unclassified', confidence: 0 }; + } + + return { type: 'unclassified', confidence: 0 }; + } + + /** + * Count how many keywords from the list appear in the text. + * Multi-word keywords are matched as phrases. + */ + private countKeywordMatches(normalizedText: string, keywords: string[]): number { + let matches = 0; + for (const keyword of keywords) { + if (normalizedText.includes(keyword)) { + matches++; + } + } + return matches; + } + + /** + * Map classifier type to the EmailRecordType used in the processing pipeline. + */ + static toRecordType(classificationType: ReceiptClassificationType): 'fuel_log' | 'maintenance_record' | null { + switch (classificationType) { + case 'fuel': return 'fuel_log'; + case 'maintenance': return 'maintenance_record'; + case 'unclassified': return null; + } + } +} diff --git a/backend/src/features/email-ingestion/index.ts b/backend/src/features/email-ingestion/index.ts index e2b1a03..d7781d2 100644 --- a/backend/src/features/email-ingestion/index.ts +++ b/backend/src/features/email-ingestion/index.ts @@ -6,14 +6,17 @@ export { emailIngestionWebhookRoutes } from './api/email-ingestion.routes'; export { EmailIngestionService } from './domain/email-ingestion.service'; export { EmailIngestionRepository } from './data/email-ingestion.repository'; +export { ReceiptClassifier } from './domain/receipt-classifier'; export { ResendInboundClient } from './external/resend-inbound.client'; export type { ParsedEmailResult, ParsedEmailAttachment } from './external/resend-inbound.client'; export type { + ClassificationResult, EmailIngestionQueueRecord, EmailIngestionStatus, EmailProcessingResult, ExtractedReceiptData, PendingVehicleAssociation, + ReceiptClassificationType, ResendWebhookEvent, ResendWebhookEventData, } from './domain/email-ingestion.types';