feat: add receipt classifier and OCR integration (refs #157)
- New ReceiptClassifier module with keyword-based classification for fuel vs maintenance receipts from email text and OCR raw text - Classifier-first pipeline: classify from email subject/body keywords before falling back to OCR-based classification - Fuel keywords: gas, fuel, gallons, octane, pump, diesel, unleaded, shell, chevron, exxon, bp - Maintenance keywords: oil change, brake, alignment, tire, rotation, inspection, labor, parts, service, repair, transmission, coolant - Confident classification (>= 2 keyword matches) routes to specific OCR endpoint; unclassified falls back to both endpoints + rawText classification + field-count heuristic Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,7 @@ import { TemplateService } from '../../notifications/domain/template.service';
|
||||
import { EmailService } from '../../notifications/domain/email.service';
|
||||
import { ocrService } from '../../ocr/domain/ocr.service';
|
||||
import type { ReceiptExtractionResponse } from '../../ocr/domain/ocr.types';
|
||||
import { ReceiptClassifier } from './receipt-classifier';
|
||||
import type {
|
||||
ResendWebhookEvent,
|
||||
EmailProcessingResult,
|
||||
@@ -51,6 +52,7 @@ export class EmailIngestionService {
|
||||
private notificationsRepository: NotificationsRepository;
|
||||
private templateService: TemplateService;
|
||||
private emailService: EmailService;
|
||||
private classifier: ReceiptClassifier;
|
||||
|
||||
constructor(dbPool?: Pool) {
|
||||
const p = dbPool || pool;
|
||||
@@ -61,6 +63,7 @@ export class EmailIngestionService {
|
||||
this.notificationsRepository = new NotificationsRepository(p);
|
||||
this.templateService = new TemplateService();
|
||||
this.emailService = new EmailService();
|
||||
this.classifier = new ReceiptClassifier();
|
||||
}
|
||||
|
||||
// ========================
|
||||
@@ -102,28 +105,38 @@ export class EmailIngestionService {
|
||||
return;
|
||||
}
|
||||
|
||||
// 5. Process first valid image attachment through OCR
|
||||
const ocrResult = await this.processAttachmentsWithOcr(userId, validAttachments);
|
||||
// 5. Classify receipt from email text first
|
||||
const emailClassification = this.classifier.classifyFromText(subject, event.data.text);
|
||||
logger.info('Email text classification result', {
|
||||
emailId,
|
||||
type: emailClassification.type,
|
||||
confidence: emailClassification.confidence,
|
||||
});
|
||||
|
||||
// 6. Process attachments through OCR using classification
|
||||
const ocrResult = await this.processAttachmentsWithClassification(
|
||||
userId, validAttachments, emailClassification, emailId
|
||||
);
|
||||
if (!ocrResult) {
|
||||
await this.handleOcrFailure(emailId, senderEmail, userName, subject, 'No receipt data could be extracted from attachments');
|
||||
return;
|
||||
}
|
||||
|
||||
// 6. Build extracted data from OCR result
|
||||
// 7. Build extracted data from OCR result
|
||||
const extractedData = this.mapOcrToExtractedData(ocrResult.response);
|
||||
const recordType = ocrResult.recordType;
|
||||
|
||||
// 7. Handle vehicle association
|
||||
// 8. Handle vehicle association
|
||||
const processingResult = await this.handleVehicleAssociation(
|
||||
userId, userName, senderEmail, recordType, extractedData
|
||||
);
|
||||
|
||||
// 8. Mark as completed
|
||||
// 9. Mark as completed
|
||||
await this.repository.updateQueueStatus(emailId, 'completed', {
|
||||
processingResult,
|
||||
});
|
||||
|
||||
// 9. Send confirmation email
|
||||
// 10. Send confirmation email
|
||||
await this.sendConfirmationEmail(senderEmail, userName, processingResult);
|
||||
|
||||
logger.info('Email processing completed successfully', {
|
||||
@@ -239,51 +252,100 @@ export class EmailIngestionService {
|
||||
// ========================
|
||||
|
||||
/**
|
||||
* Process attachments through OCR, trying fuel then maintenance receipt extraction.
|
||||
* Returns the first successful result.
|
||||
* Process attachments using classifier-driven OCR extraction.
|
||||
* If email text classification is confident, calls the specific OCR endpoint.
|
||||
* If not, performs general OCR and classifies from rawText.
|
||||
* Returns null if no usable result or receipt is unclassified.
|
||||
*/
|
||||
private async processAttachmentsWithOcr(
|
||||
private async processAttachmentsWithClassification(
|
||||
userId: string,
|
||||
attachments: ParsedEmailAttachment[]
|
||||
attachments: ParsedEmailAttachment[],
|
||||
emailClassification: { type: string; confidence: number },
|
||||
emailId: string
|
||||
): Promise<{ response: ReceiptExtractionResponse; recordType: EmailRecordType } | null> {
|
||||
// Process only image attachments that the receipt OCR supports
|
||||
const imageAttachments = attachments.filter(att => OCR_RECEIPT_IMAGE_TYPES.has(att.contentType));
|
||||
|
||||
for (const attachment of imageAttachments) {
|
||||
const result = await this.classifyAndExtract(userId, attachment);
|
||||
if (result) return result;
|
||||
// If email text gave a confident classification, call the specific OCR endpoint first
|
||||
if (emailClassification.type === 'fuel') {
|
||||
const result = await this.extractFuelReceipt(userId, attachment);
|
||||
if (result?.success) return { response: result, recordType: 'fuel_log' };
|
||||
// Fuel OCR failed, try maintenance as fallback
|
||||
const fallbackResult = await this.extractMaintenanceReceipt(userId, attachment);
|
||||
if (fallbackResult?.success) return { response: fallbackResult, recordType: 'maintenance_record' };
|
||||
continue;
|
||||
}
|
||||
|
||||
if (emailClassification.type === 'maintenance') {
|
||||
const result = await this.extractMaintenanceReceipt(userId, attachment);
|
||||
if (result?.success) return { response: result, recordType: 'maintenance_record' };
|
||||
// Maintenance OCR failed, try fuel as fallback
|
||||
const fallbackResult = await this.extractFuelReceipt(userId, attachment);
|
||||
if (fallbackResult?.success) return { response: fallbackResult, recordType: 'fuel_log' };
|
||||
continue;
|
||||
}
|
||||
|
||||
// Email text was not confident - try both OCR endpoints and classify from rawText
|
||||
const fuelResult = await this.extractFuelReceipt(userId, attachment);
|
||||
const maintenanceResult = await this.extractMaintenanceReceipt(userId, attachment);
|
||||
|
||||
// Use rawText from whichever succeeded for secondary classification
|
||||
const rawText = fuelResult?.rawText || maintenanceResult?.rawText || '';
|
||||
if (rawText) {
|
||||
const ocrClassification = this.classifier.classifyFromOcrRawText(rawText);
|
||||
logger.info('OCR rawText classification result', {
|
||||
emailId,
|
||||
type: ocrClassification.type,
|
||||
confidence: ocrClassification.confidence,
|
||||
});
|
||||
|
||||
if (ocrClassification.type === 'fuel' && fuelResult?.success) {
|
||||
return { response: fuelResult, recordType: 'fuel_log' };
|
||||
}
|
||||
if (ocrClassification.type === 'maintenance' && maintenanceResult?.success) {
|
||||
return { response: maintenanceResult, recordType: 'maintenance_record' };
|
||||
}
|
||||
}
|
||||
|
||||
// Both classifiers failed - fall back to field-count heuristic
|
||||
const fallback = this.selectBestResultByFields(fuelResult, maintenanceResult);
|
||||
if (fallback) return fallback;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Try both fuel and maintenance OCR extractors, return the better result
|
||||
* Extract fuel receipt via OCR. Returns null on failure.
|
||||
*/
|
||||
private async classifyAndExtract(
|
||||
private async extractFuelReceipt(
|
||||
userId: string,
|
||||
attachment: ParsedEmailAttachment
|
||||
): Promise<{ response: ReceiptExtractionResponse; recordType: EmailRecordType } | null> {
|
||||
let fuelResult: ReceiptExtractionResponse | null = null;
|
||||
let maintenanceResult: ReceiptExtractionResponse | null = null;
|
||||
|
||||
// Try fuel receipt extraction
|
||||
): Promise<ReceiptExtractionResponse | null> {
|
||||
try {
|
||||
fuelResult = await ocrService.extractReceipt(userId, {
|
||||
return await ocrService.extractReceipt(userId, {
|
||||
fileBuffer: attachment.content,
|
||||
contentType: attachment.contentType,
|
||||
receiptType: 'fuel',
|
||||
});
|
||||
} catch (error) {
|
||||
logger.info('Fuel receipt extraction failed, trying maintenance', {
|
||||
logger.info('Fuel receipt extraction failed', {
|
||||
filename: attachment.filename,
|
||||
error: error instanceof Error ? error.message : String(error),
|
||||
});
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// Try maintenance receipt extraction
|
||||
/**
|
||||
* Extract maintenance receipt via OCR. Returns null on failure.
|
||||
*/
|
||||
private async extractMaintenanceReceipt(
|
||||
userId: string,
|
||||
attachment: ParsedEmailAttachment
|
||||
): Promise<ReceiptExtractionResponse | null> {
|
||||
try {
|
||||
maintenanceResult = await ocrService.extractMaintenanceReceipt(userId, {
|
||||
return await ocrService.extractMaintenanceReceipt(userId, {
|
||||
fileBuffer: attachment.content,
|
||||
contentType: attachment.contentType,
|
||||
});
|
||||
@@ -292,16 +354,15 @@ export class EmailIngestionService {
|
||||
filename: attachment.filename,
|
||||
error: error instanceof Error ? error.message : String(error),
|
||||
});
|
||||
return null;
|
||||
}
|
||||
|
||||
// Compare results and pick the best one
|
||||
return this.selectBestResult(fuelResult, maintenanceResult);
|
||||
}
|
||||
|
||||
/**
|
||||
* Select the better OCR result based on extracted field count and success
|
||||
* Last-resort fallback: select the better OCR result based on domain-specific
|
||||
* fields and field count when keyword classifiers could not decide.
|
||||
*/
|
||||
private selectBestResult(
|
||||
private selectBestResultByFields(
|
||||
fuelResult: ReceiptExtractionResponse | null,
|
||||
maintenanceResult: ReceiptExtractionResponse | null
|
||||
): { response: ReceiptExtractionResponse; recordType: EmailRecordType } | null {
|
||||
@@ -316,7 +377,6 @@ export class EmailIngestionService {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Check for fuel-specific fields to improve classification
|
||||
const hasFuelFields = fuelResult?.extractedFields['gallons'] ||
|
||||
fuelResult?.extractedFields['price_per_gallon'] ||
|
||||
fuelResult?.extractedFields['fuel_type'];
|
||||
@@ -325,7 +385,6 @@ export class EmailIngestionService {
|
||||
maintenanceResult?.extractedFields['shop_name'] ||
|
||||
maintenanceResult?.extractedFields['description'];
|
||||
|
||||
// Prefer the result with domain-specific fields
|
||||
if (hasFuelFields && !hasMaintenanceFields) {
|
||||
return { response: fuelResult!, recordType: 'fuel_log' };
|
||||
}
|
||||
@@ -333,7 +392,6 @@ export class EmailIngestionService {
|
||||
return { response: maintenanceResult!, recordType: 'maintenance_record' };
|
||||
}
|
||||
|
||||
// Fall back to field count comparison
|
||||
if (fuelFieldCount >= maintenanceFieldCount && fuelResult?.success) {
|
||||
return { response: fuelResult, recordType: 'fuel_log' };
|
||||
}
|
||||
|
||||
@@ -13,6 +13,17 @@ export type PendingAssociationStatus = 'pending' | 'resolved' | 'expired';
|
||||
|
||||
export type EmailRecordType = 'fuel_log' | 'maintenance_record';
|
||||
|
||||
// ========================
|
||||
// Receipt Classification
|
||||
// ========================
|
||||
|
||||
export type ReceiptClassificationType = 'fuel' | 'maintenance' | 'unclassified';
|
||||
|
||||
export interface ClassificationResult {
|
||||
type: ReceiptClassificationType;
|
||||
confidence: number;
|
||||
}
|
||||
|
||||
// ========================
|
||||
// Database Records
|
||||
// ========================
|
||||
|
||||
@@ -0,0 +1,130 @@
|
||||
/**
|
||||
* @ai-summary Classifies receipt type from email text or OCR raw text
|
||||
* @ai-context Uses keyword matching to determine fuel vs maintenance receipts
|
||||
* before falling back to OCR-based classification. Returns confidence score.
|
||||
*/
|
||||
|
||||
import { logger } from '../../../core/logging/logger';
|
||||
import type { ClassificationResult, ReceiptClassificationType } from './email-ingestion.types';
|
||||
|
||||
/** Fuel-related keywords (case-insensitive matching) */
|
||||
const FUEL_KEYWORDS: string[] = [
|
||||
'gas',
|
||||
'fuel',
|
||||
'gallons',
|
||||
'octane',
|
||||
'pump',
|
||||
'diesel',
|
||||
'unleaded',
|
||||
'shell',
|
||||
'chevron',
|
||||
'exxon',
|
||||
'bp',
|
||||
];
|
||||
|
||||
/** Maintenance-related keywords (case-insensitive matching). Multi-word entries matched as phrases. */
|
||||
const MAINTENANCE_KEYWORDS: string[] = [
|
||||
'oil change',
|
||||
'brake',
|
||||
'alignment',
|
||||
'tire',
|
||||
'rotation',
|
||||
'inspection',
|
||||
'labor',
|
||||
'parts',
|
||||
'service',
|
||||
'repair',
|
||||
'transmission',
|
||||
'coolant',
|
||||
];
|
||||
|
||||
/** Minimum keyword matches required for a confident classification */
|
||||
const CONFIDENCE_THRESHOLD = 2;
|
||||
|
||||
export class ReceiptClassifier {
|
||||
/**
|
||||
* Classify receipt type from email subject and body text.
|
||||
* Returns a confident result if >= 2 keyword matches for one type.
|
||||
*/
|
||||
classifyFromText(subject: string | null, body: string | null): ClassificationResult {
|
||||
const text = [subject || '', body || ''].join(' ');
|
||||
return this.classifyText(text, 'email');
|
||||
}
|
||||
|
||||
/**
|
||||
* Classify receipt type from OCR raw text output.
|
||||
* Uses same keyword matching as email text classification.
|
||||
*/
|
||||
classifyFromOcrRawText(rawText: string): ClassificationResult {
|
||||
return this.classifyText(rawText, 'ocr');
|
||||
}
|
||||
|
||||
/**
|
||||
* Core keyword matching logic shared by email and OCR classification.
|
||||
*/
|
||||
private classifyText(text: string, source: 'email' | 'ocr'): ClassificationResult {
|
||||
const normalizedText = text.toLowerCase();
|
||||
|
||||
const fuelMatches = this.countKeywordMatches(normalizedText, FUEL_KEYWORDS);
|
||||
const maintenanceMatches = this.countKeywordMatches(normalizedText, MAINTENANCE_KEYWORDS);
|
||||
|
||||
logger.info('Receipt classification keyword analysis', {
|
||||
source,
|
||||
fuelMatches,
|
||||
maintenanceMatches,
|
||||
textLength: text.length,
|
||||
});
|
||||
|
||||
// Both below threshold - unclassified
|
||||
if (fuelMatches < CONFIDENCE_THRESHOLD && maintenanceMatches < CONFIDENCE_THRESHOLD) {
|
||||
return { type: 'unclassified', confidence: 0 };
|
||||
}
|
||||
|
||||
// Clear winner with threshold met
|
||||
if (fuelMatches >= CONFIDENCE_THRESHOLD && fuelMatches > maintenanceMatches) {
|
||||
return {
|
||||
type: 'fuel',
|
||||
confidence: Math.min(fuelMatches / (fuelMatches + maintenanceMatches), 1),
|
||||
};
|
||||
}
|
||||
|
||||
if (maintenanceMatches >= CONFIDENCE_THRESHOLD && maintenanceMatches > fuelMatches) {
|
||||
return {
|
||||
type: 'maintenance',
|
||||
confidence: Math.min(maintenanceMatches / (fuelMatches + maintenanceMatches), 1),
|
||||
};
|
||||
}
|
||||
|
||||
// Tie with both meeting threshold - unclassified (ambiguous)
|
||||
if (fuelMatches >= CONFIDENCE_THRESHOLD && maintenanceMatches >= CONFIDENCE_THRESHOLD) {
|
||||
return { type: 'unclassified', confidence: 0 };
|
||||
}
|
||||
|
||||
return { type: 'unclassified', confidence: 0 };
|
||||
}
|
||||
|
||||
/**
|
||||
* Count how many keywords from the list appear in the text.
|
||||
* Multi-word keywords are matched as phrases.
|
||||
*/
|
||||
private countKeywordMatches(normalizedText: string, keywords: string[]): number {
|
||||
let matches = 0;
|
||||
for (const keyword of keywords) {
|
||||
if (normalizedText.includes(keyword)) {
|
||||
matches++;
|
||||
}
|
||||
}
|
||||
return matches;
|
||||
}
|
||||
|
||||
/**
|
||||
* Map classifier type to the EmailRecordType used in the processing pipeline.
|
||||
*/
|
||||
static toRecordType(classificationType: ReceiptClassificationType): 'fuel_log' | 'maintenance_record' | null {
|
||||
switch (classificationType) {
|
||||
case 'fuel': return 'fuel_log';
|
||||
case 'maintenance': return 'maintenance_record';
|
||||
case 'unclassified': return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -6,14 +6,17 @@
|
||||
export { emailIngestionWebhookRoutes } from './api/email-ingestion.routes';
|
||||
export { EmailIngestionService } from './domain/email-ingestion.service';
|
||||
export { EmailIngestionRepository } from './data/email-ingestion.repository';
|
||||
export { ReceiptClassifier } from './domain/receipt-classifier';
|
||||
export { ResendInboundClient } from './external/resend-inbound.client';
|
||||
export type { ParsedEmailResult, ParsedEmailAttachment } from './external/resend-inbound.client';
|
||||
export type {
|
||||
ClassificationResult,
|
||||
EmailIngestionQueueRecord,
|
||||
EmailIngestionStatus,
|
||||
EmailProcessingResult,
|
||||
ExtractedReceiptData,
|
||||
PendingVehicleAssociation,
|
||||
ReceiptClassificationType,
|
||||
ResendWebhookEvent,
|
||||
ResendWebhookEventData,
|
||||
} from './domain/email-ingestion.types';
|
||||
|
||||
Reference in New Issue
Block a user