feat: add receipt classifier and OCR integration (refs #157)

- New ReceiptClassifier module with keyword-based classification for fuel vs maintenance receipts from email text and OCR raw text - Classifier-first pipeline: classify from email subject/body keywords before falling back to OCR-based classification - Fuel keywords: gas, fuel, gallons, octane, pump, diesel, unleaded, shell, chevron, exxon, bp - Maintenance keywords: oil change, brake, alignment, tire, rotation, inspection, labor, parts, service, repair, transmission, coolant - Confident classification (>= 2 keyword matches) routes to specific OCR endpoint; unclassified falls back to both endpoints + rawText classification + field-count heuristic Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 08:44:03 -06:00
parent e7f3728771
commit d9a40f7d37
4 changed files with 234 additions and 32 deletions
--- a/backend/src/features/email-ingestion/domain/email-ingestion.service.ts
+++ b/backend/src/features/email-ingestion/domain/email-ingestion.service.ts
@@ -16,6 +16,7 @@ import { TemplateService } from '../../notifications/domain/template.service';
 import { EmailService } from '../../notifications/domain/email.service';
 import { ocrService } from '../../ocr/domain/ocr.service';
 import type { ReceiptExtractionResponse } from '../../ocr/domain/ocr.types';
+import { ReceiptClassifier } from './receipt-classifier';
 import type {
  ResendWebhookEvent,
  EmailProcessingResult,
@@ -51,6 +52,7 @@ export class EmailIngestionService {
  private notificationsRepository: NotificationsRepository;
  private templateService: TemplateService;
  private emailService: EmailService;
+  private classifier: ReceiptClassifier;

  constructor(dbPool?: Pool) {
    const p = dbPool || pool;
@@ -61,6 +63,7 @@ export class EmailIngestionService {
    this.notificationsRepository = new NotificationsRepository(p);
    this.templateService = new TemplateService();
    this.emailService = new EmailService();
+    this.classifier = new ReceiptClassifier();
  }

  // ========================
@@ -102,28 +105,38 @@ export class EmailIngestionService {
        return;
      }

-      // 5. Process first valid image attachment through OCR
-      const ocrResult = await this.processAttachmentsWithOcr(userId, validAttachments);
+      // 5. Classify receipt from email text first
+      const emailClassification = this.classifier.classifyFromText(subject, event.data.text);
+      logger.info('Email text classification result', {
+        emailId,
+        type: emailClassification.type,
+        confidence: emailClassification.confidence,
+      });
+
+      // 6. Process attachments through OCR using classification
+      const ocrResult = await this.processAttachmentsWithClassification(
+        userId, validAttachments, emailClassification, emailId
+      );
      if (!ocrResult) {
        await this.handleOcrFailure(emailId, senderEmail, userName, subject, 'No receipt data could be extracted from attachments');
        return;
      }

-      // 6. Build extracted data from OCR result
+      // 7. Build extracted data from OCR result
      const extractedData = this.mapOcrToExtractedData(ocrResult.response);
      const recordType = ocrResult.recordType;

-      // 7. Handle vehicle association
+      // 8. Handle vehicle association
      const processingResult = await this.handleVehicleAssociation(
        userId, userName, senderEmail, recordType, extractedData
      );

-      // 8. Mark as completed
+      // 9. Mark as completed
      await this.repository.updateQueueStatus(emailId, 'completed', {
        processingResult,
      });

-      // 9. Send confirmation email
+      // 10. Send confirmation email
      await this.sendConfirmationEmail(senderEmail, userName, processingResult);

      logger.info('Email processing completed successfully', {
@@ -239,51 +252,100 @@ export class EmailIngestionService {
  // ========================

  /**
-   * Process attachments through OCR, trying fuel then maintenance receipt extraction.
-   * Returns the first successful result.
+   * Process attachments using classifier-driven OCR extraction.
+   * If email text classification is confident, calls the specific OCR endpoint.
+   * If not, performs general OCR and classifies from rawText.
+   * Returns null if no usable result or receipt is unclassified.
   */
-  private async processAttachmentsWithOcr(
+  private async processAttachmentsWithClassification(
    userId: string,
-    attachments: ParsedEmailAttachment[]
+    attachments: ParsedEmailAttachment[],
+    emailClassification: { type: string; confidence: number },
+    emailId: string
  ): Promise<{ response: ReceiptExtractionResponse; recordType: EmailRecordType } | null> {
-    // Process only image attachments that the receipt OCR supports
    const imageAttachments = attachments.filter(att => OCR_RECEIPT_IMAGE_TYPES.has(att.contentType));

    for (const attachment of imageAttachments) {
-      const result = await this.classifyAndExtract(userId, attachment);
-      if (result) return result;
+      // If email text gave a confident classification, call the specific OCR endpoint first
+      if (emailClassification.type === 'fuel') {
+        const result = await this.extractFuelReceipt(userId, attachment);
+        if (result?.success) return { response: result, recordType: 'fuel_log' };
+        // Fuel OCR failed, try maintenance as fallback
+        const fallbackResult = await this.extractMaintenanceReceipt(userId, attachment);
+        if (fallbackResult?.success) return { response: fallbackResult, recordType: 'maintenance_record' };
+        continue;
+      }
+
+      if (emailClassification.type === 'maintenance') {
+        const result = await this.extractMaintenanceReceipt(userId, attachment);
+        if (result?.success) return { response: result, recordType: 'maintenance_record' };
+        // Maintenance OCR failed, try fuel as fallback
+        const fallbackResult = await this.extractFuelReceipt(userId, attachment);
+        if (fallbackResult?.success) return { response: fallbackResult, recordType: 'fuel_log' };
+        continue;
+      }
+
+      // Email text was not confident - try both OCR endpoints and classify from rawText
+      const fuelResult = await this.extractFuelReceipt(userId, attachment);
+      const maintenanceResult = await this.extractMaintenanceReceipt(userId, attachment);
+
+      // Use rawText from whichever succeeded for secondary classification
+      const rawText = fuelResult?.rawText || maintenanceResult?.rawText || '';
+      if (rawText) {
+        const ocrClassification = this.classifier.classifyFromOcrRawText(rawText);
+        logger.info('OCR rawText classification result', {
+          emailId,
+          type: ocrClassification.type,
+          confidence: ocrClassification.confidence,
+        });
+
+        if (ocrClassification.type === 'fuel' && fuelResult?.success) {
+          return { response: fuelResult, recordType: 'fuel_log' };
+        }
+        if (ocrClassification.type === 'maintenance' && maintenanceResult?.success) {
+          return { response: maintenanceResult, recordType: 'maintenance_record' };
+        }
+      }
+
+      // Both classifiers failed - fall back to field-count heuristic
+      const fallback = this.selectBestResultByFields(fuelResult, maintenanceResult);
+      if (fallback) return fallback;
    }

    return null;
  }

  /**
-   * Try both fuel and maintenance OCR extractors, return the better result
+   * Extract fuel receipt via OCR. Returns null on failure.
   */
-  private async classifyAndExtract(
+  private async extractFuelReceipt(
    userId: string,
    attachment: ParsedEmailAttachment
-  ): Promise<{ response: ReceiptExtractionResponse; recordType: EmailRecordType } | null> {
-    let fuelResult: ReceiptExtractionResponse | null = null;
-    let maintenanceResult: ReceiptExtractionResponse | null = null;
-
-    // Try fuel receipt extraction
+  ): Promise<ReceiptExtractionResponse | null> {
    try {
-      fuelResult = await ocrService.extractReceipt(userId, {
+      return await ocrService.extractReceipt(userId, {
        fileBuffer: attachment.content,
        contentType: attachment.contentType,
        receiptType: 'fuel',
      });
    } catch (error) {
-      logger.info('Fuel receipt extraction failed, trying maintenance', {
+      logger.info('Fuel receipt extraction failed', {
        filename: attachment.filename,
        error: error instanceof Error ? error.message : String(error),
      });
+      return null;
    }
+  }

-    // Try maintenance receipt extraction
+  /**
+   * Extract maintenance receipt via OCR. Returns null on failure.
+   */
+  private async extractMaintenanceReceipt(
+    userId: string,
+    attachment: ParsedEmailAttachment
+  ): Promise<ReceiptExtractionResponse | null> {
    try {
-      maintenanceResult = await ocrService.extractMaintenanceReceipt(userId, {
+      return await ocrService.extractMaintenanceReceipt(userId, {
        fileBuffer: attachment.content,
        contentType: attachment.contentType,
      });
@@ -292,16 +354,15 @@ export class EmailIngestionService {
        filename: attachment.filename,
        error: error instanceof Error ? error.message : String(error),
      });
+      return null;
    }
-
-    // Compare results and pick the best one
-    return this.selectBestResult(fuelResult, maintenanceResult);
  }

  /**
-   * Select the better OCR result based on extracted field count and success
+   * Last-resort fallback: select the better OCR result based on domain-specific
+   * fields and field count when keyword classifiers could not decide.
   */
-  private selectBestResult(
+  private selectBestResultByFields(
    fuelResult: ReceiptExtractionResponse | null,
    maintenanceResult: ReceiptExtractionResponse | null
  ): { response: ReceiptExtractionResponse; recordType: EmailRecordType } | null {
@@ -316,7 +377,6 @@ export class EmailIngestionService {
      return null;
    }

-    // Check for fuel-specific fields to improve classification
    const hasFuelFields = fuelResult?.extractedFields['gallons'] ||
      fuelResult?.extractedFields['price_per_gallon'] ||
      fuelResult?.extractedFields['fuel_type'];
@@ -325,7 +385,6 @@ export class EmailIngestionService {
      maintenanceResult?.extractedFields['shop_name'] ||
      maintenanceResult?.extractedFields['description'];

-    // Prefer the result with domain-specific fields
    if (hasFuelFields && !hasMaintenanceFields) {
      return { response: fuelResult!, recordType: 'fuel_log' };
    }
@@ -333,7 +392,6 @@ export class EmailIngestionService {
      return { response: maintenanceResult!, recordType: 'maintenance_record' };
    }

-    // Fall back to field count comparison
    if (fuelFieldCount >= maintenanceFieldCount && fuelResult?.success) {
      return { response: fuelResult, recordType: 'fuel_log' };
    }
--- a/backend/src/features/email-ingestion/domain/email-ingestion.types.ts
+++ b/backend/src/features/email-ingestion/domain/email-ingestion.types.ts
@@ -13,6 +13,17 @@ export type PendingAssociationStatus = 'pending' | 'resolved' | 'expired';

 export type EmailRecordType = 'fuel_log' | 'maintenance_record';

+// ========================
+// Receipt Classification
+// ========================
+
+export type ReceiptClassificationType = 'fuel' | 'maintenance' | 'unclassified';
+
+export interface ClassificationResult {
+  type: ReceiptClassificationType;
+  confidence: number;
+}
+
 // ========================
 // Database Records
 // ========================
--- a/backend/src/features/email-ingestion/domain/receipt-classifier.ts
+++ b/backend/src/features/email-ingestion/domain/receipt-classifier.ts
@@ -0,0 +1,130 @@
+/**
+ * @ai-summary Classifies receipt type from email text or OCR raw text
+ * @ai-context Uses keyword matching to determine fuel vs maintenance receipts
+ *   before falling back to OCR-based classification. Returns confidence score.
+ */
+
+import { logger } from '../../../core/logging/logger';
+import type { ClassificationResult, ReceiptClassificationType } from './email-ingestion.types';
+
+/** Fuel-related keywords (case-insensitive matching) */
+const FUEL_KEYWORDS: string[] = [
+  'gas',
+  'fuel',
+  'gallons',
+  'octane',
+  'pump',
+  'diesel',
+  'unleaded',
+  'shell',
+  'chevron',
+  'exxon',
+  'bp',
+];
+
+/** Maintenance-related keywords (case-insensitive matching). Multi-word entries matched as phrases. */
+const MAINTENANCE_KEYWORDS: string[] = [
+  'oil change',
+  'brake',
+  'alignment',
+  'tire',
+  'rotation',
+  'inspection',
+  'labor',
+  'parts',
+  'service',
+  'repair',
+  'transmission',
+  'coolant',
+];
+
+/** Minimum keyword matches required for a confident classification */
+const CONFIDENCE_THRESHOLD = 2;
+
+export class ReceiptClassifier {
+  /**
+   * Classify receipt type from email subject and body text.
+   * Returns a confident result if >= 2 keyword matches for one type.
+   */
+  classifyFromText(subject: string | null, body: string | null): ClassificationResult {
+    const text = [subject || '', body || ''].join(' ');
+    return this.classifyText(text, 'email');
+  }
+
+  /**
+   * Classify receipt type from OCR raw text output.
+   * Uses same keyword matching as email text classification.
+   */
+  classifyFromOcrRawText(rawText: string): ClassificationResult {
+    return this.classifyText(rawText, 'ocr');
+  }
+
+  /**
+   * Core keyword matching logic shared by email and OCR classification.
+   */
+  private classifyText(text: string, source: 'email' | 'ocr'): ClassificationResult {
+    const normalizedText = text.toLowerCase();
+
+    const fuelMatches = this.countKeywordMatches(normalizedText, FUEL_KEYWORDS);
+    const maintenanceMatches = this.countKeywordMatches(normalizedText, MAINTENANCE_KEYWORDS);
+
+    logger.info('Receipt classification keyword analysis', {
+      source,
+      fuelMatches,
+      maintenanceMatches,
+      textLength: text.length,
+    });
+
+    // Both below threshold - unclassified
+    if (fuelMatches < CONFIDENCE_THRESHOLD && maintenanceMatches < CONFIDENCE_THRESHOLD) {
+      return { type: 'unclassified', confidence: 0 };
+    }
+
+    // Clear winner with threshold met
+    if (fuelMatches >= CONFIDENCE_THRESHOLD && fuelMatches > maintenanceMatches) {
+      return {
+        type: 'fuel',
+        confidence: Math.min(fuelMatches / (fuelMatches + maintenanceMatches), 1),
+      };
+    }
+
+    if (maintenanceMatches >= CONFIDENCE_THRESHOLD && maintenanceMatches > fuelMatches) {
+      return {
+        type: 'maintenance',
+        confidence: Math.min(maintenanceMatches / (fuelMatches + maintenanceMatches), 1),
+      };
+    }
+
+    // Tie with both meeting threshold - unclassified (ambiguous)
+    if (fuelMatches >= CONFIDENCE_THRESHOLD && maintenanceMatches >= CONFIDENCE_THRESHOLD) {
+      return { type: 'unclassified', confidence: 0 };
+    }
+
+    return { type: 'unclassified', confidence: 0 };
+  }
+
+  /**
+   * Count how many keywords from the list appear in the text.
+   * Multi-word keywords are matched as phrases.
+   */
+  private countKeywordMatches(normalizedText: string, keywords: string[]): number {
+    let matches = 0;
+    for (const keyword of keywords) {
+      if (normalizedText.includes(keyword)) {
+        matches++;
+      }
+    }
+    return matches;
+  }
+
+  /**
+   * Map classifier type to the EmailRecordType used in the processing pipeline.
+   */
+  static toRecordType(classificationType: ReceiptClassificationType): 'fuel_log' | 'maintenance_record' | null {
+    switch (classificationType) {
+      case 'fuel': return 'fuel_log';
+      case 'maintenance': return 'maintenance_record';
+      case 'unclassified': return null;
+    }
+  }
+}
--- a/backend/src/features/email-ingestion/index.ts
+++ b/backend/src/features/email-ingestion/index.ts
@@ -6,14 +6,17 @@
 export { emailIngestionWebhookRoutes } from './api/email-ingestion.routes';
 export { EmailIngestionService } from './domain/email-ingestion.service';
 export { EmailIngestionRepository } from './data/email-ingestion.repository';
+export { ReceiptClassifier } from './domain/receipt-classifier';
 export { ResendInboundClient } from './external/resend-inbound.client';
 export type { ParsedEmailResult, ParsedEmailAttachment } from './external/resend-inbound.client';
 export type {
+  ClassificationResult,
  EmailIngestionQueueRecord,
  EmailIngestionStatus,
  EmailProcessingResult,
  ExtractedReceiptData,
  PendingVehicleAssociation,
+  ReceiptClassificationType,
  ResendWebhookEvent,
  ResendWebhookEventData,
 } from './domain/email-ingestion.types';