2026-02-13 22:19:45 +00:00
4 changed files with 234 additions and 32 deletions
--- a/backend/src/features/email-ingestion/domain/email-ingestion.service.ts
+++ b/backend/src/features/email-ingestion/domain/email-ingestion.service.ts
@@ -16,6 +16,7 @@ import { TemplateService } from '../../notifications/domain/template.service';
 import { EmailService } from '../../notifications/domain/email.service';
 import { ocrService } from '../../ocr/domain/ocr.service';
 import type { ReceiptExtractionResponse } from '../../ocr/domain/ocr.types';
 import { ReceiptClassifier } from './receipt-classifier';
 import type {
  ResendWebhookEvent,
  EmailProcessingResult,
@@ -51,6 +52,7 @@ export class EmailIngestionService {
  private notificationsRepository: NotificationsRepository;
  private templateService: TemplateService;
  private emailService: EmailService;
  private classifier: ReceiptClassifier;
  constructor(dbPool?: Pool) {
    const p = dbPool || pool;
@@ -61,6 +63,7 @@ export class EmailIngestionService {
    this.notificationsRepository = new NotificationsRepository(p);
    this.templateService = new TemplateService();
    this.emailService = new EmailService();
    this.classifier = new ReceiptClassifier();
  }
  // ========================
@@ -102,28 +105,38 @@ export class EmailIngestionService {
        return;
      }
-      // 5. Process first valid image attachment through OCR
+      // 5. Classify receipt from email text first
-      const ocrResult = await this.processAttachmentsWithOcr(userId, validAttachments);
+      const emailClassification = this.classifier.classifyFromText(subject, event.data.text);
      logger.info('Email text classification result', {
        emailId,
        type: emailClassification.type,
        confidence: emailClassification.confidence,
      });
      // 6. Process attachments through OCR using classification
      const ocrResult = await this.processAttachmentsWithClassification(
        userId, validAttachments, emailClassification, emailId
      );
      if (!ocrResult) {
        await this.handleOcrFailure(emailId, senderEmail, userName, subject, 'No receipt data could be extracted from attachments');
        return;
      }
-      // 6. Build extracted data from OCR result
+      // 7. Build extracted data from OCR result
      const extractedData = this.mapOcrToExtractedData(ocrResult.response);
      const recordType = ocrResult.recordType;
-      // 7. Handle vehicle association
+      // 8. Handle vehicle association
      const processingResult = await this.handleVehicleAssociation(
        userId, userName, senderEmail, recordType, extractedData
      );
-      // 8. Mark as completed
+      // 9. Mark as completed
      await this.repository.updateQueueStatus(emailId, 'completed', {
        processingResult,
      });
-      // 9. Send confirmation email
+      // 10. Send confirmation email
      await this.sendConfirmationEmail(senderEmail, userName, processingResult);
      logger.info('Email processing completed successfully', {
@@ -239,51 +252,100 @@ export class EmailIngestionService {
  // ========================
  /**
-   * Process attachments through OCR, trying fuel then maintenance receipt extraction.
+   * Process attachments using classifier-driven OCR extraction.
-   * Returns the first successful result.
+   * If email text classification is confident, calls the specific OCR endpoint.
   * If not, runs both the fuel and maintenance OCR extractors and classifies from rawText,
   * then falls back to a field-count heuristic. Returns null if no attachment yields a usable result.
   */
-  private async processAttachmentsWithOcr(
+  private async processAttachmentsWithClassification(
    userId: string,
-    attachments: ParsedEmailAttachment[]
+    attachments: ParsedEmailAttachment[],
    emailClassification: { type: string; confidence: number },
    emailId: string
  ): Promise<{ response: ReceiptExtractionResponse; recordType: EmailRecordType } | null> {
    // Process only image attachments that the receipt OCR supports
    const imageAttachments = attachments.filter(att => OCR_RECEIPT_IMAGE_TYPES.has(att.contentType));
    for (const attachment of imageAttachments) {
-      const result = await this.classifyAndExtract(userId, attachment);
+      // If email text gave a confident classification, call the specific OCR endpoint first
-      if (result) return result;
+      if (emailClassification.type === 'fuel') {
        const result = await this.extractFuelReceipt(userId, attachment);
        if (result?.success) return { response: result, recordType: 'fuel_log' };
        // Fuel OCR failed, try maintenance as fallback
        const fallbackResult = await this.extractMaintenanceReceipt(userId, attachment);
        if (fallbackResult?.success) return { response: fallbackResult, recordType: 'maintenance_record' };
        continue;
      }
      if (emailClassification.type === 'maintenance') {
        const result = await this.extractMaintenanceReceipt(userId, attachment);
        if (result?.success) return { response: result, recordType: 'maintenance_record' };
        // Maintenance OCR failed, try fuel as fallback
        const fallbackResult = await this.extractFuelReceipt(userId, attachment);
        if (fallbackResult?.success) return { response: fallbackResult, recordType: 'fuel_log' };
        continue;
      }
      // Email text was not confident - try both OCR endpoints and classify from rawText
      const fuelResult = await this.extractFuelReceipt(userId, attachment);
      const maintenanceResult = await this.extractMaintenanceReceipt(userId, attachment);
      // Use rawText from whichever succeeded for secondary classification
      const rawText = fuelResult?.rawText || maintenanceResult?.rawText || '';
      if (rawText) {
        const ocrClassification = this.classifier.classifyFromOcrRawText(rawText);
        logger.info('OCR rawText classification result', {
          emailId,
          type: ocrClassification.type,
          confidence: ocrClassification.confidence,
        });
        if (ocrClassification.type === 'fuel' && fuelResult?.success) {
          return { response: fuelResult, recordType: 'fuel_log' };
        }
        if (ocrClassification.type === 'maintenance' && maintenanceResult?.success) {
          return { response: maintenanceResult, recordType: 'maintenance_record' };
        }
      }
      // Both classifiers failed - fall back to field-count heuristic
      const fallback = this.selectBestResultByFields(fuelResult, maintenanceResult);
      if (fallback) return fallback;
    }
    return null;
  }
  /**
-   * Try both fuel and maintenance OCR extractors, return the better result
+   * Extract fuel receipt via OCR. Returns null on failure.
   */
-  private async classifyAndExtract(
+  private async extractFuelReceipt(
    userId: string,
    attachment: ParsedEmailAttachment
-  ): Promise<{ response: ReceiptExtractionResponse; recordType: EmailRecordType } | null> {
+  ): Promise<ReceiptExtractionResponse | null> {
    let fuelResult: ReceiptExtractionResponse | null = null;
    let maintenanceResult: ReceiptExtractionResponse | null = null;
    // Try fuel receipt extraction
    try {
-      fuelResult = await ocrService.extractReceipt(userId, {
+      return await ocrService.extractReceipt(userId, {
        fileBuffer: attachment.content,
        contentType: attachment.contentType,
        receiptType: 'fuel',
      });
    } catch (error) {
-      logger.info('Fuel receipt extraction failed, trying maintenance', {
+      logger.info('Fuel receipt extraction failed', {
        filename: attachment.filename,
        error: error instanceof Error ? error.message : String(error),
      });
      return null;
    }
  }
-    // Try maintenance receipt extraction
+  /**
   * Extract maintenance receipt via OCR. Returns null on failure.
   */
  private async extractMaintenanceReceipt(
    userId: string,
    attachment: ParsedEmailAttachment
  ): Promise<ReceiptExtractionResponse | null> {
    try {
-      maintenanceResult = await ocrService.extractMaintenanceReceipt(userId, {
+      return await ocrService.extractMaintenanceReceipt(userId, {
        fileBuffer: attachment.content,
        contentType: attachment.contentType,
      });
@@ -292,16 +354,15 @@ export class EmailIngestionService {
        filename: attachment.filename,
        error: error instanceof Error ? error.message : String(error),
      });
      return null;
    }
    // Compare results and pick the best one
    return this.selectBestResult(fuelResult, maintenanceResult);
  }
  /**
-   * Select the better OCR result based on extracted field count and success
+   * Last-resort fallback: select the better OCR result based on domain-specific
   * fields and field count when keyword classifiers could not decide.
   */
-  private selectBestResult(
+  private selectBestResultByFields(
    fuelResult: ReceiptExtractionResponse | null,
    maintenanceResult: ReceiptExtractionResponse | null
  ): { response: ReceiptExtractionResponse; recordType: EmailRecordType } | null {
@@ -316,7 +377,6 @@ export class EmailIngestionService {
      return null;
    }
    // Check for fuel-specific fields to improve classification
    const hasFuelFields = fuelResult?.extractedFields['gallons'] ||
      fuelResult?.extractedFields['price_per_gallon'] ||
      fuelResult?.extractedFields['fuel_type'];
@@ -325,7 +385,6 @@ export class EmailIngestionService {
      maintenanceResult?.extractedFields['shop_name'] ||
      maintenanceResult?.extractedFields['description'];
    // Prefer the result with domain-specific fields
    if (hasFuelFields && !hasMaintenanceFields) {
      return { response: fuelResult!, recordType: 'fuel_log' };
    }
@@ -333,7 +392,6 @@ export class EmailIngestionService {
      return { response: maintenanceResult!, recordType: 'maintenance_record' };
    }
    // Fall back to field count comparison
    if (fuelFieldCount >= maintenanceFieldCount && fuelResult?.success) {
      return { response: fuelResult, recordType: 'fuel_log' };
    }
--- a/backend/src/features/email-ingestion/domain/email-ingestion.types.ts
+++ b/backend/src/features/email-ingestion/domain/email-ingestion.types.ts
@@ -13,6 +13,17 @@ export type PendingAssociationStatus = 'pending' | 'resolved' | 'expired';
 export type EmailRecordType = 'fuel_log' | 'maintenance_record';
 // ========================
 // Receipt Classification
 // ========================
 /**
 * Receipt category produced by the keyword classifier.
 * 'unclassified' is what the classifier returns when neither category reaches
 * the keyword threshold or when both categories tie.
 */
 export type ReceiptClassificationType = 'fuel' | 'maintenance' | 'unclassified';
 /** Outcome of classifying a receipt from email text or OCR raw text. */
 export interface ClassificationResult {
  /** Detected receipt category, or 'unclassified' when no category clearly wins. */
  type: ReceiptClassificationType;
  /**
   * 0 when the result is 'unclassified'; otherwise the winning category's share
   * of all matched keywords (greater than 0.5, at most 1).
   */
  confidence: number;
 }
 // ========================
 // Database Records
 // ========================
--- a/backend/src/features/email-ingestion/domain/receipt-classifier.ts
+++ b/backend/src/features/email-ingestion/domain/receipt-classifier.ts
@@ -0,0 +1,130 @@
 /**
 * @ai-summary Classifies receipt type from email text or OCR raw text
 * @ai-context Uses keyword matching to determine fuel vs maintenance receipts
 *   before falling back to OCR-based classification. Returns confidence score.
 */
 import { logger } from '../../../core/logging/logger';
 import type { ClassificationResult, ReceiptClassificationType } from './email-ingestion.types';
 /** Fuel-related keywords (lowercase). Each is matched case-insensitively at the start of a word. */
 const FUEL_KEYWORDS: readonly string[] = [
  'gas',
  'fuel',
  'gallons',
  'octane',
  'pump',
  'diesel',
  'unleaded',
  'shell',
  'chevron',
  'exxon',
  'bp',
 ];
 /** Maintenance-related keywords (lowercase). Multi-word entries are matched as phrases. */
 const MAINTENANCE_KEYWORDS: readonly string[] = [
  'oil change',
  'brake',
  'alignment',
  'tire',
  'rotation',
  'inspection',
  'labor',
  'parts',
  'service',
  'repair',
  'transmission',
  'coolant',
 ];
 /** Minimum keyword matches required for a confident classification */
 const CONFIDENCE_THRESHOLD = 2;
 /**
 * Compile a keyword into a matcher for lowercased text.
 *
 * - Anchored at a leading word boundary so a keyword cannot match in the middle
 *   of an unrelated word ('tire' no longer matches "entire", 'parts' no longer
 *   matches "departs"). The trailing side is left open on purpose so inflected
 *   and compound forms still count ("brakes", "repaired", "gasoline").
 * - Words of a phrase may be separated by any run of whitespace, because OCR
 *   output frequently breaks a phrase across lines.
 *
 * No `g` flag is used, so `RegExp.prototype.test` stays stateless.
 */
 function buildKeywordPattern(keyword: string): RegExp {
  const words = keyword
    .toLowerCase()
    .trim()
    .split(/\s+/)
    // Escape regex metacharacters so keywords are always treated literally.
    .map(word => word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
  return new RegExp('\\b' + words.join('\\s+'));
 }
 /** Precompiled matchers, built once at module load rather than per classification. */
 const FUEL_PATTERNS: readonly RegExp[] = FUEL_KEYWORDS.map(buildKeywordPattern);
 const MAINTENANCE_PATTERNS: readonly RegExp[] = MAINTENANCE_KEYWORDS.map(buildKeywordPattern);
 export class ReceiptClassifier {
  /**
   * Classify receipt type from email subject and body text.
   * Missing parts (null/undefined) are treated as empty text.
   *
   * @param subject - Email subject line, if any.
   * @param body - Plain-text email body, if any.
   * @returns A fuel/maintenance result when one type has at least
   *   CONFIDENCE_THRESHOLD keyword matches and strictly more than the other;
   *   otherwise 'unclassified' with confidence 0.
   */
  classifyFromText(
    subject: string | null | undefined,
    body: string | null | undefined
  ): ClassificationResult {
    const text = [subject ?? '', body ?? ''].join(' ');
    return this.classifyText(text, 'email');
  }
  /**
   * Classify receipt type from OCR raw text output.
   * Uses the same keyword matching as email text classification.
   *
   * @param rawText - Raw text produced by OCR.
   */
  classifyFromOcrRawText(rawText: string): ClassificationResult {
    return this.classifyText(rawText, 'ocr');
  }
  /**
   * Core keyword matching logic shared by email and OCR classification.
   *
   * @param text - Text to analyse; lowercased before matching.
   * @param source - Origin of the text, used only for logging.
   */
  private classifyText(text: string, source: 'email' | 'ocr'): ClassificationResult {
    const normalizedText = text.toLowerCase();
    const fuelMatches = this.countKeywordMatches(normalizedText, FUEL_PATTERNS);
    const maintenanceMatches = this.countKeywordMatches(normalizedText, MAINTENANCE_PATTERNS);
    logger.info('Receipt classification keyword analysis', {
      source,
      fuelMatches,
      maintenanceMatches,
      textLength: text.length,
    });
    const topMatches = Math.max(fuelMatches, maintenanceMatches);
    // Unclassified when the leader is below the threshold or the two types tie (ambiguous).
    if (topMatches < CONFIDENCE_THRESHOLD || fuelMatches === maintenanceMatches) {
      return { type: 'unclassified', confidence: 0 };
    }
    // The winner's share of all matches; the total is > 0 here, so this lies in (0.5, 1].
    return {
      type: fuelMatches > maintenanceMatches ? 'fuel' : 'maintenance',
      confidence: topMatches / (fuelMatches + maintenanceMatches),
    };
  }
  /**
   * Count how many keyword patterns occur at least once in the text.
   * Each keyword contributes at most 1 regardless of how often it appears.
   *
   * @param normalizedText - Lowercased text to search.
   * @param patterns - Precompiled keyword matchers.
   */
  private countKeywordMatches(normalizedText: string, patterns: readonly RegExp[]): number {
    return patterns.filter(pattern => pattern.test(normalizedText)).length;
  }
  /**
   * Map classifier type to the EmailRecordType used in the processing pipeline.
   *
   * @returns 'fuel_log' or 'maintenance_record', or null for 'unclassified'.
   */
  static toRecordType(classificationType: ReceiptClassificationType): 'fuel_log' | 'maintenance_record' | null {
    switch (classificationType) {
      case 'fuel': return 'fuel_log';
      case 'maintenance': return 'maintenance_record';
      case 'unclassified': return null;
    }
  }
 }
--- a/backend/src/features/email-ingestion/index.ts
+++ b/backend/src/features/email-ingestion/index.ts
@@ -6,14 +6,17 @@
 export { emailIngestionWebhookRoutes } from './api/email-ingestion.routes';
 export { EmailIngestionService } from './domain/email-ingestion.service';
 export { EmailIngestionRepository } from './data/email-ingestion.repository';
 export { ReceiptClassifier } from './domain/receipt-classifier';
 export { ResendInboundClient } from './external/resend-inbound.client';
 export type { ParsedEmailResult, ParsedEmailAttachment } from './external/resend-inbound.client';
 export type {
  ClassificationResult,
  EmailIngestionQueueRecord,
  EmailIngestionStatus,
  EmailProcessingResult,
  ExtractedReceiptData,
  PendingVehicleAssociation,
  ReceiptClassificationType,
  ResendWebhookEvent,
  ResendWebhookEventData,
 } from './domain/email-ingestion.types';