feat: add receipt OCR pipeline (refs #69)

Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount, fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 20:43:30 -06:00
parent a2f0abb14c
commit 6319d50fb1
16 changed files with 2845 additions and 2 deletions
--- a/ocr/app/extractors/__init__.py
+++ b/ocr/app/extractors/__init__.py
@@ -1,10 +1,23 @@
 """Extractors package for domain-specific OCR extraction."""
 from app.extractors.base import BaseExtractor, ExtractionResult
 from app.extractors.vin_extractor import VinExtractor, vin_extractor
+from app.extractors.receipt_extractor import (
+    ReceiptExtractor,
+    receipt_extractor,
+    ReceiptExtractionResult,
+    ExtractedField,
+)
+from app.extractors.fuel_receipt import FuelReceiptExtractor, fuel_receipt_extractor

 __all__ = [
    "BaseExtractor",
    "ExtractionResult",
    "VinExtractor",
    "vin_extractor",
+    "ReceiptExtractor",
+    "receipt_extractor",
+    "ReceiptExtractionResult",
+    "ExtractedField",
+    "FuelReceiptExtractor",
+    "fuel_receipt_extractor",
 ]
--- a/ocr/app/extractors/fuel_receipt.py
+++ b/ocr/app/extractors/fuel_receipt.py
@@ -0,0 +1,193 @@
+"""Fuel receipt specialization with validation and cross-checking."""
+import logging
+from dataclasses import dataclass
+from typing import Optional
+
+from app.extractors.receipt_extractor import (
+    ExtractedField,
+    ReceiptExtractionResult,
+    receipt_extractor,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class FuelReceiptValidation:
+    """Outcome of the fuel receipt sanity checks.
+
+    Built by ``FuelReceiptExtractor._validate_fuel_receipt`` and consumed by
+    ``FuelReceiptExtractor._adjust_confidences``.
+    """
+
+    # True only when no check reported an issue.
+    is_valid: bool
+    # One human-readable message per failed check, in the order checked.
+    issues: list[str]
+    # Starts at 1.0 and is multiplied by a penalty factor for each issue.
+    confidence_score: float
+
+
+class FuelReceiptExtractor:
+    """Specialized fuel receipt extractor with cross-validation.
+
+    Provides additional validation and confidence scoring specific
+    to fuel receipts by cross-checking extracted values.
+    """
+
+    # Expected fields for a complete fuel receipt
+    REQUIRED_FIELDS = ["totalAmount"]
+    OPTIONAL_FIELDS = [
+        "merchantName",
+        "transactionDate",
+        "fuelQuantity",
+        "pricePerUnit",
+        "fuelGrade",
+    ]
+
+    def extract(
+        self,
+        image_bytes: bytes,
+        content_type: Optional[str] = None,
+    ) -> ReceiptExtractionResult:
+        """
+        Run the base receipt extraction as a fuel receipt, then sanity-check it.
+
+        Args:
+            image_bytes: Raw image bytes
+            content_type: MIME type, forwarded unchanged to the base extractor
+
+        Returns:
+            The base extractor's ReceiptExtractionResult. Failed results are
+            returned untouched; successful ones have every field confidence
+            rescaled according to the fuel validation outcome.
+        """
+        # Delegate OCR and field parsing, forcing the fuel code path
+        outcome = receipt_extractor.extract(
+            image_bytes=image_bytes,
+            content_type=content_type,
+            receipt_type="fuel",
+        )
+
+        if outcome.success:
+            report = self._validate_fuel_receipt(outcome.extracted_fields)
+
+            if report.issues:
+                # Issues are only logged; they never turn success into failure
+                logger.warning(
+                    f"Fuel receipt validation issues: {report.issues}"
+                )
+
+            # Swap the field map for confidence-adjusted copies
+            outcome.extracted_fields = self._adjust_confidences(
+                outcome.extracted_fields, report
+            )
+
+        return outcome
+
+    def _validate_fuel_receipt(
+        self, fields: dict[str, ExtractedField]
+    ) -> FuelReceiptValidation:
+        """
+        Run sanity checks over the extracted fuel receipt fields.
+
+        Checks, in order:
+        - every REQUIRED_FIELDS entry is present
+        - total is within 10% of quantity * price per unit
+        - quantity lies between 0.5 and 40
+        - price per unit lies between 1.50 and 7.00
+        - fuel grade is one of the recognised grade strings
+
+        Each failed check appends a message and multiplies the running
+        confidence score by a check-specific penalty factor.
+
+        Args:
+            fields: Extracted fields
+
+        Returns:
+            FuelReceiptValidation with the collected issues and final score
+        """
+        problems = []
+        score = 1.0
+
+        # Required fields: each missing one halves the score
+        for required in self.REQUIRED_FIELDS:
+            if required not in fields:
+                problems.append(f"Missing required field: {required}")
+                score *= 0.5
+
+        # Arithmetic cross-check, only possible when all three values exist
+        if fields.keys() >= {"totalAmount", "fuelQuantity", "pricePerUnit"}:
+            total = fields["totalAmount"].value
+            quantity = fields["fuelQuantity"].value
+            unit_price = fields["pricePerUnit"].value
+
+            expected_total = quantity * unit_price
+            allowed_gap = total * 0.10  # 10% slack for rounding
+
+            if abs(total - expected_total) > allowed_gap:
+                problems.append(
+                    f"Total ({total}) doesn't match quantity ({quantity}) * "
+                    f"price ({unit_price}) = {expected_total:.2f}"
+                )
+                score *= 0.7
+
+        # Quantity plausibility for a single fill-up
+        if "fuelQuantity" in fields:
+            quantity = fields["fuelQuantity"].value
+            if quantity < 0.5:
+                problems.append(f"Fuel quantity too small: {quantity}")
+                score *= 0.6
+            elif quantity > 40:  # 40 gallons is very large tank
+                problems.append(f"Fuel quantity unusually large: {quantity}")
+                score *= 0.8
+
+        # Unit price plausibility (current US market range)
+        if "pricePerUnit" in fields:
+            unit_price = fields["pricePerUnit"].value
+            if unit_price < 1.50:
+                problems.append(f"Price per unit too low: ${unit_price}")
+                score *= 0.7
+            elif unit_price > 7.00:
+                problems.append(f"Price per unit unusually high: ${unit_price}")
+                score *= 0.8
+
+        # Grade must be one of the known grade strings
+        if "fuelGrade" in fields:
+            grade = fields["fuelGrade"].value
+            if grade not in ("87", "89", "91", "93", "DIESEL", "E85"):
+                problems.append(f"Unknown fuel grade: {grade}")
+                score *= 0.9
+
+        return FuelReceiptValidation(
+            is_valid=not problems,
+            issues=problems,
+            confidence_score=score,
+        )
+
+    def _adjust_confidences(
+        self,
+        fields: dict[str, ExtractedField],
+        validation: FuelReceiptValidation,
+    ) -> dict[str, ExtractedField]:
+        """
+        Rescale every field confidence according to the validation outcome.
+
+        A clean validation multiplies each confidence by 1.1; otherwise the
+        validation's confidence score is used as the multiplier. Results are
+        capped at 1.0. The input fields are not modified.
+
+        Args:
+            fields: Extracted fields
+            validation: Validation result
+
+        Returns:
+            New dictionary of ExtractedField copies with adjusted confidences
+        """
+        factor = 1.1 if validation.is_valid else validation.confidence_score
+
+        return {
+            key: ExtractedField(
+                value=item.value,
+                confidence=min(1.0, item.confidence * factor),
+            )
+            for key, item in fields.items()
+        }
+
+
+# Singleton instance
+fuel_receipt_extractor = FuelReceiptExtractor()
--- a/ocr/app/extractors/receipt_extractor.py
+++ b/ocr/app/extractors/receipt_extractor.py
@@ -0,0 +1,345 @@
+"""Receipt-specific OCR extractor with field extraction."""
+import io
+import logging
+import time
+from dataclasses import dataclass, field
+from typing import Any, Optional
+
+import magic
+import pytesseract
+from PIL import Image
+from pillow_heif import register_heif_opener
+
+from app.config import settings
+from app.extractors.base import BaseExtractor
+from app.preprocessors.receipt_preprocessor import receipt_preprocessor
+from app.patterns import currency_matcher, date_matcher, fuel_matcher
+
+# Register HEIF/HEIC opener
+register_heif_opener()
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ExtractedField:
+    """A single value pulled from a receipt, paired with its confidence."""
+
+    # The extracted value; its type depends on the field (hence Any).
+    value: Any
+    # Confidence attached to the value; presumably in the 0.0-1.0 range
+    # (callers in this package cap adjusted values at 1.0) - TODO confirm.
+    confidence: float
+
+
+@dataclass
+class ReceiptExtractionResult:
+    """Result of receipt extraction, covering both success and failure."""
+
+    # False when extraction was rejected or failed; see `error`.
+    success: bool
+    # Receipt type used for field extraction; stays "unknown" on failure.
+    receipt_type: str = "unknown"
+    # Extracted fields keyed by field name (e.g. "totalAmount").
+    extracted_fields: dict[str, ExtractedField] = field(default_factory=dict)
+    # Raw OCR text the fields were parsed from; empty on failure.
+    raw_text: str = ""
+    # Elapsed time of the extraction call, in milliseconds.
+    processing_time_ms: int = 0
+    # Failure description; None on success.
+    error: Optional[str] = None
+
+
+class ReceiptExtractor(BaseExtractor):
+    """Receipt-specific OCR extractor for fuel and general receipts."""
+
+    # Supported MIME types
+    SUPPORTED_TYPES = {
+        "image/jpeg",
+        "image/png",
+        "image/heic",
+        "image/heif",
+    }
+
+    def __init__(self) -> None:
+        """Point pytesseract at the command configured in settings.
+
+        Note: this assigns a module-level attribute of pytesseract, so the
+        value applies to every pytesseract caller in the process, not only
+        to this instance.
+        """
+        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+
+    def extract(
+        self,
+        image_bytes: bytes,
+        content_type: Optional[str] = None,
+        receipt_type: Optional[str] = None,
+    ) -> ReceiptExtractionResult:
+        """
+        Extract data from a receipt image.
+
+        The image is preprocessed and OCR'd; if that yields no text, OCR is
+        retried once on a version preprocessed without thresholding. Fields
+        are then parsed with the fuel extractor when the receipt type is
+        "fuel" and with the generic extractor otherwise.
+
+        Args:
+            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
+            content_type: MIME type (auto-detected if not provided)
+            receipt_type: Hint for receipt type ("fuel" for specialized
+                extraction); detected from the OCR text if not provided
+
+        Returns:
+            ReceiptExtractionResult with extracted fields. Unsupported types,
+            empty OCR output and errors raised during preprocessing, OCR or
+            field parsing are reported as success=False with an error message
+            instead of being raised.
+        """
+        # perf_counter is monotonic, so the reported duration cannot go
+        # negative or jump if the wall clock is adjusted mid-request
+        started = time.perf_counter()
+
+        def elapsed_ms() -> int:
+            return int((time.perf_counter() - started) * 1000)
+
+        # Detect content type if not provided
+        if not content_type:
+            content_type = self._detect_mime_type(image_bytes)
+
+        # Validate content type
+        if content_type not in self.SUPPORTED_TYPES:
+            return ReceiptExtractionResult(
+                success=False,
+                error=f"Unsupported file type: {content_type}",
+                processing_time_ms=elapsed_ms(),
+            )
+
+        try:
+            # Apply receipt-optimized preprocessing
+            preprocessing_result = receipt_preprocessor.preprocess(image_bytes)
+            raw_text = self._perform_ocr(preprocessing_result.image_bytes)
+
+            if not raw_text.strip():
+                # Thresholding can wipe out faint print; retry without it
+                preprocessing_result = receipt_preprocessor.preprocess(
+                    image_bytes,
+                    apply_threshold=False,
+                )
+                raw_text = self._perform_ocr(preprocessing_result.image_bytes)
+
+            if not raw_text.strip():
+                return ReceiptExtractionResult(
+                    success=False,
+                    error="No text found in image",
+                    processing_time_ms=elapsed_ms(),
+                )
+
+            # An explicit hint wins; otherwise classify from the OCR text
+            detected_type = receipt_type or self._detect_receipt_type(raw_text)
+
+            # Extract fields based on receipt type
+            if detected_type == "fuel":
+                extracted_fields = self._extract_fuel_fields(raw_text)
+            else:
+                extracted_fields = self._extract_generic_fields(raw_text)
+
+            processing_time_ms = elapsed_ms()
+
+            logger.info(
+                f"Receipt extraction: type={detected_type}, "
+                f"fields={len(extracted_fields)}, "
+                f"time={processing_time_ms}ms"
+            )
+
+            return ReceiptExtractionResult(
+                success=True,
+                receipt_type=detected_type,
+                extracted_fields=extracted_fields,
+                raw_text=raw_text,
+                processing_time_ms=processing_time_ms,
+            )
+
+        except Exception as e:
+            # Deliberate catch-all: callers get a failed result, not a raise
+            logger.error(f"Receipt extraction failed: {e}", exc_info=True)
+            return ReceiptExtractionResult(
+                success=False,
+                error=str(e),
+                processing_time_ms=elapsed_ms(),
+            )
+
+    def _detect_mime_type(self, file_bytes: bytes) -> str:
+        """Sniff the MIME type of the given bytes with python-magic.
+
+        Falls back to "application/octet-stream" when detection returns an
+        empty value.
+        """
+        sniffer = magic.Magic(mime=True)
+        mime_type = sniffer.from_buffer(file_bytes)
+        if mime_type:
+            return mime_type
+        return "application/octet-stream"
+
+    def _perform_ocr(self, image_bytes: bytes, psm: int = 6) -> str:
+        """
+        Perform OCR on preprocessed image.
+
+        Args:
+            image_bytes: Preprocessed image bytes
+            psm: Tesseract page segmentation mode
+                 4 = Assume single column of text
+                 6 = Uniform block of text (the default used here)
+
+        Returns:
+            Raw OCR text
+        """
+        # Only the page segmentation mode is passed to Tesseract
+        config = f"--psm {psm}"
+
+        # Context manager closes the image as soon as OCR has finished
+        with Image.open(io.BytesIO(image_bytes)) as image:
+            return pytesseract.image_to_string(image, config=config)
+
+    def _detect_receipt_type(self, text: str) -> str:
+        """
+        Detect receipt type based on content.
+
+        Scores one point per fuel keyword found in the upper-cased text and
+        three more when the extracted merchant name contains a known station
+        name; a score of 2 or more classifies the receipt as fuel. Keywords
+        are matched as substrings, so overlapping keywords (e.g. "GALLON"
+        and "GAL") each count.
+
+        Args:
+            text: OCR text
+
+        Returns:
+            Receipt type: "fuel" or "unknown"
+        """
+        text_upper = text.upper()
+
+        # Fuel receipt indicators
+        fuel_keywords = (
+            "GALLON", "GAL", "FUEL", "GAS", "DIESEL", "UNLEADED",
+            "REGULAR", "PREMIUM", "OCTANE", "PPG", "PUMP",
+        )
+
+        fuel_score = sum(1 for kw in fuel_keywords if kw in text_upper)
+
+        # Check for known gas stations (matcher is run once and reused)
+        merchant_result = fuel_matcher.extract_merchant_name(text)
+        if merchant_result:
+            merchant, _ = merchant_result
+            merchant_upper = merchant.upper()
+            if any(
+                station in merchant_upper
+                for station in fuel_matcher.STATION_NAMES
+            ):
+                fuel_score += 3
+
+        return "fuel" if fuel_score >= 2 else "unknown"
+
+    def _extract_fuel_fields(self, text: str) -> dict[str, ExtractedField]:
+        """
+        Extract fuel-specific fields from receipt text.
+
+        Looks for merchantName, transactionDate, totalAmount, fuelQuantity,
+        pricePerUnit and fuelGrade; a field is only present in the result
+        when its matcher found something. When pricePerUnit was not matched
+        but total and quantity were, it is derived as total / quantity and
+        kept only if it falls within 1.0-10.0.
+
+        Args:
+            text: OCR text
+
+        Returns:
+            Dictionary of extracted fields
+        """
+        fields: dict[str, ExtractedField] = {}
+
+        # The merchant matcher returns a (name, confidence) tuple rather
+        # than a match object, so it is handled separately
+        merchant_result = fuel_matcher.extract_merchant_name(text)
+        if merchant_result:
+            merchant_name, confidence = merchant_result
+            fields["merchantName"] = ExtractedField(
+                value=merchant_name,
+                confidence=confidence,
+            )
+
+        # The remaining matchers all return an object exposing .value and
+        # .confidence, or a falsy result when nothing matched
+        match_extractors = (
+            ("transactionDate", date_matcher.extract_best_date),
+            ("totalAmount", currency_matcher.extract_total),
+            ("fuelQuantity", fuel_matcher.extract_quantity),
+            ("pricePerUnit", fuel_matcher.extract_price_per_unit),
+            ("fuelGrade", fuel_matcher.extract_grade),
+        )
+        for field_name, extractor in match_extractors:
+            match = extractor(text)
+            if match:
+                fields[field_name] = ExtractedField(
+                    value=match.value,
+                    confidence=match.confidence,
+                )
+
+        # Derive price per unit when it was not found directly
+        if (
+            "pricePerUnit" not in fields
+            and "totalAmount" in fields
+            and "fuelQuantity" in fields
+        ):
+            total = fields["totalAmount"]
+            quantity = fields["fuelQuantity"]
+            # Guard the division: a zero quantity (e.g. a misread "0.000")
+            # would raise ZeroDivisionError and fail the whole extraction
+            if quantity.value:
+                calculated_price = total.value / quantity.value
+                # Only use if reasonable
+                if 1.0 <= calculated_price <= 10.0:
+                    fields["pricePerUnit"] = ExtractedField(
+                        value=round(calculated_price, 3),
+                        # Lower confidence for calculated value
+                        confidence=min(total.confidence, quantity.confidence)
+                        * 0.8,
+                    )
+
+        return fields
+
+    def _extract_generic_fields(self, text: str) -> dict[str, ExtractedField]:
+        """
+        Extract the type-agnostic fields from receipt text.
+
+        Picks up the transaction date and total via the shared matchers and
+        guesses the merchant from the first non-blank line (truncated to 50
+        characters, fixed confidence 0.40).
+
+        Args:
+            text: OCR text
+
+        Returns:
+            Dictionary of extracted fields
+        """
+        found: dict[str, ExtractedField] = {}
+
+        # Date first, then total, so the insertion order stays stable
+        for key, matcher in (
+            ("transactionDate", date_matcher.extract_best_date),
+            ("totalAmount", currency_matcher.extract_total),
+        ):
+            hit = matcher(text)
+            if hit:
+                found[key] = ExtractedField(
+                    value=hit.value,
+                    confidence=hit.confidence,
+                )
+
+        # Merchant guess: first line that is non-empty once stripped
+        first_line = next(
+            (row for row in map(str.strip, text.split("\n")) if row),
+            None,
+        )
+        if first_line is not None:
+            found["merchantName"] = ExtractedField(
+                value=first_line[:50],
+                confidence=0.40,
+            )
+
+        return found
+
+    def validate(self, data: Any) -> bool:
+        """
+        Check that extracted receipt data carries the minimum information.
+
+        Args:
+            data: Extracted data to validate
+
+        Returns:
+            True if data is a dict containing "totalAmount" or
+            "transactionDate"; False for anything else
+        """
+        return isinstance(data, dict) and (
+            "totalAmount" in data or "transactionDate" in data
+        )
+
+
+# Singleton instance
+receipt_extractor = ReceiptExtractor()