feat: add receipt OCR pipeline (refs #69)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 32s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m20s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped

Implement receipt-specific OCR extraction for fuel receipts:

- Pattern matching modules for date, currency, and fuel data extraction
- Receipt-optimized image preprocessing for thermal receipts
- POST /extract/receipt endpoint with field extraction
- Confidence scoring per extracted field
- Cross-validation of fuel receipt data
- Unit tests for all pattern matchers

Extracted fields: merchantName, transactionDate, totalAmount,
fuelQuantity, pricePerUnit, fuelGrade

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-01 20:43:30 -06:00
parent a2f0abb14c
commit 6319d50fb1
16 changed files with 2845 additions and 2 deletions

View File

@@ -0,0 +1,193 @@
"""Fuel receipt specialization with validation and cross-checking."""
import logging
from dataclasses import dataclass
from typing import Optional
from app.extractors.receipt_extractor import (
ExtractedField,
ReceiptExtractionResult,
receipt_extractor,
)
# Module-level logger named after this module; the extractor below uses it to
# report fuel receipt validation issues.
logger = logging.getLogger(__name__)
@dataclass
class FuelReceiptValidation:
    """Outcome of validating the fields extracted from a fuel receipt."""

    # True when validation recorded no issues at all.
    is_valid: bool
    # One human-readable message per problem found, in detection order.
    issues: list[str]
    # Multiplicative score: starts at 1.0 and is scaled down for each issue.
    confidence_score: float
class FuelReceiptExtractor:
    """Specialized fuel receipt extractor with cross-validation.

    Delegates OCR/field extraction to the base ``receipt_extractor`` and then
    sanity-checks the extracted values against each other (total versus
    quantity * price) and against plausible ranges. The outcome of those
    checks is used to scale the confidence of every extracted field.
    """

    # Expected fields for a complete fuel receipt
    REQUIRED_FIELDS = ["totalAmount"]
    OPTIONAL_FIELDS = [
        "merchantName",
        "transactionDate",
        "fuelQuantity",
        "pricePerUnit",
        "fuelGrade",
    ]

    # Fields that must all carry a value for the total = quantity * price check.
    CROSS_CHECK_FIELDS = ("totalAmount", "fuelQuantity", "pricePerUnit")
    # Allow 10% tolerance for rounding when comparing total to quantity * price.
    TOTAL_TOLERANCE = 0.10
    # Plausible quantity for a single fill-up (40 gallons is a very large tank).
    MIN_QUANTITY = 0.5
    MAX_QUANTITY = 40
    # Plausible price per unit (current US market range).
    MIN_PRICE = 1.50
    MAX_PRICE = 7.00
    VALID_FUEL_GRADES = ("87", "89", "91", "93", "DIESEL", "E85")
    # Factor applied to every field confidence when the cross-check passes.
    CONFIDENCE_BOOST = 1.1

    def extract(
        self,
        image_bytes: bytes,
        content_type: Optional[str] = None,
    ) -> ReceiptExtractionResult:
        """
        Extract fuel receipt data with validation.

        Args:
            image_bytes: Raw image bytes
            content_type: MIME type

        Returns:
            ReceiptExtractionResult from the base extractor. On success its
            ``extracted_fields`` are replaced by copies whose confidences
            reflect the fuel-specific validation; an unsuccessful result is
            returned untouched.
        """
        # Use base receipt extractor with fuel hint
        result = receipt_extractor.extract(
            image_bytes=image_bytes,
            content_type=content_type,
            receipt_type="fuel",
        )
        if not result.success:
            return result

        # Validate and cross-check fuel fields
        validation = self._validate_fuel_receipt(result.extracted_fields)
        if validation.issues:
            # Lazy %-style arguments: the message is only formatted when the
            # warning is actually emitted.
            logger.warning(
                "Fuel receipt validation issues: %s", validation.issues
            )

        # Adjust the per-field confidences based on the validation outcome.
        result.extracted_fields = self._adjust_confidences(
            result.extracted_fields, validation
        )
        return result

    @staticmethod
    def _value_of(fields: dict[str, ExtractedField], name: str):
        """Return the value of field *name*, or None if absent or valueless."""
        field = fields.get(name)
        return None if field is None else field.value

    def _validate_fuel_receipt(
        self, fields: dict[str, ExtractedField]
    ) -> FuelReceiptValidation:
        """
        Validate extracted fuel receipt fields.

        Cross-checks:
        - total = quantity * price per unit (within tolerance)
        - quantity is reasonable for a single fill-up
        - price per unit is within market range
        - fuel grade is one of the known grades

        A field that is present but whose value is None is treated as not
        extracted, so it cannot crash the numeric comparisons.

        Args:
            fields: Extracted fields

        Returns:
            FuelReceiptValidation with issues and confidence
        """
        issues: list[str] = []
        confidence_score = 1.0

        # Check required fields
        for field_name in self.REQUIRED_FIELDS:
            if self._value_of(fields, field_name) is None:
                issues.append(f"Missing required field: {field_name}")
                confidence_score *= 0.5

        total = self._value_of(fields, "totalAmount")
        quantity = self._value_of(fields, "fuelQuantity")
        price = self._value_of(fields, "pricePerUnit")

        # Cross-validate total = quantity * price
        if total is not None and quantity is not None and price is not None:
            calculated_total = quantity * price
            # abs(total) keeps the allowed deviation non-negative even when
            # the extracted total is zero or negative.
            allowed_deviation = abs(total) * self.TOTAL_TOLERANCE
            if abs(total - calculated_total) > allowed_deviation:
                issues.append(
                    f"Total ({total}) doesn't match quantity ({quantity}) * "
                    f"price ({price}) = {calculated_total:.2f}"
                )
                confidence_score *= 0.7

        # Validate quantity is reasonable
        if quantity is not None:
            if quantity < self.MIN_QUANTITY:
                issues.append(f"Fuel quantity too small: {quantity}")
                confidence_score *= 0.6
            elif quantity > self.MAX_QUANTITY:
                issues.append(f"Fuel quantity unusually large: {quantity}")
                confidence_score *= 0.8

        # Validate price is reasonable
        if price is not None:
            if price < self.MIN_PRICE:
                issues.append(f"Price per unit too low: ${price}")
                confidence_score *= 0.7
            elif price > self.MAX_PRICE:
                issues.append(f"Price per unit unusually high: ${price}")
                confidence_score *= 0.8

        # Validate fuel grade
        if "fuelGrade" in fields:
            grade = fields["fuelGrade"].value
            if grade not in self.VALID_FUEL_GRADES:
                issues.append(f"Unknown fuel grade: {grade}")
                confidence_score *= 0.9

        return FuelReceiptValidation(
            is_valid=not issues,
            issues=issues,
            confidence_score=confidence_score,
        )

    def _adjust_confidences(
        self,
        fields: dict[str, ExtractedField],
        validation: FuelReceiptValidation,
    ) -> dict[str, ExtractedField]:
        """
        Adjust field confidences based on validation.

        - Validation found issues: scale by ``validation.confidence_score``.
        - No issues and the total/quantity/price cross-check could run:
          boost by ``CONFIDENCE_BOOST``.
        - No issues but the cross-check could not run (one of its fields is
          missing): leave confidences unchanged, because nothing corroborated
          the extracted values.

        Resulting confidences are capped at 1.0.

        Args:
            fields: Extracted fields
            validation: Validation result

        Returns:
            Fields with adjusted confidences
        """
        if not validation.is_valid:
            # Reduce confidences when there are issues
            factor = validation.confidence_score
        elif all(
            self._value_of(fields, name) is not None
            for name in self.CROSS_CHECK_FIELDS
        ):
            # Boost confidences when cross-validation actually ran and passed
            factor = self.CONFIDENCE_BOOST
        else:
            factor = 1.0

        # NOTE(review): each field is rebuilt from value and confidence only;
        # if ExtractedField carries other attributes they are not copied here
        # -- confirm against its definition.
        return {
            name: ExtractedField(
                value=field.value,
                confidence=min(1.0, field.confidence * factor),
            )
            for name, field in fields.items()
        }
# Singleton instance: FuelReceiptExtractor defines no __init__ and stores no
# per-instance state, so one shared object is enough.
fuel_receipt_extractor = FuelReceiptExtractor()