chore: add PDF support to receipt OCR pipeline (refs #182)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 38s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 22s
Deploy to Staging / Verify Staging (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 38s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 22s
Deploy to Staging / Verify Staging (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
The receipt extractor previously accepted only image MIME types, so PDFs were rejected at the OCR layer. This change adds application/pdf to the supported types and converts the first page of a PDF to an image (rendered at 300 DPI) before OCR preprocessing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -98,7 +98,7 @@ class MaintenanceReceiptExtractor:
|
|||||||
"""Extract maintenance receipt fields from an image.
|
"""Extract maintenance receipt fields from an image.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image_bytes: Raw image bytes (HEIC, JPEG, PNG).
|
image_bytes: Raw image or PDF bytes (HEIC, JPEG, PNG, PDF).
|
||||||
content_type: MIME type (auto-detected if not provided).
|
content_type: MIME type (auto-detected if not provided).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
"""Receipt-specific OCR extractor with field extraction."""
|
"""Receipt-specific OCR extractor with field extraction."""
|
||||||
|
import io
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
@@ -47,6 +48,7 @@ class ReceiptExtractor(BaseExtractor):
|
|||||||
"image/png",
|
"image/png",
|
||||||
"image/heic",
|
"image/heic",
|
||||||
"image/heif",
|
"image/heif",
|
||||||
|
"application/pdf",
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
@@ -63,7 +65,7 @@ class ReceiptExtractor(BaseExtractor):
|
|||||||
Extract data from a receipt image.
|
Extract data from a receipt image.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image_bytes: Raw image bytes (HEIC, JPEG, PNG)
|
image_bytes: Raw image or PDF bytes (HEIC, JPEG, PNG, PDF)
|
||||||
content_type: MIME type (auto-detected if not provided)
|
content_type: MIME type (auto-detected if not provided)
|
||||||
receipt_type: Hint for receipt type ("fuel" for specialized extraction)
|
receipt_type: Hint for receipt type ("fuel" for specialized extraction)
|
||||||
|
|
||||||
@@ -85,6 +87,16 @@ class ReceiptExtractor(BaseExtractor):
|
|||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Convert PDF to image (first page)
|
||||||
|
if content_type == "application/pdf":
|
||||||
|
image_bytes = self._extract_pdf_first_page(image_bytes)
|
||||||
|
if not image_bytes:
|
||||||
|
return ReceiptExtractionResult(
|
||||||
|
success=False,
|
||||||
|
error="Failed to extract image from PDF",
|
||||||
|
processing_time_ms=int((time.time() - start_time) * 1000),
|
||||||
|
)
|
||||||
|
|
||||||
# Apply receipt-optimized preprocessing
|
# Apply receipt-optimized preprocessing
|
||||||
preprocessing_result = receipt_preprocessor.preprocess(image_bytes)
|
preprocessing_result = receipt_preprocessor.preprocess(image_bytes)
|
||||||
preprocessed_bytes = preprocessing_result.image_bytes
|
preprocessed_bytes = preprocessing_result.image_bytes
|
||||||
@@ -147,6 +159,23 @@ class ReceiptExtractor(BaseExtractor):
|
|||||||
detected = mime.from_buffer(file_bytes)
|
detected = mime.from_buffer(file_bytes)
|
||||||
return detected or "application/octet-stream"
|
return detected or "application/octet-stream"
|
||||||
|
|
||||||
|
def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes:
    """Render the first page of a PDF as PNG bytes for OCR processing.

    Only page 1 is rendered (at 300 DPI); any further pages are ignored.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        PNG-encoded bytes of the first page, or ``b""`` when nothing could
        be rendered (empty input, pdf2image not importable, conversion
        error, or a PDF that yields no pages). The caller treats an empty
        result as an extraction failure.
    """
    if not pdf_bytes:
        logger.warning("Empty PDF payload, nothing to extract")
        return b""

    try:
        # Imported lazily so that a missing pdf2image degrades to a logged
        # warning here instead of failing when the module is imported.
        from pdf2image import convert_from_bytes
    except ImportError:
        logger.warning("pdf2image not available, PDF support limited")
        return b""

    try:
        images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=300)
        if not images:
            # Previously this path returned b"" silently; log it so that
            # an empty conversion is distinguishable from other failures.
            logger.warning("PDF conversion produced no pages")
            return b""

        buffer = io.BytesIO()
        images[0].save(buffer, format="PNG")
        return buffer.getvalue()
    except Exception as e:
        # Best-effort boundary: any conversion/encoding failure is reported
        # as "no image" to the caller. exc_info keeps the traceback in the
        # log so the root cause is not lost.
        logger.error("PDF first page extraction failed: %s", e, exc_info=True)
        return b""
|
||||||
|
|
||||||
def _perform_ocr(self, image_bytes: bytes) -> str:
|
def _perform_ocr(self, image_bytes: bytes) -> str:
|
||||||
"""
|
"""
|
||||||
Perform OCR on preprocessed image via engine abstraction.
|
Perform OCR on preprocessed image via engine abstraction.
|
||||||
|
|||||||
@@ -281,9 +281,9 @@ async def extract_maintenance_receipt(
|
|||||||
- Gemini semantic field extraction from OCR text
|
- Gemini semantic field extraction from OCR text
|
||||||
- Regex cross-validation for dates, amounts, odometer
|
- Regex cross-validation for dates, amounts, odometer
|
||||||
|
|
||||||
Supports HEIC, JPEG, PNG formats.
|
Supports HEIC, JPEG, PNG, and PDF formats.
|
||||||
|
|
||||||
- **file**: Maintenance receipt image file (max 10MB)
|
- **file**: Maintenance receipt image or PDF file (max 10MB)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
- **receiptType**: "maintenance"
|
- **receiptType**: "maintenance"
|
||||||
|
|||||||
Reference in New Issue
Block a user