From 653c535165a2f3cc5590346ab0c60c77acf2e084 Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Fri, 13 Feb 2026 21:22:40 -0600 Subject: [PATCH] chore: add PDF support to receipt OCR pipeline (refs #182) The receipt extractor only accepted image MIME types, rejecting PDFs at the OCR layer. Added application/pdf to supported types and PDF-to-image conversion (first page at 300 DPI) before OCR preprocessing. Co-Authored-By: Claude Opus 4.6 --- .../maintenance_receipt_extractor.py | 2 +- ocr/app/extractors/receipt_extractor.py | 31 ++++++++++++++++++- ocr/app/routers/extract.py | 4 +-- 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/ocr/app/extractors/maintenance_receipt_extractor.py b/ocr/app/extractors/maintenance_receipt_extractor.py index 93285ba..d5b4d13 100644 --- a/ocr/app/extractors/maintenance_receipt_extractor.py +++ b/ocr/app/extractors/maintenance_receipt_extractor.py @@ -98,7 +98,7 @@ class MaintenanceReceiptExtractor: """Extract maintenance receipt fields from an image. Args: - image_bytes: Raw image bytes (HEIC, JPEG, PNG). + image_bytes: Raw image or PDF bytes (HEIC, JPEG, PNG, PDF). content_type: MIME type (auto-detected if not provided). Returns: diff --git a/ocr/app/extractors/receipt_extractor.py b/ocr/app/extractors/receipt_extractor.py index 111cfb1..07ee32a 100644 --- a/ocr/app/extractors/receipt_extractor.py +++ b/ocr/app/extractors/receipt_extractor.py @@ -1,4 +1,5 @@ """Receipt-specific OCR extractor with field extraction.""" +import io import logging import time from dataclasses import dataclass, field @@ -47,6 +48,7 @@ class ReceiptExtractor(BaseExtractor): "image/png", "image/heic", "image/heif", + "application/pdf", } def __init__(self) -> None: @@ -63,7 +65,7 @@ class ReceiptExtractor(BaseExtractor): Extract data from a receipt image. Args: - image_bytes: Raw image bytes (HEIC, JPEG, PNG) + image_bytes: Raw image or PDF bytes (HEIC, JPEG, PNG, PDF) content_type: MIME type (auto-detected if not provided) receipt_type: Hint for receipt type ("fuel" for specialized extraction) @@ -85,6 +87,16 @@ class ReceiptExtractor(BaseExtractor): ) try: + # Convert PDF to image (first page) + if content_type == "application/pdf": + image_bytes = self._extract_pdf_first_page(image_bytes) + if not image_bytes: + return ReceiptExtractionResult( + success=False, + error="Failed to extract image from PDF", + processing_time_ms=int((time.time() - start_time) * 1000), + ) + # Apply receipt-optimized preprocessing preprocessing_result = receipt_preprocessor.preprocess(image_bytes) preprocessed_bytes = preprocessing_result.image_bytes @@ -147,6 +159,23 @@ class ReceiptExtractor(BaseExtractor): detected = mime.from_buffer(file_bytes) return detected or "application/octet-stream" + def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes: + """Extract first page of PDF as PNG image for OCR processing.""" + try: + from pdf2image import convert_from_bytes + + images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=300) + if images: + buffer = io.BytesIO() + images[0].save(buffer, format="PNG") + return buffer.getvalue() + except ImportError: + logger.warning("pdf2image not available, PDF support limited") + except Exception as e: + logger.error(f"PDF first page extraction failed: {e}") + + return b"" + def _perform_ocr(self, image_bytes: bytes) -> str: """ Perform OCR on preprocessed image via engine abstraction. diff --git a/ocr/app/routers/extract.py b/ocr/app/routers/extract.py index 3c1d02f..52cf0d7 100644 --- a/ocr/app/routers/extract.py +++ b/ocr/app/routers/extract.py @@ -281,9 +281,9 @@ async def extract_maintenance_receipt( - Gemini semantic field extraction from OCR text - Regex cross-validation for dates, amounts, odometer - Supports HEIC, JPEG, PNG formats. + Supports HEIC, JPEG, PNG, and PDF formats. - - **file**: Maintenance receipt image file (max 10MB) + - **file**: Maintenance receipt image or PDF file (max 10MB) Returns: - **receiptType**: "maintenance"