From 653c535165a2f3cc5590346ab0c60c77acf2e084 Mon Sep 17 00:00:00 2001
From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com>
Date: Fri, 13 Feb 2026 21:22:40 -0600
Subject: [PATCH] chore: add PDF support to receipt OCR pipeline (refs #182)

The receipt extractor only accepted image MIME types, so PDFs were rejected
at the OCR layer. Add application/pdf to the supported types and convert the
first page of a PDF to an image (at 300 DPI) before OCR preprocessing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../maintenance_receipt_extractor.py          |  2 +-
 ocr/app/extractors/receipt_extractor.py       | 31 ++++++++++++++++++-
 ocr/app/routers/extract.py                    |  4 +--
 3 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/ocr/app/extractors/maintenance_receipt_extractor.py b/ocr/app/extractors/maintenance_receipt_extractor.py
index 93285ba..d5b4d13 100644
--- a/ocr/app/extractors/maintenance_receipt_extractor.py
+++ b/ocr/app/extractors/maintenance_receipt_extractor.py
@@ -98,7 +98,7 @@ class MaintenanceReceiptExtractor:
         """Extract maintenance receipt fields from an image.
 
         Args:
-            image_bytes: Raw image bytes (HEIC, JPEG, PNG).
+            image_bytes: Raw image or PDF bytes (HEIC, JPEG, PNG, PDF).
             content_type: MIME type (auto-detected if not provided).
 
         Returns:
diff --git a/ocr/app/extractors/receipt_extractor.py b/ocr/app/extractors/receipt_extractor.py
index 111cfb1..07ee32a 100644
--- a/ocr/app/extractors/receipt_extractor.py
+++ b/ocr/app/extractors/receipt_extractor.py
@@ -1,4 +1,5 @@
 """Receipt-specific OCR extractor with field extraction."""
+import io
 import logging
 import time
 from dataclasses import dataclass, field
@@ -47,6 +48,7 @@ class ReceiptExtractor(BaseExtractor):
         "image/png",
         "image/heic",
         "image/heif",
+        "application/pdf",
     }
 
     def __init__(self) -> None:
@@ -63,7 +65,7 @@ class ReceiptExtractor(BaseExtractor):
         Extract data from a receipt image.
 
         Args:
-            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
+            image_bytes: Raw image or PDF bytes (HEIC, JPEG, PNG, PDF)
             content_type: MIME type (auto-detected if not provided)
             receipt_type: Hint for receipt type ("fuel" for specialized extraction)
 
@@ -85,6 +87,16 @@ class ReceiptExtractor(BaseExtractor):
             )
 
         try:
+            # Convert PDF to image (first page)
+            if content_type == "application/pdf":
+                image_bytes = self._extract_pdf_first_page(image_bytes)
+                if not image_bytes:
+                    return ReceiptExtractionResult(
+                        success=False,
+                        error="Failed to extract image from PDF",
+                        processing_time_ms=int((time.time() - start_time) * 1000),
+                    )
+
             # Apply receipt-optimized preprocessing
             preprocessing_result = receipt_preprocessor.preprocess(image_bytes)
             preprocessed_bytes = preprocessing_result.image_bytes
@@ -147,6 +159,23 @@ class ReceiptExtractor(BaseExtractor):
         detected = mime.from_buffer(file_bytes)
         return detected or "application/octet-stream"
 
+    def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes:
+        """Render the first PDF page to PNG bytes at 300 DPI; empty bytes on any failure."""
+        try:
+            # Imported lazily so a missing pdf2image degrades to the warning below.
+            from pdf2image import convert_from_bytes
+            images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=300)
+            if images:
+                buffer = io.BytesIO()
+                images[0].save(buffer, format="PNG")
+                return buffer.getvalue()
+        except ImportError:
+            logger.warning("pdf2image not available, PDF support limited")
+        except Exception as e:
+            # Keep the traceback: the caller only ever sees the empty-bytes result.
+            logger.exception("PDF first page extraction failed: %s", e)
+        return b""
+
     def _perform_ocr(self, image_bytes: bytes) -> str:
         """
         Perform OCR on preprocessed image via engine abstraction.
diff --git a/ocr/app/routers/extract.py b/ocr/app/routers/extract.py
index 3c1d02f..52cf0d7 100644
--- a/ocr/app/routers/extract.py
+++ b/ocr/app/routers/extract.py
@@ -281,9 +281,9 @@ async def extract_maintenance_receipt(
     - Gemini semantic field extraction from OCR text
     - Regex cross-validation for dates, amounts, odometer
 
-    Supports HEIC, JPEG, PNG formats.
+    Supports HEIC, JPEG, PNG, and PDF formats.
 
-    - **file**: Maintenance receipt image file (max 10MB)
+    - **file**: Maintenance receipt image or PDF file (max 10MB)
 
     Returns:
     - **receiptType**: "maintenance"