From 5e4515da7ca84c1ef300caa96cd462af32d1c2ec Mon Sep 17 00:00:00 2001
From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com>
Date: Fri, 13 Feb 2026 21:34:17 -0600
Subject: [PATCH] fix: use PyMuPDF instead of pdf2image for PDF-to-image
 conversion (refs #182)

pdf2image requires poppler-utils, which is not installed in the OCR
container. PyMuPDF is already in requirements.txt and can render PDF
pages to PNG at 300 DPI natively, without extra system dependencies.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ocr/app/extractors/receipt_extractor.py | 17 ++++++++++-------
 ocr/app/services/ocr_service.py         | 17 +++++++++--------
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/ocr/app/extractors/receipt_extractor.py b/ocr/app/extractors/receipt_extractor.py
index 07ee32a..8468398 100644
--- a/ocr/app/extractors/receipt_extractor.py
+++ b/ocr/app/extractors/receipt_extractor.py
@@ -162,15 +162,18 @@ class ReceiptExtractor(BaseExtractor):
     def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes:
         """Extract first page of PDF as PNG image for OCR processing."""
         try:
-            from pdf2image import convert_from_bytes
+            import fitz  # PyMuPDF
 
-            images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=300)
-            if images:
-                buffer = io.BytesIO()
-                images[0].save(buffer, format="PNG")
-                return buffer.getvalue()
+            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+            page = doc[0]
+            # Render at 300 DPI (default is 72, so scale factor = 300/72)
+            mat = fitz.Matrix(300 / 72, 300 / 72)
+            pix = page.get_pixmap(matrix=mat)
+            png_bytes = pix.tobytes("png")
+            doc.close()
+            return png_bytes
         except ImportError:
-            logger.warning("pdf2image not available, PDF support limited")
+            logger.warning("PyMuPDF not available, PDF support limited")
         except Exception as e:
             logger.error(f"PDF first page extraction failed: {e}")
 
diff --git a/ocr/app/services/ocr_service.py b/ocr/app/services/ocr_service.py
index 4d06452..4d32dfe 100644
--- a/ocr/app/services/ocr_service.py
+++ b/ocr/app/services/ocr_service.py
@@ -141,16 +141,17 @@ class OcrService:
     def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes:
         """Extract first page of PDF as PNG image."""
         try:
-            # Use pdf2image if available, otherwise return empty
-            from pdf2image import convert_from_bytes
+            import fitz  # PyMuPDF
 
-            images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=300)
-            if images:
-                buffer = io.BytesIO()
-                images[0].save(buffer, format="PNG")
-                return buffer.getvalue()
+            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+            page = doc[0]
+            mat = fitz.Matrix(300 / 72, 300 / 72)
+            pix = page.get_pixmap(matrix=mat)
+            png_bytes = pix.tobytes("png")
+            doc.close()
+            return png_bytes
         except ImportError:
-            logger.warning("pdf2image not available, PDF support limited")
+            logger.warning("PyMuPDF not available, PDF support limited")
         except Exception as e:
             logger.error(f"PDF extraction failed: {e}")