From 5e4515da7ca84c1ef300caa96cd462af32d1c2ec Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Fri, 13 Feb 2026 21:34:17 -0600 Subject: [PATCH] fix: use PyMuPDF instead of pdf2image for PDF-to-image conversion (refs #182) pdf2image requires poppler-utils which is not installed in the OCR container. PyMuPDF is already in requirements.txt and can render PDF pages to PNG at 300 DPI natively without extra system dependencies. Co-Authored-By: Claude Opus 4.6 --- ocr/app/extractors/receipt_extractor.py | 17 ++++++++++------- ocr/app/services/ocr_service.py | 17 +++++++++-------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/ocr/app/extractors/receipt_extractor.py b/ocr/app/extractors/receipt_extractor.py index 07ee32a..8468398 100644 --- a/ocr/app/extractors/receipt_extractor.py +++ b/ocr/app/extractors/receipt_extractor.py @@ -162,15 +162,18 @@ class ReceiptExtractor(BaseExtractor): def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes: """Extract first page of PDF as PNG image for OCR processing.""" try: - from pdf2image import convert_from_bytes + import fitz # PyMuPDF - images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=300) - if images: - buffer = io.BytesIO() - images[0].save(buffer, format="PNG") - return buffer.getvalue() + doc = fitz.open(stream=pdf_bytes, filetype="pdf") + page = doc[0] + # Render at 300 DPI (default is 72, so scale factor = 300/72) + mat = fitz.Matrix(300 / 72, 300 / 72) + pix = page.get_pixmap(matrix=mat) + png_bytes = pix.tobytes("png") + doc.close() + return png_bytes except ImportError: - logger.warning("pdf2image not available, PDF support limited") + logger.warning("PyMuPDF not available, PDF support limited") except Exception as e: logger.error(f"PDF first page extraction failed: {e}") diff --git a/ocr/app/services/ocr_service.py b/ocr/app/services/ocr_service.py index 4d06452..4d32dfe 100644 --- a/ocr/app/services/ocr_service.py +++ b/ocr/app/services/ocr_service.py @@ -141,16 +141,17 @@ class OcrService: def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes: """Extract first page of PDF as PNG image.""" try: - # Use pdf2image if available, otherwise return empty - from pdf2image import convert_from_bytes + import fitz # PyMuPDF - images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=300) - if images: - buffer = io.BytesIO() - images[0].save(buffer, format="PNG") - return buffer.getvalue() + doc = fitz.open(stream=pdf_bytes, filetype="pdf") + page = doc[0] + mat = fitz.Matrix(300 / 72, 300 / 72) + pix = page.get_pixmap(matrix=mat) + png_bytes = pix.tobytes("png") + doc.close() + return png_bytes except ImportError: - logger.warning("pdf2image not available, PDF support limited") + logger.warning("PyMuPDF not available, PDF support limited") except Exception as e: logger.error(f"PDF extraction failed: {e}")