fix: use PyMuPDF instead of pdf2image for PDF-to-image conversion (refs #182)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 37s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 52s
Deploy to Staging / Verify Staging (pull_request) Successful in 9s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped

pdf2image requires poppler-utils, which is not installed in the OCR
container. PyMuPDF is already in requirements.txt and can render PDF
pages to PNG at 300 DPI natively without extra system dependencies.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-13 21:34:17 -06:00
parent 5877b531f9
commit 5e4515da7c
2 changed files with 19 additions and 15 deletions

View File

@@ -162,15 +162,18 @@ class ReceiptExtractor(BaseExtractor):
def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes: def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes:
"""Extract first page of PDF as PNG image for OCR processing.""" """Extract first page of PDF as PNG image for OCR processing."""
try: try:
from pdf2image import convert_from_bytes import fitz # PyMuPDF
images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=300) doc = fitz.open(stream=pdf_bytes, filetype="pdf")
if images: page = doc[0]
buffer = io.BytesIO() # Render at 300 DPI (default is 72, so scale factor = 300/72)
images[0].save(buffer, format="PNG") mat = fitz.Matrix(300 / 72, 300 / 72)
return buffer.getvalue() pix = page.get_pixmap(matrix=mat)
png_bytes = pix.tobytes("png")
doc.close()
return png_bytes
except ImportError: except ImportError:
logger.warning("pdf2image not available, PDF support limited") logger.warning("PyMuPDF not available, PDF support limited")
except Exception as e: except Exception as e:
logger.error(f"PDF first page extraction failed: {e}") logger.error(f"PDF first page extraction failed: {e}")

View File

@@ -141,16 +141,17 @@ class OcrService:
def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes: def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes:
"""Extract first page of PDF as PNG image.""" """Extract first page of PDF as PNG image."""
try: try:
# Use pdf2image if available, otherwise return empty import fitz # PyMuPDF
from pdf2image import convert_from_bytes
images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=300) doc = fitz.open(stream=pdf_bytes, filetype="pdf")
if images: page = doc[0]
buffer = io.BytesIO() mat = fitz.Matrix(300 / 72, 300 / 72)
images[0].save(buffer, format="PNG") pix = page.get_pixmap(matrix=mat)
return buffer.getvalue() png_bytes = pix.tobytes("png")
doc.close()
return png_bytes
except ImportError: except ImportError:
logger.warning("pdf2image not available, PDF support limited") logger.warning("PyMuPDF not available, PDF support limited")
except Exception as e: except Exception as e:
logger.error(f"PDF extraction failed: {e}") logger.error(f"PDF extraction failed: {e}")