diff --git a/ocr/app/extractors/receipt_extractor.py b/ocr/app/extractors/receipt_extractor.py index 07ee32a..8468398 100644 --- a/ocr/app/extractors/receipt_extractor.py +++ b/ocr/app/extractors/receipt_extractor.py @@ -162,15 +162,18 @@ class ReceiptExtractor(BaseExtractor): def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes: """Extract first page of PDF as PNG image for OCR processing.""" try: - from pdf2image import convert_from_bytes + import fitz # PyMuPDF - images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=300) - if images: - buffer = io.BytesIO() - images[0].save(buffer, format="PNG") - return buffer.getvalue() + doc = fitz.open(stream=pdf_bytes, filetype="pdf") + page = doc[0] + # Render at 300 DPI (default is 72, so scale factor = 300/72) + mat = fitz.Matrix(300 / 72, 300 / 72) + pix = page.get_pixmap(matrix=mat) + png_bytes = pix.tobytes("png") + doc.close() + return png_bytes except ImportError: - logger.warning("pdf2image not available, PDF support limited") + logger.warning("PyMuPDF not available, PDF support limited") except Exception as e: logger.error(f"PDF first page extraction failed: {e}") diff --git a/ocr/app/services/ocr_service.py b/ocr/app/services/ocr_service.py index 4d06452..4d32dfe 100644 --- a/ocr/app/services/ocr_service.py +++ b/ocr/app/services/ocr_service.py @@ -141,16 +141,17 @@ class OcrService: def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes: """Extract first page of PDF as PNG image.""" try: - # Use pdf2image if available, otherwise return empty - from pdf2image import convert_from_bytes + import fitz # PyMuPDF - images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=300) - if images: - buffer = io.BytesIO() - images[0].save(buffer, format="PNG") - return buffer.getvalue() + doc = fitz.open(stream=pdf_bytes, filetype="pdf") + page = doc[0] + mat = fitz.Matrix(300 / 72, 300 / 72) + pix = page.get_pixmap(matrix=mat) + png_bytes = pix.tobytes("png") + doc.close() + return png_bytes except ImportError: - logger.warning("pdf2image not available, PDF support limited") + logger.warning("PyMuPDF not available, PDF support limited") except Exception as e: logger.error(f"PDF extraction failed: {e}")