"""PDF preprocessing for owner's manual extraction.""" import io import logging from dataclasses import dataclass, field from typing import Iterator, Optional import fitz # PyMuPDF from PIL import Image logger = logging.getLogger(__name__) @dataclass class PdfPageContent: """Content extracted from a single PDF page.""" page_number: int has_text: bool text_content: str image_bytes: Optional[bytes] # Rendered image for scanned pages width: int height: int @dataclass class PdfInfo: """Information about a PDF document.""" total_pages: int has_text_layer: bool is_scanned: bool # True if most pages lack text layer file_size_bytes: int title: Optional[str] author: Optional[str] metadata: dict = field(default_factory=dict) class PdfPreprocessor: """Preprocess PDFs for OCR extraction. Handles two scenarios: 1. Native PDFs with text layer - extract text directly 2. Scanned PDFs - render pages to images for OCR Uses PyMuPDF (fitz) for both text extraction and image rendering. """ # DPI for rendering scanned pages DEFAULT_DPI = 300 # Minimum text length to consider a page has text MIN_TEXT_LENGTH = 50 # Maximum pages to sample for scan detection SAMPLE_PAGES = 10 def get_pdf_info(self, pdf_bytes: bytes) -> PdfInfo: """ Analyze PDF and return metadata. Args: pdf_bytes: Raw PDF bytes Returns: PdfInfo with document metadata """ doc = fitz.open(stream=pdf_bytes, filetype="pdf") try: total_pages = len(doc) metadata = doc.metadata or {} # Sample pages to determine if scanned text_pages = 0 sample_count = min(total_pages, self.SAMPLE_PAGES) # Sample from beginning, middle, and end if total_pages <= self.SAMPLE_PAGES: sample_indices = list(range(total_pages)) else: sample_indices = [ 0, 1, 2, # Beginning total_pages // 2 - 1, total_pages // 2, total_pages // 2 + 1, # Middle total_pages - 3, total_pages - 2, total_pages - 1, # End ] sample_indices = [i for i in sample_indices if 0 <= i < total_pages] for page_idx in sample_indices: page = doc[page_idx] text = page.get_text().strip() if len(text) >= self.MIN_TEXT_LENGTH: text_pages += 1 # Consider it a scanned PDF if less than half of sampled pages have text has_text_layer = text_pages > 0 is_scanned = text_pages < len(sample_indices) / 2 return PdfInfo( total_pages=total_pages, has_text_layer=has_text_layer, is_scanned=is_scanned, file_size_bytes=len(pdf_bytes), title=metadata.get("title"), author=metadata.get("author"), metadata=metadata, ) finally: doc.close() def extract_text_from_page( self, pdf_bytes: bytes, page_number: int ) -> PdfPageContent: """ Extract content from a single PDF page. Args: pdf_bytes: Raw PDF bytes page_number: Zero-indexed page number Returns: PdfPageContent with text and/or image """ doc = fitz.open(stream=pdf_bytes, filetype="pdf") try: if page_number >= len(doc): raise ValueError(f"Page {page_number} does not exist (max: {len(doc) - 1})") page = doc[page_number] text = page.get_text().strip() has_text = len(text) >= self.MIN_TEXT_LENGTH rect = page.rect width = int(rect.width) height = int(rect.height) # If page has text, we don't need to render image_bytes = None if not has_text: image_bytes = self._render_page_to_image(page, self.DEFAULT_DPI) return PdfPageContent( page_number=page_number, has_text=has_text, text_content=text, image_bytes=image_bytes, width=width, height=height, ) finally: doc.close() def extract_all_pages( self, pdf_bytes: bytes, dpi: int = DEFAULT_DPI, force_ocr: bool = False, ) -> Iterator[PdfPageContent]: """ Extract content from all pages as a generator. Args: pdf_bytes: Raw PDF bytes dpi: DPI for rendering scanned pages force_ocr: If True, render all pages regardless of text layer Yields: PdfPageContent for each page """ doc = fitz.open(stream=pdf_bytes, filetype="pdf") try: for page_number in range(len(doc)): page = doc[page_number] text = page.get_text().strip() has_text = len(text) >= self.MIN_TEXT_LENGTH rect = page.rect width = int(rect.width) height = int(rect.height) # Render to image if no text or force_ocr image_bytes = None if not has_text or force_ocr: image_bytes = self._render_page_to_image(page, dpi) yield PdfPageContent( page_number=page_number, has_text=has_text, text_content=text if has_text else "", image_bytes=image_bytes, width=width, height=height, ) finally: doc.close() def extract_page_range( self, pdf_bytes: bytes, start_page: int, end_page: int, dpi: int = DEFAULT_DPI, ) -> list[PdfPageContent]: """ Extract content from a range of pages. Args: pdf_bytes: Raw PDF bytes start_page: First page (zero-indexed) end_page: Last page (exclusive) dpi: DPI for rendering Returns: List of PdfPageContent """ doc = fitz.open(stream=pdf_bytes, filetype="pdf") try: results = [] end_page = min(end_page, len(doc)) for page_number in range(start_page, end_page): page = doc[page_number] text = page.get_text().strip() has_text = len(text) >= self.MIN_TEXT_LENGTH rect = page.rect width = int(rect.width) height = int(rect.height) image_bytes = None if not has_text: image_bytes = self._render_page_to_image(page, dpi) results.append( PdfPageContent( page_number=page_number, has_text=has_text, text_content=text if has_text else "", image_bytes=image_bytes, width=width, height=height, ) ) return results finally: doc.close() def find_maintenance_section( self, pdf_bytes: bytes, keywords: Optional[list[str]] = None ) -> list[int]: """ Find pages likely containing maintenance schedules. Args: pdf_bytes: Raw PDF bytes keywords: Keywords to search for (defaults to common terms) Returns: List of page numbers likely containing maintenance info """ if keywords is None: keywords = [ "maintenance schedule", "maintenance interval", "service schedule", "service interval", "recommended maintenance", "scheduled maintenance", "routine maintenance", "periodic maintenance", "owner's maintenance", "maintenance requirements", ] doc = fitz.open(stream=pdf_bytes, filetype="pdf") try: maintenance_pages = [] for page_number in range(len(doc)): page = doc[page_number] text = page.get_text().lower() for keyword in keywords: if keyword.lower() in text: maintenance_pages.append(page_number) break return maintenance_pages finally: doc.close() def _render_page_to_image(self, page: fitz.Page, dpi: int) -> bytes: """ Render a PDF page to PNG image bytes. Args: page: PyMuPDF page object dpi: Target DPI for rendering Returns: PNG image bytes """ # Calculate scale factor from DPI # Default PDF resolution is 72 DPI scale = dpi / 72.0 matrix = fitz.Matrix(scale, scale) # Render page to pixmap pixmap = page.get_pixmap(matrix=matrix) # Convert to PNG bytes png_bytes = pixmap.tobytes("png") return png_bytes def render_page_for_table_detection( self, pdf_bytes: bytes, page_number: int, dpi: int = 150 ) -> bytes: """ Render a page at lower DPI for table detection (faster). Args: pdf_bytes: Raw PDF bytes page_number: Page to render dpi: DPI for rendering (lower for faster processing) Returns: PNG image bytes """ doc = fitz.open(stream=pdf_bytes, filetype="pdf") try: if page_number >= len(doc): raise ValueError(f"Page {page_number} does not exist") page = doc[page_number] return self._render_page_to_image(page, dpi) finally: doc.close() # Singleton instance pdf_preprocessor = PdfPreprocessor()