Files
motovaultpro/ocr/app/preprocessors/pdf_preprocessor.py
Eric Gullickson 3eb54211cb
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 3m1s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
feat: add owner's manual OCR pipeline (refs #71)
Implement async PDF processing for owner's manuals with maintenance
schedule extraction:

- Add PDF preprocessor with PyMuPDF for text/scanned PDF handling
- Add maintenance pattern matching (mileage, time, fluid specs)
- Add service name mapping to maintenance subtypes
- Add table detection and parsing for schedule tables
- Add manual extractor orchestrating the complete pipeline
- Add POST /extract/manual endpoint for async job submission
- Add Redis job queue support for manual extraction jobs
- Add progress tracking during processing

Processing pipeline:
1. Analyze PDF structure (text layer vs scanned)
2. Find maintenance schedule sections
3. Extract text or OCR scanned pages at 300 DPI
4. Detect and parse maintenance tables
5. Normalize service names and extract intervals
6. Return structured maintenance schedules with confidence scores

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 21:30:20 -06:00

354 lines
10 KiB
Python

"""PDF preprocessing for owner's manual extraction."""
import io
import logging
from dataclasses import dataclass, field
from typing import Iterator, Optional
import fitz # PyMuPDF
from PIL import Image
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
@dataclass
class PdfPageContent:
    """Per-page result produced while preprocessing a PDF.

    Holds the text taken from the page and, when the page was rendered,
    the rendered image bytes together with the page dimensions.
    """

    # Index of the page inside the source document.
    page_number: int

    # Whether the page was judged to carry a usable text layer.
    has_text: bool

    # Text taken from the page's text layer (may be empty).
    text_content: str

    # Rendered image for scanned pages; None when no rendering was done.
    image_bytes: Optional[bytes]

    # Page width and height as whole numbers.
    width: int
    height: int
@dataclass
class PdfInfo:
    """Document-level summary of a PDF."""

    # Number of pages in the document.
    total_pages: int

    # Whether a text layer was found in the document.
    has_text_layer: bool

    # True if most pages lack a text layer.
    is_scanned: bool

    # Size of the raw PDF in bytes.
    file_size_bytes: int

    # Document title and author; None when not available.
    title: Optional[str]
    author: Optional[str]

    # Raw document metadata; each instance gets its own empty dict by default.
    metadata: dict = field(default_factory=dict)
class PdfPreprocessor:
    """Preprocess PDFs for OCR extraction.

    Handles two scenarios:
    1. Native PDFs with text layer - extract text directly
    2. Scanned PDFs - render pages to images for OCR

    Uses PyMuPDF (fitz) for both text extraction and image rendering.
    Every public method opens the document from the supplied bytes and
    closes it again before returning (for the generator: when iteration
    finishes or the generator is closed).
    """

    # DPI for rendering scanned pages
    DEFAULT_DPI = 300

    # Minimum stripped text length to consider a page has text
    MIN_TEXT_LENGTH = 50

    # Maximum pages to sample for scan detection
    SAMPLE_PAGES = 10

    # Phrases searched for when the caller supplies no keywords
    DEFAULT_MAINTENANCE_KEYWORDS = (
        "maintenance schedule",
        "maintenance interval",
        "service schedule",
        "service interval",
        "recommended maintenance",
        "scheduled maintenance",
        "routine maintenance",
        "periodic maintenance",
        "owner's maintenance",
        "maintenance requirements",
    )

    def get_pdf_info(self, pdf_bytes: bytes) -> PdfInfo:
        """
        Analyze PDF and return metadata.

        Args:
            pdf_bytes: Raw PDF bytes

        Returns:
            PdfInfo with document metadata
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            total_pages = len(doc)
            metadata = doc.metadata or {}

            # Sample pages to determine if scanned
            sample_indices = self._sample_page_indices(
                total_pages, self.SAMPLE_PAGES
            )
            text_pages = sum(
                1
                for page_idx in sample_indices
                if len(doc[page_idx].get_text().strip()) >= self.MIN_TEXT_LENGTH
            )

            # Consider it a scanned PDF if less than half of sampled pages
            # have text
            has_text_layer = text_pages > 0
            is_scanned = text_pages < len(sample_indices) / 2

            return PdfInfo(
                total_pages=total_pages,
                has_text_layer=has_text_layer,
                is_scanned=is_scanned,
                file_size_bytes=len(pdf_bytes),
                title=metadata.get("title"),
                author=metadata.get("author"),
                metadata=metadata,
            )
        finally:
            doc.close()

    def extract_text_from_page(
        self, pdf_bytes: bytes, page_number: int, dpi: int = DEFAULT_DPI
    ) -> PdfPageContent:
        """
        Extract content from a single PDF page.

        Unlike extract_all_pages and extract_page_range, the returned
        text_content keeps the stripped page text even when it is shorter
        than MIN_TEXT_LENGTH (i.e. when has_text is False).

        Args:
            pdf_bytes: Raw PDF bytes
            page_number: Zero-indexed page number
            dpi: DPI for rendering the page when it has no text

        Returns:
            PdfPageContent with text and/or image

        Raises:
            ValueError: If page_number is negative or past the last page
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            # Negative numbers are rejected explicitly: PyMuPDF would
            # otherwise count them from the end of the document and
            # silently return a different page.
            if not 0 <= page_number < len(doc):
                raise ValueError(f"Page {page_number} does not exist (max: {len(doc) - 1})")

            return self._build_page_content(
                doc[page_number], page_number, dpi, keep_short_text=True
            )
        finally:
            doc.close()

    def extract_all_pages(
        self,
        pdf_bytes: bytes,
        dpi: int = DEFAULT_DPI,
        force_ocr: bool = False,
    ) -> Iterator[PdfPageContent]:
        """
        Extract content from all pages as a generator.

        Args:
            pdf_bytes: Raw PDF bytes
            dpi: DPI for rendering scanned pages
            force_ocr: If True, render all pages regardless of text layer

        Yields:
            PdfPageContent for each page
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            for page_number in range(len(doc)):
                yield self._build_page_content(
                    doc[page_number], page_number, dpi, force_render=force_ocr
                )
        finally:
            doc.close()

    def extract_page_range(
        self,
        pdf_bytes: bytes,
        start_page: int,
        end_page: int,
        dpi: int = DEFAULT_DPI,
    ) -> list[PdfPageContent]:
        """
        Extract content from a range of pages.

        The range is clamped to the document: a negative start_page is
        treated as 0 and an end_page past the last page is treated as the
        page count.

        Args:
            pdf_bytes: Raw PDF bytes
            start_page: First page (zero-indexed)
            end_page: Last page (exclusive)
            dpi: DPI for rendering

        Returns:
            List of PdfPageContent
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            # Clamp both ends; an unclamped negative start would make
            # PyMuPDF wrap around to pages at the end of the document.
            first = max(start_page, 0)
            last = min(end_page, len(doc))
            return [
                self._build_page_content(doc[page_number], page_number, dpi)
                for page_number in range(first, last)
            ]
        finally:
            doc.close()

    def find_maintenance_section(
        self, pdf_bytes: bytes, keywords: Optional[list[str]] = None
    ) -> list[int]:
        """
        Find pages likely containing maintenance schedules.

        Matching is a case-insensitive substring search of each page's
        text; a page is reported once if any keyword occurs on it.

        Args:
            pdf_bytes: Raw PDF bytes
            keywords: Keywords to search for (defaults to common terms)

        Returns:
            List of page numbers likely containing maintenance info
        """
        if keywords is None:
            keywords = self.DEFAULT_MAINTENANCE_KEYWORDS

        # Lowercase the keywords once instead of once per page
        needles = [keyword.lower() for keyword in keywords]

        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            maintenance_pages = []
            for page_number in range(len(doc)):
                text = doc[page_number].get_text().lower()
                if any(needle in text for needle in needles):
                    maintenance_pages.append(page_number)
            return maintenance_pages
        finally:
            doc.close()

    @staticmethod
    def _sample_page_indices(total_pages: int, max_samples: int) -> list[int]:
        """
        Choose which pages to inspect for scan detection.

        Args:
            total_pages: Number of pages in the document
            max_samples: Documents with at most this many pages are
                sampled in full

        Returns:
            Every page index for short documents; otherwise three indices
            each from the beginning, middle and end, restricted to valid
            indices and with duplicates removed (first occurrence kept)
        """
        if total_pages <= max_samples:
            return list(range(total_pages))

        middle = total_pages // 2
        candidates = [
            0, 1, 2,  # Beginning
            middle - 1, middle, middle + 1,  # Middle
            total_pages - 3, total_pages - 2, total_pages - 1,  # End
        ]
        # dict.fromkeys removes duplicates while preserving order
        return [i for i in dict.fromkeys(candidates) if 0 <= i < total_pages]

    def _build_page_content(
        self,
        page: fitz.Page,
        page_number: int,
        dpi: int,
        force_render: bool = False,
        keep_short_text: bool = False,
    ) -> PdfPageContent:
        """
        Build the PdfPageContent for an already-loaded page.

        Args:
            page: PyMuPDF page object
            page_number: Page number to record in the result
            dpi: DPI used if the page is rendered
            force_render: Render the page even when it has text
            keep_short_text: Keep text shorter than MIN_TEXT_LENGTH in
                text_content instead of replacing it with ""

        Returns:
            PdfPageContent with text and/or image
        """
        text = page.get_text().strip()
        has_text = len(text) >= self.MIN_TEXT_LENGTH
        rect = page.rect

        # Pages with text are not rendered unless explicitly requested
        image_bytes = None
        if force_render or not has_text:
            image_bytes = self._render_page_to_image(page, dpi)

        return PdfPageContent(
            page_number=page_number,
            has_text=has_text,
            text_content=text if (has_text or keep_short_text) else "",
            image_bytes=image_bytes,
            width=int(rect.width),
            height=int(rect.height),
        )

    def _render_page_to_image(self, page: fitz.Page, dpi: int) -> bytes:
        """
        Render a PDF page to PNG image bytes.

        Args:
            page: PyMuPDF page object
            dpi: Target DPI for rendering

        Returns:
            PNG image bytes
        """
        # Calculate scale factor from DPI
        # Default PDF resolution is 72 DPI
        scale = dpi / 72.0
        matrix = fitz.Matrix(scale, scale)

        # Render page to pixmap, then encode as PNG
        pixmap = page.get_pixmap(matrix=matrix)
        return pixmap.tobytes("png")

    def render_page_for_table_detection(
        self, pdf_bytes: bytes, page_number: int, dpi: int = 150
    ) -> bytes:
        """
        Render a page at lower DPI for table detection (faster).

        Args:
            pdf_bytes: Raw PDF bytes
            page_number: Page to render (zero-indexed)
            dpi: DPI for rendering (lower for faster processing)

        Returns:
            PNG image bytes

        Raises:
            ValueError: If page_number is negative or past the last page
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            # Reject negatives too; PyMuPDF would count them from the end
            if not 0 <= page_number < len(doc):
                raise ValueError(f"Page {page_number} does not exist")

            return self._render_page_to_image(doc[page_number], dpi)
        finally:
            doc.close()
# Singleton instance: a shared module-level PdfPreprocessor, created at import time.
pdf_preprocessor = PdfPreprocessor()