feat: add owner's manual OCR pipeline (refs #71)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 3m1s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 3m1s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement async PDF processing for owner's manuals with maintenance schedule extraction: - Add PDF preprocessor with PyMuPDF for text/scanned PDF handling - Add maintenance pattern matching (mileage, time, fluid specs) - Add service name mapping to maintenance subtypes - Add table detection and parsing for schedule tables - Add manual extractor orchestrating the complete pipeline - Add POST /extract/manual endpoint for async job submission - Add Redis job queue support for manual extraction jobs - Add progress tracking during processing Processing pipeline: 1. Analyze PDF structure (text layer vs scanned) 2. Find maintenance schedule sections 3. Extract text or OCR scanned pages at 300 DPI 4. Detect and parse maintenance tables 5. Normalize service names and extract intervals 6. Return structured maintenance schedules with confidence scores Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,12 @@ from app.preprocessors.receipt_preprocessor import (
|
||||
ReceiptPreprocessor,
|
||||
receipt_preprocessor,
|
||||
)
|
||||
from app.preprocessors.pdf_preprocessor import (
|
||||
PdfPreprocessor,
|
||||
pdf_preprocessor,
|
||||
PdfPageContent,
|
||||
PdfInfo,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"ImagePreprocessor",
|
||||
@@ -13,4 +19,8 @@ __all__ = [
|
||||
"vin_preprocessor",
|
||||
"ReceiptPreprocessor",
|
||||
"receipt_preprocessor",
|
||||
"PdfPreprocessor",
|
||||
"pdf_preprocessor",
|
||||
"PdfPageContent",
|
||||
"PdfInfo",
|
||||
]
|
||||
|
||||
353
ocr/app/preprocessors/pdf_preprocessor.py
Normal file
353
ocr/app/preprocessors/pdf_preprocessor.py
Normal file
@@ -0,0 +1,353 @@
|
||||
"""PDF preprocessing for owner's manual extraction."""
|
||||
import io
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Iterator, Optional
|
||||
|
||||
import fitz # PyMuPDF
|
||||
from PIL import Image
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class PdfPageContent:
    """Extraction result for a single PDF page.

    Carries the page's text layer when one was found, and a rendered
    image for pages that need OCR instead.
    """

    page_number: int  # zero-indexed position within the document
    has_text: bool  # True when the extracted text met the length threshold
    text_content: str  # stripped text-layer content
    image_bytes: Optional[bytes]  # Rendered image for scanned pages
    width: int  # page width, taken from the page rectangle
    height: int  # page height, taken from the page rectangle
|
||||
|
||||
|
||||
@dataclass
class PdfInfo:
    """Summary of a PDF document's structure and metadata."""

    total_pages: int  # page count of the document
    has_text_layer: bool  # at least one sampled page had usable text
    is_scanned: bool  # True if most pages lack text layer
    file_size_bytes: int  # size of the raw PDF input
    title: Optional[str]  # document title from PDF metadata, if present
    author: Optional[str]  # document author from PDF metadata, if present
    metadata: dict = field(default_factory=dict)  # full raw metadata mapping
|
||||
|
||||
|
||||
class PdfPreprocessor:
    """Preprocess PDFs for OCR extraction.

    Handles two scenarios:
    1. Native PDFs with text layer - extract text directly
    2. Scanned PDFs - render pages to images for OCR

    Uses PyMuPDF (fitz) for both text extraction and image rendering.
    The class holds no instance state (only class-level constants), so a
    single shared instance is safe to reuse.
    """

    # DPI for rendering scanned pages
    DEFAULT_DPI = 300

    # Minimum text length to consider a page has text
    MIN_TEXT_LENGTH = 50

    # Maximum pages to sample for scan detection
    SAMPLE_PAGES = 10

    def get_pdf_info(self, pdf_bytes: bytes) -> PdfInfo:
        """
        Analyze PDF and return metadata.

        Samples up to SAMPLE_PAGES pages (from the beginning, middle, and
        end of the document) to decide whether the PDF is scanned, i.e.
        whether most pages lack a usable text layer.

        Args:
            pdf_bytes: Raw PDF bytes

        Returns:
            PdfInfo with document metadata
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        try:
            total_pages = len(doc)
            metadata = doc.metadata or {}

            # Sample from beginning, middle, and end
            if total_pages <= self.SAMPLE_PAGES:
                sample_indices = list(range(total_pages))
            else:
                sample_indices = [
                    0, 1, 2,  # Beginning
                    total_pages // 2 - 1, total_pages // 2, total_pages // 2 + 1,  # Middle
                    total_pages - 3, total_pages - 2, total_pages - 1,  # End
                ]
                sample_indices = [i for i in sample_indices if 0 <= i < total_pages]

            text_pages = sum(
                1
                for page_idx in sample_indices
                if len(doc[page_idx].get_text().strip()) >= self.MIN_TEXT_LENGTH
            )

            has_text_layer = text_pages > 0
            # Consider it a scanned PDF if less than half of sampled pages have text
            is_scanned = text_pages < len(sample_indices) / 2

            return PdfInfo(
                total_pages=total_pages,
                has_text_layer=has_text_layer,
                is_scanned=is_scanned,
                file_size_bytes=len(pdf_bytes),
                title=metadata.get("title"),
                author=metadata.get("author"),
                metadata=metadata,
            )

        finally:
            doc.close()

    def _validate_page_number(self, doc: "fitz.Document", page_number: int) -> None:
        """Raise ValueError for an out-of-range page number.

        Also rejects negative numbers: fitz accepts negative indices
        (indexing from the document end), which would silently return the
        wrong page here.
        """
        if not 0 <= page_number < len(doc):
            raise ValueError(f"Page {page_number} does not exist (max: {len(doc) - 1})")

    def _build_page_content(
        self,
        page: "fitz.Page",
        page_number: int,
        dpi: int,
        force_ocr: bool = False,
    ) -> PdfPageContent:
        """Build a PdfPageContent from an already-open page.

        Shared by all extraction entry points so the text-threshold and
        render-on-demand logic lives in exactly one place.

        Args:
            page: PyMuPDF page object (from an open document)
            page_number: Zero-indexed page number, stored in the result
            dpi: DPI used when the page must be rendered
            force_ocr: Render an image even when the page has text

        Returns:
            PdfPageContent for the page
        """
        text = page.get_text().strip()
        has_text = len(text) >= self.MIN_TEXT_LENGTH

        rect = page.rect

        # Render to image only when there is no usable text layer, or the
        # caller explicitly wants OCR for every page.
        image_bytes = None
        if not has_text or force_ocr:
            image_bytes = self._render_page_to_image(page, dpi)

        return PdfPageContent(
            page_number=page_number,
            has_text=has_text,
            # Keep any text found, even below the threshold; has_text tells
            # callers whether it met MIN_TEXT_LENGTH.
            text_content=text,
            image_bytes=image_bytes,
            width=int(rect.width),
            height=int(rect.height),
        )

    def extract_text_from_page(
        self, pdf_bytes: bytes, page_number: int
    ) -> PdfPageContent:
        """
        Extract content from a single PDF page.

        Args:
            pdf_bytes: Raw PDF bytes
            page_number: Zero-indexed page number

        Returns:
            PdfPageContent with text and/or image

        Raises:
            ValueError: If page_number is out of range
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        try:
            self._validate_page_number(doc, page_number)
            return self._build_page_content(
                doc[page_number], page_number, self.DEFAULT_DPI
            )

        finally:
            doc.close()

    def extract_all_pages(
        self,
        pdf_bytes: bytes,
        dpi: int = DEFAULT_DPI,
        force_ocr: bool = False,
    ) -> Iterator[PdfPageContent]:
        """
        Extract content from all pages as a generator.

        Args:
            pdf_bytes: Raw PDF bytes
            dpi: DPI for rendering scanned pages
            force_ocr: If True, render all pages regardless of text layer

        Yields:
            PdfPageContent for each page
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        try:
            for page_number in range(len(doc)):
                yield self._build_page_content(
                    doc[page_number], page_number, dpi, force_ocr=force_ocr
                )

        finally:
            doc.close()

    def extract_page_range(
        self,
        pdf_bytes: bytes,
        start_page: int,
        end_page: int,
        dpi: int = DEFAULT_DPI,
    ) -> list[PdfPageContent]:
        """
        Extract content from a range of pages.

        Args:
            pdf_bytes: Raw PDF bytes
            start_page: First page (zero-indexed); clamped to 0
            end_page: Last page (exclusive); clamped to the document length
            dpi: DPI for rendering

        Returns:
            List of PdfPageContent (empty when the clamped range is empty)
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        try:
            # Clamp both ends so out-of-range requests degrade gracefully
            # instead of raising or (for negatives) indexing from the end.
            start_page = max(start_page, 0)
            end_page = min(end_page, len(doc))

            return [
                self._build_page_content(doc[page_number], page_number, dpi)
                for page_number in range(start_page, end_page)
            ]

        finally:
            doc.close()

    def find_maintenance_section(
        self, pdf_bytes: bytes, keywords: Optional[list[str]] = None
    ) -> list[int]:
        """
        Find pages likely containing maintenance schedules.

        Only the text layer is searched, so scanned pages without text
        will not match even if they visually contain the keywords.

        Args:
            pdf_bytes: Raw PDF bytes
            keywords: Keywords to search for (defaults to common terms)

        Returns:
            List of page numbers likely containing maintenance info
        """
        if keywords is None:
            keywords = [
                "maintenance schedule",
                "maintenance interval",
                "service schedule",
                "service interval",
                "recommended maintenance",
                "scheduled maintenance",
                "routine maintenance",
                "periodic maintenance",
                "owner's maintenance",
                "maintenance requirements",
            ]

        # Lowercase the keywords once rather than on every page comparison.
        lowered_keywords = [keyword.lower() for keyword in keywords]

        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        try:
            maintenance_pages = []

            for page_number in range(len(doc)):
                text = doc[page_number].get_text().lower()
                if any(keyword in text for keyword in lowered_keywords):
                    maintenance_pages.append(page_number)

            return maintenance_pages

        finally:
            doc.close()

    def _render_page_to_image(self, page: "fitz.Page", dpi: int) -> bytes:
        """
        Render a PDF page to PNG image bytes.

        Args:
            page: PyMuPDF page object
            dpi: Target DPI for rendering

        Returns:
            PNG image bytes
        """
        # PDF user space is 72 DPI, so scale by dpi / 72 to hit the target.
        scale = dpi / 72.0
        matrix = fitz.Matrix(scale, scale)

        pixmap = page.get_pixmap(matrix=matrix)

        return pixmap.tobytes("png")

    def render_page_for_table_detection(
        self, pdf_bytes: bytes, page_number: int, dpi: int = 150
    ) -> bytes:
        """
        Render a page at lower DPI for table detection (faster).

        Args:
            pdf_bytes: Raw PDF bytes
            page_number: Page to render
            dpi: DPI for rendering (lower for faster processing)

        Returns:
            PNG image bytes

        Raises:
            ValueError: If page_number is out of range
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        try:
            self._validate_page_number(doc, page_number)
            return self._render_page_to_image(doc[page_number], dpi)

        finally:
            doc.close()
|
||||
|
||||
|
||||
# Singleton instance shared by importers; PdfPreprocessor keeps no
# per-instance state (only class-level constants), so reuse is safe.
pdf_preprocessor = PdfPreprocessor()
|
||||
Reference in New Issue
Block a user