feat: add owner's manual OCR pipeline (refs #71)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 3m1s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 3m1s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement async PDF processing for owner's manuals with maintenance schedule extraction: - Add PDF preprocessor with PyMuPDF for text/scanned PDF handling - Add maintenance pattern matching (mileage, time, fluid specs) - Add service name mapping to maintenance subtypes - Add table detection and parsing for schedule tables - Add manual extractor orchestrating the complete pipeline - Add POST /extract/manual endpoint for async job submission - Add Redis job queue support for manual extraction jobs - Add progress tracking during processing Processing pipeline: 1. Analyze PDF structure (text layer vs scanned) 2. Find maintenance schedule sections 3. Extract text or OCR scanned pages at 300 DPI 4. Detect and parse maintenance tables 5. Normalize service names and extract intervals 6. Return structured maintenance schedules with confidence scores Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
12
ocr/app/table_extraction/__init__.py
Normal file
12
ocr/app/table_extraction/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""Table extraction components for maintenance schedule parsing."""
|
||||
from app.table_extraction.detector import TableDetector, table_detector, DetectedTable
|
||||
from app.table_extraction.parser import TableParser, table_parser, ParsedScheduleRow
|
||||
|
||||
__all__ = [
|
||||
"TableDetector",
|
||||
"table_detector",
|
||||
"DetectedTable",
|
||||
"TableParser",
|
||||
"table_parser",
|
||||
"ParsedScheduleRow",
|
||||
]
|
||||
322
ocr/app/table_extraction/detector.py
Normal file
322
ocr/app/table_extraction/detector.py
Normal file
@@ -0,0 +1,322 @@
|
||||
"""Table detection for maintenance schedule extraction."""
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class DetectedTable:
    """A detected table in a document.

    Produced both by image-based detection (pixel bounding box is meaningful)
    and by text-based detection, where x/width are 0 and y/height hold the
    starting line index and row count instead (see ``_process_text_table``).
    """

    page_number: int  # page the table was found on
    x: int  # left edge in pixels; 0 for text-derived tables
    y: int  # top edge in pixels, or starting line index for text-derived tables
    width: int  # width in pixels; 0 for text-derived tables
    height: int  # height in pixels, or number of parsed rows for text-derived tables
    confidence: float  # heuristic detection confidence
    is_maintenance_table: bool  # True once classified as a maintenance schedule
    header_row: Optional[list[str]] = None  # header cells, when identified
    raw_content: list[list[str]] = field(default_factory=list)  # data rows as cell lists
|
||||
class TableDetector:
    """Detect tables in document pages.

    Uses computer vision techniques to identify table regions:
    1. Line detection for bordered tables
    2. Text alignment analysis for borderless tables
    3. Header keyword matching for maintenance schedule identification
    """

    # Keywords indicating maintenance schedule table headers
    MAINTENANCE_HEADERS = [
        "service", "maintenance", "item", "operation",
        "miles", "mi", "km", "kilometers",
        "months", "mo", "interval",
        "check", "replace", "inspect", "change",
        "schedule", "frequency",
    ]

    # Keywords in content that indicate maintenance
    MAINTENANCE_CONTENT_KEYWORDS = [
        "oil", "filter", "brake", "tire", "coolant",
        "fluid", "spark plug", "belt", "hose",
        "inspect", "replace", "change", "check",
    ]

    def detect_tables_in_image(
        self, image_bytes: bytes, page_number: int = 0
    ) -> list[DetectedTable]:
        """
        Detect tables in an image using line detection.

        Args:
            image_bytes: PNG/JPEG image bytes
            page_number: Page number for the result

        Returns:
            List of DetectedTable objects (``is_maintenance_table`` is left
            False here; classification happens later via
            ``is_maintenance_table``)
        """
        # Load image as grayscale
        nparr = np.frombuffer(image_bytes, np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_GRAYSCALE)

        if img is None:
            logger.warning("Failed to decode image for table detection")
            return []

        # Inverted binary threshold: table lines become white foreground
        _, binary = cv2.threshold(img, 150, 255, cv2.THRESH_BINARY_INV)

        # Detect horizontal lines with a wide, flat structuring element
        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
        horizontal_lines = cv2.morphologyEx(
            binary, cv2.MORPH_OPEN, horizontal_kernel, iterations=2
        )

        # Detect vertical lines with a tall, narrow structuring element
        vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
        vertical_lines = cv2.morphologyEx(
            binary, cv2.MORPH_OPEN, vertical_kernel, iterations=2
        )

        # Combine line masks into a single table-structure mask
        table_mask = cv2.add(horizontal_lines, vertical_lines)

        # Outer contours of the combined mask are candidate table regions
        contours, _ = cv2.findContours(
            table_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )

        tables = []
        height, width = img.shape[:2]

        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)

            # Filter by size (tables should be reasonably large)
            if w < width * 0.3 or h < height * 0.05:
                continue
            if w > width * 0.95 and h > height * 0.95:
                continue  # Skip full-page rectangles (page border artifacts)

            # Calculate confidence based on aspect ratio and size
            aspect_ratio = w / h if h > 0 else 0
            size_ratio = (w * h) / (width * height)

            # Tables typically have reasonable aspect ratios
            if 0.5 <= aspect_ratio <= 10 and 0.01 <= size_ratio <= 0.8:
                # Capped heuristic: bigger regions and aspect ratios near 2
                # score higher
                confidence = min(0.9, 0.5 + size_ratio + (1 - abs(aspect_ratio - 2) / 10))

                tables.append(
                    DetectedTable(
                        page_number=page_number,
                        x=x,
                        y=y,
                        width=w,
                        height=h,
                        confidence=confidence,
                        is_maintenance_table=False,  # Will be determined later
                    )
                )

        # Lazy %-style args: formatting is skipped when DEBUG is disabled
        logger.debug(
            "Detected %d potential tables on page %d", len(tables), page_number
        )
        return tables

    def detect_tables_in_text(
        self, text: str, page_number: int = 0
    ) -> list[DetectedTable]:
        """
        Detect table-like structures in text using pattern analysis.

        Useful for native PDFs where text is available.

        Args:
            text: Extracted text content
            page_number: Page number

        Returns:
            List of DetectedTable with content populated
        """
        tables = []
        lines = text.split("\n")

        # Look for patterns that suggest tabular data
        # - Multiple columns separated by whitespace or tabs
        # - Consistent column alignment across rows

        current_table_lines: list[str] = []
        in_table = False
        table_start_idx = 0

        for i, line in enumerate(lines):
            # Check if line looks like table row
            if self._is_table_row(line):
                if not in_table:
                    in_table = True
                    table_start_idx = i
                    current_table_lines = []
                current_table_lines.append(line)
            elif in_table:
                # A run of at least 3 consecutive table-like lines counts
                # as a table
                if len(current_table_lines) >= 3:
                    table = self._process_text_table(
                        current_table_lines, page_number, table_start_idx
                    )
                    if table:
                        tables.append(table)
                # BUGFIX: reset the run state even when the run was shorter
                # than 3 lines. Previously short runs were kept open, so
                # table-like lines separated by arbitrary prose were merged
                # into a single spurious "table" anchored at a stale start
                # index.
                in_table = False
                current_table_lines = []

        # Handle table at end of text
        if in_table and len(current_table_lines) >= 3:
            table = self._process_text_table(
                current_table_lines, page_number, table_start_idx
            )
            if table:
                tables.append(table)

        return tables

    def is_maintenance_table(
        self, table: DetectedTable, full_text: Optional[str] = None
    ) -> bool:
        """
        Determine if a detected table is a maintenance schedule.

        Args:
            table: Detected table to analyze
            full_text: Optional surrounding text for context

        Returns:
            True if likely a maintenance schedule table
        """
        # Check header row for maintenance keywords (>= 2 matches)
        if table.header_row:
            header_text = " ".join(table.header_row).lower()
            header_matches = sum(
                1 for kw in self.MAINTENANCE_HEADERS if kw in header_text
            )
            if header_matches >= 2:
                return True

        # Check content for maintenance keywords (>= 3 matches)
        if table.raw_content:
            content_text = " ".join(
                " ".join(row) for row in table.raw_content
            ).lower()
            content_matches = sum(
                1 for kw in self.MAINTENANCE_CONTENT_KEYWORDS if kw in content_text
            )
            if content_matches >= 3:
                return True

        # Check surrounding text for schedule-section phrases
        if full_text:
            text_lower = full_text.lower()
            context_keywords = [
                "maintenance schedule",
                "service schedule",
                "maintenance interval",
                "recommended maintenance",
            ]
            if any(kw in text_lower for kw in context_keywords):
                return True

        return False

    def _is_table_row(self, line: str) -> bool:
        """Check if a line looks like a table row."""
        # Skip empty lines
        stripped = line.strip()
        if not stripped:
            return False

        # Check for multiple whitespace-separated columns (2+ spaces or tab
        # acts as a column delimiter)
        parts = re.split(r"\s{2,}|\t", stripped)
        if len(parts) >= 2:
            # At least 2 columns with content
            non_empty = [p for p in parts if p.strip()]
            return len(non_empty) >= 2

        # Check for common table patterns
        # e.g., "Service Item 5,000 miles 6 months"
        if re.search(r"\d+[,.]?\d*\s*(miles?|mi\.?|km|months?|mo\.?)", stripped, re.I):
            return True

        return False

    def _process_text_table(
        self, lines: list[str], page_number: int, start_line: int
    ) -> Optional[DetectedTable]:
        """Process extracted text lines into a table structure."""
        if not lines:
            return None

        # Parse rows
        rows = []
        for line in lines:
            # Split on multiple whitespace or tabs
            parts = re.split(r"\s{2,}|\t", line.strip())
            cells = [p.strip() for p in parts if p.strip()]
            if cells:
                rows.append(cells)

        if len(rows) < 2:
            return None

        # First row is likely header
        header_row = rows[0]

        # Text tables carry line indices instead of pixel coordinates
        table = DetectedTable(
            page_number=page_number,
            x=0,  # Text tables don't have coordinates
            y=start_line,
            width=0,
            height=len(rows),
            confidence=0.7,
            is_maintenance_table=False,
            header_row=header_row,
            raw_content=rows[1:],
        )

        # Determine if it's a maintenance table; boost confidence if so
        table.is_maintenance_table = self.is_maintenance_table(table)

        if table.is_maintenance_table:
            table.confidence = 0.85

        return table

    def extract_table_text_from_region(
        self, image_bytes: bytes, table: DetectedTable
    ) -> list[list[str]]:
        """
        Extract text from a table region using OCR.

        Args:
            image_bytes: Full page image
            table: Detected table with coordinates

        Returns:
            2D list of cell contents
        """
        # This would use Tesseract on the cropped region
        # For now, return empty - actual OCR will be done in manual_extractor
        logger.debug(
            "Table region: (%d, %d) %dx%d",
            table.x, table.y, table.width, table.height,
        )
        return []
||||
|
||||
|
||||
# Singleton instance shared by importers of this module
table_detector = TableDetector()
||||
357
ocr/app/table_extraction/parser.py
Normal file
357
ocr/app/table_extraction/parser.py
Normal file
@@ -0,0 +1,357 @@
|
||||
"""Parse maintenance schedule tables into structured data."""
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from app.patterns.maintenance_patterns import maintenance_matcher
|
||||
from app.patterns.service_mapping import service_mapper
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ParsedScheduleRow:
    """A parsed maintenance schedule row."""

    service: str  # service text as it appeared in the table/line
    normalized_service: Optional[str]  # canonical name from service_mapper, if matched
    subtypes: list[str]  # maintenance subtypes from service_mapper
    interval_miles: Optional[int]  # mileage interval, when extracted
    interval_months: Optional[int]  # time interval in months, when extracted
    details: Optional[str]  # contents of a notes/details column, if present
    fluid_spec: Optional[str]  # fluid specification extracted by maintenance_matcher
    confidence: float  # combined parsing confidence
    raw_row: list[str] = field(default_factory=list)  # original cells (or line) the row came from
|
||||
class TableParser:
    """Parse detected tables into maintenance schedules.

    Handles various table formats:
    - Service | Miles | Months | Notes
    - Service | Interval | Description
    - Miles/Months header with service rows
    """

    # Common column header patterns (regexes, matched case-insensitively)
    COLUMN_PATTERNS = {
        "service": [
            r"service", r"item", r"maintenance", r"operation",
            r"component", r"part", r"system", r"description",
        ],
        "miles": [
            r"miles?", r"mi\.?", r"mileage", r"odometer",
            r"km", r"kilometers?",
        ],
        "months": [
            r"months?", r"mo\.?", r"time", r"interval",
            r"years?", r"yr\.?",
        ],
        "details": [
            r"notes?", r"details?", r"remarks?", r"comments?",
            r"specification", r"specs?", r"procedure",
        ],
    }

    def parse_table(
        self,
        header_row: list[str],
        data_rows: list[list[str]],
    ) -> list[ParsedScheduleRow]:
        """
        Parse a maintenance table into structured schedule rows.

        Args:
            header_row: Table header cells
            data_rows: Table data rows

        Returns:
            List of ParsedScheduleRow objects
        """
        # Identify column types
        column_types = self._identify_columns(header_row)

        if not column_types:
            logger.warning("Could not identify table columns")
            return self._parse_without_headers(data_rows)

        results = []

        for row in data_rows:
            parsed = self._parse_row(row, column_types)
            if parsed:
                results.append(parsed)

        return results

    def parse_text_block(self, text: str) -> list[ParsedScheduleRow]:
        """
        Parse maintenance schedules from unstructured text.

        Useful when table detection fails but text contains schedule info.

        Args:
            text: Text block that may contain maintenance schedules

        Returns:
            List of ParsedScheduleRow objects
        """
        results = []
        lines = text.split("\n")

        for line in lines:
            # Look for lines with service + interval pattern
            service_match = service_mapper.map_service(line)
            mileage_match = maintenance_matcher.extract_mileage_interval(line)
            time_match = maintenance_matcher.extract_time_interval(line)

            if service_match and (mileage_match or time_match):
                # Extract fluid spec if present
                fluid_match = maintenance_matcher.extract_fluid_spec(line)

                results.append(
                    ParsedScheduleRow(
                        service=line.strip(),
                        normalized_service=service_match.normalized_name,
                        subtypes=service_match.subtypes,
                        interval_miles=mileage_match.value if mileage_match else None,
                        interval_months=time_match.value if time_match else None,
                        details=None,
                        fluid_spec=fluid_match.value if fluid_match else None,
                        # Overall confidence is the weakest contributing match
                        confidence=min(
                            service_match.confidence,
                            mileage_match.confidence if mileage_match else 1.0,
                            time_match.confidence if time_match else 1.0,
                        ),
                        raw_row=[line],
                    )
                )

        return results

    def _identify_columns(
        self, header_row: list[str]
    ) -> dict[int, str]:
        """
        Identify column types from header row.

        Args:
            header_row: Table header cells

        Returns:
            Dict mapping column index to type
        """
        column_types: dict[int, str] = {}

        for i, header in enumerate(header_row):
            header_lower = header.lower().strip()

            # First pattern that matches wins for this column
            for col_type, patterns in self.COLUMN_PATTERNS.items():
                for pattern in patterns:
                    if re.search(pattern, header_lower, re.IGNORECASE):
                        column_types[i] = col_type
                        break
                if i in column_types:
                    break

        # If no service column found, assume the first unassigned column
        if "service" not in column_types.values() and header_row:
            for i, header in enumerate(header_row):
                if i not in column_types:
                    column_types[i] = "service"
                    break

        return column_types

    def _parse_row(
        self,
        row: list[str],
        column_types: dict[int, str],
    ) -> Optional[ParsedScheduleRow]:
        """
        Parse a single data row using identified column types.

        Args:
            row: Table row cells
            column_types: Column index to type mapping

        Returns:
            ParsedScheduleRow or None
        """
        service = ""
        interval_miles: Optional[int] = None
        interval_months: Optional[int] = None
        details: Optional[str] = None
        fluid_spec: Optional[str] = None

        # Extract values based on column types
        for i, cell in enumerate(row):
            cell_value = cell.strip()
            if not cell_value:
                continue

            col_type = column_types.get(i)

            if col_type == "service":
                service = cell_value
            elif col_type == "miles":
                miles = self._extract_miles(cell_value)
                if miles:
                    interval_miles = miles
            elif col_type == "months":
                months = self._extract_months(cell_value)
                if months:
                    interval_months = months
            elif col_type == "details":
                details = cell_value
                # Also check for fluid specs in details
                fluid_match = maintenance_matcher.extract_fluid_spec(cell_value)
                if fluid_match:
                    fluid_spec = fluid_match.value

        # If no explicit miles/months columns, try to extract from service text
        if not interval_miles and not interval_months:
            mileage_match = maintenance_matcher.extract_mileage_interval(service)
            time_match = maintenance_matcher.extract_time_interval(service)
            if mileage_match:
                interval_miles = mileage_match.value
            if time_match:
                interval_months = time_match.value

        # Last resort: check for intervals in any cell
        if not interval_miles:
            for cell in row:
                mileage_match = maintenance_matcher.extract_mileage_interval(cell)
                if mileage_match:
                    interval_miles = mileage_match.value
                    break

        if not interval_months:
            for cell in row:
                time_match = maintenance_matcher.extract_time_interval(cell)
                if time_match:
                    interval_months = time_match.value
                    break

        # Skip if no service identified
        if not service:
            return None

        # Map service to normalized name and subtypes
        service_match = service_mapper.map_service(service)

        normalized_service = service_match.normalized_name if service_match else None
        subtypes = service_match.subtypes if service_match else []
        service_confidence = service_match.confidence if service_match else 0.5

        # Calculate overall confidence: average with interval confidence when
        # an interval was found, otherwise penalize the service confidence
        interval_confidence = 0.0
        if interval_miles:
            interval_confidence = max(interval_confidence, 0.8)
        if interval_months:
            interval_confidence = max(interval_confidence, 0.8)

        confidence = (service_confidence + interval_confidence) / 2 if interval_confidence else service_confidence * 0.7

        return ParsedScheduleRow(
            service=service,
            normalized_service=normalized_service,
            subtypes=subtypes,
            interval_miles=interval_miles,
            interval_months=interval_months,
            details=details,
            fluid_spec=fluid_spec,
            confidence=confidence,
            raw_row=row,
        )

    def _parse_without_headers(
        self, data_rows: list[list[str]]
    ) -> list[ParsedScheduleRow]:
        """
        Parse table without clear headers by analyzing content.

        Args:
            data_rows: Table rows

        Returns:
            List of ParsedScheduleRow
        """
        results = []

        for row in data_rows:
            if not row:
                continue

            # Join all cells and try to extract info
            row_text = " ".join(row)

            service_match = service_mapper.map_service(row_text)
            mileage_match = maintenance_matcher.extract_mileage_interval(row_text)
            time_match = maintenance_matcher.extract_time_interval(row_text)
            fluid_match = maintenance_matcher.extract_fluid_spec(row_text)

            if service_match:
                results.append(
                    ParsedScheduleRow(
                        service=row[0] if row else row_text,
                        normalized_service=service_match.normalized_name,
                        subtypes=service_match.subtypes,
                        interval_miles=mileage_match.value if mileage_match else None,
                        interval_months=time_match.value if time_match else None,
                        details=None,
                        fluid_spec=fluid_match.value if fluid_match else None,
                        confidence=service_match.confidence * 0.8,  # Reduce for no-header parsing
                        raw_row=row,
                    )
                )

        return results

    def _extract_miles(self, text: str) -> Optional[int]:
        """Extract mileage value from cell text.

        Accepts "5,000", "5000" and "5K" style values; returns None when no
        plausible mileage (500-150,000) is found.
        """
        # First try pattern matcher
        match = maintenance_matcher.extract_mileage_interval(text)
        if match:
            return match.value

        # Try simple number extraction on a space-compacted copy so that
        # "5 K" is seen as "5K". BUGFIX: require the number to start with a
        # digit (previously a lone "," could match) and only honor a "K"
        # multiplier when it directly suffixes the number -- checking
        # `"K" in text` matched the K in words like "check", turning
        # "check 5" into 5,000 miles.
        number_match = re.search(r"(\d[\d,]*)([kK])?", text.replace(" ", ""))
        if number_match:
            num_str = number_match.group(1).replace(",", "")
            try:
                value = int(num_str)
                # Handle "5K" notation
                if number_match.group(2) and value < 1000:
                    value *= 1000
                if 500 <= value <= 150000:
                    return value
            except ValueError:
                pass

        return None

    def _extract_months(self, text: str) -> Optional[int]:
        """Extract month interval from cell text (1-120 months)."""
        # First try pattern matcher
        match = maintenance_matcher.extract_time_interval(text)
        if match:
            return match.value

        # Try simple number extraction
        number_match = re.search(r"(\d+)", text)
        if number_match:
            try:
                value = int(number_match.group(1))
                if 1 <= value <= 120:
                    return value
            except ValueError:
                pass

        return None
||||
|
||||
# Singleton instance shared by importers of this module
table_parser = TableParser()
||||
Reference in New Issue
Block a user