Files
motovaultpro/ocr/app/table_extraction/parser.py
Eric Gullickson 3eb54211cb
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 3m1s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
feat: add owner's manual OCR pipeline (refs #71)
Implement async PDF processing for owner's manuals with maintenance
schedule extraction:

- Add PDF preprocessor with PyMuPDF for text/scanned PDF handling
- Add maintenance pattern matching (mileage, time, fluid specs)
- Add service name mapping to maintenance subtypes
- Add table detection and parsing for schedule tables
- Add manual extractor orchestrating the complete pipeline
- Add POST /extract/manual endpoint for async job submission
- Add Redis job queue support for manual extraction jobs
- Add progress tracking during processing

Processing pipeline:
1. Analyze PDF structure (text layer vs scanned)
2. Find maintenance schedule sections
3. Extract text or OCR scanned pages at 300 DPI
4. Detect and parse maintenance tables
5. Normalize service names and extract intervals
6. Return structured maintenance schedules with confidence scores

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 21:30:20 -06:00

358 lines
12 KiB
Python

"""Parse maintenance schedule tables into structured data."""
import logging
import re
from dataclasses import dataclass, field
from typing import Optional
from app.patterns.maintenance_patterns import maintenance_matcher
from app.patterns.service_mapping import service_mapper
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
@dataclass
class ParsedScheduleRow:
    """One maintenance-schedule entry produced by the table parser.

    Attributes:
        service: Service text as it appeared in the source (a table cell or
            a stripped text line).
        normalized_service: Normalized name reported by the service mapper,
            or None when the mapper found no match.
        subtypes: Maintenance subtypes reported by the service mapper; empty
            when there was no match.
        interval_miles: Extracted mileage interval, or None if none was found.
        interval_months: Extracted time interval, or None if none was found.
        details: Free-form text from a details/notes column, if any.
        fluid_spec: Fluid specification extracted from the row, if any.
        confidence: Score assigned by the parser to this row.
        raw_row: The original cells (or the single text line) the entry was
            built from; defaults to a fresh empty list per instance.
    """

    # What the service is.
    service: str
    normalized_service: Optional[str]
    subtypes: list[str]

    # How often it is due.
    interval_miles: Optional[int]
    interval_months: Optional[int]

    # Supplementary information.
    details: Optional[str]
    fluid_spec: Optional[str]

    # Provenance and quality.
    confidence: float
    raw_row: list[str] = field(default_factory=list)
class TableParser:
    """Parse detected tables into maintenance schedules.

    Handles various table formats:
    - Service | Miles | Months | Notes
    - Service | Interval | Description
    - Miles/Months header with service rows

    Service names, interval phrases and fluid specs are recognized by the
    module-level ``service_mapper`` and ``maintenance_matcher`` helpers; this
    class decides which cell is handed to which helper and assembles the
    resulting ``ParsedScheduleRow`` objects.
    """

    # Common column header patterns.
    # Each entry is a regex fragment searched (unanchored, case-insensitive)
    # in a header cell. Column types are tried in the order listed here and
    # the first type with a matching pattern wins.
    COLUMN_PATTERNS = {
        "service": [
            r"service", r"item", r"maintenance", r"operation",
            r"component", r"part", r"system", r"description",
        ],
        "miles": [
            r"miles?", r"mi\.?", r"mileage", r"odometer",
            r"km", r"kilometers?",
        ],
        "months": [
            r"months?", r"mo\.?", r"time", r"interval",
            r"years?", r"yr\.?",
        ],
        "details": [
            r"notes?", r"details?", r"remarks?", r"comments?",
            r"specification", r"specs?", r"procedure",
        ],
    }

    # Inclusive bounds a bare number must fall within to be accepted as an
    # interval by the numeric fallbacks below.
    _MILES_BOUNDS = (500, 150000)
    _MONTHS_BOUNDS = (1, 120)

    # Confidence credited to a row when at least one interval was extracted.
    _INTERVAL_CONFIDENCE = 0.8

    # Bare mileage number: "5000", "5,000", "10 000", optionally with a
    # fractional part and a "K" multiplier directly after it ("5K", "7.5 k").
    # The K only counts when it is not the start of another word, except
    # "mi..." so that "5Kmiles" still works; this keeps "km", "kilometers"
    # and unrelated words from being read as a thousands multiplier.
    _BARE_MILEAGE_RE = re.compile(
        r"(?P<whole>\d{1,3}(?:[ ,]\d{3})+(?!\d)|\d[\d,]*)"
        r"(?P<frac>\.\d+)?"
        r"\s*(?P<kilo>k(?:(?![a-z])|(?=mi)))?",
        re.IGNORECASE,
    )

    def parse_table(
        self,
        header_row: list[str],
        data_rows: list[list[str]],
    ) -> list[ParsedScheduleRow]:
        """
        Parse a maintenance table into structured schedule rows.

        Args:
            header_row: Table header cells
            data_rows: Table data rows

        Returns:
            List of ParsedScheduleRow objects. Rows in which no service cell
            could be identified are dropped. When no column can be typed at
            all (which only happens for an empty header row), a warning is
            logged and the rows are parsed by content instead.
        """
        column_types = self._identify_columns(header_row)
        if not column_types:
            logger.warning("Could not identify table columns")
            return self._parse_without_headers(data_rows)

        parsed_rows = (self._parse_row(row, column_types) for row in data_rows)
        return [parsed for parsed in parsed_rows if parsed]

    def parse_text_block(self, text: str) -> list[ParsedScheduleRow]:
        """
        Parse maintenance schedules from unstructured text.

        Useful when table detection fails but text contains schedule info.
        Each line is examined on its own; a line is kept only when the
        service mapper recognizes a service in it AND at least one of a
        mileage or time interval is found.

        Args:
            text: Text block that may contain maintenance schedules

        Returns:
            List of ParsedScheduleRow objects, one per qualifying line
        """
        results: list[ParsedScheduleRow] = []
        for line in text.split("\n"):
            service_match = service_mapper.map_service(line)
            mileage_match = maintenance_matcher.extract_mileage_interval(line)
            time_match = maintenance_matcher.extract_time_interval(line)
            if not (service_match and (mileage_match or time_match)):
                continue

            # Fluid spec is optional extra information on a qualifying line.
            fluid_match = maintenance_matcher.extract_fluid_spec(line)
            results.append(
                ParsedScheduleRow(
                    service=line.strip(),
                    normalized_service=service_match.normalized_name,
                    subtypes=service_match.subtypes,
                    interval_miles=mileage_match.value if mileage_match else None,
                    interval_months=time_match.value if time_match else None,
                    details=None,
                    fluid_spec=fluid_match.value if fluid_match else None,
                    # The weakest individual match bounds the row's confidence;
                    # a missing interval does not lower it.
                    confidence=min(
                        service_match.confidence,
                        mileage_match.confidence if mileage_match else 1.0,
                        time_match.confidence if time_match else 1.0,
                    ),
                    raw_row=[line],
                )
            )
        return results

    def _classify_header(self, header: str) -> Optional[str]:
        """Return the first column type whose patterns match ``header``, else None."""
        header_lower = header.lower().strip()
        for col_type, patterns in self.COLUMN_PATTERNS.items():
            if any(re.search(p, header_lower, re.IGNORECASE) for p in patterns):
                return col_type
        return None

    def _identify_columns(
        self, header_row: list[str]
    ) -> dict[int, str]:
        """
        Identify column types from header row.

        Args:
            header_row: Table header cells

        Returns:
            Dict mapping column index to type ("service", "miles", "months"
            or "details"). Columns whose header matches nothing are omitted,
            except that when no header was typed as "service" the first
            unclassified column is assumed to be the service column.
        """
        column_types: dict[int, str] = {}
        for index, header in enumerate(header_row):
            col_type = self._classify_header(header)
            if col_type is not None:
                column_types[index] = col_type

        # No service column found: fall back to the first column that has
        # not already been given another type.
        if header_row and "service" not in column_types.values():
            fallback = next(
                (i for i in range(len(header_row)) if i not in column_types),
                None,
            )
            if fallback is not None:
                column_types[fallback] = "service"
        return column_types

    def _parse_row(
        self,
        row: list[str],
        column_types: dict[int, str],
    ) -> Optional[ParsedScheduleRow]:
        """
        Parse a single data row using identified column types.

        Args:
            row: Table row cells
            column_types: Column index to type mapping

        Returns:
            ParsedScheduleRow, or None when the row has no non-empty cell in
            a service column
        """
        service = ""
        interval_miles: Optional[int] = None
        interval_months: Optional[int] = None
        details: Optional[str] = None
        fluid_spec: Optional[str] = None

        # Extract values based on column types; blank cells are ignored and
        # a failed extraction never overwrites an earlier successful one.
        for index, cell in enumerate(row):
            cell_value = cell.strip()
            if not cell_value:
                continue
            col_type = column_types.get(index)
            if col_type == "service":
                service = cell_value
            elif col_type == "miles":
                interval_miles = self._extract_miles(cell_value) or interval_miles
            elif col_type == "months":
                interval_months = self._extract_months(cell_value) or interval_months
            elif col_type == "details":
                details = cell_value
                # Also check for fluid specs in details
                fluid_match = maintenance_matcher.extract_fluid_spec(cell_value)
                if fluid_match:
                    fluid_spec = fluid_match.value

        # Skip if no service identified. Checked before the interval
        # fallbacks so discarded rows do not incur further matcher calls.
        if not service:
            return None

        # If no explicit miles/months columns, try to extract from service text
        if not interval_miles and not interval_months:
            mileage_match = maintenance_matcher.extract_mileage_interval(service)
            time_match = maintenance_matcher.extract_time_interval(service)
            if mileage_match:
                interval_miles = mileage_match.value
            if time_match:
                interval_months = time_match.value

        # Last resort: take the first interval phrase found in any cell.
        if not interval_miles:
            for cell in row:
                mileage_match = maintenance_matcher.extract_mileage_interval(cell)
                if mileage_match:
                    interval_miles = mileage_match.value
                    break
        if not interval_months:
            for cell in row:
                time_match = maintenance_matcher.extract_time_interval(cell)
                if time_match:
                    interval_months = time_match.value
                    break

        # Map service to normalized name and subtypes
        service_match = service_mapper.map_service(service)
        normalized_service = service_match.normalized_name if service_match else None
        subtypes = service_match.subtypes if service_match else []
        service_confidence = service_match.confidence if service_match else 0.5

        # Overall confidence: average service and interval confidence when an
        # interval was found, otherwise penalize the service confidence.
        if interval_miles or interval_months:
            confidence = (service_confidence + self._INTERVAL_CONFIDENCE) / 2
        else:
            confidence = service_confidence * 0.7

        return ParsedScheduleRow(
            service=service,
            normalized_service=normalized_service,
            subtypes=subtypes,
            interval_miles=interval_miles,
            interval_months=interval_months,
            details=details,
            fluid_spec=fluid_spec,
            confidence=confidence,
            raw_row=row,
        )

    def _parse_without_headers(
        self, data_rows: list[list[str]]
    ) -> list[ParsedScheduleRow]:
        """
        Parse table without clear headers by analyzing content.

        All cells of a row are joined with spaces and searched as one string.
        A row is kept only when the service mapper recognizes a service in
        that joined text; its first cell is reported as the service text.

        Args:
            data_rows: Table rows

        Returns:
            List of ParsedScheduleRow
        """
        results: list[ParsedScheduleRow] = []
        for row in data_rows:
            if not row:
                continue

            # Join all cells and try to extract info
            row_text = " ".join(row)
            service_match = service_mapper.map_service(row_text)
            if not service_match:
                continue

            mileage_match = maintenance_matcher.extract_mileage_interval(row_text)
            time_match = maintenance_matcher.extract_time_interval(row_text)
            fluid_match = maintenance_matcher.extract_fluid_spec(row_text)
            results.append(
                ParsedScheduleRow(
                    service=row[0],
                    normalized_service=service_match.normalized_name,
                    subtypes=service_match.subtypes,
                    interval_miles=mileage_match.value if mileage_match else None,
                    interval_months=time_match.value if time_match else None,
                    details=None,
                    fluid_spec=fluid_match.value if fluid_match else None,
                    confidence=service_match.confidence * 0.8,  # Reduce for no-header parsing
                    raw_row=row,
                )
            )
        return results

    def _extract_miles(self, text: str) -> Optional[int]:
        """Extract mileage value from cell text.

        The pattern matcher is tried first; if it finds nothing, the first
        bare number in the text is used (see ``_parse_bare_mileage``).
        """
        match = maintenance_matcher.extract_mileage_interval(text)
        if match:
            return match.value
        return self._parse_bare_mileage(text)

    @classmethod
    def _parse_bare_mileage(cls, text: str) -> Optional[int]:
        """Read the first bare number in ``text`` as a mileage interval.

        Understands "5000", "5,000", "10 000" and K-notation such as "5K" or
        "7.5K". The K multiplier is applied only when it directly follows the
        number (optional whitespace allowed) and the number is below 1000, so
        a stray letter K elsewhere in the cell, or a "km" unit, does not
        inflate the value. Only the first number in the text is considered.

        NOTE(review): values are not converted between units; a number
        followed by "km" is returned as-is.

        Returns:
            The value if it lies within ``_MILES_BOUNDS`` (inclusive),
            otherwise None.
        """
        number = cls._BARE_MILEAGE_RE.search(text)
        if number is None:
            return None

        whole = number.group("whole").replace(",", "").replace(" ", "")
        value = int(whole)
        if number.group("kilo") and value < 1000:
            # Include the fractional part so "7.5K" becomes 7500, not 7000.
            value = round(float(whole + (number.group("frac") or "")) * 1000)

        low, high = cls._MILES_BOUNDS
        return value if low <= value <= high else None

    def _extract_months(self, text: str) -> Optional[int]:
        """Extract month interval from cell text.

        The pattern matcher is tried first; if it finds nothing, the first
        run of digits is accepted when it lies within ``_MONTHS_BOUNDS``
        (inclusive).

        NOTE(review): headers matching "years"/"yr" are also typed as
        "months" by COLUMN_PATTERNS, and this bare-number fallback does not
        convert years to months; confirm whether that case needs handling.
        """
        match = maintenance_matcher.extract_time_interval(text)
        if match:
            return match.value

        number_match = re.search(r"(\d+)", text)
        if number_match is None:
            return None
        value = int(number_match.group(1))
        low, high = self._MONTHS_BOUNDS
        return value if low <= value <= high else None
# Shared module-level instance, created once when this module is imported;
# presumably intended to be imported and used directly by callers.
table_parser = TableParser()