"""Parse maintenance schedule tables into structured data.""" import logging import re from dataclasses import dataclass, field from typing import Optional from app.patterns.maintenance_patterns import maintenance_matcher from app.patterns.service_mapping import service_mapper logger = logging.getLogger(__name__) @dataclass class ParsedScheduleRow: """A parsed maintenance schedule row.""" service: str normalized_service: Optional[str] subtypes: list[str] interval_miles: Optional[int] interval_months: Optional[int] details: Optional[str] fluid_spec: Optional[str] confidence: float raw_row: list[str] = field(default_factory=list) class TableParser: """Parse detected tables into maintenance schedules. Handles various table formats: - Service | Miles | Months | Notes - Service | Interval | Description - Miles/Months header with service rows """ # Common column header patterns COLUMN_PATTERNS = { "service": [ r"service", r"item", r"maintenance", r"operation", r"component", r"part", r"system", r"description", ], "miles": [ r"miles?", r"mi\.?", r"mileage", r"odometer", r"km", r"kilometers?", ], "months": [ r"months?", r"mo\.?", r"time", r"interval", r"years?", r"yr\.?", ], "details": [ r"notes?", r"details?", r"remarks?", r"comments?", r"specification", r"specs?", r"procedure", ], } def parse_table( self, header_row: list[str], data_rows: list[list[str]], ) -> list[ParsedScheduleRow]: """ Parse a maintenance table into structured schedule rows. Args: header_row: Table header cells data_rows: Table data rows Returns: List of ParsedScheduleRow objects """ # Identify column types column_types = self._identify_columns(header_row) if not column_types: logger.warning("Could not identify table columns") return self._parse_without_headers(data_rows) results = [] for row in data_rows: parsed = self._parse_row(row, column_types) if parsed: results.append(parsed) return results def parse_text_block(self, text: str) -> list[ParsedScheduleRow]: """ Parse maintenance schedules from unstructured text. Useful when table detection fails but text contains schedule info. Args: text: Text block that may contain maintenance schedules Returns: List of ParsedScheduleRow objects """ results = [] lines = text.split("\n") for line in lines: # Look for lines with service + interval pattern service_match = service_mapper.map_service(line) mileage_match = maintenance_matcher.extract_mileage_interval(line) time_match = maintenance_matcher.extract_time_interval(line) if service_match and (mileage_match or time_match): # Extract fluid spec if present fluid_match = maintenance_matcher.extract_fluid_spec(line) results.append( ParsedScheduleRow( service=line.strip(), normalized_service=service_match.normalized_name, subtypes=service_match.subtypes, interval_miles=mileage_match.value if mileage_match else None, interval_months=time_match.value if time_match else None, details=None, fluid_spec=fluid_match.value if fluid_match else None, confidence=min( service_match.confidence, mileage_match.confidence if mileage_match else 1.0, time_match.confidence if time_match else 1.0, ), raw_row=[line], ) ) return results def _identify_columns( self, header_row: list[str] ) -> dict[int, str]: """ Identify column types from header row. Args: header_row: Table header cells Returns: Dict mapping column index to type """ column_types: dict[int, str] = {} for i, header in enumerate(header_row): header_lower = header.lower().strip() for col_type, patterns in self.COLUMN_PATTERNS.items(): for pattern in patterns: if re.search(pattern, header_lower, re.IGNORECASE): column_types[i] = col_type break if i in column_types: break # If no service column found, assume first column if "service" not in column_types.values() and header_row: for i, header in enumerate(header_row): if i not in column_types: column_types[i] = "service" break return column_types def _parse_row( self, row: list[str], column_types: dict[int, str], ) -> Optional[ParsedScheduleRow]: """ Parse a single data row using identified column types. Args: row: Table row cells column_types: Column index to type mapping Returns: ParsedScheduleRow or None """ service = "" interval_miles: Optional[int] = None interval_months: Optional[int] = None details: Optional[str] = None fluid_spec: Optional[str] = None # Extract values based on column types for i, cell in enumerate(row): cell_value = cell.strip() if not cell_value: continue col_type = column_types.get(i) if col_type == "service": service = cell_value elif col_type == "miles": miles = self._extract_miles(cell_value) if miles: interval_miles = miles elif col_type == "months": months = self._extract_months(cell_value) if months: interval_months = months elif col_type == "details": details = cell_value # Also check for fluid specs in details fluid_match = maintenance_matcher.extract_fluid_spec(cell_value) if fluid_match: fluid_spec = fluid_match.value # If no explicit miles/months columns, try to extract from service text if not interval_miles and not interval_months: mileage_match = maintenance_matcher.extract_mileage_interval(service) time_match = maintenance_matcher.extract_time_interval(service) if mileage_match: interval_miles = mileage_match.value if time_match: interval_months = time_match.value # Check for intervals in any cell if not interval_miles: for cell in row: mileage_match = maintenance_matcher.extract_mileage_interval(cell) if mileage_match: interval_miles = mileage_match.value break if not interval_months: for cell in row: time_match = maintenance_matcher.extract_time_interval(cell) if time_match: interval_months = time_match.value break # Skip if no service identified if not service: return None # Map service to normalized name and subtypes service_match = service_mapper.map_service(service) normalized_service = service_match.normalized_name if service_match else None subtypes = service_match.subtypes if service_match else [] service_confidence = service_match.confidence if service_match else 0.5 # Calculate overall confidence interval_confidence = 0.0 if interval_miles: interval_confidence = max(interval_confidence, 0.8) if interval_months: interval_confidence = max(interval_confidence, 0.8) confidence = (service_confidence + interval_confidence) / 2 if interval_confidence else service_confidence * 0.7 return ParsedScheduleRow( service=service, normalized_service=normalized_service, subtypes=subtypes, interval_miles=interval_miles, interval_months=interval_months, details=details, fluid_spec=fluid_spec, confidence=confidence, raw_row=row, ) def _parse_without_headers( self, data_rows: list[list[str]] ) -> list[ParsedScheduleRow]: """ Parse table without clear headers by analyzing content. Args: data_rows: Table rows Returns: List of ParsedScheduleRow """ results = [] for row in data_rows: if not row: continue # Join all cells and try to extract info row_text = " ".join(row) service_match = service_mapper.map_service(row_text) mileage_match = maintenance_matcher.extract_mileage_interval(row_text) time_match = maintenance_matcher.extract_time_interval(row_text) fluid_match = maintenance_matcher.extract_fluid_spec(row_text) if service_match: results.append( ParsedScheduleRow( service=row[0] if row else row_text, normalized_service=service_match.normalized_name, subtypes=service_match.subtypes, interval_miles=mileage_match.value if mileage_match else None, interval_months=time_match.value if time_match else None, details=None, fluid_spec=fluid_match.value if fluid_match else None, confidence=service_match.confidence * 0.8, # Reduce for no-header parsing raw_row=row, ) ) return results def _extract_miles(self, text: str) -> Optional[int]: """Extract mileage value from cell text.""" # First try pattern matcher match = maintenance_matcher.extract_mileage_interval(text) if match: return match.value # Try simple number extraction # Look for patterns like "5,000", "5000", "5K" number_match = re.search(r"([\d,]+)(?:K)?", text.replace(" ", ""), re.IGNORECASE) if number_match: num_str = number_match.group(1).replace(",", "") try: value = int(num_str) # Handle "5K" notation if "K" in text.upper() and value < 1000: value *= 1000 if 500 <= value <= 150000: return value except ValueError: pass return None def _extract_months(self, text: str) -> Optional[int]: """Extract month interval from cell text.""" # First try pattern matcher match = maintenance_matcher.extract_time_interval(text) if match: return match.value # Try simple number extraction number_match = re.search(r"(\d+)", text) if number_match: try: value = int(number_match.group(1)) if 1 <= value <= 120: return value except ValueError: pass return None # Singleton instance table_parser = TableParser()