"""Maintenance schedule pattern matching for owner's manual extraction.""" import re from dataclasses import dataclass from typing import Optional @dataclass class MileageIntervalMatch: """Result of mileage interval pattern matching.""" value: int # Miles raw_match: str confidence: float pattern_name: str @dataclass class TimeIntervalMatch: """Result of time interval pattern matching.""" value: int # Months raw_match: str confidence: float pattern_name: str @dataclass class FluidSpecMatch: """Result of fluid specification pattern matching.""" value: str # e.g., "0W-20", "ATF-Z1", "DOT 4" fluid_type: str # e.g., "oil", "transmission", "brake" raw_match: str confidence: float class MaintenancePatternMatcher: """Extract maintenance-specific data from owner's manual text.""" # Mileage interval patterns MILEAGE_PATTERNS = [ # "every 5,000 miles" or "every 5000 miles" ( r"every\s+([\d,]+)\s*(?:miles?|mi\.?)", "every_miles", 0.95, ), # "at 30,000 mi" or "at 30000 miles" ( r"at\s+([\d,]+)\s*(?:miles?|mi\.?)", "at_miles", 0.93, ), # "5,000 miles or" (interval before "or") ( r"([\d,]+)\s*(?:miles?|mi\.?)\s*(?:or|/)", "miles_or", 0.90, ), # "every 5,000-7,500 miles" (range - take lower) ( r"every\s+([\d,]+)\s*[-–]\s*[\d,]+\s*(?:miles?|mi\.?)", "miles_range", 0.88, ), # "7,500 mi/12 months" (interval with slash) ( r"([\d,]+)\s*(?:miles?|mi\.?)\s*/", "miles_slash", 0.87, ), # Standalone "X,XXX miles" in table context ( r"(? 12 months ( r"\bannually\b", "annually", 0.95, ), # "semi-annually" or "semi-annual" -> 6 months ( r"\bsemi-?annual(?:ly)?\b", "semi_annual", 0.95, ), # "every year" -> 12 months ( r"every\s+year", "every_year", 0.93, ), # "every 2 years" -> 24 months ( r"every\s+(\d+)\s*years?", "every_years", 0.93, ), # "12 mo/7,500 mi" or "12 months/" ( r"(\d+)\s*(?:mo(?:nths?)?\.?)\s*/", "months_slash", 0.87, ), # Standalone "X months" in table context ( r"(? Optional[MileageIntervalMatch]: """ Extract mileage interval from text. Args: text: Text to search for mileage intervals Returns: MileageIntervalMatch or None if no interval found """ text_lower = text.lower() for pattern, name, confidence in self.MILEAGE_PATTERNS: match = re.search(pattern, text_lower, re.IGNORECASE) if match: # Extract the number and remove commas mileage_str = match.group(1).replace(",", "") mileage = int(mileage_str) if self._is_reasonable_mileage(mileage): return MileageIntervalMatch( value=mileage, raw_match=match.group(0), confidence=confidence, pattern_name=name, ) return None def extract_time_interval(self, text: str) -> Optional[TimeIntervalMatch]: """ Extract time interval from text. Args: text: Text to search for time intervals Returns: TimeIntervalMatch or None if no interval found """ text_lower = text.lower() for pattern, name, confidence in self.TIME_PATTERNS: match = re.search(pattern, text_lower, re.IGNORECASE) if match: # Handle special cases if name == "annually": months = 12 elif name == "semi_annual": months = 6 elif name == "every_year": months = 12 elif name == "every_years": years = int(match.group(1)) months = years * 12 else: months = int(match.group(1)) if self._is_reasonable_months(months): return TimeIntervalMatch( value=months, raw_match=match.group(0), confidence=confidence, pattern_name=name, ) return None def extract_fluid_spec(self, text: str) -> Optional[FluidSpecMatch]: """ Extract fluid specification from text. Args: text: Text to search for fluid specs Returns: FluidSpecMatch or None if no spec found """ for pattern, fluid_type, confidence in self.FLUID_PATTERNS: match = re.search(pattern, text, re.IGNORECASE) if match: return FluidSpecMatch( value=match.group(1).upper() if fluid_type != "coolant" else match.group(1), fluid_type=fluid_type, raw_match=match.group(0), confidence=confidence, ) return None def extract_all_fluid_specs(self, text: str) -> list[FluidSpecMatch]: """ Extract all fluid specifications from text. Args: text: Text to search for fluid specs Returns: List of FluidSpecMatch objects """ results = [] seen_values: set[str] = set() for pattern, fluid_type, confidence in self.FLUID_PATTERNS: for match in re.finditer(pattern, text, re.IGNORECASE): value = match.group(1).upper() if fluid_type != "coolant" else match.group(1) if value not in seen_values: seen_values.add(value) results.append( FluidSpecMatch( value=value, fluid_type=fluid_type, raw_match=match.group(0), confidence=confidence, ) ) return results def extract_combined_interval( self, text: str ) -> tuple[Optional[MileageIntervalMatch], Optional[TimeIntervalMatch]]: """ Extract both mileage and time intervals from a combined pattern. Many schedules use patterns like "every 5,000 miles or 6 months". Args: text: Text to search Returns: Tuple of (mileage_match, time_match) """ mileage = self.extract_mileage_interval(text) time = self.extract_time_interval(text) return mileage, time def _is_reasonable_mileage(self, mileage: int) -> bool: """Check if mileage interval is reasonable for maintenance.""" # Typical ranges: 1,000 to 100,000 miles return 500 <= mileage <= 150000 def _is_reasonable_months(self, months: int) -> bool: """Check if month interval is reasonable for maintenance.""" # Typical ranges: 1 to 120 months (10 years) return 1 <= months <= 120 # Singleton instance maintenance_matcher = MaintenancePatternMatcher()