"""
Engine Specification Parser

Parses engine specifications from JSON vehicle data into structured
components. Handles displacement, configuration, cylinders, fuel type,
and aspiration.

CRITICAL REQUIREMENT: L-configuration normalization
- L3 → I3 (L-configuration treated as Inline)
- L4 → I4 (L-configuration treated as Inline)

Standard format: {displacement}L {config}{cylinders} {modifiers}

Examples:
- "2.0L I4" → 2.0L, Inline, 4-cylinder
- "1.5L L3 PLUG-IN HYBRID EV- (PHEV)" → 1.5L, Inline (normalized), 3-cyl, Plug-in Hybrid
- "2.4L H4" → 2.4L, Horizontal (Subaru Boxer), 4-cylinder

Usage:
    parser = EngineSpecParser()
    spec = parser.parse_engine_string("1.5L L3 PLUG-IN HYBRID EV- (PHEV)")
    # spec.configuration == "I" (normalized from L)
"""

import re
import logging
from typing import Optional, List, Pattern
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class EngineSpec:
    """Parsed engine specification"""

    displacement_l: Optional[float]  # Engine displacement in liters (None when unknown / electric)
    configuration: str  # I, V, H, W, Electric, Unknown
    cylinders: Optional[int]  # Number of cylinders (None when unknown / electric)
    fuel_type: str  # Gasoline, Hybrid variants, Electric, Flex Fuel, Unknown
    # Natural, Turbocharged, Supercharged; create_electric_motor() stores None
    # here, so the annotation must admit it.
    aspiration: Optional[str]
    raw_string: str  # Original engine string

    def __str__(self) -> str:
        return (
            f"EngineSpec({self.displacement_l}L {self.configuration}{self.cylinders}, "
            f"{self.fuel_type}, {self.aspiration})"
        )


class EngineSpecParser:
    """Parse engine specifications with L→I normalization"""

    def __init__(self):
        """Initialize engine parser with regex patterns"""
        # Primary engine pattern: {displacement}L {config}{cylinders}
        # Supports I, V, H, L, W configurations
        self.engine_pattern = re.compile(r'(\d+\.?\d*)L\s+([IVHLW])(\d+)', re.IGNORECASE)

        # Hybrid detection patterns (most specific first)
        self.hybrid_patterns = [
            (re.compile(r'PLUG-IN HYBRID EV-?\s*\(PHEV\)', re.IGNORECASE), 'Plug-in Hybrid'),
            (re.compile(r'FULL HYBRID EV-?\s*\(FHEV\)', re.IGNORECASE), 'Full Hybrid'),
            (re.compile(r'HYBRID', re.IGNORECASE), 'Hybrid'),
        ]

        # Other fuel type patterns
        self.fuel_patterns = [
            (re.compile(r'FLEX', re.IGNORECASE), 'Flex Fuel'),
            (re.compile(r'ELECTRIC', re.IGNORECASE), 'Electric'),
        ]

        # Aspiration patterns. "SC" is anchored on word boundaries so that it
        # only matches the standalone abbreviation, not the letters "SC"
        # embedded in an unrelated token.
        self.aspiration_patterns = [
            (re.compile(r'TURBO', re.IGNORECASE), 'Turbocharged'),
            (re.compile(r'SUPERCHARGED|\bSC\b', re.IGNORECASE), 'Supercharged'),
        ]

        logger.debug("EngineSpecParser initialized with regex patterns")

    def normalize_configuration(self, config: str) -> str:
        """
        CRITICAL: Convert L-configuration to I (Inline)

        L-configurations are alternate notation for Inline engines.
        W-configurations are W-type engines (VW Group, Bentley, etc.)

        Args:
            config: Configuration character (I, V, H, L, W)

        Returns:
            Upper-cased configuration (L becomes I, others unchanged)
        """
        config_upper = config.upper()
        if config_upper == 'L':
            logger.debug("Normalizing L-configuration to I (Inline)")
            return 'I'
        return config_upper

    def extract_fuel_type(self, engine_str: str) -> str:
        """
        Extract fuel type from engine string

        Priority order:
        1. Hybrid patterns (PHEV, FHEV, HYBRID)
        2. Other fuel types (FLEX, ELECTRIC)
        3. Default to Gasoline

        Args:
            engine_str: Original engine string

        Returns:
            Detected fuel type
        """
        # Hybrid patterns are listed first so they take precedence over the
        # generic fuel patterns; the first match wins.
        for pattern, fuel_type in (*self.hybrid_patterns, *self.fuel_patterns):
            if pattern.search(engine_str):
                logger.debug("Detected fuel type '%s' from '%s'", fuel_type, engine_str)
                return fuel_type

        # Default to gasoline
        return 'Gasoline'

    def extract_aspiration(self, engine_str: str) -> str:
        """
        Extract aspiration type from engine string

        Args:
            engine_str: Original engine string

        Returns:
            Detected aspiration type ('Natural' when no pattern matches)
        """
        for pattern, aspiration in self.aspiration_patterns:
            if pattern.search(engine_str):
                logger.debug("Detected aspiration '%s' from '%s'", aspiration, engine_str)
                return aspiration

        return 'Natural'  # Default to naturally aspirated

    def parse_engine_string(self, engine_str: str) -> EngineSpec:
        """
        Parse complete engine specification

        Args:
            engine_str: Engine specification string

        Returns:
            EngineSpec with parsed components, or a fallback spec (see
            create_fallback_engine) when the string is empty or does not
            start with the standard "{displacement}L {config}{cylinders}" form
        """
        if not engine_str or not engine_str.strip():
            logger.warning("Empty engine string provided")
            return self.create_fallback_engine("Empty Engine String")

        engine_str = engine_str.strip()

        # Try to match standard engine pattern (anchored at the string start)
        match = self.engine_pattern.match(engine_str)
        if not match:
            logger.warning("Could not parse engine string: '%s'", engine_str)
            return self.create_fallback_engine(engine_str)

        try:
            # Extract basic components
            displacement = float(match.group(1))
            raw_config = match.group(2)
            cylinders = int(match.group(3))

            # CRITICAL: Apply L→I normalization
            config = self.normalize_configuration(raw_config)

            # Extract fuel type and aspiration from modifiers
            fuel_type = self.extract_fuel_type(engine_str)
            aspiration = self.extract_aspiration(engine_str)

            # Log L→I normalization when it occurs
            if raw_config.upper() == 'L' and config == 'I':
                logger.info(
                    "L→I normalization applied: '%s' → %sL I%s",
                    engine_str, displacement, cylinders,
                )

            spec = EngineSpec(
                displacement_l=displacement,
                configuration=config,
                cylinders=cylinders,
                fuel_type=fuel_type,
                aspiration=aspiration,
                raw_string=engine_str,
            )

            logger.debug("Parsed '%s' → %s", engine_str, spec)
            return spec

        except (ValueError, IndexError) as e:
            logger.error("Failed to parse matched components from '%s': %s", engine_str, e)
            return self.create_fallback_engine(engine_str)

    def create_fallback_engine(self, raw_string: str) -> EngineSpec:
        """
        Create fallback engine spec for unparseable strings

        Args:
            raw_string: Original engine string that couldn't be parsed

        Returns:
            EngineSpec with unknown values but preserved raw string
        """
        logger.debug("Creating fallback engine for '%s'", raw_string)
        return EngineSpec(
            displacement_l=None,
            configuration="Unknown",
            cylinders=None,
            fuel_type="Unknown",
            aspiration="Natural",
            raw_string=raw_string,
        )

    def create_electric_motor(self) -> EngineSpec:
        """
        Create default electric motor spec for empty engines arrays

        Common for Tesla, Lucid, and other electric vehicles that have
        empty engines arrays in their JSON data.

        Returns:
            EngineSpec configured for electric motor
        """
        logger.debug("Creating default electric motor spec")
        return EngineSpec(
            displacement_l=None,  # N/A for electric
            configuration="Electric",  # Special designation
            cylinders=None,  # N/A for electric
            fuel_type="Electric",
            aspiration=None,  # N/A for electric
            raw_string="Electric Motor",
        )

    def parse_multiple_engines(self, engine_strings: List[str]) -> List[EngineSpec]:
        """
        Parse multiple engine specifications

        Args:
            engine_strings: List of engine specification strings

        Returns:
            List of parsed EngineSpec objects; a single default electric
            motor spec when the input is empty
        """
        if not engine_strings:
            # Handle empty engines array (common for electric vehicles)
            logger.info("Empty engines array detected - creating electric motor")
            return [self.create_electric_motor()]

        specs = [self.parse_engine_string(engine_str) for engine_str in engine_strings]

        logger.debug("Parsed %s engines from %s strings", len(specs), len(engine_strings))
        return specs

    def get_unique_engines(self, engine_specs: List[EngineSpec]) -> List[EngineSpec]:
        """
        Get unique engines based on key attributes

        Two specs are duplicates when displacement, configuration, cylinders,
        fuel type and aspiration all match; raw_string is ignored. The first
        occurrence is kept and input order is preserved.

        Args:
            engine_specs: List of engine specifications

        Returns:
            List of unique engine specifications
        """
        seen = set()
        unique_specs = []

        for spec in engine_specs:
            # Create key based on engine characteristics
            key = (
                spec.displacement_l,
                spec.configuration,
                spec.cylinders,
                spec.fuel_type,
                spec.aspiration,
            )

            if key not in seen:
                seen.add(key)
                unique_specs.append(spec)
            else:
                logger.debug("Skipping duplicate engine: %s", spec)

        logger.info(
            "Reduced %s engines to %s unique engines",
            len(engine_specs), len(unique_specs),
        )
        return unique_specs

    def validate_engine_spec(self, spec: EngineSpec) -> List[str]:
        """
        Validate engine specification for data quality issues

        Args:
            spec: Engine specification to validate

        Returns:
            List of validation warnings (empty if no issues)
        """
        warnings = []

        # Check displacement
        if spec.displacement_l is not None:
            if spec.displacement_l <= 0:
                warnings.append(f"Invalid displacement: {spec.displacement_l}")
            elif spec.displacement_l > 20:  # Unrealistic for production cars
                warnings.append(f"Unusually large displacement: {spec.displacement_l}L")

        # Check cylinders
        if spec.cylinders is not None:
            if spec.cylinders <= 0:
                warnings.append(f"Invalid cylinder count: {spec.cylinders}")
            elif spec.cylinders > 16:  # Very rare in production
                warnings.append(f"Unusually high cylinder count: {spec.cylinders}")

        # Check configuration consistency
        if spec.configuration == "Electric" and spec.displacement_l is not None:
            warnings.append("Electric motor should not have displacement")

        if spec.configuration not in ("I", "V", "H", "W", "Electric", "Unknown"):
            warnings.append(f"Unexpected configuration: {spec.configuration}")

        # Check fuel type consistency
        if spec.fuel_type == "Electric" and spec.configuration != "Electric":
            warnings.append("Electric fuel type should have Electric configuration")

        return warnings


# Example usage and testing functions
def example_usage():
    """Demonstrate EngineSpecParser usage"""
    print("🔧 EngineSpecParser Example Usage")
    print("=" * 40)

    parser = EngineSpecParser()

    # Test cases from actual JSON data
    test_engines = [
        # Standard engines
        "2.0L I4",
        "3.5L V6",
        # L→I normalization examples (CRITICAL)
        "1.5L L3",
        "1.2L L3 FULL HYBRID EV- (FHEV)",
        # Subaru Boxer engines
        "2.4L H4",
        # W-configuration engines (VW Group, Bentley)
        "6.0L W12",
        "4.0L W8",
        # Hybrid examples
        "2.5L I4 FULL HYBRID EV- (FHEV)",
        "1.5L L3 PLUG-IN HYBRID EV- (PHEV)",
        # Flex fuel
        "5.6L V8 FLEX",
        # Electric
        "1.8L I4 ELECTRIC",
    ]

    for engine_str in test_engines:
        spec = parser.parse_engine_string(engine_str)
        print(f"\nInput: \"{engine_str}\"")
        print(f" → {spec.displacement_l}L {spec.configuration}{spec.cylinders}")
        print(f" → Fuel: {spec.fuel_type}, Aspiration: {spec.aspiration}")

        # Highlight L→I normalization. Every engine string contains an "L"
        # (the litre suffix), so inspect the raw configuration letter rather
        # than testing for 'L' anywhere in the string.
        raw_match = parser.engine_pattern.match(engine_str.strip())
        if raw_match and raw_match.group(2).upper() == 'L' and spec.configuration == 'I':
            print(" 🎯 L→I NORMALIZED")

    # Test electric vehicle handling
    print("\n⚡ Electric Vehicle Handling:")
    electric_spec = parser.create_electric_motor()
    print(f" Default: {electric_spec.raw_string}")
    print(f" → Config: {electric_spec.configuration}, Fuel: {electric_spec.fuel_type}")


if __name__ == "__main__":
    example_usage()