Files
motovaultpro/mvp-platform-services/vehicles/etl/utils/engine_spec_parser.py
Eric Gullickson a052040e3a Initial Commit
2025-09-17 16:09:15 -05:00

392 lines
14 KiB
Python

"""
Engine Specification Parser
Parses engine specifications from JSON vehicle data into structured components.
Handles displacement, configuration, cylinders, fuel type, and aspiration.
CRITICAL REQUIREMENT: L-configuration normalization
- L3 → I3 (L-configuration treated as Inline)
- L4 → I4 (L-configuration treated as Inline)
Standard format: {displacement}L {config}{cylinders} {modifiers}
Examples:
- "2.0L I4" → 2.0L, Inline, 4-cylinder
- "1.5L L3 PLUG-IN HYBRID EV- (PHEV)" → 1.5L, Inline (normalized), 3-cyl, Plug-in Hybrid
- "2.4L H4" → 2.4L, Horizontal (Subaru Boxer), 4-cylinder
Usage:
parser = EngineSpecParser()
spec = parser.parse_engine_string("1.5L L3 PLUG-IN HYBRID EV- (PHEV)")
# spec.configuration == "I" (normalized from L)
"""
import re
import logging
from typing import Optional, List, Pattern
from dataclasses import dataclass
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
@dataclass
class EngineSpec:
    """Parsed engine specification.

    Numeric fields are ``None`` when they do not apply (electric motors) or
    could not be parsed (fallback specs).
    """

    # Engine displacement in liters; None when unknown or not applicable.
    displacement_l: Optional[float]
    # Layout code: I, V, H, W, or the special values "Electric" / "Unknown".
    configuration: str
    # Number of cylinders; None when unknown or not applicable.
    cylinders: Optional[int]
    # Gasoline, Hybrid variants, Electric, Flex Fuel, or "Unknown".
    fuel_type: str
    # Natural, Turbocharged, Supercharged; None for electric motors, which
    # are created with no aspiration value.
    aspiration: Optional[str]
    # Original engine string the spec was parsed from.
    raw_string: str

    def __str__(self) -> str:
        """Return a compact summary, omitting components that are missing.

        A fully populated spec renders as
        ``EngineSpec(2.0L I4, Gasoline, Natural)``. Missing displacement or
        cylinder count is left out rather than printed as the literal text
        ``None`` (which previously produced output like ``NoneL UnknownNone``),
        and a missing aspiration is shown as ``N/A``.
        """
        displacement = (
            f"{self.displacement_l}L " if self.displacement_l is not None else ""
        )
        cylinders = "" if self.cylinders is None else str(self.cylinders)
        aspiration = "N/A" if self.aspiration is None else self.aspiration
        return (
            f"EngineSpec({displacement}{self.configuration}{cylinders}, "
            f"{self.fuel_type}, {aspiration})"
        )
class EngineSpecParser:
    """Parse engine specification strings, normalizing L-configuration to I.

    The expected input format is ``{displacement}L {config}{cylinders}``
    optionally followed by modifiers, e.g. ``"1.5L L3 PLUG-IN HYBRID EV- (PHEV)"``.
    Strings that do not start with that pattern produce a fallback spec
    instead of raising.
    """

    # Configuration values that validate_engine_spec accepts without warning.
    _KNOWN_CONFIGURATIONS = ("I", "V", "H", "W", "Electric", "Unknown")

    def __init__(self):
        """Initialize the parser and compile its regex patterns."""
        # Primary engine pattern: {displacement}L {config}{cylinders}
        # Supports I, V, H, L, W configurations
        self.engine_pattern = re.compile(
            r'(\d+\.?\d*)L\s+([IVHLW])(\d+)', re.IGNORECASE
        )
        # Hybrid detection patterns (most specific first, so PHEV/FHEV win
        # over the generic HYBRID match)
        self.hybrid_patterns = [
            (re.compile(r'PLUG-IN HYBRID EV-?\s*\(PHEV\)', re.IGNORECASE), 'Plug-in Hybrid'),
            (re.compile(r'FULL HYBRID EV-?\s*\(FHEV\)', re.IGNORECASE), 'Full Hybrid'),
            (re.compile(r'HYBRID', re.IGNORECASE), 'Hybrid'),
        ]
        # Other fuel type patterns
        self.fuel_patterns = [
            (re.compile(r'FLEX', re.IGNORECASE), 'Flex Fuel'),
            (re.compile(r'ELECTRIC', re.IGNORECASE), 'Electric'),
        ]
        # Aspiration patterns. The "SC" abbreviation is anchored on word
        # boundaries: unanchored, it matched inside longer tokens such as
        # "SCR" or "DISC" and mislabelled those engines as Supercharged.
        self.aspiration_patterns = [
            (re.compile(r'TURBO', re.IGNORECASE), 'Turbocharged'),
            (re.compile(r'SUPERCHARGED|\bSC\b', re.IGNORECASE), 'Supercharged'),
        ]
        logger.debug("EngineSpecParser initialized with regex patterns")

    def normalize_configuration(self, config: str) -> str:
        """
        CRITICAL: Convert L-configuration to I (Inline)

        L-configurations are alternate notation for Inline engines.
        W-configurations are W-type engines (VW Group, Bentley, etc.)

        Args:
            config: Configuration character (I, V, H, L, W), any case

        Returns:
            Upper-cased configuration, with L replaced by I
        """
        config_upper = config.upper()
        if config_upper == 'L':
            logger.debug("Normalizing L-configuration to I (Inline)")
            return 'I'
        return config_upper

    def extract_fuel_type(self, engine_str: str) -> str:
        """
        Extract fuel type from engine string

        Priority order:
        1. Hybrid patterns (PHEV, FHEV, HYBRID)
        2. Other fuel types (FLEX, ELECTRIC)
        3. Default to Gasoline

        Args:
            engine_str: Original engine string

        Returns:
            Detected fuel type
        """
        # Hybrid patterns are checked before the other fuel patterns; the
        # first pattern that matches anywhere in the string wins.
        for pattern, fuel_type in (*self.hybrid_patterns, *self.fuel_patterns):
            if pattern.search(engine_str):
                logger.debug("Detected fuel type '%s' from '%s'", fuel_type, engine_str)
                return fuel_type
        # Default to gasoline
        return 'Gasoline'

    def extract_aspiration(self, engine_str: str) -> str:
        """
        Extract aspiration type from engine string

        Args:
            engine_str: Original engine string

        Returns:
            'Turbocharged' or 'Supercharged' when a pattern matches,
            otherwise 'Natural'
        """
        for pattern, aspiration in self.aspiration_patterns:
            if pattern.search(engine_str):
                logger.debug("Detected aspiration '%s' from '%s'", aspiration, engine_str)
                return aspiration
        return 'Natural'  # Default to naturally aspirated

    def parse_engine_string(self, engine_str: str) -> EngineSpec:
        """
        Parse complete engine specification

        Args:
            engine_str: Engine specification string

        Returns:
            EngineSpec with parsed components, or a fallback spec when the
            string is empty or does not start with the standard pattern
        """
        if not engine_str or not engine_str.strip():
            logger.warning("Empty engine string provided")
            return self.create_fallback_engine("Empty Engine String")

        engine_str = engine_str.strip()

        # The standard pattern must appear at the start of the string;
        # anything after it is treated as modifiers.
        match = self.engine_pattern.match(engine_str)
        if not match:
            logger.warning("Could not parse engine string: '%s'", engine_str)
            return self.create_fallback_engine(engine_str)

        try:
            # Extract basic components
            displacement = float(match.group(1))
            raw_config = match.group(2)
            cylinders = int(match.group(3))
        except (ValueError, IndexError) as e:
            logger.error(
                "Failed to parse matched components from '%s': %s", engine_str, e
            )
            return self.create_fallback_engine(engine_str)

        # CRITICAL: Apply L→I normalization
        config = self.normalize_configuration(raw_config)

        # Extract fuel type and aspiration from modifiers
        fuel_type = self.extract_fuel_type(engine_str)
        aspiration = self.extract_aspiration(engine_str)

        # Log L→I normalization when it occurs
        if raw_config.upper() == 'L' and config == 'I':
            logger.info(
                "L→I normalization applied: '%s' → %sL I%s",
                engine_str, displacement, cylinders,
            )

        spec = EngineSpec(
            displacement_l=displacement,
            configuration=config,
            cylinders=cylinders,
            fuel_type=fuel_type,
            aspiration=aspiration,
            raw_string=engine_str,
        )
        logger.debug("Parsed '%s' → %s", engine_str, spec)
        return spec

    def create_fallback_engine(self, raw_string: str) -> EngineSpec:
        """
        Create fallback engine spec for unparseable strings

        Args:
            raw_string: Original engine string that couldn't be parsed

        Returns:
            EngineSpec with unknown values but preserved raw string
        """
        logger.debug("Creating fallback engine for '%s'", raw_string)
        return EngineSpec(
            displacement_l=None,
            configuration="Unknown",
            cylinders=None,
            fuel_type="Unknown",
            aspiration="Natural",
            raw_string=raw_string,
        )

    def create_electric_motor(self) -> EngineSpec:
        """
        Create default electric motor spec for empty engines arrays

        Common for Tesla, Lucid, and other electric vehicles that have
        empty engines arrays in their JSON data.

        Returns:
            EngineSpec configured for electric motor. Displacement,
            cylinders and aspiration are all None (not applicable).
        """
        logger.debug("Creating default electric motor spec")
        return EngineSpec(
            displacement_l=None,       # N/A for electric
            configuration="Electric",  # Special designation
            cylinders=None,            # N/A for electric
            fuel_type="Electric",
            aspiration=None,           # N/A for electric
            raw_string="Electric Motor",
        )

    def parse_multiple_engines(self, engine_strings: List[str]) -> List[EngineSpec]:
        """
        Parse multiple engine specifications

        Args:
            engine_strings: List of engine specification strings

        Returns:
            List of parsed EngineSpec objects; a single electric motor spec
            when the input is empty
        """
        if not engine_strings:
            # Handle empty engines array (common for electric vehicles)
            logger.info("Empty engines array detected - creating electric motor")
            return [self.create_electric_motor()]

        specs = [self.parse_engine_string(engine_str) for engine_str in engine_strings]
        logger.debug("Parsed %s engines from %s strings", len(specs), len(engine_strings))
        return specs

    def get_unique_engines(self, engine_specs: List[EngineSpec]) -> List[EngineSpec]:
        """
        Get unique engines based on key attributes

        Two specs are duplicates when displacement, configuration, cylinders,
        fuel type and aspiration all match; raw_string is ignored. The first
        occurrence of each is kept, in input order.

        Args:
            engine_specs: List of engine specifications

        Returns:
            List of unique engine specifications
        """
        seen = set()
        unique_specs = []
        for spec in engine_specs:
            # Create key based on engine characteristics
            key = (
                spec.displacement_l,
                spec.configuration,
                spec.cylinders,
                spec.fuel_type,
                spec.aspiration,
            )
            if key in seen:
                logger.debug("Skipping duplicate engine: %s", spec)
                continue
            seen.add(key)
            unique_specs.append(spec)

        logger.info(
            "Reduced %s engines to %s unique engines",
            len(engine_specs), len(unique_specs),
        )
        return unique_specs

    def validate_engine_spec(self, spec: EngineSpec) -> List[str]:
        """
        Validate engine specification for data quality issues

        Args:
            spec: Engine specification to validate

        Returns:
            List of validation warnings (empty if no issues)
        """
        warnings = []

        # Check displacement
        if spec.displacement_l is not None:
            if spec.displacement_l <= 0:
                warnings.append(f"Invalid displacement: {spec.displacement_l}")
            elif spec.displacement_l > 20:  # Unrealistic for production cars
                warnings.append(f"Unusually large displacement: {spec.displacement_l}L")

        # Check cylinders
        if spec.cylinders is not None:
            if spec.cylinders <= 0:
                warnings.append(f"Invalid cylinder count: {spec.cylinders}")
            elif spec.cylinders > 16:  # Very rare in production
                warnings.append(f"Unusually high cylinder count: {spec.cylinders}")

        # Check configuration consistency
        if spec.configuration == "Electric" and spec.displacement_l is not None:
            warnings.append("Electric motor should not have displacement")
        if spec.configuration not in self._KNOWN_CONFIGURATIONS:
            warnings.append(f"Unexpected configuration: {spec.configuration}")

        # Check fuel type consistency
        if spec.fuel_type == "Electric" and spec.configuration != "Electric":
            warnings.append("Electric fuel type should have Electric configuration")

        return warnings
# Example usage and testing functions
def example_usage():
    """Demonstrate EngineSpecParser usage.

    Parses a fixed list of sample engine strings, prints the parsed
    components of each, flags the samples whose configuration was normalized
    from L to I, and finally prints the default electric motor spec.
    """
    print("🔧 EngineSpecParser Example Usage")
    print("=" * 40)
    parser = EngineSpecParser()

    # Test cases from actual JSON data
    test_engines = [
        # Standard engines
        "2.0L I4",
        "3.5L V6",
        # L→I normalization examples (CRITICAL)
        "1.5L L3",
        "1.2L L3 FULL HYBRID EV- (FHEV)",
        # Subaru Boxer engines
        "2.4L H4",
        # W-configuration engines (VW Group, Bentley)
        "6.0L W12",
        "4.0L W8",
        # Hybrid examples
        "2.5L I4 FULL HYBRID EV- (FHEV)",
        "1.5L L3 PLUG-IN HYBRID EV- (PHEV)",
        # Flex fuel
        "5.6L V8 FLEX",
        # Electric
        "1.8L I4 ELECTRIC",
    ]

    for engine_str in test_engines:
        spec = parser.parse_engine_string(engine_str)
        print(f"\nInput: \"{engine_str}\"")
        print(f"{spec.displacement_l}L {spec.configuration}{spec.cylinders}")
        print(f" → Fuel: {spec.fuel_type}, Aspiration: {spec.aspiration}")

        # Highlight L→I normalization. Inspect the raw configuration letter
        # itself: a plain "'L' in engine_str" test is true for every sample
        # because of the displacement suffix (e.g. "2.0L"), which wrongly
        # flagged ordinary I4 engines as normalized.
        raw_match = parser.engine_pattern.match(engine_str.strip())
        was_l_config = bool(raw_match) and raw_match.group(2).upper() == 'L'
        if was_l_config and spec.configuration == 'I':
            print(" 🎯 L→I NORMALIZED")

    # Test electric vehicle handling
    print("\n⚡ Electric Vehicle Handling:")
    electric_spec = parser.create_electric_motor()
    print(f" Default: {electric_spec.raw_string}")
    print(f" → Config: {electric_spec.configuration}, Fuel: {electric_spec.fuel_type}")
# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    example_usage()