Initial Commit
This commit is contained in:
392
mvp-platform-services/vehicles/etl/utils/engine_spec_parser.py
Normal file
392
mvp-platform-services/vehicles/etl/utils/engine_spec_parser.py
Normal file
@@ -0,0 +1,392 @@
|
||||
"""
|
||||
Engine Specification Parser

Parses engine specifications from JSON vehicle data into structured components.
Handles displacement, configuration, cylinders, fuel type, and aspiration.

CRITICAL REQUIREMENT: L-configuration normalization
- L3 → I3 (L-configuration treated as Inline)
- L4 → I4 (L-configuration treated as Inline)

Standard format: {displacement}L {config}{cylinders} {modifiers}
Examples:
- "2.0L I4" → 2.0L, Inline, 4-cylinder
- "1.5L L3 PLUG-IN HYBRID EV- (PHEV)" → 1.5L, Inline (normalized), 3-cyl, Plug-in Hybrid
- "2.4L H4" → 2.4L, Horizontal (Subaru Boxer), 4-cylinder

Usage:
    parser = EngineSpecParser()
    spec = parser.parse_engine_string("1.5L L3 PLUG-IN HYBRID EV- (PHEV)")
    # spec.configuration == "I" (normalized from L)
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
from typing import Optional, List, Pattern
|
||||
from dataclasses import dataclass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class EngineSpec:
    """Parsed engine specification.

    Numeric fields are ``None`` when they do not apply or could not be
    parsed (electric motors and fallback specs). ``aspiration`` is ``None``
    for the default electric motor spec.
    """
    displacement_l: Optional[float]  # Engine displacement in liters
    configuration: str               # I, V, H, W, Electric, Unknown
    cylinders: Optional[int]         # Number of cylinders
    fuel_type: str                   # Gasoline, Hybrid variants, Electric, Flex Fuel, Unknown
    aspiration: Optional[str]        # Natural, Turbocharged, Supercharged (None for electric)
    raw_string: str                  # Original engine string

    def __str__(self) -> str:
        """Return a compact summary, omitting fields that are ``None``.

        Examples:
            ``EngineSpec(2.0L I4, Gasoline, Natural)``
            ``EngineSpec(Electric, Electric, N/A)``
        """
        layout = self.configuration
        if self.cylinders is not None:
            layout = f"{layout}{self.cylinders}"
        if self.displacement_l is not None:
            layout = f"{self.displacement_l}L {layout}"

        # Avoid printing the literal "None" for specs without an aspiration.
        aspiration = "N/A" if self.aspiration is None else self.aspiration
        return f"EngineSpec({layout}, {self.fuel_type}, {aspiration})"
|
||||
|
||||
|
||||
class EngineSpecParser:
    """Parse engine specification strings with L→I normalization.

    Input strings are expected to start with
    ``{displacement}L {config}{cylinders}`` (for example ``"2.0L I4"``).
    The whole string is additionally scanned for fuel-type and aspiration
    keywords.
    """

    def __init__(self):
        """Initialize engine parser with compiled regex patterns."""
        # Primary engine pattern: {displacement}L {config}{cylinders}
        # Supports I, V, H, L, W configurations
        self.engine_pattern = re.compile(r'(\d+\.?\d*)L\s+([IVHLW])(\d+)', re.IGNORECASE)

        # Hybrid detection patterns (most specific first)
        self.hybrid_patterns = [
            (re.compile(r'PLUG-IN HYBRID EV-?\s*\(PHEV\)', re.IGNORECASE), 'Plug-in Hybrid'),
            (re.compile(r'FULL HYBRID EV-?\s*\(FHEV\)', re.IGNORECASE), 'Full Hybrid'),
            (re.compile(r'HYBRID', re.IGNORECASE), 'Hybrid'),
        ]

        # Other fuel type patterns
        self.fuel_patterns = [
            (re.compile(r'FLEX', re.IGNORECASE), 'Flex Fuel'),
            (re.compile(r'ELECTRIC', re.IGNORECASE), 'Electric'),
        ]

        # Aspiration patterns. The "SC" abbreviation must be a standalone
        # token: a bare case-insensitive "SC" would also match inside
        # unrelated words (e.g. "DISCONTINUED") and misreport the engine
        # as supercharged.
        self.aspiration_patterns = [
            (re.compile(r'TURBO', re.IGNORECASE), 'Turbocharged'),
            (re.compile(r'SUPERCHARGED|\bSC\b', re.IGNORECASE), 'Supercharged'),
        ]

        logger.debug("EngineSpecParser initialized with regex patterns")

    def normalize_configuration(self, config: str) -> str:
        """
        CRITICAL: Convert L-configuration to I (Inline)

        L-configurations are alternate notation for Inline engines.
        W-configurations are W-type engines (VW Group, Bentley, etc.)

        Args:
            config: Configuration character (I, V, H, L, W)

        Returns:
            Upper-cased configuration (L becomes I, others unchanged)
        """
        config_upper = config.upper()

        if config_upper == 'L':
            logger.debug("Normalizing L-configuration to I (Inline)")
            return 'I'

        return config_upper

    def extract_fuel_type(self, engine_str: str) -> str:
        """
        Extract fuel type from engine string

        Priority order:
        1. Hybrid patterns (PHEV, FHEV, HYBRID)
        2. Other fuel types (FLEX, ELECTRIC)
        3. Default to Gasoline

        Args:
            engine_str: Original engine string

        Returns:
            Detected fuel type
        """
        # Hybrid patterns are checked first (most specific), then the rest.
        for patterns in (self.hybrid_patterns, self.fuel_patterns):
            for pattern, fuel_type in patterns:
                if pattern.search(engine_str):
                    logger.debug("Detected fuel type '%s' from '%s'", fuel_type, engine_str)
                    return fuel_type

        # Default to gasoline
        return 'Gasoline'

    def extract_aspiration(self, engine_str: str) -> str:
        """
        Extract aspiration type from engine string

        Args:
            engine_str: Original engine string

        Returns:
            Detected aspiration type ('Natural' when no keyword matches)
        """
        for pattern, aspiration in self.aspiration_patterns:
            if pattern.search(engine_str):
                logger.debug("Detected aspiration '%s' from '%s'", aspiration, engine_str)
                return aspiration

        return 'Natural'  # Default to naturally aspirated

    def parse_engine_string(self, engine_str: str) -> EngineSpec:
        """
        Parse complete engine specification

        Args:
            engine_str: Engine specification string

        Returns:
            EngineSpec with parsed components, or a fallback spec when the
            string is empty or does not start with the standard pattern
        """
        if not engine_str or not engine_str.strip():
            logger.warning("Empty engine string provided")
            return self.create_fallback_engine("Empty Engine String")

        engine_str = engine_str.strip()

        # The standard pattern is anchored at the start of the string
        match = self.engine_pattern.match(engine_str)

        if not match:
            logger.warning("Could not parse engine string: '%s'", engine_str)
            return self.create_fallback_engine(engine_str)

        try:
            # Extract basic components
            displacement = float(match.group(1))
            raw_config = match.group(2)
            cylinders = int(match.group(3))
        except (ValueError, IndexError) as e:
            logger.error("Failed to parse matched components from '%s': %s", engine_str, e)
            return self.create_fallback_engine(engine_str)

        # CRITICAL: Apply L→I normalization
        config = self.normalize_configuration(raw_config)

        # Extract fuel type and aspiration from modifiers
        fuel_type = self.extract_fuel_type(engine_str)
        aspiration = self.extract_aspiration(engine_str)

        # Log L→I normalization when it occurs
        if raw_config.upper() == 'L' and config == 'I':
            logger.info(
                "L→I normalization applied: '%s' → %sL I%s",
                engine_str, displacement, cylinders,
            )

        spec = EngineSpec(
            displacement_l=displacement,
            configuration=config,
            cylinders=cylinders,
            fuel_type=fuel_type,
            aspiration=aspiration,
            raw_string=engine_str
        )

        logger.debug("Parsed '%s' → %s", engine_str, spec)
        return spec

    def create_fallback_engine(self, raw_string: str) -> EngineSpec:
        """
        Create fallback engine spec for unparseable strings

        Args:
            raw_string: Original engine string that couldn't be parsed

        Returns:
            EngineSpec with unknown values but preserved raw string
        """
        logger.debug("Creating fallback engine for '%s'", raw_string)

        return EngineSpec(
            displacement_l=None,
            configuration="Unknown",
            cylinders=None,
            fuel_type="Unknown",
            aspiration="Natural",
            raw_string=raw_string
        )

    def create_electric_motor(self) -> EngineSpec:
        """
        Create default electric motor spec for empty engines arrays

        Common for Tesla, Lucid, and other electric vehicles that have
        empty engines arrays in their JSON data.

        Returns:
            EngineSpec configured for electric motor
        """
        logger.debug("Creating default electric motor spec")

        return EngineSpec(
            displacement_l=None,       # N/A for electric
            configuration="Electric",  # Special designation
            cylinders=None,            # N/A for electric
            fuel_type="Electric",
            aspiration=None,           # N/A for electric
            raw_string="Electric Motor"
        )

    def parse_multiple_engines(self, engine_strings: List[str]) -> List[EngineSpec]:
        """
        Parse multiple engine specifications

        Args:
            engine_strings: List of engine specification strings

        Returns:
            List of parsed EngineSpec objects; a single default electric
            motor spec when the input is empty
        """
        if not engine_strings:
            # Handle empty engines array (common for electric vehicles)
            logger.info("Empty engines array detected - creating electric motor")
            return [self.create_electric_motor()]

        specs = [self.parse_engine_string(engine_str) for engine_str in engine_strings]

        logger.debug("Parsed %s engines from %s strings", len(specs), len(engine_strings))
        return specs

    def get_unique_engines(self, engine_specs: List[EngineSpec]) -> List[EngineSpec]:
        """
        Get unique engines based on key attributes

        Two specs are duplicates when displacement, configuration,
        cylinders, fuel type and aspiration all match; ``raw_string`` is
        ignored. The first occurrence is kept and input order is preserved.

        Args:
            engine_specs: List of engine specifications

        Returns:
            List of unique engine specifications
        """
        seen = set()
        unique_specs = []

        for spec in engine_specs:
            # Create key based on engine characteristics
            key = (
                spec.displacement_l,
                spec.configuration,
                spec.cylinders,
                spec.fuel_type,
                spec.aspiration,
            )

            if key not in seen:
                seen.add(key)
                unique_specs.append(spec)
            else:
                logger.debug("Skipping duplicate engine: %s", spec)

        logger.info(
            "Reduced %s engines to %s unique engines",
            len(engine_specs), len(unique_specs),
        )
        return unique_specs

    def validate_engine_spec(self, spec: EngineSpec) -> List[str]:
        """
        Validate engine specification for data quality issues

        Args:
            spec: Engine specification to validate

        Returns:
            List of validation warnings (empty if no issues)
        """
        warnings = []

        # Check displacement
        if spec.displacement_l is not None:
            if spec.displacement_l <= 0:
                warnings.append(f"Invalid displacement: {spec.displacement_l}")
            elif spec.displacement_l > 20:  # Unrealistic for production cars
                warnings.append(f"Unusually large displacement: {spec.displacement_l}L")

        # Check cylinders
        if spec.cylinders is not None:
            if spec.cylinders <= 0:
                warnings.append(f"Invalid cylinder count: {spec.cylinders}")
            elif spec.cylinders > 16:  # Very rare in production
                warnings.append(f"Unusually high cylinder count: {spec.cylinders}")

        # Check configuration consistency
        if spec.configuration == "Electric" and spec.displacement_l is not None:
            warnings.append("Electric motor should not have displacement")

        if spec.configuration not in ["I", "V", "H", "W", "Electric", "Unknown"]:
            warnings.append(f"Unexpected configuration: {spec.configuration}")

        # Check fuel type consistency
        if spec.fuel_type == "Electric" and spec.configuration != "Electric":
            warnings.append("Electric fuel type should have Electric configuration")

        return warnings
|
||||
|
||||
|
||||
# Example usage and testing functions
|
||||
def example_usage():
    """Demonstrate EngineSpecParser usage.

    Parses a fixed list of sample engine strings, prints the parsed
    components of each, and flags the entries whose L-configuration was
    normalized to I. Finally prints the default electric motor spec.
    """
    print("🔧 EngineSpecParser Example Usage")
    print("=" * 40)

    parser = EngineSpecParser()

    # Test cases from actual JSON data
    test_engines = [
        # Standard engines
        "2.0L I4",
        "3.5L V6",

        # L→I normalization examples (CRITICAL)
        "1.5L L3",
        "1.2L L3 FULL HYBRID EV- (FHEV)",

        # Subaru Boxer engines
        "2.4L H4",

        # W-configuration engines (VW Group, Bentley)
        "6.0L W12",
        "4.0L W8",

        # Hybrid examples
        "2.5L I4 FULL HYBRID EV- (FHEV)",
        "1.5L L3 PLUG-IN HYBRID EV- (PHEV)",

        # Flex fuel
        "5.6L V8 FLEX",

        # Electric
        "1.8L I4 ELECTRIC",
    ]

    # Matches only inputs whose configuration letter is L. A plain
    # "'L' in engine_str" test is always true because every displacement
    # ends in "L" (e.g. "2.0L I4"), which would flag ordinary inline engines.
    l_config_pattern = re.compile(r'\d+\.?\d*L\s+L\d', re.IGNORECASE)

    for engine_str in test_engines:
        spec = parser.parse_engine_string(engine_str)

        print(f"\nInput: \"{engine_str}\"")
        print(f" → {spec.displacement_l}L {spec.configuration}{spec.cylinders}")
        print(f" → Fuel: {spec.fuel_type}, Aspiration: {spec.aspiration}")

        # Highlight L→I normalization
        if l_config_pattern.match(engine_str) and spec.configuration == 'I':
            print(" 🎯 L→I NORMALIZED")

    # Test electric vehicle handling
    print("\n⚡ Electric Vehicle Handling:")
    electric_spec = parser.create_electric_motor()
    print(f" Default: {electric_spec.raw_string}")
    print(f" → Config: {electric_spec.configuration}, Fuel: {electric_spec.fuel_type}")
|
||||
|
||||
|
||||
# Run the demonstration only when this module is executed directly as a
# script; importing the module has no such side effect.
if __name__ == "__main__":
    example_usage()
|
||||
Reference in New Issue
Block a user