392 lines
14 KiB
Python
392 lines
14 KiB
Python
"""
Engine Specification Parser

Parses engine specifications from JSON vehicle data into structured components.
Handles displacement, configuration, cylinders, fuel type, and aspiration.

CRITICAL REQUIREMENT: L-configuration normalization
- L3 → I3 (L-configuration treated as Inline)
- L4 → I4 (L-configuration treated as Inline)

Standard format: {displacement}L {config}{cylinders} {modifiers}
Examples:
- "2.0L I4" → 2.0L, Inline, 4-cylinder
- "1.5L L3 PLUG-IN HYBRID EV- (PHEV)" → 1.5L, Inline (normalized), 3-cyl, Plug-in Hybrid
- "2.4L H4" → 2.4L, Horizontal (Subaru Boxer), 4-cylinder

Usage:
    parser = EngineSpecParser()
    spec = parser.parse_engine_string("1.5L L3 PLUG-IN HYBRID EV- (PHEV)")
    # spec.configuration == "I" (normalized from L)
"""
|
|
|
|
import re
|
|
import logging
|
|
from typing import Optional, List, Pattern
|
|
from dataclasses import dataclass
|
|
|
|
# Module-level logger named after this module; used by every function below.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class EngineSpec:
    """Parsed engine specification.

    Fields that do not apply to a given engine (for example displacement and
    cylinder count of an electric motor, or of an unparseable string) are
    stored as ``None``.
    """

    displacement_l: Optional[float]  # Engine displacement in liters
    configuration: str  # I, V, H, W, Electric, Unknown
    cylinders: Optional[int]  # Number of cylinders
    fuel_type: str  # Gasoline, Hybrid variants, Electric, Flex Fuel, Unknown
    aspiration: Optional[str]  # Natural, Turbocharged, Supercharged; None when N/A
    raw_string: str  # Original engine string

    def __str__(self) -> str:
        """Return a compact one-line summary of the spec.

        A fully populated spec renders as
        ``EngineSpec(2.0L I4, Gasoline, Natural)``. Missing displacement or
        cylinder values are left out instead of being printed as ``None``,
        and a missing aspiration is shown as ``N/A``.
        """
        displacement = "" if self.displacement_l is None else f"{self.displacement_l}L "
        cylinders = "" if self.cylinders is None else str(self.cylinders)
        aspiration = "N/A" if self.aspiration is None else self.aspiration
        return (
            f"EngineSpec({displacement}{self.configuration}{cylinders}, "
            f"{self.fuel_type}, {aspiration})"
        )
|
|
|
|
|
|
class EngineSpecParser:
    """Parse engine specifications with L→I normalization.

    Engine strings are expected in the form
    ``{displacement}L {config}{cylinders} {modifiers}``, for example
    ``"1.5L L3 PLUG-IN HYBRID EV- (PHEV)"``. The ``L`` configuration letter
    is an alternate notation for inline engines and is always reported as
    ``"I"``.
    """

    # Configuration values that validate_engine_spec() accepts without warning.
    _KNOWN_CONFIGURATIONS = ("I", "V", "H", "W", "Electric", "Unknown")

    def __init__(self):
        """Initialize engine parser with regex patterns."""
        # Primary engine pattern: {displacement}L {config}{cylinders}
        # Supports I, V, H, L, W configurations
        self.engine_pattern = re.compile(r'(\d+\.?\d*)L\s+([IVHLW])(\d+)', re.IGNORECASE)

        # Hybrid detection patterns (most specific first, generic HYBRID last)
        self.hybrid_patterns = [
            (re.compile(r'PLUG-IN HYBRID EV-?\s*\(PHEV\)', re.IGNORECASE), 'Plug-in Hybrid'),
            (re.compile(r'FULL HYBRID EV-?\s*\(FHEV\)', re.IGNORECASE), 'Full Hybrid'),
            (re.compile(r'HYBRID', re.IGNORECASE), 'Hybrid'),
        ]

        # Other fuel type patterns
        self.fuel_patterns = [
            (re.compile(r'FLEX', re.IGNORECASE), 'Flex Fuel'),
            (re.compile(r'ELECTRIC', re.IGNORECASE), 'Electric'),
        ]

        # Aspiration patterns. The "SC" abbreviation must not be surrounded by
        # other letters, otherwise any word containing "sc" (e.g. "DISC")
        # would be reported as supercharged.
        self.aspiration_patterns = [
            (re.compile(r'TURBO', re.IGNORECASE), 'Turbocharged'),
            (re.compile(r'SUPERCHARGED|(?<![A-Z])SC(?![A-Z])', re.IGNORECASE), 'Supercharged'),
        ]

        logger.debug("EngineSpecParser initialized with regex patterns")

    def normalize_configuration(self, config: str) -> str:
        """
        CRITICAL: Convert L-configuration to I (Inline).

        L-configurations are alternate notation for Inline engines.
        W-configurations are W-type engines and are left unchanged.

        Args:
            config: Configuration character (I, V, H, L, W), any case

        Returns:
            Upper-cased configuration (L becomes I, others unchanged)
        """
        config_upper = config.upper()

        if config_upper == 'L':
            logger.debug("Normalizing L-configuration to I (Inline)")
            return 'I'

        return config_upper

    def extract_fuel_type(self, engine_str: str) -> str:
        """
        Extract fuel type from engine string.

        Priority order:
        1. Hybrid patterns (PHEV, FHEV, HYBRID)
        2. Other fuel types (FLEX, ELECTRIC)
        3. Default to Gasoline

        Args:
            engine_str: Original engine string

        Returns:
            Detected fuel type
        """
        # Hybrid patterns are checked before the other fuel patterns; the
        # first pattern that matches anywhere in the string wins.
        for pattern, fuel_type in (*self.hybrid_patterns, *self.fuel_patterns):
            if pattern.search(engine_str):
                logger.debug("Detected fuel type '%s' from '%s'", fuel_type, engine_str)
                return fuel_type

        # Default to gasoline
        return 'Gasoline'

    def extract_aspiration(self, engine_str: str) -> str:
        """
        Extract aspiration type from engine string.

        Args:
            engine_str: Original engine string

        Returns:
            'Turbocharged' or 'Supercharged' when a matching modifier is
            present (turbo is checked first), otherwise 'Natural'
        """
        for pattern, aspiration in self.aspiration_patterns:
            if pattern.search(engine_str):
                logger.debug("Detected aspiration '%s' from '%s'", aspiration, engine_str)
                return aspiration

        return 'Natural'  # Default to naturally aspirated

    def parse_engine_string(self, engine_str: str) -> "EngineSpec":
        """
        Parse complete engine specification.

        Args:
            engine_str: Engine specification string

        Returns:
            EngineSpec with parsed components. Empty, non-string, or
            unparseable input yields the fallback spec from
            create_fallback_engine() instead of raising.
        """
        if not engine_str or (isinstance(engine_str, str) and not engine_str.strip()):
            logger.warning("Empty engine string provided")
            return self.create_fallback_engine("Empty Engine String")

        if not isinstance(engine_str, str):
            # Guard against non-string values (e.g. numbers in the input data)
            # so a single bad entry does not raise AttributeError on .strip().
            logger.warning("Non-string engine value provided: %r", engine_str)
            return self.create_fallback_engine(str(engine_str))

        engine_str = engine_str.strip()

        # Try to match standard engine pattern (anchored at the string start)
        match = self.engine_pattern.match(engine_str)

        if not match:
            logger.warning("Could not parse engine string: '%s'", engine_str)
            return self.create_fallback_engine(engine_str)

        try:
            # Extract basic components
            displacement = float(match.group(1))
            raw_config = match.group(2)
            cylinders = int(match.group(3))
        except (ValueError, IndexError) as e:
            logger.error("Failed to parse matched components from '%s': %s", engine_str, e)
            return self.create_fallback_engine(engine_str)

        # CRITICAL: Apply L→I normalization
        config = self.normalize_configuration(raw_config)

        # Extract fuel type and aspiration from modifiers
        fuel_type = self.extract_fuel_type(engine_str)
        aspiration = self.extract_aspiration(engine_str)

        # Log L→I normalization when it occurs
        if raw_config.upper() == 'L' and config == 'I':
            logger.info(
                "L→I normalization applied: '%s' → %sL I%s",
                engine_str, displacement, cylinders,
            )

        spec = EngineSpec(
            displacement_l=displacement,
            configuration=config,
            cylinders=cylinders,
            fuel_type=fuel_type,
            aspiration=aspiration,
            raw_string=engine_str,
        )

        logger.debug("Parsed '%s' → %s", engine_str, spec)
        return spec

    def create_fallback_engine(self, raw_string: str) -> "EngineSpec":
        """
        Create fallback engine spec for unparseable strings.

        Args:
            raw_string: Original engine string that couldn't be parsed

        Returns:
            EngineSpec with unknown values but preserved raw string
        """
        logger.debug("Creating fallback engine for '%s'", raw_string)

        return EngineSpec(
            displacement_l=None,
            configuration="Unknown",
            cylinders=None,
            fuel_type="Unknown",
            aspiration="Natural",
            raw_string=raw_string,
        )

    def create_electric_motor(self) -> "EngineSpec":
        """
        Create default electric motor spec for empty engines arrays.

        parse_multiple_engines() uses this when it receives no engine strings,
        which this module treats as an electric vehicle.

        Returns:
            EngineSpec configured for electric motor
        """
        logger.debug("Creating default electric motor spec")

        return EngineSpec(
            displacement_l=None,       # N/A for electric
            configuration="Electric",  # Special designation
            cylinders=None,            # N/A for electric
            fuel_type="Electric",
            aspiration=None,           # N/A for electric
            raw_string="Electric Motor",
        )

    def parse_multiple_engines(self, engine_strings: List[str]) -> List["EngineSpec"]:
        """
        Parse multiple engine specifications.

        Args:
            engine_strings: List of engine specification strings

        Returns:
            List of parsed EngineSpec objects, one per input string; a single
            electric motor spec when the input is empty or None
        """
        if not engine_strings:
            # Handle empty engines array (common for electric vehicles)
            logger.info("Empty engines array detected - creating electric motor")
            return [self.create_electric_motor()]

        specs = [self.parse_engine_string(engine_str) for engine_str in engine_strings]

        logger.debug("Parsed %s engines from %s strings", len(specs), len(engine_strings))
        return specs

    def get_unique_engines(self, engine_specs: List["EngineSpec"]) -> List["EngineSpec"]:
        """
        Get unique engines based on key attributes.

        Two specs are duplicates when displacement, configuration, cylinders,
        fuel type and aspiration all match; raw_string is ignored. The first
        occurrence is kept and input order is preserved.

        Args:
            engine_specs: List of engine specifications

        Returns:
            List of unique engine specifications
        """
        seen = set()
        unique_specs = []

        for spec in engine_specs:
            # Create key based on engine characteristics
            key = (
                spec.displacement_l,
                spec.configuration,
                spec.cylinders,
                spec.fuel_type,
                spec.aspiration,
            )

            if key in seen:
                logger.debug("Skipping duplicate engine: %s", spec)
                continue

            seen.add(key)
            unique_specs.append(spec)

        logger.info(
            "Reduced %s engines to %s unique engines",
            len(engine_specs), len(unique_specs),
        )
        return unique_specs

    def validate_engine_spec(self, spec: "EngineSpec") -> List[str]:
        """
        Validate engine specification for data quality issues.

        Args:
            spec: Engine specification to validate

        Returns:
            List of validation warnings (empty if no issues)
        """
        warnings = []

        # Check displacement
        if spec.displacement_l is not None:
            if spec.displacement_l <= 0:
                warnings.append(f"Invalid displacement: {spec.displacement_l}")
            elif spec.displacement_l > 20:  # Unrealistic for production cars
                warnings.append(f"Unusually large displacement: {spec.displacement_l}L")

        # Check cylinders
        if spec.cylinders is not None:
            if spec.cylinders <= 0:
                warnings.append(f"Invalid cylinder count: {spec.cylinders}")
            elif spec.cylinders > 16:  # Very rare in production
                warnings.append(f"Unusually high cylinder count: {spec.cylinders}")

        # Check configuration consistency
        if spec.configuration == "Electric" and spec.displacement_l is not None:
            warnings.append("Electric motor should not have displacement")

        if spec.configuration not in self._KNOWN_CONFIGURATIONS:
            warnings.append(f"Unexpected configuration: {spec.configuration}")

        # Check fuel type consistency
        if spec.fuel_type == "Electric" and spec.configuration != "Electric":
            warnings.append("Electric fuel type should have Electric configuration")

        return warnings
|
|
|
|
|
|
# Example usage and testing functions
def example_usage():
    """Demonstrate EngineSpecParser usage.

    Parses a fixed list of sample engine strings, prints the parsed
    components of each, flags the ones whose original configuration letter
    was L (normalized to I), and finally prints the default electric motor
    spec.
    """
    print("🔧 EngineSpecParser Example Usage")
    print("=" * 40)

    parser = EngineSpecParser()

    # Test cases from actual JSON data
    test_engines = [
        # Standard engines
        "2.0L I4",
        "3.5L V6",

        # L→I normalization examples (CRITICAL)
        "1.5L L3",
        "1.2L L3 FULL HYBRID EV- (FHEV)",

        # Subaru Boxer engines
        "2.4L H4",

        # W-configuration engines (VW Group, Bentley)
        "6.0L W12",
        "4.0L W8",

        # Hybrid examples
        "2.5L I4 FULL HYBRID EV- (FHEV)",
        "1.5L L3 PLUG-IN HYBRID EV- (PHEV)",

        # Flex fuel
        "5.6L V8 FLEX",

        # Electric
        "1.8L I4 ELECTRIC",
    ]

    for engine_str in test_engines:
        spec = parser.parse_engine_string(engine_str)

        print(f"\nInput: \"{engine_str}\"")
        print(f" → {spec.displacement_l}L {spec.configuration}{spec.cylinders}")
        print(f" → Fuel: {spec.fuel_type}, Aspiration: {spec.aspiration}")

        # Highlight L→I normalization. Every sample contains the displacement
        # "L", so a plain substring test would flag all inline engines; look
        # at the configuration letter the parser's own pattern captured.
        raw_match = parser.engine_pattern.match(engine_str.strip())
        was_l_config = raw_match is not None and raw_match.group(2).upper() == 'L'
        if was_l_config and spec.configuration == 'I':
            print(" 🎯 L→I NORMALIZED")

    # Test electric vehicle handling
    print("\n⚡ Electric Vehicle Handling:")
    electric_spec = parser.create_electric_motor()
    print(f" Default: {electric_spec.raw_string}")
    print(f" → Config: {electric_spec.configuration}, Fuel: {electric_spec.fuel_type}")
|
|
|
|
|
|
# Run the demonstration only when this file is executed as a script.
if __name__ == "__main__":
    example_usage()