Initial Commit
This commit is contained in:
3
mvp-platform-services/vehicles/etl/utils/__init__.py
Executable file
3
mvp-platform-services/vehicles/etl/utils/__init__.py
Executable file
@@ -0,0 +1,3 @@
|
||||
from .make_filter import MakeFilter
|
||||
|
||||
__all__ = ['MakeFilter']
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
mvp-platform-services/vehicles/etl/utils/__pycache__/logging.cpython-312.pyc
Executable file
BIN
mvp-platform-services/vehicles/etl/utils/__pycache__/logging.cpython-312.pyc
Executable file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
392
mvp-platform-services/vehicles/etl/utils/engine_spec_parser.py
Normal file
392
mvp-platform-services/vehicles/etl/utils/engine_spec_parser.py
Normal file
@@ -0,0 +1,392 @@
|
||||
"""
|
||||
Engine Specification Parser
|
||||
|
||||
Parses engine specifications from JSON vehicle data into structured components.
|
||||
Handles displacement, configuration, cylinders, fuel type, and aspiration.
|
||||
|
||||
CRITICAL REQUIREMENT: L-configuration normalization
|
||||
- L3 → I3 (L-configuration treated as Inline)
|
||||
- L4 → I4 (L-configuration treated as Inline)
|
||||
|
||||
Standard format: {displacement}L {config}{cylinders} {modifiers}
|
||||
Examples:
|
||||
- "2.0L I4" → 2.0L, Inline, 4-cylinder
|
||||
- "1.5L L3 PLUG-IN HYBRID EV- (PHEV)" → 1.5L, Inline (normalized), 3-cyl, Plug-in Hybrid
|
||||
- "2.4L H4" → 2.4L, Horizontal (Subaru Boxer), 4-cylinder
|
||||
|
||||
Usage:
|
||||
parser = EngineSpecParser()
|
||||
spec = parser.parse_engine_string("1.5L L3 PLUG-IN HYBRID EV- (PHEV)")
|
||||
# spec.configuration == "I" (normalized from L)
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
from typing import Optional, List, Pattern
|
||||
from dataclasses import dataclass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class EngineSpec:
    """Structured engine specification produced by EngineSpecParser."""

    displacement_l: Optional[float]  # engine displacement in liters (None for electric)
    configuration: str               # I, V, H, W, Electric, or Unknown
    cylinders: Optional[int]         # cylinder count (None for electric)
    fuel_type: str                   # Gasoline, Hybrid variants, Electric, Flex Fuel
    aspiration: str                  # Natural, Turbocharged, Supercharged
    raw_string: str                  # original, unparsed engine string

    def __str__(self) -> str:
        """Compact human-readable summary, e.g. 'EngineSpec(2.0L I4, Gasoline, Natural)'."""
        layout = f"{self.displacement_l}L {self.configuration}{self.cylinders}"
        extras = f"{self.fuel_type}, {self.aspiration}"
        return f"EngineSpec({layout}, {extras})"
|
||||
|
||||
|
||||
class EngineSpecParser:
    """Parse engine specification strings with L→I configuration normalization.

    Expected input format: "{displacement}L {config}{cylinders} {modifiers}",
    e.g. "2.0L I4" or "1.5L L3 PLUG-IN HYBRID EV- (PHEV)".

    CRITICAL REQUIREMENT: an "L" configuration is alternate notation for an
    inline engine and is normalized to "I" (L3 → I3, L4 → I4).
    """

    def __init__(self):
        """Initialize engine parser with pre-compiled regex patterns."""

        # Primary engine pattern: {displacement}L {config}{cylinders}.
        # Supports I, V, H, L, W configurations (L is normalized to I later).
        self.engine_pattern = re.compile(r'(\d+\.?\d*)L\s+([IVHLW])(\d+)', re.IGNORECASE)

        # Hybrid detection patterns, most specific first so that a PHEV/FHEV
        # string is not classified by the generic HYBRID pattern.
        self.hybrid_patterns = [
            (re.compile(r'PLUG-IN HYBRID EV-?\s*\(PHEV\)', re.IGNORECASE), 'Plug-in Hybrid'),
            (re.compile(r'FULL HYBRID EV-?\s*\(FHEV\)', re.IGNORECASE), 'Full Hybrid'),
            (re.compile(r'HYBRID', re.IGNORECASE), 'Hybrid'),
        ]

        # Other fuel type patterns (checked only when no hybrid pattern matched).
        self.fuel_patterns = [
            (re.compile(r'FLEX', re.IGNORECASE), 'Flex Fuel'),
            (re.compile(r'ELECTRIC', re.IGNORECASE), 'Electric'),
        ]

        # Aspiration patterns.
        # BUGFIX: the 'SC' alternative now requires word boundaries. The
        # original 'SUPERCHARGED|SC' matched the letters "SC" inside unrelated
        # words (e.g. "DISCOVERY"), mislabeling engines as supercharged.
        self.aspiration_patterns = [
            (re.compile(r'TURBO', re.IGNORECASE), 'Turbocharged'),
            (re.compile(r'SUPERCHARGED|\bSC\b', re.IGNORECASE), 'Supercharged'),
        ]

        logger.debug("EngineSpecParser initialized with regex patterns")

    def normalize_configuration(self, config: str) -> str:
        """
        CRITICAL: Convert L-configuration to I (Inline).

        L-configurations are alternate notation for Inline engines.
        W-configurations are W-type engines (VW Group, Bentley, etc.) and are
        left unchanged.

        Args:
            config: Configuration character (I, V, H, L, W), any case.

        Returns:
            Upper-cased configuration; 'L' becomes 'I', others unchanged.
        """
        config_upper = config.upper()

        if config_upper == 'L':
            logger.debug("Normalizing L-configuration to I (Inline)")
            return 'I'

        return config_upper

    def extract_fuel_type(self, engine_str: str) -> str:
        """
        Extract fuel type from engine string.

        Priority order:
        1. Hybrid patterns (PHEV, FHEV, HYBRID)
        2. Other fuel types (FLEX, ELECTRIC)
        3. Default to Gasoline

        Args:
            engine_str: Original engine string.

        Returns:
            Detected fuel type name.
        """
        # Hybrid patterns first: they are more specific and must win over
        # the generic fuel patterns.
        for pattern, fuel_type in self.hybrid_patterns:
            if pattern.search(engine_str):
                logger.debug(f"Detected fuel type '{fuel_type}' from '{engine_str}'")
                return fuel_type

        for pattern, fuel_type in self.fuel_patterns:
            if pattern.search(engine_str):
                logger.debug(f"Detected fuel type '{fuel_type}' from '{engine_str}'")
                return fuel_type

        # Nothing matched: assume a conventional gasoline engine.
        return 'Gasoline'

    def extract_aspiration(self, engine_str: str) -> str:
        """
        Extract aspiration type from engine string.

        Args:
            engine_str: Original engine string.

        Returns:
            'Turbocharged', 'Supercharged', or 'Natural' when no pattern matches.
        """
        for pattern, aspiration in self.aspiration_patterns:
            if pattern.search(engine_str):
                logger.debug(f"Detected aspiration '{aspiration}' from '{engine_str}'")
                return aspiration

        return 'Natural'  # Default to naturally aspirated

    def parse_engine_string(self, engine_str: str) -> EngineSpec:
        """
        Parse a complete engine specification.

        Args:
            engine_str: Engine specification string.

        Returns:
            EngineSpec with parsed components; a fallback spec (all-unknown)
            when the string is empty or does not match the expected format.
        """
        if not engine_str or not engine_str.strip():
            logger.warning("Empty engine string provided")
            return self.create_fallback_engine("Empty Engine String")

        engine_str = engine_str.strip()

        # Pattern is anchored at the start of the string (match, not search).
        match = self.engine_pattern.match(engine_str)

        if not match:
            logger.warning(f"Could not parse engine string: '{engine_str}'")
            return self.create_fallback_engine(engine_str)

        try:
            # Extract basic components.
            displacement = float(match.group(1))
            raw_config = match.group(2)
            cylinders = int(match.group(3))

            # CRITICAL: Apply L→I normalization.
            config = self.normalize_configuration(raw_config)

            # Fuel type and aspiration come from the trailing modifiers.
            fuel_type = self.extract_fuel_type(engine_str)
            aspiration = self.extract_aspiration(engine_str)

            # Log L→I normalization when it occurs.
            if raw_config.upper() == 'L' and config == 'I':
                logger.info(f"L→I normalization applied: '{engine_str}' → {displacement}L I{cylinders}")

            spec = EngineSpec(
                displacement_l=displacement,
                configuration=config,
                cylinders=cylinders,
                fuel_type=fuel_type,
                aspiration=aspiration,
                raw_string=engine_str
            )

            logger.debug(f"Parsed '{engine_str}' → {spec}")
            return spec

        except (ValueError, IndexError) as e:
            logger.error(f"Failed to parse matched components from '{engine_str}': {e}")
            return self.create_fallback_engine(engine_str)

    def create_fallback_engine(self, raw_string: str) -> EngineSpec:
        """
        Create fallback engine spec for unparseable strings.

        Args:
            raw_string: Original engine string that couldn't be parsed.

        Returns:
            EngineSpec with unknown values but the raw string preserved.
        """
        logger.debug(f"Creating fallback engine for '{raw_string}'")

        return EngineSpec(
            displacement_l=None,
            configuration="Unknown",
            cylinders=None,
            fuel_type="Unknown",
            aspiration="Natural",
            raw_string=raw_string
        )

    def create_electric_motor(self) -> EngineSpec:
        """
        Create a default electric motor spec for empty engines arrays.

        Common for Tesla, Lucid, and other electric vehicles whose JSON data
        has an empty engines array.

        Returns:
            EngineSpec configured for an electric motor.
        """
        logger.debug("Creating default electric motor spec")

        return EngineSpec(
            displacement_l=None,       # N/A for electric
            configuration="Electric",  # Special designation
            cylinders=None,            # N/A for electric
            fuel_type="Electric",
            # NOTE(review): None violates the `aspiration: str` annotation on
            # EngineSpec; kept as-is because callers may rely on it — confirm.
            aspiration=None,
            raw_string="Electric Motor"
        )

    def parse_multiple_engines(self, engine_strings: List[str]) -> List[EngineSpec]:
        """
        Parse multiple engine specifications.

        Args:
            engine_strings: List of engine specification strings.

        Returns:
            List of parsed EngineSpec objects; a single default electric
            motor when the input list is empty.
        """
        if not engine_strings:
            # Empty engines array is common for electric vehicles.
            logger.info("Empty engines array detected - creating electric motor")
            return [self.create_electric_motor()]

        specs = [self.parse_engine_string(engine_str) for engine_str in engine_strings]

        logger.debug(f"Parsed {len(specs)} engines from {len(engine_strings)} strings")
        return specs

    def get_unique_engines(self, engine_specs: List[EngineSpec]) -> List[EngineSpec]:
        """
        Get unique engines based on key attributes (first occurrence wins).

        Args:
            engine_specs: List of engine specifications.

        Returns:
            List of unique engine specifications, original order preserved.
        """
        seen = set()
        unique_specs = []

        for spec in engine_specs:
            # Identity is the parsed characteristics, not the raw string, so
            # differently-formatted duplicates collapse together.
            key = (
                spec.displacement_l,
                spec.configuration,
                spec.cylinders,
                spec.fuel_type,
                spec.aspiration
            )

            if key not in seen:
                seen.add(key)
                unique_specs.append(spec)
            else:
                logger.debug(f"Skipping duplicate engine: {spec}")

        logger.info(f"Reduced {len(engine_specs)} engines to {len(unique_specs)} unique engines")
        return unique_specs

    def validate_engine_spec(self, spec: EngineSpec) -> List[str]:
        """
        Validate engine specification for data quality issues.

        Args:
            spec: Engine specification to validate.

        Returns:
            List of validation warnings (empty if no issues).
        """
        warnings = []

        # Displacement sanity checks.
        if spec.displacement_l is not None:
            if spec.displacement_l <= 0:
                warnings.append(f"Invalid displacement: {spec.displacement_l}")
            elif spec.displacement_l > 20:  # Unrealistic for production cars
                warnings.append(f"Unusually large displacement: {spec.displacement_l}L")

        # Cylinder count sanity checks.
        if spec.cylinders is not None:
            if spec.cylinders <= 0:
                warnings.append(f"Invalid cylinder count: {spec.cylinders}")
            elif spec.cylinders > 16:  # Very rare in production
                warnings.append(f"Unusually high cylinder count: {spec.cylinders}")

        # Configuration consistency.
        if spec.configuration == "Electric" and spec.displacement_l is not None:
            warnings.append("Electric motor should not have displacement")

        if spec.configuration not in ["I", "V", "H", "W", "Electric", "Unknown"]:
            warnings.append(f"Unexpected configuration: {spec.configuration}")

        # Fuel type consistency.
        if spec.fuel_type == "Electric" and spec.configuration != "Electric":
            warnings.append("Electric fuel type should have Electric configuration")

        return warnings
|
||||
|
||||
|
||||
# Example usage and testing functions
|
||||
def example_usage():
    """Demonstrate EngineSpecParser usage on representative engine strings."""
    print("🔧 EngineSpecParser Example Usage")
    print("=" * 40)

    parser = EngineSpecParser()

    # Representative inputs taken from actual JSON vehicle data: standard
    # engines, L→I normalization cases (CRITICAL), Subaru boxers, W-type
    # engines (VW Group, Bentley), hybrids, flex fuel, and electric.
    test_engines = [
        "2.0L I4",
        "3.5L V6",
        "1.5L L3",
        "1.2L L3 FULL HYBRID EV- (FHEV)",
        "2.4L H4",
        "6.0L W12",
        "4.0L W8",
        "2.5L I4 FULL HYBRID EV- (FHEV)",
        "1.5L L3 PLUG-IN HYBRID EV- (PHEV)",
        "5.6L V8 FLEX",
        "1.8L I4 ELECTRIC",
    ]

    for raw in test_engines:
        spec = parser.parse_engine_string(raw)

        print(f"\nInput: \"{raw}\"")
        print(f"  → {spec.displacement_l}L {spec.configuration}{spec.cylinders}")
        print(f"  → Fuel: {spec.fuel_type}, Aspiration: {spec.aspiration}")

        # Call out inputs where the L-configuration was rewritten to Inline.
        was_normalized = (
            'L' in raw
            and spec.configuration == 'I'
            and 'ELECTRIC' not in raw.upper()
        )
        if was_normalized:
            print(f"  🎯 L→I NORMALIZED")

    # Show how an empty engines array becomes a default electric motor.
    print(f"\n⚡ Electric Vehicle Handling:")
    electric_spec = parser.create_electric_motor()
    print(f"  Default: {electric_spec.raw_string}")
    print(f"  → Config: {electric_spec.configuration}, Fuel: {electric_spec.fuel_type}")


if __name__ == "__main__":
    example_usage()
|
||||
28
mvp-platform-services/vehicles/etl/utils/logging.py
Executable file
28
mvp-platform-services/vehicles/etl/utils/logging.py
Executable file
@@ -0,0 +1,28 @@
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
def setup_logging(log_level: str = "INFO"):
    """Configure root logging for the ETL pipeline.

    Output goes both to stdout and to a date-stamped file under ./logs.

    Args:
        log_level: Root log level name (e.g. "DEBUG", "INFO").
    """
    # Make sure the destination directory exists before attaching a file handler.
    log_dir = Path("logs")
    log_dir.mkdir(exist_ok=True)

    log_file = log_dir / f"etl_{datetime.now().strftime('%Y%m%d')}.log"

    logging.basicConfig(
        level=getattr(logging, log_level.upper()),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler(log_file),
        ],
    )

    # Quiet down chatty third-party client libraries.
    for noisy in ("pymssql", "psycopg2", "redis"):
        logging.getLogger(noisy).setLevel(logging.WARNING)
|
||||
112
mvp-platform-services/vehicles/etl/utils/make_filter.py
Normal file
112
mvp-platform-services/vehicles/etl/utils/make_filter.py
Normal file
@@ -0,0 +1,112 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Make filtering service for ETL pipeline optimization.
|
||||
Filters processing to only allowed mainstream makes/brands.
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
from typing import List, Set
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class MakeFilter:
    """Service to filter ETL processing by allowed makes/brands."""

    def __init__(self, config_path: str = '/app/etl/sources/makes.json'):
        """
        Initialize make filter and eagerly load the configuration.

        Args:
            config_path: Path to makes.json configuration file.

        Raises:
            FileNotFoundError: if no candidate makes.json exists.
        """
        self.config_path = Path(config_path)
        self.allowed_makes: List[str] = []
        self.allowed_makes_set: Set[str] = set()
        self.allowed_makes_sql: str = ""
        self._load_makes()

    def _load_makes(self):
        """Load allowed makes from JSON configuration.

        Tries a sequence of candidate paths (container path first, then
        local-development fallbacks) and uses the first file that exists.
        """
        try:
            config_paths = [
                self.config_path,
                Path('/app/etl/sources/makes.json'),  # Primary container path - administrator controlled
                Path(__file__).parent.parent / 'sources' / 'makes.json',  # Local development
                Path(__file__).parent.parent.parent / 'makes.json'  # Fallback to main makes.json if needed
            ]

            config_data = None
            used_path = None

            for path in config_paths:
                if path.exists():
                    with open(path, 'r') as f:
                        config_data = json.load(f)
                    used_path = path
                    break

            # BUGFIX: test the sentinel, not truthiness — a valid but empty
            # JSON document ({}) is falsy and was misreported as "not found".
            if used_path is None:
                raise FileNotFoundError(f"Could not find makes.json in any of: {config_paths}")

            self.allowed_makes = config_data.get('manufacturers', [])
            self.allowed_makes_set = set(self.allowed_makes)
            self.allowed_makes_sql = self._build_sql_in_clause()

            logger.info(f"Loaded {len(self.allowed_makes)} allowed makes from {used_path}")
            logger.debug(f"Allowed makes: {', '.join(sorted(self.allowed_makes[:10]))}{'...' if len(self.allowed_makes) > 10 else ''}")

        except Exception as e:
            logger.error(f"Failed to load make configuration: {e}")
            raise

    def _build_sql_in_clause(self) -> str:
        """Build SQL IN clause value list for make filtering.

        Returns:
            Parenthesized value list, e.g. "('Ford','Honda')".
        """
        if not self.allowed_makes:
            # BUGFIX: "IN ()" is a syntax error on most SQL engines; "IN (NULL)"
            # is valid SQL and matches no rows, which is the intended behavior
            # for an empty allow-list.
            return "(NULL)"

        # Escape single quotes and build the IN clause.
        escaped_names = [name.replace("'", "''") for name in self.allowed_makes]
        return "(" + ",".join(f"'{name}'" for name in escaped_names) + ")"

    def is_make_allowed(self, make_name: str) -> bool:
        """
        Check if a make is in the allowed list.

        Args:
            make_name: Name to check (exact, case-sensitive match).

        Returns:
            True if make is allowed, False otherwise.
        """
        return make_name in self.allowed_makes_set

    def get_allowed_makes(self) -> List[str]:
        """Get a copy of the allowed makes list (callers may mutate freely)."""
        return self.allowed_makes.copy()

    def get_sql_filter(self, column_name: str = 'Name') -> str:
        """
        Get a SQL WHERE clause fragment for make filtering.

        Args:
            column_name: Name of the make column.

        Returns:
            SQL fragment like "Name IN ('Ford','Honda')".
        """
        return f"{column_name} IN {self.allowed_makes_sql}"

    def reload_configuration(self):
        """Reload makes configuration from file."""
        logger.info("Reloading make configuration...")
        self._load_makes()

    def get_filter_stats(self) -> dict:
        """Get filtering statistics for diagnostics."""
        return {
            'total_allowed_makes': len(self.allowed_makes),
            'config_path': str(self.config_path),
            'sql_clause_length': len(self.allowed_makes_sql)
        }
|
||||
317
mvp-platform-services/vehicles/etl/utils/make_name_mapper.py
Normal file
317
mvp-platform-services/vehicles/etl/utils/make_name_mapper.py
Normal file
@@ -0,0 +1,317 @@
|
||||
"""
|
||||
Make Name Mapper Utility
|
||||
|
||||
Converts JSON filenames to proper display names for database storage.
|
||||
Handles underscore-to-space conversion, title casing, and special capitalization cases.
|
||||
|
||||
Critical for converting:
|
||||
- alfa_romeo.json → "Alfa Romeo"
|
||||
- bmw.json → "BMW"
|
||||
- land_rover.json → "Land Rover"
|
||||
|
||||
Usage:
|
||||
mapper = MakeNameMapper()
|
||||
display_name = mapper.normalize_make_name('alfa_romeo.json') # Returns "Alfa Romeo"
|
||||
"""
|
||||
|
||||
import json
|
||||
import glob
|
||||
import os
|
||||
import logging
|
||||
from typing import Set, Dict, List, Optional
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ValidationReport:
    """Summary of make-name mapping validation results."""

    total_files: int                   # number of JSON files examined
    valid_mappings: int                # mappings found in the authoritative list
    mismatches: List[Dict[str, str]]   # details for names that failed validation

    @property
    def success_rate(self) -> float:
        """Fraction (0.0–1.0) of files whose mapped name validated."""
        if self.total_files <= 0:
            # Avoid division by zero for an empty report.
            return 0.0
        return self.valid_mappings / self.total_files
|
||||
|
||||
|
||||
class MakeNameMapper:
    """Convert JSON filenames to proper make display names.

    Handles underscore-to-space conversion, title casing, and special
    capitalization cases (e.g. bmw.json → "BMW", alfa_romeo.json → "Alfa Romeo").
    """

    def __init__(self, sources_dir: Optional[str] = None):
        """
        Initialize make name mapper.

        Args:
            sources_dir: Directory containing sources/makes.json for
                validation (defaults to "sources").
        """
        self.sources_dir = sources_dir or "sources"

        # Special capitalization cases that don't follow standard title case.
        self.special_cases = {
            'Bmw': 'BMW',        # Bayerische Motoren Werke
            'Gmc': 'GMC',        # General Motors Company
            'Mini': 'MINI',      # Brand styling requirement
            'Mclaren': 'McLaren',  # Scottish naming convention
        }

        # Load authoritative makes list for validation.
        self.authoritative_makes = self._load_authoritative_makes()

        logger.debug(f"MakeNameMapper initialized with {len(self.authoritative_makes)} authoritative makes")

    def _load_authoritative_makes(self) -> Set[str]:
        """Load the authoritative makes list from sources/makes.json.

        Falls back to a built-in list when the file is missing or unreadable.
        """
        makes_file = os.path.join(self.sources_dir, 'makes.json')

        try:
            if os.path.exists(makes_file):
                with open(makes_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                makes_set = set(data.get('manufacturers', []))
                logger.info(f"Loaded {len(makes_set)} authoritative makes from {makes_file}")
                return makes_set
            else:
                logger.warning(f"Authoritative makes file not found: {makes_file}")
                return self._get_fallback_makes()
        except Exception as e:
            logger.error(f"Failed to load authoritative makes from {makes_file}: {e}")
            return self._get_fallback_makes()

    def _get_fallback_makes(self) -> Set[str]:
        """Fallback authoritative makes list if the file is not available."""
        return {
            'Acura', 'Alfa Romeo', 'Aston Martin', 'Audi', 'BMW', 'Bentley',
            'Buick', 'Cadillac', 'Chevrolet', 'Chrysler', 'Dodge', 'Ferrari',
            'Fiat', 'Ford', 'Genesis', 'Geo', 'GMC', 'Honda', 'Hummer',
            'Hyundai', 'Infiniti', 'Isuzu', 'Jaguar', 'Jeep', 'Kia',
            'Lamborghini', 'Land Rover', 'Lexus', 'Lincoln', 'Lotus', 'Lucid',
            'MINI', 'Maserati', 'Mazda', 'McLaren', 'Mercury', 'Mitsubishi',
            'Nissan', 'Oldsmobile', 'Plymouth', 'Polestar', 'Pontiac',
            'Porsche', 'Ram', 'Rivian', 'Rolls Royce', 'Saab', 'Saturn',
            'Scion', 'Smart', 'Subaru', 'Tesla', 'Toyota', 'Volkswagen',
            'Volvo'
        }

    def normalize_make_name(self, filename: str) -> str:
        """
        Convert a filename to a proper display name.

        Args:
            filename: JSON filename (e.g., 'alfa_romeo.json').

        Returns:
            Normalized display name (e.g., 'Alfa Romeo'), or "Unknown" on error.
        """
        try:
            # BUGFIX: strip only a trailing '.json' extension. The original
            # replace('.json', '') removed the substring anywhere in the name.
            if filename.endswith('.json'):
                base_name = filename[:-len('.json')]
            else:
                base_name = filename

            if not base_name:
                # BUGFIX: log the actual filename instead of a hard-coded
                # '(unknown)' placeholder.
                logger.warning(f"Empty base name after removing .json from '{filename}'")
                return "Unknown"

            # Underscores → spaces, then title case.
            spaced_name = base_name.replace('_', ' ')
            title_cased = spaced_name.title()

            # Apply special capitalization cases (BMW, GMC, MINI, McLaren).
            normalized = self.special_cases.get(title_cased, title_cased)

            logger.debug(f"Normalized '{filename}' → '{normalized}'")
            return normalized

        except Exception as e:
            logger.error(f"Failed to normalize make name '{filename}': {e}")
            return "Unknown"

    def validate_mapping(self, filename: str, display_name: str) -> bool:
        """
        Validate a mapped name against the authoritative list.

        Args:
            filename: Original JSON filename (used for logging only).
            display_name: Normalized display name.

        Returns:
            True if display name is in the authoritative list.
        """
        is_valid = display_name in self.authoritative_makes

        if not is_valid:
            # BUGFIX: include the real filename in the warning (was '(unknown)').
            logger.warning(f"Make '{display_name}' from '{filename}' not found in authoritative list")

        return is_valid

    def get_all_mappings(self, json_files_dir: str) -> Dict[str, str]:
        """
        Get complete filename → display name mapping for all JSON files.

        Args:
            json_files_dir: Directory containing make JSON files.

        Returns:
            Dictionary mapping filenames to display names (empty on error).
        """
        mappings = {}

        try:
            pattern = os.path.join(json_files_dir, '*.json')
            json_files = glob.glob(pattern)

            logger.info(f"Found {len(json_files)} JSON files in {json_files_dir}")

            for file_path in json_files:
                filename = os.path.basename(file_path)
                mappings[filename] = self.normalize_make_name(filename)

            return mappings

        except Exception as e:
            logger.error(f"Failed to get all mappings from {json_files_dir}: {e}")
            return {}

    def validate_all_mappings(self, json_files_dir: str) -> ValidationReport:
        """
        Validate all mappings against the authoritative list.

        Args:
            json_files_dir: Directory containing make JSON files.

        Returns:
            ValidationReport with per-file results.
        """
        mappings = self.get_all_mappings(json_files_dir)
        mismatches = []

        for filename, display_name in mappings.items():
            if not self.validate_mapping(filename, display_name):
                mismatches.append({
                    'filename': filename,
                    'mapped_name': display_name,
                    'status': 'NOT_FOUND_IN_AUTHORITATIVE'
                })

        report = ValidationReport(
            total_files=len(mappings),
            valid_mappings=len(mappings) - len(mismatches),
            mismatches=mismatches
        )

        logger.info(f"Validation complete: {report.valid_mappings}/{report.total_files} valid ({report.success_rate:.1%})")

        return report

    def get_filename_for_display_name(self, display_name: str) -> Optional[str]:
        """
        Reverse lookup: get the JSON filename for a display name.

        Args:
            display_name: Make display name (e.g., 'Alfa Romeo').

        Returns:
            Constructed JSON filename (e.g., 'alfa_romeo.json'). Note: the
            filename is always constructed; existence on disk is not checked.
        """
        # Invert the special-case table so e.g. 'BMW' maps back to 'Bmw'.
        reverse_special_cases = {v: k for k, v in self.special_cases.items()}

        if display_name in reverse_special_cases:
            # Special case: BMW → bmw, McLaren → mclaren, etc.
            base_name = reverse_special_cases[display_name].lower()
        else:
            # Standard case: lowercase, spaces to underscores.
            base_name = display_name.lower().replace(' ', '_')

        filename = f"{base_name}.json"

        # BUGFIX: log the computed filename instead of '(unknown)'.
        logger.debug(f"Reverse lookup: '{display_name}' → '{filename}'")
        return filename

    def print_validation_report(self, report: ValidationReport) -> None:
        """
        Print a formatted validation report to stdout.

        Args:
            report: ValidationReport to display.
        """
        print(f"📋 Make Name Validation Report")
        print(f"=" * 35)
        print(f"Total files: {report.total_files}")
        print(f"Valid mappings: {report.valid_mappings}")
        print(f"Success rate: {report.success_rate:.1%}")

        if report.mismatches:
            print(f"\n⚠️ Mismatches ({len(report.mismatches)}):")
            for mismatch in report.mismatches:
                print(f" {mismatch['filename']} → {mismatch['mapped_name']}")
                print(f" Status: {mismatch['status']}")
        else:
            print(f"\n🎉 All mappings are valid!")

    def get_make_statistics(self, json_files_dir: str) -> Dict[str, int]:
        """
        Get statistics about make name transformations.

        Args:
            json_files_dir: Directory containing make JSON files.

        Returns:
            Dictionary with counts of single-word, multi-word, and
            special-case names, plus the total.
        """
        mappings = self.get_all_mappings(json_files_dir)

        single_words = 0
        multi_words = 0
        special_cases = 0

        for display_name in mappings.values():
            if display_name in self.special_cases.values():
                special_cases += 1
            elif ' ' in display_name:
                multi_words += 1
            else:
                single_words += 1

        return {
            'total': len(mappings),
            'single_words': single_words,
            'multi_words': multi_words,
            'special_cases': special_cases
        }
|
||||
|
||||
|
||||
# Example usage and testing functions
|
||||
def example_usage():
    """Demonstrate MakeNameMapper usage"""
    print("🏷️ MakeNameMapper Example Usage")
    print("=" * 35)

    mapper = MakeNameMapper()

    # Filenames covering plain, multi-word, and special-case capitalization.
    sample_files = [
        'toyota.json',
        'alfa_romeo.json',
        'bmw.json',
        'land_rover.json',
        'mclaren.json',
    ]

    for filename in sample_files:
        display_name = mapper.normalize_make_name(filename)
        valid = mapper.validate_mapping(filename, display_name)
        marker = "✅" if valid else "⚠️"

        print(f"{marker} {filename:20} → {display_name}")


if __name__ == "__main__":
    example_usage()
|
||||
Reference in New Issue
Block a user