Initial Commit
This commit is contained in:
@@ -0,0 +1,314 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Engine Specification Parsing Examples
|
||||
|
||||
This file contains comprehensive examples of engine parsing patterns
|
||||
found in the JSON vehicle data, demonstrating the L→I normalization
|
||||
and hybrid/electric detection requirements.
|
||||
|
||||
Usage:
|
||||
python engine-parsing-examples.py
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, List
|
||||
|
||||
|
||||
@dataclass
class EngineSpec:
    """Parsed engine specification.

    One record per engine string. Fields that cannot be determined (for
    example on the fallback record or the default electric motor) are None.
    """

    # Displacement in litres taken from the "<number>L" prefix; None if unknown.
    displacement_l: Optional[float]
    # "I", "V" or "H" for parsed engines ("L" is normalized to "I"),
    # "Electric" for the default electric motor, "Unknown" for fallbacks.
    configuration: str
    # Cylinder count from the "<letter><count>" token; None if unknown.
    cylinders: Optional[int]
    # Gasoline, Hybrid, Full Hybrid, Plug-in Hybrid, Flex Fuel, Electric, Unknown
    fuel_type: str
    # Natural, Turbocharged, Supercharged; None for the default electric motor.
    aspiration: Optional[str]
    # The engine string exactly as it was supplied.
    raw_string: str
|
||||
|
||||
|
||||
class EngineSpecParser:
    """Engine specification parser with L→I normalization.

    Turns strings such as ``"2.0L I4"`` or
    ``"1.5L L3 PLUG-IN HYBRID EV- (PHEV)"`` into ``EngineSpec`` records.
    The layout letter "L" is always normalized to "I".
    """

    def __init__(self):
        # Primary pattern: {displacement}L {config}{cylinders}
        self.engine_pattern = re.compile(r'(\d+\.?\d*)L\s+([IVHL])(\d+)')

        # Hybrid patterns
        self.hybrid_patterns = [
            re.compile(r'PLUG-IN HYBRID EV-?\s*\(PHEV\)', re.IGNORECASE),
            re.compile(r'FULL HYBRID EV-?\s*\(FHEV\)', re.IGNORECASE),
            re.compile(r'HYBRID', re.IGNORECASE),
        ]

        # Other fuel type patterns (checked in order, after the hybrid ones)
        self.fuel_patterns = [
            (re.compile(r'FLEX', re.IGNORECASE), 'Flex Fuel'),
            (re.compile(r'ELECTRIC', re.IGNORECASE), 'Electric'),
        ]

        # Aspiration patterns. The "SC" abbreviation must be a standalone
        # token: a bare "SC" alternative would also match inside unrelated
        # words (e.g. "DISCOVERY") and wrongly report a supercharger.
        self.aspiration_patterns = [
            (re.compile(r'TURBO', re.IGNORECASE), 'Turbocharged'),
            (re.compile(r'SUPERCHARGED|\bSC\b', re.IGNORECASE), 'Supercharged'),
        ]

    def normalize_configuration(self, config: str) -> str:
        """CRITICAL: Convert L to I (L-configuration becomes Inline).

        Any other configuration letter is returned unchanged.
        """
        return 'I' if config == 'L' else config

    def extract_fuel_type(self, engine_str: str) -> str:
        """Extract fuel type from engine string.

        Returns 'Plug-in Hybrid', 'Full Hybrid' or 'Hybrid' when any hybrid
        pattern matches, otherwise the first matching entry of
        ``fuel_patterns``, otherwise 'Gasoline'.
        """
        # Check hybrid patterns first (most specific)
        if any(pattern.search(engine_str) for pattern in self.hybrid_patterns):
            upper = engine_str.upper()
            if 'PLUG-IN' in upper:
                return 'Plug-in Hybrid'
            if 'FULL' in upper:
                return 'Full Hybrid'
            return 'Hybrid'

        # Check other fuel types
        for pattern, fuel_type in self.fuel_patterns:
            if pattern.search(engine_str):
                return fuel_type

        return 'Gasoline'  # Default

    def extract_aspiration(self, engine_str: str) -> str:
        """Extract aspiration from engine string.

        Returns 'Turbocharged' or 'Supercharged' for the first matching
        aspiration pattern, otherwise 'Natural'.
        """
        for pattern, aspiration in self.aspiration_patterns:
            if pattern.search(engine_str):
                return aspiration
        return 'Natural'  # Default

    def parse_engine_string(self, engine_str: str) -> "EngineSpec":
        """Parse complete engine specification.

        The primary pattern is matched at the start of ``engine_str``. When
        it does not match, the fallback record from
        ``create_fallback_engine`` is returned instead of raising.
        """
        match = self.engine_pattern.match(engine_str)

        if not match:
            # Handle unparseable engines
            return self.create_fallback_engine(engine_str)

        displacement = float(match.group(1))
        config = self.normalize_configuration(match.group(2))  # L→I here!
        cylinders = int(match.group(3))

        fuel_type = self.extract_fuel_type(engine_str)
        aspiration = self.extract_aspiration(engine_str)

        return EngineSpec(
            displacement_l=displacement,
            configuration=config,
            cylinders=cylinders,
            fuel_type=fuel_type,
            aspiration=aspiration,
            raw_string=engine_str,
        )

    def create_fallback_engine(self, raw_string: str) -> "EngineSpec":
        """Create fallback for unparseable engines.

        Only ``raw_string`` is preserved; configuration and fuel type are
        "Unknown", aspiration is "Natural", numeric fields are None.
        """
        return EngineSpec(
            displacement_l=None,
            configuration="Unknown",
            cylinders=None,
            fuel_type="Unknown",
            aspiration="Natural",
            raw_string=raw_string,
        )

    def create_electric_motor(self) -> "EngineSpec":
        """Create default electric motor for empty engines arrays.

        Note that ``aspiration`` is None on this record.
        """
        return EngineSpec(
            displacement_l=None,
            configuration="Electric",
            cylinders=None,
            fuel_type="Electric",
            aspiration=None,
            raw_string="Electric Motor",
        )
|
||||
|
||||
|
||||
def demonstrate_engine_parsing():
    """Demonstrate engine parsing with real examples from JSON files.

    Parses every sample string with ``EngineSpecParser`` and prints the
    resulting fields, then prints the default electric-motor record.
    """
    parser = EngineSpecParser()

    # Test cases from actual JSON data
    test_engines = [
        # Standard engines
        "2.0L I4",
        "3.5L V6",
        "5.6L V8",

        # L→I normalization examples (CRITICAL)
        "1.5L L3",
        "2.0L L4",
        "1.2L L3 FULL HYBRID EV- (FHEV)",

        # Subaru Boxer engines
        "2.4L H4",
        "2.0L H4",

        # Hybrid examples from Nissan
        "2.5L I4 FULL HYBRID EV- (FHEV)",
        "1.5L L3 PLUG-IN HYBRID EV- (PHEV)",

        # Flex fuel examples
        "5.6L V8 FLEX",
        "4.0L V6 FLEX",

        # Electric examples
        "1.8L I4 ELECTRIC",

        # Unparseable examples (should create fallback)
        "Custom Hybrid System",
        "V12 Twin-Turbo Custom",
        "V10 Plus",
    ]

    print("🔧 Engine Specification Parsing Examples")
    print("=" * 50)

    for engine_str in test_engines:
        spec = parser.parse_engine_string(engine_str)

        # Fallback records have no displacement; avoid printing "NoneL".
        if spec.displacement_l is None:
            displacement = "Unknown"
        else:
            displacement = f"{spec.displacement_l}L"

        print(f"\nInput: \"{engine_str}\"")
        print(f" Displacement: {displacement}")
        print(f" Configuration: {spec.configuration}")
        print(f" Cylinders: {spec.cylinders}")
        print(f" Fuel Type: {spec.fuel_type}")
        print(f" Aspiration: {spec.aspiration}")

        # Highlight L→I normalization. Every parsed string contains an "L"
        # in its displacement ("2.0L"), so look at the layout token itself:
        # a configuration of "I" means the pattern matched, hence the second
        # whitespace-separated token is the "<letter><cylinders>" token.
        if spec.configuration == 'I' and engine_str.split()[1].startswith('L'):
            print(f" 🎯 L→I NORMALIZED: L{spec.cylinders} became I{spec.cylinders}")

    # Demonstrate electric vehicle handling
    print(f"\n\n⚡ Electric Vehicle Default Engine:")
    electric_spec = parser.create_electric_motor()
    print(f" Name: {electric_spec.raw_string}")
    print(f" Configuration: {electric_spec.configuration}")
    print(f" Fuel Type: {electric_spec.fuel_type}")
|
||||
|
||||
|
||||
def demonstrate_l_to_i_normalization():
    """Print before/after layout letters for engines written with "L".

    Each sample is parsed with ``EngineSpecParser``; the layout letter found
    in the input string is shown next to the parsed configuration.
    """
    parser = EngineSpecParser()

    for banner in (
        "\n\n🎯 L→I Configuration Normalization",
        "=" * 40,
        "CRITICAL REQUIREMENT: All L-configurations must become I (Inline)",
    ):
        print(banner)

    l_configuration_examples = (
        "1.5L L3",
        "2.0L L4",
        "1.2L L3 FULL HYBRID EV- (FHEV)",
        "1.5L L3 PLUG-IN HYBRID EV- (PHEV)",
    )

    for engine_str in l_configuration_examples:
        # Second token is "<letter><cylinders>"; its first character is the
        # layout letter as written in the input.
        layout_token = engine_str.split()[1]
        original_config = layout_token[0]
        spec = parser.parse_engine_string(engine_str)

        print(f"\nOriginal: \"{engine_str}\"")
        print(f" Input Configuration: {original_config}{spec.cylinders}")
        print(f" Output Configuration: {spec.configuration}{spec.cylinders}")
        print(f" ✅ Normalized: {original_config}→{spec.configuration}")
|
||||
|
||||
|
||||
def demonstrate_database_storage():
    """Show how parsed engines map to database records.

    Prints illustrative ``VALUES`` tuples for a few parsed engines and for
    the default electric motor. The SQL text is for display only; it is
    built by plain string formatting and is not escaped.
    """
    parser = EngineSpecParser()

    print("\n\n💾 Database Storage Examples")
    print("=" * 35)
    print("SQL: INSERT INTO vehicles.engine (name, code, displacement_l, cylinders, fuel_type, aspiration)")

    examples = [
        "2.0L I4",
        "1.5L L3 PLUG-IN HYBRID EV- (PHEV)",  # L→I case
        "2.4L H4",  # Subaru Boxer
        "5.6L V8 FLEX",
    ]

    for engine_str in examples:
        spec = parser.parse_engine_string(engine_str)

        # Format as SQL INSERT values
        sql_values = (
            f"('{spec.raw_string}', NULL, {spec.displacement_l}, "
            f"{spec.cylinders}, '{spec.fuel_type}', '{spec.aspiration}')"
        )

        print(f"\nEngine: \"{engine_str}\"")
        print(f" SQL: VALUES {sql_values}")

        # The displacement suffix ("2.0L") always contains an "L", so test
        # the layout token instead. A configuration of "I" means the pattern
        # matched, so the second whitespace-separated token exists.
        if spec.configuration == 'I' and engine_str.split()[1].startswith('L'):
            print(f" 🎯 Note: L{spec.cylinders} normalized to I{spec.cylinders}")

    # Electric motor example
    electric_spec = parser.create_electric_motor()
    sql_values = (
        f"('{electric_spec.raw_string}', NULL, NULL, "
        f"NULL, '{electric_spec.fuel_type}', NULL)"
    )
    print(f"\nElectric Vehicle:")
    print(f" SQL: VALUES {sql_values}")
|
||||
|
||||
|
||||
def run_validation_tests():
    """Assert that sample engines parse to the expected values.

    Covers L→I normalization (configuration and cylinder count) and hybrid
    fuel-type detection. Raises AssertionError on the first mismatch.
    """
    parser = EngineSpecParser()

    print("\n\n✅ Validation Tests")
    print("=" * 20)

    # (engine string, expected configuration, expected cylinder count)
    normalization_cases = (
        ("1.5L L3", "I", 3),
        ("2.0L L4", "I", 4),
        ("1.2L L3 FULL HYBRID EV- (FHEV)", "I", 3),
    )
    for engine_str, expected_config, expected_cylinders in normalization_cases:
        spec = parser.parse_engine_string(engine_str)

        assert spec.configuration == expected_config, (
            f"Expected {expected_config}, got {spec.configuration}"
        )
        assert spec.cylinders == expected_cylinders, (
            f"Expected {expected_cylinders} cylinders, got {spec.cylinders}"
        )

        print(f"✅ {engine_str} → {spec.configuration}{spec.cylinders}")

    # (engine string, expected fuel type)
    hybrid_cases = (
        ("2.5L I4 FULL HYBRID EV- (FHEV)", "Full Hybrid"),
        ("1.5L L3 PLUG-IN HYBRID EV- (PHEV)", "Plug-in Hybrid"),
    )
    for engine_str, expected_fuel_type in hybrid_cases:
        spec = parser.parse_engine_string(engine_str)
        assert spec.fuel_type == expected_fuel_type, (
            f"Expected {expected_fuel_type}, got {spec.fuel_type}"
        )
        print(f"✅ {engine_str} → {spec.fuel_type}")

    print("\n🎉 All validation tests passed!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run each demonstration in order, then print the closing summary.
    for demo in (
        demonstrate_engine_parsing,
        demonstrate_l_to_i_normalization,
        demonstrate_database_storage,
        run_validation_tests,
    ):
        demo()

    print("\n\n📋 Summary")
    print("=" * 10)
    for summary_line in (
        "✅ Engine parsing patterns implemented",
        "✅ L→I normalization working correctly",
        "✅ Hybrid/electric detection functional",
        "✅ Database storage format validated",
    ):
        print(summary_line)
    print("\n🚀 Ready for integration into ETL system!")
|
||||
@@ -0,0 +1,334 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Make Name Mapping Examples
|
||||
|
||||
This file demonstrates the complete make name normalization process,
|
||||
converting JSON filenames to proper display names for the database.
|
||||
|
||||
Usage:
|
||||
python make-mapping-examples.py
|
||||
"""
|
||||
|
||||
import json
|
||||
import glob
|
||||
import os
|
||||
from typing import Dict, Set, List, Tuple
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class ValidationReport:
    """Outcome of checking mapped make names against a reference list."""

    # Number of filenames that were checked.
    total_files: int
    # Number of filenames whose mapped name passed validation.
    valid_mappings: int
    # One dict per failed filename.
    mismatches: List[Dict[str, str]]

    @property
    def success_rate(self) -> float:
        """Fraction of valid mappings, or 0.0 when no files were checked."""
        if self.total_files > 0:
            return self.valid_mappings / self.total_files
        return 0.0
|
||||
|
||||
|
||||
class MakeNameMapper:
    """Convert JSON filenames to proper make display names.

    A filename such as ``alfa_romeo.json`` becomes ``Alfa Romeo``; a small
    table of special cases restores brand-specific capitalization.
    """

    def __init__(self):
        # Special capitalization cases (keyed by the title-cased name)
        self.special_cases = {
            'Bmw': 'BMW',  # Bayerische Motoren Werke
            'Gmc': 'GMC',  # General Motors Company
            'Mini': 'MINI',  # Brand styling
            'Mclaren': 'McLaren',  # Scottish naming convention
        }

        # Authoritative makes list (would be loaded from sources/makes.json)
        self.authoritative_makes = {
            'Acura', 'Alfa Romeo', 'Aston Martin', 'Audi', 'BMW', 'Bentley',
            'Buick', 'Cadillac', 'Chevrolet', 'Chrysler', 'Dodge', 'Ferrari',
            'Fiat', 'Ford', 'Genesis', 'Geo', 'GMC', 'Honda', 'Hummer',
            'Hyundai', 'Infiniti', 'Isuzu', 'Jaguar', 'Jeep', 'Kia',
            'Lamborghini', 'Land Rover', 'Lexus', 'Lincoln', 'Lotus', 'Lucid',
            'MINI', 'Maserati', 'Mazda', 'McLaren', 'Mercury', 'Mitsubishi',
            'Nissan', 'Oldsmobile', 'Plymouth', 'Polestar', 'Pontiac',
            'Porsche', 'Ram', 'Rivian', 'Rolls Royce', 'Saab', 'Saturn',
            'Scion', 'Smart', 'Subaru', 'Tesla', 'Toyota', 'Volkswagen',
            'Volvo',
        }

    def normalize_make_name(self, filename: str) -> str:
        """Convert filename to proper display name.

        Steps: drop a trailing ``.json`` extension (any letter case), turn
        underscores into spaces, collapse repeated whitespace, title-case the
        words, then apply the special-case table. A filename that consists
        only of the extension yields an empty string.
        """
        # Remove the extension only when it is the suffix; a blanket
        # replace() would also delete ".json" from the middle of a name.
        extension = '.json'
        if filename.lower().endswith(extension):
            base_name = filename[:-len(extension)]
        else:
            base_name = filename

        # Replace underscores with spaces; split/join collapses runs of
        # underscores so the display name never contains double spaces.
        spaced_name = ' '.join(base_name.replace('_', ' ').split())

        # Apply title case
        title_cased = spaced_name.title()

        # Apply special cases
        return self.special_cases.get(title_cased, title_cased)

    def validate_mapping(self, filename: str, display_name: str) -> bool:
        """Validate mapped name against authoritative list.

        ``filename`` is accepted for call-site symmetry but is not consulted.
        """
        return display_name in self.authoritative_makes

    def get_all_mappings(self) -> Dict[str, str]:
        """Get complete filename → display name mapping."""
        # Simulate the 55 JSON files found in the actual directory
        json_files = [
            'acura.json', 'alfa_romeo.json', 'aston_martin.json', 'audi.json',
            'bentley.json', 'bmw.json', 'buick.json', 'cadillac.json',
            'chevrolet.json', 'chrysler.json', 'dodge.json', 'ferrari.json',
            'fiat.json', 'ford.json', 'genesis.json', 'geo.json', 'gmc.json',
            'honda.json', 'hummer.json', 'hyundai.json', 'infiniti.json',
            'isuzu.json', 'jaguar.json', 'jeep.json', 'kia.json',
            'lamborghini.json', 'land_rover.json', 'lexus.json', 'lincoln.json',
            'lotus.json', 'lucid.json', 'maserati.json', 'mazda.json',
            'mclaren.json', 'mercury.json', 'mini.json', 'mitsubishi.json',
            'nissan.json', 'oldsmobile.json', 'plymouth.json', 'polestar.json',
            'pontiac.json', 'porsche.json', 'ram.json', 'rivian.json',
            'rolls_royce.json', 'saab.json', 'saturn.json', 'scion.json',
            'smart.json', 'subaru.json', 'tesla.json', 'toyota.json',
            'volkswagen.json', 'volvo.json',
        ]

        return {
            filename: self.normalize_make_name(filename)
            for filename in json_files
        }

    def validate_all_mappings(self) -> "ValidationReport":
        """Validate all mappings against authoritative list.

        Returns a ``ValidationReport`` whose ``mismatches`` holds one dict
        (filename, mapped_name, status) per name missing from the list.
        """
        mappings = self.get_all_mappings()
        mismatches = [
            {
                'filename': filename,
                'mapped_name': display_name,
                'status': 'NOT_FOUND_IN_AUTHORITATIVE',
            }
            for filename, display_name in mappings.items()
            if not self.validate_mapping(filename, display_name)
        ]

        return ValidationReport(
            total_files=len(mappings),
            valid_mappings=len(mappings) - len(mismatches),
            mismatches=mismatches,
        )
|
||||
|
||||
|
||||
def demonstrate_make_name_mapping():
    """Print the normalized name for sample filenames next to the expected one.

    A mismatch is flagged with an extra warning line.
    """
    mapper = MakeNameMapper()

    print("🏷️ Make Name Mapping Examples")
    print("=" * 40)

    # (filename, expected display name), grouped by transformation type
    single_word = [
        ('toyota.json', 'Toyota'),
        ('honda.json', 'Honda'),
        ('ford.json', 'Ford'),
    ]
    multi_word = [
        ('alfa_romeo.json', 'Alfa Romeo'),
        ('land_rover.json', 'Land Rover'),
        ('rolls_royce.json', 'Rolls Royce'),
        ('aston_martin.json', 'Aston Martin'),
    ]
    special_capitalization = [
        ('bmw.json', 'BMW'),
        ('gmc.json', 'GMC'),
        ('mini.json', 'MINI'),
        ('mclaren.json', 'McLaren'),
    ]

    for filename, expected in single_word + multi_word + special_capitalization:
        result = mapper.normalize_make_name(filename)
        matches = result == expected
        if matches:
            status = "✅"
        else:
            status = "❌"

        print(f"{status} {filename:20} → {result:15} (expected: {expected})")

        if not matches:
            print(f" ⚠️ MISMATCH: Expected '{expected}', got '{result}'")
|
||||
|
||||
|
||||
def demonstrate_complete_mapping():
    """Print every filename → display-name mapping, grouped by kind.

    Groups: multi-word makes (filename contains an underscore), special
    capitalization cases, and all remaining single-word makes.
    """
    mapper = MakeNameMapper()
    all_mappings = mapper.get_all_mappings()

    print(f"\n\n📋 Complete Make Name Mappings ({len(all_mappings)} files)")
    print("=" * 50)

    # Group by transformation type for clarity
    specially_capitalized = ['BMW', 'GMC', 'MINI', 'McLaren']
    single_words, multi_words, special_cases = [], [], []

    for filename, display_name in sorted(all_mappings.items()):
        if '_' in filename:
            bucket = multi_words
        elif display_name in specially_capitalized:
            bucket = special_cases
        else:
            bucket = single_words
        bucket.append((filename, display_name))

    print("\n🔤 Single Word Makes (Standard Title Case):")
    for filename, display_name in single_words:
        print(f" {filename:20} → {display_name}")

    print(f"\n📝 Multi-Word Makes (Underscore → Space, {len(multi_words)} total):")
    for filename, display_name in multi_words:
        print(f" {filename:20} → {display_name}")

    print(f"\n⭐ Special Capitalization Cases ({len(special_cases)} total):")
    for filename, display_name in special_cases:
        print(f" {filename:20} → {display_name}")
|
||||
|
||||
|
||||
def demonstrate_validation():
    """Print the validation report produced by ``MakeNameMapper``.

    Shows totals and success rate, then either each mismatch or a note that
    all mappings are valid.
    """
    report = MakeNameMapper().validate_all_mappings()

    print(f"\n\n✅ Validation Report")
    print("=" * 20)
    print(f"Total files processed: {report.total_files}")
    print(f"Valid mappings: {report.valid_mappings}")
    print(f"Success rate: {report.success_rate:.1%}")

    if not report.mismatches:
        print("\n🎉 All mappings valid!")
        return

    print(f"\n⚠️ Mismatches found ({len(report.mismatches)}):")
    for mismatch in report.mismatches:
        print(f" {mismatch['filename']} → {mismatch['mapped_name']}")
        print(f" Status: {mismatch['status']}")
|
||||
|
||||
|
||||
def demonstrate_database_integration():
    """Print an illustrative multi-row INSERT for a few make files.

    Every row ends with a comma except the last, which ends the statement
    with a semicolon.
    """
    mapper = MakeNameMapper()

    print(f"\n\n💾 Database Integration Example")
    print("=" * 35)

    sample_files = ['toyota.json', 'alfa_romeo.json', 'bmw.json', 'land_rover.json']
    last_index = len(sample_files) - 1

    print("SQL: INSERT INTO vehicles.make (name) VALUES")

    for i, filename in enumerate(sample_files):
        display_name = mapper.normalize_make_name(filename)
        if i == last_index:
            comma = ";"
        else:
            comma = ","

        print(f" ('{display_name}'){comma}")
        print(f" -- From file: {filename}")
|
||||
|
||||
|
||||
def demonstrate_error_handling():
    """Run the mapper over unusual filenames and print what it produces.

    Any exception raised for an input is caught and printed for that input
    so the remaining cases still run.
    """
    mapper = MakeNameMapper()

    print(f"\n\n🛠️ Error Handling Examples")
    print("=" * 30)

    edge_cases = (
        'unknown_brand.json',
        'test__multiple__underscores.json',
        'no_extension',
        '.json',  # Only extension
    )

    for filename in edge_cases:
        try:
            display_name = mapper.normalize_make_name(filename)
            if mapper.validate_mapping(filename, display_name):
                status = "✅ Valid"
            else:
                status = "⚠️ Not in authoritative list"

            print(f" {filename:35} → {display_name:15} ({status})")
        except Exception as e:
            print(f" {filename:35} → ERROR: {e}")
|
||||
|
||||
|
||||
def run_validation_tests():
    """Check normalization and validation results for sample filenames.

    Prints one line per case plus details for failures, then a tally.
    Returns True when every case passed, otherwise False.
    """
    mapper = MakeNameMapper()

    print(f"\n\n🧪 Validation Tests")
    print("=" * 20)

    # (filename, expected display name, expected validity)
    test_cases = [
        ('toyota.json', 'Toyota', True),
        ('alfa_romeo.json', 'Alfa Romeo', True),
        ('bmw.json', 'BMW', True),
        ('gmc.json', 'GMC', True),
        ('mclaren.json', 'McLaren', True),
        ('unknown_brand.json', 'Unknown Brand', False),
    ]

    passed = 0
    for filename, expected_name, expected_valid in test_cases:
        actual_name = mapper.normalize_make_name(filename)
        actual_valid = mapper.validate_mapping(filename, actual_name)

        name_correct = actual_name == expected_name
        valid_correct = actual_valid == expected_valid

        if not (name_correct and valid_correct):
            print(f"❌ {filename}")
            if not name_correct:
                print(f" Name: Expected '{expected_name}', got '{actual_name}'")
            if not valid_correct:
                print(f" Valid: Expected {expected_valid}, got {actual_valid}")
            continue

        print(f"✅ {filename} → {actual_name} (valid: {actual_valid})")
        passed += 1

    print(f"\n📊 Test Results: {passed}/{len(test_cases)} tests passed")

    if passed != len(test_cases):
        print("⚠️ Some tests failed!")
        return False

    print("🎉 All validation tests passed!")
    return True
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the demonstrations in order; the validation run decides which
    # closing message is printed.
    for demo in (
        demonstrate_make_name_mapping,
        demonstrate_complete_mapping,
        demonstrate_validation,
        demonstrate_database_integration,
        demonstrate_error_handling,
    ):
        demo()

    success = run_validation_tests()

    print("\n\n📋 Summary")
    print("=" * 10)
    for summary_line in (
        "✅ Make name normalization patterns implemented",
        "✅ Special capitalization cases handled",
        "✅ Multi-word make names (underscore → space) working",
        "✅ Validation against authoritative list functional",
        "✅ Database integration format demonstrated",
    ):
        print(summary_line)

    if not success:
        print("\n⚠️ Review failed tests before integration")
    else:
        print("\n🚀 Ready for integration into ETL system!")

    print("\nKey Implementation Notes:")
    for note in (
        "• filename.replace('.json', '').replace('_', ' ').title()",
        "• Special cases: BMW, GMC, MINI, McLaren",
        "• Validation against sources/makes.json required",
        "• Handle unknown makes gracefully (log warning, continue)",
    ):
        print(note)
|
||||
@@ -0,0 +1,449 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Sample JSON Processing Examples
|
||||
|
||||
This file demonstrates complete processing of JSON vehicle data,
|
||||
from file reading through database-ready output structures.
|
||||
|
||||
Usage:
|
||||
python sample-json-processing.py
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import List, Dict, Any, Optional
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@dataclass
class EngineSpec:
    """Parsed engine specification.

    Fields that cannot be determined for an engine are None.
    """

    # Displacement in litres; None if unknown.
    displacement_l: Optional[float]
    # Layout label for the engine (this file compares it against 'H' and 'I').
    configuration: str
    # Cylinder count; None if unknown.
    cylinders: Optional[int]
    # Fuel type label, e.g. 'Electric' or a label containing 'Hybrid'.
    fuel_type: str
    # Aspiration label. Optional because JsonProcessor stores the parser's
    # default electric motor for models with no engines.
    # NOTE(review): that record appears to carry no aspiration - confirm
    # against the parser module.
    aspiration: Optional[str]
    # The engine string exactly as it was supplied.
    raw_string: str
|
||||
|
||||
|
||||
@dataclass
class ModelData:
    """One vehicle model within a single model year."""

    # Model name as it appears in the JSON entry.
    name: str
    # Engines offered for the model.
    engines: List[EngineSpec]
    # Trim names, taken from the JSON "submodels" list.
    trims: List[str]  # From submodels
|
||||
|
||||
|
||||
@dataclass
class YearData:
    """All models recorded for one model year."""

    # Model year as an integer.
    year: int
    # Models listed under that year.
    models: List[ModelData]
|
||||
|
||||
|
||||
@dataclass
class MakeData:
    """Everything known about one make, with aggregate counters."""

    name: str  # Normalized display name
    filename: str  # Original JSON filename
    years: List[YearData]

    @property
    def total_models(self) -> int:
        """Number of model entries summed over all years."""
        count = 0
        for year in self.years:
            count += len(year.models)
        return count

    @property
    def total_engines(self) -> int:
        """Number of engine entries summed over every model of every year."""
        count = 0
        for year in self.years:
            for model in year.models:
                count += len(model.engines)
        return count

    @property
    def total_trims(self) -> int:
        """Number of trim entries summed over every model of every year."""
        count = 0
        for year in self.years:
            for model in year.models:
                count += len(model.trims)
        return count
|
||||
|
||||
|
||||
class JsonProcessor:
    """Process JSON vehicle files into structured data.

    Uses ``EngineSpecParser`` and ``MakeNameMapper`` from the sibling example
    scripts to turn one decoded make file into a ``MakeData`` tree.
    """

    def __init__(self):
        # Import our utility classes lazily so importing this module does
        # not require the sibling scripts to be present.
        engine_module = self._import_example_module('engine_parsing_examples')
        make_module = self._import_example_module('make_mapping_examples')

        self.engine_parser = engine_module.EngineSpecParser()
        self.make_mapper = make_module.MakeNameMapper()

    @staticmethod
    def _import_example_module(module_name: str):
        """Import a sibling example module by its underscore name.

        The example scripts are documented with hyphenated filenames (for
        example ``engine-parsing-examples.py``), which a plain ``import``
        cannot resolve. When the regular import fails, the hyphenated file
        next to this script is loaded under ``module_name`` instead.

        Raises:
            ModuleNotFoundError: if neither form of the module can be found.
        """
        import importlib
        import importlib.util
        import sys

        try:
            return importlib.import_module(module_name)
        except ModuleNotFoundError as exc:
            # Only fall back when the requested module itself is missing,
            # not when one of its own imports failed.
            if exc.name != module_name:
                raise
            script = Path(__file__).resolve().parent / (
                module_name.replace('_', '-') + '.py'
            )
            if not script.is_file():
                raise
            spec = importlib.util.spec_from_file_location(module_name, script)
            if spec is None or spec.loader is None:
                raise

            module = importlib.util.module_from_spec(spec)
            # Register before executing, as the importlib recipe recommends.
            sys.modules[module_name] = module
            try:
                spec.loader.exec_module(module)
            except BaseException:
                sys.modules.pop(module_name, None)
                raise
            return module

    def _build_engines(self, engine_strings) -> list:
        """Return parsed engines, or one default electric motor if none are listed."""
        if not engine_strings:
            # Electric vehicle - create default engine
            return [self.engine_parser.create_electric_motor()]
        return [
            self.engine_parser.parse_engine_string(engine_str)
            for engine_str in engine_strings
        ]

    def process_json_file(self, json_data: Dict[str, Any], filename: str) -> "MakeData":
        """Process complete JSON file into structured data.

        Args:
            json_data: Decoded JSON whose first key maps to a list of year
                entries (``year``, optional ``models``); each model has a
                ``name`` and optional ``engines`` and ``submodels`` lists.
            filename: Original JSON filename; the display name is derived
                from it rather than from the JSON key.

        Returns:
            The populated ``MakeData``.

        Raises:
            IndexError: if ``json_data`` is empty.
            KeyError: if a year entry lacks ``year`` or a model lacks ``name``.
        """
        # Get the make name (first key in JSON)
        make_key = list(json_data.keys())[0]
        display_name = self.make_mapper.normalize_make_name(filename)

        years_data = []
        for year_entry in json_data[make_key]:
            year = int(year_entry['year'])

            models_data = []
            for model_entry in year_entry.get('models', []):
                model_name = model_entry['name']
                engines = self._build_engines(model_entry.get('engines', []))

                # Process trims (from submodels)
                trims = model_entry.get('submodels', [])

                models_data.append(ModelData(
                    name=model_name,
                    engines=engines,
                    trims=trims,
                ))

            years_data.append(YearData(year=year, models=models_data))

        return MakeData(
            name=display_name,
            filename=filename,
            years=years_data,
        )
|
||||
|
||||
|
||||
def demonstrate_tesla_processing():
    """Process a small Tesla sample in which every model has no engines.

    Prints the aggregate counters followed by a per-year, per-model listing.
    """
    # Sample Tesla data (simplified from actual tesla.json); every "engines"
    # list is empty because the vehicles are electric.
    models_2024 = [
        {
            "name": "3",
            "engines": [],
            "submodels": ["Long Range AWD", "Performance", "Standard Plus"],
        },
        {
            "name": "y",
            "engines": [],
            "submodels": ["Long Range", "Performance"],
        },
    ]
    models_2023 = [
        {
            "name": "s",
            "engines": [],
            "submodels": ["Plaid", "Long Range Plus"],
        },
    ]
    tesla_json = {
        "tesla": [
            {"year": "2024", "models": models_2024},
            {"year": "2023", "models": models_2023},
        ]
    }

    make_data = JsonProcessor().process_json_file(tesla_json, 'tesla.json')

    print("⚡ Tesla JSON Processing Example")
    print("=" * 35)
    print(f"Filename: tesla.json")
    print(f"Display Name: {make_data.name}")
    print(f"Years: {len(make_data.years)}")
    print(f"Total Models: {make_data.total_models}")
    print(f"Total Engines: {make_data.total_engines}")
    print(f"Total Trims: {make_data.total_trims}")

    print(f"\nDetailed Breakdown:")
    for year_data in make_data.years:
        print(f"\n {year_data.year}:")
        for model in year_data.models:
            print(f" Model: {model.name}")
            print(f" Engines: {[e.raw_string for e in model.engines]}")
            print(f" Trims: {model.trims}")
|
||||
|
||||
|
||||
def demonstrate_subaru_processing():
    """Process a small Subaru sample whose engines use the H4 layout.

    Prints each engine string with its parsed form, tagging configuration
    'H' as Boxer and hybrid fuel types as Hybrid.
    """
    # Sample Subaru data showing H4 engines
    crosstrek = {
        "name": "crosstrek",
        "engines": [
            "2.0L H4",
            "2.0L H4 PLUG-IN HYBRID EV- (PHEV)",
            "2.5L H4",
        ],
        "submodels": ["Base", "Premium", "Limited", "Hybrid"],
    }
    forester = {
        "name": "forester",
        "engines": ["2.5L H4"],
        "submodels": ["Base", "Premium", "Sport", "Limited"],
    }
    subaru_json = {
        "subaru": [
            {"year": "2024", "models": [crosstrek, forester]},
        ]
    }

    make_data = JsonProcessor().process_json_file(subaru_json, 'subaru.json')

    print(f"\n\n🚗 Subaru JSON Processing Example (Boxer Engines)")
    print("=" * 50)
    print(f"Display Name: {make_data.name}")

    for year_data in make_data.years:
        print(f"\n{year_data.year}:")
        for model in year_data.models:
            print(f" {model.name}:")
            for engine in model.engines:
                config_note = ""
                if engine.configuration == 'H':
                    config_note = " (Boxer)"
                hybrid_note = ""
                if 'Hybrid' in engine.fuel_type:
                    hybrid_note = " (Hybrid)"
                print(f" Engine: {engine.raw_string}")
                print(f" → {engine.displacement_l}L {engine.configuration}{engine.cylinders}{config_note}{hybrid_note}")
|
||||
|
||||
|
||||
def demonstrate_l_to_i_processing():
    """Demonstrate L→I normalization during processing.

    Prints, for every engine of a small Nissan sample, the layout letter as
    written in the input next to the parsed configuration, and flags the
    engines where "L" was converted to "I".
    """
    import re  # local import: only this demonstration needs it

    # Sample data with L-configuration engines
    nissan_json = {
        "nissan": [
            {
                "year": "2024",
                "models": [
                    {
                        "name": "versa",
                        "engines": [
                            "1.6L I4"
                        ],
                        "submodels": ["S", "SV", "SR"]
                    },
                    {
                        "name": "kicks",
                        "engines": [
                            "1.5L L3 PLUG-IN HYBRID EV- (PHEV)"  # L3 → I3
                        ],
                        "submodels": ["S", "SV", "SR"]
                    },
                    {
                        "name": "note",
                        "engines": [
                            "1.2L L3 FULL HYBRID EV- (FHEV)"  # L3 → I3
                        ],
                        "submodels": ["Base", "Premium"]
                    }
                ]
            }
        ]
    }

    processor = JsonProcessor()
    make_data = processor.process_json_file(nissan_json, 'nissan.json')

    print(f"\n\n🎯 L→I Normalization Processing Example")
    print("=" * 42)

    # Standalone "<letter><digits>" token, e.g. "L3" or "I4". The "L" of the
    # displacement ("1.5L") follows a digit, so it has no leading word
    # boundary and is not mistaken for the layout token.
    layout_token = re.compile(r'\b([A-Z])\d+\b')

    for year_data in make_data.years:
        for model in year_data.models:
            for engine in model.engines:
                # Read the layout letter from the input instead of testing
                # for the literal "L3", which only covered three-cylinder
                # engines and labelled everything else as "I".
                found = layout_token.search(engine.raw_string)
                original_config = found.group(1) if found else engine.configuration
                normalized_config = engine.configuration

                print(f"Model: {model.name}")
                print(f" Input: \"{engine.raw_string}\"")
                print(f" Configuration: {original_config}{engine.cylinders} → {normalized_config}{engine.cylinders}")

                if original_config == "L" and normalized_config == "I":
                    print(f" 🎯 NORMALIZED: L→I conversion applied")
                print()
|
||||
|
||||
|
||||
def demonstrate_database_ready_output():
    """Print illustrative INSERT statements for a processed Toyota sample.

    One section per table (make, model, model year, engine, trim). Foreign
    keys are hard-coded to 1 and the SQL text is for display only. Engines
    are printed once per distinct (name, displacement, cylinders, fuel type).
    """
    # Sample mixed data
    camry = {
        "name": "camry",
        "engines": [
            "2.5L I4",
            "2.5L I4 FULL HYBRID EV- (FHEV)",
        ],
        "submodels": ["LE", "XLE", "Hybrid LE"],
    }
    sample_json = {"toyota": [{"year": "2024", "models": [camry]}]}

    make_data = JsonProcessor().process_json_file(sample_json, 'toyota.json')

    print(f"\n\n💾 Database-Ready Output")
    print("=" * 25)

    # Show SQL INSERT statements
    print("-- Make table")
    print(f"INSERT INTO vehicles.make (name) VALUES ('{make_data.name}');")

    # Flatten once; every per-model section walks the models in this order.
    all_models = [
        model
        for year_data in make_data.years
        for model in year_data.models
    ]

    print(f"\n-- Model table (assuming make_id = 1)")
    for model in all_models:
        print(f"INSERT INTO vehicles.model (make_id, name) VALUES (1, '{model.name}');")

    print(f"\n-- Model Year table (assuming model_id = 1)")
    for year_data in make_data.years:
        print(f"INSERT INTO vehicles.model_year (model_id, year) VALUES (1, {year_data.year});")

    print(f"\n-- Engine table")
    unique_engines = set()
    for model in all_models:
        for engine in model.engines:
            engine_key = (engine.raw_string, engine.displacement_l, engine.cylinders, engine.fuel_type)
            if engine_key in unique_engines:
                continue
            unique_engines.add(engine_key)
            print(f"INSERT INTO vehicles.engine (name, displacement_l, cylinders, fuel_type, aspiration)")
            print(f" VALUES ('{engine.raw_string}', {engine.displacement_l}, {engine.cylinders}, '{engine.fuel_type}', '{engine.aspiration}');")

    print(f"\n-- Trim table (assuming model_year_id = 1)")
    for model in all_models:
        for trim in model.trims:
            print(f"INSERT INTO vehicles.trim (model_year_id, name) VALUES (1, '{trim}');")
|
||||
|
||||
|
||||
def run_processing_validation():
    """Run a small set of smoke checks against ``JsonProcessor``.

    Three in-memory payloads (Tesla, Subaru, Nissan) are passed through
    ``JsonProcessor.process_json_file``. Each result must have a make name,
    at least one year and at least one model. Two payloads get extra checks:

    * ``tesla.json``: every model must end up with at least one engine, and
      every engine must have ``fuel_type == 'Electric'``.
    * ``nissan.json``: any engine whose raw string contains ``L3`` must have
      its configuration normalized to ``'I'``.

    Progress is printed to stdout.

    Returns:
        bool: ``True`` when every case passes; ``False`` as soon as one case
        fails (remaining cases are not run).
    """

    def require(condition, message):
        # Explicit raise instead of a bare ``assert`` statement: asserts are
        # stripped under ``python -O``, which would turn this whole
        # validation into a silent no-op.
        if not condition:
            raise AssertionError(message)

    print("\n\n✅ Processing Validation")
    print("=" * 25)

    processor = JsonProcessor()

    # Test cases: (source filename, parsed JSON payload)
    test_cases = [
        # Tesla (electric, empty engines)
        ('tesla.json', {"tesla": [{"year": "2024", "models": [{"name": "3", "engines": [], "submodels": ["Base"]}]}]}),
        # Subaru (H4 engines)
        ('subaru.json', {"subaru": [{"year": "2024", "models": [{"name": "crosstrek", "engines": ["2.0L H4"], "submodels": ["Base"]}]}]}),
        # Nissan (L→I normalization)
        ('nissan.json', {"nissan": [{"year": "2024", "models": [{"name": "kicks", "engines": ["1.5L L3"], "submodels": ["Base"]}]}]}),
    ]

    for filename, json_data in test_cases:
        try:
            make_data = processor.process_json_file(json_data, filename)

            # Basic validation
            require(make_data.name is not None, "Make name should not be None")
            require(len(make_data.years) > 0, "Should have at least one year")
            require(make_data.total_models > 0, "Should have at least one model")

            print(f"✅ {filename} processed successfully")
            print(f" Make: {make_data.name}, Models: {make_data.total_models}, Engines: {make_data.total_engines}")

            # Special validations
            if filename == 'tesla.json':
                # Should have electric motors for empty engines
                for year_data in make_data.years:
                    for model in year_data.models:
                        # all() over an empty list is vacuously True, so
                        # check explicitly that an engine was created.
                        require(len(model.engines) > 0, "Tesla models should get a default electric engine")
                        require(
                            all(e.fuel_type == 'Electric' for e in model.engines),
                            "Tesla should have electric engines",
                        )

            if filename == 'nissan.json':
                # Should have L→I normalization
                for year_data in make_data.years:
                    for model in year_data.models:
                        for engine in model.engines:
                            if 'L3' in engine.raw_string:
                                require(engine.configuration == 'I', "L3 should become I3")

        except Exception as e:
            # Top-level reporting boundary: any failure (check or processing
            # error) is printed and ends the run with a False result.
            print(f"❌ {filename} failed: {e}")
            return False

    print("\n🎉 All processing validation tests passed!")
    return True
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run every demonstration in order, then the validation pass.
    demonstrate_tesla_processing()
    demonstrate_subaru_processing()
    demonstrate_l_to_i_processing()
    demonstrate_database_ready_output()

    success = run_processing_validation()

    # Fixed summary banner, printed regardless of the validation outcome.
    summary_lines = (
        "\n\n📋 Summary",
        "=" * 10,
        "✅ JSON file processing implemented",
        "✅ Electric vehicle handling (empty engines → Electric Motor)",
        "✅ L→I normalization during processing",
        "✅ Database-ready output structures",
        "✅ Make name normalization integrated",
        "✅ Engine specification parsing integrated",
    )
    for summary_line in summary_lines:
        print(summary_line)

    # Only this verdict line depends on the validation result.
    verdict = (
        "\n🚀 Ready for ETL pipeline integration!"
        if success
        else "\n⚠️ Review failed validations"
    )
    print(verdict)

    # Follow-up work items.
    next_steps = (
        "\nNext Steps:",
        "• Integrate with PostgreSQL loader",
        "• Add batch processing for all 55 files",
        "• Implement clear/append modes",
        "• Add CLI interface",
        "• Create comprehensive test suite",
    )
    for step_line in next_steps:
        print(step_line)
|
||||
Reference in New Issue
Block a user