Initial Commit
This commit is contained in:
349
mvp-platform-services/vehicles/etl/validate_utilities.py
Normal file
349
mvp-platform-services/vehicles/etl/validate_utilities.py
Normal file
@@ -0,0 +1,349 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate Utilities Against Actual JSON Files
|
||||
|
||||
This script validates the MakeNameMapper and EngineSpecParser utilities
|
||||
against the actual 55 JSON files in the sources/makes/ directory.
|
||||
|
||||
Performs comprehensive validation of:
|
||||
- Make name normalization (all 55 files)
|
||||
- Engine specification parsing (all engines across all files)
|
||||
- L→I normalization detection and verification
|
||||
- Electric vehicle handling (Tesla, Lucid, etc.)
|
||||
- Data quality reporting
|
||||
|
||||
Usage:
|
||||
python validate_utilities.py
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple
|
||||
from collections import defaultdict
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from utils.make_name_mapper import MakeNameMapper
|
||||
from utils.engine_spec_parser import EngineSpecParser
|
||||
|
||||
|
||||
class ValidationResults:
    """Accumulator for everything a validation run discovers.

    The ``add_*`` methods append plain dictionaries to the public
    containers below; report code reads those containers directly.
    """

    def __init__(self):
        # filename -> {'display_name': ..., 'is_valid': ...}
        self.make_results = {}
        # make -> list of {'original': ..., 'parsed': ..., 'warnings': ...}
        self.engine_results = {}
        # Each entry: {'make', 'original', 'normalized'}
        self.l_to_i_cases = []
        # Each entry: {'make', 'model', 'year'}
        self.electric_vehicles = []
        # Each entry: {'make', 'engine', 'error'}
        self.parsing_errors = []
        # Initialised empty; no method of this class writes to it.
        self.quality_issues = []

    def add_make_result(self, filename: str, display_name: str, is_valid: bool):
        """Store the make-name outcome for *filename*, replacing any earlier one."""
        outcome = dict(display_name=display_name, is_valid=is_valid)
        self.make_results[filename] = outcome

    def add_engine_result(self, make: str, engine_str: str, spec, warnings: List[str]):
        """Append one parsed engine (with its warnings) to the list kept for *make*."""
        record = dict(original=engine_str, parsed=spec, warnings=warnings)
        # setdefault creates the per-make list the first time a make is seen.
        self.engine_results.setdefault(make, []).append(record)

    def add_l_to_i_case(self, make: str, original: str, normalized: str):
        """Remember an engine string whose L configuration was normalised to I."""
        case = dict(make=make, original=original, normalized=normalized)
        self.l_to_i_cases.append(case)

    def add_electric_vehicle(self, make: str, model: str, year: str):
        """Remember a model whose engine list was empty."""
        vehicle = dict(make=make, model=model, year=year)
        self.electric_vehicles.append(vehicle)

    def add_parsing_error(self, make: str, engine_str: str, error: str):
        """Remember a failure, keyed by make, offending engine string and message."""
        failure = dict(make=make, engine=engine_str, error=error)
        self.parsing_errors.append(failure)
|
||||
|
||||
|
||||
def _rewrite_l_configuration(engine_str: str) -> str:
    """Return *engine_str* with ``L<n>`` layout tokens rewritten as ``I<n>``.

    Only an ``L`` that starts a token and is directly followed by a digit is
    rewritten, so "1.5L L3" becomes "1.5L I3" while the litre suffix in
    "2.0L I4" and words that merely begin with "L" are left alone.
    """
    import re  # local import keeps this block self-contained

    # Lookbehind: the L must not continue a number or word (e.g. "2.0L").
    # Lookahead: the L must be followed by a cylinder count.
    return re.sub(r"(?<![A-Za-z0-9.])L(?=\d)", "I", engine_str)


def validate_json_files(json_dir: str) -> ValidationResults:
    """
    Validate the make-name mapper and engine parser against every make file.

    For each ``*.json`` file in *json_dir* (processed in sorted order so the
    output is reproducible) the make name is normalised and validated, then
    every engine string of every model is parsed. Models with an empty engine
    list are recorded as electric vehicles. Failures are recorded on the
    returned object instead of being raised, so one bad file or engine string
    does not stop the run.

    Args:
        json_dir: Directory containing make JSON files

    Returns:
        ValidationResults object with all findings
    """
    results = ValidationResults()

    # Initialize utilities
    # NOTE(review): "sources" is presumably resolved relative to the working
    # directory rather than to json_dir - confirm against MakeNameMapper.
    make_mapper = MakeNameMapper(sources_dir="sources")
    engine_parser = EngineSpecParser()

    # Find all JSON files; sorted because glob order is filesystem-dependent.
    json_files = sorted(Path(json_dir).glob("*.json"))
    print(f"🔍 Validating against {len(json_files)} JSON files...")

    for json_file in json_files:
        filename = json_file.name
        print(f"\n📄 Processing {filename}...")

        try:
            # Validate make name mapping
            display_name = make_mapper.normalize_make_name(filename)
            is_valid = make_mapper.validate_mapping(filename, display_name)
            results.add_make_result(filename, display_name, is_valid)

            if not is_valid:
                print(f" ⚠️ Make name validation failed: {filename} → {display_name}")
            else:
                print(f" ✅ Make name: {filename} → {display_name}")

            # Load and parse JSON content
            with open(json_file, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            if not json_data:
                print(f" ⚠️ Empty JSON file: {filename}")
                continue

            # The make's data is the value stored under the file's first key.
            make_data = next(iter(json_data.values()))

            # Process all engines in this make
            total_engines = 0
            parsed_engines = 0
            empty_engine_models = 0

            for year_entry in make_data:
                year = year_entry.get('year', 'unknown')

                for model_entry in year_entry.get('models', []):
                    model_name = model_entry.get('name', 'unknown')
                    engines_list = model_entry.get('engines', [])

                    if not engines_list:
                        # Electric vehicle case
                        results.add_electric_vehicle(display_name, model_name, year)
                        empty_engine_models += 1

                        # Test electric motor creation
                        electric_spec = engine_parser.create_electric_motor()
                        results.add_engine_result(display_name, "EMPTY_ARRAY", electric_spec, [])
                        continue

                    # Parse each engine
                    for engine_str in engines_list:
                        total_engines += 1

                        # Everything that can raise runs before anything is
                        # recorded, so a failing engine is never counted as
                        # both parsed and errored.
                        try:
                            spec = engine_parser.parse_engine_string(engine_str)
                            warnings = engine_parser.validate_engine_spec(spec)
                            is_inline = spec.configuration == 'I'
                            is_electric = 'ELECTRIC' in engine_str.upper()
                            normalized_str = _rewrite_l_configuration(engine_str)
                        except Exception as e:
                            results.add_parsing_error(display_name, engine_str, str(e))
                            continue

                        results.add_engine_result(display_name, engine_str, spec, warnings)
                        parsed_engines += 1

                        # An L→I case needs an actual L<n> token in the source
                        # string; a bare "L" (litres) is not enough.
                        if is_inline and not is_electric and normalized_str != engine_str:
                            results.add_l_to_i_case(display_name, engine_str, normalized_str)

            print(f" 📊 Engines: {parsed_engines}/{total_engines} parsed successfully")
            if empty_engine_models > 0:
                print(f" ⚡ Electric vehicles: {empty_engine_models} models with empty engines")

        except Exception as e:
            # Per-file boundary: record the failure and move on to the next file.
            print(f" ❌ Failed to process {filename}: {e}")
            results.add_parsing_error(filename, "FILE_PROCESSING", str(e))

    return results
|
||||
|
||||
|
||||
def _format_rate(part: int, whole: int) -> str:
    """Format part/whole as a one-decimal percentage, or "n/a" when whole is 0."""
    return f"{part / whole:.1%}" if whole else "n/a"


def generate_validation_report(results: "ValidationResults") -> None:
    """Print a human-readable report of the collected validation results.

    Sections: make-name validation, engine parsing, L→I normalization,
    electric vehicles, configuration and fuel-type distributions, parsing
    errors, and an overall assessment. Rates are shown as "n/a" when nothing
    was collected, so an empty run does not raise ZeroDivisionError.

    Args:
        results: Object exposing ``make_results``, ``engine_results``,
            ``l_to_i_cases``, ``electric_vehicles`` and ``parsing_errors``
            in the shape ValidationResults produces.
    """
    print("\n" + "="*60)
    print("📋 COMPREHENSIVE VALIDATION REPORT")
    print("="*60)

    # Make Name Validation Summary
    total_makes = len(results.make_results)
    valid_makes = sum(1 for r in results.make_results.values() if r['is_valid'])
    make_rate = _format_rate(valid_makes, total_makes)

    print("\n🏷️ MAKE NAME VALIDATION")
    print(f" Total files: {total_makes}")
    print(f" Valid mappings: {valid_makes}")
    print(f" Success rate: {make_rate}")

    # Show invalid makes
    invalid_makes = [(f, r['display_name']) for f, r in results.make_results.items() if not r['is_valid']]
    if invalid_makes:
        print(f"\n ⚠️ Invalid makes ({len(invalid_makes)}):")
        for filename, display_name in invalid_makes:
            print(f" {filename} → {display_name}")
    else:
        print(" ✅ All make names are valid!")

    # Engine Parsing Summary
    # Flatten once; every later section iterates the same engine records.
    all_engines = [engine for engines in results.engine_results.values() for engine in engines]
    total_engines = len(all_engines)
    successful_parses = sum(1 for engine in all_engines
                            if engine['parsed'].configuration != 'Unknown')
    engine_rate = _format_rate(successful_parses, total_engines)

    print("\n🔧 ENGINE SPECIFICATION PARSING")
    print(f" Total engines: {total_engines}")
    print(f" Successfully parsed: {successful_parses}")
    print(f" Success rate: {engine_rate}")

    # L→I Normalization Cases
    print("\n🎯 L→I NORMALIZATION (CRITICAL)")
    print(f" Cases found: {len(results.l_to_i_cases)}")

    if results.l_to_i_cases:
        print(" Examples:")
        for case in results.l_to_i_cases[:10]:  # Show first 10
            print(f" {case['make']}: '{case['original']}' → '{case['normalized']}'")

        if len(results.l_to_i_cases) > 10:
            print(f" ... and {len(results.l_to_i_cases) - 10} more cases")
    else:
        print(" ⚠️ No L→I normalization cases found in data")

    # Electric Vehicle Handling
    print("\n⚡ ELECTRIC VEHICLE HANDLING")
    print(f" Models with empty engines: {len(results.electric_vehicles)}")

    if results.electric_vehicles:
        # Group by make
        ev_by_make = defaultdict(list)
        for ev in results.electric_vehicles:
            ev_by_make[ev['make']].append(f"{ev['year']} {ev['model']}")

        print(" Electric vehicles by make:")
        for make, models in ev_by_make.items():
            print(f" {make}: {len(models)} models")
            if make.lower() in ('tesla', 'lucid', 'rivian'):  # Show details for known EVs
                for model in models[:5]:  # Show first 5
                    print(f" {model}")
                if len(models) > 5:
                    print(f" ... and {len(models) - 5} more")

    # Configuration Distribution
    print("\n📊 ENGINE CONFIGURATION DISTRIBUTION")
    config_counts = defaultdict(int)
    for engine in all_engines:
        config_counts[engine['parsed'].configuration] += 1

    for config, count in sorted(config_counts.items(), key=lambda x: x[1], reverse=True):
        percentage = count / total_engines * 100 if total_engines > 0 else 0
        print(f" {config}: {count} engines ({percentage:.1f}%)")

    # Fuel Type Distribution
    print("\n⛽ FUEL TYPE DISTRIBUTION")
    fuel_counts = defaultdict(int)
    for engine in all_engines:
        fuel_counts[engine['parsed'].fuel_type] += 1

    for fuel, count in sorted(fuel_counts.items(), key=lambda x: x[1], reverse=True):
        percentage = count / total_engines * 100 if total_engines > 0 else 0
        print(f" {fuel}: {count} engines ({percentage:.1f}%)")

    # Data Quality Issues
    if results.parsing_errors:
        print(f"\n❌ PARSING ERRORS ({len(results.parsing_errors)})")
        # Group identical error messages so each is listed once with a count.
        error_types = defaultdict(list)
        for error in results.parsing_errors:
            error_types[error['error']].append(error)

        for error_type, errors in error_types.items():
            print(f" {error_type}: {len(errors)} occurrences")
            for error in errors[:3]:  # Show first 3 examples
                if error.get('engine'):
                    print(f" {error['make']}: '{error['engine']}'")
                else:
                    print(f" {error['make']}")

    # Overall Assessment
    print("\n🎉 OVERALL ASSESSMENT")

    if not total_makes:
        # Zero of zero is not a meaningful "perfect" score.
        print(" ℹ️ Make name normalization: NO FILES PROCESSED")
    elif valid_makes == total_makes:
        print(" ✅ Make name normalization: PERFECT")
    else:
        print(f" ⚠️ Make name normalization: {make_rate}")

    if not total_engines:
        print(" ℹ️ Engine parsing: NO ENGINES FOUND")
    else:
        parse_ratio = successful_parses / total_engines
        if parse_ratio >= 0.95:  # 95% threshold
            print(f" ✅ Engine parsing: EXCELLENT ({engine_rate})")
        elif parse_ratio >= 0.85:  # 85% threshold
            print(f" ✅ Engine parsing: GOOD ({engine_rate})")
        else:
            print(f" ⚠️ Engine parsing: NEEDS IMPROVEMENT ({engine_rate})")

    if results.l_to_i_cases:
        print(f" ✅ L→I normalization: WORKING ({len(results.l_to_i_cases)} cases)")
    else:
        print(" ℹ️ L→I normalization: NO CASES FOUND (may be normal)")

    if results.electric_vehicles:
        print(f" ✅ Electric vehicle handling: WORKING ({len(results.electric_vehicles)} models)")
    else:
        print(" ℹ️ Electric vehicle handling: NO EMPTY ENGINES FOUND")
|
||||
|
||||
|
||||
def main() -> int:
    """Run the validation against ``sources/makes/`` next to this script.

    Returns:
        0 when every make name validated and no parsing errors were recorded;
        1 when the directory is missing, contains no JSON files, or any
        validation issue was found. The value is used as the process exit
        code.
    """
    # Determine JSON files directory
    current_dir = Path(__file__).parent
    json_dir = current_dir / "sources" / "makes"

    # is_dir() rather than exists(): a regular file at this path is not usable.
    if not json_dir.is_dir():
        print(f"❌ JSON files directory not found: {json_dir}")
        print(f" Current directory: {current_dir}")
        print(" Looking for: sources/makes/ directory")
        return 1

    print(f"🚀 Starting validation against: {json_dir}")

    # Run validation
    results = validate_json_files(str(json_dir))

    # Nothing recorded at all means no JSON files were found. Without this
    # guard the run would count as a pass (0 of 0 makes valid, no errors).
    if not results.make_results and not results.parsing_errors:
        print(f"\n❌ No JSON files found in: {json_dir}")
        return 1

    # Generate report
    generate_validation_report(results)

    # Return success/failure code
    total_makes = len(results.make_results)
    valid_makes = sum(1 for r in results.make_results.values() if r['is_valid'])

    if valid_makes == total_makes and not results.parsing_errors:
        print("\n🎉 VALIDATION PASSED - Ready for implementation!")
        return 0

    print("\n⚠️ VALIDATION ISSUES FOUND - Review before proceeding")
    return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user