Files
motovaultpro/mvp-platform-services/vehicles/etl/validate_utilities.py
Eric Gullickson a052040e3a Initial Commit
2025-09-17 16:09:15 -05:00

349 lines
13 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Validate Utilities Against Actual JSON Files
This script validates the MakeNameMapper and EngineSpecParser utilities
against the actual 55 JSON files in the sources/makes/ directory.
Performs comprehensive validation of:
- Make name normalization (all 55 files)
- Engine specification parsing (all engines across all files)
- L→I normalization detection and verification
- Electric vehicle handling (Tesla, Lucid, etc.)
- Data quality reporting
Usage:
python validate_utilities.py
"""
import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Tuple
from collections import defaultdict
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
from utils.make_name_mapper import MakeNameMapper
from utils.engine_spec_parser import EngineSpecParser
class ValidationResults:
    """Accumulates every finding produced during a validation run.

    Attributes are plain containers that the ``add_*`` methods append to:

    - ``make_results``: filename -> {'display_name', 'is_valid'}
    - ``engine_results``: make -> list of {'original', 'parsed', 'warnings'}
    - ``l_to_i_cases``: list of {'make', 'original', 'normalized'}
    - ``electric_vehicles``: list of {'make', 'model', 'year'}
    - ``parsing_errors``: list of {'make', 'engine', 'error'}
    - ``quality_issues``: list; no method in this class writes to it
    """

    def __init__(self):
        self.make_results = {}
        self.engine_results = {}
        self.l_to_i_cases = []
        self.electric_vehicles = []
        self.parsing_errors = []
        self.quality_issues = []

    def add_make_result(self, filename: str, display_name: str, is_valid: bool):
        """Store the make-name outcome for ``filename`` (replaces any prior entry)."""
        self.make_results[filename] = dict(
            display_name=display_name,
            is_valid=is_valid,
        )

    def add_engine_result(self, make: str, engine_str: str, spec, warnings: List[str]):
        """Append one parsed engine record to the list kept for ``make``."""
        record = dict(original=engine_str, parsed=spec, warnings=warnings)
        # setdefault creates the per-make list the first time a make is seen.
        self.engine_results.setdefault(make, []).append(record)

    def add_l_to_i_case(self, make: str, original: str, normalized: str):
        """Record an engine string that went through L→I normalization."""
        self.l_to_i_cases.append(
            dict(make=make, original=original, normalized=normalized)
        )

    def add_electric_vehicle(self, make: str, model: str, year: str):
        """Record a model that was treated as electric (empty engines list)."""
        self.electric_vehicles.append(dict(make=make, model=model, year=year))

    def add_parsing_error(self, make: str, engine_str: str, error: str):
        """Record a failure together with the input that triggered it."""
        self.parsing_errors.append(
            dict(make=make, engine=engine_str, error=error)
        )
def validate_json_files(json_dir: str) -> ValidationResults:
    """
    Run the make-name and engine-spec utilities over every make file.

    For each ``*.json`` file in ``json_dir`` the filename is normalized to a
    display name and that mapping is validated; then every engine string in
    the file is parsed and validated. Models whose engines list is empty are
    recorded as electric vehicles. An exception raised while handling one
    engine string is recorded as a parsing error for that engine; any other
    exception raised while handling a file is recorded as a
    ``FILE_PROCESSING`` error and processing moves on to the next file.

    Args:
        json_dir: Directory containing make JSON files

    Returns:
        ValidationResults object with all findings
    """
    findings = ValidationResults()

    # Utilities under test
    mapper = MakeNameMapper(sources_dir="sources")
    parser = EngineSpecParser()

    make_files = list(Path(json_dir).glob("*.json"))
    print(f"🔍 Validating against {len(make_files)} JSON files...")

    # Substitutions used to show what an L-style engine string looks like
    # once rewritten in I-style; applied in this order.
    l_to_i_swaps = (('L3', 'I3'), ('L4', 'I4'), (' L', ' I'))

    for make_path in make_files:
        filename = make_path.name
        print(f"\n📄 Processing {filename}...")

        try:
            # --- make name mapping ---
            display_name = mapper.normalize_make_name(filename)
            mapping_ok = mapper.validate_mapping(filename, display_name)
            findings.add_make_result(filename, display_name, mapping_ok)

            if mapping_ok:
                print(f" ✅ Make name: {filename}{display_name}")
            else:
                print(f" ⚠️ Make name validation failed: {filename}{display_name}")

            # --- file content ---
            with open(make_path, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)

            if not payload:
                print(f" ⚠️ Empty JSON file: {filename}")
                continue

            # The make's data lives under the first top-level key.
            first_key = next(iter(payload.keys()))
            year_entries = payload[first_key]

            engines_seen = 0
            engines_parsed = 0
            ev_models = 0

            for year_entry in year_entries:
                year = year_entry.get('year', 'unknown')

                for model_entry in year_entry.get('models', []):
                    model_name = model_entry.get('name', 'unknown')
                    engines = model_entry.get('engines', [])

                    if not engines:
                        # No engines listed: treat the model as electric and
                        # exercise the electric-motor factory as well.
                        findings.add_electric_vehicle(display_name, model_name, year)
                        ev_models += 1
                        findings.add_engine_result(
                            display_name,
                            "EMPTY_ARRAY",
                            parser.create_electric_motor(),
                            [],
                        )
                        continue

                    for raw_engine in engines:
                        engines_seen += 1
                        try:
                            parsed = parser.parse_engine_string(raw_engine)
                            spec_warnings = parser.validate_engine_spec(parsed)
                            findings.add_engine_result(
                                display_name, raw_engine, parsed, spec_warnings
                            )
                            engines_parsed += 1

                            # An 'L' in the source text that came out as an
                            # 'I' configuration is an L→I normalization case
                            # (strings mentioning ELECTRIC are excluded).
                            was_l_to_i = (
                                'L' in raw_engine
                                and parsed.configuration == 'I'
                                and 'ELECTRIC' not in raw_engine.upper()
                            )
                            if was_l_to_i:
                                rewritten = raw_engine
                                for old, new in l_to_i_swaps:
                                    rewritten = rewritten.replace(old, new)
                                findings.add_l_to_i_case(display_name, raw_engine, rewritten)

                        except Exception as exc:
                            findings.add_parsing_error(display_name, raw_engine, str(exc))

            print(f" 📊 Engines: {engines_parsed}/{engines_seen} parsed successfully")
            if ev_models:
                print(f" ⚡ Electric vehicles: {ev_models} models with empty engines")

        except Exception as exc:
            print(f" ❌ Failed to process {filename}: {exc}")
            findings.add_parsing_error(filename, "FILE_PROCESSING", str(exc))

    return findings
def _success_rate(successes: int, total: int) -> float:
    """Return successes/total as a fraction, or 0.0 when total is zero."""
    return successes / total if total > 0 else 0.0


def generate_validation_report(results: "ValidationResults") -> None:
    """
    Print a comprehensive validation report to stdout.

    Sections, in order: make name validation, engine specification parsing,
    L→I normalization cases, electric vehicle handling, engine configuration
    and fuel type distributions, parsing errors (only when there are any),
    and an overall assessment.

    Every success rate goes through one guarded ratio, so a run that
    collected no make files or no engines reports 0.0% instead of raising
    ZeroDivisionError.

    Args:
        results: Findings to summarize. Reads ``make_results``,
            ``engine_results``, ``l_to_i_cases``, ``electric_vehicles`` and
            ``parsing_errors``; each stored ``parsed`` spec must expose
            ``configuration`` and ``fuel_type`` attributes.
    """
    print("\n" + "="*60)
    print("📋 COMPREHENSIVE VALIDATION REPORT")
    print("="*60)

    # Make Name Validation Summary
    total_makes = len(results.make_results)
    valid_makes = sum(1 for r in results.make_results.values() if r['is_valid'])
    make_rate = _success_rate(valid_makes, total_makes)

    print("\n🏷️ MAKE NAME VALIDATION")
    print(f" Total files: {total_makes}")
    print(f" Valid mappings: {valid_makes}")
    print(f" Success rate: {make_rate:.1%}")

    # Show invalid makes
    invalid_makes = [
        (f, r['display_name'])
        for f, r in results.make_results.items()
        if not r['is_valid']
    ]
    if invalid_makes:
        print(f"\n ⚠️ Invalid makes ({len(invalid_makes)}):")
        for filename, display_name in invalid_makes:
            print(f" {filename}{display_name}")
    else:
        print(" ✅ All make names are valid!")

    # Engine Parsing Summary
    # A record counts as parsed when its configuration is not 'Unknown'.
    total_engines = sum(len(engines) for engines in results.engine_results.values())
    successful_parses = sum(
        1
        for engines in results.engine_results.values()
        for engine in engines
        if engine['parsed'].configuration != 'Unknown'
    )
    engine_rate = _success_rate(successful_parses, total_engines)

    print("\n🔧 ENGINE SPECIFICATION PARSING")
    print(f" Total engines: {total_engines}")
    print(f" Successfully parsed: {successful_parses}")
    print(f" Success rate: {engine_rate:.1%}")

    # L→I Normalization Cases
    print("\n🎯 L→I NORMALIZATION (CRITICAL)")
    print(f" Cases found: {len(results.l_to_i_cases)}")
    if results.l_to_i_cases:
        print(" Examples:")
        for case in results.l_to_i_cases[:10]:  # Show first 10
            print(f" {case['make']}: '{case['original']}''{case['normalized']}'")
        if len(results.l_to_i_cases) > 10:
            print(f" ... and {len(results.l_to_i_cases) - 10} more cases")
    else:
        print(" ⚠️ No L→I normalization cases found in data")

    # Electric Vehicle Handling
    print("\n⚡ ELECTRIC VEHICLE HANDLING")
    print(f" Models with empty engines: {len(results.electric_vehicles)}")
    if results.electric_vehicles:
        # Group by make
        ev_by_make = defaultdict(list)
        for ev in results.electric_vehicles:
            ev_by_make[ev['make']].append(f"{ev['year']} {ev['model']}")

        print(" Electric vehicles by make:")
        for make, models in ev_by_make.items():
            print(f" {make}: {len(models)} models")
            if make.lower() in ['tesla', 'lucid', 'rivian']:  # Show details for known EVs
                for model in models[:5]:  # Show first 5
                    print(f" {model}")
                if len(models) > 5:
                    print(f" ... and {len(models) - 5} more")

    # Configuration Distribution
    print("\n📊 ENGINE CONFIGURATION DISTRIBUTION")
    config_counts = defaultdict(int)
    for engines in results.engine_results.values():
        for engine in engines:
            config_counts[engine['parsed'].configuration] += 1
    for config, count in sorted(config_counts.items(), key=lambda x: x[1], reverse=True):
        percentage = _success_rate(count, total_engines) * 100
        print(f" {config}: {count} engines ({percentage:.1f}%)")

    # Fuel Type Distribution
    print("\n⛽ FUEL TYPE DISTRIBUTION")
    fuel_counts = defaultdict(int)
    for engines in results.engine_results.values():
        for engine in engines:
            fuel_counts[engine['parsed'].fuel_type] += 1
    for fuel, count in sorted(fuel_counts.items(), key=lambda x: x[1], reverse=True):
        percentage = _success_rate(count, total_engines) * 100
        print(f" {fuel}: {count} engines ({percentage:.1f}%)")

    # Data Quality Issues
    if results.parsing_errors:
        print(f"\n❌ PARSING ERRORS ({len(results.parsing_errors)})")
        # Group identical error messages so each is reported once.
        error_types = defaultdict(list)
        for error in results.parsing_errors:
            error_types[error['error']].append(error)
        for error_type, errors in error_types.items():
            print(f" {error_type}: {len(errors)} occurrences")
            for error in errors[:3]:  # Show first 3 examples
                if error.get('engine'):
                    print(f" {error['make']}: '{error['engine']}'")
                else:
                    print(f" {error['make']}")

    # Overall Assessment
    print("\n🎉 OVERALL ASSESSMENT")
    if valid_makes == total_makes:
        print(" ✅ Make name normalization: PERFECT")
    else:
        print(f" ⚠️ Make name normalization: {make_rate:.1%}")

    if engine_rate >= 0.95:  # 95% threshold
        print(f" ✅ Engine parsing: EXCELLENT ({engine_rate:.1%})")
    elif engine_rate >= 0.85:  # 85% threshold
        print(f" ✅ Engine parsing: GOOD ({engine_rate:.1%})")
    else:
        print(f" ⚠️ Engine parsing: NEEDS IMPROVEMENT ({engine_rate:.1%})")

    if results.l_to_i_cases:
        print(f" ✅ L→I normalization: WORKING ({len(results.l_to_i_cases)} cases)")
    else:
        print(" L→I normalization: NO CASES FOUND (may be normal)")

    if results.electric_vehicles:
        print(f" ✅ Electric vehicle handling: WORKING ({len(results.electric_vehicles)} models)")
    else:
        print(" Electric vehicle handling: NO EMPTY ENGINES FOUND")
def main():
    """Run the validation end to end and return a process exit status.

    Looks for ``sources/makes`` next to this script, validates every JSON
    file found there, and prints the report.

    Returns:
        0 when every make mapping is valid and no parsing errors were
        recorded; 1 otherwise, including when the directory is missing.
    """
    script_dir = Path(__file__).parent
    makes_dir = script_dir / "sources" / "makes"

    # Nothing to validate without the data directory.
    if not makes_dir.exists():
        print(f"❌ JSON files directory not found: {makes_dir}")
        print(f" Current directory: {script_dir}")
        print(f" Looking for: sources/makes/ directory")
        return 1

    print(f"🚀 Starting validation against: {makes_dir}")

    outcome = validate_json_files(str(makes_dir))
    generate_validation_report(outcome)

    # The run passes only if every mapping validated and nothing failed to parse.
    every_make_valid = all(
        entry['is_valid'] for entry in outcome.make_results.values()
    )
    if not every_make_valid or outcome.parsing_errors:
        print(f"\n⚠️ VALIDATION ISSUES FOUND - Review before proceeding")
        return 1

    print(f"\n🎉 VALIDATION PASSED - Ready for implementation!")
    return 0
# Script entry point: main()'s return value becomes the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())