Initial Commit

2025-09-17 16:09:15 -05:00
parent 0cdb9803de
commit a052040e3a
373 changed files with 437090 additions and 6773 deletions
--- a/mvp-platform-services/vehicles/etl/validate_utilities.py
+++ b/mvp-platform-services/vehicles/etl/validate_utilities.py
@@ -0,0 +1,349 @@
+#!/usr/bin/env python3
+"""
+Validate Utilities Against Actual JSON Files
+
+This script validates the MakeNameMapper and EngineSpecParser utilities
+against the actual 55 JSON files in the sources/makes/ directory.
+
+Performs comprehensive validation of:
+- Make name normalization (all 55 files)
+- Engine specification parsing (all engines across all files)
+- L→I normalization detection and verification
+- Electric vehicle handling (Tesla, Lucid, etc.)
+- Data quality reporting
+
+Usage:
+    python validate_utilities.py
+"""
+
+import json
+import os
+import re
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+# Add this script's own directory to sys.path so the local utils package can be imported
+sys.path.insert(0, str(Path(__file__).parent))
+
+from utils.make_name_mapper import MakeNameMapper
+from utils.engine_spec_parser import EngineSpecParser
+
+
+class ValidationResults:
+    """Container for validation results"""
+    
+    def __init__(self):
+        self.make_results = {}
+        self.engine_results = {}
+        self.l_to_i_cases = []
+        self.electric_vehicles = []
+        self.parsing_errors = []
+        self.quality_issues = []
+        
+    def add_make_result(self, filename: str, display_name: str, is_valid: bool):
+        """Add make validation result"""
+        self.make_results[filename] = {
+            'display_name': display_name,
+            'is_valid': is_valid
+        }
+    
+    def add_engine_result(self, make: str, engine_str: str, spec, warnings: List[str]):
+        """Add engine validation result"""
+        if make not in self.engine_results:
+            self.engine_results[make] = []
+            
+        self.engine_results[make].append({
+            'original': engine_str,
+            'parsed': spec,
+            'warnings': warnings
+        })
+    
+    def add_l_to_i_case(self, make: str, original: str, normalized: str):
+        """Record L→I normalization case"""
+        self.l_to_i_cases.append({
+            'make': make,
+            'original': original,
+            'normalized': normalized
+        })
+    
+    def add_electric_vehicle(self, make: str, model: str, year: str):
+        """Record electric vehicle (empty engines)"""
+        self.electric_vehicles.append({
+            'make': make,
+            'model': model,
+            'year': year
+        })
+    
+    def add_parsing_error(self, make: str, engine_str: str, error: str):
+        """Record parsing error"""
+        self.parsing_errors.append({
+            'make': make,
+            'engine': engine_str,
+            'error': error
+        })
+
+
+def validate_json_files(json_dir: str) -> ValidationResults:
+    """
+    Validate utilities against all JSON files
+    
+    Args:
+        json_dir: Directory containing make JSON files
+        
+    Returns:
+        ValidationResults object with all findings
+    """
+    results = ValidationResults()
+    
+    # Initialize utilities
+    make_mapper = MakeNameMapper(sources_dir="sources")
+    engine_parser = EngineSpecParser()
+    
+    # Find all JSON files
+    json_files = list(Path(json_dir).glob("*.json"))
+    print(f"🔍 Validating against {len(json_files)} JSON files...")
+    
+    for json_file in json_files:
+        filename = json_file.name
+        print(f"\n📄 Processing {filename}...")
+        
+        try:
+            # Validate make name mapping
+            display_name = make_mapper.normalize_make_name(filename)
+            is_valid = make_mapper.validate_mapping(filename, display_name)
+            results.add_make_result(filename, display_name, is_valid)
+            
+            if not is_valid:
+                print(f"  ⚠️  Make name validation failed: {filename} → {display_name}")
+            else:
+                print(f"  ✅ Make name: {filename} → {display_name}")
+            
+            # Load and parse JSON content
+            with open(json_file, 'r', encoding='utf-8') as f:
+                json_data = json.load(f)
+            
+            # Get make key (should be first key)
+            if not json_data:
+                print(f"  ⚠️  Empty JSON file: {filename}")
+                continue
+                
+            make_key = list(json_data.keys())[0]
+            make_data = json_data[make_key]
+            
+            # Process all engines in this make
+            total_engines = 0
+            parsed_engines = 0
+            empty_engine_models = 0
+            
+            for year_entry in make_data:
+                year = year_entry.get('year', 'unknown')
+                
+                for model_entry in year_entry.get('models', []):
+                    model_name = model_entry.get('name', 'unknown')
+                    engines_list = model_entry.get('engines', [])
+                    
+                    if not engines_list:
+                        # Electric vehicle case
+                        results.add_electric_vehicle(display_name, model_name, year)
+                        empty_engine_models += 1
+                        
+                        # Test electric motor creation
+                        electric_spec = engine_parser.create_electric_motor()
+                        results.add_engine_result(display_name, "EMPTY_ARRAY", electric_spec, [])
+                    else:
+                        # Parse each engine
+                        for engine_str in engines_list:
+                            total_engines += 1
+                            
+                            try:
+                                spec = engine_parser.parse_engine_string(engine_str)
+                                warnings = engine_parser.validate_engine_spec(spec)
+                                
+                                results.add_engine_result(display_name, engine_str, spec, warnings)
+                                parsed_engines += 1
+                                
+                                # Check for L→I normalization
+                                if 'L' in engine_str and spec.configuration == 'I' and 'ELECTRIC' not in engine_str.upper():
+                                    normalized_str = engine_str.replace('L3', 'I3').replace('L4', 'I4').replace(' L', ' I')
+                                    results.add_l_to_i_case(display_name, engine_str, normalized_str)
+                                
+                            except Exception as e:
+                                results.add_parsing_error(display_name, engine_str, str(e))
+            
+            print(f"  📊 Engines: {parsed_engines}/{total_engines} parsed successfully")
+            if empty_engine_models > 0:
+                print(f"  ⚡ Electric vehicles: {empty_engine_models} models with empty engines")
+                
+        except Exception as e:
+            print(f"  ❌ Failed to process {filename}: {e}")
+            results.add_parsing_error(filename, "FILE_PROCESSING", str(e))
+    
+    return results
+
+
+def generate_validation_report(results: ValidationResults) -> None:
+    """Generate comprehensive validation report"""
+    
+    print("\n" + "="*60)
+    print("📋 COMPREHENSIVE VALIDATION REPORT")
+    print("="*60)
+    
+    # Make Name Validation Summary
+    total_makes = len(results.make_results)
+    valid_makes = sum(1 for r in results.make_results.values() if r['is_valid'])
+    
+    print(f"\n🏷️  MAKE NAME VALIDATION")
+    print(f"   Total files: {total_makes}")
+    print(f"   Valid mappings: {valid_makes}")
+    print(f"   Success rate: {valid_makes/total_makes:.1%}")
+    
+    # Show invalid makes
+    invalid_makes = [(f, r['display_name']) for f, r in results.make_results.items() if not r['is_valid']]
+    if invalid_makes:
+        print(f"\n   ⚠️  Invalid makes ({len(invalid_makes)}):")
+        for filename, display_name in invalid_makes:
+            print(f"      {filename} → {display_name}")
+    else:
+        print(f"   ✅ All make names are valid!")
+    
+    # Engine Parsing Summary
+    total_engines = sum(len(engines) for engines in results.engine_results.values())
+    successful_parses = sum(1 for engines in results.engine_results.values() 
+                          for engine in engines if engine['parsed'].configuration != 'Unknown')
+    
+    print(f"\n🔧 ENGINE SPECIFICATION PARSING")
+    print(f"   Total engines: {total_engines}")
+    print(f"   Successfully parsed: {successful_parses}")
+    print(f"   Success rate: {successful_parses/total_engines:.1%}")
+    
+    # L→I Normalization Cases
+    print(f"\n🎯 L→I NORMALIZATION (CRITICAL)")
+    print(f"   Cases found: {len(results.l_to_i_cases)}")
+    
+    if results.l_to_i_cases:
+        print(f"   Examples:")
+        for case in results.l_to_i_cases[:10]:  # Show first 10
+            print(f"      {case['make']}: '{case['original']}' → '{case['normalized']}'")
+        
+        if len(results.l_to_i_cases) > 10:
+            print(f"      ... and {len(results.l_to_i_cases) - 10} more cases")
+    else:
+        print(f"   ⚠️  No L→I normalization cases found in data")
+    
+    # Electric Vehicle Handling
+    print(f"\n⚡ ELECTRIC VEHICLE HANDLING")
+    print(f"   Models with empty engines: {len(results.electric_vehicles)}")
+    
+    if results.electric_vehicles:
+        # Group by make
+        ev_by_make = defaultdict(list)
+        for ev in results.electric_vehicles:
+            ev_by_make[ev['make']].append(f"{ev['year']} {ev['model']}")
+        
+        print(f"   Electric vehicles by make:")
+        for make, models in ev_by_make.items():
+            print(f"      {make}: {len(models)} models")
+            if make.lower() in ['tesla', 'lucid', 'rivian']:  # Show details for known EVs
+                for model in models[:5]:  # Show first 5
+                    print(f"         {model}")
+                if len(models) > 5:
+                    print(f"         ... and {len(models) - 5} more")
+    
+    # Configuration Distribution
+    print(f"\n📊 ENGINE CONFIGURATION DISTRIBUTION")
+    config_counts = defaultdict(int)
+    for engines in results.engine_results.values():
+        for engine in engines:
+            config_counts[engine['parsed'].configuration] += 1
+    
+    for config, count in sorted(config_counts.items(), key=lambda x: x[1], reverse=True):
+        percentage = count / total_engines * 100 if total_engines > 0 else 0
+        print(f"   {config}: {count} engines ({percentage:.1f}%)")
+    
+    # Fuel Type Distribution
+    print(f"\n⛽ FUEL TYPE DISTRIBUTION")
+    fuel_counts = defaultdict(int)
+    for engines in results.engine_results.values():
+        for engine in engines:
+            fuel_counts[engine['parsed'].fuel_type] += 1
+    
+    for fuel, count in sorted(fuel_counts.items(), key=lambda x: x[1], reverse=True):
+        percentage = count / total_engines * 100 if total_engines > 0 else 0
+        print(f"   {fuel}: {count} engines ({percentage:.1f}%)")
+    
+    # Data Quality Issues
+    if results.parsing_errors:
+        print(f"\n❌ PARSING ERRORS ({len(results.parsing_errors)})")
+        error_types = defaultdict(list)
+        for error in results.parsing_errors:
+            error_types[error['error']].append(error)
+        
+        for error_type, errors in error_types.items():
+            print(f"   {error_type}: {len(errors)} occurrences")
+            for error in errors[:3]:  # Show first 3 examples
+                if error.get('engine'):
+                    print(f"      {error['make']}: '{error['engine']}'")
+                else:
+                    print(f"      {error['make']}")
+    
+    # Overall Assessment
+    print(f"\n🎉 OVERALL ASSESSMENT")
+    
+    if valid_makes == total_makes:
+        print(f"   ✅ Make name normalization: PERFECT")
+    else:
+        print(f"   ⚠️  Make name normalization: {valid_makes/total_makes:.1%}")
+    
+    if successful_parses / total_engines >= 0.95:  # 95% threshold
+        print(f"   ✅ Engine parsing: EXCELLENT ({successful_parses/total_engines:.1%})")
+    elif successful_parses / total_engines >= 0.85:  # 85% threshold  
+        print(f"   ✅ Engine parsing: GOOD ({successful_parses/total_engines:.1%})")
+    else:
+        print(f"   ⚠️  Engine parsing: NEEDS IMPROVEMENT ({successful_parses/total_engines:.1%})")
+    
+    if results.l_to_i_cases:
+        print(f"   ✅ L→I normalization: WORKING ({len(results.l_to_i_cases)} cases)")
+    else:
+        print(f"   ℹ️  L→I normalization: NO CASES FOUND (may be normal)")
+    
+    if results.electric_vehicles:
+        print(f"   ✅ Electric vehicle handling: WORKING ({len(results.electric_vehicles)} models)")
+    else:
+        print(f"   ℹ️  Electric vehicle handling: NO EMPTY ENGINES FOUND")
+
+
+def main():
+    """Main validation script"""
+    # Determine JSON files directory
+    current_dir = Path(__file__).parent
+    json_dir = current_dir / "sources" / "makes"
+    
+    if not json_dir.exists():
+        print(f"❌ JSON files directory not found: {json_dir}")
+        print(f"   Current directory: {current_dir}")
+        print(f"   Looking for: sources/makes/ directory")
+        return 1
+    
+    print(f"🚀 Starting validation against: {json_dir}")
+    
+    # Run validation
+    results = validate_json_files(str(json_dir))
+    
+    # Generate report
+    generate_validation_report(results)
+    
+    # Return success/failure code
+    total_makes = len(results.make_results)
+    valid_makes = sum(1 for r in results.make_results.values() if r['is_valid'])
+    
+    if valid_makes == total_makes and not results.parsing_errors:
+        print(f"\n🎉 VALIDATION PASSED - Ready for implementation!")
+        return 0
+    else:
+        print(f"\n⚠️  VALIDATION ISSUES FOUND - Review before proceeding")
+        return 1
+
+
+# Script entry point: hand main()'s return value (0 = passed, 1 = issues found)
+# to sys.exit so it becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())