Initial Commit

This commit is contained in:
Eric Gullickson
2025-09-17 16:09:15 -05:00
parent 0cdb9803de
commit a052040e3a
373 changed files with 437090 additions and 6773 deletions

View File

@@ -0,0 +1,349 @@
#!/usr/bin/env python3
"""
Validate Utilities Against Actual JSON Files
This script validates the MakeNameMapper and EngineSpecParser utilities
against the actual 55 JSON files in the sources/makes/ directory.
Performs comprehensive validation of:
- Make name normalization (all 55 files)
- Engine specification parsing (all engines across all files)
- L→I normalization detection and verification
- Electric vehicle handling (Tesla, Lucid, etc.)
- Data quality reporting
Usage:
python validate_utilities.py
"""
import json
import os
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
from utils.make_name_mapper import MakeNameMapper
from utils.engine_spec_parser import EngineSpecParser
class ValidationResults:
    """Accumulator for everything a validation run discovers.

    Attributes are plain containers that the ``add_*`` methods append to:

    - ``make_results``: filename -> {'display_name', 'is_valid'}
    - ``engine_results``: make -> list of {'original', 'parsed', 'warnings'}
    - ``l_to_i_cases``: list of {'make', 'original', 'normalized'}
    - ``electric_vehicles``: list of {'make', 'model', 'year'}
    - ``parsing_errors``: list of {'make', 'engine', 'error'}
    - ``quality_issues``: list, created empty (no method here fills it)
    """

    def __init__(self):
        self.make_results = {}
        self.engine_results = {}
        self.l_to_i_cases = []
        self.electric_vehicles = []
        self.parsing_errors = []
        self.quality_issues = []

    def add_make_result(self, filename: str, display_name: str, is_valid: bool):
        """Store the make-name outcome for ``filename`` (overwrites any earlier entry)."""
        outcome = dict(display_name=display_name, is_valid=is_valid)
        self.make_results[filename] = outcome

    def add_engine_result(self, make: str, engine_str: str, spec, warnings: List[str]):
        """Append one parsed-engine record to the list kept for ``make``."""
        record = dict(original=engine_str, parsed=spec, warnings=warnings)
        # setdefault creates the per-make list the first time a make is seen.
        self.engine_results.setdefault(make, []).append(record)

    def add_l_to_i_case(self, make: str, original: str, normalized: str):
        """Remember an engine string whose L configuration was read as I."""
        case = dict(make=make, original=original, normalized=normalized)
        self.l_to_i_cases.append(case)

    def add_electric_vehicle(self, make: str, model: str, year: str):
        """Remember a model/year whose engines list was empty."""
        vehicle = dict(make=make, model=model, year=year)
        self.electric_vehicles.append(vehicle)

    def add_parsing_error(self, make: str, engine_str: str, error: str):
        """Remember a failure, keyed by make, offending engine string and message."""
        failure = dict(make=make, engine=engine_str, error=error)
        self.parsing_errors.append(failure)
# Matches an "L" that begins a token and is directly followed by a digit
# (the configuration letter in "L4"), while skipping the displacement suffix
# in "2.0L" and any "L" embedded in a longer word.
_L_CONFIG_TOKEN = re.compile(r'(?<![A-Za-z0-9.])L(?=\d)')


def _normalize_l_config(engine_str: str) -> str:
    """Rewrite inline-configuration tokens such as "L4" to "I4" in engine_str."""
    return _L_CONFIG_TOKEN.sub('I', engine_str)


def validate_json_files(json_dir: str) -> ValidationResults:
    """
    Validate utilities against all JSON files

    Every ``*.json`` file directly inside ``json_dir`` is processed in sorted
    order. For each file the make name is normalized and validated, then the
    value under the file's first top-level key is walked as a sequence of
    year entries (``'year'``, ``'models'``), each model carrying ``'name'``
    and ``'engines'``. Engine strings are parsed and validated one by one;
    models with an empty engines list are recorded as electric vehicles.

    Failures are never raised to the caller: a failing engine string or a
    failing file is recorded through ``results.add_parsing_error``.

    Args:
        json_dir: Directory containing make JSON files

    Returns:
        ValidationResults object with all findings
    """
    results = ValidationResults()

    # Initialize utilities
    # NOTE(review): sources_dir is a cwd-relative literal that does not follow
    # json_dir — confirm this is intended when running from another directory.
    make_mapper = MakeNameMapper(sources_dir="sources")
    engine_parser = EngineSpecParser()

    # glob() yields files in no guaranteed order; sort so the log and the
    # collected results are reproducible between runs.
    json_files = sorted(Path(json_dir).glob("*.json"))
    print(f"🔍 Validating against {len(json_files)} JSON files...")

    for json_file in json_files:
        filename = json_file.name
        print(f"\n📄 Processing {filename}...")
        try:
            # Validate make name mapping
            display_name = make_mapper.normalize_make_name(filename)
            is_valid = make_mapper.validate_mapping(filename, display_name)
            results.add_make_result(filename, display_name, is_valid)
            if not is_valid:
                print(f" ⚠️ Make name validation failed: {filename} → {display_name}")
            else:
                print(f" ✅ Make name: {filename} → {display_name}")

            # Load and parse JSON content
            with open(json_file, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            if not json_data:
                print(f" ⚠️ Empty JSON file: {filename}")
                continue

            # The make's data lives under the first top-level key
            make_key = next(iter(json_data))
            make_data = json_data[make_key]

            # Process all engines in this make
            total_engines = 0
            parsed_engines = 0
            empty_engine_models = 0
            for year_entry in make_data:
                year = year_entry.get('year', 'unknown')
                for model_entry in year_entry.get('models', []):
                    model_name = model_entry.get('name', 'unknown')
                    engines_list = model_entry.get('engines', [])

                    if not engines_list:
                        # Electric vehicle case: no engines listed for this model
                        results.add_electric_vehicle(display_name, model_name, year)
                        empty_engine_models += 1
                        # Test electric motor creation
                        electric_spec = engine_parser.create_electric_motor()
                        results.add_engine_result(display_name, "EMPTY_ARRAY", electric_spec, [])
                        continue

                    # Parse each engine
                    for engine_str in engines_list:
                        total_engines += 1
                        try:
                            spec = engine_parser.parse_engine_string(engine_str)
                            warnings = engine_parser.validate_engine_spec(spec)
                            results.add_engine_result(display_name, engine_str, spec, warnings)
                            parsed_engines += 1

                            # Check for L→I normalization. Only count strings
                            # that really contain an "L<n>" token: a bare
                            # "'L' in engine_str" test also matches the
                            # displacement suffix of e.g. "2.0L I4".
                            if spec.configuration == 'I' and 'ELECTRIC' not in engine_str.upper():
                                normalized_str = _normalize_l_config(engine_str)
                                if normalized_str != engine_str:
                                    results.add_l_to_i_case(display_name, engine_str, normalized_str)
                        except Exception as e:
                            # Keep going: one bad engine string must not abort the file
                            results.add_parsing_error(display_name, engine_str, str(e))

            print(f" 📊 Engines: {parsed_engines}/{total_engines} parsed successfully")
            if empty_engine_models > 0:
                print(f" ⚡ Electric vehicles: {empty_engine_models} models with empty engines")
        except Exception as e:
            # Keep going: one unreadable or malformed file must not abort the run
            print(f" ❌ Failed to process {filename}: {e}")
            results.add_parsing_error(filename, "FILE_PROCESSING", str(e))

    return results
def _format_rate(numerator: int, denominator: int) -> str:
    """Format numerator/denominator as a percentage, or "N/A" when nothing was counted."""
    if denominator == 0:
        return "N/A"
    return f"{numerator / denominator:.1%}"


def generate_validation_report(results: "ValidationResults") -> None:
    """Print the comprehensive validation report to stdout.

    Sections, in order: make-name validation, engine parsing, L→I
    normalization, electric vehicles, configuration and fuel-type
    distributions, parsing errors (only when present) and an overall
    assessment. An engine counts as successfully parsed when its parsed
    spec's ``configuration`` is not ``'Unknown'``.

    Empty results (no make files, or no engines) are reported explicitly
    instead of dividing by zero.

    Args:
        results: Object exposing ``make_results``, ``engine_results``,
            ``l_to_i_cases``, ``electric_vehicles`` and ``parsing_errors``
            in the shapes produced by ValidationResults.
    """
    print("\n" + "="*60)
    print("📋 COMPREHENSIVE VALIDATION REPORT")
    print("="*60)

    # Make Name Validation Summary
    total_makes = len(results.make_results)
    valid_makes = sum(1 for r in results.make_results.values() if r['is_valid'])
    make_rate = _format_rate(valid_makes, total_makes)
    print("\n🏷️ MAKE NAME VALIDATION")
    print(f" Total files: {total_makes}")
    print(f" Valid mappings: {valid_makes}")
    print(f" Success rate: {make_rate}")

    # Show invalid makes
    invalid_makes = [
        (f, r['display_name'])
        for f, r in results.make_results.items()
        if not r['is_valid']
    ]
    if invalid_makes:
        print(f"\n ⚠️ Invalid makes ({len(invalid_makes)}):")
        for filename, display_name in invalid_makes:
            print(f" {filename} → {display_name}")
    elif total_makes:
        # Only claim success when at least one file was actually checked
        print(" ✅ All make names are valid!")

    # Engine Parsing Summary
    all_engines = [
        engine
        for engines in results.engine_results.values()
        for engine in engines
    ]
    total_engines = len(all_engines)
    successful_parses = sum(
        1 for engine in all_engines if engine['parsed'].configuration != 'Unknown'
    )
    engine_rate = _format_rate(successful_parses, total_engines)
    print("\n🔧 ENGINE SPECIFICATION PARSING")
    print(f" Total engines: {total_engines}")
    print(f" Successfully parsed: {successful_parses}")
    print(f" Success rate: {engine_rate}")

    # L→I Normalization Cases
    print("\n🎯 L→I NORMALIZATION (CRITICAL)")
    print(f" Cases found: {len(results.l_to_i_cases)}")
    if results.l_to_i_cases:
        print(" Examples:")
        for case in results.l_to_i_cases[:10]:  # Show first 10
            print(f" {case['make']}: '{case['original']}' → '{case['normalized']}'")
        if len(results.l_to_i_cases) > 10:
            print(f" ... and {len(results.l_to_i_cases) - 10} more cases")
    else:
        print(" ⚠️ No L→I normalization cases found in data")

    # Electric Vehicle Handling
    print("\n⚡ ELECTRIC VEHICLE HANDLING")
    print(f" Models with empty engines: {len(results.electric_vehicles)}")
    if results.electric_vehicles:
        # Group by make
        ev_by_make = defaultdict(list)
        for ev in results.electric_vehicles:
            ev_by_make[ev['make']].append(f"{ev['year']} {ev['model']}")
        print(" Electric vehicles by make:")
        for make, models in ev_by_make.items():
            print(f" {make}: {len(models)} models")
            if make.lower() in ['tesla', 'lucid', 'rivian']:  # Show details for known EVs
                for model in models[:5]:  # Show first 5
                    print(f" {model}")
                if len(models) > 5:
                    print(f" ... and {len(models) - 5} more")

    # Configuration Distribution
    print("\n📊 ENGINE CONFIGURATION DISTRIBUTION")
    config_counts = defaultdict(int)
    for engine in all_engines:
        config_counts[engine['parsed'].configuration] += 1
    for config, count in sorted(config_counts.items(), key=lambda x: x[1], reverse=True):
        percentage = count / total_engines * 100 if total_engines > 0 else 0
        print(f" {config}: {count} engines ({percentage:.1f}%)")

    # Fuel Type Distribution
    print("\n⛽ FUEL TYPE DISTRIBUTION")
    fuel_counts = defaultdict(int)
    for engine in all_engines:
        fuel_counts[engine['parsed'].fuel_type] += 1
    for fuel, count in sorted(fuel_counts.items(), key=lambda x: x[1], reverse=True):
        percentage = count / total_engines * 100 if total_engines > 0 else 0
        print(f" {fuel}: {count} engines ({percentage:.1f}%)")

    # Data Quality Issues
    if results.parsing_errors:
        print(f"\n❌ PARSING ERRORS ({len(results.parsing_errors)})")
        # Group identical error messages so each is listed once with a count
        error_types = defaultdict(list)
        for error in results.parsing_errors:
            error_types[error['error']].append(error)
        for error_type, errors in error_types.items():
            print(f" {error_type}: {len(errors)} occurrences")
            for error in errors[:3]:  # Show first 3 examples
                if error.get('engine'):
                    print(f" {error['make']}: '{error['engine']}'")
                else:
                    print(f" {error['make']}")

    # Overall Assessment
    print("\n🎉 OVERALL ASSESSMENT")
    if total_makes == 0:
        # 0 == 0 would otherwise be reported as a perfect score
        print(" ⚠️ Make name normalization: NO FILES VALIDATED")
    elif valid_makes == total_makes:
        print(" ✅ Make name normalization: PERFECT")
    else:
        print(f" ⚠️ Make name normalization: {make_rate}")

    if total_engines == 0:
        print(" ⚠️ Engine parsing: NO ENGINES FOUND")
    else:
        parse_ratio = successful_parses / total_engines
        if parse_ratio >= 0.95:  # 95% threshold
            print(f" ✅ Engine parsing: EXCELLENT ({engine_rate})")
        elif parse_ratio >= 0.85:  # 85% threshold
            print(f" ✅ Engine parsing: GOOD ({engine_rate})")
        else:
            print(f" ⚠️ Engine parsing: NEEDS IMPROVEMENT ({engine_rate})")

    if results.l_to_i_cases:
        print(f" ✅ L→I normalization: WORKING ({len(results.l_to_i_cases)} cases)")
    else:
        print(" L→I normalization: NO CASES FOUND (may be normal)")

    if results.electric_vehicles:
        print(f" ✅ Electric vehicle handling: WORKING ({len(results.electric_vehicles)} models)")
    else:
        print(" Electric vehicle handling: NO EMPTY ENGINES FOUND")
def main() -> int:
    """Run the validation against ``sources/makes`` next to this script.

    Returns:
        0 when at least one make file was validated, every make mapping is
        valid and no parsing errors were recorded; 1 otherwise (including a
        missing directory or a run that validated no files).
    """
    # Determine JSON files directory
    current_dir = Path(__file__).parent
    json_dir = current_dir / "sources" / "makes"
    if not json_dir.is_dir():
        print(f"❌ JSON files directory not found: {json_dir}")
        print(f" Current directory: {current_dir}")
        print(" Looking for: sources/makes/ directory")
        return 1

    print(f"🚀 Starting validation against: {json_dir}")

    # Run validation
    results = validate_json_files(str(json_dir))

    # A run that validated nothing must not pass vacuously (0 == 0 valid
    # makes); bail out before the report, which has nothing to summarize.
    total_makes = len(results.make_results)
    if total_makes == 0:
        print(f"\n❌ No make files were validated in: {json_dir}")
        return 1

    # Generate report
    generate_validation_report(results)

    # Return success/failure code
    valid_makes = sum(1 for r in results.make_results.values() if r['is_valid'])
    if valid_makes == total_makes and not results.parsing_errors:
        print("\n🎉 VALIDATION PASSED - Ready for implementation!")
        return 0

    print("\n⚠️ VALIDATION ISSUES FOUND - Review before proceeding")
    return 1
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    raise SystemExit(main())