349 lines
13 KiB
Python
349 lines
13 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Validate Utilities Against Actual JSON Files
|
||
|
||
This script validates the MakeNameMapper and EngineSpecParser utilities
|
||
against the actual 55 JSON files in the sources/makes/ directory.
|
||
|
||
Performs comprehensive validation of:
|
||
- Make name normalization (all 55 files)
|
||
- Engine specification parsing (all engines across all files)
|
||
- L→I normalization detection and verification
|
||
- Electric vehicle handling (Tesla, Lucid, etc.)
|
||
- Data quality reporting
|
||
|
||
Usage:
|
||
python validate_utilities.py
|
||
"""
|
||
|
||
import json
|
||
import os
|
||
import sys
|
||
from pathlib import Path
|
||
from typing import Dict, List, Tuple
|
||
from collections import defaultdict
|
||
|
||
# Add parent directory to path for imports
|
||
sys.path.insert(0, str(Path(__file__).parent))
|
||
|
||
from utils.make_name_mapper import MakeNameMapper
|
||
from utils.engine_spec_parser import EngineSpecParser
|
||
|
||
|
||
class ValidationResults:
    """Accumulator for the findings of one validation run.

    Every ``add_*`` method simply stores a small dict describing one finding;
    no validation logic lives here.
    """

    def __init__(self):
        # filename -> {'display_name': ..., 'is_valid': ...}
        self.make_results = {}
        # make -> list of {'original': ..., 'parsed': ..., 'warnings': ...}
        self.engine_results = {}
        # Flat lists of finding dicts, in the order they were recorded.
        self.l_to_i_cases = []
        self.electric_vehicles = []
        self.parsing_errors = []
        # Initialised empty; nothing in this class appends to it.
        self.quality_issues = []

    def add_make_result(self, filename: str, display_name: str, is_valid: bool):
        """Store (or overwrite) the make-name outcome for ``filename``."""
        self.make_results[filename] = dict(
            display_name=display_name,
            is_valid=is_valid,
        )

    def add_engine_result(self, make: str, engine_str: str, spec, warnings: List[str]):
        """Append one parsed engine entry to the list kept for ``make``."""
        per_make = self.engine_results.setdefault(make, [])
        per_make.append(dict(original=engine_str, parsed=spec, warnings=warnings))

    def add_l_to_i_case(self, make: str, original: str, normalized: str):
        """Remember an engine string together with its L→I normalized form."""
        entry = dict(make=make, original=original, normalized=normalized)
        self.l_to_i_cases.append(entry)

    def add_electric_vehicle(self, make: str, model: str, year: str):
        """Remember a model whose engines list was empty."""
        entry = dict(make=make, model=model, year=year)
        self.electric_vehicles.append(entry)

    def add_parsing_error(self, make: str, engine_str: str, error: str):
        """Remember a failure; ``engine_str`` is stored under the 'engine' key."""
        entry = dict(make=make, engine=engine_str, error=error)
        self.parsing_errors.append(entry)
|
||
|
||
|
||
def validate_json_files(json_dir: str) -> "ValidationResults":
    """
    Validate the make-name and engine-spec utilities against every JSON file.

    For each ``*.json`` file in ``json_dir`` (processed in sorted order so the
    output is reproducible) the filename is run through the make-name mapper,
    then every engine string found in the file is run through the engine
    parser. Models whose ``engines`` list is empty are recorded as electric
    vehicles. Progress is printed as the files are processed.

    Args:
        json_dir: Directory containing make JSON files

    Returns:
        ValidationResults object with all findings
    """
    # Local import: the file's import block does not provide ``re``.
    import re

    # Matches the "L" of an inline-engine token such as "L4" or "L3": an "L"
    # that starts a word and is directly followed by a digit. The litre suffix
    # in "2.0L" does not match (no word boundary before it), nor do words such
    # as "LPG" (no digit after the "L").
    inline_l_token = re.compile(r'\bL(?=\d)')

    results = ValidationResults()

    # Initialize utilities
    # NOTE(review): "sources" is relative to the current working directory,
    # while json_dir is supplied by the caller — confirm the two agree when the
    # script is launched from another directory.
    make_mapper = MakeNameMapper(sources_dir="sources")
    engine_parser = EngineSpecParser()

    # Find all JSON files; glob order is arbitrary, so sort for stable output.
    json_files = sorted(Path(json_dir).glob("*.json"))
    print(f"🔍 Validating against {len(json_files)} JSON files...")

    for json_file in json_files:
        filename = json_file.name
        print(f"\n📄 Processing {filename}...")

        try:
            # Validate make name mapping
            display_name = make_mapper.normalize_make_name(filename)
            is_valid = make_mapper.validate_mapping(filename, display_name)
            results.add_make_result(filename, display_name, is_valid)

            if not is_valid:
                print(f" ⚠️ Make name validation failed: {filename} → {display_name}")
            else:
                print(f" ✅ Make name: {filename} → {display_name}")

            # Load and parse JSON content
            with open(json_file, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            if not json_data:
                print(f" ⚠️ Empty JSON file: {filename}")
                continue

            # The make's data is stored under the first top-level key.
            make_key = next(iter(json_data))
            make_data = json_data[make_key]

            # Process all engines in this make
            total_engines = 0
            parsed_engines = 0
            empty_engine_models = 0

            for year_entry in make_data:
                year = year_entry.get('year', 'unknown')

                for model_entry in year_entry.get('models', []):
                    model_name = model_entry.get('name', 'unknown')
                    engines_list = model_entry.get('engines', [])

                    if not engines_list:
                        # Electric vehicle case
                        results.add_electric_vehicle(display_name, model_name, year)
                        empty_engine_models += 1

                        # Test electric motor creation
                        electric_spec = engine_parser.create_electric_motor()
                        results.add_engine_result(display_name, "EMPTY_ARRAY", electric_spec, [])
                        continue

                    # Parse each engine
                    for engine_str in engines_list:
                        total_engines += 1

                        try:
                            spec = engine_parser.parse_engine_string(engine_str)
                            warnings = engine_parser.validate_engine_spec(spec)

                            # Read everything that can fail BEFORE recording
                            # success, so one engine is never counted both as
                            # parsed and as a parsing error.
                            is_inline = spec.configuration == 'I'
                            is_electric = 'ELECTRIC' in engine_str.upper()
                            normalized_str = inline_l_token.sub('I', engine_str)

                            results.add_engine_result(display_name, engine_str, spec, warnings)
                            parsed_engines += 1

                            # Check for L→I normalization: only a real case
                            # when an "L<digit>" token was actually rewritten.
                            if is_inline and not is_electric and normalized_str != engine_str:
                                results.add_l_to_i_case(display_name, engine_str, normalized_str)

                        except Exception as e:
                            # Best effort: one bad engine string must not stop
                            # the rest of the file from being validated.
                            results.add_parsing_error(display_name, engine_str, str(e))

            print(f" 📊 Engines: {parsed_engines}/{total_engines} parsed successfully")
            if empty_engine_models > 0:
                print(f" ⚡ Electric vehicles: {empty_engine_models} models with empty engines")

        except Exception as e:
            # Best effort per file: record the failure and move on to the next.
            print(f" ❌ Failed to process {filename}: {e}")
            results.add_parsing_error(filename, "FILE_PROCESSING", str(e))

    return results
|
||
|
||
|
||
def _safe_ratio(part: int, whole: int) -> float:
    """Return ``part / whole``, or 0.0 when ``whole`` is zero."""
    return part / whole if whole else 0.0


def generate_validation_report(results: "ValidationResults") -> None:
    """Print a comprehensive validation report to stdout.

    Sections, in order: make-name validation, engine parsing, L→I
    normalization cases, electric vehicles, configuration and fuel-type
    distributions, parsing errors (only when there are any) and an overall
    assessment.

    The report tolerates empty results (no make files and/or no engines):
    every rate is shown as 0.0% instead of raising ``ZeroDivisionError``.

    Args:
        results: Object exposing ``make_results``, ``engine_results``,
            ``l_to_i_cases``, ``electric_vehicles`` and ``parsing_errors``.
            Each ``'parsed'`` engine entry must expose ``configuration``
            and ``fuel_type`` attributes.
    """
    print("\n" + "="*60)
    print("📋 COMPREHENSIVE VALIDATION REPORT")
    print("="*60)

    # Make Name Validation Summary
    total_makes = len(results.make_results)
    valid_makes = sum(1 for r in results.make_results.values() if r['is_valid'])
    make_rate = _safe_ratio(valid_makes, total_makes)

    print("\n🏷️ MAKE NAME VALIDATION")
    print(f" Total files: {total_makes}")
    print(f" Valid mappings: {valid_makes}")
    print(f" Success rate: {make_rate:.1%}")

    # Show invalid makes
    invalid_makes = [
        (f, r['display_name'])
        for f, r in results.make_results.items()
        if not r['is_valid']
    ]
    if invalid_makes:
        print(f"\n ⚠️ Invalid makes ({len(invalid_makes)}):")
        for filename, display_name in invalid_makes:
            print(f" {filename} → {display_name}")
    else:
        print(" ✅ All make names are valid!")

    # Engine Parsing Summary
    # An engine counts as parsed when its configuration is not 'Unknown'.
    total_engines = sum(len(engines) for engines in results.engine_results.values())
    successful_parses = sum(
        1
        for engines in results.engine_results.values()
        for engine in engines
        if engine['parsed'].configuration != 'Unknown'
    )
    engine_rate = _safe_ratio(successful_parses, total_engines)

    print("\n🔧 ENGINE SPECIFICATION PARSING")
    print(f" Total engines: {total_engines}")
    print(f" Successfully parsed: {successful_parses}")
    print(f" Success rate: {engine_rate:.1%}")

    # L→I Normalization Cases
    print("\n🎯 L→I NORMALIZATION (CRITICAL)")
    print(f" Cases found: {len(results.l_to_i_cases)}")

    if results.l_to_i_cases:
        print(" Examples:")
        for case in results.l_to_i_cases[:10]:  # Show first 10
            print(f" {case['make']}: '{case['original']}' → '{case['normalized']}'")

        if len(results.l_to_i_cases) > 10:
            print(f" ... and {len(results.l_to_i_cases) - 10} more cases")
    else:
        print(" ⚠️ No L→I normalization cases found in data")

    # Electric Vehicle Handling
    print("\n⚡ ELECTRIC VEHICLE HANDLING")
    print(f" Models with empty engines: {len(results.electric_vehicles)}")

    if results.electric_vehicles:
        # Group by make
        ev_by_make = defaultdict(list)
        for ev in results.electric_vehicles:
            ev_by_make[ev['make']].append(f"{ev['year']} {ev['model']}")

        print(" Electric vehicles by make:")
        for make, models in ev_by_make.items():
            print(f" {make}: {len(models)} models")
            if make.lower() in ['tesla', 'lucid', 'rivian']:  # Show details for known EVs
                for model in models[:5]:  # Show first 5
                    print(f" {model}")
                if len(models) > 5:
                    print(f" ... and {len(models) - 5} more")

    # Configuration Distribution
    print("\n📊 ENGINE CONFIGURATION DISTRIBUTION")
    config_counts = defaultdict(int)
    for engines in results.engine_results.values():
        for engine in engines:
            config_counts[engine['parsed'].configuration] += 1

    for config, count in sorted(config_counts.items(), key=lambda x: x[1], reverse=True):
        percentage = _safe_ratio(count, total_engines) * 100
        print(f" {config}: {count} engines ({percentage:.1f}%)")

    # Fuel Type Distribution
    print("\n⛽ FUEL TYPE DISTRIBUTION")
    fuel_counts = defaultdict(int)
    for engines in results.engine_results.values():
        for engine in engines:
            fuel_counts[engine['parsed'].fuel_type] += 1

    for fuel, count in sorted(fuel_counts.items(), key=lambda x: x[1], reverse=True):
        percentage = _safe_ratio(count, total_engines) * 100
        print(f" {fuel}: {count} engines ({percentage:.1f}%)")

    # Data Quality Issues
    if results.parsing_errors:
        print(f"\n❌ PARSING ERRORS ({len(results.parsing_errors)})")
        # Group identical error messages so each is listed once with a count.
        error_types = defaultdict(list)
        for error in results.parsing_errors:
            error_types[error['error']].append(error)

        for error_type, errors in error_types.items():
            print(f" {error_type}: {len(errors)} occurrences")
            for error in errors[:3]:  # Show first 3 examples
                if error.get('engine'):
                    print(f" {error['make']}: '{error['engine']}'")
                else:
                    print(f" {error['make']}")

    # Overall Assessment
    print("\n🎉 OVERALL ASSESSMENT")

    if valid_makes == total_makes:
        print(" ✅ Make name normalization: PERFECT")
    else:
        print(f" ⚠️ Make name normalization: {make_rate:.1%}")

    if total_engines == 0:
        # Nothing to rate; avoid dividing by zero and avoid a misleading grade.
        print(" ℹ️ Engine parsing: NO ENGINES FOUND")
    elif engine_rate >= 0.95:  # 95% threshold
        print(f" ✅ Engine parsing: EXCELLENT ({engine_rate:.1%})")
    elif engine_rate >= 0.85:  # 85% threshold
        print(f" ✅ Engine parsing: GOOD ({engine_rate:.1%})")
    else:
        print(f" ⚠️ Engine parsing: NEEDS IMPROVEMENT ({engine_rate:.1%})")

    if results.l_to_i_cases:
        print(f" ✅ L→I normalization: WORKING ({len(results.l_to_i_cases)} cases)")
    else:
        print(" ℹ️ L→I normalization: NO CASES FOUND (may be normal)")

    if results.electric_vehicles:
        print(f" ✅ Electric vehicle handling: WORKING ({len(results.electric_vehicles)} models)")
    else:
        print(" ℹ️ Electric vehicle handling: NO EMPTY ENGINES FOUND")
|
||
|
||
|
||
def main():
    """Run the validation end to end and return a process exit code.

    Returns 0 when every make mapping is valid and no parsing error was
    recorded; returns 1 otherwise, including when the ``sources/makes``
    directory next to this script does not exist.
    """
    # The JSON files are expected in sources/makes/ beside this script.
    script_dir = Path(__file__).parent
    makes_dir = script_dir / "sources" / "makes"

    if not makes_dir.exists():
        print(f"❌ JSON files directory not found: {makes_dir}")
        print(f" Current directory: {script_dir}")
        print(" Looking for: sources/makes/ directory")
        return 1

    print(f"🚀 Starting validation against: {makes_dir}")

    # Collect the findings, then print the report.
    findings = validate_json_files(str(makes_dir))
    generate_validation_report(findings)

    # Success requires every make to be valid and no recorded errors.
    every_make_valid = all(
        outcome['is_valid'] for outcome in findings.make_results.values()
    )
    if not every_make_valid or findings.parsing_errors:
        print("\n⚠️ VALIDATION ISSUES FOUND - Review before proceeding")
        return 1

    print("\n🎉 VALIDATION PASSED - Ready for implementation!")
    return 0
|
||
|
||
|
||
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())