#!/usr/bin/env python3
"""
Validate Utilities Against Actual JSON Files

This script validates the MakeNameMapper and EngineSpecParser utilities against the
actual 55 JSON files in the sources/makes/ directory.

Performs comprehensive validation of:
- Make name normalization (all 55 files)
- Engine specification parsing (all engines across all files)
- L→I normalization detection and verification
- Electric vehicle handling (Tesla, Lucid, etc.)
- Data quality reporting

Usage:
    python validate_utilities.py
"""

import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Tuple
from collections import Counter, defaultdict

# Add this script's own directory to the import path so `utils` resolves
sys.path.insert(0, str(Path(__file__).parent))

from utils.make_name_mapper import MakeNameMapper
from utils.engine_spec_parser import EngineSpecParser


# Makes whose empty-engine models are listed individually in the report.
_KNOWN_EV_MAKES = frozenset({'tesla', 'lucid', 'rivian'})


class ValidationResults:
    """Container for validation results.

    Attributes are plain lists/dicts that the ``add_*`` methods append to and
    that ``generate_validation_report`` reads back.
    """

    def __init__(self):
        self.make_results = {}        # filename -> {'display_name', 'is_valid'}
        self.engine_results = {}      # make -> list of parsed-engine records
        self.l_to_i_cases = []        # engine strings rewritten from L* to I*
        self.electric_vehicles = []   # models whose 'engines' list was empty
        self.parsing_errors = []      # per-engine and per-file failures
        self.quality_issues = []      # not populated anywhere in this script

    def add_make_result(self, filename: str, display_name: str, is_valid: bool):
        """Add make validation result"""
        self.make_results[filename] = {
            'display_name': display_name,
            'is_valid': is_valid
        }

    def add_engine_result(self, make: str, engine_str: str, spec, warnings: List[str]):
        """Add engine validation result"""
        self.engine_results.setdefault(make, []).append({
            'original': engine_str,
            'parsed': spec,
            'warnings': warnings
        })

    def add_l_to_i_case(self, make: str, original: str, normalized: str):
        """Record L→I normalization case"""
        self.l_to_i_cases.append({
            'make': make,
            'original': original,
            'normalized': normalized
        })

    def add_electric_vehicle(self, make: str, model: str, year: str):
        """Record electric vehicle (empty engines)"""
        self.electric_vehicles.append({
            'make': make,
            'model': model,
            'year': year
        })

    def add_parsing_error(self, make: str, engine_str: str, error: str):
        """Record parsing error"""
        self.parsing_errors.append({
            'make': make,
            'engine': engine_str,
            'error': error
        })


def _normalize_l_to_i(engine_str: str) -> str:
    """Return engine_str with the L-style inline tokens rewritten to I-style."""
    return engine_str.replace('L3', 'I3').replace('L4', 'I4').replace(' L', ' I')


def _validate_make_file(json_file: Path, make_mapper, engine_parser,
                        results: ValidationResults) -> None:
    """Validate one make file: its name mapping and every engine string in it.

    Exceptions from the mapper, from reading/decoding the file, or from an
    unexpected JSON shape propagate to the caller, which records them as a
    file-level error. Failures on a single engine string are recorded here
    and do not abort the file.
    """
    filename = json_file.name

    # Validate make name mapping
    display_name = make_mapper.normalize_make_name(filename)
    is_valid = make_mapper.validate_mapping(filename, display_name)
    results.add_make_result(filename, display_name, is_valid)

    if not is_valid:
        print(f" āš ļø Make name validation failed: {filename} → {display_name}")
    else:
        print(f" āœ… Make name: {filename} → {display_name}")

    # Load and parse JSON content
    with open(json_file, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    if not json_data:
        print(f" āš ļø Empty JSON file: {filename}")
        return

    # The make's data is the value under the first (expected: only) key
    make_data = next(iter(json_data.values()))

    total_engines = 0
    parsed_engines = 0
    empty_engine_models = 0

    for year_entry in make_data:
        year = year_entry.get('year', 'unknown')

        for model_entry in year_entry.get('models', []):
            model_name = model_entry.get('name', 'unknown')
            engines_list = model_entry.get('engines', [])

            if not engines_list:
                # Electric vehicle case: record it and exercise motor creation
                results.add_electric_vehicle(display_name, model_name, year)
                empty_engine_models += 1
                electric_spec = engine_parser.create_electric_motor()
                results.add_engine_result(display_name, "EMPTY_ARRAY", electric_spec, [])
                continue

            for engine_str in engines_list:
                total_engines += 1
                # Broad catch is deliberate: the parser's exception types are
                # not known here, and one bad engine must not stop the file.
                try:
                    spec = engine_parser.parse_engine_string(engine_str)
                    warnings = engine_parser.validate_engine_spec(spec)
                    results.add_engine_result(display_name, engine_str, spec, warnings)
                    parsed_engines += 1

                    # Check for L→I normalization. 'L' also appears in plain
                    # displacement text such as "2.0L", so only count the
                    # string when the rewrite actually changes it.
                    if ('L' in engine_str and spec.configuration == 'I'
                            and 'ELECTRIC' not in engine_str.upper()):
                        normalized_str = _normalize_l_to_i(engine_str)
                        if normalized_str != engine_str:
                            results.add_l_to_i_case(display_name, engine_str, normalized_str)
                except Exception as e:
                    results.add_parsing_error(display_name, engine_str, str(e))

    print(f" šŸ“Š Engines: {parsed_engines}/{total_engines} parsed successfully")
    if empty_engine_models > 0:
        print(f" ⚔ Electric vehicles: {empty_engine_models} models with empty engines")


def validate_json_files(json_dir: str) -> ValidationResults:
    """
    Validate utilities against all JSON files

    Args:
        json_dir: Directory containing make JSON files

    Returns:
        ValidationResults object with all findings
    """
    results = ValidationResults()

    # Initialize utilities
    make_mapper = MakeNameMapper(sources_dir="sources")
    engine_parser = EngineSpecParser()

    # Find all JSON files; sorted so the run order and report are deterministic
    json_files = sorted(Path(json_dir).glob("*.json"))

    print(f"šŸ” Validating against {len(json_files)} JSON files...")

    for json_file in json_files:
        filename = json_file.name
        print(f"\nšŸ“„ Processing {filename}...")

        # Top-level boundary: any failure for one file is recorded and the
        # remaining files are still processed.
        try:
            _validate_make_file(json_file, make_mapper, engine_parser, results)
        except Exception as e:
            print(f" āŒ Failed to process {filename}: {e}")
            results.add_parsing_error(filename, "FILE_PROCESSING", str(e))

    return results


def _format_rate(numerator: int, denominator: int) -> str:
    """Format numerator/denominator as a percentage, or 'N/A' when empty."""
    if denominator == 0:
        return "N/A"
    return f"{numerator / denominator:.1%}"


def _print_distribution(results: ValidationResults, attribute: str, total_engines: int) -> None:
    """Print how often each value of a parsed-spec attribute occurs, most common first."""
    counts = Counter(
        getattr(engine['parsed'], attribute)
        for engines in results.engine_results.values()
        for engine in engines
    )
    for value, count in counts.most_common():
        percentage = count / total_engines * 100 if total_engines > 0 else 0
        print(f" {value}: {count} engines ({percentage:.1f}%)")


def generate_validation_report(results: ValidationResults) -> None:
    """Generate comprehensive validation report

    Prints the report to stdout. Safe to call when no files or no engines
    were collected: rates are shown as 'N/A' instead of dividing by zero.
    """
    print("\n" + "="*60)
    print("šŸ“‹ COMPREHENSIVE VALIDATION REPORT")
    print("="*60)

    # Make Name Validation Summary
    total_makes = len(results.make_results)
    valid_makes = sum(1 for r in results.make_results.values() if r['is_valid'])

    print("\nšŸ·ļø MAKE NAME VALIDATION")
    print(f" Total files: {total_makes}")
    print(f" Valid mappings: {valid_makes}")
    print(f" Success rate: {_format_rate(valid_makes, total_makes)}")

    # Show invalid makes
    invalid_makes = [(f, r['display_name'])
                     for f, r in results.make_results.items() if not r['is_valid']]
    if invalid_makes:
        print(f"\n āš ļø Invalid makes ({len(invalid_makes)}):")
        for filename, display_name in invalid_makes:
            print(f" {filename} → {display_name}")
    else:
        print(" āœ… All make names are valid!")

    # Engine Parsing Summary
    total_engines = sum(len(engines) for engines in results.engine_results.values())
    successful_parses = sum(
        1
        for engines in results.engine_results.values()
        for engine in engines
        if engine['parsed'].configuration != 'Unknown'
    )

    print("\nšŸ”§ ENGINE SPECIFICATION PARSING")
    print(f" Total engines: {total_engines}")
    print(f" Successfully parsed: {successful_parses}")
    print(f" Success rate: {_format_rate(successful_parses, total_engines)}")

    # L→I Normalization Cases
    print("\nšŸŽÆ L→I NORMALIZATION (CRITICAL)")
    print(f" Cases found: {len(results.l_to_i_cases)}")

    if results.l_to_i_cases:
        print(" Examples:")
        for case in results.l_to_i_cases[:10]:  # Show first 10
            print(f" {case['make']}: '{case['original']}' → '{case['normalized']}'")

        if len(results.l_to_i_cases) > 10:
            print(f" ... and {len(results.l_to_i_cases) - 10} more cases")
    else:
        print(" āš ļø No L→I normalization cases found in data")

    # Electric Vehicle Handling
    print("\n⚔ ELECTRIC VEHICLE HANDLING")
    print(f" Models with empty engines: {len(results.electric_vehicles)}")

    if results.electric_vehicles:
        # Group by make
        ev_by_make = defaultdict(list)
        for ev in results.electric_vehicles:
            ev_by_make[ev['make']].append(f"{ev['year']} {ev['model']}")

        print(" Electric vehicles by make:")
        for make, models in ev_by_make.items():
            print(f" {make}: {len(models)} models")
            if make.lower() in _KNOWN_EV_MAKES:  # Show details for known EVs
                for model in models[:5]:  # Show first 5
                    print(f" {model}")
                if len(models) > 5:
                    print(f" ... and {len(models) - 5} more")

    # Configuration Distribution
    print("\nšŸ“Š ENGINE CONFIGURATION DISTRIBUTION")
    _print_distribution(results, 'configuration', total_engines)

    # Fuel Type Distribution
    print("\n⛽ FUEL TYPE DISTRIBUTION")
    _print_distribution(results, 'fuel_type', total_engines)

    # Data Quality Issues
    if results.parsing_errors:
        print(f"\nāŒ PARSING ERRORS ({len(results.parsing_errors)})")
        error_types = defaultdict(list)
        for error in results.parsing_errors:
            error_types[error['error']].append(error)

        for error_type, errors in error_types.items():
            print(f" {error_type}: {len(errors)} occurrences")
            for error in errors[:3]:  # Show first 3 examples
                if error.get('engine'):
                    print(f" {error['make']}: '{error['engine']}'")
                else:
                    print(f" {error['make']}")

    # Overall Assessment
    print("\nšŸŽ‰ OVERALL ASSESSMENT")

    if valid_makes == total_makes:
        print(" āœ… Make name normalization: PERFECT")
    else:
        print(f" āš ļø Make name normalization: {_format_rate(valid_makes, total_makes)}")

    if total_engines == 0:
        # Nothing to grade; avoid dividing by zero
        print(" ā„¹ļø Engine parsing: NO ENGINES FOUND")
    else:
        parse_rate = successful_parses / total_engines
        if parse_rate >= 0.95:  # 95% threshold
            print(f" āœ… Engine parsing: EXCELLENT ({parse_rate:.1%})")
        elif parse_rate >= 0.85:  # 85% threshold
            print(f" āœ… Engine parsing: GOOD ({parse_rate:.1%})")
        else:
            print(f" āš ļø Engine parsing: NEEDS IMPROVEMENT ({parse_rate:.1%})")

    if results.l_to_i_cases:
        print(f" āœ… L→I normalization: WORKING ({len(results.l_to_i_cases)} cases)")
    else:
        print(" ā„¹ļø L→I normalization: NO CASES FOUND (may be normal)")

    if results.electric_vehicles:
        print(f" āœ… Electric vehicle handling: WORKING ({len(results.electric_vehicles)} models)")
    else:
        print(" ā„¹ļø Electric vehicle handling: NO EMPTY ENGINES FOUND")


def main():
    """Main validation script

    Returns:
        Process exit code: 0 when at least one file was validated, every make
        name mapped correctly and nothing failed to parse; 1 otherwise.
    """
    # Determine JSON files directory
    current_dir = Path(__file__).parent
    json_dir = current_dir / "sources" / "makes"

    if not json_dir.exists():
        print(f"āŒ JSON files directory not found: {json_dir}")
        print(f" Current directory: {current_dir}")
        print(" Looking for: sources/makes/ directory")
        return 1

    print(f"šŸš€ Starting validation against: {json_dir}")

    # Run validation
    results = validate_json_files(str(json_dir))

    # Generate report
    generate_validation_report(results)

    # Return success/failure code
    total_makes = len(results.make_results)
    valid_makes = sum(1 for r in results.make_results.values() if r['is_valid'])

    if total_makes == 0:
        # An empty run proves nothing, so it must not be reported as a pass
        print(f"\nāš ļø VALIDATION ISSUES FOUND - No JSON files were validated in {json_dir}")
        return 1

    if valid_makes == total_makes and not results.parsing_errors:
        print("\nšŸŽ‰ VALIDATION PASSED - Ready for implementation!")
        return 0
    else:
        print("\nāš ļø VALIDATION ISSUES FOUND - Review before proceeding")
        return 1


if __name__ == "__main__":
    sys.exit(main())