Initial Commit

This commit is contained in:
Eric Gullickson
2025-09-17 16:09:15 -05:00
parent 0cdb9803de
commit a052040e3a
373 changed files with 437090 additions and 6773 deletions

View File

@@ -0,0 +1,629 @@
"""
JSON Extractor for Manual Vehicle Data Processing
Extracts and normalizes vehicle data from JSON files into database-ready structures.
Integrates with MakeNameMapper and EngineSpecParser utilities for comprehensive
data processing with L→I normalization and make name conversion.
Key Features:
- Extract make/model/year/trim/engine data from JSON files
- Handle electric vehicles (empty engines → default motor)
- Data validation and quality assurance
- Progress tracking and error reporting
Usage:
extractor = JsonExtractor(make_mapper, engine_parser)
make_data = extractor.extract_make_data('sources/makes/toyota.json')
all_data = extractor.extract_all_makes('sources/makes/')
"""
import glob
import json
import logging
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Generator, List, Optional, Tuple
# Import our utilities (handle both relative and direct imports)
try:
from ..utils.make_name_mapper import MakeNameMapper
from ..utils.engine_spec_parser import EngineSpecParser, EngineSpec
except ImportError:
# Fallback for direct execution
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from utils.make_name_mapper import MakeNameMapper
from utils.engine_spec_parser import EngineSpecParser, EngineSpec
logger = logging.getLogger(__name__)
@dataclass
class ValidationResult:
    """Outcome of structurally validating one make JSON file."""
    is_valid: bool        # True when no errors were recorded
    errors: List[str]     # Blocking structural problems
    warnings: List[str]   # Non-blocking oddities worth reporting

    @property
    def has_errors(self) -> bool:
        """True when at least one error was recorded."""
        return bool(self.errors)

    @property
    def has_warnings(self) -> bool:
        """True when at least one warning was recorded."""
        return bool(self.warnings)
@dataclass
class ModelData:
    """One model aggregated across every year it appears in."""
    name: str                     # Model name as found in the JSON
    years: List[int]              # Every model year this model shows up in
    engines: List[EngineSpec]     # Parsed and normalized engine specs
    trims: List[str]              # Trim names collected from submodels
    is_electric: bool = False     # Set when the JSON had an empty engines array

    @property
    def total_trims(self) -> int:
        """Count of trims attached to this model."""
        return len(self.trims)

    @property
    def total_engines(self) -> int:
        """Count of engine specs attached to this model."""
        return len(self.engines)

    @property
    def year_range(self) -> str:
        """Human-readable span like '2010-2015'; 'Unknown' when no years."""
        if not self.years:
            return "Unknown"
        if len(self.years) == 1:
            return str(self.years[0])
        return f"{min(self.years)}-{max(self.years)}"
@dataclass
class MakeData:
    """Everything extracted for one make, plus processing diagnostics."""
    name: str                        # Normalized display name (e.g., "Alfa Romeo")
    filename: str                    # Original JSON filename
    models: List[ModelData]
    processing_errors: List[str]     # Errors hit while extracting this make
    processing_warnings: List[str]   # Warnings hit while extracting this make

    @property
    def total_models(self) -> int:
        """Number of models extracted for this make."""
        return len(self.models)

    @property
    def total_engines(self) -> int:
        """Sum of engine counts over all models."""
        return sum(m.total_engines for m in self.models)

    @property
    def total_trims(self) -> int:
        """Sum of trim counts over all models."""
        return sum(m.total_trims for m in self.models)

    @property
    def electric_models_count(self) -> int:
        """Number of models flagged as electric."""
        return sum(1 for m in self.models if m.is_electric)

    @property
    def year_range(self) -> str:
        """Span across all models' years; 'Unknown' when no years exist."""
        all_years = [year for m in self.models for year in m.years]
        if not all_years:
            return "Unknown"
        if len(set(all_years)) == 1:
            return str(all_years[0])
        return f"{min(all_years)}-{max(all_years)}"
@dataclass
class ExtractionResult:
    """Aggregate outcome of processing every JSON file in a directory."""
    makes: List[MakeData]
    total_files_processed: int
    successful_extractions: int
    failed_extractions: int
    total_models: int
    total_engines: int
    total_electric_models: int

    @property
    def success_rate(self) -> float:
        """Fraction of files extracted without errors; 0.0 when none processed."""
        if self.total_files_processed == 0:
            return 0.0
        return self.successful_extractions / self.total_files_processed
class JsonExtractor:
    """Extract normalized vehicle data from JSON make files.

    Each source file maps a single make key to a list of year entries; each
    year entry carries a list of models with optional ``engines`` and
    ``submodels`` arrays.  The extractor validates that structure, merges
    models across years, parses engine strings with EngineSpecParser, and
    derives display names from filenames with MakeNameMapper.
    """

    def __init__(self, make_mapper: MakeNameMapper, engine_parser: EngineSpecParser):
        """
        Initialize JSON extractor with utilities

        Args:
            make_mapper: For normalizing make names from filenames
            engine_parser: For parsing engine specifications with L→I normalization
        """
        self.make_mapper = make_mapper
        self.engine_parser = engine_parser
        logger.info("JsonExtractor initialized with MakeNameMapper and EngineSpecParser")

    def validate_json_structure(self, json_data: dict, filename: str) -> ValidationResult:
        """
        Validate JSON structure before processing.

        Args:
            json_data: Loaded JSON data
            filename: Source filename for error/log context

        Returns:
            ValidationResult with validity status and any issues
        """
        errors: List[str] = []
        warnings: List[str] = []
        try:
            # Top level must be a dict with exactly one key: the make name.
            if not isinstance(json_data, dict):
                errors.append("JSON must be a dictionary")
                return ValidationResult(False, errors, warnings)
            if len(json_data.keys()) != 1:
                errors.append(f"JSON should have exactly one top-level key, found {len(json_data.keys())}")
                return ValidationResult(False, errors, warnings)
            make_key = list(json_data.keys())[0]
            make_data = json_data[make_key]
            # Make data should be a list of year entries.
            if not isinstance(make_data, list):
                errors.append(f"Make data for '{make_key}' must be a list")
                return ValidationResult(False, errors, warnings)
            if len(make_data) == 0:
                warnings.append(f"Make '{make_key}' has no year entries")
            # Validate each year entry and its models.
            for i, year_entry in enumerate(make_data):
                if not isinstance(year_entry, dict):
                    errors.append(f"Year entry {i} must be a dictionary")
                    continue
                if 'year' not in year_entry:
                    errors.append(f"Year entry {i} missing 'year' field")
                if 'models' not in year_entry:
                    errors.append(f"Year entry {i} missing 'models' field")
                    continue
                # BUGFIX: use .get() so a missing 'year' surfaces as a
                # TypeError (caught below) instead of an uncaught KeyError
                # that would abort validation with a generic error.
                try:
                    year = int(year_entry.get('year'))
                    if year < 1900 or year > 2030:
                        warnings.append(f"Unusual year value: {year}")
                except (ValueError, TypeError):
                    errors.append(f"Invalid year value in entry {i}: {year_entry.get('year')}")
                models = year_entry['models']
                if not isinstance(models, list):
                    errors.append(f"Models in year entry {i} must be a list")
                    continue
                for j, model in enumerate(models):
                    if not isinstance(model, dict):
                        errors.append(f"Model {j} in year {year_entry.get('year')} must be a dictionary")
                        continue
                    if 'name' not in model:
                        errors.append(f"Model {j} in year {year_entry.get('year')} missing 'name' field")
                    # Engines and submodels are optional but must be lists if present.
                    if 'engines' in model and not isinstance(model['engines'], list):
                        errors.append(f"Engines for model {model.get('name')} must be a list")
                    if 'submodels' in model and not isinstance(model['submodels'], list):
                        errors.append(f"Submodels for model {model.get('name')} must be a list")
        except Exception as e:
            errors.append(f"Unexpected error during validation: {str(e)}")
        is_valid = len(errors) == 0
        # BUGFIX: these log lines previously contained a literal "(unknown)"
        # placeholder; interpolate the actual filename for useful context.
        if errors:
            logger.warning(f"JSON validation failed for {filename}: {len(errors)} errors")
        elif warnings:
            logger.info(f"JSON validation for {filename}: {len(warnings)} warnings")
        else:
            logger.debug(f"JSON validation passed for {filename}")
        return ValidationResult(is_valid, errors, warnings)

    def _group_models_by_name(self, year_entries: List[dict],
                              processing_errors: List[str],
                              processing_warnings: List[str]) -> Dict[str, Dict[str, set]]:
        """Merge model entries across years into name -> {years, engines, trims} sets.

        Mutates the shared error/warning lists in place; skips entries whose
        year cannot be coerced to int.
        """
        models_by_name: Dict[str, Dict[str, set]] = {}
        for year_entry in year_entries:
            try:
                year = int(year_entry['year'])
                for model_entry in year_entry.get('models', []):
                    model_name = model_entry.get('name', '').strip()
                    if not model_name:
                        processing_warnings.append(f"Empty model name in year {year}")
                        continue
                    info = models_by_name.setdefault(
                        model_name, {'years': set(), 'engines': set(), 'trims': set()})
                    info['years'].add(year)
                    # Sets de-duplicate repeated engine strings and trims
                    # that recur across model years.
                    for engine_str in model_entry.get('engines', []):
                        if engine_str and engine_str.strip():
                            info['engines'].add(engine_str.strip())
                    for trim in model_entry.get('submodels', []):
                        if trim and trim.strip():
                            info['trims'].add(trim.strip())
            except (ValueError, TypeError) as e:
                processing_errors.append(f"Error processing year entry: {str(e)}")
                continue
        return models_by_name

    def _build_models(self, make_name: str,
                      models_by_name: Dict[str, Dict[str, set]],
                      processing_errors: List[str]) -> List[ModelData]:
        """Convert grouped model info into ModelData, parsing engine specs.

        An empty engine set is treated as an electric vehicle and receives
        the parser's default electric-motor spec.
        """
        models: List[ModelData] = []
        for model_name, model_info in models_by_name.items():
            try:
                engine_specs = []
                is_electric = False
                if not model_info['engines']:
                    # Empty engines array - electric vehicle.
                    is_electric = True
                    engine_specs = [self.engine_parser.create_electric_motor()]
                    logger.debug(f"Created electric motor for {make_name} {model_name}")
                else:
                    for engine_str in model_info['engines']:
                        engine_specs.append(self.engine_parser.parse_engine_string(engine_str))
                # Remove duplicate engines based on key attributes.
                unique_engines = self.engine_parser.get_unique_engines(engine_specs)
                models.append(ModelData(
                    name=model_name,
                    years=sorted(model_info['years']),
                    engines=unique_engines,
                    trims=sorted(model_info['trims']),
                    is_electric=is_electric
                ))
            except Exception as e:
                processing_errors.append(f"Error processing model {model_name}: {str(e)}")
                continue
        return models

    def extract_make_data(self, json_file_path: str) -> MakeData:
        """
        Extract complete make data from a single JSON file.

        Args:
            json_file_path: Path to JSON file

        Returns:
            MakeData with extracted and normalized data.  Never raises:
            failures are reported via ``processing_errors`` with an empty
            model list.
        """
        filename = os.path.basename(json_file_path)
        # BUGFIX: interpolate the filename (log previously printed a
        # literal "(unknown)" placeholder).
        logger.info(f"Extracting make data from {filename}")
        processing_errors: List[str] = []
        processing_warnings: List[str] = []
        try:
            # Load and validate JSON.
            with open(json_file_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
            validation = self.validate_json_structure(json_data, filename)
            processing_errors.extend(validation.errors)
            processing_warnings.extend(validation.warnings)
            if not validation.is_valid:
                logger.error(f"JSON validation failed for {filename}")
                return MakeData(
                    name=self.make_mapper.normalize_make_name(filename),
                    filename=filename,
                    models=[],
                    processing_errors=processing_errors,
                    processing_warnings=processing_warnings
                )
            # Get normalized display name for the make.
            make_name = self.make_mapper.normalize_make_name(filename)
            logger.debug(f"Normalized make name: {filename} -> {make_name}")
            make_key = list(json_data.keys())[0]
            year_entries = json_data[make_key]
            # Group by model name across years, then materialize ModelData.
            models_by_name = self._group_models_by_name(
                year_entries, processing_errors, processing_warnings)
            models = self._build_models(make_name, models_by_name, processing_errors)
            models.sort(key=lambda m: m.name)
            make_data = MakeData(
                name=make_name,
                filename=filename,
                models=models,
                processing_errors=processing_errors,
                processing_warnings=processing_warnings
            )
            logger.info(f"Extracted {make_name}: {len(models)} models, "
                        f"{make_data.total_engines} engines, {make_data.electric_models_count} electric models")
            return make_data
        except Exception as e:
            logger.error(f"Failed to extract make data from {filename}: {str(e)}")
            processing_errors.append(f"Fatal error: {str(e)}")
            return MakeData(
                name=self.make_mapper.normalize_make_name(filename),
                filename=filename,
                models=[],
                processing_errors=processing_errors,
                processing_warnings=processing_warnings
            )

    def extract_all_makes(self, sources_dir: str) -> ExtractionResult:
        """
        Process all JSON files in the sources directory.

        Args:
            sources_dir: Directory containing JSON make files

        Returns:
            ExtractionResult with all extracted data and statistics.  A file
            whose extraction produced any processing error counts as failed.
        """
        logger.info(f"Starting extraction of all makes from {sources_dir}")
        pattern = os.path.join(sources_dir, '*.json')
        json_files = glob.glob(pattern)
        if not json_files:
            logger.warning(f"No JSON files found in {sources_dir}")
            return ExtractionResult(
                makes=[],
                total_files_processed=0,
                successful_extractions=0,
                failed_extractions=0,
                total_models=0,
                total_engines=0,
                total_electric_models=0
            )
        logger.info(f"Found {len(json_files)} JSON files to process")
        makes = []
        successful_extractions = 0
        failed_extractions = 0
        # Sort files for deterministic processing order.
        json_files.sort()
        for json_file in json_files:
            try:
                make_data = self.extract_make_data(json_file)
                makes.append(make_data)
                if make_data.processing_errors:
                    failed_extractions += 1
                    logger.error(f"Extraction completed with errors for {make_data.filename}")
                else:
                    successful_extractions += 1
                    logger.debug(f"Extraction successful for {make_data.filename}")
            except Exception as e:
                # extract_make_data shouldn't raise, but keep the run alive
                # and record a placeholder MakeData if it somehow does.
                logger.error(f"Fatal error processing {os.path.basename(json_file)}: {str(e)}")
                failed_extractions += 1
                filename = os.path.basename(json_file)
                makes.append(MakeData(
                    name=self.make_mapper.normalize_make_name(filename),
                    filename=filename,
                    models=[],
                    processing_errors=[f"Fatal extraction error: {str(e)}"],
                    processing_warnings=[]
                ))
        # Aggregate statistics across all makes.
        total_models = sum(make.total_models for make in makes)
        total_engines = sum(make.total_engines for make in makes)
        total_electric_models = sum(make.electric_models_count for make in makes)
        result = ExtractionResult(
            makes=makes,
            total_files_processed=len(json_files),
            successful_extractions=successful_extractions,
            failed_extractions=failed_extractions,
            total_models=total_models,
            total_engines=total_engines,
            total_electric_models=total_electric_models
        )
        logger.info(f"Extraction complete: {successful_extractions}/{len(json_files)} successful, "
                    f"{total_models} models, {total_engines} engines, {total_electric_models} electric models")
        return result

    def get_extraction_statistics(self, result: ExtractionResult) -> Dict[str, Any]:
        """
        Get detailed extraction statistics.

        Args:
            result: ExtractionResult from extract_all_makes

        Returns:
            Dictionary with 'files', 'data', 'quality', and per-make 'makes'
            sections.
        """
        # BUGFIX: annotation was Dict[str, any] (the builtin function), not
        # typing.Any.
        stats = {
            'files': {
                'total_processed': result.total_files_processed,
                'successful': result.successful_extractions,
                'failed': result.failed_extractions,
                'success_rate': result.success_rate
            },
            'data': {
                'total_makes': len(result.makes),
                'total_models': result.total_models,
                'total_engines': result.total_engines,
                'electric_models': result.total_electric_models
            },
            'quality': {
                'makes_with_errors': sum(1 for make in result.makes if make.processing_errors),
                'makes_with_warnings': sum(1 for make in result.makes if make.processing_warnings),
                'total_errors': sum(len(make.processing_errors) for make in result.makes),
                'total_warnings': sum(len(make.processing_warnings) for make in result.makes)
            }
        }
        # Add make-specific statistics.
        stats['makes'] = [
            {
                'name': make.name,
                'filename': make.filename,
                'models': make.total_models,
                'engines': make.total_engines,
                'trims': make.total_trims,
                'electric_models': make.electric_models_count,
                'year_range': make.year_range,
                'errors': len(make.processing_errors),
                'warnings': len(make.processing_warnings)
            }
            for make in result.makes
        ]
        return stats

    def print_extraction_report(self, result: ExtractionResult) -> None:
        """
        Print a detailed extraction report to stdout.

        Args:
            result: ExtractionResult from extract_all_makes
        """
        stats = self.get_extraction_statistics(result)
        print("🚀 JSON EXTRACTION REPORT")
        print("=" * 50)
        # File processing summary.
        print("\n📁 FILE PROCESSING")
        print(f" Files processed: {stats['files']['total_processed']}")
        print(f" Successful: {stats['files']['successful']}")
        print(f" Failed: {stats['files']['failed']}")
        print(f" Success rate: {stats['files']['success_rate']:.1%}")
        # Data summary.
        print("\n📊 DATA EXTRACTED")
        print(f" Makes: {stats['data']['total_makes']}")
        print(f" Models: {stats['data']['total_models']}")
        print(f" Engines: {stats['data']['total_engines']}")
        print(f" Electric models: {stats['data']['electric_models']}")
        # Quality summary.
        print("\n🔍 QUALITY ASSESSMENT")
        print(f" Makes with errors: {stats['quality']['makes_with_errors']}")
        print(f" Makes with warnings: {stats['quality']['makes_with_warnings']}")
        print(f" Total errors: {stats['quality']['total_errors']}")
        print(f" Total warnings: {stats['quality']['total_warnings']}")
        # Show problematic makes.
        if stats['quality']['makes_with_errors'] > 0:
            print("\n⚠️ MAKES WITH ERRORS:")
            for make in result.makes:
                if make.processing_errors:
                    print(f" {make.name} ({make.filename}): {len(make.processing_errors)} errors")
        # Show top makes by data volume.
        print("\n🏆 TOP MAKES BY MODEL COUNT:")
        top_makes = sorted(result.makes, key=lambda m: m.total_models, reverse=True)[:10]
        for make in top_makes:
            print(f" {make.name}: {make.total_models} models, {make.total_engines} engines")
# Example usage and testing functions
def example_usage():
    """Demonstrate JsonExtractor usage against the sources/makes directory."""
    print("🚀 JsonExtractor Example Usage")
    print("=" * 40)
    # Resolve the utility imports whether this runs as a package module or
    # as a standalone script.
    try:
        from ..utils.make_name_mapper import MakeNameMapper
        from ..utils.engine_spec_parser import EngineSpecParser
    except ImportError:
        # Fallback for direct execution.
        # BUGFIX: the original also did `import os` here, which made `os`
        # local to this function for BOTH branches; when the relative import
        # succeeded, the later os.path calls raised UnboundLocalError.  We
        # now rely on the module-level `os` import instead.
        import sys
        sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
        from utils.make_name_mapper import MakeNameMapper
        from utils.engine_spec_parser import EngineSpecParser
    # Initialize utilities and the extractor.
    make_mapper = MakeNameMapper()
    engine_parser = EngineSpecParser()
    extractor = JsonExtractor(make_mapper, engine_parser)
    sources_dir = "sources/makes"
    if not os.path.exists(sources_dir):
        print(f"Sources directory not found: {sources_dir}")
        return
    # Extract a single make first, if the sample file is available.
    toyota_file = os.path.join(sources_dir, "toyota.json")
    if os.path.exists(toyota_file):
        print("\n📄 Extracting from toyota.json...")
        toyota_data = extractor.extract_make_data(toyota_file)
        print(f" Make: {toyota_data.name}")
        print(f" Models: {toyota_data.total_models}")
        print(f" Engines: {toyota_data.total_engines}")
        print(f" Electric models: {toyota_data.electric_models_count}")
        print(f" Year range: {toyota_data.year_range}")
        if toyota_data.processing_errors:
            print(f" Errors: {len(toyota_data.processing_errors)}")
        if toyota_data.processing_warnings:
            print(f" Warnings: {len(toyota_data.processing_warnings)}")
    # Extract all makes and print the full report.
    print("\n🔄 Extracting all makes...")
    result = extractor.extract_all_makes(sources_dir)
    extractor.print_extraction_report(result)
# Allow running this module directly as a demo script.
if __name__ == "__main__":
    example_usage()