"""
|
|
JSON Extractor for Manual Vehicle Data Processing
|
|
|
|
Extracts and normalizes vehicle data from JSON files into database-ready structures.
|
|
Integrates with MakeNameMapper and EngineSpecParser utilities for comprehensive
|
|
data processing with L→I normalization and make name conversion.
|
|
|
|
Key Features:
|
|
- Extract make/model/year/trim/engine data from JSON files
|
|
- Handle electric vehicles (empty engines → default motor)
|
|
- Data validation and quality assurance
|
|
- Progress tracking and error reporting
|
|
|
|
Usage:
|
|
extractor = JsonExtractor(make_mapper, engine_parser)
|
|
make_data = extractor.extract_make_data('sources/makes/toyota.json')
|
|
all_data = extractor.extract_all_makes('sources/makes/')
|
|
"""
|
|
|
|
import glob
import json
import logging
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Generator, List, Optional, Tuple
|
|
|
|
# Import our utilities (handle both relative and direct imports)
|
|
try:
|
|
from ..utils.make_name_mapper import MakeNameMapper
|
|
from ..utils.engine_spec_parser import EngineSpecParser, EngineSpec
|
|
except ImportError:
|
|
# Fallback for direct execution
|
|
import sys
|
|
import os
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
|
from utils.make_name_mapper import MakeNameMapper
|
|
from utils.engine_spec_parser import EngineSpecParser, EngineSpec
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class ValidationResult:
    """Outcome of structurally validating one make JSON file."""
    is_valid: bool       # True when no blocking errors were found
    errors: List[str]    # structural problems that prevent processing
    warnings: List[str]  # non-blocking data-quality observations

    @property
    def has_errors(self) -> bool:
        """True if at least one error was recorded."""
        return bool(self.errors)

    @property
    def has_warnings(self) -> bool:
        """True if at least one warning was recorded."""
        return bool(self.warnings)
@dataclass
class ModelData:
    """One vehicle model aggregated across years, with normalized engines/trims."""
    name: str                  # model name as it appears in the JSON
    years: List[int]           # every year this model was seen in
    engines: List[EngineSpec]  # parsed, deduplicated engine specs
    trims: List[str]           # trim names collected from 'submodels'
    is_electric: bool = False  # set when the source had an empty engines array

    @property
    def total_trims(self) -> int:
        """Number of distinct trims."""
        return len(self.trims)

    @property
    def total_engines(self) -> int:
        """Number of distinct engines."""
        return len(self.engines)

    @property
    def year_range(self) -> str:
        """Human-readable span, e.g. '2010-2015', '2012', or 'Unknown'."""
        if not self.years:
            return "Unknown"
        if len(self.years) > 1:
            return f"{min(self.years)}-{max(self.years)}"
        return str(self.years[0])
@dataclass
class MakeData:
    """All extracted data for one make, plus processing diagnostics."""
    name: str                        # normalized display name (e.g. "Alfa Romeo")
    filename: str                    # original JSON filename
    models: List[ModelData]          # extracted models, sorted by name
    processing_errors: List[str]     # errors encountered during extraction
    processing_warnings: List[str]   # warnings encountered during extraction

    @property
    def total_models(self) -> int:
        """Number of models extracted for this make."""
        return len(self.models)

    @property
    def total_engines(self) -> int:
        """Engine count summed over all models."""
        return sum(m.total_engines for m in self.models)

    @property
    def total_trims(self) -> int:
        """Trim count summed over all models."""
        return sum(m.total_trims for m in self.models)

    @property
    def electric_models_count(self) -> int:
        """How many models were flagged as electric."""
        return sum(1 for m in self.models if m.is_electric)

    @property
    def year_range(self) -> str:
        """Span over every year of every model, 'Unknown' when empty."""
        all_years = [year for m in self.models for year in m.years]
        if not all_years:
            return "Unknown"
        if len(set(all_years)) > 1:
            return f"{min(all_years)}-{max(all_years)}"
        return str(all_years[0])
@dataclass
class ExtractionResult:
    """Aggregate outcome of extracting every make file in a directory."""
    makes: List[MakeData]         # one entry per processed file (including failures)
    total_files_processed: int
    successful_extractions: int
    failed_extractions: int
    total_models: int
    total_engines: int
    total_electric_models: int

    @property
    def success_rate(self) -> float:
        """Fraction of files that extracted cleanly; 0.0 when nothing ran."""
        if self.total_files_processed > 0:
            return self.successful_extractions / self.total_files_processed
        return 0.0
class JsonExtractor:
    """Extract normalized vehicle data from JSON files.

    Coordinates MakeNameMapper (filename → display make name) and
    EngineSpecParser (engine string → EngineSpec, with L→I normalization)
    to turn per-make JSON source files into database-ready MakeData objects.
    """

    def __init__(self, make_mapper: MakeNameMapper, engine_parser: EngineSpecParser):
        """
        Initialize JSON extractor with utilities

        Args:
            make_mapper: For normalizing make names from filenames
            engine_parser: For parsing engine specifications with L→I normalization
        """
        self.make_mapper = make_mapper
        self.engine_parser = engine_parser

        logger.info("JsonExtractor initialized with MakeNameMapper and EngineSpecParser")

    def validate_json_structure(self, json_data: dict, filename: str) -> ValidationResult:
        """
        Validate JSON structure before processing

        Expected shape: {"MakeName": [{"year": int, "models": [{"name": str,
        "engines": [str, ...], "submodels": [str, ...]}, ...]}, ...]}

        Args:
            json_data: Loaded JSON data
            filename: Source filename for error context

        Returns:
            ValidationResult with validity status and any issues
        """
        errors: List[str] = []
        warnings: List[str] = []

        try:
            # Check top-level structure
            if not isinstance(json_data, dict):
                errors.append("JSON must be a dictionary")
                return ValidationResult(False, errors, warnings)

            # Should have exactly one key (the make name)
            if len(json_data) != 1:
                errors.append(f"JSON should have exactly one top-level key, found {len(json_data)}")
                return ValidationResult(False, errors, warnings)

            make_key = next(iter(json_data))
            make_data = json_data[make_key]

            # Make data should be a list of year entries
            if not isinstance(make_data, list):
                errors.append(f"Make data for '{make_key}' must be a list")
                return ValidationResult(False, errors, warnings)

            if len(make_data) == 0:
                warnings.append(f"Make '{make_key}' has no year entries")

            # Validate year entries
            for i, year_entry in enumerate(make_data):
                if not isinstance(year_entry, dict):
                    errors.append(f"Year entry {i} must be a dictionary")
                    continue

                # Check required fields; report both problems before skipping
                if 'year' not in year_entry:
                    errors.append(f"Year entry {i} missing 'year' field")
                if 'models' not in year_entry:
                    errors.append(f"Year entry {i} missing 'models' field")
                if 'year' not in year_entry or 'models' not in year_entry:
                    # BUGFIX: a missing 'year' previously fell through to
                    # int(year_entry['year']); the resulting KeyError hit the
                    # blanket handler below and aborted validation of all
                    # remaining entries.
                    continue

                # Validate year value (range check is advisory only)
                try:
                    year = int(year_entry['year'])
                    if year < 1900 or year > 2030:
                        warnings.append(f"Unusual year value: {year}")
                except (ValueError, TypeError):
                    errors.append(f"Invalid year value in entry {i}: {year_entry.get('year')}")

                # Validate models
                models = year_entry['models']
                if not isinstance(models, list):
                    errors.append(f"Models in year entry {i} must be a list")
                    continue

                for j, model in enumerate(models):
                    if not isinstance(model, dict):
                        errors.append(f"Model {j} in year {year_entry.get('year')} must be a dictionary")
                        continue

                    if 'name' not in model:
                        errors.append(f"Model {j} in year {year_entry.get('year')} missing 'name' field")

                    # Engines and submodels are optional but should be lists if present
                    if 'engines' in model and not isinstance(model['engines'], list):
                        errors.append(f"Engines for model {model.get('name')} must be a list")

                    if 'submodels' in model and not isinstance(model['submodels'], list):
                        errors.append(f"Submodels for model {model.get('name')} must be a list")

        except Exception as e:
            # Defensive: validation itself must never raise to the caller.
            errors.append(f"Unexpected error during validation: {str(e)}")

        is_valid = len(errors) == 0

        # BUGFIX: these log messages previously contained the literal
        # "(unknown)" instead of the filename passed in for context.
        if errors:
            logger.warning(f"JSON validation failed for {filename}: {len(errors)} errors")
        elif warnings:
            logger.info(f"JSON validation for {filename}: {len(warnings)} warnings")
        else:
            logger.debug(f"JSON validation passed for {filename}")

        return ValidationResult(is_valid, errors, warnings)

    def extract_make_data(self, json_file_path: str) -> MakeData:
        """
        Extract complete make data from a single JSON file

        Never raises: fatal problems are captured in the returned
        MakeData.processing_errors list instead.

        Args:
            json_file_path: Path to JSON file

        Returns:
            MakeData with extracted and normalized data
        """
        filename = os.path.basename(json_file_path)
        # BUGFIX: log messages previously printed the literal "(unknown)"
        # instead of the file being processed.
        logger.info(f"Extracting make data from {filename}")

        processing_errors: List[str] = []
        processing_warnings: List[str] = []

        try:
            # Load and validate JSON
            with open(json_file_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            validation = self.validate_json_structure(json_data, filename)
            processing_errors.extend(validation.errors)
            processing_warnings.extend(validation.warnings)

            if not validation.is_valid:
                logger.error(f"JSON validation failed for {filename}")
                return MakeData(
                    name=self.make_mapper.normalize_make_name(filename),
                    filename=filename,
                    models=[],
                    processing_errors=processing_errors,
                    processing_warnings=processing_warnings
                )

            # Get normalized make name
            make_name = self.make_mapper.normalize_make_name(filename)
            logger.debug(f"Normalized make name: {filename} → {make_name}")

            # Extract data: the single top-level key holds the year entries
            make_key = next(iter(json_data))
            year_entries = json_data[make_key]

            # Group models by name across all years; sets deduplicate
            # repeated engines/trims/years as we accumulate.
            models_by_name: Dict[str, Dict[str, set]] = {}

            for year_entry in year_entries:
                try:
                    year = int(year_entry['year'])
                    models_list = year_entry.get('models', [])

                    for model_entry in models_list:
                        model_name = model_entry.get('name', '').strip()
                        if not model_name:
                            processing_warnings.append(f"Empty model name in year {year}")
                            continue

                        # Initialize model data if not seen before
                        if model_name not in models_by_name:
                            models_by_name[model_name] = {
                                'years': set(),
                                'engines': set(),
                                'trims': set()
                            }

                        # Add year
                        models_by_name[model_name]['years'].add(year)

                        # Add engines (skip empty / whitespace-only strings)
                        for engine_str in model_entry.get('engines', []):
                            if engine_str and engine_str.strip():
                                models_by_name[model_name]['engines'].add(engine_str.strip())

                        # Add trims (from submodels)
                        for trim in model_entry.get('submodels', []):
                            if trim and trim.strip():
                                models_by_name[model_name]['trims'].add(trim.strip())

                except (ValueError, TypeError) as e:
                    processing_errors.append(f"Error processing year entry: {str(e)}")
                    continue

            # Convert to ModelData objects
            models = []
            for model_name, model_info in models_by_name.items():
                try:
                    engine_specs = []
                    is_electric = False

                    if not model_info['engines']:
                        # Empty engines array - electric vehicle
                        is_electric = True
                        electric_spec = self.engine_parser.create_electric_motor()
                        engine_specs = [electric_spec]
                        logger.debug(f"Created electric motor for {make_name} {model_name}")
                    else:
                        # Parse each engine string
                        for engine_str in model_info['engines']:
                            spec = self.engine_parser.parse_engine_string(engine_str)
                            engine_specs.append(spec)

                    # Remove duplicate engines based on key attributes
                    unique_engines = self.engine_parser.get_unique_engines(engine_specs)

                    models.append(ModelData(
                        name=model_name,
                        years=sorted(model_info['years']),
                        engines=unique_engines,
                        trims=sorted(model_info['trims']),
                        is_electric=is_electric
                    ))

                except Exception as e:
                    # One bad model must not sink the rest of the make.
                    processing_errors.append(f"Error processing model {model_name}: {str(e)}")
                    continue

            # Sort models by name for deterministic output
            models.sort(key=lambda m: m.name)

            make_data = MakeData(
                name=make_name,
                filename=filename,
                models=models,
                processing_errors=processing_errors,
                processing_warnings=processing_warnings
            )

            logger.info(f"Extracted {filename}: {len(models)} models, "
                        f"{make_data.total_engines} engines, {make_data.electric_models_count} electric models")

            return make_data

        except Exception as e:
            logger.error(f"Failed to extract make data from {filename}: {str(e)}")
            processing_errors.append(f"Fatal error: {str(e)}")

            return MakeData(
                name=self.make_mapper.normalize_make_name(filename),
                filename=filename,
                models=[],
                processing_errors=processing_errors,
                processing_warnings=processing_warnings
            )

    def extract_all_makes(self, sources_dir: str) -> ExtractionResult:
        """
        Process all JSON files in the sources directory

        Args:
            sources_dir: Directory containing JSON make files

        Returns:
            ExtractionResult with all extracted data and statistics
        """
        logger.info(f"Starting extraction of all makes from {sources_dir}")

        # Find all JSON files
        pattern = os.path.join(sources_dir, '*.json')
        json_files = glob.glob(pattern)

        if not json_files:
            logger.warning(f"No JSON files found in {sources_dir}")
            return ExtractionResult(
                makes=[],
                total_files_processed=0,
                successful_extractions=0,
                failed_extractions=0,
                total_models=0,
                total_engines=0,
                total_electric_models=0
            )

        logger.info(f"Found {len(json_files)} JSON files to process")

        makes = []
        successful_extractions = 0
        failed_extractions = 0

        # Sort files for consistent processing order
        json_files.sort()

        for json_file in json_files:
            try:
                make_data = self.extract_make_data(json_file)
                makes.append(make_data)

                if make_data.processing_errors:
                    failed_extractions += 1
                    logger.error(f"Extraction completed with errors for {make_data.filename}")
                else:
                    successful_extractions += 1
                    logger.debug(f"Extraction successful for {make_data.filename}")

            except Exception as e:
                # extract_make_data is designed not to raise, but guard anyway
                # so one bad file cannot abort the whole batch.
                logger.error(f"Fatal error processing {os.path.basename(json_file)}: {str(e)}")
                failed_extractions += 1

                # Create minimal make data for failed file
                filename = os.path.basename(json_file)
                makes.append(MakeData(
                    name=self.make_mapper.normalize_make_name(filename),
                    filename=filename,
                    models=[],
                    processing_errors=[f"Fatal extraction error: {str(e)}"],
                    processing_warnings=[]
                ))

        # Calculate statistics
        total_models = sum(make.total_models for make in makes)
        total_engines = sum(make.total_engines for make in makes)
        total_electric_models = sum(make.electric_models_count for make in makes)

        result = ExtractionResult(
            makes=makes,
            total_files_processed=len(json_files),
            successful_extractions=successful_extractions,
            failed_extractions=failed_extractions,
            total_models=total_models,
            total_engines=total_engines,
            total_electric_models=total_electric_models
        )

        logger.info(f"Extraction complete: {successful_extractions}/{len(json_files)} successful, "
                    f"{total_models} models, {total_engines} engines, {total_electric_models} electric models")

        return result

    def get_extraction_statistics(self, result: ExtractionResult) -> Dict[str, Any]:
        """
        Get detailed extraction statistics

        Args:
            result: ExtractionResult from extract_all_makes

        Returns:
            Dictionary with 'files', 'data', 'quality' sections plus a
            per-make breakdown under 'makes'
        """
        # BUGFIX: return annotation used the builtin `any` instead of typing.Any.
        stats = {
            'files': {
                'total_processed': result.total_files_processed,
                'successful': result.successful_extractions,
                'failed': result.failed_extractions,
                'success_rate': result.success_rate
            },
            'data': {
                'total_makes': len(result.makes),
                'total_models': result.total_models,
                'total_engines': result.total_engines,
                'electric_models': result.total_electric_models
            },
            'quality': {
                'makes_with_errors': sum(1 for make in result.makes if make.processing_errors),
                'makes_with_warnings': sum(1 for make in result.makes if make.processing_warnings),
                'total_errors': sum(len(make.processing_errors) for make in result.makes),
                'total_warnings': sum(len(make.processing_warnings) for make in result.makes)
            }
        }

        # Add make-specific statistics
        stats['makes'] = [
            {
                'name': make.name,
                'filename': make.filename,
                'models': make.total_models,
                'engines': make.total_engines,
                'trims': make.total_trims,
                'electric_models': make.electric_models_count,
                'year_range': make.year_range,
                'errors': len(make.processing_errors),
                'warnings': len(make.processing_warnings)
            }
            for make in result.makes
        ]

        return stats

    def print_extraction_report(self, result: ExtractionResult) -> None:
        """
        Print detailed extraction report

        Args:
            result: ExtractionResult from extract_all_makes
        """
        stats = self.get_extraction_statistics(result)

        print("🚀 JSON EXTRACTION REPORT")
        print("=" * 50)

        # File processing summary
        print("\n📁 FILE PROCESSING")
        print(f"   Files processed: {stats['files']['total_processed']}")
        print(f"   Successful: {stats['files']['successful']}")
        print(f"   Failed: {stats['files']['failed']}")
        print(f"   Success rate: {stats['files']['success_rate']:.1%}")

        # Data summary
        print("\n📊 DATA EXTRACTED")
        print(f"   Makes: {stats['data']['total_makes']}")
        print(f"   Models: {stats['data']['total_models']}")
        print(f"   Engines: {stats['data']['total_engines']}")
        print(f"   Electric models: {stats['data']['electric_models']}")

        # Quality summary
        print("\n🔍 QUALITY ASSESSMENT")
        print(f"   Makes with errors: {stats['quality']['makes_with_errors']}")
        print(f"   Makes with warnings: {stats['quality']['makes_with_warnings']}")
        print(f"   Total errors: {stats['quality']['total_errors']}")
        print(f"   Total warnings: {stats['quality']['total_warnings']}")

        # Show problematic makes
        if stats['quality']['makes_with_errors'] > 0:
            print("\n⚠️  MAKES WITH ERRORS:")
            for make in result.makes:
                if make.processing_errors:
                    print(f"   {make.name} ({make.filename}): {len(make.processing_errors)} errors")

        # Show top makes by data volume
        print("\n🏆 TOP MAKES BY MODEL COUNT:")
        top_makes = sorted(result.makes, key=lambda m: m.total_models, reverse=True)[:10]
        for make in top_makes:
            print(f"   {make.name}: {make.total_models} models, {make.total_engines} engines")
# Example usage and testing functions
|
|
def example_usage():
    """Demonstrate JsonExtractor usage"""
    print("🚀 JsonExtractor Example Usage")
    print("=" * 40)

    # Use direct imports for example usage
    try:
        from ..utils.make_name_mapper import MakeNameMapper
        from ..utils.engine_spec_parser import EngineSpecParser
    except ImportError:
        # Fallback for direct execution
        import sys
        import os
        sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
        from utils.make_name_mapper import MakeNameMapper
        from utils.engine_spec_parser import EngineSpecParser

    # Wire up the extractor with its two helper utilities
    extractor = JsonExtractor(MakeNameMapper(), EngineSpecParser())

    sources_dir = "sources/makes"
    if not os.path.exists(sources_dir):
        # Guard clause: nothing to demonstrate without source data
        print(f"Sources directory not found: {sources_dir}")
        return

    # Single-make extraction demo
    toyota_file = os.path.join(sources_dir, "toyota.json")
    if os.path.exists(toyota_file):
        print("\n📄 Extracting from toyota.json...")
        toyota_data = extractor.extract_make_data(toyota_file)

        print(f"   Make: {toyota_data.name}")
        print(f"   Models: {toyota_data.total_models}")
        print(f"   Engines: {toyota_data.total_engines}")
        print(f"   Electric models: {toyota_data.electric_models_count}")
        print(f"   Year range: {toyota_data.year_range}")

        if toyota_data.processing_errors:
            print(f"   Errors: {len(toyota_data.processing_errors)}")
        if toyota_data.processing_warnings:
            print(f"   Warnings: {len(toyota_data.processing_warnings)}")

    # Whole-directory extraction demo
    print("\n🔄 Extracting all makes...")
    report = extractor.extract_all_makes(sources_dir)
    extractor.print_extraction_report(report)
if __name__ == "__main__":
|
|
example_usage() |