""" JSON Extractor for Manual Vehicle Data Processing Extracts and normalizes vehicle data from JSON files into database-ready structures. Integrates with MakeNameMapper and EngineSpecParser utilities for comprehensive data processing with L→I normalization and make name conversion. Key Features: - Extract make/model/year/trim/engine data from JSON files - Handle electric vehicles (empty engines → default motor) - Data validation and quality assurance - Progress tracking and error reporting Usage: extractor = JsonExtractor(make_mapper, engine_parser) make_data = extractor.extract_make_data('sources/makes/toyota.json') all_data = extractor.extract_all_makes('sources/makes/') """ import json import os import glob import logging from typing import List, Dict, Optional, Generator, Tuple from dataclasses import dataclass from pathlib import Path # Import our utilities (handle both relative and direct imports) try: from ..utils.make_name_mapper import MakeNameMapper from ..utils.engine_spec_parser import EngineSpecParser, EngineSpec except ImportError: # Fallback for direct execution import sys import os sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) from utils.make_name_mapper import MakeNameMapper from utils.engine_spec_parser import EngineSpecParser, EngineSpec logger = logging.getLogger(__name__) @dataclass class ValidationResult: """JSON validation result""" is_valid: bool errors: List[str] warnings: List[str] @property def has_errors(self) -> bool: return len(self.errors) > 0 @property def has_warnings(self) -> bool: return len(self.warnings) > 0 @dataclass class ModelData: """Extracted model data with normalized engines and trims""" name: str # Model name from JSON years: List[int] # Years this model appears in engines: List[EngineSpec] # Parsed and normalized engines trims: List[str] # Trim names (from submodels) is_electric: bool = False # True if empty engines array detected @property def total_trims(self) -> int: return len(self.trims) @property def total_engines(self) -> int: return len(self.engines) @property def year_range(self) -> str: if not self.years: return "Unknown" return f"{min(self.years)}-{max(self.years)}" if len(self.years) > 1 else str(self.years[0]) @dataclass class MakeData: """Complete make data with models, engines, and metadata""" name: str # Normalized display name (e.g., "Alfa Romeo") filename: str # Original JSON filename models: List[ModelData] processing_errors: List[str] # Any errors during extraction processing_warnings: List[str] # Any warnings during extraction @property def total_models(self) -> int: return len(self.models) @property def total_engines(self) -> int: return sum(model.total_engines for model in self.models) @property def total_trims(self) -> int: return sum(model.total_trims for model in self.models) @property def electric_models_count(self) -> int: return sum(1 for model in self.models if model.is_electric) @property def year_range(self) -> str: all_years = [] for model in self.models: all_years.extend(model.years) if not all_years: return "Unknown" return f"{min(all_years)}-{max(all_years)}" if len(set(all_years)) > 1 else str(all_years[0]) @dataclass class ExtractionResult: """Results of extracting all makes""" makes: List[MakeData] total_files_processed: int successful_extractions: int failed_extractions: int total_models: int total_engines: int total_electric_models: int @property def success_rate(self) -> float: return self.successful_extractions / self.total_files_processed if self.total_files_processed > 0 else 0.0 class JsonExtractor: """Extract normalized vehicle data from JSON files""" def __init__(self, make_mapper: MakeNameMapper, engine_parser: EngineSpecParser): """ Initialize JSON extractor with utilities Args: make_mapper: For normalizing make names from filenames engine_parser: For parsing engine specifications with L→I normalization """ self.make_mapper = make_mapper self.engine_parser = engine_parser logger.info("JsonExtractor initialized with MakeNameMapper and EngineSpecParser") def validate_json_structure(self, json_data: dict, filename: str) -> ValidationResult: """ Validate JSON structure before processing Args: json_data: Loaded JSON data filename: Source filename for error context Returns: ValidationResult with validity status and any issues """ errors = [] warnings = [] try: # Check top-level structure if not isinstance(json_data, dict): errors.append("JSON must be a dictionary") return ValidationResult(False, errors, warnings) # Should have exactly one key (the make name) if len(json_data.keys()) != 1: errors.append(f"JSON should have exactly one top-level key, found {len(json_data.keys())}") return ValidationResult(False, errors, warnings) make_key = list(json_data.keys())[0] make_data = json_data[make_key] # Make data should be a list of year entries if not isinstance(make_data, list): errors.append(f"Make data for '{make_key}' must be a list") return ValidationResult(False, errors, warnings) if len(make_data) == 0: warnings.append(f"Make '{make_key}' has no year entries") # Validate year entries for i, year_entry in enumerate(make_data): if not isinstance(year_entry, dict): errors.append(f"Year entry {i} must be a dictionary") continue # Check required fields if 'year' not in year_entry: errors.append(f"Year entry {i} missing 'year' field") if 'models' not in year_entry: errors.append(f"Year entry {i} missing 'models' field") continue # Validate year try: year = int(year_entry['year']) if year < 1900 or year > 2030: warnings.append(f"Unusual year value: {year}") except (ValueError, TypeError): errors.append(f"Invalid year value in entry {i}: {year_entry.get('year')}") # Validate models models = year_entry['models'] if not isinstance(models, list): errors.append(f"Models in year entry {i} must be a list") continue for j, model in enumerate(models): if not isinstance(model, dict): errors.append(f"Model {j} in year {year_entry.get('year')} must be a dictionary") continue if 'name' not in model: errors.append(f"Model {j} in year {year_entry.get('year')} missing 'name' field") # Engines and submodels are optional but should be lists if present if 'engines' in model and not isinstance(model['engines'], list): errors.append(f"Engines for model {model.get('name')} must be a list") if 'submodels' in model and not isinstance(model['submodels'], list): errors.append(f"Submodels for model {model.get('name')} must be a list") except Exception as e: errors.append(f"Unexpected error during validation: {str(e)}") is_valid = len(errors) == 0 if errors: logger.warning(f"JSON validation failed for {filename}: {len(errors)} errors") elif warnings: logger.info(f"JSON validation for {filename}: {len(warnings)} warnings") else: logger.debug(f"JSON validation passed for {filename}") return ValidationResult(is_valid, errors, warnings) def extract_make_data(self, json_file_path: str) -> MakeData: """ Extract complete make data from a single JSON file Args: json_file_path: Path to JSON file Returns: MakeData with extracted and normalized data """ filename = os.path.basename(json_file_path) logger.info(f"Extracting make data from {filename}") processing_errors = [] processing_warnings = [] try: # Load and validate JSON with open(json_file_path, 'r', encoding='utf-8') as f: json_data = json.load(f) validation = self.validate_json_structure(json_data, filename) processing_errors.extend(validation.errors) processing_warnings.extend(validation.warnings) if not validation.is_valid: logger.error(f"JSON validation failed for {filename}") return MakeData( name=self.make_mapper.normalize_make_name(filename), filename=filename, models=[], processing_errors=processing_errors, processing_warnings=processing_warnings ) # Get normalized make name make_name = self.make_mapper.normalize_make_name(filename) logger.debug(f"Normalized make name: {filename} → {make_name}") # Extract data make_key = list(json_data.keys())[0] year_entries = json_data[make_key] # Group models by name across all years models_by_name = {} # model_name -> {years: set, engines: set, trims: set} for year_entry in year_entries: try: year = int(year_entry['year']) models_list = year_entry.get('models', []) for model_entry in models_list: model_name = model_entry.get('name', '').strip() if not model_name: processing_warnings.append(f"Empty model name in year {year}") continue # Initialize model data if not seen before if model_name not in models_by_name: models_by_name[model_name] = { 'years': set(), 'engines': set(), 'trims': set() } # Add year models_by_name[model_name]['years'].add(year) # Add engines engines_list = model_entry.get('engines', []) for engine_str in engines_list: if engine_str and engine_str.strip(): models_by_name[model_name]['engines'].add(engine_str.strip()) # Add trims (from submodels) submodels_list = model_entry.get('submodels', []) for trim in submodels_list: if trim and trim.strip(): models_by_name[model_name]['trims'].add(trim.strip()) except (ValueError, TypeError) as e: processing_errors.append(f"Error processing year entry: {str(e)}") continue # Convert to ModelData objects models = [] for model_name, model_info in models_by_name.items(): try: # Parse engines engine_specs = [] is_electric = False if not model_info['engines']: # Empty engines array - electric vehicle is_electric = True electric_spec = self.engine_parser.create_electric_motor() engine_specs = [electric_spec] logger.debug(f"Created electric motor for {make_name} {model_name}") else: # Parse each engine string for engine_str in model_info['engines']: spec = self.engine_parser.parse_engine_string(engine_str) engine_specs.append(spec) # Remove duplicate engines based on key attributes unique_engines = self.engine_parser.get_unique_engines(engine_specs) # Create model data model_data = ModelData( name=model_name, years=sorted(list(model_info['years'])), engines=unique_engines, trims=sorted(list(model_info['trims'])), is_electric=is_electric ) models.append(model_data) except Exception as e: processing_errors.append(f"Error processing model {model_name}: {str(e)}") continue # Sort models by name models.sort(key=lambda m: m.name) make_data = MakeData( name=make_name, filename=filename, models=models, processing_errors=processing_errors, processing_warnings=processing_warnings ) logger.info(f"Extracted {filename}: {len(models)} models, " f"{make_data.total_engines} engines, {make_data.electric_models_count} electric models") return make_data except Exception as e: logger.error(f"Failed to extract make data from {filename}: {str(e)}") processing_errors.append(f"Fatal error: {str(e)}") return MakeData( name=self.make_mapper.normalize_make_name(filename), filename=filename, models=[], processing_errors=processing_errors, processing_warnings=processing_warnings ) def extract_all_makes(self, sources_dir: str) -> ExtractionResult: """ Process all JSON files in the sources directory Args: sources_dir: Directory containing JSON make files Returns: ExtractionResult with all extracted data and statistics """ logger.info(f"Starting extraction of all makes from {sources_dir}") # Find all JSON files pattern = os.path.join(sources_dir, '*.json') json_files = glob.glob(pattern) if not json_files: logger.warning(f"No JSON files found in {sources_dir}") return ExtractionResult( makes=[], total_files_processed=0, successful_extractions=0, failed_extractions=0, total_models=0, total_engines=0, total_electric_models=0 ) logger.info(f"Found {len(json_files)} JSON files to process") makes = [] successful_extractions = 0 failed_extractions = 0 # Sort files for consistent processing order json_files.sort() for json_file in json_files: try: make_data = self.extract_make_data(json_file) makes.append(make_data) if make_data.processing_errors: failed_extractions += 1 logger.error(f"Extraction completed with errors for {make_data.filename}") else: successful_extractions += 1 logger.debug(f"Extraction successful for {make_data.filename}") except Exception as e: logger.error(f"Fatal error processing {os.path.basename(json_file)}: {str(e)}") failed_extractions += 1 # Create minimal make data for failed file filename = os.path.basename(json_file) failed_make = MakeData( name=self.make_mapper.normalize_make_name(filename), filename=filename, models=[], processing_errors=[f"Fatal extraction error: {str(e)}"], processing_warnings=[] ) makes.append(failed_make) # Calculate statistics total_models = sum(make.total_models for make in makes) total_engines = sum(make.total_engines for make in makes) total_electric_models = sum(make.electric_models_count for make in makes) result = ExtractionResult( makes=makes, total_files_processed=len(json_files), successful_extractions=successful_extractions, failed_extractions=failed_extractions, total_models=total_models, total_engines=total_engines, total_electric_models=total_electric_models ) logger.info(f"Extraction complete: {successful_extractions}/{len(json_files)} successful, " f"{total_models} models, {total_engines} engines, {total_electric_models} electric models") return result def get_extraction_statistics(self, result: ExtractionResult) -> Dict[str, any]: """ Get detailed extraction statistics Args: result: ExtractionResult from extract_all_makes Returns: Dictionary with detailed statistics """ stats = { 'files': { 'total_processed': result.total_files_processed, 'successful': result.successful_extractions, 'failed': result.failed_extractions, 'success_rate': result.success_rate }, 'data': { 'total_makes': len(result.makes), 'total_models': result.total_models, 'total_engines': result.total_engines, 'electric_models': result.total_electric_models }, 'quality': { 'makes_with_errors': sum(1 for make in result.makes if make.processing_errors), 'makes_with_warnings': sum(1 for make in result.makes if make.processing_warnings), 'total_errors': sum(len(make.processing_errors) for make in result.makes), 'total_warnings': sum(len(make.processing_warnings) for make in result.makes) } } # Add make-specific statistics make_stats = [] for make in result.makes: make_stat = { 'name': make.name, 'filename': make.filename, 'models': make.total_models, 'engines': make.total_engines, 'trims': make.total_trims, 'electric_models': make.electric_models_count, 'year_range': make.year_range, 'errors': len(make.processing_errors), 'warnings': len(make.processing_warnings) } make_stats.append(make_stat) stats['makes'] = make_stats return stats def print_extraction_report(self, result: ExtractionResult) -> None: """ Print detailed extraction report Args: result: ExtractionResult from extract_all_makes """ stats = self.get_extraction_statistics(result) print(f"šŸš€ JSON EXTRACTION REPORT") print(f"=" * 50) # File processing summary print(f"\nšŸ“ FILE PROCESSING") print(f" Files processed: {stats['files']['total_processed']}") print(f" Successful: {stats['files']['successful']}") print(f" Failed: {stats['files']['failed']}") print(f" Success rate: {stats['files']['success_rate']:.1%}") # Data summary print(f"\nšŸ“Š DATA EXTRACTED") print(f" Makes: {stats['data']['total_makes']}") print(f" Models: {stats['data']['total_models']}") print(f" Engines: {stats['data']['total_engines']}") print(f" Electric models: {stats['data']['electric_models']}") # Quality summary print(f"\nšŸ” QUALITY ASSESSMENT") print(f" Makes with errors: {stats['quality']['makes_with_errors']}") print(f" Makes with warnings: {stats['quality']['makes_with_warnings']}") print(f" Total errors: {stats['quality']['total_errors']}") print(f" Total warnings: {stats['quality']['total_warnings']}") # Show problematic makes if stats['quality']['makes_with_errors'] > 0: print(f"\nāš ļø MAKES WITH ERRORS:") for make in result.makes: if make.processing_errors: print(f" {make.name} ({make.filename}): {len(make.processing_errors)} errors") # Show top makes by data volume print(f"\nšŸ† TOP MAKES BY MODEL COUNT:") top_makes = sorted(result.makes, key=lambda m: m.total_models, reverse=True)[:10] for make in top_makes: print(f" {make.name}: {make.total_models} models, {make.total_engines} engines") # Example usage and testing functions def example_usage(): """Demonstrate JsonExtractor usage""" print("šŸš€ JsonExtractor Example Usage") print("=" * 40) # Use direct imports for example usage try: from ..utils.make_name_mapper import MakeNameMapper from ..utils.engine_spec_parser import EngineSpecParser except ImportError: # Fallback for direct execution import sys import os sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) from utils.make_name_mapper import MakeNameMapper from utils.engine_spec_parser import EngineSpecParser # Initialize utilities make_mapper = MakeNameMapper() engine_parser = EngineSpecParser() # Create extractor extractor = JsonExtractor(make_mapper, engine_parser) # Extract single make sources_dir = "sources/makes" if os.path.exists(sources_dir): toyota_file = os.path.join(sources_dir, "toyota.json") if os.path.exists(toyota_file): print(f"\nšŸ“„ Extracting from toyota.json...") toyota_data = extractor.extract_make_data(toyota_file) print(f" Make: {toyota_data.name}") print(f" Models: {toyota_data.total_models}") print(f" Engines: {toyota_data.total_engines}") print(f" Electric models: {toyota_data.electric_models_count}") print(f" Year range: {toyota_data.year_range}") if toyota_data.processing_errors: print(f" Errors: {len(toyota_data.processing_errors)}") if toyota_data.processing_warnings: print(f" Warnings: {len(toyota_data.processing_warnings)}") # Extract all makes print(f"\nšŸ”„ Extracting all makes...") result = extractor.extract_all_makes(sources_dir) extractor.print_extraction_report(result) else: print(f"Sources directory not found: {sources_dir}") if __name__ == "__main__": example_usage()