Initial Commit

This commit is contained in:
Eric Gullickson
2025-09-17 16:09:15 -05:00
parent 0cdb9803de
commit a052040e3a
373 changed files with 437090 additions and 6773 deletions

View File

@@ -0,0 +1,629 @@
"""
JSON Extractor for Manual Vehicle Data Processing
Extracts and normalizes vehicle data from JSON files into database-ready structures.
Integrates with MakeNameMapper and EngineSpecParser utilities for comprehensive
data processing with L→I normalization and make name conversion.
Key Features:
- Extract make/model/year/trim/engine data from JSON files
- Handle electric vehicles (empty engines → default motor)
- Data validation and quality assurance
- Progress tracking and error reporting
Usage:
extractor = JsonExtractor(make_mapper, engine_parser)
make_data = extractor.extract_make_data('sources/makes/toyota.json')
all_data = extractor.extract_all_makes('sources/makes/')
"""
import glob
import json
import logging
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Generator, List, Optional, Tuple
# Import our utilities (handle both relative and direct imports)
try:
from ..utils.make_name_mapper import MakeNameMapper
from ..utils.engine_spec_parser import EngineSpecParser, EngineSpec
except ImportError:
# Fallback for direct execution
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from utils.make_name_mapper import MakeNameMapper
from utils.engine_spec_parser import EngineSpecParser, EngineSpec
logger = logging.getLogger(__name__)
@dataclass
class ValidationResult:
    """Outcome of structurally validating one make JSON file."""
    is_valid: bool        # True when no errors were recorded
    errors: List[str]     # Blocking structural problems
    warnings: List[str]   # Non-blocking oddities worth reporting

    @property
    def has_errors(self) -> bool:
        """True when at least one error was recorded."""
        return bool(self.errors)

    @property
    def has_warnings(self) -> bool:
        """True when at least one warning was recorded."""
        return bool(self.warnings)
@dataclass
class ModelData:
    """One model aggregated across every year it appears in."""
    name: str                     # Model name as found in the JSON
    years: List[int]              # Every model year this model shows up in
    engines: List[EngineSpec]     # Parsed and normalized engine specs
    trims: List[str]              # Trim names collected from submodels
    is_electric: bool = False     # Set when the JSON had an empty engines array

    @property
    def total_trims(self) -> int:
        """Count of trims attached to this model."""
        return len(self.trims)

    @property
    def total_engines(self) -> int:
        """Count of engine specs attached to this model."""
        return len(self.engines)

    @property
    def year_range(self) -> str:
        """Human-readable span like '2010-2015'; 'Unknown' when no years."""
        if not self.years:
            return "Unknown"
        if len(self.years) == 1:
            return str(self.years[0])
        return f"{min(self.years)}-{max(self.years)}"
@dataclass
class MakeData:
    """Everything extracted for one make, plus processing diagnostics."""
    name: str                        # Normalized display name (e.g., "Alfa Romeo")
    filename: str                    # Original JSON filename
    models: List[ModelData]
    processing_errors: List[str]     # Errors hit while extracting this make
    processing_warnings: List[str]   # Warnings hit while extracting this make

    @property
    def total_models(self) -> int:
        """Number of models extracted for this make."""
        return len(self.models)

    @property
    def total_engines(self) -> int:
        """Sum of engine counts over all models."""
        return sum(m.total_engines for m in self.models)

    @property
    def total_trims(self) -> int:
        """Sum of trim counts over all models."""
        return sum(m.total_trims for m in self.models)

    @property
    def electric_models_count(self) -> int:
        """Number of models flagged as electric."""
        return sum(1 for m in self.models if m.is_electric)

    @property
    def year_range(self) -> str:
        """Span across all models' years; 'Unknown' when no years exist."""
        all_years = [year for m in self.models for year in m.years]
        if not all_years:
            return "Unknown"
        if len(set(all_years)) == 1:
            return str(all_years[0])
        return f"{min(all_years)}-{max(all_years)}"
@dataclass
class ExtractionResult:
    """Aggregate outcome of processing every JSON file in a directory."""
    makes: List[MakeData]
    total_files_processed: int
    successful_extractions: int
    failed_extractions: int
    total_models: int
    total_engines: int
    total_electric_models: int

    @property
    def success_rate(self) -> float:
        """Fraction of files extracted without errors; 0.0 when none processed."""
        if self.total_files_processed == 0:
            return 0.0
        return self.successful_extractions / self.total_files_processed
class JsonExtractor:
    """Extract normalized vehicle data from JSON make files.

    Each source file maps a single make key to a list of year entries; each
    year entry carries a list of models with optional ``engines`` and
    ``submodels`` arrays.  The extractor validates that structure, merges
    models across years, parses engine strings with EngineSpecParser, and
    derives display names from filenames with MakeNameMapper.
    """

    def __init__(self, make_mapper: MakeNameMapper, engine_parser: EngineSpecParser):
        """
        Initialize JSON extractor with utilities

        Args:
            make_mapper: For normalizing make names from filenames
            engine_parser: For parsing engine specifications with L→I normalization
        """
        self.make_mapper = make_mapper
        self.engine_parser = engine_parser
        logger.info("JsonExtractor initialized with MakeNameMapper and EngineSpecParser")

    def validate_json_structure(self, json_data: dict, filename: str) -> ValidationResult:
        """
        Validate JSON structure before processing.

        Args:
            json_data: Loaded JSON data
            filename: Source filename for error/log context

        Returns:
            ValidationResult with validity status and any issues
        """
        errors: List[str] = []
        warnings: List[str] = []
        try:
            # Top level must be a dict with exactly one key: the make name.
            if not isinstance(json_data, dict):
                errors.append("JSON must be a dictionary")
                return ValidationResult(False, errors, warnings)
            if len(json_data.keys()) != 1:
                errors.append(f"JSON should have exactly one top-level key, found {len(json_data.keys())}")
                return ValidationResult(False, errors, warnings)
            make_key = list(json_data.keys())[0]
            make_data = json_data[make_key]
            # Make data should be a list of year entries.
            if not isinstance(make_data, list):
                errors.append(f"Make data for '{make_key}' must be a list")
                return ValidationResult(False, errors, warnings)
            if len(make_data) == 0:
                warnings.append(f"Make '{make_key}' has no year entries")
            # Validate each year entry and its models.
            for i, year_entry in enumerate(make_data):
                if not isinstance(year_entry, dict):
                    errors.append(f"Year entry {i} must be a dictionary")
                    continue
                if 'year' not in year_entry:
                    errors.append(f"Year entry {i} missing 'year' field")
                if 'models' not in year_entry:
                    errors.append(f"Year entry {i} missing 'models' field")
                    continue
                # BUGFIX: use .get() so a missing 'year' surfaces as a
                # TypeError (caught below) instead of an uncaught KeyError
                # that would abort validation with a generic error.
                try:
                    year = int(year_entry.get('year'))
                    if year < 1900 or year > 2030:
                        warnings.append(f"Unusual year value: {year}")
                except (ValueError, TypeError):
                    errors.append(f"Invalid year value in entry {i}: {year_entry.get('year')}")
                models = year_entry['models']
                if not isinstance(models, list):
                    errors.append(f"Models in year entry {i} must be a list")
                    continue
                for j, model in enumerate(models):
                    if not isinstance(model, dict):
                        errors.append(f"Model {j} in year {year_entry.get('year')} must be a dictionary")
                        continue
                    if 'name' not in model:
                        errors.append(f"Model {j} in year {year_entry.get('year')} missing 'name' field")
                    # Engines and submodels are optional but must be lists if present.
                    if 'engines' in model and not isinstance(model['engines'], list):
                        errors.append(f"Engines for model {model.get('name')} must be a list")
                    if 'submodels' in model and not isinstance(model['submodels'], list):
                        errors.append(f"Submodels for model {model.get('name')} must be a list")
        except Exception as e:
            errors.append(f"Unexpected error during validation: {str(e)}")
        is_valid = len(errors) == 0
        # BUGFIX: these log lines previously contained a literal "(unknown)"
        # placeholder; interpolate the actual filename for useful context.
        if errors:
            logger.warning(f"JSON validation failed for {filename}: {len(errors)} errors")
        elif warnings:
            logger.info(f"JSON validation for {filename}: {len(warnings)} warnings")
        else:
            logger.debug(f"JSON validation passed for {filename}")
        return ValidationResult(is_valid, errors, warnings)

    def _group_models_by_name(self, year_entries: List[dict],
                              processing_errors: List[str],
                              processing_warnings: List[str]) -> Dict[str, Dict[str, set]]:
        """Merge model entries across years into name -> {years, engines, trims} sets.

        Mutates the shared error/warning lists in place; skips entries whose
        year cannot be coerced to int.
        """
        models_by_name: Dict[str, Dict[str, set]] = {}
        for year_entry in year_entries:
            try:
                year = int(year_entry['year'])
                for model_entry in year_entry.get('models', []):
                    model_name = model_entry.get('name', '').strip()
                    if not model_name:
                        processing_warnings.append(f"Empty model name in year {year}")
                        continue
                    info = models_by_name.setdefault(
                        model_name, {'years': set(), 'engines': set(), 'trims': set()})
                    info['years'].add(year)
                    # Sets de-duplicate repeated engine strings and trims
                    # that recur across model years.
                    for engine_str in model_entry.get('engines', []):
                        if engine_str and engine_str.strip():
                            info['engines'].add(engine_str.strip())
                    for trim in model_entry.get('submodels', []):
                        if trim and trim.strip():
                            info['trims'].add(trim.strip())
            except (ValueError, TypeError) as e:
                processing_errors.append(f"Error processing year entry: {str(e)}")
                continue
        return models_by_name

    def _build_models(self, make_name: str,
                      models_by_name: Dict[str, Dict[str, set]],
                      processing_errors: List[str]) -> List[ModelData]:
        """Convert grouped model info into ModelData, parsing engine specs.

        An empty engine set is treated as an electric vehicle and receives
        the parser's default electric-motor spec.
        """
        models: List[ModelData] = []
        for model_name, model_info in models_by_name.items():
            try:
                engine_specs = []
                is_electric = False
                if not model_info['engines']:
                    # Empty engines array - electric vehicle.
                    is_electric = True
                    engine_specs = [self.engine_parser.create_electric_motor()]
                    logger.debug(f"Created electric motor for {make_name} {model_name}")
                else:
                    for engine_str in model_info['engines']:
                        engine_specs.append(self.engine_parser.parse_engine_string(engine_str))
                # Remove duplicate engines based on key attributes.
                unique_engines = self.engine_parser.get_unique_engines(engine_specs)
                models.append(ModelData(
                    name=model_name,
                    years=sorted(model_info['years']),
                    engines=unique_engines,
                    trims=sorted(model_info['trims']),
                    is_electric=is_electric
                ))
            except Exception as e:
                processing_errors.append(f"Error processing model {model_name}: {str(e)}")
                continue
        return models

    def extract_make_data(self, json_file_path: str) -> MakeData:
        """
        Extract complete make data from a single JSON file.

        Args:
            json_file_path: Path to JSON file

        Returns:
            MakeData with extracted and normalized data.  Never raises:
            failures are reported via ``processing_errors`` with an empty
            model list.
        """
        filename = os.path.basename(json_file_path)
        # BUGFIX: interpolate the filename (log previously printed a
        # literal "(unknown)" placeholder).
        logger.info(f"Extracting make data from {filename}")
        processing_errors: List[str] = []
        processing_warnings: List[str] = []
        try:
            # Load and validate JSON.
            with open(json_file_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
            validation = self.validate_json_structure(json_data, filename)
            processing_errors.extend(validation.errors)
            processing_warnings.extend(validation.warnings)
            if not validation.is_valid:
                logger.error(f"JSON validation failed for {filename}")
                return MakeData(
                    name=self.make_mapper.normalize_make_name(filename),
                    filename=filename,
                    models=[],
                    processing_errors=processing_errors,
                    processing_warnings=processing_warnings
                )
            # Get normalized display name for the make.
            make_name = self.make_mapper.normalize_make_name(filename)
            logger.debug(f"Normalized make name: {filename} -> {make_name}")
            make_key = list(json_data.keys())[0]
            year_entries = json_data[make_key]
            # Group by model name across years, then materialize ModelData.
            models_by_name = self._group_models_by_name(
                year_entries, processing_errors, processing_warnings)
            models = self._build_models(make_name, models_by_name, processing_errors)
            models.sort(key=lambda m: m.name)
            make_data = MakeData(
                name=make_name,
                filename=filename,
                models=models,
                processing_errors=processing_errors,
                processing_warnings=processing_warnings
            )
            logger.info(f"Extracted {make_name}: {len(models)} models, "
                        f"{make_data.total_engines} engines, {make_data.electric_models_count} electric models")
            return make_data
        except Exception as e:
            logger.error(f"Failed to extract make data from {filename}: {str(e)}")
            processing_errors.append(f"Fatal error: {str(e)}")
            return MakeData(
                name=self.make_mapper.normalize_make_name(filename),
                filename=filename,
                models=[],
                processing_errors=processing_errors,
                processing_warnings=processing_warnings
            )

    def extract_all_makes(self, sources_dir: str) -> ExtractionResult:
        """
        Process all JSON files in the sources directory.

        Args:
            sources_dir: Directory containing JSON make files

        Returns:
            ExtractionResult with all extracted data and statistics.  A file
            whose extraction produced any processing error counts as failed.
        """
        logger.info(f"Starting extraction of all makes from {sources_dir}")
        pattern = os.path.join(sources_dir, '*.json')
        json_files = glob.glob(pattern)
        if not json_files:
            logger.warning(f"No JSON files found in {sources_dir}")
            return ExtractionResult(
                makes=[],
                total_files_processed=0,
                successful_extractions=0,
                failed_extractions=0,
                total_models=0,
                total_engines=0,
                total_electric_models=0
            )
        logger.info(f"Found {len(json_files)} JSON files to process")
        makes = []
        successful_extractions = 0
        failed_extractions = 0
        # Sort files for deterministic processing order.
        json_files.sort()
        for json_file in json_files:
            try:
                make_data = self.extract_make_data(json_file)
                makes.append(make_data)
                if make_data.processing_errors:
                    failed_extractions += 1
                    logger.error(f"Extraction completed with errors for {make_data.filename}")
                else:
                    successful_extractions += 1
                    logger.debug(f"Extraction successful for {make_data.filename}")
            except Exception as e:
                # extract_make_data shouldn't raise, but keep the run alive
                # and record a placeholder MakeData if it somehow does.
                logger.error(f"Fatal error processing {os.path.basename(json_file)}: {str(e)}")
                failed_extractions += 1
                filename = os.path.basename(json_file)
                makes.append(MakeData(
                    name=self.make_mapper.normalize_make_name(filename),
                    filename=filename,
                    models=[],
                    processing_errors=[f"Fatal extraction error: {str(e)}"],
                    processing_warnings=[]
                ))
        # Aggregate statistics across all makes.
        total_models = sum(make.total_models for make in makes)
        total_engines = sum(make.total_engines for make in makes)
        total_electric_models = sum(make.electric_models_count for make in makes)
        result = ExtractionResult(
            makes=makes,
            total_files_processed=len(json_files),
            successful_extractions=successful_extractions,
            failed_extractions=failed_extractions,
            total_models=total_models,
            total_engines=total_engines,
            total_electric_models=total_electric_models
        )
        logger.info(f"Extraction complete: {successful_extractions}/{len(json_files)} successful, "
                    f"{total_models} models, {total_engines} engines, {total_electric_models} electric models")
        return result

    def get_extraction_statistics(self, result: ExtractionResult) -> Dict[str, Any]:
        """
        Get detailed extraction statistics.

        Args:
            result: ExtractionResult from extract_all_makes

        Returns:
            Dictionary with 'files', 'data', 'quality', and per-make 'makes'
            sections.
        """
        # BUGFIX: annotation was Dict[str, any] (the builtin function), not
        # typing.Any.
        stats = {
            'files': {
                'total_processed': result.total_files_processed,
                'successful': result.successful_extractions,
                'failed': result.failed_extractions,
                'success_rate': result.success_rate
            },
            'data': {
                'total_makes': len(result.makes),
                'total_models': result.total_models,
                'total_engines': result.total_engines,
                'electric_models': result.total_electric_models
            },
            'quality': {
                'makes_with_errors': sum(1 for make in result.makes if make.processing_errors),
                'makes_with_warnings': sum(1 for make in result.makes if make.processing_warnings),
                'total_errors': sum(len(make.processing_errors) for make in result.makes),
                'total_warnings': sum(len(make.processing_warnings) for make in result.makes)
            }
        }
        # Add make-specific statistics.
        stats['makes'] = [
            {
                'name': make.name,
                'filename': make.filename,
                'models': make.total_models,
                'engines': make.total_engines,
                'trims': make.total_trims,
                'electric_models': make.electric_models_count,
                'year_range': make.year_range,
                'errors': len(make.processing_errors),
                'warnings': len(make.processing_warnings)
            }
            for make in result.makes
        ]
        return stats

    def print_extraction_report(self, result: ExtractionResult) -> None:
        """
        Print a detailed extraction report to stdout.

        Args:
            result: ExtractionResult from extract_all_makes
        """
        stats = self.get_extraction_statistics(result)
        print("🚀 JSON EXTRACTION REPORT")
        print("=" * 50)
        # File processing summary.
        print("\n📁 FILE PROCESSING")
        print(f" Files processed: {stats['files']['total_processed']}")
        print(f" Successful: {stats['files']['successful']}")
        print(f" Failed: {stats['files']['failed']}")
        print(f" Success rate: {stats['files']['success_rate']:.1%}")
        # Data summary.
        print("\n📊 DATA EXTRACTED")
        print(f" Makes: {stats['data']['total_makes']}")
        print(f" Models: {stats['data']['total_models']}")
        print(f" Engines: {stats['data']['total_engines']}")
        print(f" Electric models: {stats['data']['electric_models']}")
        # Quality summary.
        print("\n🔍 QUALITY ASSESSMENT")
        print(f" Makes with errors: {stats['quality']['makes_with_errors']}")
        print(f" Makes with warnings: {stats['quality']['makes_with_warnings']}")
        print(f" Total errors: {stats['quality']['total_errors']}")
        print(f" Total warnings: {stats['quality']['total_warnings']}")
        # Show problematic makes.
        if stats['quality']['makes_with_errors'] > 0:
            print("\n⚠️ MAKES WITH ERRORS:")
            for make in result.makes:
                if make.processing_errors:
                    print(f" {make.name} ({make.filename}): {len(make.processing_errors)} errors")
        # Show top makes by data volume.
        print("\n🏆 TOP MAKES BY MODEL COUNT:")
        top_makes = sorted(result.makes, key=lambda m: m.total_models, reverse=True)[:10]
        for make in top_makes:
            print(f" {make.name}: {make.total_models} models, {make.total_engines} engines")
# Example usage and testing functions
def example_usage():
    """Demonstrate JsonExtractor usage against the sources/makes directory."""
    print("🚀 JsonExtractor Example Usage")
    print("=" * 40)
    # Resolve the utility imports whether this runs as a package module or
    # as a standalone script.
    try:
        from ..utils.make_name_mapper import MakeNameMapper
        from ..utils.engine_spec_parser import EngineSpecParser
    except ImportError:
        # Fallback for direct execution.
        # BUGFIX: the original also did `import os` here, which made `os`
        # local to this function for BOTH branches; when the relative import
        # succeeded, the later os.path calls raised UnboundLocalError.  We
        # now rely on the module-level `os` import instead.
        import sys
        sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
        from utils.make_name_mapper import MakeNameMapper
        from utils.engine_spec_parser import EngineSpecParser
    # Initialize utilities and the extractor.
    make_mapper = MakeNameMapper()
    engine_parser = EngineSpecParser()
    extractor = JsonExtractor(make_mapper, engine_parser)
    sources_dir = "sources/makes"
    if not os.path.exists(sources_dir):
        print(f"Sources directory not found: {sources_dir}")
        return
    # Extract a single make first, if the sample file is available.
    toyota_file = os.path.join(sources_dir, "toyota.json")
    if os.path.exists(toyota_file):
        print("\n📄 Extracting from toyota.json...")
        toyota_data = extractor.extract_make_data(toyota_file)
        print(f" Make: {toyota_data.name}")
        print(f" Models: {toyota_data.total_models}")
        print(f" Engines: {toyota_data.total_engines}")
        print(f" Electric models: {toyota_data.electric_models_count}")
        print(f" Year range: {toyota_data.year_range}")
        if toyota_data.processing_errors:
            print(f" Errors: {len(toyota_data.processing_errors)}")
        if toyota_data.processing_warnings:
            print(f" Warnings: {len(toyota_data.processing_warnings)}")
    # Extract all makes and print the full report.
    print("\n🔄 Extracting all makes...")
    result = extractor.extract_all_makes(sources_dir)
    extractor.print_extraction_report(result)
# Allow running this module directly as a demo script.
if __name__ == "__main__":
    example_usage()