"""
|
|
JSON Extractor for Manual Vehicle Data Processing
|
|
|
|
Extracts and normalizes vehicle data from JSON files into database-ready structures.
|
|
Integrates with MakeNameMapper and EngineSpecParser utilities for comprehensive
|
|
data processing with L→I normalization and make name conversion.
|
|
|
|
Key Features:
|
|
- Extract make/model/year/trim/engine data from JSON files
|
|
- Handle electric vehicles (empty engines → default motor)
|
|
- Data validation and quality assurance
|
|
- Progress tracking and error reporting
|
|
|
|
Usage:
|
|
extractor = JsonExtractor(make_mapper, engine_parser)
|
|
make_data = extractor.extract_make_data('sources/makes/toyota.json')
|
|
all_data = extractor.extract_all_makes('sources/makes/')
|
|
"""
|
|
|
|
import glob
import json
import logging
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Generator, List, Optional, Tuple
|
|
|
|
# Import our utilities (handle both relative and direct imports)
|
|
try:
|
|
from ..utils.make_name_mapper import MakeNameMapper
|
|
from ..utils.engine_spec_parser import EngineSpecParser, EngineSpec
|
|
except ImportError:
|
|
# Fallback for direct execution
|
|
import sys
|
|
import os
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
|
from utils.make_name_mapper import MakeNameMapper
|
|
from utils.engine_spec_parser import EngineSpecParser, EngineSpec
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class ValidationResult:
    """Outcome of structurally validating one make JSON file."""
    is_valid: bool       # True when no blocking errors were found
    errors: List[str]    # structural problems that prevent processing
    warnings: List[str]  # non-blocking data-quality observations

    @property
    def has_errors(self) -> bool:
        """True if at least one error was recorded."""
        return bool(self.errors)

    @property
    def has_warnings(self) -> bool:
        """True if at least one warning was recorded."""
        return bool(self.warnings)
@dataclass
class ModelData:
    """One vehicle model aggregated across years, with normalized engines/trims."""
    name: str                  # model name as it appears in the JSON
    years: List[int]           # every year this model was seen in
    engines: List[EngineSpec]  # parsed, deduplicated engine specs
    trims: List[str]           # trim names collected from 'submodels'
    is_electric: bool = False  # set when the source had an empty engines array

    @property
    def total_trims(self) -> int:
        """Number of distinct trims."""
        return len(self.trims)

    @property
    def total_engines(self) -> int:
        """Number of distinct engines."""
        return len(self.engines)

    @property
    def year_range(self) -> str:
        """Human-readable span, e.g. '2010-2015', '2012', or 'Unknown'."""
        if not self.years:
            return "Unknown"
        if len(self.years) > 1:
            return f"{min(self.years)}-{max(self.years)}"
        return str(self.years[0])
@dataclass
class MakeData:
    """All extracted data for one make, plus processing diagnostics."""
    name: str                        # normalized display name (e.g. "Alfa Romeo")
    filename: str                    # original JSON filename
    models: List[ModelData]          # extracted models, sorted by name
    processing_errors: List[str]     # errors encountered during extraction
    processing_warnings: List[str]   # warnings encountered during extraction

    @property
    def total_models(self) -> int:
        """Number of models extracted for this make."""
        return len(self.models)

    @property
    def total_engines(self) -> int:
        """Engine count summed over all models."""
        return sum(m.total_engines for m in self.models)

    @property
    def total_trims(self) -> int:
        """Trim count summed over all models."""
        return sum(m.total_trims for m in self.models)

    @property
    def electric_models_count(self) -> int:
        """How many models were flagged as electric."""
        return sum(1 for m in self.models if m.is_electric)

    @property
    def year_range(self) -> str:
        """Span over every year of every model, 'Unknown' when empty."""
        all_years = [year for m in self.models for year in m.years]
        if not all_years:
            return "Unknown"
        if len(set(all_years)) > 1:
            return f"{min(all_years)}-{max(all_years)}"
        return str(all_years[0])
@dataclass
class ExtractionResult:
    """Aggregate outcome of extracting every make file in a directory."""
    makes: List[MakeData]         # one entry per processed file (including failures)
    total_files_processed: int
    successful_extractions: int
    failed_extractions: int
    total_models: int
    total_engines: int
    total_electric_models: int

    @property
    def success_rate(self) -> float:
        """Fraction of files that extracted cleanly; 0.0 when nothing ran."""
        if self.total_files_processed > 0:
            return self.successful_extractions / self.total_files_processed
        return 0.0
class JsonExtractor:
    """Extract normalized vehicle data from JSON files.

    Coordinates MakeNameMapper (filename → display make name) and
    EngineSpecParser (engine string → EngineSpec, with L→I normalization)
    to turn per-make JSON source files into database-ready MakeData objects.
    """

    def __init__(self, make_mapper: MakeNameMapper, engine_parser: EngineSpecParser):
        """
        Initialize JSON extractor with utilities

        Args:
            make_mapper: For normalizing make names from filenames
            engine_parser: For parsing engine specifications with L→I normalization
        """
        self.make_mapper = make_mapper
        self.engine_parser = engine_parser

        logger.info("JsonExtractor initialized with MakeNameMapper and EngineSpecParser")

    def validate_json_structure(self, json_data: dict, filename: str) -> ValidationResult:
        """
        Validate JSON structure before processing

        Expected shape: {"MakeName": [{"year": int, "models": [{"name": str,
        "engines": [str, ...], "submodels": [str, ...]}, ...]}, ...]}

        Args:
            json_data: Loaded JSON data
            filename: Source filename for error context

        Returns:
            ValidationResult with validity status and any issues
        """
        errors: List[str] = []
        warnings: List[str] = []

        try:
            # Check top-level structure
            if not isinstance(json_data, dict):
                errors.append("JSON must be a dictionary")
                return ValidationResult(False, errors, warnings)

            # Should have exactly one key (the make name)
            if len(json_data) != 1:
                errors.append(f"JSON should have exactly one top-level key, found {len(json_data)}")
                return ValidationResult(False, errors, warnings)

            make_key = next(iter(json_data))
            make_data = json_data[make_key]

            # Make data should be a list of year entries
            if not isinstance(make_data, list):
                errors.append(f"Make data for '{make_key}' must be a list")
                return ValidationResult(False, errors, warnings)

            if len(make_data) == 0:
                warnings.append(f"Make '{make_key}' has no year entries")

            # Validate year entries
            for i, year_entry in enumerate(make_data):
                if not isinstance(year_entry, dict):
                    errors.append(f"Year entry {i} must be a dictionary")
                    continue

                # Check required fields; report both problems before skipping
                if 'year' not in year_entry:
                    errors.append(f"Year entry {i} missing 'year' field")
                if 'models' not in year_entry:
                    errors.append(f"Year entry {i} missing 'models' field")
                if 'year' not in year_entry or 'models' not in year_entry:
                    # BUGFIX: a missing 'year' previously fell through to
                    # int(year_entry['year']); the resulting KeyError hit the
                    # blanket handler below and aborted validation of all
                    # remaining entries.
                    continue

                # Validate year value (range check is advisory only)
                try:
                    year = int(year_entry['year'])
                    if year < 1900 or year > 2030:
                        warnings.append(f"Unusual year value: {year}")
                except (ValueError, TypeError):
                    errors.append(f"Invalid year value in entry {i}: {year_entry.get('year')}")

                # Validate models
                models = year_entry['models']
                if not isinstance(models, list):
                    errors.append(f"Models in year entry {i} must be a list")
                    continue

                for j, model in enumerate(models):
                    if not isinstance(model, dict):
                        errors.append(f"Model {j} in year {year_entry.get('year')} must be a dictionary")
                        continue

                    if 'name' not in model:
                        errors.append(f"Model {j} in year {year_entry.get('year')} missing 'name' field")

                    # Engines and submodels are optional but should be lists if present
                    if 'engines' in model and not isinstance(model['engines'], list):
                        errors.append(f"Engines for model {model.get('name')} must be a list")

                    if 'submodels' in model and not isinstance(model['submodels'], list):
                        errors.append(f"Submodels for model {model.get('name')} must be a list")

        except Exception as e:
            # Defensive: validation itself must never raise to the caller.
            errors.append(f"Unexpected error during validation: {str(e)}")

        is_valid = len(errors) == 0

        # BUGFIX: these log messages previously contained the literal
        # "(unknown)" instead of the filename passed in for context.
        if errors:
            logger.warning(f"JSON validation failed for {filename}: {len(errors)} errors")
        elif warnings:
            logger.info(f"JSON validation for {filename}: {len(warnings)} warnings")
        else:
            logger.debug(f"JSON validation passed for {filename}")

        return ValidationResult(is_valid, errors, warnings)

    def extract_make_data(self, json_file_path: str) -> MakeData:
        """
        Extract complete make data from a single JSON file

        Never raises: fatal problems are captured in the returned
        MakeData.processing_errors list instead.

        Args:
            json_file_path: Path to JSON file

        Returns:
            MakeData with extracted and normalized data
        """
        filename = os.path.basename(json_file_path)
        # BUGFIX: log messages previously printed the literal "(unknown)"
        # instead of the file being processed.
        logger.info(f"Extracting make data from {filename}")

        processing_errors: List[str] = []
        processing_warnings: List[str] = []

        try:
            # Load and validate JSON
            with open(json_file_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            validation = self.validate_json_structure(json_data, filename)
            processing_errors.extend(validation.errors)
            processing_warnings.extend(validation.warnings)

            if not validation.is_valid:
                logger.error(f"JSON validation failed for {filename}")
                return MakeData(
                    name=self.make_mapper.normalize_make_name(filename),
                    filename=filename,
                    models=[],
                    processing_errors=processing_errors,
                    processing_warnings=processing_warnings
                )

            # Get normalized make name
            make_name = self.make_mapper.normalize_make_name(filename)
            logger.debug(f"Normalized make name: {filename} → {make_name}")

            # Extract data: the single top-level key holds the year entries
            make_key = next(iter(json_data))
            year_entries = json_data[make_key]

            # Group models by name across all years; sets deduplicate
            # repeated engines/trims/years as we accumulate.
            models_by_name: Dict[str, Dict[str, set]] = {}

            for year_entry in year_entries:
                try:
                    year = int(year_entry['year'])
                    models_list = year_entry.get('models', [])

                    for model_entry in models_list:
                        model_name = model_entry.get('name', '').strip()
                        if not model_name:
                            processing_warnings.append(f"Empty model name in year {year}")
                            continue

                        # Initialize model data if not seen before
                        if model_name not in models_by_name:
                            models_by_name[model_name] = {
                                'years': set(),
                                'engines': set(),
                                'trims': set()
                            }

                        # Add year
                        models_by_name[model_name]['years'].add(year)

                        # Add engines (skip empty / whitespace-only strings)
                        for engine_str in model_entry.get('engines', []):
                            if engine_str and engine_str.strip():
                                models_by_name[model_name]['engines'].add(engine_str.strip())

                        # Add trims (from submodels)
                        for trim in model_entry.get('submodels', []):
                            if trim and trim.strip():
                                models_by_name[model_name]['trims'].add(trim.strip())

                except (ValueError, TypeError) as e:
                    processing_errors.append(f"Error processing year entry: {str(e)}")
                    continue

            # Convert to ModelData objects
            models = []
            for model_name, model_info in models_by_name.items():
                try:
                    engine_specs = []
                    is_electric = False

                    if not model_info['engines']:
                        # Empty engines array - electric vehicle
                        is_electric = True
                        electric_spec = self.engine_parser.create_electric_motor()
                        engine_specs = [electric_spec]
                        logger.debug(f"Created electric motor for {make_name} {model_name}")
                    else:
                        # Parse each engine string
                        for engine_str in model_info['engines']:
                            spec = self.engine_parser.parse_engine_string(engine_str)
                            engine_specs.append(spec)

                    # Remove duplicate engines based on key attributes
                    unique_engines = self.engine_parser.get_unique_engines(engine_specs)

                    models.append(ModelData(
                        name=model_name,
                        years=sorted(model_info['years']),
                        engines=unique_engines,
                        trims=sorted(model_info['trims']),
                        is_electric=is_electric
                    ))

                except Exception as e:
                    # One bad model must not sink the rest of the make.
                    processing_errors.append(f"Error processing model {model_name}: {str(e)}")
                    continue

            # Sort models by name for deterministic output
            models.sort(key=lambda m: m.name)

            make_data = MakeData(
                name=make_name,
                filename=filename,
                models=models,
                processing_errors=processing_errors,
                processing_warnings=processing_warnings
            )

            logger.info(f"Extracted {filename}: {len(models)} models, "
                        f"{make_data.total_engines} engines, {make_data.electric_models_count} electric models")

            return make_data

        except Exception as e:
            logger.error(f"Failed to extract make data from {filename}: {str(e)}")
            processing_errors.append(f"Fatal error: {str(e)}")

            return MakeData(
                name=self.make_mapper.normalize_make_name(filename),
                filename=filename,
                models=[],
                processing_errors=processing_errors,
                processing_warnings=processing_warnings
            )

    def extract_all_makes(self, sources_dir: str) -> ExtractionResult:
        """
        Process all JSON files in the sources directory

        Args:
            sources_dir: Directory containing JSON make files

        Returns:
            ExtractionResult with all extracted data and statistics
        """
        logger.info(f"Starting extraction of all makes from {sources_dir}")

        # Find all JSON files
        pattern = os.path.join(sources_dir, '*.json')
        json_files = glob.glob(pattern)

        if not json_files:
            logger.warning(f"No JSON files found in {sources_dir}")
            return ExtractionResult(
                makes=[],
                total_files_processed=0,
                successful_extractions=0,
                failed_extractions=0,
                total_models=0,
                total_engines=0,
                total_electric_models=0
            )

        logger.info(f"Found {len(json_files)} JSON files to process")

        makes = []
        successful_extractions = 0
        failed_extractions = 0

        # Sort files for consistent processing order
        json_files.sort()

        for json_file in json_files:
            try:
                make_data = self.extract_make_data(json_file)
                makes.append(make_data)

                if make_data.processing_errors:
                    failed_extractions += 1
                    logger.error(f"Extraction completed with errors for {make_data.filename}")
                else:
                    successful_extractions += 1
                    logger.debug(f"Extraction successful for {make_data.filename}")

            except Exception as e:
                # extract_make_data is designed not to raise, but guard anyway
                # so one bad file cannot abort the whole batch.
                logger.error(f"Fatal error processing {os.path.basename(json_file)}: {str(e)}")
                failed_extractions += 1

                # Create minimal make data for failed file
                filename = os.path.basename(json_file)
                makes.append(MakeData(
                    name=self.make_mapper.normalize_make_name(filename),
                    filename=filename,
                    models=[],
                    processing_errors=[f"Fatal extraction error: {str(e)}"],
                    processing_warnings=[]
                ))

        # Calculate statistics
        total_models = sum(make.total_models for make in makes)
        total_engines = sum(make.total_engines for make in makes)
        total_electric_models = sum(make.electric_models_count for make in makes)

        result = ExtractionResult(
            makes=makes,
            total_files_processed=len(json_files),
            successful_extractions=successful_extractions,
            failed_extractions=failed_extractions,
            total_models=total_models,
            total_engines=total_engines,
            total_electric_models=total_electric_models
        )

        logger.info(f"Extraction complete: {successful_extractions}/{len(json_files)} successful, "
                    f"{total_models} models, {total_engines} engines, {total_electric_models} electric models")

        return result

    def get_extraction_statistics(self, result: ExtractionResult) -> Dict[str, Any]:
        """
        Get detailed extraction statistics

        Args:
            result: ExtractionResult from extract_all_makes

        Returns:
            Dictionary with 'files', 'data', 'quality' sections plus a
            per-make breakdown under 'makes'
        """
        # BUGFIX: return annotation used the builtin `any` instead of typing.Any.
        stats = {
            'files': {
                'total_processed': result.total_files_processed,
                'successful': result.successful_extractions,
                'failed': result.failed_extractions,
                'success_rate': result.success_rate
            },
            'data': {
                'total_makes': len(result.makes),
                'total_models': result.total_models,
                'total_engines': result.total_engines,
                'electric_models': result.total_electric_models
            },
            'quality': {
                'makes_with_errors': sum(1 for make in result.makes if make.processing_errors),
                'makes_with_warnings': sum(1 for make in result.makes if make.processing_warnings),
                'total_errors': sum(len(make.processing_errors) for make in result.makes),
                'total_warnings': sum(len(make.processing_warnings) for make in result.makes)
            }
        }

        # Add make-specific statistics
        stats['makes'] = [
            {
                'name': make.name,
                'filename': make.filename,
                'models': make.total_models,
                'engines': make.total_engines,
                'trims': make.total_trims,
                'electric_models': make.electric_models_count,
                'year_range': make.year_range,
                'errors': len(make.processing_errors),
                'warnings': len(make.processing_warnings)
            }
            for make in result.makes
        ]

        return stats

    def print_extraction_report(self, result: ExtractionResult) -> None:
        """
        Print detailed extraction report

        Args:
            result: ExtractionResult from extract_all_makes
        """
        stats = self.get_extraction_statistics(result)

        print("🚀 JSON EXTRACTION REPORT")
        print("=" * 50)

        # File processing summary
        print("\n📁 FILE PROCESSING")
        print(f"   Files processed: {stats['files']['total_processed']}")
        print(f"   Successful: {stats['files']['successful']}")
        print(f"   Failed: {stats['files']['failed']}")
        print(f"   Success rate: {stats['files']['success_rate']:.1%}")

        # Data summary
        print("\n📊 DATA EXTRACTED")
        print(f"   Makes: {stats['data']['total_makes']}")
        print(f"   Models: {stats['data']['total_models']}")
        print(f"   Engines: {stats['data']['total_engines']}")
        print(f"   Electric models: {stats['data']['electric_models']}")

        # Quality summary
        print("\n🔍 QUALITY ASSESSMENT")
        print(f"   Makes with errors: {stats['quality']['makes_with_errors']}")
        print(f"   Makes with warnings: {stats['quality']['makes_with_warnings']}")
        print(f"   Total errors: {stats['quality']['total_errors']}")
        print(f"   Total warnings: {stats['quality']['total_warnings']}")

        # Show problematic makes
        if stats['quality']['makes_with_errors'] > 0:
            print("\n⚠️  MAKES WITH ERRORS:")
            for make in result.makes:
                if make.processing_errors:
                    print(f"   {make.name} ({make.filename}): {len(make.processing_errors)} errors")

        # Show top makes by data volume
        print("\n🏆 TOP MAKES BY MODEL COUNT:")
        top_makes = sorted(result.makes, key=lambda m: m.total_models, reverse=True)[:10]
        for make in top_makes:
            print(f"   {make.name}: {make.total_models} models, {make.total_engines} engines")
# Example usage and testing functions
|
|
def example_usage():
    """Demonstrate JsonExtractor usage"""
    print("🚀 JsonExtractor Example Usage")
    print("=" * 40)

    # Use direct imports for example usage
    try:
        from ..utils.make_name_mapper import MakeNameMapper
        from ..utils.engine_spec_parser import EngineSpecParser
    except ImportError:
        # Fallback for direct execution
        import sys
        import os
        sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
        from utils.make_name_mapper import MakeNameMapper
        from utils.engine_spec_parser import EngineSpecParser

    # Wire up the extractor with its two helper utilities
    extractor = JsonExtractor(MakeNameMapper(), EngineSpecParser())

    sources_dir = "sources/makes"
    if not os.path.exists(sources_dir):
        # Guard clause: nothing to demonstrate without source data
        print(f"Sources directory not found: {sources_dir}")
        return

    # Single-make extraction demo
    toyota_file = os.path.join(sources_dir, "toyota.json")
    if os.path.exists(toyota_file):
        print("\n📄 Extracting from toyota.json...")
        toyota_data = extractor.extract_make_data(toyota_file)

        print(f"   Make: {toyota_data.name}")
        print(f"   Models: {toyota_data.total_models}")
        print(f"   Engines: {toyota_data.total_engines}")
        print(f"   Electric models: {toyota_data.electric_models_count}")
        print(f"   Year range: {toyota_data.year_range}")

        if toyota_data.processing_errors:
            print(f"   Errors: {len(toyota_data.processing_errors)}")
        if toyota_data.processing_warnings:
            print(f"   Warnings: {len(toyota_data.processing_warnings)}")

    # Whole-directory extraction demo
    print("\n🔄 Extracting all makes...")
    report = extractor.extract_all_makes(sources_dir)
    extractor.print_extraction_report(report)
if __name__ == "__main__":
|
|
example_usage() |