Initial Commit
This commit is contained in:
0
mvp-platform-services/vehicles/etl/extractors/__init__.py
Executable file
0
mvp-platform-services/vehicles/etl/extractors/__init__.py
Executable file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
629
mvp-platform-services/vehicles/etl/extractors/json_extractor.py
Normal file
629
mvp-platform-services/vehicles/etl/extractors/json_extractor.py
Normal file
@@ -0,0 +1,629 @@
|
||||
"""
|
||||
JSON Extractor for Manual Vehicle Data Processing
|
||||
|
||||
Extracts and normalizes vehicle data from JSON files into database-ready structures.
|
||||
Integrates with MakeNameMapper and EngineSpecParser utilities for comprehensive
|
||||
data processing with L→I normalization and make name conversion.
|
||||
|
||||
Key Features:
|
||||
- Extract make/model/year/trim/engine data from JSON files
|
||||
- Handle electric vehicles (empty engines → default motor)
|
||||
- Data validation and quality assurance
|
||||
- Progress tracking and error reporting
|
||||
|
||||
Usage:
|
||||
extractor = JsonExtractor(make_mapper, engine_parser)
|
||||
make_data = extractor.extract_make_data('sources/makes/toyota.json')
|
||||
all_data = extractor.extract_all_makes('sources/makes/')
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import glob
|
||||
import logging
|
||||
from typing import List, Dict, Optional, Generator, Tuple
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
# Import our utilities (handle both relative and direct imports)
|
||||
try:
|
||||
from ..utils.make_name_mapper import MakeNameMapper
|
||||
from ..utils.engine_spec_parser import EngineSpecParser, EngineSpec
|
||||
except ImportError:
|
||||
# Fallback for direct execution
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||
from utils.make_name_mapper import MakeNameMapper
|
||||
from utils.engine_spec_parser import EngineSpecParser, EngineSpec
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ValidationResult:
    """Outcome of validating the structure of one make's JSON document.

    The two properties are convenience flags derived from the message lists.
    """
    is_valid: bool       # overall verdict supplied by the validator
    errors: List[str]    # human-readable error messages
    warnings: List[str]  # human-readable warning messages

    @property
    def has_errors(self) -> bool:
        """Return True when at least one error message was recorded."""
        return bool(self.errors)

    @property
    def has_warnings(self) -> bool:
        """Return True when at least one warning message was recorded."""
        return bool(self.warnings)
|
||||
|
||||
|
||||
@dataclass
class ModelData:
    """Extracted model data with normalized engines and trims.

    ``engines`` is annotated with a forward reference so the class can be
    defined before ``EngineSpec`` is resolved.
    """
    name: str                    # Model name from JSON
    years: List[int]             # Years this model appears in
    engines: List["EngineSpec"]  # Parsed and normalized engines
    trims: List[str]             # Trim names (from submodels)
    is_electric: bool = False    # True if empty engines array detected

    @property
    def total_trims(self) -> int:
        """Number of trims recorded for this model."""
        return len(self.trims)

    @property
    def total_engines(self) -> int:
        """Number of engines recorded for this model."""
        return len(self.engines)

    @property
    def year_range(self) -> str:
        """Return the span of model years as text.

        Returns:
            "Unknown" when no years are recorded, a single year such as
            "2020" when only one distinct year is present, otherwise
            "<first>-<last>".
        """
        if not self.years:
            return "Unknown"
        first, last = min(self.years), max(self.years)
        # Compare the extremes rather than the list length so that repeated
        # years (e.g. [2020, 2020]) render as "2020", matching the
        # distinct-year logic used by MakeData.year_range.
        return f"{first}-{last}" if first != last else str(first)
|
||||
|
||||
|
||||
@dataclass
class MakeData:
    """All data extracted for one make, plus messages collected on the way.

    The properties aggregate over ``models``; they read each model's
    ``total_engines``, ``total_trims``, ``is_electric`` and ``years``.
    """
    name: str                       # Normalized display name (e.g., "Alfa Romeo")
    filename: str                   # Original JSON filename
    models: List["ModelData"]
    processing_errors: List[str]    # Any errors during extraction
    processing_warnings: List[str]  # Any warnings during extraction

    @property
    def total_models(self) -> int:
        """Number of models extracted for this make."""
        return len(self.models)

    @property
    def total_engines(self) -> int:
        """Sum of per-model engine counts."""
        return sum(entry.total_engines for entry in self.models)

    @property
    def total_trims(self) -> int:
        """Sum of per-model trim counts."""
        return sum(entry.total_trims for entry in self.models)

    @property
    def electric_models_count(self) -> int:
        """Number of models flagged as electric."""
        return sum(1 for entry in self.models if entry.is_electric)

    @property
    def year_range(self) -> str:
        """Return the year span across every model, or "Unknown" if none."""
        collected = [year for entry in self.models for year in entry.years]
        if not collected:
            return "Unknown"
        earliest, latest = min(collected), max(collected)
        if earliest != latest:
            return f"{earliest}-{latest}"
        return str(collected[0])
|
||||
|
||||
|
||||
@dataclass
class ExtractionResult:
    """Aggregate outcome of extracting every make file in a directory."""
    makes: List["MakeData"]
    total_files_processed: int
    successful_extractions: int
    failed_extractions: int
    total_models: int
    total_engines: int
    total_electric_models: int

    @property
    def success_rate(self) -> float:
        """Fraction of processed files that succeeded; 0.0 when none were processed."""
        if self.total_files_processed > 0:
            return self.successful_extractions / self.total_files_processed
        return 0.0
|
||||
|
||||
|
||||
class JsonExtractor:
    """Extract normalized vehicle data from JSON files.

    Each source file is expected to hold a single top-level key (the make)
    mapped to a list of ``{"year": ..., "models": [...]}`` entries, where each
    model is a dict with a ``name`` and optional ``engines`` / ``submodels``
    lists. Models are merged by name across years; a model with no engine
    strings in any year is treated as electric.
    """

    # Years outside this inclusive range only produce a validation warning.
    MIN_EXPECTED_YEAR = 1900
    MAX_EXPECTED_YEAR = 2030

    def __init__(self, make_mapper: "MakeNameMapper", engine_parser: "EngineSpecParser"):
        """
        Initialize JSON extractor with utilities

        Args:
            make_mapper: For normalizing make names from filenames
            engine_parser: For parsing engine specifications with L→I normalization
        """
        self.make_mapper = make_mapper
        self.engine_parser = engine_parser

        logger.info("JsonExtractor initialized with MakeNameMapper and EngineSpecParser")

    # ------------------------------------------------------------------
    # Validation
    # ------------------------------------------------------------------

    def validate_json_structure(self, json_data: dict, filename: str) -> "ValidationResult":
        """
        Validate JSON structure before processing

        Args:
            json_data: Loaded JSON data
            filename: Source filename for error context

        Returns:
            ValidationResult with validity status and any issues
        """
        errors: List[str] = []
        warnings: List[str] = []

        try:
            # Check top-level structure
            if not isinstance(json_data, dict):
                errors.append("JSON must be a dictionary")
                return ValidationResult(False, errors, warnings)

            # Should have exactly one key (the make name)
            if len(json_data) != 1:
                errors.append(f"JSON should have exactly one top-level key, found {len(json_data)}")
                return ValidationResult(False, errors, warnings)

            make_key, make_data = next(iter(json_data.items()))

            # Make data should be a list of year entries
            if not isinstance(make_data, list):
                errors.append(f"Make data for '{make_key}' must be a list")
                return ValidationResult(False, errors, warnings)

            if not make_data:
                warnings.append(f"Make '{make_key}' has no year entries")

            for index, year_entry in enumerate(make_data):
                self._validate_year_entry(index, year_entry, errors, warnings)

        except Exception as e:
            # Safety net: report anything unforeseen as a validation error
            # instead of letting it escape to the caller.
            errors.append(f"Unexpected error during validation: {str(e)}")

        is_valid = not errors

        if errors:
            logger.warning(f"JSON validation failed for {filename}: {len(errors)} errors")
        elif warnings:
            logger.info(f"JSON validation for {filename}: {len(warnings)} warnings")
        else:
            logger.debug(f"JSON validation passed for {filename}")

        return ValidationResult(is_valid, errors, warnings)

    def _validate_year_entry(self, index: int, year_entry: object,
                             errors: List[str], warnings: List[str]) -> None:
        """Validate one year entry, appending any findings to errors/warnings."""
        if not isinstance(year_entry, dict):
            errors.append(f"Year entry {index} must be a dictionary")
            return

        # Check required fields
        has_year = 'year' in year_entry
        if not has_year:
            errors.append(f"Year entry {index} missing 'year' field")

        if 'models' not in year_entry:
            errors.append(f"Year entry {index} missing 'models' field")
            return

        # Only convert the year when it is present: indexing a missing key
        # would raise KeyError and abort validation of all remaining entries.
        if has_year:
            try:
                year = int(year_entry['year'])
            except (ValueError, TypeError, OverflowError):
                errors.append(f"Invalid year value in entry {index}: {year_entry.get('year')}")
            else:
                if year < self.MIN_EXPECTED_YEAR or year > self.MAX_EXPECTED_YEAR:
                    warnings.append(f"Unusual year value: {year}")

        models = year_entry['models']
        if not isinstance(models, list):
            errors.append(f"Models in year entry {index} must be a list")
            return

        year_label = year_entry.get('year')
        for position, model in enumerate(models):
            self._validate_model(position, model, year_label, errors)

    @staticmethod
    def _validate_model(position: int, model: object, year_label: object,
                        errors: List[str]) -> None:
        """Validate one model dict, appending any findings to errors."""
        if not isinstance(model, dict):
            errors.append(f"Model {position} in year {year_label} must be a dictionary")
            return

        if 'name' not in model:
            errors.append(f"Model {position} in year {year_label} missing 'name' field")

        # Engines and submodels are optional but should be lists if present
        if 'engines' in model and not isinstance(model['engines'], list):
            errors.append(f"Engines for model {model.get('name')} must be a list")

        if 'submodels' in model and not isinstance(model['submodels'], list):
            errors.append(f"Submodels for model {model.get('name')} must be a list")

    # ------------------------------------------------------------------
    # Single-file extraction
    # ------------------------------------------------------------------

    def extract_make_data(self, json_file_path: str) -> "MakeData":
        """
        Extract complete make data from a single JSON file

        Args:
            json_file_path: Path to JSON file

        Returns:
            MakeData with extracted and normalized data. On validation failure
            or any fatal error the result has no models and the problems are
            listed in ``processing_errors``.
        """
        filename = os.path.basename(json_file_path)
        logger.info(f"Extracting make data from {filename}")

        processing_errors: List[str] = []
        processing_warnings: List[str] = []

        try:
            # Load and validate JSON
            with open(json_file_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            validation = self.validate_json_structure(json_data, filename)
            processing_errors.extend(validation.errors)
            processing_warnings.extend(validation.warnings)

            if not validation.is_valid:
                logger.error(f"JSON validation failed for {filename}")
                return MakeData(
                    name=self.make_mapper.normalize_make_name(filename),
                    filename=filename,
                    models=[],
                    processing_errors=processing_errors,
                    processing_warnings=processing_warnings
                )

            # Get normalized make name
            make_name = self.make_mapper.normalize_make_name(filename)
            logger.debug(f"Normalized make name: {filename} → {make_name}")

            # Validation guarantees exactly one top-level key.
            year_entries = next(iter(json_data.values()))

            models_by_name = self._group_models_by_name(
                year_entries, processing_errors, processing_warnings
            )
            models = self._build_models(make_name, models_by_name, processing_errors)

            # Sort models by name
            models.sort(key=lambda m: m.name)

            make_data = MakeData(
                name=make_name,
                filename=filename,
                models=models,
                processing_errors=processing_errors,
                processing_warnings=processing_warnings
            )

            logger.info(f"Extracted {filename}: {len(models)} models, "
                        f"{make_data.total_engines} engines, {make_data.electric_models_count} electric models")

            return make_data

        except Exception as e:
            logger.error(f"Failed to extract make data from {filename}: {str(e)}")
            processing_errors.append(f"Fatal error: {str(e)}")

            return MakeData(
                name=self._safe_make_name(filename),
                filename=filename,
                models=[],
                processing_errors=processing_errors,
                processing_warnings=processing_warnings
            )

    def _safe_make_name(self, filename: str) -> str:
        """Normalize a make name for a failure record without raising.

        Failure paths must still return a MakeData; if the mapper itself is
        what failed, fall back to the filename without its extension.
        """
        try:
            return self.make_mapper.normalize_make_name(filename)
        except Exception as e:
            logger.debug(f"Could not normalize make name for {filename}: {str(e)}")
            return os.path.splitext(filename)[0]

    @staticmethod
    def _clean_strings(values, description: str, warnings: List[str]) -> List[str]:
        """Return the stripped, non-empty strings in values.

        Falsy entries are skipped silently; truthy non-string entries are
        skipped with a warning instead of crashing on ``.strip()``.
        """
        cleaned = []
        for value in values:
            if isinstance(value, str):
                text = value.strip()
                if text:
                    cleaned.append(text)
            elif value:
                warnings.append(f"Skipping non-string {description}: {value!r}")
        return cleaned

    def _group_models_by_name(self, year_entries, errors: List[str],
                              warnings: List[str]) -> Dict[str, Dict[str, set]]:
        """Merge model entries across years into name -> years/engines/trims sets."""
        models_by_name: Dict[str, Dict[str, set]] = {}

        for year_entry in year_entries:
            try:
                year = int(year_entry['year'])

                for model_entry in year_entry.get('models', []):
                    raw_name = model_entry.get('name', '')
                    # Tolerate non-string names (e.g. a numeric model name)
                    # rather than failing the whole make.
                    if raw_name is None:
                        raw_name = ''
                    model_name = str(raw_name).strip()
                    if not model_name:
                        warnings.append(f"Empty model name in year {year}")
                        continue

                    bucket = models_by_name.setdefault(
                        model_name, {'years': set(), 'engines': set(), 'trims': set()}
                    )
                    bucket['years'].add(year)
                    bucket['engines'].update(self._clean_strings(
                        model_entry.get('engines', []),
                        f"engine for model {model_name} ({year})", warnings
                    ))
                    # Trims come from the 'submodels' list
                    bucket['trims'].update(self._clean_strings(
                        model_entry.get('submodels', []),
                        f"trim for model {model_name} ({year})", warnings
                    ))

            except (ValueError, TypeError) as e:
                errors.append(f"Error processing year entry: {str(e)}")
                continue

        return models_by_name

    def _build_models(self, make_name: str, models_by_name: Dict[str, Dict[str, set]],
                      errors: List[str]) -> List["ModelData"]:
        """Convert grouped model info into ModelData objects, parsing engines."""
        models = []
        for model_name, model_info in models_by_name.items():
            try:
                is_electric = not model_info['engines']

                if is_electric:
                    # Empty engines array - electric vehicle
                    engine_specs = [self.engine_parser.create_electric_motor()]
                    logger.debug(f"Created electric motor for {make_name} {model_name}")
                else:
                    # Sorted so the engine order does not depend on set
                    # iteration order, which varies between runs.
                    engine_specs = [
                        self.engine_parser.parse_engine_string(engine_str)
                        for engine_str in sorted(model_info['engines'])
                    ]

                # Remove duplicate engines based on key attributes
                unique_engines = self.engine_parser.get_unique_engines(engine_specs)

                models.append(ModelData(
                    name=model_name,
                    years=sorted(model_info['years']),
                    engines=unique_engines,
                    trims=sorted(model_info['trims']),
                    is_electric=is_electric
                ))

            except Exception as e:
                errors.append(f"Error processing model {model_name}: {str(e)}")
                continue

        return models

    # ------------------------------------------------------------------
    # Directory extraction
    # ------------------------------------------------------------------

    def extract_all_makes(self, sources_dir: str) -> "ExtractionResult":
        """
        Process all JSON files in the sources directory

        Args:
            sources_dir: Directory containing JSON make files

        Returns:
            ExtractionResult with all extracted data and statistics. A file
            counts as failed when its MakeData has any processing errors.
        """
        logger.info(f"Starting extraction of all makes from {sources_dir}")

        # Find all JSON files; sorted for consistent processing order
        json_files = sorted(glob.glob(os.path.join(sources_dir, '*.json')))

        if not json_files:
            logger.warning(f"No JSON files found in {sources_dir}")
            return ExtractionResult(
                makes=[],
                total_files_processed=0,
                successful_extractions=0,
                failed_extractions=0,
                total_models=0,
                total_engines=0,
                total_electric_models=0
            )

        logger.info(f"Found {len(json_files)} JSON files to process")

        makes = []
        successful_extractions = 0
        failed_extractions = 0

        for json_file in json_files:
            try:
                make_data = self.extract_make_data(json_file)
                makes.append(make_data)

                if make_data.processing_errors:
                    failed_extractions += 1
                    logger.error(f"Extraction completed with errors for {make_data.filename}")
                else:
                    successful_extractions += 1
                    logger.debug(f"Extraction successful for {make_data.filename}")

            except Exception as e:
                filename = os.path.basename(json_file)
                logger.error(f"Fatal error processing {filename}: {str(e)}")
                failed_extractions += 1

                # Create minimal make data for failed file; the name lookup
                # must not raise again or the whole run would abort.
                makes.append(MakeData(
                    name=self._safe_make_name(filename),
                    filename=filename,
                    models=[],
                    processing_errors=[f"Fatal extraction error: {str(e)}"],
                    processing_warnings=[]
                ))

        # Calculate statistics
        total_models = sum(make.total_models for make in makes)
        total_engines = sum(make.total_engines for make in makes)
        total_electric_models = sum(make.electric_models_count for make in makes)

        result = ExtractionResult(
            makes=makes,
            total_files_processed=len(json_files),
            successful_extractions=successful_extractions,
            failed_extractions=failed_extractions,
            total_models=total_models,
            total_engines=total_engines,
            total_electric_models=total_electric_models
        )

        logger.info(f"Extraction complete: {successful_extractions}/{len(json_files)} successful, "
                    f"{total_models} models, {total_engines} engines, {total_electric_models} electric models")

        return result

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------

    def get_extraction_statistics(self, result: "ExtractionResult") -> Dict[str, object]:
        """
        Get detailed extraction statistics

        Args:
            result: ExtractionResult from extract_all_makes

        Returns:
            Dictionary with 'files', 'data' and 'quality' summary dicts plus a
            'makes' list holding one summary dict per make
        """
        makes = result.makes

        stats: Dict[str, object] = {
            'files': {
                'total_processed': result.total_files_processed,
                'successful': result.successful_extractions,
                'failed': result.failed_extractions,
                'success_rate': result.success_rate
            },
            'data': {
                'total_makes': len(makes),
                'total_models': result.total_models,
                'total_engines': result.total_engines,
                'electric_models': result.total_electric_models
            },
            'quality': {
                'makes_with_errors': sum(1 for make in makes if make.processing_errors),
                'makes_with_warnings': sum(1 for make in makes if make.processing_warnings),
                'total_errors': sum(len(make.processing_errors) for make in makes),
                'total_warnings': sum(len(make.processing_warnings) for make in makes)
            }
        }

        # Add make-specific statistics
        stats['makes'] = [
            {
                'name': make.name,
                'filename': make.filename,
                'models': make.total_models,
                'engines': make.total_engines,
                'trims': make.total_trims,
                'electric_models': make.electric_models_count,
                'year_range': make.year_range,
                'errors': len(make.processing_errors),
                'warnings': len(make.processing_warnings)
            }
            for make in makes
        ]

        return stats

    def print_extraction_report(self, result: "ExtractionResult") -> None:
        """
        Print detailed extraction report

        Args:
            result: ExtractionResult from extract_all_makes
        """
        stats = self.get_extraction_statistics(result)
        files, data, quality = stats['files'], stats['data'], stats['quality']

        print("🚀 JSON EXTRACTION REPORT")
        print("=" * 50)

        # File processing summary
        print("\n📁 FILE PROCESSING")
        print(f" Files processed: {files['total_processed']}")
        print(f" Successful: {files['successful']}")
        print(f" Failed: {files['failed']}")
        print(f" Success rate: {files['success_rate']:.1%}")

        # Data summary
        print("\n📊 DATA EXTRACTED")
        print(f" Makes: {data['total_makes']}")
        print(f" Models: {data['total_models']}")
        print(f" Engines: {data['total_engines']}")
        print(f" Electric models: {data['electric_models']}")

        # Quality summary
        print("\n🔍 QUALITY ASSESSMENT")
        print(f" Makes with errors: {quality['makes_with_errors']}")
        print(f" Makes with warnings: {quality['makes_with_warnings']}")
        print(f" Total errors: {quality['total_errors']}")
        print(f" Total warnings: {quality['total_warnings']}")

        # Show problematic makes
        if quality['makes_with_errors'] > 0:
            print("\n⚠️ MAKES WITH ERRORS:")
            for make in result.makes:
                if make.processing_errors:
                    print(f" {make.name} ({make.filename}): {len(make.processing_errors)} errors")

        # Show top makes by data volume
        print("\n🏆 TOP MAKES BY MODEL COUNT:")
        top_makes = sorted(result.makes, key=lambda m: m.total_models, reverse=True)[:10]
        for make in top_makes:
            print(f" {make.name}: {make.total_models} models, {make.total_engines} engines")
|
||||
|
||||
|
||||
# Example usage and testing functions
|
||||
def example_usage():
    """Demonstrate JsonExtractor usage.

    Extracts ``toyota.json`` (when present) and then every make under
    ``sources/makes``, printing a summary of each step. Prints a notice and
    returns when the sources directory does not exist.
    """
    print("🚀 JsonExtractor Example Usage")
    print("=" * 40)

    # MakeNameMapper, EngineSpecParser and os are the module-level imports.
    # They must not be re-imported inside this function: a function-local
    # ``import os`` placed in an ``except ImportError`` branch makes ``os`` a
    # local name that stays unbound whenever the branch is not taken, so the
    # ``os.path`` calls below would raise UnboundLocalError.

    # Initialize utilities
    make_mapper = MakeNameMapper()
    engine_parser = EngineSpecParser()

    # Create extractor
    extractor = JsonExtractor(make_mapper, engine_parser)

    sources_dir = "sources/makes"
    if not os.path.exists(sources_dir):
        print(f"Sources directory not found: {sources_dir}")
        return

    # Extract single make
    toyota_file = os.path.join(sources_dir, "toyota.json")
    if os.path.exists(toyota_file):
        print("\n📄 Extracting from toyota.json...")
        toyota_data = extractor.extract_make_data(toyota_file)

        print(f" Make: {toyota_data.name}")
        print(f" Models: {toyota_data.total_models}")
        print(f" Engines: {toyota_data.total_engines}")
        print(f" Electric models: {toyota_data.electric_models_count}")
        print(f" Year range: {toyota_data.year_range}")

        if toyota_data.processing_errors:
            print(f" Errors: {len(toyota_data.processing_errors)}")
        if toyota_data.processing_warnings:
            print(f" Warnings: {len(toyota_data.processing_warnings)}")

    # Extract all makes
    print("\n🔄 Extracting all makes...")
    result = extractor.extract_all_makes(sources_dir)
    extractor.print_extraction_report(result)


if __name__ == "__main__":
    example_usage()
|
||||
337
mvp-platform-services/vehicles/etl/extractors/mssql_extractor.py
Executable file
337
mvp-platform-services/vehicles/etl/extractors/mssql_extractor.py
Executable file
@@ -0,0 +1,337 @@
|
||||
import logging
|
||||
from typing import List, Dict, Optional, Generator
|
||||
from ..connections import db_connections
|
||||
from ..utils.make_filter import MakeFilter
|
||||
from tqdm import tqdm
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class MSSQLExtractor:
|
||||
"""Extract data from MS SQL Server source database"""
|
||||
|
||||
def __init__(self, make_filter: Optional[MakeFilter] = None):
|
||||
self.batch_size = 10000
|
||||
self.make_filter = make_filter or MakeFilter()
|
||||
logger.info(f"Initialized MSSQL extractor with {len(self.make_filter.get_allowed_makes())} allowed makes")
|
||||
|
||||
def extract_wmi_data(self) -> List[Dict]:
|
||||
"""Extract WMI (World Manufacturer Identifier) data with make filtering"""
|
||||
logger.info("Extracting WMI data from source database with make filtering")
|
||||
|
||||
query = f"""
|
||||
SELECT
|
||||
w.Id,
|
||||
w.Wmi,
|
||||
w.ManufacturerId,
|
||||
w.MakeId,
|
||||
w.VehicleTypeId,
|
||||
w.TruckTypeId,
|
||||
w.CountryId,
|
||||
w.PublicAvailabilityDate,
|
||||
w.NonCompliant,
|
||||
w.NonCompliantReason,
|
||||
w.CreatedOn,
|
||||
w.UpdatedOn,
|
||||
w.ProcessedOn
|
||||
FROM dbo.Wmi w
|
||||
WHERE w.PublicAvailabilityDate <= GETDATE()
|
||||
AND w.ManufacturerId IN (
|
||||
SELECT DISTINCT mfr.Id
|
||||
FROM dbo.Manufacturer mfr
|
||||
JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
|
||||
JOIN dbo.Make m ON mm.MakeId = m.Id
|
||||
WHERE {self.make_filter.get_sql_filter('m.Name')}
|
||||
)
|
||||
ORDER BY w.Id
|
||||
"""
|
||||
|
||||
with db_connections.mssql_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(query)
|
||||
rows = cursor.fetchall()
|
||||
results = self._rows_to_dicts(cursor, rows)
|
||||
|
||||
logger.info(f"Extracted {len(results)} WMI records")
|
||||
return results
|
||||
|
||||
def extract_wmi_vin_schema_mappings(self) -> List[Dict]:
|
||||
"""Extract WMI to VIN Schema mappings with year ranges and make filtering"""
|
||||
logger.info("Extracting WMI-VinSchema mappings with make filtering")
|
||||
|
||||
query = f"""
|
||||
SELECT
|
||||
wvs.WmiId,
|
||||
wvs.VinSchemaId,
|
||||
wvs.YearFrom,
|
||||
wvs.YearTo,
|
||||
w.Wmi,
|
||||
vs.Name as SchemaName
|
||||
FROM dbo.Wmi_VinSchema wvs
|
||||
JOIN dbo.Wmi w ON wvs.WmiId = w.Id
|
||||
JOIN dbo.VinSchema vs ON wvs.VinSchemaId = vs.Id
|
||||
WHERE w.PublicAvailabilityDate <= GETDATE()
|
||||
AND w.ManufacturerId IN (
|
||||
SELECT DISTINCT mfr.Id
|
||||
FROM dbo.Manufacturer mfr
|
||||
JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
|
||||
JOIN dbo.Make m ON mm.MakeId = m.Id
|
||||
WHERE {self.make_filter.get_sql_filter('m.Name')}
|
||||
)
|
||||
AND w.MakeId IN (
|
||||
SELECT Id FROM dbo.Make
|
||||
WHERE {self.make_filter.get_sql_filter('Name')}
|
||||
)
|
||||
ORDER BY wvs.WmiId, wvs.VinSchemaId
|
||||
"""
|
||||
|
||||
with db_connections.mssql_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(query)
|
||||
rows = cursor.fetchall()
|
||||
results = self._rows_to_dicts(cursor, rows)
|
||||
|
||||
logger.info(f"Extracted {len(results)} WMI-VinSchema mappings (filtered by allowed makes)")
|
||||
return results
|
||||
|
||||
def extract_patterns_data(self) -> Generator[List[Dict], None, None]:
|
||||
"""Extract pattern data in batches with make filtering"""
|
||||
logger.info("Extracting pattern data from source database with make filtering")
|
||||
|
||||
# First get the total count with filtering
|
||||
count_query = f"""
|
||||
SELECT COUNT(*) as total
|
||||
FROM dbo.Pattern p
|
||||
JOIN dbo.Element e ON p.ElementId = e.Id
|
||||
JOIN dbo.VinSchema vs ON p.VinSchemaId = vs.Id
|
||||
JOIN dbo.Wmi_VinSchema wvs ON vs.Id = wvs.VinSchemaId
|
||||
JOIN dbo.Wmi w ON wvs.WmiId = w.Id
|
||||
JOIN dbo.Wmi_Make wm ON w.Id = wm.WmiId
|
||||
JOIN dbo.Make m ON wm.MakeId = m.Id
|
||||
WHERE {self.make_filter.get_sql_filter('m.Name')}
|
||||
AND e.Id IN (26, 27, 28, 18, 24)
|
||||
"""
|
||||
|
||||
with db_connections.mssql_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(count_query)
|
||||
total_row = self._row_to_dict(cursor, cursor.fetchone())
|
||||
total_count = total_row.get('total', 0)
|
||||
|
||||
logger.info(f"Total patterns to extract (filtered): {total_count}")
|
||||
|
||||
# Extract in batches with manufacturer filtering
|
||||
query = f"""
|
||||
SELECT
|
||||
p.Id,
|
||||
p.VinSchemaId,
|
||||
p.Keys,
|
||||
p.ElementId,
|
||||
p.AttributeId,
|
||||
e.Name as ElementName,
|
||||
e.weight,
|
||||
e.GroupName,
|
||||
vs.Name as SchemaName,
|
||||
w.Wmi,
|
||||
m.Name as MakeName
|
||||
FROM dbo.Pattern p
|
||||
JOIN dbo.Element e ON p.ElementId = e.Id
|
||||
JOIN dbo.VinSchema vs ON p.VinSchemaId = vs.Id
|
||||
JOIN dbo.Wmi_VinSchema wvs ON vs.Id = wvs.VinSchemaId
|
||||
JOIN dbo.Wmi w ON wvs.WmiId = w.Id
|
||||
JOIN dbo.Wmi_Make wm ON w.Id = wm.WmiId
|
||||
JOIN dbo.Make m ON wm.MakeId = m.Id
|
||||
WHERE {self.make_filter.get_sql_filter('m.Name')}
|
||||
AND e.Id IN (26, 27, 28, 18, 24)
|
||||
ORDER BY p.Id
|
||||
OFFSET {{}} ROWS FETCH NEXT {{}} ROWS ONLY
|
||||
"""
|
||||
|
||||
with db_connections.mssql_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
|
||||
for offset in tqdm(range(0, total_count, self.batch_size), desc="Extracting filtered patterns"):
|
||||
cursor.execute(query.format(offset, self.batch_size))
|
||||
rows = cursor.fetchall()
|
||||
|
||||
if rows:
|
||||
yield self._rows_to_dicts(cursor, rows)
|
||||
else:
|
||||
break
|
||||
|
||||
def extract_elements_data(self) -> List[Dict]:
|
||||
"""Extract element definitions"""
|
||||
logger.info("Extracting element data")
|
||||
|
||||
query = """
|
||||
SELECT
|
||||
Id,
|
||||
Name,
|
||||
Code,
|
||||
LookupTable,
|
||||
Description,
|
||||
IsPrivate,
|
||||
GroupName,
|
||||
DataType,
|
||||
MinAllowedValue,
|
||||
MaxAllowedValue,
|
||||
IsQS,
|
||||
Decode,
|
||||
weight
|
||||
FROM dbo.Element
|
||||
ORDER BY Id
|
||||
"""
|
||||
|
||||
with db_connections.mssql_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(query)
|
||||
rows = cursor.fetchall()
|
||||
results = self._rows_to_dicts(cursor, rows)
|
||||
|
||||
logger.info(f"Extracted {len(results)} element definitions")
|
||||
return results
|
||||
|
||||
def extract_reference_table(self, table_name: str) -> List[Dict]:
|
||||
"""Extract data from a reference table with make filtering"""
|
||||
logger.info(f"Extracting data from {table_name} with make filtering")
|
||||
|
||||
# Apply make filtering - filter by Make brand names (simpler and more efficient)
|
||||
if table_name == 'Manufacturer':
|
||||
# Extract manufacturers linked to filtered makes only
|
||||
query = f"""
|
||||
SELECT DISTINCT mfr.* FROM dbo.Manufacturer mfr
|
||||
JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
|
||||
JOIN dbo.Make m ON mm.MakeId = m.Id
|
||||
WHERE {self.make_filter.get_sql_filter('m.Name')}
|
||||
ORDER BY mfr.Id
|
||||
"""
|
||||
elif table_name == 'Make':
|
||||
# Filter makes directly by brand names (GMC, Ford, Toyota, etc.)
|
||||
query = f"""
|
||||
SELECT * FROM dbo.Make
|
||||
WHERE {self.make_filter.get_sql_filter('Name')}
|
||||
ORDER BY Id
|
||||
"""
|
||||
elif table_name == 'Model':
|
||||
# Filter models by allowed make brand names
|
||||
query = f"""
|
||||
SELECT md.* FROM dbo.Model md
|
||||
JOIN dbo.Make_Model mm ON md.Id = mm.ModelId
|
||||
JOIN dbo.Make m ON mm.MakeId = m.Id
|
||||
WHERE {self.make_filter.get_sql_filter('m.Name')}
|
||||
ORDER BY md.Id
|
||||
"""
|
||||
elif table_name == 'Wmi':
|
||||
# Filter WMI records by allowed manufacturers (linked to makes) AND makes directly
|
||||
query = f"""
|
||||
SELECT w.* FROM dbo.Wmi w
|
||||
WHERE w.PublicAvailabilityDate <= GETDATE()
|
||||
AND w.ManufacturerId IN (
|
||||
SELECT DISTINCT mfr.Id
|
||||
FROM dbo.Manufacturer mfr
|
||||
JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
|
||||
JOIN dbo.Make m ON mm.MakeId = m.Id
|
||||
WHERE {self.make_filter.get_sql_filter('m.Name')}
|
||||
)
|
||||
AND w.MakeId IN (
|
||||
SELECT Id FROM dbo.Make
|
||||
WHERE {self.make_filter.get_sql_filter('Name')}
|
||||
)
|
||||
ORDER BY w.Id
|
||||
"""
|
||||
else:
|
||||
# No filtering for other reference tables
|
||||
query = f"SELECT * FROM dbo.{table_name} ORDER BY Id"
|
||||
|
||||
with db_connections.mssql_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(query)
|
||||
rows = cursor.fetchall()
|
||||
results = self._rows_to_dicts(cursor, rows)
|
||||
|
||||
logger.info(f"Extracted {len(results)} records from {table_name} (filtered by allowed makes)")
|
||||
return results
|
||||
|
||||
def extract_make_model_relationships(self) -> List[Dict]:
|
||||
"""Extract Make-Model relationships with make filtering"""
|
||||
logger.info("Extracting Make-Model relationships with make filtering")
|
||||
|
||||
query = f"""
|
||||
SELECT
|
||||
mm.MakeId,
|
||||
mm.ModelId,
|
||||
m.Name as MakeName,
|
||||
md.Name as ModelName
|
||||
FROM dbo.Make_Model mm
|
||||
JOIN dbo.Make m ON mm.MakeId = m.Id
|
||||
JOIN dbo.Model md ON mm.ModelId = md.Id
|
||||
WHERE {self.make_filter.get_sql_filter('m.Name')}
|
||||
ORDER BY mm.MakeId, mm.ModelId
|
||||
"""
|
||||
|
||||
with db_connections.mssql_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(query)
|
||||
rows = cursor.fetchall()
|
||||
results = self._rows_to_dicts(cursor, rows)
|
||||
|
||||
logger.info(f"Extracted {len(results)} Make-Model relationships (filtered by allowed makes)")
|
||||
return results
|
||||
|
||||
def extract_wmi_make_relationships(self) -> List[Dict]:
    """Fetch WMI/Make link rows restricted to the allowed makes.

    A dbo.Wmi_Make row is kept only when all of the following hold:
      * the WMI's PublicAvailabilityDate is not in the future (<= GETDATE());
      * the WMI's manufacturer is linked (via dbo.Manufacturer_Make) to a make
        that passes the make filter;
      * the WMI's own MakeId passes the make filter;
      * the linked make (wm.MakeId) passes the make filter.

    Returns:
        One dict per row, keyed by the result-set column names
        (WmiId, MakeId, Wmi, MakeName), ordered by WmiId then MakeId.
    """
    logger.info("Extracting WMI-Make relationships with make filtering")

    make_filter = self.make_filter
    sql = f"""
        SELECT
            wm.WmiId,
            wm.MakeId,
            w.Wmi,
            m.Name as MakeName
        FROM dbo.Wmi_Make wm
        JOIN dbo.Wmi w ON wm.WmiId = w.Id
        JOIN dbo.Make m ON wm.MakeId = m.Id
        WHERE w.PublicAvailabilityDate <= GETDATE()
        AND w.ManufacturerId IN (
            SELECT DISTINCT mfr.Id
            FROM dbo.Manufacturer mfr
            JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
            JOIN dbo.Make mk ON mm.MakeId = mk.Id
            WHERE {make_filter.get_sql_filter('mk.Name')}
        )
        AND w.MakeId IN (
            SELECT Id FROM dbo.Make
            WHERE {make_filter.get_sql_filter('Name')}
        )
        AND m.Id IN (
            SELECT Id FROM dbo.Make
            WHERE {make_filter.get_sql_filter('Name')}
        )
        ORDER BY wm.WmiId, wm.MakeId
    """

    with db_connections.mssql_connection() as connection:
        cur = connection.cursor()
        cur.execute(sql)
        relationships = self._rows_to_dicts(cur, cur.fetchall())

    logger.info(f"Extracted {len(relationships)} WMI-Make relationships (filtered by allowed makes)")
    return relationships
|
||||
|
||||
def _rows_to_dicts(self, cursor, rows) -> List[Dict]:
    """Convert fetched rows into a list of ``{column_name: value}`` dicts.

    Args:
        cursor: The cursor the rows were fetched from. The first element of
            each ``cursor.description`` entry is used as the dict key.
        rows: Sequence of row tuples (e.g. the result of ``fetchall()``).

    Returns:
        One dict per row, in the same order as ``rows``. An empty or ``None``
        ``rows`` yields ``[]`` without touching ``cursor.description``.
    """
    if not rows:
        return []
    column_names = [entry[0] for entry in cursor.description]
    # Pair names with values positionally instead of indexing by range(len(...)).
    return [dict(zip(column_names, row)) for row in rows]
|
||||
|
||||
def _row_to_dict(self, cursor, row) -> Dict:
    """Convert a single fetched row into a ``{column_name: value}`` dict.

    Args:
        cursor: The cursor the row was fetched from. The first element of
            each ``cursor.description`` entry is used as the dict key.
        row: A row tuple, or ``None`` when the fetch produced nothing.

    Returns:
        The row as a dict, or ``{}`` when ``row`` is ``None`` (in which case
        ``cursor.description`` is not read).
    """
    if row is None:
        return {}
    column_names = [entry[0] for entry in cursor.description]
    # Pair names with values positionally instead of indexing by range(len(...)).
    return dict(zip(column_names, row))
|
||||
@@ -0,0 +1,63 @@
|
||||
import logging
|
||||
from typing import Optional, Dict, Any, List
|
||||
from ..connections import db_connections
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class VinProcExtractor:
    """Utilities to inspect and sample the MSSQL VIN decode stored procedure.

    All methods open their own connection through
    ``db_connections.mssql_connection()``.
    """

    def __init__(self, proc_name: str = 'dbo.spVinDecode'):
        """Remember the (schema-qualified) procedure name used by ``sample_execute``."""
        self.proc_name = proc_name

    def find_proc(self) -> Optional[Dict[str, Any]]:
        """Locate the VIN decode proc by name pattern, return basic metadata.

        Searches ``sys.objects`` for the most recently created object whose
        name matches ``%Vin%Decode%``. The lookup is pattern-based and does
        not use ``self.proc_name``.

        Returns:
            ``{'object_name', 'schema_name', 'type_desc'}`` for the match, or
            ``None`` (after logging a warning) when nothing matches.
        """
        query = """
            SELECT TOP 1
                o.name AS object_name,
                s.name AS schema_name,
                o.type_desc
            FROM sys.objects o
            JOIN sys.schemas s ON s.schema_id = o.schema_id
            WHERE o.name LIKE '%Vin%Decode%'
            ORDER BY o.create_date DESC
        """
        with db_connections.mssql_connection() as conn:
            cur = conn.cursor()
            cur.execute(query)
            row = cur.fetchone()
            if not row:
                logger.warning("VIN decode stored procedure not found by pattern")
                return None
            return {'object_name': row[0], 'schema_name': row[1], 'type_desc': row[2]}

    def get_definition(self, schema: str, name: str) -> str:
        """Return the text definition of the proc using sp_helptext semantics.

        Args:
            schema: Schema that owns the object (e.g. ``find_proc()['schema_name']``).
            name: Object name within that schema.

        Returns:
            The concatenated text segments returned by ``sp_helptext``.

        Raises:
            Whatever the database driver raises when the statement fails;
            errors are not caught here.
        """
        # sp_helptext is a system procedure, so it must not be qualified with
        # the *target object's* schema. The qualified object name is bound as
        # a parameter rather than spliced into the SQL text.
        sql = "EXEC sp_helptext @objname = ?"
        qualified_name = f"{schema}.{name}"
        with db_connections.mssql_connection() as conn:
            cur = conn.cursor()
            cur.execute(sql, (qualified_name,))
            # sp_helptext returns a single NVARCHAR column with line segments
            return ''.join(row[0] for row in cur.fetchall())

    def sample_execute(self, vin: str) -> Optional[List[Dict[str, Any]]]:
        """Execute the VIN decode proc with a VIN to capture output shape.

        Args:
            vin: VIN passed to the procedure as the ``@VIN`` parameter.

        Returns:
            A list of ``{column_name: value}`` dicts for the first result set
            (empty when the procedure produced no result set), or ``None``
            when execution failed; the failure is logged as a warning.
        """
        # Prefer proc signature with @VIN only; if it requires year, MSSQL will error.
        sql = f"EXEC {self.proc_name} @VIN=?"
        with db_connections.mssql_connection() as conn:
            cur = conn.cursor()
            try:
                cur.execute(sql, (vin,))
                # description is None when the statement produced no result set.
                if not cur.description:
                    return []
                columns = [c[0] for c in cur.description]
                return [dict(zip(columns, r)) for r in cur.fetchall()]
            except Exception as e:
                # Deliberate best-effort: this is an inspection helper, so a
                # failing proc is reported rather than raised.
                logger.warning(f"VIN proc sample execution failed: {e}")
                return None
|
||||
|
||||
Reference in New Issue
Block a user