Initial Commit

2025-09-17 16:09:15 -05:00
parent 0cdb9803de
commit a052040e3a
373 changed files with 437090 additions and 6773 deletions
--- a/mvp-platform-services/vehicles/etl/extractors/init.py
+++ b/mvp-platform-services/vehicles/etl/extractors/init.py
--- a/mvp-platform-services/vehicles/etl/extractors/pycache/init.cpython-311.pyc
+++ b/mvp-platform-services/vehicles/etl/extractors/pycache/init.cpython-311.pyc
--- a/mvp-platform-services/vehicles/etl/extractors/pycache/init.cpython-312.pyc
+++ b/mvp-platform-services/vehicles/etl/extractors/pycache/init.cpython-312.pyc
--- a/mvp-platform-services/vehicles/etl/extractors/pycache/json_extractor.cpython-311.pyc
+++ b/mvp-platform-services/vehicles/etl/extractors/pycache/json_extractor.cpython-311.pyc
--- a/mvp-platform-services/vehicles/etl/extractors/pycache/json_extractor.cpython-312.pyc
+++ b/mvp-platform-services/vehicles/etl/extractors/pycache/json_extractor.cpython-312.pyc
--- a/mvp-platform-services/vehicles/etl/extractors/pycache/mssql_extractor.cpython-311.pyc
+++ b/mvp-platform-services/vehicles/etl/extractors/pycache/mssql_extractor.cpython-311.pyc
--- a/mvp-platform-services/vehicles/etl/extractors/pycache/mssql_extractor.cpython-312.pyc
+++ b/mvp-platform-services/vehicles/etl/extractors/pycache/mssql_extractor.cpython-312.pyc
--- a/mvp-platform-services/vehicles/etl/extractors/pycache/vin_proc_extractor.cpython-311.pyc
+++ b/mvp-platform-services/vehicles/etl/extractors/pycache/vin_proc_extractor.cpython-311.pyc
--- a/mvp-platform-services/vehicles/etl/extractors/json_extractor.py
+++ b/mvp-platform-services/vehicles/etl/extractors/json_extractor.py
@@ -0,0 +1,629 @@
+"""
+JSON Extractor for Manual Vehicle Data Processing
+
+Extracts and normalizes vehicle data from JSON files into database-ready structures.
+Integrates with MakeNameMapper and EngineSpecParser utilities for comprehensive
+data processing with L→I normalization and make name conversion.
+
+Key Features:
+- Extract make/model/year/trim/engine data from JSON files
+- Handle electric vehicles (empty engines → default motor)
+- Data validation and quality assurance
+- Progress tracking and error reporting
+
+Usage:
+    extractor = JsonExtractor(make_mapper, engine_parser)
+    make_data = extractor.extract_make_data('sources/makes/toyota.json')
+    all_data = extractor.extract_all_makes('sources/makes/')
+"""
+
+import glob
+import json
+import logging
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, Generator, List, Optional, Tuple
+
+# Import our utilities (handle both relative and direct imports)
+try:
+    from ..utils.make_name_mapper import MakeNameMapper
+    from ..utils.engine_spec_parser import EngineSpecParser, EngineSpec
+except ImportError:
+    # Fallback for direct execution
+    import sys
+    import os
+    sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
+    from utils.make_name_mapper import MakeNameMapper
+    from utils.engine_spec_parser import EngineSpecParser, EngineSpec
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ValidationResult:
+    """Outcome of a structural check on one JSON document.
+
+    ``is_valid`` is whatever the caller passes in; ``errors`` and ``warnings``
+    carry the human-readable messages collected during the check.
+    """
+    is_valid: bool
+    errors: List[str]
+    warnings: List[str]
+
+    @property
+    def has_errors(self) -> bool:
+        """True when at least one error message was recorded."""
+        return bool(self.errors)
+
+    @property
+    def has_warnings(self) -> bool:
+        """True when at least one warning message was recorded."""
+        return bool(self.warnings)
+
+
+@dataclass
+class ModelData:
+    """Extracted model data with normalized engines and trims"""
+    name: str                          # Model name from JSON
+    years: List[int]                   # Years this model appears in
+    engines: List[EngineSpec]          # Parsed and normalized engines
+    trims: List[str]                   # Trim names (from submodels)
+    is_electric: bool = False          # True if empty engines array detected
+
+    @property
+    def total_trims(self) -> int:
+        """Number of trim names recorded for this model."""
+        return len(self.trims)
+
+    @property
+    def total_engines(self) -> int:
+        """Number of engine specs recorded for this model."""
+        return len(self.engines)
+
+    @property
+    def year_range(self) -> str:
+        """
+        Render the covered years as "first-last", a single year, or "Unknown".
+
+        The single-year form is chosen by comparing the earliest and latest
+        year rather than the list length, so a list that only repeats one year
+        yields "2020" instead of "2020-2020". This matches how
+        MakeData.year_range de-duplicates years before deciding.
+        """
+        if not self.years:
+            return "Unknown"
+        first, last = min(self.years), max(self.years)
+        return str(first) if first == last else f"{first}-{last}"
+
+
+@dataclass
+class MakeData:
+    """Everything extracted for one make: its models plus extraction diagnostics"""
+    name: str                          # Normalized display name (e.g., "Alfa Romeo")
+    filename: str                      # Original JSON filename
+    models: List[ModelData]
+    processing_errors: List[str]       # Any errors during extraction
+    processing_warnings: List[str]     # Any warnings during extraction
+
+    @property
+    def total_models(self) -> int:
+        """Number of models held for this make."""
+        return len(self.models)
+
+    @property
+    def total_engines(self) -> int:
+        """Engine count summed over every model."""
+        return sum(entry.total_engines for entry in self.models)
+
+    @property
+    def total_trims(self) -> int:
+        """Trim count summed over every model."""
+        return sum(entry.total_trims for entry in self.models)
+
+    @property
+    def electric_models_count(self) -> int:
+        """Number of models flagged as electric."""
+        return len([entry for entry in self.models if entry.is_electric])
+
+    @property
+    def year_range(self) -> str:
+        """Span of years across all models: "first-last", a single year, or "Unknown"."""
+        every_year = [year for entry in self.models for year in entry.years]
+
+        if not every_year:
+            return "Unknown"
+        if len(set(every_year)) > 1:
+            return f"{min(every_year)}-{max(every_year)}"
+        return str(every_year[0])
+
+
+@dataclass
+class ExtractionResult:
+    """Aggregate outcome of processing a directory of make files"""
+    makes: List[MakeData]
+    total_files_processed: int
+    successful_extractions: int
+    failed_extractions: int
+    total_models: int
+    total_engines: int
+    total_electric_models: int
+
+    @property
+    def success_rate(self) -> float:
+        """Fraction of processed files that succeeded; 0.0 when nothing was processed."""
+        if self.total_files_processed > 0:
+            return self.successful_extractions / self.total_files_processed
+        return 0.0
+
+
+class JsonExtractor:
+    """Extract normalized vehicle data from JSON files"""
+    
+    def __init__(self, make_mapper: MakeNameMapper, engine_parser: EngineSpecParser):
+        """
+        Store the helper objects used by the extraction methods
+
+        Args:
+            make_mapper: Used to turn a source filename into a make display name
+            engine_parser: Used to parse engine strings and build electric-motor specs
+        """
+        self.make_mapper, self.engine_parser = make_mapper, engine_parser
+
+        logger.info("JsonExtractor initialized with MakeNameMapper and EngineSpecParser")
+    
+    def validate_json_structure(self, json_data: dict, filename: str) -> ValidationResult:
+        """
+        Validate JSON structure before processing
+
+        The accepted shape is a dictionary with exactly one key whose value is a
+        list of year entries; each year entry is a dictionary with a 'year' and
+        a 'models' list, and each model is a dictionary with a 'name' and
+        optional 'engines' / 'submodels' lists. Problems are collected rather
+        than raised, so one call reports every issue it can find.
+
+        Args:
+            json_data: Loaded JSON data
+            filename: Source filename, used only in log messages
+
+        Returns:
+            ValidationResult whose is_valid is True only when no errors were
+            recorded; warnings (empty make, year outside 1900-2030) do not
+            affect validity
+        """
+        errors = []
+        warnings = []
+
+        try:
+            # Check top-level structure
+            if not isinstance(json_data, dict):
+                errors.append("JSON must be a dictionary")
+                return ValidationResult(False, errors, warnings)
+
+            # Should have exactly one key (the make name)
+            if len(json_data.keys()) != 1:
+                errors.append(f"JSON should have exactly one top-level key, found {len(json_data.keys())}")
+                return ValidationResult(False, errors, warnings)
+
+            make_key = list(json_data.keys())[0]
+            make_data = json_data[make_key]
+
+            # Make data should be a list of year entries
+            if not isinstance(make_data, list):
+                errors.append(f"Make data for '{make_key}' must be a list")
+                return ValidationResult(False, errors, warnings)
+
+            if len(make_data) == 0:
+                warnings.append(f"Make '{make_key}' has no year entries")
+
+            # Validate year entries
+            for i, year_entry in enumerate(make_data):
+                if not isinstance(year_entry, dict):
+                    errors.append(f"Year entry {i} must be a dictionary")
+                    continue
+
+                # Check required fields
+                has_year = 'year' in year_entry
+                if not has_year:
+                    errors.append(f"Year entry {i} missing 'year' field")
+
+                if 'models' not in year_entry:
+                    errors.append(f"Year entry {i} missing 'models' field")
+                    continue
+
+                # Validate the year value only when the field exists. Indexing a
+                # missing 'year' raised KeyError, which fell through to the
+                # outer handler and stopped validation of every later entry.
+                if has_year:
+                    try:
+                        year = int(year_entry['year'])
+                        if year < 1900 or year > 2030:
+                            warnings.append(f"Unusual year value: {year}")
+                    except (ValueError, TypeError):
+                        errors.append(f"Invalid year value in entry {i}: {year_entry.get('year')}")
+
+                # Validate models
+                models = year_entry['models']
+                if not isinstance(models, list):
+                    errors.append(f"Models in year entry {i} must be a list")
+                    continue
+
+                for j, model in enumerate(models):
+                    if not isinstance(model, dict):
+                        errors.append(f"Model {j} in year {year_entry.get('year')} must be a dictionary")
+                        continue
+
+                    if 'name' not in model:
+                        errors.append(f"Model {j} in year {year_entry.get('year')} missing 'name' field")
+
+                    # Engines and submodels are optional but should be lists if present
+                    if 'engines' in model and not isinstance(model['engines'], list):
+                        errors.append(f"Engines for model {model.get('name')} must be a list")
+
+                    if 'submodels' in model and not isinstance(model['submodels'], list):
+                        errors.append(f"Submodels for model {model.get('name')} must be a list")
+
+        except Exception as e:
+            # Last-resort guard so validation itself never raises to the caller
+            errors.append(f"Unexpected error during validation: {str(e)}")
+
+        is_valid = len(errors) == 0
+
+        if errors:
+            logger.warning(f"JSON validation failed for {filename}: {len(errors)} errors")
+        elif warnings:
+            logger.info(f"JSON validation for {filename}: {len(warnings)} warnings")
+        else:
+            logger.debug(f"JSON validation passed for {filename}")
+
+        return ValidationResult(is_valid, errors, warnings)
+    
+    def extract_make_data(self, json_file_path: str) -> MakeData:
+        """
+        Extract complete make data from a single JSON file
+
+        The file is loaded, checked with validate_json_structure, and its
+        models are merged by name across all year entries. Engine strings are
+        parsed with the engine parser; a model with no engine strings in any
+        year is marked electric and given the parser's electric-motor spec.
+
+        Problems are reported through the returned object instead of being
+        raised: a file that cannot be read or fails validation yields a
+        MakeData with no models and the reasons in processing_errors, while a
+        malformed individual value (non-string model name, engine or trim) is
+        recorded as an error and skipped so the rest of the file is still used.
+
+        Args:
+            json_file_path: Path to JSON file
+
+        Returns:
+            MakeData with extracted and normalized data
+        """
+        filename = os.path.basename(json_file_path)
+        logger.info(f"Extracting make data from {filename}")
+
+        processing_errors = []
+        processing_warnings = []
+
+        try:
+            # Load and validate JSON
+            with open(json_file_path, 'r', encoding='utf-8') as f:
+                json_data = json.load(f)
+
+            validation = self.validate_json_structure(json_data, filename)
+            processing_errors.extend(validation.errors)
+            processing_warnings.extend(validation.warnings)
+
+            if not validation.is_valid:
+                logger.error(f"JSON validation failed for {filename}")
+                return MakeData(
+                    name=self.make_mapper.normalize_make_name(filename),
+                    filename=filename,
+                    models=[],
+                    processing_errors=processing_errors,
+                    processing_warnings=processing_warnings
+                )
+
+            # Get normalized make name
+            make_name = self.make_mapper.normalize_make_name(filename)
+            logger.debug(f"Normalized make name: {filename} → {make_name}")
+
+            # Extract data
+            make_key = list(json_data.keys())[0]
+            year_entries = json_data[make_key]
+
+            # Group models by name across all years
+            models_by_name = {}  # model_name -> {years: set, engines: set, trims: set}
+
+            for year_entry in year_entries:
+                try:
+                    year = int(year_entry['year'])
+                    models_list = year_entry.get('models', [])
+
+                    for model_entry in models_list:
+                        raw_name = model_entry.get('name', '')
+                        if not isinstance(raw_name, str):
+                            # Calling .strip() on a non-string raised AttributeError,
+                            # which aborted the whole file; skip just this entry.
+                            processing_errors.append(
+                                f"Model name in year {year} must be a string, "
+                                f"got {type(raw_name).__name__}"
+                            )
+                            continue
+
+                        model_name = raw_name.strip()
+                        if not model_name:
+                            processing_warnings.append(f"Empty model name in year {year}")
+                            continue
+
+                        # Initialize model data if not seen before
+                        model_info = models_by_name.setdefault(model_name, {
+                            'years': set(),
+                            'engines': set(),
+                            'trims': set()
+                        })
+
+                        # Add year
+                        model_info['years'].add(year)
+
+                        # Add engines and trims (trims come from 'submodels')
+                        for source_key, target_key in (('engines', 'engines'), ('submodels', 'trims')):
+                            for value in model_entry.get(source_key, []):
+                                if not value:
+                                    continue  # empty / null entries are ignored
+                                if not isinstance(value, str):
+                                    processing_errors.append(
+                                        f"Non-string value in '{source_key}' for model "
+                                        f"{model_name} in year {year}: {value!r}"
+                                    )
+                                    continue
+                                cleaned = value.strip()
+                                if cleaned:
+                                    model_info[target_key].add(cleaned)
+
+                except (ValueError, TypeError) as e:
+                    processing_errors.append(f"Error processing year entry: {str(e)}")
+                    continue
+
+            # Convert to ModelData objects
+            models = []
+            for model_name, model_info in models_by_name.items():
+                try:
+                    # Parse engines
+                    engine_specs = []
+                    is_electric = False
+
+                    if not model_info['engines']:
+                        # Empty engines array - electric vehicle
+                        is_electric = True
+                        electric_spec = self.engine_parser.create_electric_motor()
+                        engine_specs = [electric_spec]
+                        logger.debug(f"Created electric motor for {make_name} {model_name}")
+                    else:
+                        # Parse each engine string. Sorted because set iteration
+                        # order of strings changes between interpreter runs, which
+                        # made the resulting engine order non-reproducible.
+                        for engine_str in sorted(model_info['engines']):
+                            spec = self.engine_parser.parse_engine_string(engine_str)
+                            engine_specs.append(spec)
+
+                    # Remove duplicate engines based on key attributes
+                    unique_engines = self.engine_parser.get_unique_engines(engine_specs)
+
+                    # Create model data
+                    model_data = ModelData(
+                        name=model_name,
+                        years=sorted(model_info['years']),
+                        engines=unique_engines,
+                        trims=sorted(model_info['trims']),
+                        is_electric=is_electric
+                    )
+
+                    models.append(model_data)
+
+                except Exception as e:
+                    processing_errors.append(f"Error processing model {model_name}: {str(e)}")
+                    continue
+
+            # Sort models by name
+            models.sort(key=lambda m: m.name)
+
+            make_data = MakeData(
+                name=make_name,
+                filename=filename,
+                models=models,
+                processing_errors=processing_errors,
+                processing_warnings=processing_warnings
+            )
+
+            logger.info(f"Extracted {filename}: {len(models)} models, "
+                       f"{make_data.total_engines} engines, {make_data.electric_models_count} electric models")
+
+            return make_data
+
+        except Exception as e:
+            # Boundary handler: unreadable file, invalid JSON, or any unexpected
+            # failure. logger.exception keeps the traceback for diagnosis.
+            logger.exception(f"Failed to extract make data from {filename}: {str(e)}")
+            processing_errors.append(f"Fatal error: {str(e)}")
+
+            return MakeData(
+                name=self.make_mapper.normalize_make_name(filename),
+                filename=filename,
+                models=[],
+                processing_errors=processing_errors,
+                processing_warnings=processing_warnings
+            )
+    
+    def extract_all_makes(self, sources_dir: str) -> ExtractionResult:
+        """
+        Run extract_make_data over every ``*.json`` file directly inside a directory
+
+        A file counts as failed when its MakeData carries any processing error
+        or when extraction raises; in both cases an entry for the file is still
+        included in the result.
+
+        Args:
+            sources_dir: Directory containing JSON make files
+
+        Returns:
+            ExtractionResult with one MakeData per file and aggregate counts
+        """
+        logger.info(f"Starting extraction of all makes from {sources_dir}")
+
+        # Find all JSON files
+        json_files = glob.glob(os.path.join(sources_dir, '*.json'))
+
+        if len(json_files) == 0:
+            logger.warning(f"No JSON files found in {sources_dir}")
+            return ExtractionResult(
+                makes=[],
+                total_files_processed=0,
+                successful_extractions=0,
+                failed_extractions=0,
+                total_models=0,
+                total_engines=0,
+                total_electric_models=0
+            )
+
+        logger.info(f"Found {len(json_files)} JSON files to process")
+
+        # Fixed order so repeated runs process files identically
+        json_files = sorted(json_files)
+
+        makes = []
+        ok_count = 0
+        bad_count = 0
+
+        for path in json_files:
+            try:
+                extracted = self.extract_make_data(path)
+                makes.append(extracted)
+
+                if not extracted.processing_errors:
+                    ok_count += 1
+                    logger.debug(f"Extraction successful for {extracted.filename}")
+                else:
+                    bad_count += 1
+                    logger.error(f"Extraction completed with errors for {extracted.filename}")
+
+            except Exception as e:
+                base_name = os.path.basename(path)
+                logger.error(f"Fatal error processing {base_name}: {str(e)}")
+                bad_count += 1
+
+                # Keep a placeholder entry so the failed file still appears in the result
+                makes.append(MakeData(
+                    name=self.make_mapper.normalize_make_name(base_name),
+                    filename=base_name,
+                    models=[],
+                    processing_errors=[f"Fatal extraction error: {str(e)}"],
+                    processing_warnings=[]
+                ))
+
+        # Calculate statistics
+        model_total = sum(entry.total_models for entry in makes)
+        engine_total = sum(entry.total_engines for entry in makes)
+        electric_total = sum(entry.electric_models_count for entry in makes)
+
+        logger.info(f"Extraction complete: {ok_count}/{len(json_files)} successful, "
+                   f"{model_total} models, {engine_total} engines, {electric_total} electric models")
+
+        return ExtractionResult(
+            makes=makes,
+            total_files_processed=len(json_files),
+            successful_extractions=ok_count,
+            failed_extractions=bad_count,
+            total_models=model_total,
+            total_engines=engine_total,
+            total_electric_models=electric_total
+        )
+    
+    def get_extraction_statistics(self, result: ExtractionResult) -> Dict[str, Any]:
+        """
+        Get detailed extraction statistics
+
+        Args:
+            result: ExtractionResult from extract_all_makes
+
+        Returns:
+            Dictionary with four keys: 'files' (processed / successful / failed
+            counts and success rate), 'data' (make, model, engine and electric
+            model totals), 'quality' (error and warning tallies) and 'makes'
+            (one summary dictionary per make, in the order of result.makes)
+        """
+        makes = result.makes
+
+        return {
+            'files': {
+                'total_processed': result.total_files_processed,
+                'successful': result.successful_extractions,
+                'failed': result.failed_extractions,
+                'success_rate': result.success_rate
+            },
+            'data': {
+                'total_makes': len(makes),
+                'total_models': result.total_models,
+                'total_engines': result.total_engines,
+                'electric_models': result.total_electric_models
+            },
+            'quality': {
+                'makes_with_errors': sum(1 for make in makes if make.processing_errors),
+                'makes_with_warnings': sum(1 for make in makes if make.processing_warnings),
+                'total_errors': sum(len(make.processing_errors) for make in makes),
+                'total_warnings': sum(len(make.processing_warnings) for make in makes)
+            },
+            # Make-specific statistics
+            'makes': [
+                {
+                    'name': make.name,
+                    'filename': make.filename,
+                    'models': make.total_models,
+                    'engines': make.total_engines,
+                    'trims': make.total_trims,
+                    'electric_models': make.electric_models_count,
+                    'year_range': make.year_range,
+                    'errors': len(make.processing_errors),
+                    'warnings': len(make.processing_warnings)
+                }
+                for make in makes
+            ]
+        }
+    
+    def print_extraction_report(self, result: ExtractionResult) -> None:
+        """
+        Write a human-readable summary of an extraction run to stdout
+
+        Sections, in order: file processing, extracted data, quality, the makes
+        that recorded errors (only when there are any), and the ten makes with
+        the most models.
+
+        Args:
+            result: ExtractionResult from extract_all_makes
+        """
+        stats = self.get_extraction_statistics(result)
+        files, data, quality = stats['files'], stats['data'], stats['quality']
+
+        print("🚀 JSON EXTRACTION REPORT")
+        print("=" * 50)
+
+        # File processing summary
+        print("\n📁 FILE PROCESSING")
+        print(f"   Files processed: {files['total_processed']}")
+        print(f"   Successful: {files['successful']}")
+        print(f"   Failed: {files['failed']}")
+        print(f"   Success rate: {files['success_rate']:.1%}")
+
+        # Data summary
+        print("\n📊 DATA EXTRACTED")
+        print(f"   Makes: {data['total_makes']}")
+        print(f"   Models: {data['total_models']}")
+        print(f"   Engines: {data['total_engines']}")
+        print(f"   Electric models: {data['electric_models']}")
+
+        # Quality summary
+        print("\n🔍 QUALITY ASSESSMENT")
+        print(f"   Makes with errors: {quality['makes_with_errors']}")
+        print(f"   Makes with warnings: {quality['makes_with_warnings']}")
+        print(f"   Total errors: {quality['total_errors']}")
+        print(f"   Total warnings: {quality['total_warnings']}")
+
+        # Show problematic makes
+        if quality['makes_with_errors'] > 0:
+            print("\n⚠️  MAKES WITH ERRORS:")
+            for failing in (entry for entry in result.makes if entry.processing_errors):
+                print(f"   {failing.name} ({failing.filename}): {len(failing.processing_errors)} errors")
+
+        # Show top makes by data volume
+        print("\n🏆 TOP MAKES BY MODEL COUNT:")
+        ranked = sorted(result.makes, key=lambda m: m.total_models, reverse=True)
+        for entry in ranked[:10]:
+            print(f"   {entry.name}: {entry.total_models} models, {entry.total_engines} engines")
+
+
+# Example usage and testing functions
+def example_usage():
+    """
+    Demonstrate JsonExtractor usage
+
+    Extracts ``sources/makes/toyota.json`` when it exists, then processes the
+    whole ``sources/makes`` directory and prints the extraction report. Prints
+    a notice and returns when the directory is missing.
+    """
+    print("🚀 JsonExtractor Example Usage")
+    print("=" * 40)
+
+    # MakeNameMapper and EngineSpecParser are already imported at module level
+    # (with the relative/direct-execution fallback). The previous re-import in
+    # this function included a local `import os`, which made `os` a local name
+    # and raised UnboundLocalError below whenever the relative import succeeded.
+    make_mapper = MakeNameMapper()
+    engine_parser = EngineSpecParser()
+
+    # Create extractor
+    extractor = JsonExtractor(make_mapper, engine_parser)
+
+    sources_dir = "sources/makes"
+    if not os.path.exists(sources_dir):
+        print(f"Sources directory not found: {sources_dir}")
+        return
+
+    # Extract single make
+    toyota_file = os.path.join(sources_dir, "toyota.json")
+    if os.path.exists(toyota_file):
+        print("\n📄 Extracting from toyota.json...")
+        toyota_data = extractor.extract_make_data(toyota_file)
+
+        print(f"   Make: {toyota_data.name}")
+        print(f"   Models: {toyota_data.total_models}")
+        print(f"   Engines: {toyota_data.total_engines}")
+        print(f"   Electric models: {toyota_data.electric_models_count}")
+        print(f"   Year range: {toyota_data.year_range}")
+
+        if toyota_data.processing_errors:
+            print(f"   Errors: {len(toyota_data.processing_errors)}")
+        if toyota_data.processing_warnings:
+            print(f"   Warnings: {len(toyota_data.processing_warnings)}")
+
+    # Extract all makes
+    print("\n🔄 Extracting all makes...")
+    result = extractor.extract_all_makes(sources_dir)
+    extractor.print_extraction_report(result)
+
+
+if __name__ == "__main__":
+    example_usage()
--- a/mvp-platform-services/vehicles/etl/extractors/mssql_extractor.py
+++ b/mvp-platform-services/vehicles/etl/extractors/mssql_extractor.py
@@ -0,0 +1,337 @@
+import logging
+from typing import List, Dict, Optional, Generator
+from ..connections import db_connections
+from ..utils.make_filter import MakeFilter
+from tqdm import tqdm
+
+logger = logging.getLogger(__name__)
+
+class MSSQLExtractor:
+    """Extract data from MS SQL Server source database.
+
+    Most queries are restricted to the makes accepted by ``make_filter``; the
+    SQL predicate itself is produced by ``make_filter.get_sql_filter(column)``.
+
+    Attributes:
+        batch_size: Number of rows requested per page by ``extract_patterns_data``.
+        make_filter: Provides the allowed makes and the SQL predicate for them.
+    """
+
+    def __init__(self, make_filter: Optional[MakeFilter] = None):
+        """Create an extractor.
+
+        Args:
+            make_filter: Filter to apply. A default ``MakeFilter()`` is built
+                when the argument is omitted (or is falsy).
+        """
+        self.batch_size = 10000
+        self.make_filter = make_filter or MakeFilter()
+        logger.info(f"Initialized MSSQL extractor with {len(self.make_filter.get_allowed_makes())} allowed makes")
+
+    # ------------------------------------------------------------------
+    # Shared SQL fragments and query execution
+    # ------------------------------------------------------------------
+
+    def _allowed_manufacturer_ids_sql(self) -> str:
+        """Return a subquery selecting ids of manufacturers linked to an allowed make."""
+        return f"""
+            SELECT DISTINCT mfr.Id
+            FROM dbo.Manufacturer mfr
+            JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
+            JOIN dbo.Make mk ON mm.MakeId = mk.Id
+            WHERE {self.make_filter.get_sql_filter('mk.Name')}
+        """
+
+    def _allowed_make_ids_sql(self) -> str:
+        """Return a subquery selecting the ids of the allowed makes."""
+        return f"""
+            SELECT Id FROM dbo.Make
+            WHERE {self.make_filter.get_sql_filter('Name')}
+        """
+
+    def _fetch_all(self, query: str) -> List[Dict]:
+        """Run *query* on a fresh MSSQL connection and return every row as a dict."""
+        with db_connections.mssql_connection() as conn:
+            cursor = conn.cursor()
+            cursor.execute(query)
+            return self._rows_to_dicts(cursor, cursor.fetchall())
+
+    # ------------------------------------------------------------------
+    # Extraction entry points
+    # ------------------------------------------------------------------
+
+    def extract_wmi_data(self) -> List[Dict]:
+        """Extract WMI (World Manufacturer Identifier) data with make filtering.
+
+        Returns:
+            One dict per ``dbo.Wmi`` row whose ``PublicAvailabilityDate`` is not
+            in the future and whose manufacturer is linked to an allowed make,
+            ordered by ``Id``.
+        """
+        logger.info("Extracting WMI data from source database with make filtering")
+
+        query = f"""
+        SELECT
+            w.Id,
+            w.Wmi,
+            w.ManufacturerId,
+            w.MakeId,
+            w.VehicleTypeId,
+            w.TruckTypeId,
+            w.CountryId,
+            w.PublicAvailabilityDate,
+            w.NonCompliant,
+            w.NonCompliantReason,
+            w.CreatedOn,
+            w.UpdatedOn,
+            w.ProcessedOn
+        FROM dbo.Wmi w
+        WHERE w.PublicAvailabilityDate <= GETDATE()
+        AND w.ManufacturerId IN ({self._allowed_manufacturer_ids_sql()})
+        ORDER BY w.Id
+        """
+
+        results = self._fetch_all(query)
+        logger.info(f"Extracted {len(results)} WMI records")
+        return results
+
+    def extract_wmi_vin_schema_mappings(self) -> List[Dict]:
+        """Extract WMI to VIN Schema mappings with year ranges and make filtering.
+
+        Returns:
+            One dict per ``dbo.Wmi_VinSchema`` row (with the WMI code and schema
+            name joined in) whose WMI passes both the manufacturer and the
+            direct make restriction, ordered by ``WmiId, VinSchemaId``.
+        """
+        logger.info("Extracting WMI-VinSchema mappings with make filtering")
+
+        query = f"""
+        SELECT
+            wvs.WmiId,
+            wvs.VinSchemaId,
+            wvs.YearFrom,
+            wvs.YearTo,
+            w.Wmi,
+            vs.Name as SchemaName
+        FROM dbo.Wmi_VinSchema wvs
+        JOIN dbo.Wmi w ON wvs.WmiId = w.Id
+        JOIN dbo.VinSchema vs ON wvs.VinSchemaId = vs.Id
+        WHERE w.PublicAvailabilityDate <= GETDATE()
+        AND w.ManufacturerId IN ({self._allowed_manufacturer_ids_sql()})
+        AND w.MakeId IN ({self._allowed_make_ids_sql()})
+        ORDER BY wvs.WmiId, wvs.VinSchemaId
+        """
+
+        results = self._fetch_all(query)
+        logger.info(f"Extracted {len(results)} WMI-VinSchema mappings (filtered by allowed makes)")
+        return results
+
+    def extract_patterns_data(self) -> Generator[List[Dict], None, None]:
+        """Extract pattern data in batches with make filtering.
+
+        The total row count is queried first so the progress bar has a length,
+        then pages of ``batch_size`` rows are read with OFFSET/FETCH.
+
+        Yields:
+            Lists of row dicts, at most ``batch_size`` rows each. Iteration
+            stops early if a page comes back empty.
+        """
+        logger.info("Extracting pattern data from source database with make filtering")
+
+        # FROM/WHERE shared by the count and the paged query so both always
+        # describe the same row set. Only patterns for the listed element ids
+        # are taken; what those ids denote is not visible in this module.
+        source_sql = f"""
+        FROM dbo.Pattern p
+        JOIN dbo.Element e ON p.ElementId = e.Id
+        JOIN dbo.VinSchema vs ON p.VinSchemaId = vs.Id
+        JOIN dbo.Wmi_VinSchema wvs ON vs.Id = wvs.VinSchemaId
+        JOIN dbo.Wmi w ON wvs.WmiId = w.Id
+        JOIN dbo.Wmi_Make wm ON w.Id = wm.WmiId
+        JOIN dbo.Make m ON wm.MakeId = m.Id
+        WHERE {self.make_filter.get_sql_filter('m.Name')}
+        AND e.Id IN (26, 27, 28, 18, 24)
+        """
+
+        # First get the total count with filtering
+        with db_connections.mssql_connection() as conn:
+            cursor = conn.cursor()
+            cursor.execute(f"SELECT COUNT(*) as total {source_sql}")
+            total_row = self._row_to_dict(cursor, cursor.fetchone())
+            total_count = total_row.get('total', 0)
+
+        logger.info(f"Total patterns to extract (filtered): {total_count}")
+
+        # The joins repeat each pattern once per matching WMI/make, so p.Id
+        # alone is not a unique sort key and OFFSET/FETCH pages could overlap
+        # or skip rows. w.Id and m.Id are added as tie-breakers; rows that
+        # still tie carry identical selected values.
+        page_query = f"""
+        SELECT
+            p.Id,
+            p.VinSchemaId,
+            p.Keys,
+            p.ElementId,
+            p.AttributeId,
+            e.Name as ElementName,
+            e.weight,
+            e.GroupName,
+            vs.Name as SchemaName,
+            w.Wmi,
+            m.Name as MakeName
+        {source_sql}
+        ORDER BY p.Id, w.Id, m.Id
+        """
+
+        with db_connections.mssql_connection() as conn:
+            cursor = conn.cursor()
+
+            for offset in tqdm(range(0, total_count, self.batch_size), desc="Extracting filtered patterns"):
+                # The paging clause is appended per page rather than filled in
+                # with str.format, so braces in the filter SQL cannot break it.
+                cursor.execute(
+                    f"{page_query}OFFSET {offset} ROWS FETCH NEXT {self.batch_size} ROWS ONLY"
+                )
+                rows = cursor.fetchall()
+
+                if not rows:
+                    break
+                yield self._rows_to_dicts(cursor, rows)
+
+    def extract_elements_data(self) -> List[Dict]:
+        """Extract element definitions.
+
+        Returns:
+            One dict per ``dbo.Element`` row, ordered by ``Id``. No make
+            filtering is applied.
+        """
+        logger.info("Extracting element data")
+
+        query = """
+        SELECT
+            Id,
+            Name,
+            Code,
+            LookupTable,
+            Description,
+            IsPrivate,
+            GroupName,
+            DataType,
+            MinAllowedValue,
+            MaxAllowedValue,
+            IsQS,
+            Decode,
+            weight
+        FROM dbo.Element
+        ORDER BY Id
+        """
+
+        results = self._fetch_all(query)
+        logger.info(f"Extracted {len(results)} element definitions")
+        return results
+
+    def extract_reference_table(self, table_name: str) -> List[Dict]:
+        """Extract data from a reference table with make filtering.
+
+        ``Manufacturer``, ``Make``, ``Model`` and ``Wmi`` are restricted to the
+        allowed makes; any other table is read in full, ordered by ``Id``.
+
+        Args:
+            table_name: Unqualified table name inside the ``dbo`` schema.
+
+        Returns:
+            One dict per selected row.
+
+        Raises:
+            ValueError: If ``table_name`` is not a plain ASCII identifier. The
+                name is interpolated into the SQL text, so anything else is
+                rejected before a query is built.
+        """
+        if not (table_name.isascii() and table_name.isidentifier()):
+            raise ValueError(f"Invalid reference table name: {table_name!r}")
+
+        logger.info(f"Extracting data from {table_name} with make filtering")
+
+        # Apply make filtering - filter by Make brand names (simpler and more efficient)
+        if table_name == 'Manufacturer':
+            # Extract manufacturers linked to filtered makes only
+            query = f"""
+            SELECT DISTINCT mfr.* FROM dbo.Manufacturer mfr
+            JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
+            JOIN dbo.Make m ON mm.MakeId = m.Id
+            WHERE {self.make_filter.get_sql_filter('m.Name')}
+            ORDER BY mfr.Id
+            """
+        elif table_name == 'Make':
+            # Filter makes directly by brand names (GMC, Ford, Toyota, etc.)
+            query = f"""
+            SELECT * FROM dbo.Make
+            WHERE {self.make_filter.get_sql_filter('Name')}
+            ORDER BY Id
+            """
+        elif table_name == 'Model':
+            # Filter models by allowed make brand names
+            query = f"""
+            SELECT md.* FROM dbo.Model md
+            JOIN dbo.Make_Model mm ON md.Id = mm.ModelId
+            JOIN dbo.Make m ON mm.MakeId = m.Id
+            WHERE {self.make_filter.get_sql_filter('m.Name')}
+            ORDER BY md.Id
+            """
+        elif table_name == 'Wmi':
+            # Filter WMI records by allowed manufacturers (linked to makes) AND makes directly
+            query = f"""
+            SELECT w.* FROM dbo.Wmi w
+            WHERE w.PublicAvailabilityDate <= GETDATE()
+            AND w.ManufacturerId IN ({self._allowed_manufacturer_ids_sql()})
+            AND w.MakeId IN ({self._allowed_make_ids_sql()})
+            ORDER BY w.Id
+            """
+        else:
+            # No filtering for other reference tables
+            query = f"SELECT * FROM dbo.{table_name} ORDER BY Id"
+
+        results = self._fetch_all(query)
+        logger.info(f"Extracted {len(results)} records from {table_name} (filtered by allowed makes)")
+        return results
+
+    def extract_make_model_relationships(self) -> List[Dict]:
+        """Extract Make-Model relationships with make filtering.
+
+        Returns:
+            One dict per ``dbo.Make_Model`` row of an allowed make, including
+            the make and model names, ordered by ``MakeId, ModelId``.
+        """
+        logger.info("Extracting Make-Model relationships with make filtering")
+
+        query = f"""
+        SELECT
+            mm.MakeId,
+            mm.ModelId,
+            m.Name as MakeName,
+            md.Name as ModelName
+        FROM dbo.Make_Model mm
+        JOIN dbo.Make m ON mm.MakeId = m.Id
+        JOIN dbo.Model md ON mm.ModelId = md.Id
+        WHERE {self.make_filter.get_sql_filter('m.Name')}
+        ORDER BY mm.MakeId, mm.ModelId
+        """
+
+        results = self._fetch_all(query)
+        logger.info(f"Extracted {len(results)} Make-Model relationships (filtered by allowed makes)")
+        return results
+
+    def extract_wmi_make_relationships(self) -> List[Dict]:
+        """Extract WMI-Make relationships with make filtering.
+
+        A row is kept only when the WMI's manufacturer, the WMI's own make and
+        the related make all pass the make filter.
+
+        Returns:
+            One dict per matching ``dbo.Wmi_Make`` row, including the WMI code
+            and make name, ordered by ``WmiId, MakeId``.
+        """
+        logger.info("Extracting WMI-Make relationships with make filtering")
+
+        query = f"""
+        SELECT
+            wm.WmiId,
+            wm.MakeId,
+            w.Wmi,
+            m.Name as MakeName
+        FROM dbo.Wmi_Make wm
+        JOIN dbo.Wmi w ON wm.WmiId = w.Id
+        JOIN dbo.Make m ON wm.MakeId = m.Id
+        WHERE w.PublicAvailabilityDate <= GETDATE()
+        AND w.ManufacturerId IN ({self._allowed_manufacturer_ids_sql()})
+        AND w.MakeId IN ({self._allowed_make_ids_sql()})
+        AND m.Id IN ({self._allowed_make_ids_sql()})
+        ORDER BY wm.WmiId, wm.MakeId
+        """
+
+        results = self._fetch_all(query)
+        logger.info(f"Extracted {len(results)} WMI-Make relationships (filtered by allowed makes)")
+        return results
+
+    # ------------------------------------------------------------------
+    # Row conversion
+    # ------------------------------------------------------------------
+
+    def _rows_to_dicts(self, cursor, rows) -> List[Dict]:
+        """Convert pyodbc rows to list of dicts using cursor description.
+
+        Args:
+            cursor: Cursor whose ``description`` names the result columns.
+            rows: Rows fetched from that cursor.
+
+        Returns:
+            One ``{column_name: value}`` dict per row; an empty list when
+            ``rows`` is empty (``cursor.description`` is not read in that case).
+        """
+        if not rows:
+            return []
+        columns = [col[0] for col in cursor.description]
+        return [dict(zip(columns, row)) for row in rows]
+
+    def _row_to_dict(self, cursor, row) -> Dict:
+        """Convert single pyodbc row to dict.
+
+        Returns:
+            ``{column_name: value}`` for the row, or an empty dict when ``row``
+            is ``None``.
+        """
+        if row is None:
+            return {}
+        columns = [col[0] for col in cursor.description]
+        return dict(zip(columns, row))
--- a/mvp-platform-services/vehicles/etl/extractors/vin_proc_extractor.py
+++ b/mvp-platform-services/vehicles/etl/extractors/vin_proc_extractor.py
@@ -0,0 +1,63 @@
+import logging
+from typing import Optional, Dict, Any, List
+from ..connections import db_connections
+
+logger = logging.getLogger(__name__)
+
+class VinProcExtractor:
+    """Utilities to inspect and sample the MSSQL VIN decode stored procedure.
+
+    Attributes:
+        proc_name: Procedure executed by ``sample_execute``.
+    """
+
+    def __init__(self, proc_name: str = 'dbo.spVinDecode'):
+        """Remember which stored procedure ``sample_execute`` should call."""
+        self.proc_name = proc_name
+
+    def find_proc(self) -> Optional[Dict[str, Any]]:
+        """Locate the VIN decode proc by name pattern, return basic metadata.
+
+        The lookup matches object names like ``%Vin%Decode%`` and does not use
+        ``self.proc_name``; the most recently created match wins.
+
+        Returns:
+            A dict with ``object_name``, ``schema_name`` and ``type_desc``, or
+            ``None`` (after logging a warning) when nothing matches.
+        """
+        query = """
+            SELECT TOP 1 
+                o.name AS object_name,
+                s.name AS schema_name,
+                o.type_desc
+            FROM sys.objects o
+            JOIN sys.schemas s ON s.schema_id = o.schema_id
+            WHERE o.name LIKE '%Vin%Decode%'
+            ORDER BY o.create_date DESC
+        """
+        with db_connections.mssql_connection() as conn:
+            cur = conn.cursor()
+            cur.execute(query)
+            row = cur.fetchone()
+            if not row:
+                logger.warning("VIN decode stored procedure not found by pattern")
+                return None
+            return { 'object_name': row[0], 'schema_name': row[1], 'type_desc': row[2] }
+
+    def get_definition(self, schema: str, name: str) -> str:
+        """Return the text definition of the proc using sp_helptext semantics.
+
+        Args:
+            schema: Schema that owns the object.
+            name: Object name within that schema.
+
+        Returns:
+            The line segments returned by ``sp_helptext`` joined into one string.
+        """
+        # sp_helptext is a system procedure, so it is called unqualified, and
+        # the object name is bound as a parameter instead of being spliced
+        # into a quoted SQL literal (a quote in either part would break it).
+        sql = "EXEC sp_helptext @objname = ?"
+        with db_connections.mssql_connection() as conn:
+            cur = conn.cursor()
+            cur.execute(sql, (f"{schema}.{name}",))
+            # sp_helptext returns a single NVARCHAR column with line segments
+            return ''.join(row[0] for row in cur.fetchall())
+
+    def sample_execute(self, vin: str) -> Optional[List[Dict[str, Any]]]:
+        """Execute the VIN decode proc with a VIN to capture output shape.
+
+        Args:
+            vin: VIN passed to the procedure as ``@VIN``.
+
+        Returns:
+            The first result set as a list of ``{column: value}`` dicts (empty
+            when the call produced no result set), or ``None`` if execution
+            failed; the failure is logged as a warning, not raised.
+        """
+        # Prefer proc signature with @VIN only; if it requires year, MSSQL will error.
+        sql = f"EXEC {self.proc_name} @VIN=?"
+        with db_connections.mssql_connection() as conn:
+            cur = conn.cursor()
+            try:
+                cur.execute(sql, (vin,))
+                if not cur.description:
+                    # The statement produced no result set.
+                    return []
+                columns = [c[0] for c in cur.description]
+                return [dict(zip(columns, r)) for r in cur.fetchall()]
+            except Exception as e:
+                # Deliberately best-effort: this is a probing helper.
+                logger.warning(f"VIN proc sample execution failed: {e}")
+                return None
+