New Vehicle Database

This commit is contained in:
Eric Gullickson
2025-11-10 11:20:31 -06:00
parent b50942e909
commit cd118c8f9d
66 changed files with 1552520 additions and 0 deletions

View File

@@ -0,0 +1,480 @@
#!/usr/bin/env python3
"""
ETL Script for Automotive Vehicle Selection Database
Generates SQL import files for loading into PostgreSQL
No database connection required - pure file-based processing
"""
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Set, Tuple, Optional
class VehicleSQLGenerator:
    """Generate PostgreSQL import files (engines, transmissions, vehicle
    options) from raw JSON vehicle data — no live database connection needed.
    """

    def __init__(self):
        # Directory holding one JSON file per vehicle make
        self.makes_filter_dir = Path('makes-filter')
        # Raw JSON payloads, populated by load_json_files()
        self.engines_data = []
        self.automobiles_data = []
        self.brands_data = []
        # Only vehicles from this model year onward are kept
        self.min_year = 1980
        # Lookup caches built while generating the engine/transmission SQL:
        #   (displacement, configuration) -> engine id
        #   (type, speeds, drive_type)    -> transmission id
        self.engine_cache = {}
        self.transmission_cache = {}
        # Accumulated vehicle option rows awaiting SQL generation
        self.vehicle_records = []
        # Destination SQL files (numbered so they import in order)
        self.engines_sql_file = 'output/01_engines.sql'
        self.transmissions_sql_file = 'output/02_transmissions.sql'
        self.vehicles_sql_file = 'output/03_vehicle_options.sql'
def load_json_files(self):
"""Load the large JSON data files"""
print("\n📂 Loading source JSON files...")
print(" Loading engines.json...")
with open('engines.json', 'r', encoding='utf-8') as f:
self.engines_data = json.load(f)
print(f" ✓ Loaded {len(self.engines_data)} engine records")
print(" Loading automobiles.json...")
with open('automobiles.json', 'r', encoding='utf-8') as f:
self.automobiles_data = json.load(f)
print(f" ✓ Loaded {len(self.automobiles_data)} automobile records")
print(" Loading brands.json...")
with open('brands.json', 'r', encoding='utf-8') as f:
self.brands_data = json.load(f)
print(f" ✓ Loaded {len(self.brands_data)} brand records")
def parse_engine_string(self, engine_str: str) -> Tuple[Optional[str], Optional[str]]:
"""Parse engine string like '2.0L I4' into displacement and configuration"""
pattern = r'(\d+\.?\d*L?)\s*([IVL]\d+|[A-Z]+\d*)'
match = re.search(pattern, engine_str, re.IGNORECASE)
if match:
displacement = match.group(1).upper()
if not displacement.endswith('L'):
displacement += 'L'
configuration = match.group(2).upper()
return (displacement, configuration)
return (None, None)
def extract_engine_specs(self, engine_record: Dict) -> Dict:
"""Extract relevant specs from engine JSON record"""
specs = engine_record.get('specs', {})
engine_specs = specs.get('Engine Specs', {})
trans_specs = specs.get('Transmission Specs', {})
return {
'name': engine_record.get('name', ''),
'displacement': engine_specs.get('Displacement', ''),
'configuration': engine_specs.get('Cylinders', ''),
'horsepower': engine_specs.get('Power', ''),
'torque': engine_specs.get('Torque', ''),
'fuel_type': engine_specs.get('Fuel', ''),
'fuel_system': engine_specs.get('Fuel System', ''),
'aspiration': engine_specs.get('Aspiration', ''),
'transmission_type': trans_specs.get('Gearbox', ''),
'drive_type': trans_specs.get('Drive Type', ''),
'specs_json': specs
}
def sql_escape(self, value):
"""Escape values for SQL"""
if value is None:
return 'NULL'
if isinstance(value, (int, float)):
return str(value)
if isinstance(value, dict):
# Convert dict to JSON string and escape
json_str = json.dumps(value)
return "'" + json_str.replace("'", "''") + "'"
# String - escape single quotes
return "'" + str(value).replace("'", "''") + "'"
def generate_engines_sql(self):
"""Generate SQL file for engines and transmissions"""
print("\n⚙️ Generating engine and transmission SQL...")
os.makedirs('output', exist_ok=True)
# Process engines
engines_insert_values = []
transmissions_set = set()
engine_id = 1
for engine_record in self.engines_data:
specs = self.extract_engine_specs(engine_record)
values = (
engine_id,
self.sql_escape(specs['name']),
self.sql_escape(specs['displacement']),
self.sql_escape(specs['configuration']),
self.sql_escape(specs['horsepower']),
self.sql_escape(specs['torque']),
self.sql_escape(specs['fuel_type']),
self.sql_escape(specs['fuel_system']),
self.sql_escape(specs['aspiration']),
self.sql_escape(specs['specs_json'])
)
engines_insert_values.append(f"({','.join(map(str, values))})")
# Build engine cache
if specs['displacement'] and specs['configuration']:
disp_norm = specs['displacement'].upper().strip()
config_norm = specs['configuration'].upper().strip()
key = (disp_norm, config_norm)
if key not in self.engine_cache:
self.engine_cache[key] = engine_id
# Extract transmission
if specs['transmission_type'] or specs['drive_type']:
speeds = None
if specs['transmission_type']:
speed_match = re.search(r'(\d+)', specs['transmission_type'])
if speed_match:
speeds = speed_match.group(1)
trans_tuple = (
specs['transmission_type'] or 'Unknown',
speeds,
specs['drive_type'] or 'Unknown'
)
transmissions_set.add(trans_tuple)
engine_id += 1
# Write engines SQL file
print(f" Writing {len(engines_insert_values)} engines to SQL file...")
with open(self.engines_sql_file, 'w', encoding='utf-8') as f:
f.write("-- Engines data import\n")
f.write("-- Generated by ETL script\n\n")
f.write("BEGIN;\n\n")
# Write in batches of 500 for better performance
batch_size = 500
for i in range(0, len(engines_insert_values), batch_size):
batch = engines_insert_values[i:i+batch_size]
f.write("INSERT INTO engines (id, name, displacement, configuration, horsepower, torque, fuel_type, fuel_system, aspiration, specs_json) VALUES\n")
f.write(",\n".join(batch))
f.write(";\n\n")
# Reset sequence
f.write(f"SELECT setval('engines_id_seq', {engine_id});\n\n")
f.write("COMMIT;\n")
print(f" ✓ Wrote engines SQL to {self.engines_sql_file}")
# Write transmissions SQL file
print(f" Writing {len(transmissions_set)} transmissions to SQL file...")
trans_id = 1
with open(self.transmissions_sql_file, 'w', encoding='utf-8') as f:
f.write("-- Transmissions data import\n")
f.write("-- Generated by ETL script\n\n")
f.write("BEGIN;\n\n")
f.write("INSERT INTO transmissions (id, type, speeds, drive_type) VALUES\n")
trans_values = []
for trans_type, speeds, drive_type in sorted(transmissions_set):
values = (
trans_id,
self.sql_escape(trans_type),
self.sql_escape(speeds),
self.sql_escape(drive_type)
)
trans_values.append(f"({','.join(map(str, values))})")
# Build transmission cache
key = (trans_type, speeds, drive_type)
self.transmission_cache[key] = trans_id
trans_id += 1
f.write(",\n".join(trans_values))
f.write(";\n\n")
f.write(f"SELECT setval('transmissions_id_seq', {trans_id});\n\n")
f.write("COMMIT;\n")
print(f" ✓ Wrote transmissions SQL to {self.transmissions_sql_file}")
print(f" ✓ Built engine cache with {len(self.engine_cache)} combinations")
def find_matching_engine_id(self, engine_str: str) -> Optional[int]:
"""Find engine_id from cache based on engine string"""
disp, config = self.parse_engine_string(engine_str)
if disp and config:
key = (disp, config)
if key in self.engine_cache:
return self.engine_cache[key]
# Try normalized variations
for cached_key, engine_id in self.engine_cache.items():
cached_disp, cached_config = cached_key
if cached_disp == disp and self.config_matches(config, cached_config):
return engine_id
return None
def config_matches(self, config1: str, config2: str) -> bool:
"""Check if two engine configurations match"""
c1 = config1.upper().replace('-', '').replace(' ', '')
c2 = config2.upper().replace('-', '').replace(' ', '')
if c1 == c2:
return True
if c1.replace('I', 'L') == c2.replace('I', 'L'):
return True
if 'INLINE' in c1 or 'INLINE' in c2:
c1_num = re.search(r'\d+', c1)
c2_num = re.search(r'\d+', c2)
if c1_num and c2_num and c1_num.group() == c2_num.group():
return True
return False
def process_makes_filter(self):
"""Process all makes-filter JSON files and build vehicle records"""
print(f"\n🚗 Processing makes-filter JSON files (filtering for {self.min_year}+)...")
json_files = list(self.makes_filter_dir.glob('*.json'))
print(f" Found {len(json_files)} make files to process")
total_records = 0
filtered_records = 0
for json_file in sorted(json_files):
make_name = json_file.stem.replace('_', ' ').title()
print(f" Processing {make_name}...")
with open(json_file, 'r', encoding='utf-8') as f:
make_data = json.load(f)
for brand_key, year_entries in make_data.items():
for year_entry in year_entries:
year = int(year_entry.get('year', 0))
if year == 0:
continue
# Filter out vehicles older than min_year
if year < self.min_year:
filtered_records += 1
continue
models = year_entry.get('models', [])
for model in models:
model_name = model.get('name', '')
engines = model.get('engines', [])
submodels = model.get('submodels', [])
if not submodels:
submodels = ['Base']
for trim in submodels:
for engine_str in engines:
engine_id = self.find_matching_engine_id(engine_str)
transmission_id = None
self.vehicle_records.append({
'year': year,
'make': make_name,
'model': model_name,
'trim': trim,
'engine_id': engine_id,
'transmission_id': transmission_id
})
total_records += 1
print(f" ✓ Processed {total_records} vehicle configuration records")
print(f" ✓ Filtered out {filtered_records} records older than {self.min_year}")
def hybrid_backfill(self):
"""Hybrid backfill for recent years from automobiles.json"""
print("\n🔄 Performing hybrid backfill for recent years...")
existing_combos = set()
for record in self.vehicle_records:
key = (record['year'], record['make'].lower(), record['model'].lower())
existing_combos.add(key)
brand_map = {}
for brand in self.brands_data:
brand_id = brand.get('id')
brand_name = brand.get('name', '').lower()
brand_map[brand_id] = brand_name
filtered_makes = set()
for json_file in self.makes_filter_dir.glob('*.json'):
make_name = json_file.stem.replace('_', ' ').lower()
filtered_makes.add(make_name)
backfill_count = 0
recent_years = [2023, 2024, 2025]
for auto in self.automobiles_data:
brand_id = auto.get('brand_id')
brand_name = brand_map.get(brand_id, '').lower()
if brand_name not in filtered_makes:
continue
auto_name = auto.get('name', '')
year_match = re.search(r'(202[3-5])', auto_name)
if not year_match:
continue
year = int(year_match.group(1))
if year not in recent_years:
continue
# Apply year filter to backfill as well
if year < self.min_year:
continue
model_name = auto_name
for remove_str in [str(year), brand_name]:
model_name = model_name.replace(remove_str, '')
model_name = model_name.strip()
key = (year, brand_name, model_name.lower())
if key in existing_combos:
continue
auto_id = auto.get('id')
matching_engines = [e for e in self.engines_data if e.get('automobile_id') == auto_id]
if not matching_engines:
continue
for engine_record in matching_engines[:3]:
specs = self.extract_engine_specs(engine_record)
engine_id = None
if specs['displacement'] and specs['configuration']:
disp_norm = specs['displacement'].upper().strip()
config_norm = specs['configuration'].upper().strip()
key = (disp_norm, config_norm)
engine_id = self.engine_cache.get(key)
self.vehicle_records.append({
'year': year,
'make': brand_name.title(),
'model': model_name,
'trim': 'Base',
'engine_id': engine_id,
'transmission_id': None
})
backfill_count += 1
existing_combos.add((year, brand_name, model_name.lower()))
print(f" ✓ Backfilled {backfill_count} recent vehicle configurations")
def generate_vehicles_sql(self):
"""Generate SQL file for vehicle_options"""
print("\n📝 Generating vehicle options SQL...")
with open(self.vehicles_sql_file, 'w', encoding='utf-8') as f:
f.write("-- Vehicle options data import\n")
f.write("-- Generated by ETL script\n\n")
f.write("BEGIN;\n\n")
# Write in batches of 1000
batch_size = 1000
total_batches = (len(self.vehicle_records) + batch_size - 1) // batch_size
for batch_num in range(total_batches):
start_idx = batch_num * batch_size
end_idx = min(start_idx + batch_size, len(self.vehicle_records))
batch = self.vehicle_records[start_idx:end_idx]
f.write("INSERT INTO vehicle_options (year, make, model, trim, engine_id, transmission_id) VALUES\n")
values_list = []
for record in batch:
values = (
record['year'],
self.sql_escape(record['make']),
self.sql_escape(record['model']),
self.sql_escape(record['trim']),
record['engine_id'] if record['engine_id'] else 'NULL',
record['transmission_id'] if record['transmission_id'] else 'NULL'
)
values_list.append(f"({','.join(map(str, values))})")
f.write(",\n".join(values_list))
f.write(";\n\n")
print(f" Batch {batch_num + 1}/{total_batches} written ({len(batch)} records)")
f.write("COMMIT;\n")
print(f" ✓ Wrote {len(self.vehicle_records)} vehicle options to {self.vehicles_sql_file}")
def generate_stats(self):
"""Generate statistics file"""
print("\n📊 Generating statistics...")
stats = {
'total_engines': len(self.engines_data),
'total_transmissions': len(self.transmission_cache),
'total_vehicles': len(self.vehicle_records),
'unique_years': len(set(r['year'] for r in self.vehicle_records)),
'unique_makes': len(set(r['make'] for r in self.vehicle_records)),
'unique_models': len(set(r['model'] for r in self.vehicle_records)),
'year_range': f"{min(r['year'] for r in self.vehicle_records)}-{max(r['year'] for r in self.vehicle_records)}"
}
with open('output/stats.txt', 'w') as f:
f.write("=" * 60 + "\n")
f.write("ETL Statistics\n")
f.write("=" * 60 + "\n\n")
for key, value in stats.items():
formatted_value = f"{value:,}" if isinstance(value, int) else value
f.write(f"{key.replace('_', ' ').title()}: {formatted_value}\n")
print("\n📊 Statistics:")
for key, value in stats.items():
formatted_value = f"{value:,}" if isinstance(value, int) else value
print(f" {key.replace('_', ' ').title()}: {formatted_value}")
def run(self):
"""Execute the complete ETL pipeline"""
try:
print("=" * 60)
print("🚀 Automotive Vehicle ETL - SQL Generator")
print(f" Year Filter: {self.min_year} and newer")
print("=" * 60)
self.load_json_files()
self.generate_engines_sql()
self.process_makes_filter()
self.hybrid_backfill()
self.generate_vehicles_sql()
self.generate_stats()
print("\n" + "=" * 60)
print("✅ SQL Files Generated Successfully!")
print("=" * 60)
print("\nGenerated files:")
print(f" - {self.engines_sql_file}")
print(f" - {self.transmissions_sql_file}")
print(f" - {self.vehicles_sql_file}")
print(f" - output/stats.txt")
print("\nNext step: Import SQL files into database")
print(" cat output/*.sql | docker exec -i mvp-postgres psql -U postgres -d motovaultpro")
except Exception as e:
print(f"\n❌ ETL Pipeline Failed: {e}")
import traceback
traceback.print_exc()
raise
if __name__ == '__main__':
    # Script entry point: run the full ETL pipeline.
    VehicleSQLGenerator().run()