"""
JSON Manual Loader for Vehicles ETL

Loads extracted JSON data into PostgreSQL database with referential integrity.
Supports clear/append modes with duplicate handling and comprehensive progress tracking.

Database Schema:
- vehicles.make (id, name)
- vehicles.model (id, make_id, name)
- vehicles.model_year (id, model_id, year)
- vehicles.trim (id, model_year_id, name)
- vehicles.engine (id, name, code, displacement_l, cylinders, fuel_type, aspiration)
- vehicles.trim_engine (trim_id, engine_id)

Load Modes:
- CLEAR: Truncate all tables and reload (destructive)
- APPEND: Insert with conflict resolution (safe)

Usage:
    loader = JsonManualLoader(postgres_loader)
    result = loader.load_all_makes(extraction_result.makes, LoadMode.APPEND)
"""

import logging
from typing import List, Dict, Optional, Tuple
from enum import Enum
from dataclasses import dataclass

from psycopg2.extras import execute_batch

# Import our components (handle both relative and direct imports)
try:
    from .postgres_loader import PostgreSQLLoader
    from ..extractors.json_extractor import MakeData, ModelData, ExtractionResult
    from ..utils.engine_spec_parser import EngineSpec
    from ..connections import db_connections
except ImportError:
    # Fallback for direct execution
    import sys
    import os
    sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))

    # Import with fallback handling for nested imports
    try:
        from loaders.postgres_loader import PostgreSQLLoader
    except ImportError:
        # Mock PostgreSQLLoader for testing
        class PostgreSQLLoader:
            def __init__(self):
                self.batch_size = 1000

    from extractors.json_extractor import MakeData, ModelData, ExtractionResult
    from utils.engine_spec_parser import EngineSpec

    try:
        from connections import db_connections
    except ImportError:
        # Mock db_connections for testing
        class MockDBConnections:
            def postgres_connection(self):
                raise NotImplementedError("Database connection not available in test mode")

        db_connections = MockDBConnections()

logger = logging.getLogger(__name__)


class LoadMode(Enum):
    """Data loading modes"""
    CLEAR = "clear"    # Truncate and reload (destructive)
    APPEND = "append"  # Insert with conflict handling (safe)


@dataclass
class LoadResult:
    """Result of loading operations"""
    total_makes: int
    total_models: int
    total_model_years: int
    total_trims: int
    total_engines: int
    total_trim_engine_mappings: int
    failed_makes: List[str]
    warnings: List[str]
    load_mode: LoadMode

    @property
    def success_count(self) -> int:
        """Number of makes that are not listed in ``failed_makes``."""
        return self.total_makes - len(self.failed_makes)

    @property
    def success_rate(self) -> float:
        """Fraction of makes loaded successfully (0.0 when there were no makes)."""
        return self.success_count / self.total_makes if self.total_makes > 0 else 0.0


@dataclass
class LoadStatistics:
    """Detailed loading statistics"""
    makes_processed: int = 0
    makes_skipped: int = 0
    models_inserted: int = 0
    model_years_inserted: int = 0
    skipped_model_years: int = 0
    trims_inserted: int = 0
    engines_inserted: int = 0
    trim_engine_mappings_inserted: int = 0
    duplicate_makes: int = 0
    duplicate_models: int = 0
    duplicate_engines: int = 0
    # None (the default, or passed explicitly) is replaced by a fresh list in
    # __post_init__ so that instances never share a mutable default.
    errors: Optional[List[str]] = None
    warnings: Optional[List[str]] = None

    def __post_init__(self):
        if self.errors is None:
            self.errors = []
        if self.warnings is None:
            self.warnings = []


class JsonManualLoader:
    """Load JSON-extracted vehicle data into PostgreSQL"""

    # Years outside this inclusive range are skipped by load_model_year.
    MIN_MODEL_YEAR = 1950
    MAX_MODEL_YEAR = 2100

    def __init__(self, postgres_loader: Optional[PostgreSQLLoader] = None):
        """
        Initialize JSON manual loader

        Args:
            postgres_loader: Existing PostgreSQL loader instance
        """
        self.postgres_loader = postgres_loader or PostgreSQLLoader()
        self.batch_size = 1000
        logger.info("JsonManualLoader initialized")

    def _get_id_from_result(self, result, column_name='id'):
        """
        Helper to extract ID from query result, handling both tuple and dict cursors

        Args:
            result: Row returned by ``cursor.fetchone()`` (may be None)
            column_name: Preferred key to read from mapping-style rows

        Returns:
            The first element for tuple rows; for mapping-style rows the value
            under ``column_name``, else under ``'count'``, else the first value.
            None when ``result`` is None or an empty mapping.
        """
        if result is None:
            return None
        if isinstance(result, tuple):
            return result[0]

        # Look the value up by key rather than testing `key in result`:
        # membership on list-like rows tests the values, not the column names.
        for key in (column_name, 'count'):
            try:
                return result[key]
            except (KeyError, IndexError, TypeError):
                continue

        # Fall back to first value
        return next(iter(result.values()), None)

    def _find_or_insert(self, cursor, mode: LoadMode, select_sql: str,
                        insert_sql: str, params: tuple) -> Tuple[int, bool]:
        """
        Shared "insert or get" step for model, model_year and trim rows

        In APPEND mode the row is looked up first and reused when present;
        in CLEAR mode the row is always inserted.

        Args:
            cursor: Database cursor
            mode: Loading mode
            select_sql: Query returning the id of an existing row
            insert_sql: INSERT ... RETURNING id statement
            params: Parameters shared by both statements

        Returns:
            (row_id, inserted) where ``inserted`` is False if an existing row was reused
        """
        if mode == LoadMode.APPEND:
            cursor.execute(select_sql, params)
            existing = cursor.fetchone()
            if existing:
                return self._get_id_from_result(existing), False

        cursor.execute(insert_sql, params)
        return self._get_id_from_result(cursor.fetchone()), True

    def clear_all_tables(self) -> None:
        """
        Clear all vehicles tables in dependency order

        Each table is truncated and committed on its own. PostgreSQL aborts the
        whole transaction when a statement fails, so with a single shared
        transaction one failing TRUNCATE would make every later TRUNCATE fail
        and nothing would be cleared at all.

        WARNING: This is destructive and will remove all data
        """
        logger.warning("CLEARING ALL VEHICLES TABLES - This is destructive!")

        tables_to_clear = [
            'trim_engine',        # Many-to-many mappings first
            'trim_transmission',
            'performance',        # Tables with foreign keys
            'trim',
            'model_year',
            'model',
            'make',
            'engine',             # Independent tables last
            'transmission',
        ]

        failed_tables = []
        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()
            for table in tables_to_clear:
                try:
                    # Table names come from the fixed list above, never from input.
                    cursor.execute(f"TRUNCATE TABLE vehicles.{table} CASCADE")
                    conn.commit()
                    logger.info(f"Cleared vehicles.{table}")
                except Exception as e:
                    # Best effort: reset the failed transaction and keep going.
                    conn.rollback()
                    failed_tables.append(table)
                    logger.warning(f"Failed to clear vehicles.{table}: {str(e)}")

        if failed_tables:
            logger.warning(f"Vehicles tables not cleared: {', '.join(failed_tables)}")
        else:
            logger.info("All vehicles tables cleared")

    def _get_or_create_make(self, cursor, make_name: str, stats: LoadStatistics) -> int:
        """Return the id of the make named ``make_name``, inserting it when missing."""
        # Case-insensitive lookup to match the database constraint
        select_sql = "SELECT id FROM vehicles.make WHERE lower(name) = lower(%s)"

        cursor.execute(select_sql, (make_name,))
        existing = cursor.fetchone()
        if existing:
            make_id = self._get_id_from_result(existing)
            stats.duplicate_makes += 1
            logger.debug(f"Make {make_name} already exists with ID {make_id}")
            return make_id

        # ON CONFLICT DO NOTHING returns no row instead of raising when another
        # writer inserted the same make first; a raised unique violation would
        # abort the transaction and make any follow-up query fail.
        cursor.execute(
            "INSERT INTO vehicles.make (name) VALUES (%s) ON CONFLICT DO NOTHING RETURNING id",
            (make_name,)
        )
        inserted = cursor.fetchone()
        if inserted:
            make_id = self._get_id_from_result(inserted)
            logger.debug(f"Inserted make {make_name} with ID {make_id}")
            return make_id

        # Retry the lookup in case of race condition
        cursor.execute(select_sql, (make_name,))
        existing = cursor.fetchone()
        if not existing:
            raise RuntimeError(f"Make {make_name} could not be inserted or found")
        make_id = self._get_id_from_result(existing)
        stats.duplicate_makes += 1
        logger.debug(f"Make {make_name} found after retry with ID {make_id}")
        return make_id

    def load_make(self, make_data: MakeData, mode: LoadMode, stats: LoadStatistics) -> int:
        """
        Load a single make with all related data

        The make and all of its models are written on one connection and
        committed together at the end.

        Args:
            make_data: Extracted make data
            mode: Loading mode (clear/append)
            stats: Statistics accumulator

        Returns:
            Make ID in database

        Raises:
            Exception: Any failure is logged, recorded in ``stats.errors`` and re-raised
        """
        logger.debug(f"Loading make: {make_data.name}")

        try:
            with db_connections.postgres_connection() as conn:
                cursor = conn.cursor()

                # 1. Insert or get make (always check for existing to avoid constraint violations)
                make_id = self._get_or_create_make(cursor, make_data.name, stats)

                # 2. Process models
                for model_data in make_data.models:
                    self.load_model(cursor, make_id, model_data, mode, stats)

                conn.commit()
                stats.makes_processed += 1
                return make_id

        except Exception as e:
            error_msg = f"Failed to load make {make_data.name}: {str(e)}"
            logger.error(error_msg)
            stats.errors.append(error_msg)
            raise

    def load_model(self, cursor, make_id: int, model_data: ModelData,
                   mode: LoadMode, stats: LoadStatistics) -> int:
        """
        Load a single model with all related data

        Args:
            cursor: Database cursor
            make_id: Parent make ID
            model_data: Extracted model data
            mode: Loading mode
            stats: Statistics accumulator

        Returns:
            Model ID in database
        """
        # 1. Insert or get model
        model_id, inserted = self._find_or_insert(
            cursor,
            mode,
            "SELECT id FROM vehicles.model WHERE make_id = %s AND name = %s",
            "INSERT INTO vehicles.model (make_id, name) VALUES (%s, %s) RETURNING id",
            (make_id, model_data.name),
        )
        if inserted:
            stats.models_inserted += 1
        else:
            stats.duplicate_models += 1

        # 2. Insert model years and related data (out-of-range years are
        #    skipped inside load_model_year)
        for year in model_data.years:
            self.load_model_year(cursor, model_id, year, model_data, mode, stats)

        return model_id

    def load_model_year(self, cursor, model_id: int, year: int, model_data: ModelData,
                        mode: LoadMode, stats: LoadStatistics) -> Optional[int]:
        """
        Load model year and associated trims/engines

        Args:
            cursor: Database cursor
            model_id: Parent model ID
            year: Model year
            model_data: Model data with trims and engines
            mode: Loading mode
            stats: Statistics accumulator

        Returns:
            Model year ID in database, or None when the year is outside the
            valid range and was skipped
        """
        # Skip years that don't meet database constraints (must be 1950-2100)
        if not self.MIN_MODEL_YEAR <= year <= self.MAX_MODEL_YEAR:
            logger.warning(
                f"Skipping year {year} - outside valid range "
                f"({self.MIN_MODEL_YEAR}-{self.MAX_MODEL_YEAR})"
            )
            stats.skipped_model_years += 1
            # Record the skip so it surfaces in LoadResult.warnings and the report.
            stats.warnings.append(
                f"Skipped year {year} for model {model_data.name}: outside valid range "
                f"({self.MIN_MODEL_YEAR}-{self.MAX_MODEL_YEAR})"
            )
            return None

        # 1. Insert or get model year
        model_year_id, inserted = self._find_or_insert(
            cursor,
            mode,
            "SELECT id FROM vehicles.model_year WHERE model_id = %s AND year = %s",
            "INSERT INTO vehicles.model_year (model_id, year) VALUES (%s, %s) RETURNING id",
            (model_id, year),
        )
        if inserted:
            stats.model_years_inserted += 1

        # 2. Load engines and get their IDs
        engine_ids = [
            self.load_engine(cursor, engine_spec, mode, stats)
            for engine_spec in model_data.engines
        ]

        # 3. Load trims and connect to engines
        for trim_name in model_data.trims:
            self.load_trim(cursor, model_year_id, trim_name, engine_ids, mode, stats)

        return model_year_id

    def load_engine(self, cursor, engine_spec: EngineSpec, mode: LoadMode,
                    stats: LoadStatistics) -> int:
        """
        Load engine specification

        Args:
            cursor: Database cursor
            engine_spec: Parsed engine specification
            mode: Loading mode (not consulted: existing engines are always reused)
            stats: Statistics accumulator

        Returns:
            Engine ID in database

        Raises:
            RuntimeError: If the engine was neither inserted nor found afterwards
        """
        # Create a canonical engine name for database storage
        if engine_spec.displacement_l and engine_spec.configuration != "Unknown" and engine_spec.cylinders:
            engine_name = f"{engine_spec.displacement_l}L {engine_spec.configuration}{engine_spec.cylinders}"
        else:
            engine_name = engine_spec.raw_string

        # Generate engine code from name (remove spaces, lowercase)
        engine_code = engine_name.replace(" ", "").lower()

        select_sql = """
            SELECT id FROM vehicles.engine
            WHERE lower(name) = lower(%s) OR (code IS NOT NULL AND code = %s)
        """

        # Always check for existing engine by name or code to avoid constraint violations
        cursor.execute(select_sql, (engine_name, engine_code))
        existing = cursor.fetchone()
        if existing:
            stats.duplicate_engines += 1
            return self._get_id_from_result(existing)

        # Insert new engine. ON CONFLICT DO NOTHING yields no row instead of
        # raising on a concurrent duplicate, which keeps the surrounding
        # transaction usable for the follow-up lookup.
        cursor.execute("""
            INSERT INTO vehicles.engine (name, code, displacement_l, cylinders, fuel_type, aspiration)
            VALUES (%s, %s, %s, %s, %s, %s)
            ON CONFLICT DO NOTHING
            RETURNING id
        """, (
            engine_name,
            engine_code,
            engine_spec.displacement_l,
            engine_spec.cylinders,
            # "Unknown" fuel type and "Natural" aspiration are stored as NULL
            engine_spec.fuel_type if engine_spec.fuel_type != "Unknown" else None,
            engine_spec.aspiration if engine_spec.aspiration != "Natural" else None,
        ))
        inserted = cursor.fetchone()
        if inserted:
            stats.engines_inserted += 1
            return self._get_id_from_result(inserted)

        # Retry the lookup in case of race condition
        cursor.execute(select_sql, (engine_name, engine_code))
        existing = cursor.fetchone()
        if not existing:
            raise RuntimeError(f"Engine {engine_name} could not be inserted or found")
        stats.duplicate_engines += 1
        return self._get_id_from_result(existing)

    def load_trim(self, cursor, model_year_id: int, trim_name: str, engine_ids: List[int],
                  mode: LoadMode, stats: LoadStatistics) -> int:
        """
        Load trim and connect to engines

        Args:
            cursor: Database cursor
            model_year_id: Parent model year ID
            trim_name: Trim name
            engine_ids: List of engine IDs to connect
            mode: Loading mode
            stats: Statistics accumulator

        Returns:
            Trim ID in database
        """
        # 1. Insert or get trim
        trim_id, inserted = self._find_or_insert(
            cursor,
            mode,
            "SELECT id FROM vehicles.trim WHERE model_year_id = %s AND name = %s",
            "INSERT INTO vehicles.trim (model_year_id, name) VALUES (%s, %s) RETURNING id",
            (model_year_id, trim_name),
        )
        if inserted:
            stats.trims_inserted += 1

        # 2. Connect trim to engines (always check for existing to avoid duplicates)
        # Deduplicate engine_ids (keeping first-seen order) to prevent duplicate
        # mappings within the same trim
        for engine_id in dict.fromkeys(engine_ids):
            # Check if mapping already exists
            cursor.execute(
                "SELECT 1 FROM vehicles.trim_engine WHERE trim_id = %s AND engine_id = %s",
                (trim_id, engine_id)
            )
            if cursor.fetchone():
                continue

            # ON CONFLICT DO NOTHING instead of catching a unique violation:
            # a raised error would abort the transaction for every later statement.
            cursor.execute(
                "INSERT INTO vehicles.trim_engine (trim_id, engine_id) VALUES (%s, %s) "
                "ON CONFLICT DO NOTHING",
                (trim_id, engine_id)
            )
            if cursor.rowcount > 0:
                stats.trim_engine_mappings_inserted += 1
            else:
                # Another process may have inserted it, skip
                logger.debug(f"Trim-engine mapping ({trim_id}, {engine_id}) already exists, skipping")

        return trim_id

    def load_all_makes(self, makes_data: List[MakeData], mode: LoadMode) -> LoadResult:
        """
        Load all makes with complete data

        Makes that carry extraction errors are skipped and reported as failed.
        A failure while loading one make does not stop the remaining makes.

        Args:
            makes_data: List of extracted make data
            mode: Loading mode (clear/append)

        Returns:
            LoadResult with comprehensive statistics
        """
        logger.info(f"Starting bulk load of {len(makes_data)} makes in {mode.value} mode")

        # Clear tables if in CLEAR mode
        if mode == LoadMode.CLEAR:
            self.clear_all_tables()

        stats = LoadStatistics()
        failed_makes = []

        for make_data in makes_data:
            try:
                if make_data.processing_errors:
                    logger.warning(f"Skipping make {make_data.name} due to extraction errors")
                    stats.makes_skipped += 1
                    failed_makes.append(make_data.name)
                    continue

                make_id = self.load_make(make_data, mode, stats)
                logger.info(f"Successfully loaded make {make_data.name} (ID: {make_id})")

            except Exception as e:
                logger.error(f"Failed to load make {make_data.name}: {str(e)}")
                failed_makes.append(make_data.name)

        # Create result
        result = LoadResult(
            total_makes=len(makes_data),
            total_models=stats.models_inserted,
            total_model_years=stats.model_years_inserted,
            total_trims=stats.trims_inserted,
            total_engines=stats.engines_inserted,
            total_trim_engine_mappings=stats.trim_engine_mappings_inserted,
            failed_makes=failed_makes,
            warnings=stats.warnings,
            load_mode=mode
        )

        logger.info(f"Bulk load complete: {result.success_count}/{result.total_makes} makes loaded successfully")
        logger.info(f"Data loaded: {result.total_models} models, {result.total_engines} engines, {result.total_trims} trims")

        return result

    def get_database_statistics(self) -> Dict[str, int]:
        """
        Get current database record counts

        Returns:
            Dictionary with table counts
        """
        stats = {}
        tables = ['make', 'model', 'model_year', 'trim', 'engine', 'trim_engine']

        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()
            for table in tables:
                # Table names come from the fixed list above, never from input.
                cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table}")
                stats[table] = self._get_id_from_result(cursor.fetchone(), 'count')

        return stats

    def validate_referential_integrity(self) -> List[str]:
        """
        Validate referential integrity of loaded data

        Returns:
            List of integrity issues found (empty if all good)
        """
        # (description used in the issue message, COUNT(*) query for offending rows)
        checks = (
            ("orphaned models", """
                SELECT COUNT(*) FROM vehicles.model m
                LEFT JOIN vehicles.make mk ON m.make_id = mk.id
                WHERE mk.id IS NULL
            """),
            ("orphaned model_years", """
                SELECT COUNT(*) FROM vehicles.model_year my
                LEFT JOIN vehicles.model m ON my.model_id = m.id
                WHERE m.id IS NULL
            """),
            ("orphaned trims", """
                SELECT COUNT(*) FROM vehicles.trim t
                LEFT JOIN vehicles.model_year my ON t.model_year_id = my.id
                WHERE my.id IS NULL
            """),
            ("broken trim_engine mappings", """
                SELECT COUNT(*) FROM vehicles.trim_engine te
                LEFT JOIN vehicles.trim t ON te.trim_id = t.id
                LEFT JOIN vehicles.engine e ON te.engine_id = e.id
                WHERE t.id IS NULL OR e.id IS NULL
            """),
        )

        issues = []
        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()
            for description, query in checks:
                cursor.execute(query)
                offending = self._get_id_from_result(cursor.fetchone(), 'count')
                if offending > 0:
                    issues.append(f"Found {offending} {description}")

        if issues:
            logger.warning(f"Referential integrity issues found: {issues}")
        else:
            logger.info("Referential integrity validation passed")

        return issues

    def print_load_report(self, result: LoadResult) -> None:
        """
        Print comprehensive loading report

        Besides the figures in ``result``, this queries the database for
        current table counts and runs the referential integrity validation.

        Args:
            result: LoadResult from load operation
        """
        print("🚀 JSON MANUAL LOADING REPORT")
        print("=" * 50)

        # Load summary
        print("\n📊 LOADING SUMMARY")
        print(f" Mode: {result.load_mode.value.upper()}")
        print(f" Makes processed: {result.success_count}/{result.total_makes}")
        print(f" Success rate: {result.success_rate:.1%}")

        # Data counts
        print("\n📈 DATA LOADED")
        print(f" Models: {result.total_models}")
        print(f" Model years: {result.total_model_years}")
        print(f" Trims: {result.total_trims}")
        print(f" Engines: {result.total_engines}")
        print(f" Trim-engine mappings: {result.total_trim_engine_mappings}")

        # Issues
        if result.failed_makes:
            print(f"\n⚠️ FAILED MAKES ({len(result.failed_makes)}):")
            for make in result.failed_makes:
                print(f" {make}")

        if result.warnings:
            print(f"\n⚠️ WARNINGS ({len(result.warnings)}):")
            for warning in result.warnings[:5]:  # Show first 5
                print(f" {warning}")
            if len(result.warnings) > 5:
                print(f" ... and {len(result.warnings) - 5} more warnings")

        # Database statistics
        print("\n📋 DATABASE STATISTICS:")
        db_stats = self.get_database_statistics()
        for table, count in db_stats.items():
            print(f" vehicles.{table}: {count:,} records")

        # Referential integrity
        integrity_issues = self.validate_referential_integrity()
        if integrity_issues:
            print("\n❌ REFERENTIAL INTEGRITY ISSUES:")
            for issue in integrity_issues:
                print(f" {issue}")
        else:
            print("\n✅ REFERENTIAL INTEGRITY: PASSED")


# Example usage and testing functions
def example_usage():
    """Demonstrate JsonManualLoader usage"""
    print("🚀 JsonManualLoader Example Usage")
    print("=" * 40)

    # This would typically be called after JsonExtractor
    # For demo purposes, we'll just show the structure
    print("\n📋 Typical usage flow:")
    print("1. Extract data with JsonExtractor")
    print("2. Create JsonManualLoader")
    print("3. Load data in APPEND or CLEAR mode")
    print("4. Validate and report results")

    print("\n💡 Example code:")
    print("""
# Extract data
extractor = JsonExtractor(make_mapper, engine_parser)
extraction_result = extractor.extract_all_makes('sources/makes')

# Load data
loader = JsonManualLoader()
load_result = loader.load_all_makes(extraction_result.makes, LoadMode.APPEND)

# Report results
loader.print_load_report(load_result)
""")


if __name__ == "__main__":
    example_usage()