Initial Commit

This commit is contained in:
Eric Gullickson
2025-09-17 16:09:15 -05:00
parent 0cdb9803de
commit a052040e3a
373 changed files with 437090 additions and 6773 deletions

View File

@@ -0,0 +1,716 @@
"""
JSON Manual Loader for Vehicles ETL
Loads extracted JSON data into PostgreSQL database with referential integrity.
Supports clear/append modes with duplicate handling and comprehensive progress tracking.
Database Schema:
- vehicles.make (id, name)
- vehicles.model (id, make_id, name)
- vehicles.model_year (id, model_id, year)
- vehicles.trim (id, model_year_id, name)
- vehicles.engine (id, name, code, displacement_l, cylinders, fuel_type, aspiration)
- vehicles.trim_engine (trim_id, engine_id)
Load Modes:
- CLEAR: Truncate all tables and reload (destructive)
- APPEND: Insert with conflict resolution (safe)
Usage:
loader = JsonManualLoader(postgres_loader)
result = loader.load_all_makes(extraction_result.makes, LoadMode.APPEND)
"""
# Standard library
import logging
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Optional, Tuple

# Third-party
from psycopg2.extras import execute_batch
# Import our components (handle both relative and direct imports)
try:
    from .postgres_loader import PostgreSQLLoader
    from ..extractors.json_extractor import MakeData, ModelData, ExtractionResult
    from ..utils.engine_spec_parser import EngineSpec
    from ..connections import db_connections
except ImportError:
    # Fallback for direct execution (running this file as a script, where
    # package-relative imports are unavailable)
    import sys
    import os
    # Make the package root importable so the absolute imports below resolve
    sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
    # Import with fallback handling for nested imports
    try:
        from loaders.postgres_loader import PostgreSQLLoader
    except ImportError:
        # Mock PostgreSQLLoader for testing
        class PostgreSQLLoader:
            def __init__(self):
                self.batch_size = 1000
    from extractors.json_extractor import MakeData, ModelData, ExtractionResult
    from utils.engine_spec_parser import EngineSpec
    try:
        from connections import db_connections
    except ImportError:
        # Mock db_connections for testing: any attempt to open a connection
        # fails loudly rather than silently doing nothing
        class MockDBConnections:
            def postgres_connection(self):
                raise NotImplementedError("Database connection not available in test mode")
        db_connections = MockDBConnections()

# Module-level logger, per standard logging convention
logger = logging.getLogger(__name__)
class LoadMode(Enum):
    """How loaded rows interact with existing table contents."""

    # Truncate every vehicles table first, then reload from scratch (destructive)
    CLEAR = "clear"
    # Insert alongside existing rows, resolving duplicates as they appear (safe)
    APPEND = "append"
@dataclass
class LoadResult:
    """Summary of one bulk load run, suitable for reporting."""

    total_makes: int
    total_models: int
    total_model_years: int
    total_trims: int
    total_engines: int
    total_trim_engine_mappings: int
    failed_makes: List[str]
    warnings: List[str]
    load_mode: LoadMode

    @property
    def success_count(self) -> int:
        # Everything attempted minus the makes that failed.
        return self.total_makes - len(self.failed_makes)

    @property
    def success_rate(self) -> float:
        # Guard against division by zero when nothing was attempted.
        if self.total_makes == 0:
            return 0.0
        return self.success_count / self.total_makes
@dataclass
class LoadStatistics:
    """Detailed loading statistics accumulated during a bulk load.

    Counter fields are incremented in place by JsonManualLoader as rows are
    inserted or skipped; ``errors``/``warnings`` collect human-readable
    messages for the final report.
    """
    makes_processed: int = 0
    makes_skipped: int = 0
    models_inserted: int = 0
    model_years_inserted: int = 0
    skipped_model_years: int = 0
    trims_inserted: int = 0
    engines_inserted: int = 0
    trim_engine_mappings_inserted: int = 0
    duplicate_makes: int = 0
    duplicate_models: int = 0
    duplicate_engines: int = 0
    # default_factory gives each instance its own list — the previous
    # `List[str] = None` default was mis-annotated and relied entirely on
    # __post_init__ to repair it.
    errors: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)

    def __post_init__(self):
        # Preserve the old behavior for callers that explicitly pass None:
        # normalize it into an empty list.
        if self.errors is None:
            self.errors = []
        if self.warnings is None:
            self.warnings = []
class JsonManualLoader:
    """Load JSON-extracted vehicle data into PostgreSQL.

    Inserts the make -> model -> model_year -> trim hierarchy plus engines
    and trim/engine mappings, resolving duplicates according to LoadMode.
    Each make is committed as a single transaction (see load_make).
    """

    def _get_id_from_result(self, result, column_name: str = 'id'):
        """Helper to extract ID from query result, handling both tuple and dict cursors.

        Args:
            result: Row from cursor.fetchone() — a tuple, a dict-like row
                (e.g. RealDictCursor), or None.
            column_name: Preferred key when the row is dict-like.

        Returns:
            The extracted value, or None when result is None.
        """
        if result is None:
            return None
        if isinstance(result, tuple):
            return result[0]
        # For RealDictCursor, try the column name first, fall back to key access
        if column_name in result:
            return result[column_name]
        # For COUNT(*) queries, the key might be 'count'
        if 'count' in result:
            return result['count']
        # Fall back to first value
        return list(result.values())[0] if result else None

    def __init__(self, postgres_loader: Optional[PostgreSQLLoader] = None):
        """
        Initialize JSON manual loader

        Args:
            postgres_loader: Existing PostgreSQL loader instance (a new one
                is created when omitted)
        """
        self.postgres_loader = postgres_loader or PostgreSQLLoader()
        # NOTE(review): batch_size is not referenced anywhere else in this
        # module — presumably intended for execute_batch; confirm before use.
        self.batch_size = 1000
        logger.info("JsonManualLoader initialized")

    def clear_all_tables(self) -> None:
        """
        Clear all vehicles tables in dependency order

        WARNING: This is destructive and will remove all data
        """
        logger.warning("CLEARING ALL VEHICLES TABLES - This is destructive!")
        # Children are truncated before parents; CASCADE covers any remaining
        # foreign-key dependents.
        tables_to_clear = [
            'trim_engine',  # Many-to-many mappings first
            'trim_transmission',
            'performance',  # Tables with foreign keys
            'trim',
            'model_year',
            'model',
            'make',
            'engine',  # Independent tables last
            'transmission'
        ]
        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()
            for table in tables_to_clear:
                try:
                    cursor.execute(f"TRUNCATE TABLE vehicles.{table} CASCADE")
                    logger.info(f"Cleared vehicles.{table}")
                except Exception as e:
                    # Best-effort: a missing or locked table must not abort
                    # the rest of the clear.
                    logger.warning(f"Failed to clear vehicles.{table}: {str(e)}")
            conn.commit()
        logger.info("All vehicles tables cleared")

    def load_make(self, make_data: MakeData, mode: LoadMode, stats: LoadStatistics) -> int:
        """
        Load a single make with all related data

        Args:
            make_data: Extracted make data
            mode: Loading mode (clear/append)
            stats: Statistics accumulator

        Returns:
            Make ID in database

        Raises:
            Exception: Re-raised after logging and appending to stats.errors
                when any lookup/insert for this make fails.
        """
        logger.debug(f"Loading make: {make_data.name}")
        try:
            with db_connections.postgres_connection() as conn:
                cursor = conn.cursor()
                # 1. Insert or get make (always check for existing to avoid constraint violations)
                # Check if make exists (case-insensitive to match database constraint)
                cursor.execute(
                    "SELECT id FROM vehicles.make WHERE lower(name) = lower(%s)",
                    (make_data.name,)
                )
                result = cursor.fetchone()
                if result:
                    make_id = self._get_id_from_result(result)
                    stats.duplicate_makes += 1
                    logger.debug(f"Make {make_data.name} already exists with ID {make_id}")
                else:
                    # Insert new make with error handling for constraint violations
                    try:
                        cursor.execute(
                            "INSERT INTO vehicles.make (name) VALUES (%s) RETURNING id",
                            (make_data.name,)
                        )
                        result = cursor.fetchone()
                        make_id = self._get_id_from_result(result)
                        logger.debug(f"Inserted make {make_data.name} with ID {make_id}")
                    except Exception as e:
                        # String-match on the constraint message because the
                        # psycopg2 error classes are not imported here.
                        if "duplicate key value violates unique constraint" in str(e):
                            # Retry the lookup in case of race condition
                            cursor.execute(
                                "SELECT id FROM vehicles.make WHERE lower(name) = lower(%s)",
                                (make_data.name,)
                            )
                            result = cursor.fetchone()
                            if result:
                                make_id = self._get_id_from_result(result)
                                stats.duplicate_makes += 1
                                logger.debug(f"Make {make_data.name} found after retry with ID {make_id}")
                            else:
                                raise
                        else:
                            raise
                # 2. Process models — the whole make commits as one transaction
                for model_data in make_data.models:
                    model_id = self.load_model(cursor, make_id, model_data, mode, stats)
                conn.commit()
                stats.makes_processed += 1
                return make_id
        except Exception as e:
            error_msg = f"Failed to load make {make_data.name}: {str(e)}"
            logger.error(error_msg)
            stats.errors.append(error_msg)
            raise

    def load_model(self, cursor, make_id: int, model_data: ModelData, mode: LoadMode, stats: LoadStatistics) -> int:
        """
        Load a single model with all related data

        Args:
            cursor: Database cursor
            make_id: Parent make ID
            model_data: Extracted model data
            mode: Loading mode
            stats: Statistics accumulator

        Returns:
            Model ID in database
        """
        # 1. Insert or get model
        if mode == LoadMode.APPEND:
            cursor.execute(
                "SELECT id FROM vehicles.model WHERE make_id = %s AND name = %s",
                (make_id, model_data.name)
            )
            result = cursor.fetchone()
            if result:
                # NOTE(review): inline tuple/dict handling here duplicates
                # _get_id_from_result — consider consolidating.
                model_id = result[0] if isinstance(result, tuple) else result['id']
                stats.duplicate_models += 1
            else:
                cursor.execute(
                    "INSERT INTO vehicles.model (make_id, name) VALUES (%s, %s) RETURNING id",
                    (make_id, model_data.name)
                )
                model_id = self._get_id_from_result(cursor.fetchone())
                stats.models_inserted += 1
        else:
            # CLEAR mode - just insert
            cursor.execute(
                "INSERT INTO vehicles.model (make_id, name) VALUES (%s, %s) RETURNING id",
                (make_id, model_data.name)
            )
            model_id = self._get_id_from_result(cursor.fetchone())
            stats.models_inserted += 1
        # 2. Insert model years and related data
        for year in model_data.years:
            model_year_id = self.load_model_year(cursor, model_id, year, model_data, mode, stats)
            # Skip processing if year was outside valid range
            # NOTE(review): this continue is a no-op (last statement of the
            # loop body); kept only to make the skip intent explicit.
            if model_year_id is None:
                continue
        return model_id

    def load_model_year(self, cursor, model_id: int, year: int, model_data: ModelData, mode: LoadMode, stats: LoadStatistics) -> Optional[int]:
        """
        Load model year and associated trims/engines

        Args:
            cursor: Database cursor
            model_id: Parent model ID
            year: Model year
            model_data: Model data with trims and engines
            mode: Loading mode
            stats: Statistics accumulator

        Returns:
            Model year ID in database, or None when the year is outside the
            valid 1950-2100 range and was skipped
        """
        # Skip years that don't meet database constraints (must be 1950-2100)
        if year < 1950 or year > 2100:
            logger.warning(f"Skipping year {year} - outside valid range (1950-2100)")
            stats.skipped_model_years += 1
            return None
        # 1. Insert or get model year
        if mode == LoadMode.APPEND:
            cursor.execute(
                "SELECT id FROM vehicles.model_year WHERE model_id = %s AND year = %s",
                (model_id, year)
            )
            result = cursor.fetchone()
            if result:
                model_year_id = result[0] if isinstance(result, tuple) else result['id']
            else:
                cursor.execute(
                    "INSERT INTO vehicles.model_year (model_id, year) VALUES (%s, %s) RETURNING id",
                    (model_id, year)
                )
                model_year_id = self._get_id_from_result(cursor.fetchone())
                stats.model_years_inserted += 1
        else:
            # CLEAR mode - just insert
            cursor.execute(
                "INSERT INTO vehicles.model_year (model_id, year) VALUES (%s, %s) RETURNING id",
                (model_id, year)
            )
            model_year_id = self._get_id_from_result(cursor.fetchone())
            stats.model_years_inserted += 1
        # 2. Load engines and get their IDs (engines are shared per model, so
        # the same engine list is attached to every year of this model)
        engine_ids = []
        for engine_spec in model_data.engines:
            engine_id = self.load_engine(cursor, engine_spec, mode, stats)
            engine_ids.append(engine_id)
        # 3. Load trims and connect to engines
        for trim_name in model_data.trims:
            trim_id = self.load_trim(cursor, model_year_id, trim_name, engine_ids, mode, stats)
        return model_year_id

    def load_engine(self, cursor, engine_spec: EngineSpec, mode: LoadMode, stats: LoadStatistics) -> int:
        """
        Load engine specification

        Args:
            cursor: Database cursor
            engine_spec: Parsed engine specification
            mode: Loading mode
            stats: Statistics accumulator

        Returns:
            Engine ID in database
        """
        # Create a canonical engine name for database storage, e.g. "2.0L I4";
        # fall back to the raw extracted string when fields are incomplete
        if engine_spec.displacement_l and engine_spec.configuration != "Unknown" and engine_spec.cylinders:
            engine_name = f"{engine_spec.displacement_l}L {engine_spec.configuration}{engine_spec.cylinders}"
        else:
            engine_name = engine_spec.raw_string
        # Generate engine code from name (remove spaces, lowercase)
        engine_code = engine_name.replace(" ", "").lower()
        # Always check for existing engine by name or code to avoid constraint violations
        cursor.execute("""
            SELECT id FROM vehicles.engine
            WHERE lower(name) = lower(%s) OR (code IS NOT NULL AND code = %s)
        """, (engine_name, engine_code))
        result = cursor.fetchone()
        if result:
            engine_id = self._get_id_from_result(result)
            stats.duplicate_engines += 1
            return engine_id
        # Insert new engine; "Unknown"/"Natural" sentinels are stored as NULL
        try:
            cursor.execute("""
                INSERT INTO vehicles.engine (name, code, displacement_l, cylinders, fuel_type, aspiration)
                VALUES (%s, %s, %s, %s, %s, %s)
                RETURNING id
            """, (
                engine_name,
                engine_code,
                engine_spec.displacement_l,
                engine_spec.cylinders,
                engine_spec.fuel_type if engine_spec.fuel_type != "Unknown" else None,
                engine_spec.aspiration if engine_spec.aspiration != "Natural" else None
            ))
            engine_id = self._get_id_from_result(cursor.fetchone())
            stats.engines_inserted += 1
            return engine_id
        except Exception as e:
            if "duplicate key value violates unique constraint" in str(e):
                # Retry the lookup in case of race condition
                cursor.execute("""
                    SELECT id FROM vehicles.engine
                    WHERE lower(name) = lower(%s) OR (code IS NOT NULL AND code = %s)
                """, (engine_name, engine_code))
                result = cursor.fetchone()
                if result:
                    engine_id = self._get_id_from_result(result)
                    stats.duplicate_engines += 1
                    return engine_id
            # Non-duplicate errors (and failed retries) propagate to the caller
            raise

    def load_trim(self, cursor, model_year_id: int, trim_name: str, engine_ids: List[int], mode: LoadMode, stats: LoadStatistics) -> int:
        """
        Load trim and connect to engines

        Args:
            cursor: Database cursor
            model_year_id: Parent model year ID
            trim_name: Trim name
            engine_ids: List of engine IDs to connect
            mode: Loading mode
            stats: Statistics accumulator

        Returns:
            Trim ID in database
        """
        # 1. Insert or get trim
        if mode == LoadMode.APPEND:
            cursor.execute(
                "SELECT id FROM vehicles.trim WHERE model_year_id = %s AND name = %s",
                (model_year_id, trim_name)
            )
            result = cursor.fetchone()
            if result:
                trim_id = result[0] if isinstance(result, tuple) else result['id']
            else:
                cursor.execute(
                    "INSERT INTO vehicles.trim (model_year_id, name) VALUES (%s, %s) RETURNING id",
                    (model_year_id, trim_name)
                )
                trim_id = self._get_id_from_result(cursor.fetchone())
                stats.trims_inserted += 1
        else:
            # CLEAR mode - just insert
            cursor.execute(
                "INSERT INTO vehicles.trim (model_year_id, name) VALUES (%s, %s) RETURNING id",
                (model_year_id, trim_name)
            )
            trim_id = self._get_id_from_result(cursor.fetchone())
            stats.trims_inserted += 1
        # 2. Connect trim to engines (always check for existing to avoid duplicates)
        # Deduplicate engine_ids to prevent duplicate mappings within the same trim
        unique_engine_ids = list(set(engine_ids))
        for engine_id in unique_engine_ids:
            # Check if mapping already exists
            cursor.execute(
                "SELECT 1 FROM vehicles.trim_engine WHERE trim_id = %s AND engine_id = %s",
                (trim_id, engine_id)
            )
            if not cursor.fetchone():
                try:
                    cursor.execute(
                        "INSERT INTO vehicles.trim_engine (trim_id, engine_id) VALUES (%s, %s)",
                        (trim_id, engine_id)
                    )
                    stats.trim_engine_mappings_inserted += 1
                except Exception as e:
                    if "duplicate key value violates unique constraint" in str(e):
                        # Another process may have inserted it, skip
                        logger.debug(f"Trim-engine mapping ({trim_id}, {engine_id}) already exists, skipping")
                    else:
                        raise
        return trim_id

    def load_all_makes(self, makes_data: List[MakeData], mode: LoadMode) -> LoadResult:
        """
        Load all makes with complete data

        Args:
            makes_data: List of extracted make data
            mode: Loading mode (clear/append)

        Returns:
            LoadResult with comprehensive statistics
        """
        logger.info(f"Starting bulk load of {len(makes_data)} makes in {mode.value} mode")
        # Clear tables if in CLEAR mode
        if mode == LoadMode.CLEAR:
            self.clear_all_tables()
        stats = LoadStatistics()
        failed_makes = []
        for make_data in makes_data:
            try:
                # Makes that failed extraction are skipped rather than loaded
                # with partial data
                if make_data.processing_errors:
                    logger.warning(f"Skipping make {make_data.name} due to extraction errors")
                    stats.makes_skipped += 1
                    failed_makes.append(make_data.name)
                    continue
                make_id = self.load_make(make_data, mode, stats)
                logger.info(f"Successfully loaded make {make_data.name} (ID: {make_id})")
            except Exception as e:
                # One failed make must not abort the whole bulk load
                logger.error(f"Failed to load make {make_data.name}: {str(e)}")
                failed_makes.append(make_data.name)
                continue
        # Create result
        result = LoadResult(
            total_makes=len(makes_data),
            total_models=stats.models_inserted,
            total_model_years=stats.model_years_inserted,
            total_trims=stats.trims_inserted,
            total_engines=stats.engines_inserted,
            total_trim_engine_mappings=stats.trim_engine_mappings_inserted,
            failed_makes=failed_makes,
            warnings=stats.warnings,
            load_mode=mode
        )
        logger.info(f"Bulk load complete: {result.success_count}/{result.total_makes} makes loaded successfully")
        logger.info(f"Data loaded: {result.total_models} models, {result.total_engines} engines, {result.total_trims} trims")
        return result

    def get_database_statistics(self) -> Dict[str, int]:
        """
        Get current database record counts

        Returns:
            Dictionary with table counts keyed by table name
        """
        stats = {}
        tables = ['make', 'model', 'model_year', 'trim', 'engine', 'trim_engine']
        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()
            for table in tables:
                cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table}")
                result = cursor.fetchone()
                # Dict-style cursors expose COUNT(*) under the 'count' key
                stats[table] = result[0] if isinstance(result, tuple) else result['count']
        return stats

    def validate_referential_integrity(self) -> List[str]:
        """
        Validate referential integrity of loaded data

        Returns:
            List of integrity issues found (empty if all good)
        """
        issues = []
        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()
            # Check for orphaned models
            cursor.execute("""
                SELECT COUNT(*) FROM vehicles.model m
                LEFT JOIN vehicles.make mk ON m.make_id = mk.id
                WHERE mk.id IS NULL
            """)
            orphaned_models = self._get_id_from_result(cursor.fetchone(), 'count')
            if orphaned_models > 0:
                issues.append(f"Found {orphaned_models} orphaned models")
            # Check for orphaned model_years (the default 'id' key misses, so
            # the helper falls back to the 'count' key / first value)
            cursor.execute("""
                SELECT COUNT(*) FROM vehicles.model_year my
                LEFT JOIN vehicles.model m ON my.model_id = m.id
                WHERE m.id IS NULL
            """)
            orphaned_model_years = self._get_id_from_result(cursor.fetchone())
            if orphaned_model_years > 0:
                issues.append(f"Found {orphaned_model_years} orphaned model_years")
            # Check for orphaned trims
            cursor.execute("""
                SELECT COUNT(*) FROM vehicles.trim t
                LEFT JOIN vehicles.model_year my ON t.model_year_id = my.id
                WHERE my.id IS NULL
            """)
            orphaned_trims = self._get_id_from_result(cursor.fetchone())
            if orphaned_trims > 0:
                issues.append(f"Found {orphaned_trims} orphaned trims")
            # Check for broken trim_engine mappings
            cursor.execute("""
                SELECT COUNT(*) FROM vehicles.trim_engine te
                LEFT JOIN vehicles.trim t ON te.trim_id = t.id
                LEFT JOIN vehicles.engine e ON te.engine_id = e.id
                WHERE t.id IS NULL OR e.id IS NULL
            """)
            broken_mappings = self._get_id_from_result(cursor.fetchone())
            if broken_mappings > 0:
                issues.append(f"Found {broken_mappings} broken trim_engine mappings")
        if issues:
            logger.warning(f"Referential integrity issues found: {issues}")
        else:
            logger.info("Referential integrity validation passed")
        return issues

    def print_load_report(self, result: LoadResult) -> None:
        """
        Print comprehensive loading report

        Args:
            result: LoadResult from load operation
        """
        print(f"🚀 JSON MANUAL LOADING REPORT")
        print(f"=" * 50)
        # Load summary
        print(f"\n📊 LOADING SUMMARY")
        print(f"   Mode: {result.load_mode.value.upper()}")
        print(f"   Makes processed: {result.success_count}/{result.total_makes}")
        print(f"   Success rate: {result.success_rate:.1%}")
        # Data counts
        print(f"\n📈 DATA LOADED")
        print(f"   Models: {result.total_models}")
        print(f"   Model years: {result.total_model_years}")
        print(f"   Trims: {result.total_trims}")
        print(f"   Engines: {result.total_engines}")
        print(f"   Trim-engine mappings: {result.total_trim_engine_mappings}")
        # Issues
        if result.failed_makes:
            print(f"\n⚠️ FAILED MAKES ({len(result.failed_makes)}):")
            for make in result.failed_makes:
                print(f"   {make}")
        if result.warnings:
            print(f"\n⚠️ WARNINGS ({len(result.warnings)}):")
            for warning in result.warnings[:5]:  # Show first 5
                print(f"   {warning}")
            if len(result.warnings) > 5:
                print(f"   ... and {len(result.warnings) - 5} more warnings")
        # Database statistics
        print(f"\n📋 DATABASE STATISTICS:")
        db_stats = self.get_database_statistics()
        for table, count in db_stats.items():
            print(f"   vehicles.{table}: {count:,} records")
        # Referential integrity
        integrity_issues = self.validate_referential_integrity()
        if integrity_issues:
            print(f"\n❌ REFERENTIAL INTEGRITY ISSUES:")
            for issue in integrity_issues:
                print(f"   {issue}")
        else:
            print(f"\n✅ REFERENTIAL INTEGRITY: PASSED")
# Example usage and testing functions
def example_usage():
    """Walk through the intended JsonManualLoader workflow on stdout.

    Performs no database work — only documents the call sequence.
    """
    intro_lines = (
        "🚀 JsonManualLoader Example Usage",
        "=" * 40,
        "\n📋 Typical usage flow:",
        "1. Extract data with JsonExtractor",
        "2. Create JsonManualLoader",
        "3. Load data in APPEND or CLEAR mode",
        "4. Validate and report results",
        "\n💡 Example code:",
    )
    for line in intro_lines:
        print(line)
    print("""
# Extract data
extractor = JsonExtractor(make_mapper, engine_parser)
extraction_result = extractor.extract_all_makes('sources/makes')
# Load data
loader = JsonManualLoader()
load_result = loader.load_all_makes(extraction_result.makes, LoadMode.APPEND)
# Report results
loader.print_load_report(load_result)
""")


if __name__ == "__main__":
    example_usage()