Initial Commit
This commit is contained in:
1
mvp-platform-services/vehicles/etl/loaders/__init__.py
Executable file
1
mvp-platform-services/vehicles/etl/loaders/__init__.py
Executable file
@@ -0,0 +1 @@
|
||||
# ETL Loaders
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
716
mvp-platform-services/vehicles/etl/loaders/json_manual_loader.py
Normal file
716
mvp-platform-services/vehicles/etl/loaders/json_manual_loader.py
Normal file
@@ -0,0 +1,716 @@
|
||||
"""
|
||||
JSON Manual Loader for Vehicles ETL
|
||||
|
||||
Loads extracted JSON data into PostgreSQL database with referential integrity.
|
||||
Supports clear/append modes with duplicate handling and comprehensive progress tracking.
|
||||
|
||||
Database Schema:
|
||||
- vehicles.make (id, name)
|
||||
- vehicles.model (id, make_id, name)
|
||||
- vehicles.model_year (id, model_id, year)
|
||||
- vehicles.trim (id, model_year_id, name)
|
||||
- vehicles.engine (id, name, code, displacement_l, cylinders, fuel_type, aspiration)
|
||||
- vehicles.trim_engine (trim_id, engine_id)
|
||||
|
||||
Load Modes:
|
||||
- CLEAR: Truncate all tables and reload (destructive)
|
||||
- APPEND: Insert with conflict resolution (safe)
|
||||
|
||||
Usage:
|
||||
loader = JsonManualLoader(postgres_loader)
|
||||
result = loader.load_all_makes(extraction_result.makes, LoadMode.APPEND)
|
||||
"""
|
||||
|
||||
import logging
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional, Tuple

from psycopg2.extras import execute_batch
|
||||
|
||||
# Import our components (handle both relative and direct imports)
|
||||
try:
|
||||
from .postgres_loader import PostgreSQLLoader
|
||||
from ..extractors.json_extractor import MakeData, ModelData, ExtractionResult
|
||||
from ..utils.engine_spec_parser import EngineSpec
|
||||
from ..connections import db_connections
|
||||
except ImportError:
|
||||
# Fallback for direct execution
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||
|
||||
# Import with fallback handling for nested imports
|
||||
try:
|
||||
from loaders.postgres_loader import PostgreSQLLoader
|
||||
except ImportError:
|
||||
# Mock PostgreSQLLoader for testing
|
||||
class PostgreSQLLoader:
|
||||
def __init__(self):
|
||||
self.batch_size = 1000
|
||||
|
||||
from extractors.json_extractor import MakeData, ModelData, ExtractionResult
|
||||
from utils.engine_spec_parser import EngineSpec
|
||||
|
||||
try:
|
||||
from connections import db_connections
|
||||
except ImportError:
|
||||
# Mock db_connections for testing
|
||||
class MockDBConnections:
|
||||
def postgres_connection(self):
|
||||
raise NotImplementedError("Database connection not available in test mode")
|
||||
db_connections = MockDBConnections()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LoadMode(Enum):
    """Supported strategies for writing extracted data into the database."""

    # Destructive: truncate every vehicles table, then reload from scratch.
    CLEAR = "clear"
    # Non-destructive: insert new rows, reusing existing ones on conflict.
    APPEND = "append"
|
||||
|
||||
|
||||
@dataclass
class LoadResult:
    """Aggregated outcome of a bulk load operation."""

    total_makes: int
    total_models: int
    total_model_years: int
    total_trims: int
    total_engines: int
    total_trim_engine_mappings: int
    failed_makes: List[str]
    warnings: List[str]
    load_mode: LoadMode

    @property
    def success_count(self) -> int:
        """Number of makes that completed loading without failure."""
        return self.total_makes - len(self.failed_makes)

    @property
    def success_rate(self) -> float:
        """Fraction of makes loaded successfully (0.0 when none attempted)."""
        if self.total_makes > 0:
            return self.success_count / self.total_makes
        return 0.0
|
||||
|
||||
|
||||
@dataclass
class LoadStatistics:
    """Mutable counters accumulated across a loading run.

    All fields default to zero / empty; methods mutate the same instance
    as it is threaded through the load_* calls.
    """
    makes_processed: int = 0
    makes_skipped: int = 0
    models_inserted: int = 0
    model_years_inserted: int = 0
    skipped_model_years: int = 0
    trims_inserted: int = 0
    engines_inserted: int = 0
    trim_engine_mappings_inserted: int = 0
    duplicate_makes: int = 0
    duplicate_models: int = 0
    duplicate_engines: int = 0
    # Use default_factory so each instance gets its own fresh list
    # (a plain mutable default would be shared between instances).
    errors: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)

    def __post_init__(self):
        # Backward compatibility: callers that explicitly pass None for the
        # list fields still get empty lists, as the old implementation did.
        if self.errors is None:
            self.errors = []
        if self.warnings is None:
            self.warnings = []
|
||||
|
||||
|
||||
class JsonManualLoader:
|
||||
"""Load JSON-extracted vehicle data into PostgreSQL"""
|
||||
|
||||
def _get_id_from_result(self, result, column_name='id'):
|
||||
"""Helper to extract ID from query result, handling both tuple and dict cursors"""
|
||||
if result is None:
|
||||
return None
|
||||
if isinstance(result, tuple):
|
||||
return result[0]
|
||||
# For RealDictCursor, try the column name first, fall back to key access
|
||||
if column_name in result:
|
||||
return result[column_name]
|
||||
# For COUNT(*) queries, the key might be 'count'
|
||||
if 'count' in result:
|
||||
return result['count']
|
||||
# Fall back to first value
|
||||
return list(result.values())[0] if result else None
|
||||
|
||||
def __init__(self, postgres_loader: Optional[PostgreSQLLoader] = None):
|
||||
"""
|
||||
Initialize JSON manual loader
|
||||
|
||||
Args:
|
||||
postgres_loader: Existing PostgreSQL loader instance
|
||||
"""
|
||||
self.postgres_loader = postgres_loader or PostgreSQLLoader()
|
||||
self.batch_size = 1000
|
||||
|
||||
logger.info("JsonManualLoader initialized")
|
||||
|
||||
def clear_all_tables(self) -> None:
|
||||
"""
|
||||
Clear all vehicles tables in dependency order
|
||||
|
||||
WARNING: This is destructive and will remove all data
|
||||
"""
|
||||
logger.warning("CLEARING ALL VEHICLES TABLES - This is destructive!")
|
||||
|
||||
tables_to_clear = [
|
||||
'trim_engine', # Many-to-many mappings first
|
||||
'trim_transmission',
|
||||
'performance', # Tables with foreign keys
|
||||
'trim',
|
||||
'model_year',
|
||||
'model',
|
||||
'make',
|
||||
'engine', # Independent tables last
|
||||
'transmission'
|
||||
]
|
||||
|
||||
with db_connections.postgres_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
|
||||
for table in tables_to_clear:
|
||||
try:
|
||||
cursor.execute(f"TRUNCATE TABLE vehicles.{table} CASCADE")
|
||||
logger.info(f"Cleared vehicles.{table}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to clear vehicles.{table}: {str(e)}")
|
||||
|
||||
conn.commit()
|
||||
|
||||
logger.info("All vehicles tables cleared")
|
||||
|
||||
    def load_make(self, make_data: MakeData, mode: LoadMode, stats: LoadStatistics) -> int:
        """
        Load a single make with all related data (models, years, trims, engines).

        The whole make is loaded inside one connection/transaction: either
        everything for this make commits, or the exception propagates and
        nothing is committed.

        Args:
            make_data: Extracted make data
            mode: Loading mode (clear/append)
            stats: Statistics accumulator (mutated in place)

        Returns:
            Make ID in database

        Raises:
            Exception: re-raised after logging and recording in stats.errors.
        """
        logger.debug(f"Loading make: {make_data.name}")

        try:
            with db_connections.postgres_connection() as conn:
                cursor = conn.cursor()

                # 1. Insert or get make (always check for existing to avoid constraint violations)
                # Check if make exists (case-insensitive to match database constraint)
                cursor.execute(
                    "SELECT id FROM vehicles.make WHERE lower(name) = lower(%s)",
                    (make_data.name,)
                )
                result = cursor.fetchone()

                if result:
                    make_id = self._get_id_from_result(result)
                    stats.duplicate_makes += 1
                    logger.debug(f"Make {make_data.name} already exists with ID {make_id}")
                else:
                    # Insert new make with error handling for constraint violations
                    try:
                        cursor.execute(
                            "INSERT INTO vehicles.make (name) VALUES (%s) RETURNING id",
                            (make_data.name,)
                        )
                        result = cursor.fetchone()
                        make_id = self._get_id_from_result(result)
                        logger.debug(f"Inserted make {make_data.name} with ID {make_id}")
                    except Exception as e:
                        # String-match on the driver's message because the
                        # specific psycopg2 exception class is not imported here.
                        if "duplicate key value violates unique constraint" in str(e):
                            # Retry the lookup in case of race condition
                            cursor.execute(
                                "SELECT id FROM vehicles.make WHERE lower(name) = lower(%s)",
                                (make_data.name,)
                            )
                            result = cursor.fetchone()
                            if result:
                                make_id = self._get_id_from_result(result)
                                stats.duplicate_makes += 1
                                logger.debug(f"Make {make_data.name} found after retry with ID {make_id}")
                            else:
                                # Constraint fired but the row is not visible:
                                # nothing safe to do except propagate.
                                raise
                        else:
                            raise

                # 2. Process models (each model loads its years/trims/engines)
                for model_data in make_data.models:
                    model_id = self.load_model(cursor, make_id, model_data, mode, stats)

                conn.commit()
                stats.makes_processed += 1

                return make_id

        except Exception as e:
            error_msg = f"Failed to load make {make_data.name}: {str(e)}"
            logger.error(error_msg)
            stats.errors.append(error_msg)
            raise
|
||||
|
||||
def load_model(self, cursor, make_id: int, model_data: ModelData, mode: LoadMode, stats: LoadStatistics) -> int:
|
||||
"""
|
||||
Load a single model with all related data
|
||||
|
||||
Args:
|
||||
cursor: Database cursor
|
||||
make_id: Parent make ID
|
||||
model_data: Extracted model data
|
||||
mode: Loading mode
|
||||
stats: Statistics accumulator
|
||||
|
||||
Returns:
|
||||
Model ID in database
|
||||
"""
|
||||
# 1. Insert or get model
|
||||
if mode == LoadMode.APPEND:
|
||||
cursor.execute(
|
||||
"SELECT id FROM vehicles.model WHERE make_id = %s AND name = %s",
|
||||
(make_id, model_data.name)
|
||||
)
|
||||
result = cursor.fetchone()
|
||||
|
||||
if result:
|
||||
model_id = result[0] if isinstance(result, tuple) else result['id']
|
||||
stats.duplicate_models += 1
|
||||
else:
|
||||
cursor.execute(
|
||||
"INSERT INTO vehicles.model (make_id, name) VALUES (%s, %s) RETURNING id",
|
||||
(make_id, model_data.name)
|
||||
)
|
||||
model_id = self._get_id_from_result(cursor.fetchone())
|
||||
stats.models_inserted += 1
|
||||
else:
|
||||
# CLEAR mode - just insert
|
||||
cursor.execute(
|
||||
"INSERT INTO vehicles.model (make_id, name) VALUES (%s, %s) RETURNING id",
|
||||
(make_id, model_data.name)
|
||||
)
|
||||
model_id = self._get_id_from_result(cursor.fetchone())
|
||||
stats.models_inserted += 1
|
||||
|
||||
# 2. Insert model years and related data
|
||||
for year in model_data.years:
|
||||
model_year_id = self.load_model_year(cursor, model_id, year, model_data, mode, stats)
|
||||
# Skip processing if year was outside valid range
|
||||
if model_year_id is None:
|
||||
continue
|
||||
|
||||
return model_id
|
||||
|
||||
def load_model_year(self, cursor, model_id: int, year: int, model_data: ModelData, mode: LoadMode, stats: LoadStatistics) -> int:
|
||||
"""
|
||||
Load model year and associated trims/engines
|
||||
|
||||
Args:
|
||||
cursor: Database cursor
|
||||
model_id: Parent model ID
|
||||
year: Model year
|
||||
model_data: Model data with trims and engines
|
||||
mode: Loading mode
|
||||
stats: Statistics accumulator
|
||||
|
||||
Returns:
|
||||
Model year ID in database
|
||||
"""
|
||||
# Skip years that don't meet database constraints (must be 1950-2100)
|
||||
if year < 1950 or year > 2100:
|
||||
logger.warning(f"Skipping year {year} - outside valid range (1950-2100)")
|
||||
stats.skipped_model_years += 1
|
||||
return None
|
||||
|
||||
# 1. Insert or get model year
|
||||
if mode == LoadMode.APPEND:
|
||||
cursor.execute(
|
||||
"SELECT id FROM vehicles.model_year WHERE model_id = %s AND year = %s",
|
||||
(model_id, year)
|
||||
)
|
||||
result = cursor.fetchone()
|
||||
|
||||
if result:
|
||||
model_year_id = result[0] if isinstance(result, tuple) else result['id']
|
||||
else:
|
||||
cursor.execute(
|
||||
"INSERT INTO vehicles.model_year (model_id, year) VALUES (%s, %s) RETURNING id",
|
||||
(model_id, year)
|
||||
)
|
||||
model_year_id = self._get_id_from_result(cursor.fetchone())
|
||||
stats.model_years_inserted += 1
|
||||
else:
|
||||
# CLEAR mode - just insert
|
||||
cursor.execute(
|
||||
"INSERT INTO vehicles.model_year (model_id, year) VALUES (%s, %s) RETURNING id",
|
||||
(model_id, year)
|
||||
)
|
||||
model_year_id = self._get_id_from_result(cursor.fetchone())
|
||||
stats.model_years_inserted += 1
|
||||
|
||||
# 2. Load engines and get their IDs
|
||||
engine_ids = []
|
||||
for engine_spec in model_data.engines:
|
||||
engine_id = self.load_engine(cursor, engine_spec, mode, stats)
|
||||
engine_ids.append(engine_id)
|
||||
|
||||
# 3. Load trims and connect to engines
|
||||
for trim_name in model_data.trims:
|
||||
trim_id = self.load_trim(cursor, model_year_id, trim_name, engine_ids, mode, stats)
|
||||
|
||||
return model_year_id
|
||||
|
||||
    def load_engine(self, cursor, engine_spec: EngineSpec, mode: LoadMode, stats: LoadStatistics) -> int:
        """
        Load an engine specification, deduplicating by name/code.

        Engines are shared across trims, so an existing engine is always
        reused regardless of the load *mode*.

        Args:
            cursor: Database cursor (caller owns the transaction)
            engine_spec: Parsed engine specification
            mode: Loading mode (currently unused here — dedup always applies)
            stats: Statistics accumulator (mutated in place)

        Returns:
            Engine ID in database
        """
        # Create a canonical engine name (e.g. "2.0L I4") for database storage;
        # fall back to the raw extracted string when the spec is incomplete.
        if engine_spec.displacement_l and engine_spec.configuration != "Unknown" and engine_spec.cylinders:
            engine_name = f"{engine_spec.displacement_l}L {engine_spec.configuration}{engine_spec.cylinders}"
        else:
            engine_name = engine_spec.raw_string

        # Generate engine code from name (remove spaces, lowercase)
        engine_code = engine_name.replace(" ", "").lower()

        # Always check for existing engine by name or code to avoid constraint violations
        cursor.execute("""
            SELECT id FROM vehicles.engine
            WHERE lower(name) = lower(%s) OR (code IS NOT NULL AND code = %s)
        """, (engine_name, engine_code))
        result = cursor.fetchone()

        if result:
            engine_id = self._get_id_from_result(result)
            stats.duplicate_engines += 1
            return engine_id

        # Insert new engine; "Unknown"/"Natural" sentinel values are stored as NULL.
        try:
            cursor.execute("""
                INSERT INTO vehicles.engine (name, code, displacement_l, cylinders, fuel_type, aspiration)
                VALUES (%s, %s, %s, %s, %s, %s)
                RETURNING id
            """, (
                engine_name,
                engine_code,
                engine_spec.displacement_l,
                engine_spec.cylinders,
                engine_spec.fuel_type if engine_spec.fuel_type != "Unknown" else None,
                engine_spec.aspiration if engine_spec.aspiration != "Natural" else None
            ))

            engine_id = self._get_id_from_result(cursor.fetchone())
            stats.engines_inserted += 1

            return engine_id
        except Exception as e:
            # String-match on the driver's message because the specific
            # psycopg2 exception class is not imported here.
            if "duplicate key value violates unique constraint" in str(e):
                # Retry the lookup in case of race condition
                cursor.execute("""
                    SELECT id FROM vehicles.engine
                    WHERE lower(name) = lower(%s) OR (code IS NOT NULL AND code = %s)
                """, (engine_name, engine_code))
                result = cursor.fetchone()
                if result:
                    engine_id = self._get_id_from_result(result)
                    stats.duplicate_engines += 1
                    return engine_id
            raise
|
||||
|
||||
def load_trim(self, cursor, model_year_id: int, trim_name: str, engine_ids: List[int], mode: LoadMode, stats: LoadStatistics) -> int:
|
||||
"""
|
||||
Load trim and connect to engines
|
||||
|
||||
Args:
|
||||
cursor: Database cursor
|
||||
model_year_id: Parent model year ID
|
||||
trim_name: Trim name
|
||||
engine_ids: List of engine IDs to connect
|
||||
mode: Loading mode
|
||||
stats: Statistics accumulator
|
||||
|
||||
Returns:
|
||||
Trim ID in database
|
||||
"""
|
||||
# 1. Insert or get trim
|
||||
if mode == LoadMode.APPEND:
|
||||
cursor.execute(
|
||||
"SELECT id FROM vehicles.trim WHERE model_year_id = %s AND name = %s",
|
||||
(model_year_id, trim_name)
|
||||
)
|
||||
result = cursor.fetchone()
|
||||
|
||||
if result:
|
||||
trim_id = result[0] if isinstance(result, tuple) else result['id']
|
||||
else:
|
||||
cursor.execute(
|
||||
"INSERT INTO vehicles.trim (model_year_id, name) VALUES (%s, %s) RETURNING id",
|
||||
(model_year_id, trim_name)
|
||||
)
|
||||
trim_id = self._get_id_from_result(cursor.fetchone())
|
||||
stats.trims_inserted += 1
|
||||
else:
|
||||
# CLEAR mode - just insert
|
||||
cursor.execute(
|
||||
"INSERT INTO vehicles.trim (model_year_id, name) VALUES (%s, %s) RETURNING id",
|
||||
(model_year_id, trim_name)
|
||||
)
|
||||
trim_id = self._get_id_from_result(cursor.fetchone())
|
||||
stats.trims_inserted += 1
|
||||
|
||||
# 2. Connect trim to engines (always check for existing to avoid duplicates)
|
||||
# Deduplicate engine_ids to prevent duplicate mappings within the same trim
|
||||
unique_engine_ids = list(set(engine_ids))
|
||||
for engine_id in unique_engine_ids:
|
||||
# Check if mapping already exists
|
||||
cursor.execute(
|
||||
"SELECT 1 FROM vehicles.trim_engine WHERE trim_id = %s AND engine_id = %s",
|
||||
(trim_id, engine_id)
|
||||
)
|
||||
|
||||
if not cursor.fetchone():
|
||||
try:
|
||||
cursor.execute(
|
||||
"INSERT INTO vehicles.trim_engine (trim_id, engine_id) VALUES (%s, %s)",
|
||||
(trim_id, engine_id)
|
||||
)
|
||||
stats.trim_engine_mappings_inserted += 1
|
||||
except Exception as e:
|
||||
if "duplicate key value violates unique constraint" in str(e):
|
||||
# Another process may have inserted it, skip
|
||||
logger.debug(f"Trim-engine mapping ({trim_id}, {engine_id}) already exists, skipping")
|
||||
else:
|
||||
raise
|
||||
|
||||
return trim_id
|
||||
|
||||
    def load_all_makes(self, makes_data: List[MakeData], mode: LoadMode) -> LoadResult:
        """
        Load all makes with complete data.

        Each make is loaded in its own transaction (see load_make); a failure
        in one make is recorded and does not stop the remaining makes.

        Args:
            makes_data: List of extracted make data
            mode: Loading mode (clear/append)

        Returns:
            LoadResult with comprehensive statistics
        """
        logger.info(f"Starting bulk load of {len(makes_data)} makes in {mode.value} mode")

        # Clear tables if in CLEAR mode (destructive; see clear_all_tables)
        if mode == LoadMode.CLEAR:
            self.clear_all_tables()

        stats = LoadStatistics()
        failed_makes = []

        for make_data in makes_data:
            try:
                # Makes that already failed during extraction are skipped
                # rather than partially loaded.
                if make_data.processing_errors:
                    logger.warning(f"Skipping make {make_data.name} due to extraction errors")
                    stats.makes_skipped += 1
                    failed_makes.append(make_data.name)
                    continue

                make_id = self.load_make(make_data, mode, stats)
                logger.info(f"Successfully loaded make {make_data.name} (ID: {make_id})")

            except Exception as e:
                # Record the failure and keep going with the next make.
                logger.error(f"Failed to load make {make_data.name}: {str(e)}")
                failed_makes.append(make_data.name)
                continue

        # Create result summarizing the run
        result = LoadResult(
            total_makes=len(makes_data),
            total_models=stats.models_inserted,
            total_model_years=stats.model_years_inserted,
            total_trims=stats.trims_inserted,
            total_engines=stats.engines_inserted,
            total_trim_engine_mappings=stats.trim_engine_mappings_inserted,
            failed_makes=failed_makes,
            warnings=stats.warnings,
            load_mode=mode
        )

        logger.info(f"Bulk load complete: {result.success_count}/{result.total_makes} makes loaded successfully")
        logger.info(f"Data loaded: {result.total_models} models, {result.total_engines} engines, {result.total_trims} trims")

        return result
|
||||
|
||||
def get_database_statistics(self) -> Dict[str, int]:
|
||||
"""
|
||||
Get current database record counts
|
||||
|
||||
Returns:
|
||||
Dictionary with table counts
|
||||
"""
|
||||
stats = {}
|
||||
|
||||
tables = ['make', 'model', 'model_year', 'trim', 'engine', 'trim_engine']
|
||||
|
||||
with db_connections.postgres_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
|
||||
for table in tables:
|
||||
cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table}")
|
||||
result = cursor.fetchone()
|
||||
stats[table] = result[0] if isinstance(result, tuple) else result['count']
|
||||
|
||||
return stats
|
||||
|
||||
def validate_referential_integrity(self) -> List[str]:
|
||||
"""
|
||||
Validate referential integrity of loaded data
|
||||
|
||||
Returns:
|
||||
List of integrity issues found (empty if all good)
|
||||
"""
|
||||
issues = []
|
||||
|
||||
with db_connections.postgres_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Check for orphaned models
|
||||
cursor.execute("""
|
||||
SELECT COUNT(*) FROM vehicles.model m
|
||||
LEFT JOIN vehicles.make mk ON m.make_id = mk.id
|
||||
WHERE mk.id IS NULL
|
||||
""")
|
||||
orphaned_models = self._get_id_from_result(cursor.fetchone(), 'count')
|
||||
if orphaned_models > 0:
|
||||
issues.append(f"Found {orphaned_models} orphaned models")
|
||||
|
||||
# Check for orphaned model_years
|
||||
cursor.execute("""
|
||||
SELECT COUNT(*) FROM vehicles.model_year my
|
||||
LEFT JOIN vehicles.model m ON my.model_id = m.id
|
||||
WHERE m.id IS NULL
|
||||
""")
|
||||
orphaned_model_years = self._get_id_from_result(cursor.fetchone())
|
||||
if orphaned_model_years > 0:
|
||||
issues.append(f"Found {orphaned_model_years} orphaned model_years")
|
||||
|
||||
# Check for orphaned trims
|
||||
cursor.execute("""
|
||||
SELECT COUNT(*) FROM vehicles.trim t
|
||||
LEFT JOIN vehicles.model_year my ON t.model_year_id = my.id
|
||||
WHERE my.id IS NULL
|
||||
""")
|
||||
orphaned_trims = self._get_id_from_result(cursor.fetchone())
|
||||
if orphaned_trims > 0:
|
||||
issues.append(f"Found {orphaned_trims} orphaned trims")
|
||||
|
||||
# Check for broken trim_engine mappings
|
||||
cursor.execute("""
|
||||
SELECT COUNT(*) FROM vehicles.trim_engine te
|
||||
LEFT JOIN vehicles.trim t ON te.trim_id = t.id
|
||||
LEFT JOIN vehicles.engine e ON te.engine_id = e.id
|
||||
WHERE t.id IS NULL OR e.id IS NULL
|
||||
""")
|
||||
broken_mappings = self._get_id_from_result(cursor.fetchone())
|
||||
if broken_mappings > 0:
|
||||
issues.append(f"Found {broken_mappings} broken trim_engine mappings")
|
||||
|
||||
if issues:
|
||||
logger.warning(f"Referential integrity issues found: {issues}")
|
||||
else:
|
||||
logger.info("Referential integrity validation passed")
|
||||
|
||||
return issues
|
||||
|
||||
    def print_load_report(self, result: LoadResult) -> None:
        """
        Print comprehensive loading report to stdout.

        Also queries the database for current table counts and runs the
        referential integrity validation, so it requires a live connection.

        Args:
            result: LoadResult from load operation
        """
        print(f"🚀 JSON MANUAL LOADING REPORT")
        print(f"=" * 50)

        # Load summary
        print(f"\n📊 LOADING SUMMARY")
        print(f"   Mode: {result.load_mode.value.upper()}")
        print(f"   Makes processed: {result.success_count}/{result.total_makes}")
        print(f"   Success rate: {result.success_rate:.1%}")

        # Data counts
        print(f"\n📈 DATA LOADED")
        print(f"   Models: {result.total_models}")
        print(f"   Model years: {result.total_model_years}")
        print(f"   Trims: {result.total_trims}")
        print(f"   Engines: {result.total_engines}")
        print(f"   Trim-engine mappings: {result.total_trim_engine_mappings}")

        # Issues
        if result.failed_makes:
            print(f"\n⚠️ FAILED MAKES ({len(result.failed_makes)}):")
            for make in result.failed_makes:
                print(f"   {make}")

        if result.warnings:
            print(f"\n⚠️ WARNINGS ({len(result.warnings)}):")
            for warning in result.warnings[:5]:  # Show first 5
                print(f"   {warning}")
            if len(result.warnings) > 5:
                print(f"   ... and {len(result.warnings) - 5} more warnings")

        # Database statistics (live query against the database)
        print(f"\n📋 DATABASE STATISTICS:")
        db_stats = self.get_database_statistics()
        for table, count in db_stats.items():
            print(f"   vehicles.{table}: {count:,} records")

        # Referential integrity (live validation against the database)
        integrity_issues = self.validate_referential_integrity()
        if integrity_issues:
            print(f"\n❌ REFERENTIAL INTEGRITY ISSUES:")
            for issue in integrity_issues:
                print(f"   {issue}")
        else:
            print(f"\n✅ REFERENTIAL INTEGRITY: PASSED")
|
||||
|
||||
|
||||
# Example usage and testing functions
|
||||
def example_usage():
    """Demonstrate JsonManualLoader usage (prints a walkthrough; no DB access)."""
    print("🚀 JsonManualLoader Example Usage")
    print("=" * 40)

    # This would typically be called after JsonExtractor
    # For demo purposes, we'll just show the structure

    print("\n📋 Typical usage flow:")
    print("1. Extract data with JsonExtractor")
    print("2. Create JsonManualLoader")
    print("3. Load data in APPEND or CLEAR mode")
    print("4. Validate and report results")

    print(f"\n💡 Example code:")
    print("""
    # Extract data
    extractor = JsonExtractor(make_mapper, engine_parser)
    extraction_result = extractor.extract_all_makes('sources/makes')

    # Load data
    loader = JsonManualLoader()
    load_result = loader.load_all_makes(extraction_result.makes, LoadMode.APPEND)

    # Report results
    loader.print_load_report(load_result)
    """)
|
||||
|
||||
|
||||
# Script entry point: run the usage demonstration when executed directly.
if __name__ == "__main__":
    example_usage()
|
||||
437
mvp-platform-services/vehicles/etl/loaders/mssql_loader.py
Normal file
437
mvp-platform-services/vehicles/etl/loaders/mssql_loader.py
Normal file
@@ -0,0 +1,437 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
MSSQL Database Loader
|
||||
Handles loading .bak files into MSSQL Server for ETL processing
|
||||
"""
|
||||
import os
|
||||
import logging
|
||||
import pyodbc
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional, List
|
||||
from ..config import config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class MSSQLLoader:
|
||||
"""Loads database files into MSSQL Server"""
|
||||
|
||||
    def __init__(self):
        """Capture MSSQL connection settings from application config."""
        self.server = config.MSSQL_HOST
        self.port = config.MSSQL_PORT
        self.database = config.MSSQL_DATABASE
        self.username = config.MSSQL_USER
        self.password = config.MSSQL_PASSWORD
|
||||
|
||||
def get_connection_string(self, database: str = "master") -> str:
|
||||
"""Get MSSQL connection string"""
|
||||
return (
|
||||
f"DRIVER={{ODBC Driver 17 for SQL Server}};"
|
||||
f"SERVER={self.server},{self.port};"
|
||||
f"DATABASE={database};"
|
||||
f"UID={self.username};"
|
||||
f"PWD={self.password};"
|
||||
f"TrustServerCertificate=yes;"
|
||||
)
|
||||
|
||||
def test_connection(self) -> bool:
|
||||
"""Test MSSQL connection"""
|
||||
try:
|
||||
conn_str = self.get_connection_string()
|
||||
logger.info(f"Testing MSSQL connection to: {self.server}")
|
||||
|
||||
with pyodbc.connect(conn_str, timeout=30) as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT @@VERSION")
|
||||
version = cursor.fetchone()[0]
|
||||
logger.info(f"MSSQL connection successful: {version[:100]}...")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"MSSQL connection failed: {e}")
|
||||
return False
|
||||
|
||||
def database_exists(self, database_name: str) -> bool:
|
||||
"""Check if database exists"""
|
||||
try:
|
||||
conn_str = self.get_connection_string()
|
||||
with pyodbc.connect(conn_str, timeout=30) as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"SELECT COUNT(*) FROM sys.databases WHERE name = ?",
|
||||
(database_name,)
|
||||
)
|
||||
count = cursor.fetchone()[0]
|
||||
return count > 0
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to check if database exists: {e}")
|
||||
return False
|
||||
|
||||
def get_database_state(self, database_name: str) -> Optional[str]:
|
||||
"""Return the state_desc for a database or None if not found"""
|
||||
try:
|
||||
conn_str = self.get_connection_string()
|
||||
with pyodbc.connect(conn_str, timeout=30) as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"SELECT state_desc FROM sys.databases WHERE name = ?",
|
||||
(database_name,)
|
||||
)
|
||||
row = cursor.fetchone()
|
||||
return row[0] if row else None
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get database state: {e}")
|
||||
return None
|
||||
|
||||
    def drop_database(self, database_name: str) -> bool:
        """
        Drop database if it exists.

        Forces SINGLE_USER mode first so active sessions are rolled back
        and cannot block the DROP.

        Args:
            database_name: Name of the database to drop.

        Returns:
            True when the database was dropped (or did not exist); False on error.
        """
        try:
            if not self.database_exists(database_name):
                logger.info(f"Database {database_name} does not exist, skipping drop")
                return True

            logger.info(f"Dropping database: {database_name}")
            conn_str = self.get_connection_string()

            with pyodbc.connect(conn_str, timeout=30) as conn:
                # DDL like ALTER/DROP DATABASE must run outside a transaction.
                conn.autocommit = True
                cursor = conn.cursor()

                # Kill existing connections
                # NOTE(review): database_name is interpolated directly into the
                # SQL text (identifiers cannot be bound as ODBC parameters).
                # Callers must never pass untrusted names here — confirm all
                # call sites use config-derived names only.
                cursor.execute(f"""
                    ALTER DATABASE [{database_name}] SET SINGLE_USER WITH ROLLBACK IMMEDIATE;
                    DROP DATABASE [{database_name}];
                """)

                logger.info(f"Successfully dropped database: {database_name}")
                return True

        except Exception as e:
            logger.error(f"Failed to drop database {database_name}: {e}")
            return False
|
||||
|
||||
    def get_backup_file_info(self, bak_path: Path) -> Optional[dict]:
        """
        Get information about a .bak backup file via RESTORE HEADERONLY.

        Args:
            bak_path: Host path to the .bak file; only its filename is used,
                resolved against the container's /backups mount.

        Returns:
            Dict with database name, server name, backup dates and size,
            or None when the header cannot be read.
        """
        try:
            # Use the MSSQL container's mounted backup directory
            container_path = f"/backups/{bak_path.name}"

            # For now, assume the file is accessible
            # In production, this would copy the file into the MSSQL container

            conn_str = self.get_connection_string()
            with pyodbc.connect(conn_str, timeout=30) as conn:
                cursor = conn.cursor()

                # Get backup file information
                # NOTE(review): the path is interpolated into the SQL text —
                # bak_path.name comes from the filesystem, so keep backup
                # directories trusted.
                cursor.execute(f"RESTORE HEADERONLY FROM DISK = '{container_path}'")
                headers = cursor.fetchall()

                if headers:
                    # First header row describes the (first) backup set.
                    header = headers[0]
                    return {
                        "database_name": header.DatabaseName,
                        "server_name": header.ServerName,
                        "backup_start_date": header.BackupStartDate,
                        "backup_finish_date": header.BackupFinishDate,
                        "backup_size": header.BackupSize,
                    }

        except Exception as e:
            logger.warning(f"Could not get backup file info: {e}")

        # Falls through here when headers are empty or an error occurred.
        return None
|
||||
|
||||
def restore_database(self, bak_path: Path, target_database: str = None) -> bool:
    """
    Restore database from .bak file

    Copies the backup into the MSSQL container's shared volume, takes
    exclusive access (kill sessions + SINGLE_USER) when the target already
    exists, runs RESTORE DATABASE ... WITH MOVE/REPLACE, waits for the
    database to come ONLINE, verifies content, and re-enables MULTI_USER.

    Args:
        bak_path: Path to .bak file
        target_database: Target database name (defaults to VPICList)

    Returns:
        True if successful
    """
    if target_database is None:
        target_database = self.database

    if not bak_path.exists():
        logger.error(f"Backup file does not exist: {bak_path}")
        return False

    logger.info(f"Starting database restore: {bak_path} -> {target_database}")

    try:
        # Copy backup file to MSSQL container
        container_bak_path = self.copy_backup_to_container(bak_path)
        if not container_bak_path:
            logger.error("Failed to copy backup file to container")
            return False

        # If database exists, note the state; we will handle exclusivity in the same session below
        if self.database_exists(target_database):
            state = self.get_database_state(target_database)
            logger.info(f"Existing database detected: {target_database} (state={state})")
        else:
            logger.info(f"Target database does not exist yet: {target_database} — proceeding with restore")

        # Restore database using a single master connection for exclusivity
        logger.info(f"Restoring database from: {container_bak_path}")

        # BUGFIX: kill_sql used to be defined only inside the
        # "database exists" branch below, so the exclusive-access retry
        # path could raise NameError (masking the real RESTORE error)
        # when the database did not pre-exist. Define it unconditionally.
        kill_sql = f"""
            DECLARE @db sysname = N'{target_database}';
            DECLARE @kill nvarchar(max) = N'';
            SELECT @kill = @kill + N'KILL ' + CONVERT(nvarchar(10), session_id) + N';'
            FROM sys.dm_exec_sessions
            WHERE database_id = DB_ID(@db) AND session_id <> @@SPID;
            IF LEN(@kill) > 0 EXEC (@kill);
        """

        conn_str = self.get_connection_string()
        with pyodbc.connect(conn_str, timeout=600) as conn:  # 10 minute timeout
            conn.autocommit = True
            cursor = conn.cursor()

            # If DB exists, ensure exclusive access: kill sessions + SINGLE_USER in this session
            if self.database_exists(target_database):
                try:
                    logger.info(f"Preparing exclusive access for restore: killing active sessions on {target_database}")
                    cursor.execute(kill_sql)
                    # Force SINGLE_USER in current session
                    cursor.execute(f"ALTER DATABASE [{target_database}] SET SINGLE_USER WITH ROLLBACK IMMEDIATE;")
                    logger.info(f"Exclusive access prepared (SINGLE_USER) for {target_database}")
                except Exception as e:
                    # Best-effort: RESTORE below has its own retry on
                    # exclusive-access failure.
                    logger.warning(f"Could not fully prepare exclusive access: {e}")

            # Get logical file names from backup
            cursor.execute(f"RESTORE FILELISTONLY FROM DISK = '{container_bak_path}'")
            files = cursor.fetchall()

            if not files:
                logger.error("No files found in backup")
                return False

            # Build RESTORE command with MOVE options
            data_file = None
            log_file = None

            for file_info in files:
                logical_name = file_info.LogicalName
                file_type = file_info.Type

                if file_type == 'D':  # Data file
                    data_file = logical_name
                elif file_type == 'L':  # Log file
                    log_file = logical_name

            if not data_file:
                logger.error("No data file found in backup")
                return False

            # Construct restore command; MOVE relocates the logical files
            # into the Linux container's standard data directory.
            restore_sql = f"""
                RESTORE DATABASE [{target_database}]
                FROM DISK = '{container_bak_path}'
                WITH
                    MOVE '{data_file}' TO '/var/opt/mssql/data/{target_database}.mdf',
            """

            if log_file:
                restore_sql += f" MOVE '{log_file}' TO '/var/opt/mssql/data/{target_database}.ldf',"

            restore_sql += """
                REPLACE,
                RECOVERY,
                STATS = 10
            """

            logger.info(f"Executing restore command for database: {target_database}")
            logger.debug(f"Restore SQL: {restore_sql}")

            try:
                cursor.execute(restore_sql)
            except Exception as e:
                # If we hit exclusive access error, retry once after killing sessions again
                if 'Exclusive access could not be obtained' in str(e):
                    logger.warning("Exclusive access error on RESTORE; retrying after killing sessions and reasserting SINGLE_USER...")
                    try:
                        cursor.execute(kill_sql)
                        cursor.execute(f"ALTER DATABASE [{target_database}] SET SINGLE_USER WITH ROLLBACK IMMEDIATE;")
                    except Exception as e2:
                        logger.warning(f"Retry exclusive prep failed: {e2}")
                    cursor.execute(restore_sql)
                else:
                    raise

            # Poll for database to be ONLINE
            if not self._wait_for_database_online(target_database):
                logger.error(f"Database did not come ONLINE in time: {target_database}")
                return False

            # Small retry around database_exists to handle late readiness
            if self._retry_database_exists(target_database):
                logger.info(f"Database restore successful and ONLINE: {target_database}")

                # Get basic database info
                cursor.execute(f"""
                    SELECT
                        name,
                        create_date,
                        compatibility_level,
                        state_desc
                    FROM sys.databases
                    WHERE name = '{target_database}'
                """)

                db_info = cursor.fetchone()
                if db_info:
                    logger.info(f"Database info: Name={db_info.name}, Created={db_info.create_date}, Level={db_info.compatibility_level}, State={db_info.state_desc}")

                # Optional: quick content verification with small retry window
                if not self._retry_verify_content(target_database):
                    logger.warning("Database restored but content verification is inconclusive")

                # Try to set MULTI_USER back in same session
                try:
                    cursor.execute(f"ALTER DATABASE [{target_database}] SET MULTI_USER;")
                    logger.info(f"Set {target_database} back to MULTI_USER")
                except Exception as e:
                    logger.warning(f"Could not set MULTI_USER on {target_database}: {e}")

                return True
            else:
                logger.error(f"Database restore failed - database not found: {target_database}")
                return False

    except Exception as e:
        logger.error(f"Database restore failed: {e}")
        return False
|
||||
|
||||
def copy_backup_to_container(self, bak_path: Path) -> Optional[str]:
    """
    Copy backup file to shared volume accessible by MSSQL container

    Args:
        bak_path: Local path to .bak file

    Returns:
        Container path to .bak file or None if failed
    """
    try:
        # Use shared volume instead of docker cp
        shared_dir = Path("/app/shared")
        shared_bak_path = shared_dir / bak_path.name

        # Skip the copy when the file already lives in the shared directory.
        if bak_path.resolve().parent == shared_dir.resolve():
            logger.info(f"Backup already in shared volume: {bak_path}")
        else:
            logger.info(f"Copying {bak_path} to shared volume...")
            import shutil
            shutil.copy2(bak_path, shared_bak_path)

        # The shared volume is mounted at /backups inside the MSSQL container.
        container_path = f"/backups/{shared_bak_path.name}"
        logger.info(f"Successfully copied to shared volume: {container_path}")
        return container_path

    except Exception as e:
        logger.error(f"Failed to copy backup to shared volume: {e}")
        return None
|
||||
|
||||
def _wait_for_database_online(self, database_name: str, timeout_seconds: int = 600, interval_seconds: int = 5) -> bool:
    """Poll MSSQL until the specified database state becomes ONLINE or timeout.

    Returns True if ONLINE, False on timeout/error.
    """
    logger.info(f"Waiting for database to become ONLINE: {database_name}")
    deadline = time.time() + timeout_seconds
    last_state = None
    try:
        conn_str = self.get_connection_string()
        while time.time() < deadline:
            # A fresh connection per poll keeps the loop resilient to a
            # server that is still restarting mid-restore.
            with pyodbc.connect(conn_str, timeout=30) as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT state_desc FROM sys.databases WHERE name = ?", (database_name,))
                row = cursor.fetchone()
                if row is None:
                    logger.info("Database entry not found yet in sys.databases")
                else:
                    current_state = row[0]
                    # Only log state transitions, not every identical poll.
                    if current_state != last_state:
                        logger.info(f"Database state: {current_state}")
                        last_state = current_state
                    if current_state == 'ONLINE':
                        # Optional: verify updateability is READ_WRITE
                        try:
                            cursor.execute("SELECT DATABASEPROPERTYEX(?, 'Updateability')", (database_name,))
                            logger.info(f"Database updateability: {cursor.fetchone()[0]}")
                        except Exception:
                            pass
                        return True
            time.sleep(interval_seconds)
    except Exception as e:
        logger.error(f"Error while waiting for database ONLINE: {e}")
        return False
    logger.error("Timed out waiting for database to become ONLINE")
    return False
|
||||
|
||||
def _retry_database_exists(self, database_name: str, attempts: int = 6, delay_seconds: int = 5) -> bool:
    """Retry wrapper for database existence checks."""
    attempt = 0
    while attempt < attempts:
        attempt += 1
        if self.database_exists(database_name):
            return True
        logger.info(f"database_exists() false, retrying ({attempt}/{attempts})...")
        time.sleep(delay_seconds)
    return False
|
||||
|
||||
def _retry_verify_content(self, database_name: str, attempts: int = 3, delay_seconds: int = 5) -> bool:
    """Retry wrapper around verify_database_content to allow late readiness."""
    attempt = 0
    while attempt < attempts:
        attempt += 1
        try:
            table_counts = self.verify_database_content(database_name)
        except Exception as e:
            logger.info(f"Content verification attempt {attempt} failed: {e}")
        else:
            # Any non-empty counts dict is treated as a successful check.
            if table_counts:
                logger.info(f"Content verification counts: {table_counts}")
                return True
        time.sleep(delay_seconds)
    return False
|
||||
|
||||
def verify_database_content(self, database_name: str = None) -> dict:
    """
    Verify database has expected content

    Counts rows in a few key vPIC tables of the given database
    (defaults to self.database).

    Returns:
        Dictionary with table counts. Tables that cannot be counted are
        reported as 0; an empty dict means the connection itself failed.
    """
    if database_name is None:
        database_name = self.database

    try:
        conn_str = self.get_connection_string(database_name)
        with pyodbc.connect(conn_str, timeout=30) as conn:
            cursor = conn.cursor()

            # Get table counts for key tables
            tables_to_check = ['Make', 'Model', 'VehicleType', 'Manufacturer']
            counts = {}

            for table in tables_to_check:
                try:
                    cursor.execute(f"SELECT COUNT(*) FROM {table}")
                    count = cursor.fetchone()[0]
                    counts[table] = count
                    logger.info(f"Table {table}: {count:,} rows")
                except Exception as e:
                    # BUGFIX: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit. Narrow the handler and
                    # log why the table could not be counted.
                    logger.warning(f"Could not count table {table}: {e}")
                    counts[table] = 0

            return counts

    except Exception as e:
        logger.error(f"Failed to verify database content: {e}")
        return {}
|
||||
354
mvp-platform-services/vehicles/etl/loaders/postgres_loader.py
Executable file
354
mvp-platform-services/vehicles/etl/loaders/postgres_loader.py
Executable file
@@ -0,0 +1,354 @@
|
||||
import logging
|
||||
from typing import List, Dict, Optional
|
||||
from psycopg2.extras import execute_batch
|
||||
from ..connections import db_connections
|
||||
from tqdm import tqdm
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class PostgreSQLLoader:
    """Load data into PostgreSQL target database.

    All loaders write into the ``vehicles`` schema using batched inserts
    (``psycopg2.extras.execute_batch``) with ``ON CONFLICT DO NOTHING``,
    so re-running a load is idempotent. Table names passed to these
    methods are trusted, code-supplied identifiers — they are interpolated
    into SQL and must never come from user input.
    """

    def __init__(self):
        # Rows per execute_batch() page: trades memory for round-trips.
        self.batch_size = 1000

    @staticmethod
    def _scalar_count(result) -> int:
        """Extract a COUNT(*) value from fetchone(), for tuple- or dict-style cursors."""
        return result['count'] if isinstance(result, dict) and 'count' in result else result[0]

    def load_reference_table(self, table_name: str, data: List[Dict],
                             clear_existing: bool = True) -> int:
        """Load data into a reference table.

        Args:
            table_name: Target table in the ``vehicles`` schema.
            data: Source rows as dicts keyed by MS SQL column names.
            clear_existing: When True, TRUNCATE ... CASCADE before loading.

        Returns:
            Total row count of the target table after the load.
        """
        if not data:
            logger.warning(f"No data to load for table {table_name}")
            return 0

        logger.info(f"Loading {len(data)} records into vehicles.{table_name}")

        # Column mapping from source (MS SQL) to target (PostgreSQL).
        # BUGFIX: 'MakeId' appeared twice in this literal; the duplicate
        # (identical value) has been removed.
        column_mappings = {
            'Id': 'id',
            'Name': 'name',
            'Code': 'code',
            'MakeId': 'make_id',
            'CreateOn': 'created_at',
            'CreatedOn': 'created_at',
            'UpdateOn': 'updated_at',
            'UpdatedOn': 'updated_at',
            'Wmi': 'wmi',
            'ManufacturerId': 'manufacturer_id',
            'VehicleTypeId': 'vehicle_type_id',
            'TruckTypeId': 'truck_type_id',
            'CountryId': 'country_id',
            'PublicAvailabilityDate': 'public_availability_date',
            'NonCompliant': 'non_compliant',
            'NonCompliantReason': 'non_compliant_reason',
            'ProcessedOn': 'processed_on',
            'DisplayOrder': 'display_order',
            'FormType': 'form_type',
            'Description': 'description',
            'LookupTable': 'lookup_table',
            'IsPrivate': 'is_private',
            'GroupName': 'group_name',
            'DataType': 'data_type',
            'MinAllowedValue': 'min_allowed_value',
            'MaxAllowedValue': 'max_allowed_value',
            'IsQS': 'is_qs',
            'Decode': 'decode',
            'weight': 'weight',
            # ErrorCode specific mappings
            'ErrorCodeName': 'code',
            'ErrorCodeDescription': 'description'
        }

        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()

            if clear_existing:
                cursor.execute(f"TRUNCATE TABLE vehicles.{table_name} CASCADE")
                logger.info(f"Cleared existing data from vehicles.{table_name}")

            # Map source columns to target names: explicit mapping first,
            # falling back to a lowercase of the source name.
            source_columns = list(data[0].keys())
            target_columns = [column_mappings.get(col, col.lower())
                              for col in source_columns]

            # Keep only columns that actually exist in the target table.
            # (Parameterized — previously the table name was f-string
            # interpolated into this query as well.)
            cursor.execute(
                """
                SELECT column_name
                FROM information_schema.columns
                WHERE table_schema = 'vehicles' AND table_name = %s
                """,
                (table_name,)
            )
            results = cursor.fetchall()
            existing_columns = {row['column_name'] if isinstance(row, dict) else row[0]
                                for row in results}

            final_columns = []
            final_indices = []
            for i, col in enumerate(target_columns):
                if col in existing_columns:
                    final_columns.append(col)
                    final_indices.append(i)

            if not final_columns:
                logger.warning(f"No matching columns found for table {table_name}")
                return 0

            column_str = ','.join(final_columns)
            placeholders = ','.join(['%s'] * len(final_columns))

            # Prepare insert query
            query = f"""
                INSERT INTO vehicles.{table_name} ({column_str})
                VALUES ({placeholders})
                ON CONFLICT DO NOTHING
            """

            # Build value tuples restricted to the surviving columns.
            # NOTE: every record is assumed to share data[0]'s keys; a
            # missing key raises KeyError so data problems surface early.
            data_tuples = []
            for record in data:
                values = []
                skip_record = False

                for i in final_indices:
                    source_col = source_columns[i]
                    value = record[source_col]

                    # error_codes requires a non-empty code; skip rows without one.
                    if table_name == 'error_codes' and source_col in ['ErrorCodeName', 'Code'] and (value is None or value == ''):
                        skip_record = True
                        break

                    values.append(value)

                if not skip_record:
                    data_tuples.append(tuple(values))

            # Execute batch insert
            execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
            conn.commit()

            cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table_name}")
            final_count = self._scalar_count(cursor.fetchone())

            logger.info(f"Successfully loaded {final_count} records into vehicles.{table_name}")
            return final_count

    def load_wmi_vin_schema_mappings(self, mappings: List[Dict]) -> int:
        """Load WMI to VIN Schema mappings.

        Existing mappings are truncated first. Missing year bounds default
        to the open range 1980-2999.

        Returns:
            Row count of vehicles.wmi_vin_schemas after the load.
        """
        if not mappings:
            return 0

        logger.info(f"Loading {len(mappings)} WMI-VinSchema mappings")

        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()

            # Clear existing mappings
            cursor.execute("TRUNCATE TABLE vehicles.wmi_vin_schemas CASCADE")

            query = """
                INSERT INTO vehicles.wmi_vin_schemas
                (wmi_id, vin_schema_id, year_from, year_to)
                VALUES (%s, %s, %s, %s)
                ON CONFLICT DO NOTHING
            """

            # NULL (or 0) year bounds fall back to the open 1980-2999 range.
            data_tuples = [
                (
                    mapping['WmiId'],
                    mapping['VinSchemaId'],
                    mapping['YearFrom'] or 1980,
                    mapping['YearTo'] or 2999,
                )
                for mapping in mappings
            ]

            execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
            conn.commit()

            cursor.execute("SELECT COUNT(*) FROM vehicles.wmi_vin_schemas")
            final_count = self._scalar_count(cursor.fetchone())

            logger.info(f"Successfully loaded {final_count} WMI-VinSchema mappings")
            return final_count

    def load_make_model_relationships(self, relationships: List[Dict]) -> int:
        """Load Make-Model relationships (truncates existing rows first).

        Returns:
            Row count of vehicles.make_models after the load.
        """
        if not relationships:
            return 0

        logger.info(f"Loading {len(relationships)} Make-Model relationships")

        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()

            # Clear existing relationships
            cursor.execute("TRUNCATE TABLE vehicles.make_models CASCADE")

            query = """
                INSERT INTO vehicles.make_models (make_id, model_id)
                VALUES (%s, %s)
                ON CONFLICT DO NOTHING
            """

            data_tuples = [(rel['MakeId'], rel['ModelId']) for rel in relationships]

            execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
            conn.commit()

            cursor.execute("SELECT COUNT(*) FROM vehicles.make_models")
            final_count = self._scalar_count(cursor.fetchone())

            logger.info(f"Successfully loaded {final_count} Make-Model relationships")
            return final_count

    def load_wmi_make_relationships(self, relationships: List[Dict]) -> int:
        """Load WMI-Make relationships (truncates existing rows first).

        Returns:
            Row count of vehicles.wmi_makes after the load.
        """
        if not relationships:
            return 0

        logger.info(f"Loading {len(relationships)} WMI-Make relationships")

        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()

            # Clear existing relationships
            cursor.execute("TRUNCATE TABLE vehicles.wmi_makes CASCADE")

            query = """
                INSERT INTO vehicles.wmi_makes (wmi_id, make_id)
                VALUES (%s, %s)
                ON CONFLICT DO NOTHING
            """

            data_tuples = [(rel['WmiId'], rel['MakeId']) for rel in relationships]

            execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
            conn.commit()

            cursor.execute("SELECT COUNT(*) FROM vehicles.wmi_makes")
            final_count = self._scalar_count(cursor.fetchone())

            logger.info(f"Successfully loaded {final_count} WMI-Make relationships")
            return final_count

    def load_model_years(self, model_years: List[Dict]) -> int:
        """Load model year availability data (append-only, duplicates skipped).

        Returns:
            Number of input records processed (not rows inserted).
        """
        if not model_years:
            return 0

        logger.info(f"Loading {len(model_years)} model year records")

        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()

            query = """
                INSERT INTO vehicles.model_year (model_id, year)
                VALUES (%s, %s)
                ON CONFLICT (model_id, year) DO NOTHING
            """

            data_tuples = [(my['model_id'], my['year']) for my in model_years]
            execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
            conn.commit()

        return len(model_years)

    def load_trims(self, trims: List[Dict]) -> int:
        """Load trim data (append-only, duplicates skipped).

        Returns:
            Number of input records processed (not rows inserted).
        """
        if not trims:
            return 0

        logger.info(f"Loading {len(trims)} trim records")

        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()

            query = """
                INSERT INTO vehicles.trim (model_year_id, name)
                VALUES (%s, %s)
                ON CONFLICT DO NOTHING
            """

            data_tuples = [(t['model_year_id'], t['name']) for t in trims]
            execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
            conn.commit()

        return len(trims)

    def load_engines(self, engines: List[Dict]) -> int:
        """Load engine data; duplicate names (case-insensitive) are skipped.

        Returns:
            Number of input records processed (not rows inserted).
        """
        if not engines:
            return 0

        logger.info(f"Loading {len(engines)} engine records")

        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()

            # The unique index is on lower(name). The previous per-row loop
            # carried an unused RETURNING id clause; batch the inserts instead.
            query = """
                INSERT INTO vehicles.engine (name, code, displacement_l, cylinders, fuel_type, aspiration)
                VALUES (%s, %s, %s, %s, %s, %s)
                ON CONFLICT (lower(name)) DO NOTHING
            """

            data_tuples = [
                (
                    engine['name'],
                    engine.get('code'),
                    engine.get('displacement_l'),
                    engine.get('cylinders'),
                    engine.get('fuel_type'),
                    engine.get('aspiration'),
                )
                for engine in engines
            ]

            execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
            conn.commit()

        return len(engines)

    def load_trim_engine_relationships(self, relationships: List[Dict]) -> int:
        """Load trim-engine relationships (append-only, duplicates skipped).

        Returns:
            Number of input records processed (not rows inserted).
        """
        if not relationships:
            return 0

        logger.info(f"Loading {len(relationships)} trim-engine relationships")

        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()

            query = """
                INSERT INTO vehicles.trim_engine (trim_id, engine_id)
                VALUES (%s, %s)
                ON CONFLICT (trim_id, engine_id) DO NOTHING
            """

            data_tuples = [(rel['trim_id'], rel['engine_id']) for rel in relationships]
            execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
            conn.commit()

        return len(relationships)

    def get_table_count(self, table_name: str) -> int:
        """Return the row count of vehicles.<table_name>."""
        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table_name}")
            return self._scalar_count(cursor.fetchone())
|
||||
Reference in New Issue
Block a user