Initial Commit

Eric Gullickson
2025-09-17 16:09:15 -05:00
parent 0cdb9803de
commit a052040e3a
373 changed files with 437090 additions and 6773 deletions


@@ -0,0 +1 @@
# ETL Loaders
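
Loaders for the vehicles ETL pipeline: a JSON manual loader and a generic PostgreSQL loader, plus an MSSQL restore helper for `.bak` sources. A minimal usage sketch follows, assuming the package layout in this commit (the `JsonExtractor` import path and its `make_mapper`/`engine_parser` dependencies mirror the example in `json_manual_loader.py`):

```python
from extractors.json_extractor import JsonExtractor
from loaders.json_manual_loader import JsonManualLoader, LoadMode

# Extract per-make JSON data, then load it with safe conflict handling
extractor = JsonExtractor(make_mapper, engine_parser)
extraction_result = extractor.extract_all_makes('sources/makes')

loader = JsonManualLoader()
load_result = loader.load_all_makes(extraction_result.makes, LoadMode.APPEND)
loader.print_load_report(load_result)
```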


@@ -0,0 +1,716 @@
"""
JSON Manual Loader for Vehicles ETL
Loads extracted JSON data into PostgreSQL database with referential integrity.
Supports clear/append modes with duplicate handling and comprehensive progress tracking.
Database Schema:
- vehicles.make (id, name)
- vehicles.model (id, make_id, name)
- vehicles.model_year (id, model_id, year)
- vehicles.trim (id, model_year_id, name)
- vehicles.engine (id, name, code, displacement_l, cylinders, fuel_type, aspiration)
- vehicles.trim_engine (trim_id, engine_id)
Load Modes:
- CLEAR: Truncate all tables and reload (destructive)
- APPEND: Insert with conflict resolution (safe)
Usage:
loader = JsonManualLoader(postgres_loader)
result = loader.load_all_makes(extraction_result.makes, LoadMode.APPEND)
"""
import logging
from typing import List, Dict, Optional
from enum import Enum
from dataclasses import dataclass
# Import our components (handle both relative and direct imports)
try:
from .postgres_loader import PostgreSQLLoader
from ..extractors.json_extractor import MakeData, ModelData, ExtractionResult
from ..utils.engine_spec_parser import EngineSpec
from ..connections import db_connections
except ImportError:
# Fallback for direct execution
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
# Import with fallback handling for nested imports
try:
from loaders.postgres_loader import PostgreSQLLoader
except ImportError:
# Mock PostgreSQLLoader for testing
class PostgreSQLLoader:
def __init__(self):
self.batch_size = 1000
from extractors.json_extractor import MakeData, ModelData, ExtractionResult
from utils.engine_spec_parser import EngineSpec
try:
from connections import db_connections
except ImportError:
# Mock db_connections for testing
class MockDBConnections:
def postgres_connection(self):
raise NotImplementedError("Database connection not available in test mode")
db_connections = MockDBConnections()
logger = logging.getLogger(__name__)
class LoadMode(Enum):
"""Data loading modes"""
CLEAR = "clear" # Truncate and reload (destructive)
APPEND = "append" # Insert with conflict handling (safe)
@dataclass
class LoadResult:
"""Result of loading operations"""
total_makes: int
total_models: int
total_model_years: int
total_trims: int
total_engines: int
total_trim_engine_mappings: int
failed_makes: List[str]
warnings: List[str]
load_mode: LoadMode
@property
def success_count(self) -> int:
return self.total_makes - len(self.failed_makes)
@property
def success_rate(self) -> float:
return self.success_count / self.total_makes if self.total_makes > 0 else 0.0
@dataclass
class LoadStatistics:
"""Detailed loading statistics"""
makes_processed: int = 0
makes_skipped: int = 0
models_inserted: int = 0
model_years_inserted: int = 0
skipped_model_years: int = 0
trims_inserted: int = 0
engines_inserted: int = 0
trim_engine_mappings_inserted: int = 0
duplicate_makes: int = 0
duplicate_models: int = 0
duplicate_engines: int = 0
errors: Optional[List[str]] = None
warnings: Optional[List[str]] = None
def __post_init__(self):
if self.errors is None:
self.errors = []
if self.warnings is None:
self.warnings = []
class JsonManualLoader:
"""Load JSON-extracted vehicle data into PostgreSQL"""
def _get_id_from_result(self, result, column_name='id'):
"""Helper to extract ID from query result, handling both tuple and dict cursors"""
if result is None:
return None
if isinstance(result, tuple):
return result[0]
# For RealDictCursor, try the column name first, fall back to key access
if column_name in result:
return result[column_name]
# For COUNT(*) queries, the key might be 'count'
if 'count' in result:
return result['count']
# Fall back to first value
return list(result.values())[0] if result else None
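# Behavior sketch (hypothetical rows, not executed at import time):
#   loader._get_id_from_result((42,))                 -> 42  # default tuple cursor
#   loader._get_id_from_result({'id': 42})            -> 42  # RealDictCursor row
#   loader._get_id_from_result({'count': 7}, 'count') -> 7   # COUNT(*) result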
def __init__(self, postgres_loader: Optional[PostgreSQLLoader] = None):
"""
Initialize JSON manual loader
Args:
postgres_loader: Existing PostgreSQL loader instance
"""
self.postgres_loader = postgres_loader or PostgreSQLLoader()
self.batch_size = 1000
logger.info("JsonManualLoader initialized")
def clear_all_tables(self) -> None:
"""
Clear all vehicles tables in dependency order
WARNING: This is destructive and will remove all data
"""
logger.warning("CLEARING ALL VEHICLES TABLES - This is destructive!")
tables_to_clear = [
'trim_engine', # Many-to-many mappings first
'trim_transmission',
'performance', # Tables with foreign keys
'trim',
'model_year',
'model',
'make',
'engine', # Independent tables last
'transmission'
]
with db_connections.postgres_connection() as conn:
cursor = conn.cursor()
for table in tables_to_clear:
try:
cursor.execute(f"TRUNCATE TABLE vehicles.{table} CASCADE")
logger.info(f"Cleared vehicles.{table}")
except Exception as e:
logger.warning(f"Failed to clear vehicles.{table}: {str(e)}")
conn.commit()
logger.info("All vehicles tables cleared")
def load_make(self, make_data: MakeData, mode: LoadMode, stats: LoadStatistics) -> int:
"""
Load a single make with all related data
Args:
make_data: Extracted make data
mode: Loading mode (clear/append)
stats: Statistics accumulator
Returns:
Make ID in database
"""
logger.debug(f"Loading make: {make_data.name}")
try:
with db_connections.postgres_connection() as conn:
cursor = conn.cursor()
# 1. Insert or get make (always check for existing to avoid constraint violations)
# Check if make exists (case-insensitive to match database constraint)
cursor.execute(
"SELECT id FROM vehicles.make WHERE lower(name) = lower(%s)",
(make_data.name,)
)
result = cursor.fetchone()
if result:
make_id = self._get_id_from_result(result)
stats.duplicate_makes += 1
logger.debug(f"Make {make_data.name} already exists with ID {make_id}")
else:
# Insert the new make inside a savepoint so a unique-constraint violation
# does not abort the enclosing transaction before the retry lookup below
try:
cursor.execute("SAVEPOINT sp_insert_make")
cursor.execute(
"INSERT INTO vehicles.make (name) VALUES (%s) RETURNING id",
(make_data.name,)
)
result = cursor.fetchone()
make_id = self._get_id_from_result(result)
logger.debug(f"Inserted make {make_data.name} with ID {make_id}")
except Exception as e:
if "duplicate key value violates unique constraint" in str(e):
# Roll back to the savepoint, then retry the lookup (race condition)
cursor.execute("ROLLBACK TO SAVEPOINT sp_insert_make")
cursor.execute(
"SELECT id FROM vehicles.make WHERE lower(name) = lower(%s)",
(make_data.name,)
)
result = cursor.fetchone()
if result:
make_id = self._get_id_from_result(result)
stats.duplicate_makes += 1
logger.debug(f"Make {make_data.name} found after retry with ID {make_id}")
else:
raise
else:
raise
# 2. Process models
for model_data in make_data.models:
model_id = self.load_model(cursor, make_id, model_data, mode, stats)
conn.commit()
stats.makes_processed += 1
return make_id
except Exception as e:
error_msg = f"Failed to load make {make_data.name}: {str(e)}"
logger.error(error_msg)
stats.errors.append(error_msg)
raise
def load_model(self, cursor, make_id: int, model_data: ModelData, mode: LoadMode, stats: LoadStatistics) -> int:
"""
Load a single model with all related data
Args:
cursor: Database cursor
make_id: Parent make ID
model_data: Extracted model data
mode: Loading mode
stats: Statistics accumulator
Returns:
Model ID in database
"""
# 1. Insert or get model
if mode == LoadMode.APPEND:
cursor.execute(
"SELECT id FROM vehicles.model WHERE make_id = %s AND name = %s",
(make_id, model_data.name)
)
result = cursor.fetchone()
if result:
model_id = self._get_id_from_result(result)
stats.duplicate_models += 1
else:
cursor.execute(
"INSERT INTO vehicles.model (make_id, name) VALUES (%s, %s) RETURNING id",
(make_id, model_data.name)
)
model_id = self._get_id_from_result(cursor.fetchone())
stats.models_inserted += 1
else:
# CLEAR mode - just insert
cursor.execute(
"INSERT INTO vehicles.model (make_id, name) VALUES (%s, %s) RETURNING id",
(make_id, model_data.name)
)
model_id = self._get_id_from_result(cursor.fetchone())
stats.models_inserted += 1
# 2. Insert model years and related data
for year in model_data.years:
model_year_id = self.load_model_year(cursor, model_id, year, model_data, mode, stats)
# Skip processing if year was outside valid range
if model_year_id is None:
continue
return model_id
def load_model_year(self, cursor, model_id: int, year: int, model_data: ModelData, mode: LoadMode, stats: LoadStatistics) -> Optional[int]:
"""
Load model year and associated trims/engines
Args:
cursor: Database cursor
model_id: Parent model ID
year: Model year
model_data: Model data with trims and engines
mode: Loading mode
stats: Statistics accumulator
Returns:
Model year ID in database, or None if the year falls outside the 1950-2100 constraint
"""
# Skip years that don't meet database constraints (must be 1950-2100)
if year < 1950 or year > 2100:
logger.warning(f"Skipping year {year} - outside valid range (1950-2100)")
stats.skipped_model_years += 1
return None
# 1. Insert or get model year
if mode == LoadMode.APPEND:
cursor.execute(
"SELECT id FROM vehicles.model_year WHERE model_id = %s AND year = %s",
(model_id, year)
)
result = cursor.fetchone()
if result:
model_year_id = self._get_id_from_result(result)
else:
cursor.execute(
"INSERT INTO vehicles.model_year (model_id, year) VALUES (%s, %s) RETURNING id",
(model_id, year)
)
model_year_id = self._get_id_from_result(cursor.fetchone())
stats.model_years_inserted += 1
else:
# CLEAR mode - just insert
cursor.execute(
"INSERT INTO vehicles.model_year (model_id, year) VALUES (%s, %s) RETURNING id",
(model_id, year)
)
model_year_id = self._get_id_from_result(cursor.fetchone())
stats.model_years_inserted += 1
# 2. Load engines and get their IDs
engine_ids = []
for engine_spec in model_data.engines:
engine_id = self.load_engine(cursor, engine_spec, mode, stats)
engine_ids.append(engine_id)
# 3. Load trims and connect to engines
for trim_name in model_data.trims:
trim_id = self.load_trim(cursor, model_year_id, trim_name, engine_ids, mode, stats)
return model_year_id
def load_engine(self, cursor, engine_spec: EngineSpec, mode: LoadMode, stats: LoadStatistics) -> int:
"""
Load engine specification
Args:
cursor: Database cursor
engine_spec: Parsed engine specification
mode: Loading mode
stats: Statistics accumulator
Returns:
Engine ID in database
"""
# Create a canonical engine name for database storage
if engine_spec.displacement_l and engine_spec.configuration != "Unknown" and engine_spec.cylinders:
engine_name = f"{engine_spec.displacement_l}L {engine_spec.configuration}{engine_spec.cylinders}"
else:
engine_name = engine_spec.raw_string
# Generate engine code from name (remove spaces, lowercase)
engine_code = engine_name.replace(" ", "").lower()
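# Example: displacement_l=2.0, configuration="I", cylinders=4 yields
# engine_name "2.0L I4" and engine_code "2.0li4"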
# Always check for existing engine by name or code to avoid constraint violations
cursor.execute("""
SELECT id FROM vehicles.engine
WHERE lower(name) = lower(%s) OR (code IS NOT NULL AND code = %s)
""", (engine_name, engine_code))
result = cursor.fetchone()
if result:
engine_id = self._get_id_from_result(result)
stats.duplicate_engines += 1
return engine_id
# Insert new engine inside a savepoint so a unique-constraint violation
# does not abort the enclosing transaction before the retry lookup
try:
cursor.execute("SAVEPOINT sp_insert_engine")
cursor.execute("""
INSERT INTO vehicles.engine (name, code, displacement_l, cylinders, fuel_type, aspiration)
VALUES (%s, %s, %s, %s, %s, %s)
RETURNING id
""", (
engine_name,
engine_code,
engine_spec.displacement_l,
engine_spec.cylinders,
engine_spec.fuel_type if engine_spec.fuel_type != "Unknown" else None,
engine_spec.aspiration if engine_spec.aspiration != "Natural" else None
))
engine_id = self._get_id_from_result(cursor.fetchone())
stats.engines_inserted += 1
return engine_id
except Exception as e:
if "duplicate key value violates unique constraint" in str(e):
# Roll back to the savepoint, then retry the lookup (race condition)
cursor.execute("ROLLBACK TO SAVEPOINT sp_insert_engine")
cursor.execute("""
SELECT id FROM vehicles.engine
WHERE lower(name) = lower(%s) OR (code IS NOT NULL AND code = %s)
""", (engine_name, engine_code))
result = cursor.fetchone()
if result:
engine_id = self._get_id_from_result(result)
stats.duplicate_engines += 1
return engine_id
raise
def load_trim(self, cursor, model_year_id: int, trim_name: str, engine_ids: List[int], mode: LoadMode, stats: LoadStatistics) -> int:
"""
Load trim and connect to engines
Args:
cursor: Database cursor
model_year_id: Parent model year ID
trim_name: Trim name
engine_ids: List of engine IDs to connect
mode: Loading mode
stats: Statistics accumulator
Returns:
Trim ID in database
"""
# 1. Insert or get trim
if mode == LoadMode.APPEND:
cursor.execute(
"SELECT id FROM vehicles.trim WHERE model_year_id = %s AND name = %s",
(model_year_id, trim_name)
)
result = cursor.fetchone()
if result:
trim_id = self._get_id_from_result(result)
else:
cursor.execute(
"INSERT INTO vehicles.trim (model_year_id, name) VALUES (%s, %s) RETURNING id",
(model_year_id, trim_name)
)
trim_id = self._get_id_from_result(cursor.fetchone())
stats.trims_inserted += 1
else:
# CLEAR mode - just insert
cursor.execute(
"INSERT INTO vehicles.trim (model_year_id, name) VALUES (%s, %s) RETURNING id",
(model_year_id, trim_name)
)
trim_id = self._get_id_from_result(cursor.fetchone())
stats.trims_inserted += 1
# 2. Connect trim to engines (always check for existing to avoid duplicates)
# Deduplicate engine_ids to prevent duplicate mappings within the same trim
unique_engine_ids = list(set(engine_ids))
for engine_id in unique_engine_ids:
# Check if mapping already exists
cursor.execute(
"SELECT 1 FROM vehicles.trim_engine WHERE trim_id = %s AND engine_id = %s",
(trim_id, engine_id)
)
if not cursor.fetchone():
try:
cursor.execute("SAVEPOINT sp_trim_engine")
cursor.execute(
"INSERT INTO vehicles.trim_engine (trim_id, engine_id) VALUES (%s, %s)",
(trim_id, engine_id)
)
stats.trim_engine_mappings_inserted += 1
except Exception as e:
if "duplicate key value violates unique constraint" in str(e):
# Another process may have inserted it; roll back to the savepoint and skip
cursor.execute("ROLLBACK TO SAVEPOINT sp_trim_engine")
logger.debug(f"Trim-engine mapping ({trim_id}, {engine_id}) already exists, skipping")
else:
raise
return trim_id
def load_all_makes(self, makes_data: List[MakeData], mode: LoadMode) -> LoadResult:
"""
Load all makes with complete data
Args:
makes_data: List of extracted make data
mode: Loading mode (clear/append)
Returns:
LoadResult with comprehensive statistics
"""
logger.info(f"Starting bulk load of {len(makes_data)} makes in {mode.value} mode")
# Clear tables if in CLEAR mode
if mode == LoadMode.CLEAR:
self.clear_all_tables()
stats = LoadStatistics()
failed_makes = []
for make_data in makes_data:
try:
if make_data.processing_errors:
logger.warning(f"Skipping make {make_data.name} due to extraction errors")
stats.makes_skipped += 1
failed_makes.append(make_data.name)
continue
make_id = self.load_make(make_data, mode, stats)
logger.info(f"Successfully loaded make {make_data.name} (ID: {make_id})")
except Exception as e:
logger.error(f"Failed to load make {make_data.name}: {str(e)}")
failed_makes.append(make_data.name)
continue
# Create result
result = LoadResult(
total_makes=len(makes_data),
total_models=stats.models_inserted,
total_model_years=stats.model_years_inserted,
total_trims=stats.trims_inserted,
total_engines=stats.engines_inserted,
total_trim_engine_mappings=stats.trim_engine_mappings_inserted,
failed_makes=failed_makes,
warnings=stats.warnings,
load_mode=mode
)
logger.info(f"Bulk load complete: {result.success_count}/{result.total_makes} makes loaded successfully")
logger.info(f"Data loaded: {result.total_models} models, {result.total_engines} engines, {result.total_trims} trims")
return result
def get_database_statistics(self) -> Dict[str, int]:
"""
Get current database record counts
Returns:
Dictionary with table counts
"""
stats = {}
tables = ['make', 'model', 'model_year', 'trim', 'engine', 'trim_engine']
with db_connections.postgres_connection() as conn:
cursor = conn.cursor()
for table in tables:
cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table}")
result = cursor.fetchone()
stats[table] = result[0] if isinstance(result, tuple) else result['count']
return stats
def validate_referential_integrity(self) -> List[str]:
"""
Validate referential integrity of loaded data
Returns:
List of integrity issues found (empty if all good)
"""
issues = []
with db_connections.postgres_connection() as conn:
cursor = conn.cursor()
# Check for orphaned models
cursor.execute("""
SELECT COUNT(*) FROM vehicles.model m
LEFT JOIN vehicles.make mk ON m.make_id = mk.id
WHERE mk.id IS NULL
""")
orphaned_models = self._get_id_from_result(cursor.fetchone(), 'count')
if orphaned_models > 0:
issues.append(f"Found {orphaned_models} orphaned models")
# Check for orphaned model_years
cursor.execute("""
SELECT COUNT(*) FROM vehicles.model_year my
LEFT JOIN vehicles.model m ON my.model_id = m.id
WHERE m.id IS NULL
""")
orphaned_model_years = self._get_id_from_result(cursor.fetchone())
if orphaned_model_years > 0:
issues.append(f"Found {orphaned_model_years} orphaned model_years")
# Check for orphaned trims
cursor.execute("""
SELECT COUNT(*) FROM vehicles.trim t
LEFT JOIN vehicles.model_year my ON t.model_year_id = my.id
WHERE my.id IS NULL
""")
orphaned_trims = self._get_id_from_result(cursor.fetchone())
if orphaned_trims > 0:
issues.append(f"Found {orphaned_trims} orphaned trims")
# Check for broken trim_engine mappings
cursor.execute("""
SELECT COUNT(*) FROM vehicles.trim_engine te
LEFT JOIN vehicles.trim t ON te.trim_id = t.id
LEFT JOIN vehicles.engine e ON te.engine_id = e.id
WHERE t.id IS NULL OR e.id IS NULL
""")
broken_mappings = self._get_id_from_result(cursor.fetchone())
if broken_mappings > 0:
issues.append(f"Found {broken_mappings} broken trim_engine mappings")
if issues:
logger.warning(f"Referential integrity issues found: {issues}")
else:
logger.info("Referential integrity validation passed")
return issues
def print_load_report(self, result: LoadResult) -> None:
"""
Print comprehensive loading report
Args:
result: LoadResult from load operation
"""
print(f"🚀 JSON MANUAL LOADING REPORT")
print(f"=" * 50)
# Load summary
print(f"\n📊 LOADING SUMMARY")
print(f" Mode: {result.load_mode.value.upper()}")
print(f" Makes processed: {result.success_count}/{result.total_makes}")
print(f" Success rate: {result.success_rate:.1%}")
# Data counts
print(f"\n📈 DATA LOADED")
print(f" Models: {result.total_models}")
print(f" Model years: {result.total_model_years}")
print(f" Trims: {result.total_trims}")
print(f" Engines: {result.total_engines}")
print(f" Trim-engine mappings: {result.total_trim_engine_mappings}")
# Issues
if result.failed_makes:
print(f"\n⚠️ FAILED MAKES ({len(result.failed_makes)}):")
for make in result.failed_makes:
print(f" {make}")
if result.warnings:
print(f"\n⚠️ WARNINGS ({len(result.warnings)}):")
for warning in result.warnings[:5]: # Show first 5
print(f" {warning}")
if len(result.warnings) > 5:
print(f" ... and {len(result.warnings) - 5} more warnings")
# Database statistics
print(f"\n📋 DATABASE STATISTICS:")
db_stats = self.get_database_statistics()
for table, count in db_stats.items():
print(f" vehicles.{table}: {count:,} records")
# Referential integrity
integrity_issues = self.validate_referential_integrity()
if integrity_issues:
print(f"\n❌ REFERENTIAL INTEGRITY ISSUES:")
for issue in integrity_issues:
print(f" {issue}")
else:
print(f"\n✅ REFERENTIAL INTEGRITY: PASSED")
# Example usage and testing functions
def example_usage():
"""Demonstrate JsonManualLoader usage"""
print("🚀 JsonManualLoader Example Usage")
print("=" * 40)
# This would typically be called after JsonExtractor
# For demo purposes, we'll just show the structure
print("\n📋 Typical usage flow:")
print("1. Extract data with JsonExtractor")
print("2. Create JsonManualLoader")
print("3. Load data in APPEND or CLEAR mode")
print("4. Validate and report results")
print(f"\n💡 Example code:")
print("""
# Extract data
extractor = JsonExtractor(make_mapper, engine_parser)
extraction_result = extractor.extract_all_makes('sources/makes')
# Load data
loader = JsonManualLoader()
load_result = loader.load_all_makes(extraction_result.makes, LoadMode.APPEND)
# Report results
loader.print_load_report(load_result)
""")
if __name__ == "__main__":
example_usage()


@@ -0,0 +1,437 @@
#!/usr/bin/env python3
"""
MSSQL Database Loader
Handles loading .bak files into MSSQL Server for ETL processing
"""
import os
import logging
import pyodbc
import time
from pathlib import Path
from typing import Optional, List
from ..config import config
logger = logging.getLogger(__name__)
class MSSQLLoader:
"""Loads database files into MSSQL Server"""
def __init__(self):
self.server = config.MSSQL_HOST
self.port = config.MSSQL_PORT
self.database = config.MSSQL_DATABASE
self.username = config.MSSQL_USER
self.password = config.MSSQL_PASSWORD
def get_connection_string(self, database: str = "master") -> str:
"""Get MSSQL connection string"""
return (
f"DRIVER={{ODBC Driver 17 for SQL Server}};"
f"SERVER={self.server},{self.port};"
f"DATABASE={database};"
f"UID={self.username};"
f"PWD={self.password};"
f"TrustServerCertificate=yes;"
)
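# Resulting string shape (hypothetical host and credentials):
#   DRIVER={ODBC Driver 17 for SQL Server};SERVER=mssql,1433;DATABASE=master;
#   UID=sa;PWD=***;TrustServerCertificate=yes;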
def test_connection(self) -> bool:
"""Test MSSQL connection"""
try:
conn_str = self.get_connection_string()
logger.info(f"Testing MSSQL connection to: {self.server}")
with pyodbc.connect(conn_str, timeout=30) as conn:
cursor = conn.cursor()
cursor.execute("SELECT @@VERSION")
version = cursor.fetchone()[0]
logger.info(f"MSSQL connection successful: {version[:100]}...")
return True
except Exception as e:
logger.error(f"MSSQL connection failed: {e}")
return False
def database_exists(self, database_name: str) -> bool:
"""Check if database exists"""
try:
conn_str = self.get_connection_string()
with pyodbc.connect(conn_str, timeout=30) as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT COUNT(*) FROM sys.databases WHERE name = ?",
(database_name,)
)
count = cursor.fetchone()[0]
return count > 0
except Exception as e:
logger.error(f"Failed to check if database exists: {e}")
return False
def get_database_state(self, database_name: str) -> Optional[str]:
"""Return the state_desc for a database or None if not found"""
try:
conn_str = self.get_connection_string()
with pyodbc.connect(conn_str, timeout=30) as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT state_desc FROM sys.databases WHERE name = ?",
(database_name,)
)
row = cursor.fetchone()
return row[0] if row else None
except Exception as e:
logger.error(f"Failed to get database state: {e}")
return None
def drop_database(self, database_name: str) -> bool:
"""Drop database if it exists"""
try:
if not self.database_exists(database_name):
logger.info(f"Database {database_name} does not exist, skipping drop")
return True
logger.info(f"Dropping database: {database_name}")
conn_str = self.get_connection_string()
with pyodbc.connect(conn_str, timeout=30) as conn:
conn.autocommit = True
cursor = conn.cursor()
# Kill existing connections
cursor.execute(f"""
ALTER DATABASE [{database_name}] SET SINGLE_USER WITH ROLLBACK IMMEDIATE;
DROP DATABASE [{database_name}];
""")
logger.info(f"Successfully dropped database: {database_name}")
return True
except Exception as e:
logger.error(f"Failed to drop database {database_name}: {e}")
return False
def get_backup_file_info(self, bak_path: Path) -> Optional[dict]:
"""Get information about backup file"""
try:
# Use the MSSQL container's mounted backup directory
container_path = f"/backups/{bak_path.name}"
# For now, assume the file is accessible
# In production, this would copy the file into the MSSQL container
conn_str = self.get_connection_string()
with pyodbc.connect(conn_str, timeout=30) as conn:
cursor = conn.cursor()
# Get backup file information
cursor.execute(f"RESTORE HEADERONLY FROM DISK = '{container_path}'")
headers = cursor.fetchall()
if headers:
header = headers[0]
return {
"database_name": header.DatabaseName,
"server_name": header.ServerName,
"backup_start_date": header.BackupStartDate,
"backup_finish_date": header.BackupFinishDate,
"backup_size": header.BackupSize,
}
except Exception as e:
logger.warning(f"Could not get backup file info: {e}")
return None
def restore_database(self, bak_path: Path, target_database: Optional[str] = None) -> bool:
"""
Restore database from .bak file
Args:
bak_path: Path to .bak file
target_database: Target database name (defaults to the configured database, e.g. VPICList)
Returns:
True if successful
"""
if target_database is None:
target_database = self.database
if not bak_path.exists():
logger.error(f"Backup file does not exist: {bak_path}")
return False
logger.info(f"Starting database restore: {bak_path} -> {target_database}")
try:
# Copy backup file to MSSQL container
container_bak_path = self.copy_backup_to_container(bak_path)
if not container_bak_path:
logger.error("Failed to copy backup file to container")
return False
# If database exists, note the state; we will handle exclusivity in the same session below
if self.database_exists(target_database):
state = self.get_database_state(target_database)
logger.info(f"Existing database detected: {target_database} (state={state})")
else:
logger.info(f"Target database does not exist yet: {target_database} — proceeding with restore")
# Restore database using a single master connection for exclusivity
logger.info(f"Restoring database from: {container_bak_path}")
conn_str = self.get_connection_string()
with pyodbc.connect(conn_str, timeout=600) as conn: # 10 minute timeout
conn.autocommit = True
cursor = conn.cursor()
# Build the kill-sessions script up front so the RESTORE retry below can reuse it
kill_sql = f"""
DECLARE @db sysname = N'{target_database}';
DECLARE @kill nvarchar(max) = N'';
SELECT @kill = @kill + N'KILL ' + CONVERT(nvarchar(10), session_id) + N';'
FROM sys.dm_exec_sessions
WHERE database_id = DB_ID(@db) AND session_id <> @@SPID;
IF LEN(@kill) > 0 EXEC (@kill);
"""
# If DB exists, ensure exclusive access: kill sessions + SINGLE_USER in this session
if self.database_exists(target_database):
try:
logger.info(f"Preparing exclusive access for restore: killing active sessions on {target_database}")
cursor.execute(kill_sql)
# Force SINGLE_USER in current session
cursor.execute(f"ALTER DATABASE [{target_database}] SET SINGLE_USER WITH ROLLBACK IMMEDIATE;")
logger.info(f"Exclusive access prepared (SINGLE_USER) for {target_database}")
except Exception as e:
logger.warning(f"Could not fully prepare exclusive access: {e}")
# Get logical file names from backup
cursor.execute(f"RESTORE FILELISTONLY FROM DISK = '{container_bak_path}'")
files = cursor.fetchall()
if not files:
logger.error("No files found in backup")
return False
# Build RESTORE command with MOVE options
data_file = None
log_file = None
for file_info in files:
logical_name = file_info.LogicalName
file_type = file_info.Type
if file_type == 'D': # Data file
data_file = logical_name
elif file_type == 'L': # Log file
log_file = logical_name
if not data_file:
logger.error("No data file found in backup")
return False
# Construct restore command
restore_sql = f"""
RESTORE DATABASE [{target_database}]
FROM DISK = '{container_bak_path}'
WITH
MOVE '{data_file}' TO '/var/opt/mssql/data/{target_database}.mdf',
"""
if log_file:
restore_sql += f" MOVE '{log_file}' TO '/var/opt/mssql/data/{target_database}.ldf',"
restore_sql += """
REPLACE,
RECOVERY,
STATS = 10
"""
logger.info(f"Executing restore command for database: {target_database}")
logger.debug(f"Restore SQL: {restore_sql}")
try:
cursor.execute(restore_sql)
except Exception as e:
# If we hit exclusive access error, retry once after killing sessions again
if 'Exclusive access could not be obtained' in str(e):
logger.warning("Exclusive access error on RESTORE; retrying after killing sessions and reasserting SINGLE_USER...")
try:
cursor.execute(kill_sql)
cursor.execute(f"ALTER DATABASE [{target_database}] SET SINGLE_USER WITH ROLLBACK IMMEDIATE;")
except Exception as e2:
logger.warning(f"Retry exclusive prep failed: {e2}")
cursor.execute(restore_sql)
else:
raise
# Poll for database to be ONLINE
if not self._wait_for_database_online(target_database):
logger.error(f"Database did not come ONLINE in time: {target_database}")
return False
# Small retry around database_exists to handle late readiness
if self._retry_database_exists(target_database):
logger.info(f"Database restore successful and ONLINE: {target_database}")
# Get basic database info
cursor.execute(f"""
SELECT
name,
create_date,
compatibility_level,
state_desc
FROM sys.databases
WHERE name = '{target_database}'
""")
db_info = cursor.fetchone()
if db_info:
logger.info(f"Database info: Name={db_info.name}, Created={db_info.create_date}, Level={db_info.compatibility_level}, State={db_info.state_desc}")
# Optional: quick content verification with small retry window
if not self._retry_verify_content(target_database):
logger.warning("Database restored but content verification is inconclusive")
# Try to set MULTI_USER back in same session
try:
cursor.execute(f"ALTER DATABASE [{target_database}] SET MULTI_USER;")
logger.info(f"Set {target_database} back to MULTI_USER")
except Exception as e:
logger.warning(f"Could not set MULTI_USER on {target_database}: {e}")
return True
else:
logger.error(f"Database restore failed - database not found: {target_database}")
return False
except Exception as e:
logger.error(f"Database restore failed: {e}")
return False
def copy_backup_to_container(self, bak_path: Path) -> Optional[str]:
"""
Copy backup file to shared volume accessible by MSSQL container
Args:
bak_path: Local path to .bak file
Returns:
Container path to .bak file or None if failed
"""
try:
# Use shared volume instead of docker cp
shared_dir = Path("/app/shared")
shared_bak_path = shared_dir / bak_path.name
# If the file is already in the shared dir, skip copying
if bak_path.resolve().parent == shared_dir.resolve():
logger.info(f"Backup already in shared volume: {bak_path}")
else:
logger.info(f"Copying {bak_path} to shared volume...")
import shutil
shutil.copy2(bak_path, shared_bak_path)
# Container path from MSSQL perspective
container_path = f"/backups/{shared_bak_path.name}"
logger.info(f"Successfully copied to shared volume: {container_path}")
return container_path
except Exception as e:
logger.error(f"Failed to copy backup to shared volume: {e}")
return None
def _wait_for_database_online(self, database_name: str, timeout_seconds: int = 600, interval_seconds: int = 5) -> bool:
"""Poll MSSQL until the specified database state becomes ONLINE or timeout.
Returns True if ONLINE, False on timeout/error.
"""
logger.info(f"Waiting for database to become ONLINE: {database_name}")
deadline = time.time() + timeout_seconds
last_state = None
try:
conn_str = self.get_connection_string()
while time.time() < deadline:
with pyodbc.connect(conn_str, timeout=30) as conn:
cursor = conn.cursor()
cursor.execute("SELECT state_desc FROM sys.databases WHERE name = ?", (database_name,))
row = cursor.fetchone()
if row:
state = row[0]
if state != last_state:
logger.info(f"Database state: {state}")
last_state = state
if state == 'ONLINE':
# Optional: verify updateability is READ_WRITE
try:
cursor.execute("SELECT DATABASEPROPERTYEX(?, 'Updateability')", (database_name,))
up = cursor.fetchone()[0]
logger.info(f"Database updateability: {up}")
except Exception:
pass
return True
else:
logger.info("Database entry not found yet in sys.databases")
time.sleep(interval_seconds)
except Exception as e:
logger.error(f"Error while waiting for database ONLINE: {e}")
return False
logger.error("Timed out waiting for database to become ONLINE")
return False
def _retry_database_exists(self, database_name: str, attempts: int = 6, delay_seconds: int = 5) -> bool:
"""Retry wrapper for database existence checks."""
for i in range(1, attempts + 1):
if self.database_exists(database_name):
return True
logger.info(f"database_exists() false, retrying ({i}/{attempts})...")
time.sleep(delay_seconds)
return False
def _retry_verify_content(self, database_name: str, attempts: int = 3, delay_seconds: int = 5) -> bool:
"""Retry wrapper around verify_database_content to allow late readiness."""
for i in range(1, attempts + 1):
try:
counts = self.verify_database_content(database_name)
if counts:
logger.info(f"Content verification counts: {counts}")
return True
except Exception as e:
logger.info(f"Content verification attempt {i} failed: {e}")
time.sleep(delay_seconds)
return False
def verify_database_content(self, database_name: Optional[str] = None) -> dict:
"""
Verify database has expected content
Returns:
Dictionary with table counts
"""
if database_name is None:
database_name = self.database
try:
conn_str = self.get_connection_string(database_name)
with pyodbc.connect(conn_str, timeout=30) as conn:
cursor = conn.cursor()
# Get table counts for key tables
tables_to_check = ['Make', 'Model', 'VehicleType', 'Manufacturer']
counts = {}
for table in tables_to_check:
try:
cursor.execute(f"SELECT COUNT(*) FROM {table}")
count = cursor.fetchone()[0]
counts[table] = count
logger.info(f"Table {table}: {count:,} rows")
except Exception as e:
logger.warning(f"Could not count table {table}: {e}")
counts[table] = 0
return counts
except Exception as e:
logger.error(f"Failed to verify database content: {e}")
return {}
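# Minimal usage sketch (comments only; the relative `..config` import means this
# module runs as part of the package, and the .bak path below is an assumption):
#   loader = MSSQLLoader()
#   if loader.test_connection():
#       loader.restore_database(Path("/app/shared/VPICList.bak"))
#       loader.verify_database_content()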


@@ -0,0 +1,354 @@
import logging
from typing import List, Dict, Optional
from psycopg2.extras import execute_batch
from ..connections import db_connections
from tqdm import tqdm
logger = logging.getLogger(__name__)
class PostgreSQLLoader:
"""Load data into PostgreSQL target database"""
def __init__(self):
self.batch_size = 1000
def load_reference_table(self, table_name: str, data: List[Dict],
clear_existing: bool = True) -> int:
"""Load data into a reference table"""
if not data:
logger.warning(f"No data to load for table {table_name}")
return 0
logger.info(f"Loading {len(data)} records into vehicles.{table_name}")
# Column mapping from source (MS SQL) to target (PostgreSQL)
column_mappings = {
'Id': 'id',
'Name': 'name',
'Code': 'code',
'MakeId': 'make_id',
'CreateOn': 'created_at',
'CreatedOn': 'created_at',
'UpdateOn': 'updated_at',
'UpdatedOn': 'updated_at',
'Wmi': 'wmi',
'ManufacturerId': 'manufacturer_id',
'VehicleTypeId': 'vehicle_type_id',
'TruckTypeId': 'truck_type_id',
'CountryId': 'country_id',
'PublicAvailabilityDate': 'public_availability_date',
'NonCompliant': 'non_compliant',
'NonCompliantReason': 'non_compliant_reason',
'ProcessedOn': 'processed_on',
'DisplayOrder': 'display_order',
'FormType': 'form_type',
'Description': 'description',
'LookupTable': 'lookup_table',
'IsPrivate': 'is_private',
'GroupName': 'group_name',
'DataType': 'data_type',
'MinAllowedValue': 'min_allowed_value',
'MaxAllowedValue': 'max_allowed_value',
'IsQS': 'is_qs',
'Decode': 'decode',
'weight': 'weight',
# ErrorCode specific mappings
'ErrorCodeName': 'code',
'ErrorCodeDescription': 'description'
}
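# Example: a source row {'Id': 3, 'Name': 'Audi', 'CreatedOn': ...} lands in
# target columns (id, name, created_at); unmapped source columns fall back
# to their lowercased names below.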
with db_connections.postgres_connection() as conn:
cursor = conn.cursor()
if clear_existing:
cursor.execute(f"TRUNCATE TABLE vehicles.{table_name} CASCADE")
logger.info(f"Cleared existing data from vehicles.{table_name}")
# Get source columns and map them to target columns
source_columns = list(data[0].keys())
target_columns = []
valid_data = []
# Map columns and filter data
for source_col in source_columns:
if source_col in column_mappings:
target_columns.append(column_mappings[source_col])
else:
target_columns.append(source_col.lower())
# Check which columns exist in target table
cursor.execute(f"""
SELECT column_name
FROM information_schema.columns
WHERE table_schema = 'vehicles' AND table_name = '{table_name}'
""")
results = cursor.fetchall()
existing_columns = {row['column_name'] if isinstance(row, dict) else row[0] for row in results}
# Filter to only existing columns
final_columns = []
final_indices = []
for i, col in enumerate(target_columns):
if col in existing_columns:
final_columns.append(col)
final_indices.append(i)
if not final_columns:
logger.warning(f"No matching columns found for table {table_name}")
return 0
column_str = ','.join(final_columns)
placeholders = ','.join(['%s'] * len(final_columns))
# Prepare insert query
query = f"""
INSERT INTO vehicles.{table_name} ({column_str})
VALUES ({placeholders})
ON CONFLICT DO NOTHING
"""
# Prepare data tuples with only valid columns
data_tuples = []
for record in data:
values = []
skip_record = False
for i in final_indices:
source_col = source_columns[i]
value = record[source_col]
# Handle special cases for error_codes table
if table_name == 'error_codes' and source_col in ['ErrorCodeName', 'Code'] and (value is None or value == ''):
skip_record = True
break
values.append(value)
if not skip_record:
data_tuples.append(tuple(values))
# Execute batch insert
execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
conn.commit()
# Get final count
cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table_name}")
result = cursor.fetchone()
final_count = result['count'] if isinstance(result, dict) and 'count' in result else result[0]
logger.info(f"Successfully loaded {final_count} records into vehicles.{table_name}")
return final_count
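# Typical call (illustrative row shape, as extracted from the MSSQL Make table):
#   loader = PostgreSQLLoader()
#   loader.load_reference_table('make', [{'Id': 1, 'Name': 'Audi'}])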
def load_wmi_vin_schema_mappings(self, mappings: List[Dict]) -> int:
"""Load WMI to VIN Schema mappings"""
if not mappings:
return 0
logger.info(f"Loading {len(mappings)} WMI-VinSchema mappings")
with db_connections.postgres_connection() as conn:
cursor = conn.cursor()
# Clear existing mappings
cursor.execute("TRUNCATE TABLE vehicles.wmi_vin_schemas CASCADE")
query = """
INSERT INTO vehicles.wmi_vin_schemas
(wmi_id, vin_schema_id, year_from, year_to)
VALUES (%s, %s, %s, %s)
ON CONFLICT DO NOTHING
"""
data_tuples = []
for mapping in mappings:
data_tuples.append((
mapping['WmiId'],
mapping['VinSchemaId'],
mapping['YearFrom'] or 1980,
mapping['YearTo'] or 2999
))
execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
conn.commit()
# Get final count
cursor.execute("SELECT COUNT(*) FROM vehicles.wmi_vin_schemas")
result = cursor.fetchone()
final_count = result['count'] if isinstance(result, dict) and 'count' in result else result[0]
logger.info(f"Successfully loaded {final_count} WMI-VinSchema mappings")
return final_count
def load_make_model_relationships(self, relationships: List[Dict]) -> int:
"""Load Make-Model relationships"""
if not relationships:
return 0
logger.info(f"Loading {len(relationships)} Make-Model relationships")
with db_connections.postgres_connection() as conn:
cursor = conn.cursor()
# Clear existing relationships
cursor.execute("TRUNCATE TABLE vehicles.make_models CASCADE")
query = """
INSERT INTO vehicles.make_models (make_id, model_id)
VALUES (%s, %s)
ON CONFLICT DO NOTHING
"""
data_tuples = []
for rel in relationships:
data_tuples.append((rel['MakeId'], rel['ModelId']))
execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
conn.commit()
# Get final count
cursor.execute("SELECT COUNT(*) FROM vehicles.make_models")
result = cursor.fetchone()
final_count = result['count'] if isinstance(result, dict) and 'count' in result else result[0]
logger.info(f"Successfully loaded {final_count} Make-Model relationships")
return final_count
def load_wmi_make_relationships(self, relationships: List[Dict]) -> int:
"""Load WMI-Make relationships"""
if not relationships:
return 0
logger.info(f"Loading {len(relationships)} WMI-Make relationships")
with db_connections.postgres_connection() as conn:
cursor = conn.cursor()
# Clear existing relationships
cursor.execute("TRUNCATE TABLE vehicles.wmi_makes CASCADE")
query = """
INSERT INTO vehicles.wmi_makes (wmi_id, make_id)
VALUES (%s, %s)
ON CONFLICT DO NOTHING
"""
data_tuples = []
for rel in relationships:
data_tuples.append((rel['WmiId'], rel['MakeId']))
execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
conn.commit()
# Get final count
cursor.execute("SELECT COUNT(*) FROM vehicles.wmi_makes")
result = cursor.fetchone()
final_count = result['count'] if isinstance(result, dict) and 'count' in result else result[0]
logger.info(f"Successfully loaded {final_count} WMI-Make relationships")
return final_count
def load_model_years(self, model_years: List[Dict]) -> int:
"""Load model year availability data"""
if not model_years:
return 0
logger.info(f"Loading {len(model_years)} model year records")
with db_connections.postgres_connection() as conn:
cursor = conn.cursor()
query = """
INSERT INTO vehicles.model_year (model_id, year)
VALUES (%s, %s)
ON CONFLICT (model_id, year) DO NOTHING
"""
data_tuples = [(my['model_id'], my['year']) for my in model_years]
execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
conn.commit()
return len(model_years)
def load_trims(self, trims: List[Dict]) -> int:
"""Load trim data"""
if not trims:
return 0
logger.info(f"Loading {len(trims)} trim records")
with db_connections.postgres_connection() as conn:
cursor = conn.cursor()
query = """
INSERT INTO vehicles.trim (model_year_id, name)
VALUES (%s, %s)
ON CONFLICT DO NOTHING
"""
data_tuples = [(t['model_year_id'], t['name']) for t in trims]
execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
conn.commit()
return len(trims)
def load_engines(self, engines: List[Dict]) -> int:
"""Load engine data"""
if not engines:
return 0
logger.info(f"Loading {len(engines)} engine records")
with db_connections.postgres_connection() as conn:
cursor = conn.cursor()
query = """
INSERT INTO vehicles.engine (name, code, displacement_l, cylinders, fuel_type, aspiration)
VALUES (%s, %s, %s, %s, %s, %s)
ON CONFLICT (lower(name)) DO NOTHING
RETURNING id
"""
for engine in engines:
cursor.execute(query, (
engine['name'],
engine.get('code'),
engine.get('displacement_l'),
engine.get('cylinders'),
engine.get('fuel_type'),
engine.get('aspiration')
))
conn.commit()
return len(engines)
def load_trim_engine_relationships(self, relationships: List[Dict]) -> int:
"""Load trim-engine relationships"""
if not relationships:
return 0
logger.info(f"Loading {len(relationships)} trim-engine relationships")
with db_connections.postgres_connection() as conn:
cursor = conn.cursor()
query = """
INSERT INTO vehicles.trim_engine (trim_id, engine_id)
VALUES (%s, %s)
ON CONFLICT (trim_id, engine_id) DO NOTHING
"""
data_tuples = [(rel['trim_id'], rel['engine_id']) for rel in relationships]
execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
conn.commit()
return len(relationships)
def get_table_count(self, table_name: str) -> int:
"""Get count of records in a table"""
with db_connections.postgres_connection() as conn:
cursor = conn.cursor()
cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table_name}")
result = cursor.fetchone()
return result['count'] if isinstance(result, dict) and 'count' in result else result[0]