Files
motovaultpro/mvp-platform-services/vehicles/etl/loaders/postgres_loader.py
Eric Gullickson a052040e3a Initial Commit
2025-09-17 16:09:15 -05:00

355 lines
13 KiB
Python
Executable File

import logging
import re
from typing import List, Dict, Optional

from psycopg2.extras import execute_batch
from tqdm import tqdm

from ..connections import db_connections
logger = logging.getLogger(__name__)
class PostgreSQLLoader:
    """Load data into the PostgreSQL target database.

    Every public loader opens its own connection through
    ``db_connections.postgres_connection()``, writes into the ``vehicles``
    schema and commits before returning.
    """

    # Column mapping from source (MS SQL) to target (PostgreSQL). Source
    # columns that are not listed fall back to their lower-cased name.
    _COLUMN_MAPPINGS = {
        'Id': 'id',
        'Name': 'name',
        'Code': 'code',
        'MakeId': 'make_id',
        'CreateOn': 'created_at',
        'CreatedOn': 'created_at',
        'UpdateOn': 'updated_at',
        'UpdatedOn': 'updated_at',
        'Wmi': 'wmi',
        'ManufacturerId': 'manufacturer_id',
        'VehicleTypeId': 'vehicle_type_id',
        'TruckTypeId': 'truck_type_id',
        'CountryId': 'country_id',
        'PublicAvailabilityDate': 'public_availability_date',
        'NonCompliant': 'non_compliant',
        'NonCompliantReason': 'non_compliant_reason',
        'ProcessedOn': 'processed_on',
        'DisplayOrder': 'display_order',
        'FormType': 'form_type',
        'Description': 'description',
        'LookupTable': 'lookup_table',
        'IsPrivate': 'is_private',
        'GroupName': 'group_name',
        'DataType': 'data_type',
        'MinAllowedValue': 'min_allowed_value',
        'MaxAllowedValue': 'max_allowed_value',
        'IsQS': 'is_qs',
        'Decode': 'decode',
        'weight': 'weight',
        # ErrorCode specific mappings
        'ErrorCodeName': 'code',
        'ErrorCodeDescription': 'description',
    }

    # Source columns that must be non-empty for an error_codes record to be
    # loaded; records with None or '' in one of them are skipped.
    _ERROR_CODE_REQUIRED = ('ErrorCodeName', 'Code')

    # Table names are interpolated into SQL as bare identifiers (identifiers
    # cannot be bound as query parameters), so only plain unquoted
    # identifiers are accepted.
    _TABLE_NAME_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_$]*")

    def __init__(self):
        # Page size handed to execute_batch for every bulk insert.
        self.batch_size = 1000

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @classmethod
    def _check_table_name(cls, table_name: str) -> str:
        """Return ``table_name`` unchanged, or raise ValueError if it is not a plain identifier."""
        if not isinstance(table_name, str) or not cls._TABLE_NAME_RE.fullmatch(table_name):
            raise ValueError(f"Invalid table name: {table_name!r}")
        return table_name

    @staticmethod
    def _first_value(row):
        """Return the first column of a fetched row, for dict-style and sequence-style rows alike."""
        if isinstance(row, dict):
            return next(iter(row.values()))
        return row[0]

    @classmethod
    def _map_columns(cls, source_columns: List[str]) -> List[str]:
        """Translate source column names into target column names."""
        return [cls._COLUMN_MAPPINGS.get(col, col.lower()) for col in source_columns]

    @classmethod
    def _build_rows(cls, table_name: str, data: List[Dict],
                    source_columns: List[str]) -> List[tuple]:
        """Build one value tuple per record, restricted to ``source_columns``.

        For the ``error_codes`` table, records whose ``ErrorCodeName`` or
        ``Code`` value (when that column is being loaded) is None or ''
        are left out.
        """
        required = []
        if table_name == 'error_codes':
            required = [col for col in source_columns if col in cls._ERROR_CODE_REQUIRED]

        rows = []
        for record in data:
            if any(record[col] is None or record[col] == '' for col in required):
                continue
            rows.append(tuple(record[col] for col in source_columns))
        return rows

    @classmethod
    def _count_rows(cls, cursor, table_name: str) -> int:
        """Run ``SELECT COUNT(*)`` against ``vehicles.<table_name>`` on ``cursor``."""
        cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table_name}")
        return cls._first_value(cursor.fetchone())

    def _bulk_insert(self, query: str, rows: List[tuple],
                     replace_table: Optional[str] = None) -> Optional[int]:
        """Execute ``query`` for every row in batches and commit.

        Args:
            query: Parameterised INSERT statement.
            rows: One parameter tuple per statement execution.
            replace_table: When given, ``vehicles.<replace_table>`` is
                truncated (CASCADE) before inserting and its row count is
                read back after the commit.

        Returns:
            The row count of ``replace_table`` after the load, or None when
            no ``replace_table`` was given.
        """
        with db_connections.postgres_connection() as conn:
            # The cursor is closed on exit instead of being left open.
            with conn.cursor() as cursor:
                if replace_table is not None:
                    cursor.execute(f"TRUNCATE TABLE vehicles.{replace_table} CASCADE")
                execute_batch(cursor, query, rows, page_size=self.batch_size)
                conn.commit()
                if replace_table is not None:
                    return self._count_rows(cursor, replace_table)
        return None

    # ------------------------------------------------------------------
    # Public loaders
    # ------------------------------------------------------------------

    def load_reference_table(self, table_name: str, data: List[Dict],
                             clear_existing: bool = True) -> int:
        """Load data into a reference table.

        The keys of the first record define the source columns. They are
        mapped to target column names, and only those that exist in
        ``vehicles.<table_name>`` are inserted.

        Args:
            table_name: Target table inside the ``vehicles`` schema.
            data: Source records keyed by source column name.
            clear_existing: Truncate the table (CASCADE) before inserting.

        Returns:
            0 when ``data`` is empty or no column matches; otherwise the
            total row count of the table after the load.

        Raises:
            ValueError: If ``table_name`` is not a plain SQL identifier.
        """
        if not data:
            logger.warning("No data to load for table %s", table_name)
            return 0

        self._check_table_name(table_name)
        logger.info("Loading %d records into vehicles.%s", len(data), table_name)

        source_columns = list(data[0].keys())
        target_columns = self._map_columns(source_columns)

        with db_connections.postgres_connection() as conn:
            with conn.cursor() as cursor:
                if clear_existing:
                    cursor.execute(f"TRUNCATE TABLE vehicles.{table_name} CASCADE")
                    logger.info("Cleared existing data from vehicles.%s", table_name)

                # Check which columns exist in the target table. The table
                # name is a value here, so it is bound as a parameter.
                cursor.execute(
                    """
                    SELECT column_name
                    FROM information_schema.columns
                    WHERE table_schema = 'vehicles' AND table_name = %s
                    """,
                    (table_name,),
                )
                existing_columns = {self._first_value(row) for row in cursor.fetchall()}

                # Keep only (source, target) pairs whose target column exists.
                kept = [
                    (source, target)
                    for source, target in zip(source_columns, target_columns)
                    if target in existing_columns
                ]
                if not kept:
                    logger.warning("No matching columns found for table %s", table_name)
                    return 0

                kept_sources = [source for source, _ in kept]
                column_str = ','.join(target for _, target in kept)
                placeholders = ','.join(['%s'] * len(kept))
                query = f"""
                    INSERT INTO vehicles.{table_name} ({column_str})
                    VALUES ({placeholders})
                    ON CONFLICT DO NOTHING
                """

                data_tuples = self._build_rows(table_name, data, kept_sources)
                execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
                conn.commit()

                final_count = self._count_rows(cursor, table_name)

        logger.info("Successfully loaded %s records into vehicles.%s", final_count, table_name)
        return final_count

    def load_wmi_vin_schema_mappings(self, mappings: List[Dict]) -> int:
        """Load WMI to VIN Schema mappings.

        Replaces the contents of ``vehicles.wmi_vin_schemas``. A falsy
        ``YearFrom`` is stored as 1980 and a falsy ``YearTo`` as 2999.

        Returns:
            0 for empty input; otherwise the table's row count after the load.
        """
        if not mappings:
            return 0

        logger.info("Loading %d WMI-VinSchema mappings", len(mappings))

        query = """
            INSERT INTO vehicles.wmi_vin_schemas
            (wmi_id, vin_schema_id, year_from, year_to)
            VALUES (%s, %s, %s, %s)
            ON CONFLICT DO NOTHING
        """
        # Rows are built before the table is truncated so that a malformed
        # record fails before anything is cleared.
        data_tuples = [
            (
                mapping['WmiId'],
                mapping['VinSchemaId'],
                mapping['YearFrom'] or 1980,
                mapping['YearTo'] or 2999,
            )
            for mapping in mappings
        ]
        final_count = self._bulk_insert(query, data_tuples, replace_table='wmi_vin_schemas')

        logger.info("Successfully loaded %s WMI-VinSchema mappings", final_count)
        return final_count

    def load_make_model_relationships(self, relationships: List[Dict]) -> int:
        """Load Make-Model relationships.

        Replaces the contents of ``vehicles.make_models``.

        Returns:
            0 for empty input; otherwise the table's row count after the load.
        """
        if not relationships:
            return 0

        logger.info("Loading %d Make-Model relationships", len(relationships))

        query = """
            INSERT INTO vehicles.make_models (make_id, model_id)
            VALUES (%s, %s)
            ON CONFLICT DO NOTHING
        """
        data_tuples = [(rel['MakeId'], rel['ModelId']) for rel in relationships]
        final_count = self._bulk_insert(query, data_tuples, replace_table='make_models')

        logger.info("Successfully loaded %s Make-Model relationships", final_count)
        return final_count

    def load_wmi_make_relationships(self, relationships: List[Dict]) -> int:
        """Load WMI-Make relationships.

        Replaces the contents of ``vehicles.wmi_makes``.

        Returns:
            0 for empty input; otherwise the table's row count after the load.
        """
        if not relationships:
            return 0

        logger.info("Loading %d WMI-Make relationships", len(relationships))

        query = """
            INSERT INTO vehicles.wmi_makes (wmi_id, make_id)
            VALUES (%s, %s)
            ON CONFLICT DO NOTHING
        """
        data_tuples = [(rel['WmiId'], rel['MakeId']) for rel in relationships]
        final_count = self._bulk_insert(query, data_tuples, replace_table='wmi_makes')

        logger.info("Successfully loaded %s WMI-Make relationships", final_count)
        return final_count

    def load_model_years(self, model_years: List[Dict]) -> int:
        """Load model year availability data.

        Existing rows are kept; (model_id, year) conflicts are skipped.

        Returns:
            The number of input records (not the number actually inserted).
        """
        if not model_years:
            return 0

        logger.info("Loading %d model year records", len(model_years))

        query = """
            INSERT INTO vehicles.model_year (model_id, year)
            VALUES (%s, %s)
            ON CONFLICT (model_id, year) DO NOTHING
        """
        data_tuples = [(my['model_id'], my['year']) for my in model_years]
        self._bulk_insert(query, data_tuples)
        return len(model_years)

    def load_trims(self, trims: List[Dict]) -> int:
        """Load trim data.

        Existing rows are kept; conflicting rows are skipped.

        Returns:
            The number of input records (not the number actually inserted).
        """
        if not trims:
            return 0

        logger.info("Loading %d trim records", len(trims))

        query = """
            INSERT INTO vehicles.trim (model_year_id, name)
            VALUES (%s, %s)
            ON CONFLICT DO NOTHING
        """
        data_tuples = [(t['model_year_id'], t['name']) for t in trims]
        self._bulk_insert(query, data_tuples)
        return len(trims)

    def load_engines(self, engines: List[Dict]) -> int:
        """Load engine data.

        ``name`` is required on every record; the other fields default to
        NULL when absent. Rows conflicting on ``lower(name)`` are skipped.

        Returns:
            The number of input records (not the number actually inserted).
        """
        if not engines:
            return 0

        logger.info("Loading %d engine records", len(engines))

        # Batched like the other loaders. The generated ids were never
        # read, so the statement carries no RETURNING clause.
        query = """
            INSERT INTO vehicles.engine (name, code, displacement_l, cylinders, fuel_type, aspiration)
            VALUES (%s, %s, %s, %s, %s, %s)
            ON CONFLICT (lower(name)) DO NOTHING
        """
        data_tuples = [
            (
                engine['name'],
                engine.get('code'),
                engine.get('displacement_l'),
                engine.get('cylinders'),
                engine.get('fuel_type'),
                engine.get('aspiration'),
            )
            for engine in engines
        ]
        self._bulk_insert(query, data_tuples)
        return len(engines)

    def load_trim_engine_relationships(self, relationships: List[Dict]) -> int:
        """Load trim-engine relationships.

        Existing rows are kept; (trim_id, engine_id) conflicts are skipped.

        Returns:
            The number of input records (not the number actually inserted).
        """
        if not relationships:
            return 0

        logger.info("Loading %d trim-engine relationships", len(relationships))

        query = """
            INSERT INTO vehicles.trim_engine (trim_id, engine_id)
            VALUES (%s, %s)
            ON CONFLICT (trim_id, engine_id) DO NOTHING
        """
        data_tuples = [(rel['trim_id'], rel['engine_id']) for rel in relationships]
        self._bulk_insert(query, data_tuples)
        return len(relationships)

    def get_table_count(self, table_name: str) -> int:
        """Get count of records in a table of the ``vehicles`` schema.

        Raises:
            ValueError: If ``table_name`` is not a plain SQL identifier.
        """
        self._check_table_name(table_name)
        with db_connections.postgres_connection() as conn:
            with conn.cursor() as cursor:
                return self._count_rows(cursor, table_name)