Initial Commit
This commit is contained in:
337
mvp-platform-services/vehicles/etl/extractors/mssql_extractor.py
Executable file
337
mvp-platform-services/vehicles/etl/extractors/mssql_extractor.py
Executable file
@@ -0,0 +1,337 @@
|
||||
import logging
|
||||
from typing import List, Dict, Optional, Generator
|
||||
from ..connections import db_connections
|
||||
from ..utils.make_filter import MakeFilter
|
||||
from tqdm import tqdm
|
||||
|
||||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||||
|
||||
class MSSQLExtractor:
    """Extract data from MS SQL Server source database.

    Most queries are restricted to the makes allowed by ``make_filter``: the
    filter supplies a SQL predicate through ``get_sql_filter(column)`` which
    is interpolated into each statement.
    """

    def __init__(self, make_filter: "Optional[MakeFilter]" = None):
        """Create an extractor.

        Args:
            make_filter: Filter providing the allowed makes and the SQL
                predicate used to restrict queries. A default ``MakeFilter()``
                is created when omitted.
        """
        # Maximum number of rows per batch yielded by extract_patterns_data().
        self.batch_size = 10000
        # Compare against None explicitly so a caller-supplied filter is never
        # discarded merely because it evaluates as falsy.
        self.make_filter = MakeFilter() if make_filter is None else make_filter
        logger.info(f"Initialized MSSQL extractor with {len(self.make_filter.get_allowed_makes())} allowed makes")

    def extract_wmi_data(self) -> List[Dict]:
        """Extract WMI (World Manufacturer Identifier) data with make filtering.

        Returns rows whose ``PublicAvailabilityDate`` is not in the future and
        whose manufacturer is linked to at least one allowed make.

        NOTE(review): unlike extract_reference_table('Wmi'), this query does
        not additionally filter on ``w.MakeId`` — confirm that is intended.

        Returns:
            One dict per row keyed by column name, ordered by ``Id``.
        """
        logger.info("Extracting WMI data from source database with make filtering")

        query = f"""
            SELECT
                w.Id,
                w.Wmi,
                w.ManufacturerId,
                w.MakeId,
                w.VehicleTypeId,
                w.TruckTypeId,
                w.CountryId,
                w.PublicAvailabilityDate,
                w.NonCompliant,
                w.NonCompliantReason,
                w.CreatedOn,
                w.UpdatedOn,
                w.ProcessedOn
            FROM dbo.Wmi w
            WHERE w.PublicAvailabilityDate <= GETDATE()
              AND w.ManufacturerId IN (
                  SELECT DISTINCT mfr.Id
                  FROM dbo.Manufacturer mfr
                  JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
                  JOIN dbo.Make m ON mm.MakeId = m.Id
                  WHERE {self.make_filter.get_sql_filter('m.Name')}
              )
            ORDER BY w.Id
        """

        results = self._fetch_all(query)
        logger.info(f"Extracted {len(results)} WMI records")
        return results

    def extract_wmi_vin_schema_mappings(self) -> List[Dict]:
        """Extract WMI to VIN Schema mappings with year ranges and make filtering.

        A mapping is returned only when its WMI is publicly available, its
        manufacturer is linked to an allowed make, and the WMI's own
        ``MakeId`` is an allowed make.

        Returns:
            One dict per mapping, ordered by ``WmiId`` then ``VinSchemaId``.
        """
        logger.info("Extracting WMI-VinSchema mappings with make filtering")

        query = f"""
            SELECT
                wvs.WmiId,
                wvs.VinSchemaId,
                wvs.YearFrom,
                wvs.YearTo,
                w.Wmi,
                vs.Name as SchemaName
            FROM dbo.Wmi_VinSchema wvs
            JOIN dbo.Wmi w ON wvs.WmiId = w.Id
            JOIN dbo.VinSchema vs ON wvs.VinSchemaId = vs.Id
            WHERE w.PublicAvailabilityDate <= GETDATE()
              AND w.ManufacturerId IN (
                  SELECT DISTINCT mfr.Id
                  FROM dbo.Manufacturer mfr
                  JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
                  JOIN dbo.Make m ON mm.MakeId = m.Id
                  WHERE {self.make_filter.get_sql_filter('m.Name')}
              )
              AND w.MakeId IN (
                  SELECT Id FROM dbo.Make
                  WHERE {self.make_filter.get_sql_filter('Name')}
              )
            ORDER BY wvs.WmiId, wvs.VinSchemaId
        """

        results = self._fetch_all(query)
        logger.info(f"Extracted {len(results)} WMI-VinSchema mappings (filtered by allowed makes)")
        return results

    def extract_patterns_data(self) -> Generator[List[Dict], None, None]:
        """Extract pattern data in batches with make filtering.

        The data query is executed once and streamed with ``fetchmany``.
        Paging with OFFSET/FETCH ordered only by ``p.Id`` is unreliable here
        because the joins can repeat a pattern id, so page boundaries could
        duplicate or skip rows; it also re-runs the full join for every page.

        Yields:
            Lists of at most ``self.batch_size`` row dicts, ordered by
            pattern ``Id``. The number of batches is bounded by the row count
            taken before streaming starts.
        """
        logger.info("Extracting pattern data from source database with make filtering")

        # Shared FROM/WHERE so the count and the data query cannot drift apart.
        source_sql = f"""
            FROM dbo.Pattern p
            JOIN dbo.Element e ON p.ElementId = e.Id
            JOIN dbo.VinSchema vs ON p.VinSchemaId = vs.Id
            JOIN dbo.Wmi_VinSchema wvs ON vs.Id = wvs.VinSchemaId
            JOIN dbo.Wmi w ON wvs.WmiId = w.Id
            JOIN dbo.Wmi_Make wm ON w.Id = wm.WmiId
            JOIN dbo.Make m ON wm.MakeId = m.Id
            WHERE {self.make_filter.get_sql_filter('m.Name')}
              AND e.Id IN (26, 27, 28, 18, 24)
        """

        # First get the total count with filtering (drives the progress bar).
        count_query = f"""
            SELECT COUNT(*) as total
            {source_sql}
        """

        with db_connections.mssql_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(count_query)
            total_row = self._row_to_dict(cursor, cursor.fetchone())
        total_count = total_row.get('total', 0)

        logger.info(f"Total patterns to extract (filtered): {total_count}")

        if not total_count:
            return

        query = f"""
            SELECT
                p.Id,
                p.VinSchemaId,
                p.Keys,
                p.ElementId,
                p.AttributeId,
                e.Name as ElementName,
                e.weight,
                e.GroupName,
                vs.Name as SchemaName,
                w.Wmi,
                m.Name as MakeName
            {source_sql}
            ORDER BY p.Id
        """

        with db_connections.mssql_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(query)

            for _ in tqdm(range(0, total_count, self.batch_size), desc="Extracting filtered patterns"):
                rows = cursor.fetchmany(self.batch_size)
                if not rows:
                    break
                yield self._rows_to_dicts(cursor, rows)

    def extract_elements_data(self) -> List[Dict]:
        """Extract element definitions.

        Returns:
            One dict per row of ``dbo.Element`` (no make filtering), ordered
            by ``Id``.
        """
        logger.info("Extracting element data")

        query = """
            SELECT
                Id,
                Name,
                Code,
                LookupTable,
                Description,
                IsPrivate,
                GroupName,
                DataType,
                MinAllowedValue,
                MaxAllowedValue,
                IsQS,
                Decode,
                weight
            FROM dbo.Element
            ORDER BY Id
        """

        results = self._fetch_all(query)
        logger.info(f"Extracted {len(results)} element definitions")
        return results

    def extract_reference_table(self, table_name: str) -> List[Dict]:
        """Extract data from a reference table, make-filtered where applicable.

        ``Manufacturer``, ``Make``, ``Model`` and ``Wmi`` are restricted to
        the allowed makes; every other table is read in full.

        Args:
            table_name: Unqualified table name in the ``dbo`` schema.

        Returns:
            One dict per row, ordered by ``Id``.

        Raises:
            ValueError: If ``table_name`` is not a plain identifier. The name
                is interpolated into SQL, so anything else is rejected.
        """
        if not (isinstance(table_name, str) and table_name.isidentifier()):
            raise ValueError(f"Invalid reference table name: {table_name!r}")

        filtered = table_name in ('Manufacturer', 'Make', 'Model', 'Wmi')
        logger.info(f"Extracting data from {table_name}" + (" with make filtering" if filtered else ""))

        # Apply make filtering - filter by Make brand names
        if table_name == 'Manufacturer':
            # Extract manufacturers linked to filtered makes only
            query = f"""
                SELECT DISTINCT mfr.* FROM dbo.Manufacturer mfr
                JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
                JOIN dbo.Make m ON mm.MakeId = m.Id
                WHERE {self.make_filter.get_sql_filter('m.Name')}
                ORDER BY mfr.Id
            """
        elif table_name == 'Make':
            # Filter makes directly by brand names
            query = f"""
                SELECT * FROM dbo.Make
                WHERE {self.make_filter.get_sql_filter('Name')}
                ORDER BY Id
            """
        elif table_name == 'Model':
            # Filter models by allowed make brand names. An IN subquery is
            # used instead of a join so a model linked to several allowed
            # makes is returned once rather than once per link.
            query = f"""
                SELECT md.* FROM dbo.Model md
                WHERE md.Id IN (
                    SELECT mm.ModelId
                    FROM dbo.Make_Model mm
                    JOIN dbo.Make m ON mm.MakeId = m.Id
                    WHERE {self.make_filter.get_sql_filter('m.Name')}
                )
                ORDER BY md.Id
            """
        elif table_name == 'Wmi':
            # Filter WMI records by allowed manufacturers (linked to makes)
            # AND by the WMI's own make
            query = f"""
                SELECT w.* FROM dbo.Wmi w
                WHERE w.PublicAvailabilityDate <= GETDATE()
                  AND w.ManufacturerId IN (
                      SELECT DISTINCT mfr.Id
                      FROM dbo.Manufacturer mfr
                      JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
                      JOIN dbo.Make m ON mm.MakeId = m.Id
                      WHERE {self.make_filter.get_sql_filter('m.Name')}
                  )
                  AND w.MakeId IN (
                      SELECT Id FROM dbo.Make
                      WHERE {self.make_filter.get_sql_filter('Name')}
                  )
                ORDER BY w.Id
            """
        else:
            # No filtering for other reference tables
            query = f"SELECT * FROM dbo.{table_name} ORDER BY Id"

        results = self._fetch_all(query)
        logger.info(
            f"Extracted {len(results)} records from {table_name}"
            + (" (filtered by allowed makes)" if filtered else "")
        )
        return results

    def extract_make_model_relationships(self) -> List[Dict]:
        """Extract Make-Model relationships with make filtering.

        Returns:
            One dict per ``Make_Model`` link whose make is allowed, including
            the make and model names, ordered by ``MakeId`` then ``ModelId``.
        """
        logger.info("Extracting Make-Model relationships with make filtering")

        query = f"""
            SELECT
                mm.MakeId,
                mm.ModelId,
                m.Name as MakeName,
                md.Name as ModelName
            FROM dbo.Make_Model mm
            JOIN dbo.Make m ON mm.MakeId = m.Id
            JOIN dbo.Model md ON mm.ModelId = md.Id
            WHERE {self.make_filter.get_sql_filter('m.Name')}
            ORDER BY mm.MakeId, mm.ModelId
        """

        results = self._fetch_all(query)
        logger.info(f"Extracted {len(results)} Make-Model relationships (filtered by allowed makes)")
        return results

    def extract_wmi_make_relationships(self) -> List[Dict]:
        """Extract WMI-Make relationships with make filtering.

        A link is returned only when the WMI is publicly available, its
        manufacturer is linked to an allowed make, the WMI's own ``MakeId``
        is allowed, and the linked make itself is allowed.

        Returns:
            One dict per ``Wmi_Make`` link, ordered by ``WmiId`` then
            ``MakeId``.
        """
        logger.info("Extracting WMI-Make relationships with make filtering")

        query = f"""
            SELECT
                wm.WmiId,
                wm.MakeId,
                w.Wmi,
                m.Name as MakeName
            FROM dbo.Wmi_Make wm
            JOIN dbo.Wmi w ON wm.WmiId = w.Id
            JOIN dbo.Make m ON wm.MakeId = m.Id
            WHERE w.PublicAvailabilityDate <= GETDATE()
              AND w.ManufacturerId IN (
                  SELECT DISTINCT mfr.Id
                  FROM dbo.Manufacturer mfr
                  JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
                  JOIN dbo.Make mk ON mm.MakeId = mk.Id
                  WHERE {self.make_filter.get_sql_filter('mk.Name')}
              )
              AND w.MakeId IN (
                  SELECT Id FROM dbo.Make
                  WHERE {self.make_filter.get_sql_filter('Name')}
              )
              AND m.Id IN (
                  SELECT Id FROM dbo.Make
                  WHERE {self.make_filter.get_sql_filter('Name')}
              )
            ORDER BY wm.WmiId, wm.MakeId
        """

        results = self._fetch_all(query)
        logger.info(f"Extracted {len(results)} WMI-Make relationships (filtered by allowed makes)")
        return results

    def _fetch_all(self, query: str) -> List[Dict]:
        """Run *query* on a new MSSQL connection and return every row as a dict."""
        with db_connections.mssql_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(query)
            return self._rows_to_dicts(cursor, cursor.fetchall())

    def _rows_to_dicts(self, cursor, rows) -> List[Dict]:
        """Convert pyodbc rows to list of dicts using cursor description.

        Args:
            cursor: Cursor whose ``description`` names the result columns.
            rows: Fetched rows; ``None`` or an empty sequence gives ``[]``.
        """
        if not rows:
            return []
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in rows]

    def _row_to_dict(self, cursor, row) -> Dict:
        """Convert single pyodbc row to dict; ``None`` gives an empty dict."""
        if row is None:
            return {}
        columns = [col[0] for col in cursor.description]
        return dict(zip(columns, row))
|
||||
Reference in New Issue
Block a user