Initial Commit

This commit is contained in:
Eric Gullickson
2025-09-17 16:09:15 -05:00
parent 0cdb9803de
commit a052040e3a
373 changed files with 437090 additions and 6773 deletions

View File

@@ -0,0 +1,337 @@
import logging
from typing import List, Dict, Optional, Generator
from ..connections import db_connections
from ..utils.make_filter import MakeFilter
from tqdm import tqdm
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class MSSQLExtractor:
    """Extract data from MS SQL Server source database.

    Queries that relate to makes are restricted with the SQL predicate
    produced by ``self.make_filter.get_sql_filter(column)``. Result rows are
    returned as plain dicts keyed by the column names in
    ``cursor.description``.
    """

    def __init__(self, make_filter: Optional["MakeFilter"] = None):
        """Create an extractor.

        Args:
            make_filter: Filter supplying the allowed makes; a default
                ``MakeFilter()`` is created when omitted.
        """
        # Number of rows requested per page by extract_patterns_data().
        self.batch_size = 10000
        self.make_filter = make_filter or MakeFilter()
        logger.info(
            "Initialized MSSQL extractor with %d allowed makes",
            len(self.make_filter.get_allowed_makes()),
        )

    # ------------------------------------------------------------------
    # Public extraction API
    # ------------------------------------------------------------------

    def extract_wmi_data(self) -> List[Dict]:
        """Extract WMI (World Manufacturer Identifier) data with make filtering.

        Returns:
            One dict per ``dbo.Wmi`` row whose ``PublicAvailabilityDate`` is
            not in the future and whose manufacturer is linked to an allowed
            make, ordered by ``Id``.
        """
        logger.info("Extracting WMI data from source database with make filtering")
        # NOTE(review): unlike the other WMI queries in this class, this one
        # filters on ManufacturerId only and not additionally on w.MakeId.
        # Confirm that the broader result is intended.
        query = f"""
            SELECT
                w.Id,
                w.Wmi,
                w.ManufacturerId,
                w.MakeId,
                w.VehicleTypeId,
                w.TruckTypeId,
                w.CountryId,
                w.PublicAvailabilityDate,
                w.NonCompliant,
                w.NonCompliantReason,
                w.CreatedOn,
                w.UpdatedOn,
                w.ProcessedOn
            FROM dbo.Wmi w
            WHERE w.PublicAvailabilityDate <= GETDATE()
            AND w.ManufacturerId IN ({self._allowed_manufacturer_ids_sql()})
            ORDER BY w.Id
        """
        results = self._fetch_all(query)
        logger.info("Extracted %d WMI records", len(results))
        return results

    def extract_wmi_vin_schema_mappings(self) -> List[Dict]:
        """Extract WMI to VIN Schema mappings with year ranges and make filtering.

        Returns:
            One dict per ``dbo.Wmi_VinSchema`` row (plus the WMI code and
            schema name) whose WMI passes both the manufacturer and the make
            filter, ordered by ``WmiId, VinSchemaId``.
        """
        logger.info("Extracting WMI-VinSchema mappings with make filtering")
        query = f"""
            SELECT
                wvs.WmiId,
                wvs.VinSchemaId,
                wvs.YearFrom,
                wvs.YearTo,
                w.Wmi,
                vs.Name as SchemaName
            FROM dbo.Wmi_VinSchema wvs
            JOIN dbo.Wmi w ON wvs.WmiId = w.Id
            JOIN dbo.VinSchema vs ON wvs.VinSchemaId = vs.Id
            WHERE w.PublicAvailabilityDate <= GETDATE()
            AND w.ManufacturerId IN ({self._allowed_manufacturer_ids_sql()})
            AND w.MakeId IN ({self._allowed_make_ids_sql()})
            ORDER BY wvs.WmiId, wvs.VinSchemaId
        """
        results = self._fetch_all(query)
        logger.info(
            "Extracted %d WMI-VinSchema mappings (filtered by allowed makes)",
            len(results),
        )
        return results

    def extract_patterns_data(self) -> Generator[List[Dict], None, None]:
        """Extract pattern data in batches with make filtering.

        Yields:
            Lists of row dicts, each list holding at most ``self.batch_size``
            rows. Iteration stops early if a page comes back empty.
        """
        logger.info("Extracting pattern data from source database with make filtering")
        # Count and page queries share one FROM/WHERE so they cannot drift.
        source_sql = self._patterns_source_sql()

        # First get the total count with filtering
        count_query = f"""
            SELECT COUNT(*) as total
            {source_sql}
        """
        with db_connections.mssql_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(count_query)
            total_row = self._row_to_dict(cursor, cursor.fetchone())
        total_count = total_row.get('total', 0)
        logger.info("Total patterns to extract (filtered): %s", total_count)

        # The joins can emit several rows per pattern, so p.Id alone is not a
        # unique sort key. OFFSET/FETCH paging needs a deterministic order or
        # rows may repeat or go missing between pages; w.Id and m.Id break
        # the ties (rows equal on all three keys select identical columns).
        select_sql = f"""
            SELECT
                p.Id,
                p.VinSchemaId,
                p.Keys,
                p.ElementId,
                p.AttributeId,
                e.Name as ElementName,
                e.weight,
                e.GroupName,
                vs.Name as SchemaName,
                w.Wmi,
                m.Name as MakeName
            {source_sql}
            ORDER BY p.Id, w.Id, m.Id
        """
        with db_connections.mssql_connection() as conn:
            cursor = conn.cursor()
            offsets = range(0, total_count, self.batch_size)
            for offset in tqdm(offsets, desc="Extracting filtered patterns"):
                # Only integers are interpolated here; the filter text is
                # never passed through str.format(), so braces in it are safe.
                cursor.execute(
                    f"{select_sql}\n"
                    f"OFFSET {offset} ROWS FETCH NEXT {self.batch_size} ROWS ONLY"
                )
                rows = cursor.fetchall()
                if not rows:
                    break
                yield self._rows_to_dicts(cursor, rows)

    def extract_elements_data(self) -> List[Dict]:
        """Extract element definitions.

        Returns:
            One dict per ``dbo.Element`` row, ordered by ``Id`` (no make
            filtering is applied).
        """
        logger.info("Extracting element data")
        query = """
            SELECT
                Id,
                Name,
                Code,
                LookupTable,
                Description,
                IsPrivate,
                GroupName,
                DataType,
                MinAllowedValue,
                MaxAllowedValue,
                IsQS,
                Decode,
                weight
            FROM dbo.Element
            ORDER BY Id
        """
        results = self._fetch_all(query)
        logger.info("Extracted %d element definitions", len(results))
        return results

    def extract_reference_table(self, table_name: str) -> List[Dict]:
        """Extract data from a reference table with make filtering.

        ``Manufacturer``, ``Make``, ``Model`` and ``Wmi`` are filtered by the
        allowed makes; any other table is read in full, ordered by ``Id``.

        Args:
            table_name: Unqualified table name inside the ``dbo`` schema.

        Returns:
            One dict per selected row.

        Raises:
            ValueError: If ``table_name`` is not a plain ASCII identifier
                (it is interpolated into the SQL text).
        """
        logger.info(f"Extracting data from {table_name} with make filtering")
        results = self._fetch_all(self._reference_table_query(table_name))
        # The message wording is kept as-is; note it is also emitted for
        # tables that take the unfiltered branch.
        logger.info(
            f"Extracted {len(results)} records from {table_name} (filtered by allowed makes)"
        )
        return results

    def extract_make_model_relationships(self) -> List[Dict]:
        """Extract Make-Model relationships with make filtering.

        Returns:
            One dict per ``dbo.Make_Model`` row whose make is allowed, with
            the make and model names, ordered by ``MakeId, ModelId``.
        """
        logger.info("Extracting Make-Model relationships with make filtering")
        query = f"""
            SELECT
                mm.MakeId,
                mm.ModelId,
                m.Name as MakeName,
                md.Name as ModelName
            FROM dbo.Make_Model mm
            JOIN dbo.Make m ON mm.MakeId = m.Id
            JOIN dbo.Model md ON mm.ModelId = md.Id
            WHERE {self.make_filter.get_sql_filter('m.Name')}
            ORDER BY mm.MakeId, mm.ModelId
        """
        results = self._fetch_all(query)
        logger.info(
            "Extracted %d Make-Model relationships (filtered by allowed makes)",
            len(results),
        )
        return results

    def extract_wmi_make_relationships(self) -> List[Dict]:
        """Extract WMI-Make relationships with make filtering.

        Returns:
            One dict per ``dbo.Wmi_Make`` row where the WMI's manufacturer,
            the WMI's own make and the linked make all pass the filter,
            ordered by ``WmiId, MakeId``.
        """
        logger.info("Extracting WMI-Make relationships with make filtering")
        allowed_make_ids = self._allowed_make_ids_sql()
        query = f"""
            SELECT
                wm.WmiId,
                wm.MakeId,
                w.Wmi,
                m.Name as MakeName
            FROM dbo.Wmi_Make wm
            JOIN dbo.Wmi w ON wm.WmiId = w.Id
            JOIN dbo.Make m ON wm.MakeId = m.Id
            WHERE w.PublicAvailabilityDate <= GETDATE()
            AND w.ManufacturerId IN ({self._allowed_manufacturer_ids_sql()})
            AND w.MakeId IN ({allowed_make_ids})
            AND m.Id IN ({allowed_make_ids})
            ORDER BY wm.WmiId, wm.MakeId
        """
        results = self._fetch_all(query)
        logger.info(
            "Extracted %d WMI-Make relationships (filtered by allowed makes)",
            len(results),
        )
        return results

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _fetch_all(self, query: str) -> List[Dict]:
        """Run ``query`` on a fresh MSSQL connection and return all rows as dicts."""
        with db_connections.mssql_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(query)
            rows = cursor.fetchall()
            return self._rows_to_dicts(cursor, rows)

    def _allowed_manufacturer_ids_sql(self) -> str:
        """Return a subquery selecting ids of manufacturers linked to an allowed make."""
        # Alias "mk" keeps the subquery from shadowing an outer "m" alias.
        return f"""
                SELECT DISTINCT mfr.Id
                FROM dbo.Manufacturer mfr
                JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
                JOIN dbo.Make mk ON mm.MakeId = mk.Id
                WHERE {self.make_filter.get_sql_filter('mk.Name')}
            """

    def _allowed_make_ids_sql(self) -> str:
        """Return a subquery selecting the ids of the allowed makes."""
        return f"""
                SELECT Id FROM dbo.Make
                WHERE {self.make_filter.get_sql_filter('Name')}
            """

    def _patterns_source_sql(self) -> str:
        """Return the FROM/WHERE part shared by the pattern count and page queries."""
        # NOTE(review): the meaning of element ids 26, 27, 28, 18, 24 is not
        # visible in this file. Confirm against dbo.Element.
        return f"""
            FROM dbo.Pattern p
            JOIN dbo.Element e ON p.ElementId = e.Id
            JOIN dbo.VinSchema vs ON p.VinSchemaId = vs.Id
            JOIN dbo.Wmi_VinSchema wvs ON vs.Id = wvs.VinSchemaId
            JOIN dbo.Wmi w ON wvs.WmiId = w.Id
            JOIN dbo.Wmi_Make wm ON w.Id = wm.WmiId
            JOIN dbo.Make m ON wm.MakeId = m.Id
            WHERE {self.make_filter.get_sql_filter('m.Name')}
            AND e.Id IN (26, 27, 28, 18, 24)
        """

    @staticmethod
    def _is_safe_identifier(name) -> bool:
        """Return True if ``name`` is a non-empty ASCII ``[A-Za-z_][A-Za-z0-9_]*`` string."""
        return (
            isinstance(name, str)
            and name.isidentifier()
            and all(ord(ch) < 128 for ch in name)
        )

    def _reference_table_query(self, table_name: str) -> str:
        """Build the SELECT statement used by extract_reference_table()."""
        # Apply make filtering - filter by Make brand names
        if table_name == 'Manufacturer':
            # Extract manufacturers linked to filtered makes only
            return f"""
                SELECT DISTINCT mfr.* FROM dbo.Manufacturer mfr
                JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
                JOIN dbo.Make m ON mm.MakeId = m.Id
                WHERE {self.make_filter.get_sql_filter('m.Name')}
                ORDER BY mfr.Id
            """
        if table_name == 'Make':
            # Filter makes directly by brand names
            return f"""
                SELECT * FROM dbo.Make
                WHERE {self.make_filter.get_sql_filter('Name')}
                ORDER BY Id
            """
        if table_name == 'Model':
            # Filter models by allowed make brand names
            return f"""
                SELECT md.* FROM dbo.Model md
                JOIN dbo.Make_Model mm ON md.Id = mm.ModelId
                JOIN dbo.Make m ON mm.MakeId = m.Id
                WHERE {self.make_filter.get_sql_filter('m.Name')}
                ORDER BY md.Id
            """
        if table_name == 'Wmi':
            # Filter WMI records by allowed manufacturers (linked to makes)
            # AND by allowed makes directly
            return f"""
                SELECT w.* FROM dbo.Wmi w
                WHERE w.PublicAvailabilityDate <= GETDATE()
                AND w.ManufacturerId IN ({self._allowed_manufacturer_ids_sql()})
                AND w.MakeId IN ({self._allowed_make_ids_sql()})
                ORDER BY w.Id
            """
        # No filtering for other reference tables. The name goes straight
        # into the SQL text, so anything but a plain identifier is refused.
        if not self._is_safe_identifier(table_name):
            raise ValueError(f"Invalid reference table name: {table_name!r}")
        return f"SELECT * FROM dbo.{table_name} ORDER BY Id"

    def _rows_to_dicts(self, cursor, rows) -> List[Dict]:
        """Convert pyodbc rows to list of dicts using cursor description."""
        if not rows:
            return []
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in rows]

    def _row_to_dict(self, cursor, row) -> Dict:
        """Convert single pyodbc row to dict; ``None`` becomes an empty dict."""
        if row is None:
            return {}
        columns = [col[0] for col in cursor.description]
        return dict(zip(columns, row))