Initial Commit

This commit is contained in:
Eric Gullickson
2025-09-17 16:09:15 -05:00
parent 0cdb9803de
commit a052040e3a
373 changed files with 437090 additions and 6773 deletions

View File

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python3
"""
ETL Package Main Entry Point
Allows running ETL package as a module: python -m etl
"""
from .main import cli
if __name__ == "__main__":
    # Hand control to the CLI entry point imported from .main when this
    # module is executed directly (e.g. `python -m etl`).
    cli()

View File

@@ -0,0 +1,376 @@
import logging
from typing import Dict, List, Set, Optional
from datetime import datetime
from dateutil import tz
from tqdm import tqdm
from ..connections import db_connections
from ..extractors.mssql_extractor import MSSQLExtractor
from ..loaders.postgres_loader import PostgreSQLLoader
from ..config import config
from ..utils.make_filter import MakeFilter
logger = logging.getLogger(__name__)


class NormalizedVehicleBuilder:
    """Build normalized vehicle schema from pattern-based NHTSA source data.

    The build runs three steps in order: load the make/model reference
    tables, derive model-year availability from WMI year ranges, then
    extract trims and engines from pattern rows.
    """

    # Pattern element IDs inspected by _extract_trims_and_engines.
    _TRIM_ELEMENT_ID = 28
    _ENGINE_ELEMENT_ID = 18
    # A WMI year range must reach into this many past years to count as evidence.
    _RECENT_YEARS_WINDOW = 5
    # Cap on the number of model-year rows a single trim is attached to.
    _MAX_MODEL_YEARS_PER_TRIM = 5

    def __init__(self, make_filter: Optional[MakeFilter] = None):
        """Create the builder with its extractor and loader.

        Args:
            make_filter: Filter deciding which makes are processed. A default
                MakeFilter() is created when omitted.
        """
        # Compare against None explicitly so a falsy-but-valid filter object
        # supplied by the caller is never swapped for the default.
        self.make_filter = make_filter if make_filter is not None else MakeFilter()
        self.extractor = MSSQLExtractor(self.make_filter)
        self.loader = PostgreSQLLoader()
        logger.info(
            f"Initialized normalized vehicle builder with make filtering: {len(self.make_filter.get_allowed_makes())} allowed makes"
        )

    def build(self):
        """Main normalized vehicle schema building process.

        Returns:
            True when all three steps complete.

        Raises:
            Exception: Whatever a step raised; it is logged and re-raised.
        """
        logger.info("Starting normalized vehicle schema build")
        try:
            # Step 1: Clear and load reference tables
            logger.info("Step 1: Loading reference tables (makes, models, relationships)")
            self._load_reference_tables()
            # Step 2: Extract year availability from WMI data
            logger.info("Step 2: Building model-year availability from WMI data")
            self._build_model_year_availability()
            # Step 3: Extract trims and engines from pattern analysis
            logger.info("Step 3: Extracting trims and engines from pattern data")
            self._extract_trims_and_engines()
            logger.info("Normalized vehicle schema build completed successfully")
            return True
        except Exception as e:
            logger.error(f"Normalized schema build failed: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    def _load_reference_tables(self):
        """Load basic reference tables: makes, models with proper relationships.

        Models are only loaded when both the model rows and the make-model
        relationships are non-empty; models without a related make are dropped.
        """
        # Load makes (filtered by make_filter)
        makes_data = self.extractor.extract_reference_table('Make')
        if makes_data:
            self.loader.load_reference_table('make', makes_data)
            logger.info(f"Loaded {len(makes_data)} makes")

        # Get make-model relationships first
        make_model_rels = self.extractor.extract_make_model_relationships()
        # Load models with make_id populated from relationships
        models_data = self.extractor.extract_reference_table('Model')
        if not (models_data and make_model_rels):
            logger.warning("No models or relationships loaded")
            return

        # Mapping model_id -> make_id (a later relationship row wins on repeats)
        model_to_make = {rel['ModelId']: rel['MakeId'] for rel in make_model_rels}
        # Add make_id to each model record (the extracted dicts are updated in place)
        for model in models_data:
            model['MakeId'] = model_to_make.get(model['Id'])
        # Filter out models without make_id (orphaned models)
        valid_models = [m for m in models_data if m.get('MakeId') is not None]
        self.loader.load_reference_table('model', valid_models)
        logger.info(f"Loaded {len(valid_models)} models with make relationships")
        logger.info(f"Filtered out {len(models_data) - len(valid_models)} orphaned models")

    def _build_model_year_availability(self):
        """Build model-year availability from WMI year ranges with realistic constraints.

        A model gets a (model, year) record for every year covered by a WMI
        mapping that has both YearFrom and YearTo set and overlaps the window
        [current_year - 5, current_year + 1]. Years after a model's
        discontinuation year (see _get_discontinued_models) are removed.
        """
        logger.info("Extracting model-year availability from WMI data with realistic year bounds")
        # Define realistic year constraints
        current_year = datetime.now().year
        max_year = current_year + 1  # Allow next model year
        min_year = current_year - 40  # Reasonable historical range (40 years back)
        # NOTE(review): min_year is only reported in this log line; the
        # filtering below uses recent_threshold as its lower bound.
        logger.info(f"Using realistic year range: {min_year} to {max_year}")

        # Get WMI data with year ranges
        wmi_data = self.extractor.extract_wmi_vin_schema_mappings()
        # Get make-model relationships to map WMI to models
        make_model_rels = self.extractor.extract_make_model_relationships()
        wmi_make_rels = self.extractor.extract_wmi_make_relationships()

        # Build make -> models mapping
        make_to_models = {}
        for rel in make_model_rels:
            make_to_models.setdefault(rel['MakeId'], []).append(rel['ModelId'])

        # Build WMI -> models mapping via makes
        wmi_to_models = {}
        for wmi_make in wmi_make_rels:
            wmi_id = wmi_make['WmiId']
            make_id = wmi_make['MakeId']
            if make_id in make_to_models:
                wmi_to_models.setdefault(wmi_id, []).extend(make_to_models[make_id])

        # Extremely conservative approach: Only allow models with explicit recent year ranges
        logger.info("Building model-year availability - using only models with EXPLICIT recent VIN pattern evidence")
        # Strategy: only patterns with explicit (not open-ended) ranges from the last few years
        recent_threshold = current_year - self._RECENT_YEARS_WINDOW

        recent_models_with_years = {}  # model_id -> set of years with evidence
        for wmi_mapping in wmi_data:
            year_from = wmi_mapping['YearFrom']
            year_to = wmi_mapping['YearTo']
            # Skip patterns without explicit year ranges (open-ended)
            if year_from is None or year_to is None:
                continue
            # Only consider WMI patterns that have recent, explicit activity
            if year_to < recent_threshold or year_from > max_year:
                continue
            model_ids = wmi_to_models.get(wmi_mapping['WmiId'])
            if not model_ids:
                continue
            # Years with evidence, constrained to the window; identical for
            # every model of this WMI, so compute it once.
            evidence_years = range(max(year_from, recent_threshold), min(year_to, max_year) + 1)
            for model_id in model_ids:
                recent_models_with_years.setdefault(model_id, set()).update(evidence_years)

        logger.info(f"Found {len(recent_models_with_years)} models with explicit recent VIN pattern evidence (patterns with defined year ranges since {recent_threshold})")

        # Create model-year combinations only for years with actual VIN pattern evidence
        # Apply business rules to exclude historically discontinued models
        discontinued_models = self._get_discontinued_models()
        model_years = []
        for model_id, years_with_evidence in recent_models_with_years.items():
            if model_id in discontinued_models:
                # Named last_year so the max_year bound above is not overwritten.
                last_year = discontinued_models[model_id]
                logger.info(f"Applying discontinuation rule: Model ID {model_id} discontinued after {last_year}")
                # Only include years up to discontinuation year
                years_with_evidence = {y for y in years_with_evidence if y <= last_year}
            # Sorted so the records are produced in a reproducible order.
            model_years.extend(
                {'model_id': model_id, 'year': year} for year in sorted(years_with_evidence)
            )
        logger.info(f"Created {len(model_years)} model-year combinations based on explicit VIN pattern evidence")

        # Each model_id is a unique dict key and its years form a set, so the
        # (model_id, year) pairs are already free of duplicates.
        if model_years:
            self.loader.load_model_years(model_years)
            logger.info(f"Generated {len(model_years)} model-year availability records")

    def _extract_trims_and_engines(self):
        """Extract trims and engines from pattern analysis.

        Pattern rows are read batch by batch. Rows of allowed makes are
        grouped by (VinSchemaId, MakeName); element 28 contributes trim names
        and element 18 engine names. Engines are loaded once per distinct
        (whitespace-stripped) name; trims are mapped onto model-year rows via
        _map_trims_to_model_years before loading.
        """
        logger.info("Extracting trims and engines from pattern data")
        # Get model-year IDs for mapping
        model_year_mapping = self._get_model_year_mapping()
        trims_data = []
        engines_data = []
        seen_engine_names = set()

        # Process patterns in batches
        for pattern_batch in self.extractor.extract_patterns_data():
            logger.info(f"Processing pattern batch: {len(pattern_batch)} patterns")
            # Group patterns by (vin schema, make) combination
            vehicle_combinations = {}
            for pattern in pattern_batch:
                element_id = pattern['ElementId']
                attribute_id = pattern.get('AttributeId', '')
                make_name = pattern.get('MakeName', '')
                # Skip if not allowed make
                if not self.make_filter.is_make_allowed(make_name):
                    continue
                vin_schema_id = pattern['VinSchemaId']
                key = (vin_schema_id, make_name)
                if key not in vehicle_combinations:
                    vehicle_combinations[key] = {
                        'make_name': make_name,
                        'vin_schema_id': vin_schema_id,
                        'trims': set(),
                        'engines': set(),
                    }
                if not attribute_id:
                    continue
                # Normalise before storing so that names differing only in
                # surrounding whitespace collapse into one entry.
                cleaned_name = str(attribute_id).strip()
                if not cleaned_name:
                    continue
                if element_id == self._TRIM_ELEMENT_ID:  # Trim
                    vehicle_combinations[key]['trims'].add(cleaned_name)
                elif element_id == self._ENGINE_ELEMENT_ID:  # Engine
                    vehicle_combinations[key]['engines'].add(cleaned_name)

            # Convert to trim/engine records (sorted for a reproducible order)
            for combo in vehicle_combinations.values():
                for trim_name in sorted(combo['trims']):
                    trims_data.append({
                        'name': trim_name,
                        'make_name': combo['make_name'],  # temporary for mapping
                        'source_schema': combo['vin_schema_id'],
                    })
                for engine_name in sorted(combo['engines']):
                    if engine_name in seen_engine_names:
                        continue
                    seen_engine_names.add(engine_name)
                    engines_data.append({
                        'name': engine_name,
                        'code': None,
                        'displacement_l': None,
                        'cylinders': None,
                        'fuel_type': None,
                        'aspiration': None,
                    })

        # Load engines first (they're independent)
        if engines_data:
            self.loader.load_engines(engines_data)
            logger.info(f"Loaded {len(engines_data)} unique engines")

        # For trims, we need to map them to actual model_year records
        if trims_data:
            simplified_trims = self._map_trims_to_model_years(trims_data, model_year_mapping)
            if simplified_trims:
                self.loader.load_trims(simplified_trims)
                logger.info(f"Loaded {len(simplified_trims)} trims")

    def _get_model_year_mapping(self) -> Dict:
        """Get mapping of model_year records for trim association.

        Returns:
            Dict keyed by (make_name, year) whose value is a model_year id.
            Several models of one make share a key, so the id of the last
            row fetched for that (make, year) is the one kept.
        """
        query = """
            SELECT my.id, my.model_id, my.year, m.name as model_name, mk.name as make_name
            FROM vehicles.model_year my
            JOIN vehicles.model m ON my.model_id = m.id
            JOIN vehicles.make mk ON m.make_id = mk.id
        """
        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(query)
            rows = cursor.fetchall()

        mapping = {}
        for row in rows:
            # Rows may be dict-like or plain sequences depending on the cursor.
            if isinstance(row, dict):
                make_name, year, model_year_id = row['make_name'], row['year'], row['id']
            else:
                make_name, year, model_year_id = row[4], row[2], row[0]
            mapping[(make_name, year)] = model_year_id
        return mapping

    def _map_trims_to_model_years(self, trims_data: List[Dict], model_year_mapping: Dict) -> List[Dict]:
        """Map extracted trims to actual model_year records.

        Simplified approach: each trim is attached to the first
        _MAX_MODEL_YEARS_PER_TRIM model-year ids (in mapping order) of its
        make. Repeated (model_year_id, name) pairs are emitted only once.

        Args:
            trims_data: Dicts with at least 'name' and 'make_name'.
            model_year_mapping: Dict keyed by (make_name, year) -> model_year id.

        Returns:
            List of {'model_year_id', 'name'} dicts.
        """
        # Group the ids by make once instead of rescanning the whole mapping
        # for every trim.
        ids_by_make = {}
        for (mapped_make, _year), model_year_id in model_year_mapping.items():
            ids_by_make.setdefault(mapped_make, []).append(model_year_id)

        mapped_trims = []
        seen_pairs = set()
        for trim in trims_data:
            trim_name = trim['name']
            candidate_ids = ids_by_make.get(trim['make_name'], [])
            # Limit to avoid explosion
            for model_year_id in candidate_ids[:self._MAX_MODEL_YEARS_PER_TRIM]:
                pair = (model_year_id, trim_name)
                if pair in seen_pairs:
                    continue
                seen_pairs.add(pair)
                mapped_trims.append({
                    'model_year_id': model_year_id,
                    'name': trim_name,
                })
        return mapped_trims

    def _get_discontinued_models(self) -> Dict[int, int]:
        """Get mapping of discontinued model IDs to their last production year.

        Looks up Chevrolet/GMC models whose name matches one of the hard-coded
        ILIKE patterns below. When a model matches several patterns, the year
        of the last matching pattern is kept.
        """
        # Name patterns with the last year to keep for matching models
        discontinued_patterns = [
            ('Jimmy%', 1991),  # GMC Jimmy discontinued 1991
            ('S-10%', 2004),  # Chevrolet S-10 discontinued 2004
            ('Blazer%', 2005),  # Chevrolet Blazer discontinued 2005 (before recent revival)
            ('Astro%', 2005),  # Chevrolet Astro discontinued 2005
            ('Safari%', 2005),  # GMC Safari discontinued 2005
            ('Jimmy Utility%', 1991),  # GMC Jimmy Utility discontinued 1991
        ]
        query = """
            SELECT m.id, m.name, mk.name as make_name
            FROM vehicles.model m
            JOIN vehicles.make mk ON m.make_id = mk.id
            WHERE m.name ILIKE %s
            AND mk.name IN ('Chevrolet', 'GMC')
        """
        discontinued_models = {}
        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()
            for pattern, last_year in discontinued_patterns:
                cursor.execute(query, (pattern,))
                for row in cursor.fetchall():
                    if isinstance(row, dict):
                        model_id, model_name, make_name = row['id'], row['name'], row['make_name']
                    else:
                        model_id, model_name, make_name = row[0], row[1], row[2]
                    discontinued_models[model_id] = last_year
                    logger.info(f"Marked {make_name} {model_name} (ID: {model_id}) as discontinued after {last_year}")
        return discontinued_models

View File

@@ -0,0 +1,39 @@
import os
from typing import Optional
class ETLConfig:
    """Environment-driven settings for the ETL package.

    Each attribute is looked up in the process environment when the class
    body is evaluated; the second argument of every lookup is the value used
    when the variable is not set.
    """

    # --- MS SQL Server ---
    MSSQL_HOST: str = os.environ.get("MSSQL_HOST", "mvp-platform-vehicles-mssql")
    MSSQL_PORT: int = int(os.environ.get("MSSQL_PORT", "1433"))
    MSSQL_DATABASE: str = os.environ.get("MSSQL_DATABASE", "VPICList")
    MSSQL_USER: str = os.environ.get("MSSQL_USER", "sa")
    # NOTE(review): a credential fallback is hard-coded here - confirm it is
    # only ever used for local/dev containers.
    MSSQL_PASSWORD: str = os.environ.get("MSSQL_PASSWORD", "Platform123!")

    # --- PostgreSQL ---
    POSTGRES_HOST: str = os.environ.get("POSTGRES_HOST", "mvp-platform-vehicles-db")
    POSTGRES_PORT: int = int(os.environ.get("POSTGRES_PORT", "5432"))
    POSTGRES_DATABASE: str = os.environ.get("POSTGRES_DATABASE", "vehicles")
    POSTGRES_USER: str = os.environ.get("POSTGRES_USER", "mvp_platform_user")
    # NOTE(review): same remark as for MSSQL_PASSWORD.
    POSTGRES_PASSWORD: str = os.environ.get("POSTGRES_PASSWORD", "platform123")

    # --- Redis ---
    REDIS_HOST: str = os.environ.get("REDIS_HOST", "mvp-platform-vehicles-redis")
    REDIS_PORT: int = int(os.environ.get("REDIS_PORT", "6379"))
    REDIS_DB: int = int(os.environ.get("REDIS_DB", "0"))

    # --- Scheduling (cron expression) ---
    ETL_SCHEDULE: str = os.environ.get("ETL_SCHEDULE", "0 2 * * 0")  # Weekly at 2 AM on Sunday

    # --- General ETL settings ---
    BATCH_SIZE: int = int(os.environ.get("BATCH_SIZE", "10000"))
    PARALLEL_WORKERS: int = int(os.environ.get("PARALLEL_WORKERS", "4"))
    LOG_LEVEL: str = os.environ.get("LOG_LEVEL", "INFO")

    # --- Confidence thresholds ---
    MIN_CONFIDENCE_SCORE: int = int(os.environ.get("MIN_CONFIDENCE_SCORE", "50"))

    # --- Behavior toggles: "1", "true" or "yes" (any letter case) enable the flag ---
    DISABLE_ALL_MODELS_FALLBACK: bool = (
        os.environ.get("DISABLE_ALL_MODELS_FALLBACK", "true").lower() in ("1", "true", "yes")
    )


# Shared settings instance for the package.
config = ETLConfig()

View File

@@ -0,0 +1,152 @@
import pyodbc
import psycopg2
from psycopg2.extras import RealDictCursor
import asyncpg
import redis
from contextlib import contextmanager
import logging
import time
from typing import Optional
from .config import config
logger = logging.getLogger(__name__)


class DatabaseConnections:
    """Manage database connections with retry logic and timeouts.

    Provides context managers for MSSQL (pyodbc) and PostgreSQL (psycopg2)
    connections, an asyncpg pool factory and a lazily created Redis client.
    """

    # Timeouts for MSSQL, in seconds.
    MSSQL_LOGIN_TIMEOUT = 30
    MSSQL_QUERY_TIMEOUT = 300

    def __init__(self):
        self.mssql_conn = None
        self.postgres_conn = None
        self.redis_client = None
        self.pg_pool = None
        self.max_retries = 3
        self.retry_delay = 2  # seconds

    def _retry_connection(self, connection_func, connection_type: str, max_retries: Optional[int] = None):
        """Retry connection with exponential backoff.

        Args:
            connection_func: Zero-argument callable returning a connection.
            connection_type: Label used in log messages.
            max_retries: Total number of attempts; falls back to
                self.max_retries when omitted (or 0).

        Returns:
            Whatever connection_func returns on its first successful call.

        Raises:
            ValueError: If the resolved attempt count is below 1.
            Exception: The error of the final failed attempt.
        """
        attempts = max_retries or self.max_retries
        if not attempts or attempts < 1:
            # Without this guard the loop body never runs and None would be
            # handed back to callers as if it were a connection.
            raise ValueError(f"max_retries must be at least 1, got {attempts!r}")

        for attempt in range(attempts):
            try:
                return connection_func()
            except Exception as e:
                if attempt == attempts - 1:
                    logger.error(f"Failed to connect to {connection_type} after {attempts} attempts: {e}")
                    raise
                # Delay doubles after every failed attempt.
                wait_time = self.retry_delay * (2 ** attempt)
                logger.warning(f"{connection_type} connection failed (attempt {attempt + 1}/{attempts}): {e}")
                logger.info(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)

    @contextmanager
    def mssql_connection(self):
        """Context manager for MS SQL connection using pyodbc with retry logic.

        Yields an open pyodbc connection and closes it on exit.
        """
        def _connect():
            connection_string = (
                f"DRIVER={{ODBC Driver 17 for SQL Server}};"
                f"SERVER={config.MSSQL_HOST},{config.MSSQL_PORT};"
                f"DATABASE={config.MSSQL_DATABASE};"
                f"UID={config.MSSQL_USER};"
                f"PWD={config.MSSQL_PASSWORD};"
                f"TrustServerCertificate=yes;"
            )
            # Timeouts are set through pyodbc's own API: `timeout=` is the
            # login timeout, `Connection.timeout` the per-query timeout.
            conn = pyodbc.connect(connection_string, timeout=self.MSSQL_LOGIN_TIMEOUT)
            conn.timeout = self.MSSQL_QUERY_TIMEOUT
            return conn

        conn = self._retry_connection(_connect, "MSSQL")
        try:
            yield conn
        finally:
            try:
                conn.close()
            except Exception as e:
                logger.warning(f"Error closing MSSQL connection: {e}")

    @contextmanager
    def postgres_connection(self):
        """Context manager for PostgreSQL connection with retry logic.

        Yields an open psycopg2 connection whose cursors return dict rows
        (RealDictCursor) and closes it on exit. No commit is issued here.
        """
        def _connect():
            return psycopg2.connect(
                host=config.POSTGRES_HOST,
                port=config.POSTGRES_PORT,
                database=config.POSTGRES_DATABASE,
                user=config.POSTGRES_USER,
                password=config.POSTGRES_PASSWORD,
                cursor_factory=RealDictCursor,
                connect_timeout=30,
                options='-c statement_timeout=300000'  # 5 minutes
            )

        conn = self._retry_connection(_connect, "PostgreSQL")
        try:
            yield conn
        finally:
            try:
                conn.close()
            except Exception as e:
                logger.warning(f"Error closing PostgreSQL connection: {e}")

    async def create_pg_pool(self):
        """Create async PostgreSQL connection pool.

        The pool is stored on self.pg_pool and also returned.
        """
        self.pg_pool = await asyncpg.create_pool(
            host=config.POSTGRES_HOST,
            port=config.POSTGRES_PORT,
            database=config.POSTGRES_DATABASE,
            user=config.POSTGRES_USER,
            password=config.POSTGRES_PASSWORD,
            min_size=10,
            max_size=20
        )
        return self.pg_pool

    def get_redis_client(self):
        """Get Redis client, creating it on first use and reusing it afterwards."""
        if self.redis_client is None:
            self.redis_client = redis.Redis(
                host=config.REDIS_HOST,
                port=config.REDIS_PORT,
                db=config.REDIS_DB,
                decode_responses=True
            )
        return self.redis_client
def test_connections():
    """Test all database connections for health check.

    Checks, in order: MSSQL (against the master database), PostgreSQL and
    Redis.

    Returns:
        True when every check succeeds; False as soon as one raises (the
        error is logged).
    """
    try:
        # Test MSSQL connection (use master DB to avoid failures before restore)
        db = DatabaseConnections()
        mssql_master_conn_str = (
            f"DRIVER={{ODBC Driver 17 for SQL Server}};"
            f"SERVER={config.MSSQL_HOST},{config.MSSQL_PORT};"
            f"DATABASE=master;"
            f"UID={config.MSSQL_USER};"
            f"PWD={config.MSSQL_PASSWORD};"
            f"TrustServerCertificate=yes;"
        )
        # pyodbc's `with connection:` only commits on exit and leaves the
        # connection open, so close it explicitly. The login timeout keeps
        # the health check from blocking on an unreachable server.
        conn = pyodbc.connect(mssql_master_conn_str, timeout=30)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT 1")
            cursor.fetchone()
        finally:
            conn.close()
        logger.info("MSSQL connection successful (master)")

        # Test PostgreSQL connection
        with db.postgres_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT 1")
            cursor.fetchone()
        logger.info("PostgreSQL connection successful")

        # Test Redis connection
        redis_client = db.get_redis_client()
        redis_client.ping()
        logger.info("Redis connection successful")
        return True
    except Exception as e:
        logger.error(f"Connection test failed: {e}")
        return False


# Shared connection manager instance for the package.
db_connections = DatabaseConnections()

View File

@@ -0,0 +1 @@
# ETL Downloaders

View File

@@ -0,0 +1,180 @@
#!/usr/bin/env python3
"""
NHTSA vPIC Database Downloader
Downloads and prepares the NHTSA vPIC database file for ETL processing
"""
import os
import logging
import requests
import zipfile
from pathlib import Path
from datetime import datetime
from typing import Optional
logger = logging.getLogger(__name__)


class NHTSADownloader:
    """Downloads and manages NHTSA vPIC database files."""

    # Size of each chunk read from the HTTP response, in bytes.
    _CHUNK_SIZE = 8192
    # Emit a progress log line roughly every 10 MB.
    _PROGRESS_LOG_INTERVAL = 1024 * 1024 * 10
    # (connect, read) timeouts in seconds passed to requests.
    _REQUEST_TIMEOUT = (30, 300)

    def __init__(self, download_dir: str = "/app/data"):
        """Remember the download directory and create it if needed.

        Args:
            download_dir: Directory where archives and .bak files are stored.
        """
        self.download_dir = Path(download_dir)
        # parents=True so a missing intermediate directory does not fail.
        self.download_dir.mkdir(parents=True, exist_ok=True)

    def get_latest_database_url(self) -> str:
        """
        Get the latest NHTSA vPIC database URL
        Uses July 2025 version as specified
        """
        return "https://vpic.nhtsa.dot.gov/api/vPICList_lite_2025_07.bak.zip"

    def download_database(self, url: Optional[str] = None) -> Optional[Path]:
        """
        Download NHTSA vPIC database file

        The archive is streamed into the download directory, the .bak file is
        extracted next to it and the archive is deleted afterwards. On any
        failure the error is logged, a partially written archive is removed
        and None is returned.

        Args:
            url: Database URL (defaults to latest)
        Returns:
            Path to downloaded .bak file or None if failed
        """
        if url is None:
            url = self.get_latest_database_url()
        logger.info(f"Starting download of NHTSA vPIC database from: {url}")

        zip_path = None
        try:
            # Extract filename from URL
            zip_filename = url.split('/')[-1]
            zip_path = self.download_dir / zip_filename

            # Download with progress; the timeout keeps a stalled server from
            # blocking forever and `with` releases the connection afterwards.
            with requests.get(url, stream=True, timeout=self._REQUEST_TIMEOUT) as response:
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))
                logger.info(f"Downloading {zip_filename} ({total_size:,} bytes)")

                downloaded = 0
                next_progress_log = self._PROGRESS_LOG_INTERVAL
                with open(zip_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=self._CHUNK_SIZE):
                        if not chunk:
                            continue
                        f.write(chunk)
                        downloaded += len(chunk)
                        # Threshold comparison rather than an exact-multiple
                        # test: chunk sizes are not guaranteed to be uniform.
                        if total_size > 0 and downloaded >= next_progress_log:
                            progress = (downloaded / total_size) * 100
                            logger.info(f"Download progress: {progress:.1f}% ({downloaded:,}/{total_size:,} bytes)")
                            next_progress_log += self._PROGRESS_LOG_INTERVAL

            logger.info(f"Successfully downloaded: {zip_path}")
            # Extract the .bak file
            bak_path = self.extract_bak_file(zip_path)
            # Clean up zip file
            zip_path.unlink()
            logger.info(f"Cleaned up zip file: {zip_path}")
            return bak_path
        except Exception as e:
            logger.error(f"Failed to download database: {e}")
            # Do not leave a truncated or unusable archive behind.
            if zip_path is not None:
                try:
                    if zip_path.is_file():
                        zip_path.unlink()
                except OSError as cleanup_error:
                    logger.warning(f"Could not remove incomplete download {zip_path}: {cleanup_error}")
            return None

    def extract_bak_file(self, zip_path: Path) -> Path:
        """
        Extract .bak file from zip archive

        Args:
            zip_path: Path to zip file
        Returns:
            Path to extracted .bak file
        Raises:
            ValueError: If the archive contains no member ending in '.bak'
        """
        logger.info(f"Extracting .bak file from: {zip_path}")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Find the .bak file
            bak_files = [name for name in zip_ref.namelist() if name.endswith('.bak')]
            if not bak_files:
                raise ValueError("No .bak file found in zip archive")
            if len(bak_files) > 1:
                logger.warning(f"Multiple .bak files found, using first: {bak_files}")
            bak_filename = bak_files[0]
            logger.info(f"Extracting: {bak_filename}")
            # Extract to download directory. extract() returns the path it
            # actually created, which can differ from the raw member name
            # because zipfile normalises member paths.
            bak_path = Path(zip_ref.extract(bak_filename, self.download_dir))
        logger.info(f"Successfully extracted: {bak_path}")
        return bak_path

    def get_existing_bak_file(self) -> Optional[Path]:
        """
        Find an existing .bak file in preferred locations.
        Searches both the shared mount (/app/shared) and local download dir.

        Returns:
            Path to most recent .bak file or None
        """
        search_dirs = [Path("/app/shared"), self.download_dir]
        candidates = []
        for d in search_dirs:
            try:
                if d.exists():
                    candidates.extend(d.glob("*.bak"))
            except Exception as e:
                logger.debug(f"Skipping directory {d}: {e}")
        if not candidates:
            return None
        # Most recently modified file wins.
        latest_bak = max(candidates, key=lambda p: p.stat().st_mtime)
        logger.info(f"Found existing .bak file: {latest_bak}")
        return latest_bak

    def ensure_database_file(self, force_download: bool = False) -> Optional[Path]:
        """
        Ensure we have a database file - download if needed

        Args:
            force_download: Force download even if file exists
        Returns:
            Path to .bak file or None if failed
        """
        if not force_download:
            existing_file = self.get_existing_bak_file()
            if existing_file:
                logger.info(f"Using existing database file: {existing_file}")
                return existing_file
        logger.info("Downloading fresh database file...")
        return self.download_database()

    def get_database_info(self, bak_path: Path) -> dict:
        """
        Get information about the database file

        Args:
            bak_path: Path to .bak file
        Returns:
            {"exists": False} when the file is missing, otherwise a dict with
            "exists", "path", "size_mb", "modified" (ISO timestamp) and "name"
        """
        if not bak_path.exists():
            return {"exists": False}
        stat = bak_path.stat()
        return {
            "exists": True,
            "path": str(bak_path),
            "size_mb": round(stat.st_size / (1024 * 1024), 1),
            "modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
            "name": bak_path.name
        }

View File

@@ -0,0 +1,629 @@
"""
JSON Extractor for Manual Vehicle Data Processing
Extracts and normalizes vehicle data from JSON files into database-ready structures.
Integrates with MakeNameMapper and EngineSpecParser utilities for comprehensive
data processing with L→I normalization and make name conversion.
Key Features:
- Extract make/model/year/trim/engine data from JSON files
- Handle electric vehicles (empty engines → default motor)
- Data validation and quality assurance
- Progress tracking and error reporting
Usage:
extractor = JsonExtractor(make_mapper, engine_parser)
make_data = extractor.extract_make_data('sources/makes/toyota.json')
all_data = extractor.extract_all_makes('sources/makes/')
"""
import json
import os
import glob
import logging
from typing import List, Dict, Optional, Generator, Tuple
from dataclasses import dataclass
from pathlib import Path
# Import our utilities (handle both relative and direct imports)
try:
from ..utils.make_name_mapper import MakeNameMapper
from ..utils.engine_spec_parser import EngineSpecParser, EngineSpec
except ImportError:
# Fallback for direct execution
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from utils.make_name_mapper import MakeNameMapper
from utils.engine_spec_parser import EngineSpecParser, EngineSpec
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
@dataclass
class ValidationResult:
    """Outcome of validating one JSON document."""

    is_valid: bool  # Validity flag supplied by the validator
    errors: List[str]  # Error messages collected during validation
    warnings: List[str]  # Warning messages collected during validation

    @property
    def has_errors(self) -> bool:
        """True when at least one error message was recorded."""
        error_count = len(self.errors)
        return error_count > 0

    @property
    def has_warnings(self) -> bool:
        """True when at least one warning message was recorded."""
        warning_count = len(self.warnings)
        return warning_count > 0
@dataclass
class ModelData:
    """Extracted model data with normalized engines and trims."""

    name: str  # Model name from JSON
    years: List[int]  # Years this model appears in
    engines: List[EngineSpec]  # Parsed and normalized engines
    trims: List[str]  # Trim names (from submodels)
    is_electric: bool = False  # True if empty engines array detected

    @property
    def total_trims(self) -> int:
        """Number of trim names."""
        return len(self.trims)

    @property
    def total_engines(self) -> int:
        """Number of engine specs."""
        return len(self.engines)

    @property
    def year_range(self) -> str:
        """Return "first-last", a single year, or "Unknown" when there are no years."""
        if not self.years:
            return "Unknown"
        first, last = min(self.years), max(self.years)
        # Compare the bounds rather than the element count so that repeated
        # years (e.g. [2020, 2020]) yield "2020", matching MakeData.year_range.
        return f"{first}-{last}" if first != last else str(first)
@dataclass
class MakeData:
    """All data extracted for one make, plus the messages produced on the way."""

    name: str  # Normalized display name (e.g., "Alfa Romeo")
    filename: str  # Original JSON filename
    models: List[ModelData]
    processing_errors: List[str]  # Any errors during extraction
    processing_warnings: List[str]  # Any warnings during extraction

    @property
    def total_models(self) -> int:
        """Number of models held for this make."""
        return len(self.models)

    @property
    def total_engines(self) -> int:
        """Sum of every model's engine count."""
        return sum(entry.total_engines for entry in self.models)

    @property
    def total_trims(self) -> int:
        """Sum of every model's trim count."""
        return sum(entry.total_trims for entry in self.models)

    @property
    def electric_models_count(self) -> int:
        """Number of models flagged as electric."""
        return sum(1 for entry in self.models if entry.is_electric)

    @property
    def year_range(self) -> str:
        """Return "first-last" over all models' years, one year, or "Unknown"."""
        collected = [year for entry in self.models for year in entry.years]
        if not collected:
            return "Unknown"
        if len(set(collected)) > 1:
            return f"{min(collected)}-{max(collected)}"
        return str(collected[0])
@dataclass
class ExtractionResult:
    """Aggregate outcome of extracting every make file."""

    makes: List[MakeData]
    total_files_processed: int
    successful_extractions: int
    failed_extractions: int
    total_models: int
    total_engines: int
    total_electric_models: int

    @property
    def success_rate(self) -> float:
        """Successful extractions divided by files processed (0.0 when none were processed)."""
        processed = self.total_files_processed
        if processed > 0:
            return self.successful_extractions / processed
        return 0.0
class JsonExtractor:
"""Extract normalized vehicle data from JSON files"""
def __init__(self, make_mapper: MakeNameMapper, engine_parser: EngineSpecParser):
    """
    Store the helper objects used during extraction.

    Args:
        make_mapper: Used to turn a JSON filename into a normalized make name
        engine_parser: Used to parse engine strings and to create the
            electric-motor spec
    """
    self.engine_parser = engine_parser
    self.make_mapper = make_mapper
    logger.info("JsonExtractor initialized with MakeNameMapper and EngineSpecParser")
def validate_json_structure(self, json_data: dict, filename: str) -> ValidationResult:
    """
    Validate JSON structure before processing

    Expected shape: a dict with exactly one key (the make) whose value is a
    list of year entries; each entry is a dict with 'year' and 'models',
    and each model is a dict with 'name' and optional list-valued
    'engines' / 'submodels'.

    Args:
        json_data: Loaded JSON data
        filename: Source filename for error context
    Returns:
        ValidationResult with validity status and any issues
    """
    errors = []
    warnings = []
    try:
        # Check top-level structure
        if not isinstance(json_data, dict):
            errors.append("JSON must be a dictionary")
            return ValidationResult(False, errors, warnings)

        # Should have exactly one key (the make name)
        if len(json_data) != 1:
            errors.append(f"JSON should have exactly one top-level key, found {len(json_data)}")
            return ValidationResult(False, errors, warnings)

        make_key = next(iter(json_data))
        make_data = json_data[make_key]

        # Make data should be a list of year entries
        if not isinstance(make_data, list):
            errors.append(f"Make data for '{make_key}' must be a list")
            return ValidationResult(False, errors, warnings)
        if not make_data:
            warnings.append(f"Make '{make_key}' has no year entries")

        # Validate year entries
        for i, year_entry in enumerate(make_data):
            if not isinstance(year_entry, dict):
                errors.append(f"Year entry {i} must be a dictionary")
                continue

            # Check required fields
            has_year = 'year' in year_entry
            if not has_year:
                errors.append(f"Year entry {i} missing 'year' field")
            if 'models' not in year_entry:
                errors.append(f"Year entry {i} missing 'models' field")
                continue

            # Validate year. Only attempted when the field exists: indexing a
            # missing 'year' would raise KeyError, which the handler below
            # does not catch, and that would stop validation of every
            # remaining entry.
            if has_year:
                try:
                    year = int(year_entry['year'])
                    if year < 1900 or year > 2030:
                        warnings.append(f"Unusual year value: {year}")
                except (ValueError, TypeError):
                    errors.append(f"Invalid year value in entry {i}: {year_entry.get('year')}")

            # Validate models
            models = year_entry['models']
            if not isinstance(models, list):
                errors.append(f"Models in year entry {i} must be a list")
                continue
            for j, model in enumerate(models):
                if not isinstance(model, dict):
                    errors.append(f"Model {j} in year {year_entry.get('year')} must be a dictionary")
                    continue
                if 'name' not in model:
                    errors.append(f"Model {j} in year {year_entry.get('year')} missing 'name' field")
                # Engines and submodels are optional but should be lists if present
                if 'engines' in model and not isinstance(model['engines'], list):
                    errors.append(f"Engines for model {model.get('name')} must be a list")
                if 'submodels' in model and not isinstance(model['submodels'], list):
                    errors.append(f"Submodels for model {model.get('name')} must be a list")
    except Exception as e:
        errors.append(f"Unexpected error during validation: {str(e)}")

    is_valid = not errors
    if errors:
        logger.warning(f"JSON validation failed for {filename}: {len(errors)} errors")
    elif warnings:
        logger.info(f"JSON validation for {filename}: {len(warnings)} warnings")
    else:
        logger.debug(f"JSON validation passed for {filename}")
    return ValidationResult(is_valid, errors, warnings)
def extract_make_data(self, json_file_path: str) -> MakeData:
    """
    Extract complete make data from a single JSON file

    The file is loaded and validated, models are merged by name across all
    year entries, and each model's engine strings are parsed. A model with
    no engine strings in any year is flagged electric and receives the
    parser's electric-motor spec. This method does not raise on bad input:
    failures are reported through the returned object's processing_errors.

    Args:
        json_file_path: Path to JSON file
    Returns:
        MakeData with extracted and normalized data (models is empty when
        the file could not be loaded or failed validation)
    """
    filename = os.path.basename(json_file_path)
    logger.info(f"Extracting make data from {filename}")
    processing_errors = []
    processing_warnings = []

    def failed_result():
        # Result used when nothing could be extracted; it carries the
        # messages collected so far.
        return MakeData(
            name=self.make_mapper.normalize_make_name(filename),
            filename=filename,
            models=[],
            processing_errors=processing_errors,
            processing_warnings=processing_warnings
        )

    def clean_strings(values, kind, model_name, year):
        # Return the stripped, non-empty strings of `values`. Non-string
        # items are reported as warnings instead of aborting the whole file.
        cleaned = set()
        for value in values:
            if not value:
                continue
            if not isinstance(value, str):
                processing_warnings.append(
                    f"Ignoring non-string {kind} {value!r} for model {model_name} in year {year}"
                )
                continue
            text = value.strip()
            if text:
                cleaned.add(text)
        return cleaned

    try:
        # Load and validate JSON
        with open(json_file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
        validation = self.validate_json_structure(json_data, filename)
        processing_errors.extend(validation.errors)
        processing_warnings.extend(validation.warnings)
        if not validation.is_valid:
            logger.error(f"JSON validation failed for {filename}")
            return failed_result()

        # Get normalized make name
        make_name = self.make_mapper.normalize_make_name(filename)
        logger.debug(f"Normalized make name: {filename} -> {make_name}")

        # Extract data
        make_key = list(json_data.keys())[0]
        year_entries = json_data[make_key]

        # Group models by name across all years
        models_by_name = {}  # model_name -> {years: set, engines: set, trims: set}
        for year_entry in year_entries:
            try:
                year = int(year_entry['year'])
                for model_entry in year_entry.get('models', []):
                    raw_name = model_entry.get('name', '')
                    if not isinstance(raw_name, str):
                        processing_warnings.append(
                            f"Skipping model with non-string name {raw_name!r} in year {year}"
                        )
                        continue
                    model_name = raw_name.strip()
                    if not model_name:
                        processing_warnings.append(f"Empty model name in year {year}")
                        continue

                    # Initialize model data if not seen before
                    if model_name not in models_by_name:
                        models_by_name[model_name] = {
                            'years': set(),
                            'engines': set(),
                            'trims': set()
                        }
                    model_info = models_by_name[model_name]
                    model_info['years'].add(year)
                    model_info['engines'].update(
                        clean_strings(model_entry.get('engines', []), 'engine', model_name, year)
                    )
                    # Trims come from submodels
                    model_info['trims'].update(
                        clean_strings(model_entry.get('submodels', []), 'trim', model_name, year)
                    )
            except (ValueError, TypeError) as e:
                processing_errors.append(f"Error processing year entry: {str(e)}")
                continue

        # Convert to ModelData objects
        models = []
        for model_name, model_info in models_by_name.items():
            try:
                is_electric = not model_info['engines']
                if is_electric:
                    # Empty engines array - electric vehicle
                    engine_specs = [self.engine_parser.create_electric_motor()]
                    logger.debug(f"Created electric motor for {make_name} {model_name}")
                else:
                    # Sorted so engines are parsed in the same order on every
                    # run (set iteration order of strings is not stable).
                    engine_specs = [
                        self.engine_parser.parse_engine_string(engine_str)
                        for engine_str in sorted(model_info['engines'])
                    ]
                # Remove duplicate engines based on key attributes
                unique_engines = self.engine_parser.get_unique_engines(engine_specs)
                models.append(ModelData(
                    name=model_name,
                    years=sorted(model_info['years']),
                    engines=unique_engines,
                    trims=sorted(model_info['trims']),
                    is_electric=is_electric
                ))
            except Exception as e:
                processing_errors.append(f"Error processing model {model_name}: {str(e)}")
                continue

        # Sort models by name
        models.sort(key=lambda m: m.name)
        make_data = MakeData(
            name=make_name,
            filename=filename,
            models=models,
            processing_errors=processing_errors,
            processing_warnings=processing_warnings
        )
        logger.info(f"Extracted {filename}: {len(models)} models, "
                    f"{make_data.total_engines} engines, {make_data.electric_models_count} electric models")
        return make_data
    except Exception as e:
        logger.error(f"Failed to extract make data from {filename}: {str(e)}")
        processing_errors.append(f"Fatal error: {str(e)}")
        return failed_result()
def extract_all_makes(self, sources_dir: str) -> ExtractionResult:
    """
    Run ``extract_make_data`` over every ``*.json`` file in a directory.

    Files are visited in sorted path order. A file is counted as failed
    when its MakeData carries processing errors or when extraction raises;
    in the latter case a placeholder MakeData without models is recorded so
    that every file still has an entry in the result.

    Args:
        sources_dir: Directory containing JSON make files

    Returns:
        ExtractionResult with the per-make data and aggregate counts. All
        counts are zero when the directory holds no JSON files.
    """
    logger.info(f"Starting extraction of all makes from {sources_dir}")

    # Only files directly inside sources_dir are matched (no recursion).
    discovered = glob.glob(os.path.join(sources_dir, '*.json'))
    if not discovered:
        logger.warning(f"No JSON files found in {sources_dir}")
        return ExtractionResult(
            makes=[],
            total_files_processed=0,
            successful_extractions=0,
            failed_extractions=0,
            total_models=0,
            total_engines=0,
            total_electric_models=0
        )

    logger.info(f"Found {len(discovered)} JSON files to process")

    extracted = []
    ok_count = 0
    error_count = 0

    # Sorted so that repeated runs visit the files in the same order.
    for path in sorted(discovered):
        try:
            current = self.extract_make_data(path)
            extracted.append(current)

            if not current.processing_errors:
                ok_count += 1
                logger.debug(f"Extraction successful for {current.filename}")
            else:
                error_count += 1
                logger.error(f"Extraction completed with errors for {current.filename}")
        except Exception as e:
            base_name = os.path.basename(path)
            logger.error(f"Fatal error processing {base_name}: {str(e)}")
            error_count += 1

            # Keep a stub entry so the failed file is still visible in reports.
            extracted.append(MakeData(
                name=self.make_mapper.normalize_make_name(base_name),
                filename=base_name,
                models=[],
                processing_errors=[f"Fatal extraction error: {str(e)}"],
                processing_warnings=[]
            ))

    # Aggregate counts across every make, including stubs (which add zero).
    model_total = sum(entry.total_models for entry in extracted)
    engine_total = sum(entry.total_engines for entry in extracted)
    electric_total = sum(entry.electric_models_count for entry in extracted)

    summary = ExtractionResult(
        makes=extracted,
        total_files_processed=len(discovered),
        successful_extractions=ok_count,
        failed_extractions=error_count,
        total_models=model_total,
        total_engines=engine_total,
        total_electric_models=electric_total
    )

    logger.info(f"Extraction complete: {ok_count}/{len(discovered)} successful, "
                f"{model_total} models, {engine_total} engines, {electric_total} electric models")
    return summary
def get_extraction_statistics(self, result: ExtractionResult) -> Dict[str, any]:
    """
    Summarise an extraction run as a nested dictionary.

    Args:
        result: ExtractionResult from extract_all_makes

    Returns:
        Dictionary with four keys: 'files' (processing counts and success
        rate), 'data' (make/model/engine totals), 'quality' (error and
        warning counts) and 'makes' (one summary dict per make, in the
        order they appear in ``result.makes``).
    """
    # NOTE(review): the return annotation uses the builtin ``any``;
    # ``typing.Any`` was probably intended - confirm and fix at file level.
    makes = result.makes

    file_section = {
        'total_processed': result.total_files_processed,
        'successful': result.successful_extractions,
        'failed': result.failed_extractions,
        'success_rate': result.success_rate
    }

    data_section = {
        'total_makes': len(makes),
        'total_models': result.total_models,
        'total_engines': result.total_engines,
        'electric_models': result.total_electric_models
    }

    quality_section = {
        'makes_with_errors': sum(1 for make in makes if make.processing_errors),
        'makes_with_warnings': sum(1 for make in makes if make.processing_warnings),
        'total_errors': sum(len(make.processing_errors) for make in makes),
        'total_warnings': sum(len(make.processing_warnings) for make in makes)
    }

    # One flat summary record per make.
    per_make = [
        {
            'name': make.name,
            'filename': make.filename,
            'models': make.total_models,
            'engines': make.total_engines,
            'trims': make.total_trims,
            'electric_models': make.electric_models_count,
            'year_range': make.year_range,
            'errors': len(make.processing_errors),
            'warnings': len(make.processing_warnings)
        }
        for make in makes
    ]

    return {
        'files': file_section,
        'data': data_section,
        'quality': quality_section,
        'makes': per_make
    }
def print_extraction_report(self, result: ExtractionResult) -> None:
    """
    Write a human-readable extraction report to stdout.

    The report covers file processing counts, extracted data totals, a
    quality summary, the makes that reported errors (if any) and the ten
    makes with the most models.

    Args:
        result: ExtractionResult from extract_all_makes
    """
    stats = self.get_extraction_statistics(result)
    files = stats['files']
    data = stats['data']
    quality = stats['quality']

    print("🚀 JSON EXTRACTION REPORT")
    print("=" * 50)

    # File processing summary
    print("\n📁 FILE PROCESSING")
    print(f" Files processed: {files['total_processed']}")
    print(f" Successful: {files['successful']}")
    print(f" Failed: {files['failed']}")
    print(f" Success rate: {files['success_rate']:.1%}")

    # Data summary
    print("\n📊 DATA EXTRACTED")
    for label, key in (
        ("Makes", 'total_makes'),
        ("Models", 'total_models'),
        ("Engines", 'total_engines'),
        ("Electric models", 'electric_models'),
    ):
        print(f" {label}: {data[key]}")

    # Quality summary
    print("\n🔍 QUALITY ASSESSMENT")
    for label, key in (
        ("Makes with errors", 'makes_with_errors'),
        ("Makes with warnings", 'makes_with_warnings'),
        ("Total errors", 'total_errors'),
        ("Total warnings", 'total_warnings'),
    ):
        print(f" {label}: {quality[key]}")

    # Problematic makes are listed only when at least one exists.
    if quality['makes_with_errors'] > 0:
        print("\n⚠️ MAKES WITH ERRORS:")
        for make in result.makes:
            if not make.processing_errors:
                continue
            print(f" {make.name} ({make.filename}): {len(make.processing_errors)} errors")

    # Ten largest makes by model count, biggest first.
    print("\n🏆 TOP MAKES BY MODEL COUNT:")
    ranked = sorted(result.makes, key=lambda m: m.total_models, reverse=True)
    for make in ranked[:10]:
        print(f" {make.name}: {make.total_models} models, {make.total_engines} engines")
# Example usage and testing functions
def example_usage():
    """
    Demonstrate JsonExtractor usage.

    Extracts ``sources/makes/toyota.json`` on its own (when present), then
    runs the extractor over every JSON file in ``sources/makes`` and prints
    the extraction report. Prints a notice and does nothing else when the
    sources directory does not exist.
    """
    # Bound unconditionally at function scope. An ``import os`` placed only
    # inside the ``except ImportError`` branch still makes ``os`` a local
    # name for the whole function, so the ``os.path`` calls below would
    # raise UnboundLocalError whenever the package-relative imports succeed.
    import os
    import sys

    print("🚀 JsonExtractor Example Usage")
    print("=" * 40)

    # Use direct imports for example usage
    try:
        from ..utils.make_name_mapper import MakeNameMapper
        from ..utils.engine_spec_parser import EngineSpecParser
    except ImportError:
        # Fallback for direct execution: make the package root importable.
        sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
        from utils.make_name_mapper import MakeNameMapper
        from utils.engine_spec_parser import EngineSpecParser

    # Initialize utilities and the extractor built on them
    make_mapper = MakeNameMapper()
    engine_parser = EngineSpecParser()
    extractor = JsonExtractor(make_mapper, engine_parser)

    sources_dir = "sources/makes"
    if not os.path.exists(sources_dir):
        print(f"Sources directory not found: {sources_dir}")
        return

    # Extract single make
    toyota_file = os.path.join(sources_dir, "toyota.json")
    if os.path.exists(toyota_file):
        print("\n📄 Extracting from toyota.json...")
        toyota_data = extractor.extract_make_data(toyota_file)
        print(f" Make: {toyota_data.name}")
        print(f" Models: {toyota_data.total_models}")
        print(f" Engines: {toyota_data.total_engines}")
        print(f" Electric models: {toyota_data.electric_models_count}")
        print(f" Year range: {toyota_data.year_range}")
        if toyota_data.processing_errors:
            print(f" Errors: {len(toyota_data.processing_errors)}")
        if toyota_data.processing_warnings:
            print(f" Warnings: {len(toyota_data.processing_warnings)}")

    # Extract all makes
    print("\n🔄 Extracting all makes...")
    result = extractor.extract_all_makes(sources_dir)
    extractor.print_extraction_report(result)
if __name__ == "__main__":
example_usage()

View File

@@ -0,0 +1,337 @@
import logging
from typing import List, Dict, Optional, Generator
from ..connections import db_connections
from ..utils.make_filter import MakeFilter
from tqdm import tqdm
logger = logging.getLogger(__name__)
class MSSQLExtractor:
"""Extract data from MS SQL Server source database"""
def __init__(self, make_filter: Optional[MakeFilter] = None):
    """
    Set the extraction batch size and the make filter.

    Args:
        make_filter: Filter restricting which makes are extracted; a
            default-constructed MakeFilter is used when omitted.
    """
    self.batch_size = 10000
    self.make_filter = make_filter if make_filter else MakeFilter()
    allowed_count = len(self.make_filter.get_allowed_makes())
    logger.info(f"Initialized MSSQL extractor with {allowed_count} allowed makes")
def extract_wmi_data(self) -> List[Dict]:
    """
    Extract WMI (World Manufacturer Identifier) rows with make filtering.

    Only rows whose PublicAvailabilityDate is not in the future and whose
    manufacturer is linked to at least one allowed make are returned,
    ordered by WMI Id.

    Returns:
        One dict per WMI row, keyed by column name.
    """
    logger.info("Extracting WMI data from source database with make filtering")

    # SQL predicate produced by the make filter for the Make.Name column.
    make_clause = self.make_filter.get_sql_filter('m.Name')
    query = f"""
        SELECT
            w.Id,
            w.Wmi,
            w.ManufacturerId,
            w.MakeId,
            w.VehicleTypeId,
            w.TruckTypeId,
            w.CountryId,
            w.PublicAvailabilityDate,
            w.NonCompliant,
            w.NonCompliantReason,
            w.CreatedOn,
            w.UpdatedOn,
            w.ProcessedOn
        FROM dbo.Wmi w
        WHERE w.PublicAvailabilityDate <= GETDATE()
        AND w.ManufacturerId IN (
            SELECT DISTINCT mfr.Id
            FROM dbo.Manufacturer mfr
            JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
            JOIN dbo.Make m ON mm.MakeId = m.Id
            WHERE {make_clause}
        )
        ORDER BY w.Id
    """

    with db_connections.mssql_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(query)
        fetched = cursor.fetchall()
        records = self._rows_to_dicts(cursor, fetched)

    logger.info(f"Extracted {len(records)} WMI records")
    return records
def extract_wmi_vin_schema_mappings(self) -> List[Dict]:
    """
    Extract WMI to VIN-schema mappings, including their year ranges.

    A mapping is returned only when its WMI is publicly available, its
    manufacturer is linked to an allowed make, and the WMI's own MakeId is
    an allowed make. Rows are ordered by WmiId, then VinSchemaId.

    Returns:
        One dict per mapping row, keyed by column name.
    """
    logger.info("Extracting WMI-VinSchema mappings with make filtering")

    # Same filter rendered for the two column references used below.
    manufacturer_make_clause = self.make_filter.get_sql_filter('m.Name')
    wmi_make_clause = self.make_filter.get_sql_filter('Name')
    query = f"""
        SELECT
            wvs.WmiId,
            wvs.VinSchemaId,
            wvs.YearFrom,
            wvs.YearTo,
            w.Wmi,
            vs.Name as SchemaName
        FROM dbo.Wmi_VinSchema wvs
        JOIN dbo.Wmi w ON wvs.WmiId = w.Id
        JOIN dbo.VinSchema vs ON wvs.VinSchemaId = vs.Id
        WHERE w.PublicAvailabilityDate <= GETDATE()
        AND w.ManufacturerId IN (
            SELECT DISTINCT mfr.Id
            FROM dbo.Manufacturer mfr
            JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
            JOIN dbo.Make m ON mm.MakeId = m.Id
            WHERE {manufacturer_make_clause}
        )
        AND w.MakeId IN (
            SELECT Id FROM dbo.Make
            WHERE {wmi_make_clause}
        )
        ORDER BY wvs.WmiId, wvs.VinSchemaId
    """

    with db_connections.mssql_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(query)
        fetched = cursor.fetchall()
        records = self._rows_to_dicts(cursor, fetched)

    logger.info(f"Extracted {len(records)} WMI-VinSchema mappings (filtered by allowed makes)")
    return records
def extract_patterns_data(self) -> Generator[List[Dict], None, None]:
    """
    Extract pattern data in batches with make filtering.

    A COUNT(*) query first determines how many rows match, then the rows
    are read page by page with OFFSET/FETCH, ``self.batch_size`` rows at a
    time. Only patterns for element ids 26, 27, 28, 18 and 24 that belong
    to an allowed make are returned.

    Yields:
        Lists of row dicts (keyed by column name), one list per non-empty
        page. Iteration stops early if a page comes back empty.
    """
    logger.info("Extracting pattern data from source database with make filtering")

    # The FROM/WHERE part is shared so the count and the paged query can
    # never drift apart.
    make_clause = self.make_filter.get_sql_filter('m.Name')
    from_where = f"""
        FROM dbo.Pattern p
        JOIN dbo.Element e ON p.ElementId = e.Id
        JOIN dbo.VinSchema vs ON p.VinSchemaId = vs.Id
        JOIN dbo.Wmi_VinSchema wvs ON vs.Id = wvs.VinSchemaId
        JOIN dbo.Wmi w ON wvs.WmiId = w.Id
        JOIN dbo.Wmi_Make wm ON w.Id = wm.WmiId
        JOIN dbo.Make m ON wm.MakeId = m.Id
        WHERE {make_clause}
        AND e.Id IN (26, 27, 28, 18, 24)
    """

    # First get the total count with filtering
    count_query = "SELECT COUNT(*) as total" + from_where
    with db_connections.mssql_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(count_query)
        total_row = self._row_to_dict(cursor, cursor.fetchone())
        total_count = total_row.get('total', 0)

    logger.info(f"Total patterns to extract (filtered): {total_count}")

    # The joins repeat each pattern once per matching WMI and make, so
    # p.Id alone does not define a total order. OFFSET/FETCH paging is only
    # stable when the sort key is unique; w.Id and m.Id are added as
    # tie-breakers (rows that still tie are identical in every selected
    # column, so their relative order does not matter).
    select_query = """
        SELECT
            p.Id,
            p.VinSchemaId,
            p.Keys,
            p.ElementId,
            p.AttributeId,
            e.Name as ElementName,
            e.weight,
            e.GroupName,
            vs.Name as SchemaName,
            w.Wmi,
            m.Name as MakeName
    """ + from_where + """
        ORDER BY p.Id, w.Id, m.Id
    """

    with db_connections.mssql_connection() as conn:
        cursor = conn.cursor()
        for offset in tqdm(range(0, total_count, self.batch_size), desc="Extracting filtered patterns"):
            # Offset and batch size are integers generated here, so they
            # are inlined; the filter text is never passed through
            # str.format, which would choke on any brace it contained.
            page_query = (
                f"{select_query}"
                f"OFFSET {int(offset)} ROWS FETCH NEXT {int(self.batch_size)} ROWS ONLY"
            )
            cursor.execute(page_query)
            rows = cursor.fetchall()
            if not rows:
                break
            yield self._rows_to_dicts(cursor, rows)
def extract_elements_data(self) -> List[Dict]:
    """
    Extract all element definitions from dbo.Element, ordered by Id.

    No make filtering is applied to this table.

    Returns:
        One dict per element row, keyed by column name.
    """
    logger.info("Extracting element data")

    query = """
        SELECT
            Id,
            Name,
            Code,
            LookupTable,
            Description,
            IsPrivate,
            GroupName,
            DataType,
            MinAllowedValue,
            MaxAllowedValue,
            IsQS,
            Decode,
            weight
        FROM dbo.Element
        ORDER BY Id
    """

    with db_connections.mssql_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(query)
        fetched = cursor.fetchall()
        elements = self._rows_to_dicts(cursor, fetched)

    logger.info(f"Extracted {len(elements)} element definitions")
    return elements
def extract_reference_table(self, table_name: str) -> List[Dict]:
    """
    Extract data from a reference table, applying make filtering where known.

    'Manufacturer', 'Make', 'Model' and 'Wmi' are restricted to rows tied
    to allowed makes. Any other table is read in full, ordered by Id.

    Args:
        table_name: Unqualified name of a table in the ``dbo`` schema. It
            must be a plain identifier (ASCII letters, digits, underscore,
            not starting with a digit) because it is placed into the SQL
            text for tables without a dedicated query.

    Returns:
        One dict per row, keyed by column name.

    Raises:
        ValueError: If ``table_name`` is not a plain identifier.
    """
    import re

    # Identifiers cannot be bound as query parameters, so reject anything
    # that could change the statement's structure before building SQL.
    if not isinstance(table_name, str) or not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", table_name):
        raise ValueError(f"Invalid reference table name: {table_name!r}")

    logger.info(f"Extracting data from {table_name} with make filtering")

    # Apply make filtering - filter by Make brand names (simpler and more efficient)
    if table_name == 'Manufacturer':
        # Extract manufacturers linked to filtered makes only
        query = f"""
            SELECT DISTINCT mfr.* FROM dbo.Manufacturer mfr
            JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
            JOIN dbo.Make m ON mm.MakeId = m.Id
            WHERE {self.make_filter.get_sql_filter('m.Name')}
            ORDER BY mfr.Id
        """
    elif table_name == 'Make':
        # Filter makes directly by brand names (GMC, Ford, Toyota, etc.)
        query = f"""
            SELECT * FROM dbo.Make
            WHERE {self.make_filter.get_sql_filter('Name')}
            ORDER BY Id
        """
    elif table_name == 'Model':
        # Filter models by allowed make brand names
        query = f"""
            SELECT md.* FROM dbo.Model md
            JOIN dbo.Make_Model mm ON md.Id = mm.ModelId
            JOIN dbo.Make m ON mm.MakeId = m.Id
            WHERE {self.make_filter.get_sql_filter('m.Name')}
            ORDER BY md.Id
        """
    elif table_name == 'Wmi':
        # Filter WMI records by allowed manufacturers (linked to makes) AND makes directly
        query = f"""
            SELECT w.* FROM dbo.Wmi w
            WHERE w.PublicAvailabilityDate <= GETDATE()
            AND w.ManufacturerId IN (
                SELECT DISTINCT mfr.Id
                FROM dbo.Manufacturer mfr
                JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
                JOIN dbo.Make m ON mm.MakeId = m.Id
                WHERE {self.make_filter.get_sql_filter('m.Name')}
            )
            AND w.MakeId IN (
                SELECT Id FROM dbo.Make
                WHERE {self.make_filter.get_sql_filter('Name')}
            )
            ORDER BY w.Id
        """
    else:
        # No filtering for other reference tables; the name was validated above.
        query = f"SELECT * FROM dbo.{table_name} ORDER BY Id"

    with db_connections.mssql_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(query)
        rows = cursor.fetchall()
        results = self._rows_to_dicts(cursor, rows)

    logger.info(f"Extracted {len(results)} records from {table_name} (filtered by allowed makes)")
    return results
def extract_make_model_relationships(self) -> List[Dict]:
    """
    Extract Make-Model link rows for the allowed makes.

    Each row carries the two ids plus the make and model names, ordered by
    MakeId, then ModelId.

    Returns:
        One dict per relationship row, keyed by column name.
    """
    logger.info("Extracting Make-Model relationships with make filtering")

    make_clause = self.make_filter.get_sql_filter('m.Name')
    query = f"""
        SELECT
            mm.MakeId,
            mm.ModelId,
            m.Name as MakeName,
            md.Name as ModelName
        FROM dbo.Make_Model mm
        JOIN dbo.Make m ON mm.MakeId = m.Id
        JOIN dbo.Model md ON mm.ModelId = md.Id
        WHERE {make_clause}
        ORDER BY mm.MakeId, mm.ModelId
    """

    with db_connections.mssql_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(query)
        fetched = cursor.fetchall()
        relationships = self._rows_to_dicts(cursor, fetched)

    logger.info(f"Extracted {len(relationships)} Make-Model relationships (filtered by allowed makes)")
    return relationships
def extract_wmi_make_relationships(self) -> List[Dict]:
    """
    Extract WMI-Make link rows with make filtering.

    A row is returned only when the WMI is publicly available, its
    manufacturer is linked to an allowed make, the WMI's own MakeId is an
    allowed make, and the linked make itself is allowed. Rows are ordered
    by WmiId, then MakeId.

    Returns:
        One dict per relationship row, keyed by column name.
    """
    logger.info("Extracting WMI-Make relationships with make filtering")

    # The filter is rendered once per place it appears in the statement.
    manufacturer_make_clause = self.make_filter.get_sql_filter('mk.Name')
    wmi_make_clause = self.make_filter.get_sql_filter('Name')
    linked_make_clause = self.make_filter.get_sql_filter('Name')
    query = f"""
        SELECT
            wm.WmiId,
            wm.MakeId,
            w.Wmi,
            m.Name as MakeName
        FROM dbo.Wmi_Make wm
        JOIN dbo.Wmi w ON wm.WmiId = w.Id
        JOIN dbo.Make m ON wm.MakeId = m.Id
        WHERE w.PublicAvailabilityDate <= GETDATE()
        AND w.ManufacturerId IN (
            SELECT DISTINCT mfr.Id
            FROM dbo.Manufacturer mfr
            JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
            JOIN dbo.Make mk ON mm.MakeId = mk.Id
            WHERE {manufacturer_make_clause}
        )
        AND w.MakeId IN (
            SELECT Id FROM dbo.Make
            WHERE {wmi_make_clause}
        )
        AND m.Id IN (
            SELECT Id FROM dbo.Make
            WHERE {linked_make_clause}
        )
        ORDER BY wm.WmiId, wm.MakeId
    """

    with db_connections.mssql_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(query)
        fetched = cursor.fetchall()
        relationships = self._rows_to_dicts(cursor, fetched)

    logger.info(f"Extracted {len(relationships)} WMI-Make relationships (filtered by allowed makes)")
    return relationships
def _rows_to_dicts(self, cursor, rows) -> List[Dict]:
    """
    Convert fetched rows into dicts keyed by column name.

    Column names are taken from the first item of each entry in
    ``cursor.description``. The cursor is not consulted when ``rows`` is
    empty or None, in which case an empty list is returned.
    """
    if not rows:
        return []

    names = [column[0] for column in cursor.description]
    converted: List[Dict] = []
    for record in rows:
        converted.append({name: record[index] for index, name in enumerate(names)})
    return converted
def _row_to_dict(self, cursor, row) -> Dict:
    """
    Convert a single fetched row into a dict keyed by column name.

    Returns an empty dict when ``row`` is None (for example when
    ``fetchone()`` found nothing); the cursor is not consulted in that case.
    """
    if row is None:
        return {}

    names = [column[0] for column in cursor.description]
    return {name: row[index] for index, name in enumerate(names)}

View File

@@ -0,0 +1,63 @@
import logging
from typing import Optional, Dict, Any, List
from ..connections import db_connections
logger = logging.getLogger(__name__)
class VinProcExtractor:
    """Utilities to inspect and sample the MSSQL VIN decode stored procedure."""

    def __init__(self, proc_name: str = 'dbo.spVinDecode'):
        """
        Args:
            proc_name: Name of the procedure that ``sample_execute`` runs.
        """
        self.proc_name = proc_name

    def find_proc(self) -> Optional[Dict[str, Any]]:
        """
        Locate the VIN decode object by name pattern and return basic metadata.

        The lookup matches any entry in ``sys.objects`` whose name is like
        ``%Vin%Decode%`` and picks the most recently created one; it does
        not use ``self.proc_name`` and does not restrict the object type,
        so callers should inspect ``type_desc``.

        Returns:
            Dict with 'object_name', 'schema_name' and 'type_desc', or None
            when nothing matches.
        """
        query = """
            SELECT TOP 1
                o.name AS object_name,
                s.name AS schema_name,
                o.type_desc
            FROM sys.objects o
            JOIN sys.schemas s ON s.schema_id = o.schema_id
            WHERE o.name LIKE '%Vin%Decode%'
            ORDER BY o.create_date DESC
        """
        with db_connections.mssql_connection() as conn:
            cur = conn.cursor()
            cur.execute(query)
            row = cur.fetchone()

        if not row:
            logger.warning("VIN decode stored procedure not found by pattern")
            return None
        return {'object_name': row[0], 'schema_name': row[1], 'type_desc': row[2]}

    def get_definition(self, schema: str, name: str) -> str:
        """
        Return the text definition of an object via ``sp_helptext``.

        Args:
            schema: Schema that owns the object.
            name: Object name within that schema.

        Returns:
            The definition, assembled from the line segments that
            ``sp_helptext`` returns.
        """
        # The qualified name is bound as a parameter instead of being
        # spliced into the statement, so quotes in either part cannot
        # break or alter the SQL.
        sql = "EXEC sp_helptext @objname = ?"
        with db_connections.mssql_connection() as conn:
            cur = conn.cursor()
            cur.execute(sql, (f"{schema}.{name}",))
            # sp_helptext returns a single NVARCHAR column with line segments
            return ''.join(row[0] for row in cur.fetchall())

    def sample_execute(self, vin: str) -> Optional[List[Dict[str, Any]]]:
        """
        Execute the VIN decode proc with a VIN to capture its output shape.

        Args:
            vin: VIN passed as the procedure's ``@VIN`` argument.

        Returns:
            One dict per result row keyed by column name (an empty list
            when the call produces no result set), or None when execution
            fails; the failure is logged as a warning.
        """
        # Prefer proc signature with @VIN only; if it requires year, MSSQL will error.
        sql = f"EXEC {self.proc_name} @VIN=?"
        with db_connections.mssql_connection() as conn:
            cur = conn.cursor()
            try:
                cur.execute(sql, (vin,))
                # description is None when the statement returned no result set.
                if not cur.description:
                    return []
                columns = [c[0] for c in cur.description]
                return [dict(zip(columns, r)) for r in cur.fetchall()]
            except Exception as e:
                # Deliberately best-effort: this is an inspection helper.
                logger.warning(f"VIN proc sample execution failed: {e}")
                return None

View File

@@ -0,0 +1 @@
# ETL Loaders

View File

@@ -0,0 +1,716 @@
"""
JSON Manual Loader for Vehicles ETL
Loads extracted JSON data into PostgreSQL database with referential integrity.
Supports clear/append modes with duplicate handling and comprehensive progress tracking.
Database Schema:
- vehicles.make (id, name)
- vehicles.model (id, make_id, name)
- vehicles.model_year (id, model_id, year)
- vehicles.trim (id, model_year_id, name)
- vehicles.engine (id, name, code, displacement_l, cylinders, fuel_type, aspiration)
- vehicles.trim_engine (trim_id, engine_id)
Load Modes:
- CLEAR: Truncate all tables and reload (destructive)
- APPEND: Insert with conflict resolution (safe)
Usage:
loader = JsonManualLoader(postgres_loader)
result = loader.load_all_makes(extraction_result.makes, LoadMode.APPEND)
"""
import logging
from typing import List, Dict, Optional, Tuple
from enum import Enum
from dataclasses import dataclass
from psycopg2.extras import execute_batch
# Import our components (handle both relative and direct imports)
try:
from .postgres_loader import PostgreSQLLoader
from ..extractors.json_extractor import MakeData, ModelData, ExtractionResult
from ..utils.engine_spec_parser import EngineSpec
from ..connections import db_connections
except ImportError:
# Fallback for direct execution
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
# Import with fallback handling for nested imports
try:
from loaders.postgres_loader import PostgreSQLLoader
except ImportError:
# Mock PostgreSQLLoader for testing
class PostgreSQLLoader:
def __init__(self):
self.batch_size = 1000
from extractors.json_extractor import MakeData, ModelData, ExtractionResult
from utils.engine_spec_parser import EngineSpec
try:
from connections import db_connections
except ImportError:
# Mock db_connections for testing
class MockDBConnections:
def postgres_connection(self):
raise NotImplementedError("Database connection not available in test mode")
db_connections = MockDBConnections()
logger = logging.getLogger(__name__)
class LoadMode(Enum):
    """
    How JSON-extracted data is written to the database.

    CLEAR truncates the vehicles tables and reloads them (destructive);
    APPEND inserts into the existing data with conflict handling (safe).
    """

    CLEAR = "clear"
    APPEND = "append"
@dataclass
class LoadResult:
    """
    Outcome of a bulk load.

    ``total_makes`` is the number of makes submitted for loading; the other
    totals count rows inserted during the run. ``failed_makes`` lists the
    names of makes that were skipped or failed.
    """

    total_makes: int
    total_models: int
    total_model_years: int
    total_trims: int
    total_engines: int
    total_trim_engine_mappings: int
    failed_makes: List[str]
    warnings: List[str]
    load_mode: LoadMode

    @property
    def success_count(self) -> int:
        """Number of submitted makes that are not listed in ``failed_makes``."""
        failed = len(self.failed_makes)
        return self.total_makes - failed

    @property
    def success_rate(self) -> float:
        """Fraction of makes loaded successfully; 0.0 when none were submitted."""
        if self.total_makes > 0:
            return self.success_count / self.total_makes
        return 0.0
@dataclass
class LoadStatistics:
    """
    Mutable counters accumulated while loading makes.

    ``errors`` and ``warnings`` default to None in the signature and are
    replaced with a fresh empty list per instance after construction, so
    instances never share a list.
    """

    makes_processed: int = 0
    makes_skipped: int = 0
    models_inserted: int = 0
    model_years_inserted: int = 0
    skipped_model_years: int = 0
    trims_inserted: int = 0
    engines_inserted: int = 0
    trim_engine_mappings_inserted: int = 0
    duplicate_makes: int = 0
    duplicate_models: int = 0
    duplicate_engines: int = 0
    errors: List[str] = None
    warnings: List[str] = None

    def __post_init__(self):
        # Swap the None placeholders for per-instance lists.
        self.errors = [] if self.errors is None else self.errors
        self.warnings = [] if self.warnings is None else self.warnings
class JsonManualLoader:
"""Load JSON-extracted vehicle data into PostgreSQL"""
def _get_id_from_result(self, result, column_name='id'):
    """
    Pull a single value out of a fetched row, for tuple or dict rows.

    Tuple rows yield their first element. Mapping rows are probed for
    ``column_name``, then for 'count' (the key a COUNT(*) query may use),
    and finally fall back to their first value. None is returned for a
    missing row or an empty mapping.
    """
    if result is None:
        return None

    if isinstance(result, tuple):
        return result[0]

    # Mapping-style row (e.g. RealDictCursor): preferred keys first.
    for key in (column_name, 'count'):
        if key in result:
            return result[key]

    if not result:
        return None
    return list(result.values())[0]
def __init__(self, postgres_loader: Optional[PostgreSQLLoader] = None):
    """
    Initialize JSON manual loader

    Args:
        postgres_loader: Existing PostgreSQL loader instance; a new
            PostgreSQLLoader is created when omitted.
    """
    self.postgres_loader = postgres_loader if postgres_loader else PostgreSQLLoader()
    self.batch_size = 1000
    logger.info("JsonManualLoader initialized")
def clear_all_tables(self) -> None:
    """
    Clear all vehicles tables in dependency order

    Each table is truncated and committed on its own. A table that cannot
    be truncated is logged and skipped, and the remaining tables are still
    processed.

    WARNING: This is destructive and will remove all data
    """
    logger.warning("CLEARING ALL VEHICLES TABLES - This is destructive!")

    tables_to_clear = [
        'trim_engine',        # Many-to-many mappings first
        'trim_transmission',
        'performance',        # Tables with foreign keys
        'trim',
        'model_year',
        'model',
        'make',
        'engine',             # Independent tables last
        'transmission'
    ]
    failed_tables = []

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()
        for table in tables_to_clear:
            try:
                cursor.execute(f"TRUNCATE TABLE vehicles.{table} CASCADE")
                # Commit per table: once a statement fails, PostgreSQL
                # rejects every later statement in the same transaction, so
                # sharing one transaction would let a single bad table
                # undo all of the others.
                conn.commit()
                logger.info(f"Cleared vehicles.{table}")
            except Exception as e:
                # Reset the failed transaction so the next table can run.
                conn.rollback()
                failed_tables.append(table)
                logger.warning(f"Failed to clear vehicles.{table}: {str(e)}")

    if failed_tables:
        logger.warning(
            f"Vehicles tables cleared with {len(failed_tables)} failure(s): {', '.join(failed_tables)}"
        )
    else:
        logger.info("All vehicles tables cleared")
def load_make(self, make_data: MakeData, mode: LoadMode, stats: LoadStatistics) -> int:
    """
    Load a single make with all related data

    The make row is looked up case-insensitively and inserted only when
    missing; its models are then loaded on the same cursor and everything
    is committed together.

    Args:
        make_data: Extracted make data
        mode: Loading mode (clear/append); passed through to load_model
        stats: Statistics accumulator

    Returns:
        Make ID in database

    Raises:
        Exception: Any failure is logged, appended to ``stats.errors`` and
            re-raised.
    """
    logger.debug(f"Loading make: {make_data.name}")
    lookup_sql = "SELECT id FROM vehicles.make WHERE lower(name) = lower(%s)"

    try:
        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()

            # 1. Insert or get make (case-insensitive to match database constraint)
            cursor.execute(lookup_sql, (make_data.name,))
            existing = cursor.fetchone()

            if existing:
                make_id = self._get_id_from_result(existing)
                stats.duplicate_makes += 1
                logger.debug(f"Make {make_data.name} already exists with ID {make_id}")
            else:
                # ON CONFLICT DO NOTHING keeps a concurrent insert of the
                # same make from raising. A raised unique violation would
                # abort the transaction and make any follow-up lookup on
                # this cursor fail.
                cursor.execute(
                    "INSERT INTO vehicles.make (name) VALUES (%s) "
                    "ON CONFLICT DO NOTHING RETURNING id",
                    (make_data.name,)
                )
                inserted = cursor.fetchone()
                if inserted:
                    make_id = self._get_id_from_result(inserted)
                    logger.debug(f"Inserted make {make_data.name} with ID {make_id}")
                else:
                    # Nothing inserted: another writer created it first.
                    cursor.execute(lookup_sql, (make_data.name,))
                    existing = cursor.fetchone()
                    if not existing:
                        raise RuntimeError(
                            f"Make {make_data.name} conflicted with an existing row that could not be found"
                        )
                    make_id = self._get_id_from_result(existing)
                    stats.duplicate_makes += 1
                    logger.debug(f"Make {make_data.name} found after retry with ID {make_id}")

            # 2. Process models
            for model_data in make_data.models:
                self.load_model(cursor, make_id, model_data, mode, stats)

            conn.commit()
            stats.makes_processed += 1
            return make_id
    except Exception as e:
        error_msg = f"Failed to load make {make_data.name}: {str(e)}"
        logger.error(error_msg)
        stats.errors.append(error_msg)
        raise
def load_model(self, cursor, make_id: int, model_data: ModelData, mode: LoadMode, stats: LoadStatistics) -> int:
    """
    Load a single model with all related data

    In APPEND mode an existing (make_id, name) row is reused and counted as
    a duplicate; otherwise, and always in CLEAR mode, a new row is
    inserted. Every year of the model is then passed to load_model_year.

    Args:
        cursor: Database cursor
        make_id: Parent make ID
        model_data: Extracted model data
        mode: Loading mode
        stats: Statistics accumulator

    Returns:
        Model ID in database
    """
    # 1. Insert or get model
    needs_insert = True
    if mode == LoadMode.APPEND:
        cursor.execute(
            "SELECT id FROM vehicles.model WHERE make_id = %s AND name = %s",
            (make_id, model_data.name)
        )
        existing = cursor.fetchone()
        if existing:
            model_id = existing[0] if isinstance(existing, tuple) else existing['id']
            stats.duplicate_models += 1
            needs_insert = False

    if needs_insert:
        # Either CLEAR mode, or APPEND mode with no matching row.
        cursor.execute(
            "INSERT INTO vehicles.model (make_id, name) VALUES (%s, %s) RETURNING id",
            (make_id, model_data.name)
        )
        model_id = self._get_id_from_result(cursor.fetchone())
        stats.models_inserted += 1

    # 2. Insert model years and related data. load_model_year itself skips
    # years outside the valid range, so its result is not needed here.
    for year in model_data.years:
        self.load_model_year(cursor, model_id, year, model_data, mode, stats)

    return model_id
def load_model_year(self, cursor, model_id: int, year: int, model_data: ModelData, mode: LoadMode, stats: LoadStatistics) -> Optional[int]:
    """
    Load model year and associated trims/engines

    Every engine in ``model_data.engines`` is resolved to an id, and every
    trim in ``model_data.trims`` is loaded under this model year and linked
    to all of those engines.

    Args:
        cursor: Database cursor
        model_id: Parent model ID
        year: Model year
        model_data: Model data with trims and engines
        mode: Loading mode
        stats: Statistics accumulator

    Returns:
        Model year ID in database, or None when ``year`` is outside
        1950-2100 and the year was skipped.
    """
    # Skip years that don't meet database constraints (must be 1950-2100)
    if year < 1950 or year > 2100:
        logger.warning(f"Skipping year {year} - outside valid range (1950-2100)")
        stats.skipped_model_years += 1
        return None

    # 1. Insert or get model year
    model_year_id = None
    if mode == LoadMode.APPEND:
        cursor.execute(
            "SELECT id FROM vehicles.model_year WHERE model_id = %s AND year = %s",
            (model_id, year)
        )
        existing = cursor.fetchone()
        if existing:
            model_year_id = self._get_id_from_result(existing)

    if model_year_id is None:
        # CLEAR mode, or APPEND mode with no matching row.
        cursor.execute(
            "INSERT INTO vehicles.model_year (model_id, year) VALUES (%s, %s) RETURNING id",
            (model_id, year)
        )
        model_year_id = self._get_id_from_result(cursor.fetchone())
        stats.model_years_inserted += 1

    # 2. Load engines and get their IDs
    engine_ids = [
        self.load_engine(cursor, engine_spec, mode, stats)
        for engine_spec in model_data.engines
    ]

    # 3. Load trims and connect to engines
    for trim_name in model_data.trims:
        self.load_trim(cursor, model_year_id, trim_name, engine_ids, mode, stats)

    return model_year_id
def load_engine(self, cursor, engine_spec: EngineSpec, mode: LoadMode, stats: LoadStatistics) -> int:
    """
    Load engine specification

    The engine is identified by a canonical name of the form
    "<displacement>L <configuration><cylinders>" when all three parts are
    known, otherwise by the raw source string, and by a code derived from
    that name. An existing row matching the name (case-insensitively) or
    the code is reused and counted as a duplicate.

    Args:
        cursor: Database cursor
        engine_spec: Parsed engine specification
        mode: Loading mode (not used for engines; lookups always run)
        stats: Statistics accumulator

    Returns:
        Engine ID in database

    Raises:
        RuntimeError: If the insert conflicted with an existing row that
            the name/code lookup cannot find.
    """
    # Create a canonical engine name for database storage
    if engine_spec.displacement_l and engine_spec.configuration != "Unknown" and engine_spec.cylinders:
        engine_name = f"{engine_spec.displacement_l}L {engine_spec.configuration}{engine_spec.cylinders}"
    else:
        engine_name = engine_spec.raw_string

    # Generate engine code from name (remove spaces, lowercase)
    engine_code = engine_name.replace(" ", "").lower()

    lookup_sql = """
        SELECT id FROM vehicles.engine
        WHERE lower(name) = lower(%s) OR (code IS NOT NULL AND code = %s)
    """
    lookup_params = (engine_name, engine_code)

    # Always check for existing engine by name or code to avoid constraint violations
    cursor.execute(lookup_sql, lookup_params)
    existing = cursor.fetchone()
    if existing:
        stats.duplicate_engines += 1
        return self._get_id_from_result(existing)

    # Insert new engine. ON CONFLICT DO NOTHING turns a concurrent
    # duplicate into "no row returned" instead of an error; a raised unique
    # violation would abort the transaction and make the follow-up lookup
    # on this cursor fail.
    cursor.execute("""
        INSERT INTO vehicles.engine (name, code, displacement_l, cylinders, fuel_type, aspiration)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT DO NOTHING
        RETURNING id
    """, (
        engine_name,
        engine_code,
        engine_spec.displacement_l,
        engine_spec.cylinders,
        # "Unknown" fuel type and "Natural" aspiration are stored as NULL.
        engine_spec.fuel_type if engine_spec.fuel_type != "Unknown" else None,
        engine_spec.aspiration if engine_spec.aspiration != "Natural" else None
    ))
    inserted = cursor.fetchone()
    if inserted:
        stats.engines_inserted += 1
        return self._get_id_from_result(inserted)

    # Nothing inserted: another writer created the engine first.
    cursor.execute(lookup_sql, lookup_params)
    existing = cursor.fetchone()
    if existing:
        stats.duplicate_engines += 1
        return self._get_id_from_result(existing)

    raise RuntimeError(
        f"Engine {engine_name!r} conflicted with an existing row that could not be found by name or code"
    )
def load_trim(self, cursor, model_year_id: int, trim_name: str, engine_ids: List[int], mode: LoadMode, stats: LoadStatistics) -> int:
    """
    Load trim and connect to engines

    In APPEND mode an existing (model_year_id, name) row is reused;
    otherwise, and always in CLEAR mode, a new trim row is inserted. The
    trim is then linked to each distinct engine id, skipping links that
    already exist.

    Args:
        cursor: Database cursor
        model_year_id: Parent model year ID
        trim_name: Trim name
        engine_ids: List of engine IDs to connect
        mode: Loading mode
        stats: Statistics accumulator

    Returns:
        Trim ID in database
    """
    # 1. Insert or get trim
    trim_id = None
    if mode == LoadMode.APPEND:
        cursor.execute(
            "SELECT id FROM vehicles.trim WHERE model_year_id = %s AND name = %s",
            (model_year_id, trim_name)
        )
        existing = cursor.fetchone()
        if existing:
            trim_id = self._get_id_from_result(existing)

    if trim_id is None:
        # CLEAR mode, or APPEND mode with no matching row.
        cursor.execute(
            "INSERT INTO vehicles.trim (model_year_id, name) VALUES (%s, %s) RETURNING id",
            (model_year_id, trim_name)
        )
        trim_id = self._get_id_from_result(cursor.fetchone())
        stats.trims_inserted += 1

    # 2. Connect trim to engines. dict.fromkeys drops repeated ids while
    # keeping the caller's order, so inserts happen in a stable order.
    for engine_id in dict.fromkeys(engine_ids):
        # Check if mapping already exists
        cursor.execute(
            "SELECT 1 FROM vehicles.trim_engine WHERE trim_id = %s AND engine_id = %s",
            (trim_id, engine_id)
        )
        if cursor.fetchone():
            continue

        # ON CONFLICT DO NOTHING covers a concurrent insert of the same
        # pair. Catching a raised unique violation is not enough: the
        # error aborts the transaction and every later statement fails.
        cursor.execute(
            "INSERT INTO vehicles.trim_engine (trim_id, engine_id) VALUES (%s, %s) "
            "ON CONFLICT DO NOTHING",
            (trim_id, engine_id)
        )
        if cursor.rowcount != 0:
            stats.trim_engine_mappings_inserted += 1
        else:
            logger.debug(f"Trim-engine mapping ({trim_id}, {engine_id}) already exists, skipping")

    return trim_id
def load_all_makes(self, makes_data: List[MakeData], mode: LoadMode) -> LoadResult:
    """
    Load every extracted make and summarize the outcome

    Makes that carry extraction errors are skipped, and makes whose load
    raises are recorded as failed; neither stops the remaining makes.

    Args:
        makes_data: List of extracted make data
        mode: Loading mode (clear/append)

    Returns:
        LoadResult with comprehensive statistics
    """
    logger.info(f"Starting bulk load of {len(makes_data)} makes in {mode.value} mode")

    # CLEAR mode empties the target tables before anything is loaded
    if mode == LoadMode.CLEAR:
        self.clear_all_tables()

    stats = LoadStatistics()
    failed_makes = []

    for entry in makes_data:
        try:
            if not entry.processing_errors:
                new_id = self.load_make(entry, mode, stats)
                logger.info(f"Successfully loaded make {entry.name} (ID: {new_id})")
            else:
                logger.warning(f"Skipping make {entry.name} due to extraction errors")
                stats.makes_skipped += 1
                failed_makes.append(entry.name)
        except Exception as e:
            # One bad make must not abort the whole bulk load
            logger.error(f"Failed to load make {entry.name}: {str(e)}")
            failed_makes.append(entry.name)

    # Fold the accumulated counters into the result object
    summary = LoadResult(
        total_makes=len(makes_data),
        total_models=stats.models_inserted,
        total_model_years=stats.model_years_inserted,
        total_trims=stats.trims_inserted,
        total_engines=stats.engines_inserted,
        total_trim_engine_mappings=stats.trim_engine_mappings_inserted,
        failed_makes=failed_makes,
        warnings=stats.warnings,
        load_mode=mode
    )

    logger.info(f"Bulk load complete: {summary.success_count}/{summary.total_makes} makes loaded successfully")
    logger.info(f"Data loaded: {summary.total_models} models, {summary.total_engines} engines, {summary.total_trims} trims")
    return summary
def get_database_statistics(self) -> Dict[str, int]:
    """
    Count the rows currently stored in each vehicles table

    Returns:
        Dictionary mapping table name to its record count
    """
    counts = {}
    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()
        for table in ('make', 'model', 'model_year', 'trim', 'engine', 'trim_engine'):
            cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table}")
            row = cursor.fetchone()
            # The row may be a plain tuple or a mapping keyed by column name
            if isinstance(row, tuple):
                counts[table] = row[0]
            else:
                counts[table] = row['count']
    return counts
def validate_referential_integrity(self) -> List[str]:
    """
    Validate referential integrity of loaded data

    Runs one COUNT(*) query per parent/child relationship and reports every
    relationship that has rows pointing at a missing parent.

    Returns:
        List of integrity issues found (empty if all good)
    """
    # (description used in the report, query counting the offending rows)
    checks = [
        ("orphaned models", """
            SELECT COUNT(*) FROM vehicles.model m
            LEFT JOIN vehicles.make mk ON m.make_id = mk.id
            WHERE mk.id IS NULL
        """),
        ("orphaned model_years", """
            SELECT COUNT(*) FROM vehicles.model_year my
            LEFT JOIN vehicles.model m ON my.model_id = m.id
            WHERE m.id IS NULL
        """),
        ("orphaned trims", """
            SELECT COUNT(*) FROM vehicles.trim t
            LEFT JOIN vehicles.model_year my ON t.model_year_id = my.id
            WHERE my.id IS NULL
        """),
        ("broken trim_engine mappings", """
            SELECT COUNT(*) FROM vehicles.trim_engine te
            LEFT JOIN vehicles.trim t ON te.trim_id = t.id
            LEFT JOIN vehicles.engine e ON te.engine_id = e.id
            WHERE t.id IS NULL OR e.id IS NULL
        """),
    ]

    issues = []
    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()
        for description, query in checks:
            cursor.execute(query)
            # Every query returns a single COUNT(*) column, so the value must be
            # looked up as 'count' for all of them (previously only the first
            # check did so). NOTE(review): assumes the second argument of
            # _get_id_from_result is the column key for dict-style rows - confirm.
            count = self._get_id_from_result(cursor.fetchone(), 'count')
            if count > 0:
                issues.append(f"Found {count} {description}")

    if issues:
        logger.warning(f"Referential integrity issues found: {issues}")
    else:
        logger.info("Referential integrity validation passed")
    return issues
def print_load_report(self, result: LoadResult) -> None:
    """
    Print a human-readable report for a finished load

    The report covers the load summary, inserted record counts, failed
    makes, the first five warnings, live table counts, and the outcome of
    the referential integrity validation.

    Args:
        result: LoadResult from load operation
    """
    print("🚀 JSON MANUAL LOADING REPORT")
    print("=" * 50)

    # Load summary
    print("\n📊 LOADING SUMMARY")
    print(f" Mode: {result.load_mode.value.upper()}")
    print(f" Makes processed: {result.success_count}/{result.total_makes}")
    print(f" Success rate: {result.success_rate:.1%}")

    # Data counts - attributes are read lazily, one per printed line
    print("\n📈 DATA LOADED")
    loaded_fields = (
        ("Models", "total_models"),
        ("Model years", "total_model_years"),
        ("Trims", "total_trims"),
        ("Engines", "total_engines"),
        ("Trim-engine mappings", "total_trim_engine_mappings"),
    )
    for label, attr in loaded_fields:
        print(f" {label}: {getattr(result, attr)}")

    # Issues
    if result.failed_makes:
        print(f"\n⚠️ FAILED MAKES ({len(result.failed_makes)}):")
        for failed in result.failed_makes:
            print(f" {failed}")

    if result.warnings:
        print(f"\n⚠️ WARNINGS ({len(result.warnings)}):")
        # Only the first five warnings are listed; the rest are summarized
        for text in result.warnings[:5]:
            print(f" {text}")
        if len(result.warnings) > 5:
            print(f" ... and {len(result.warnings) - 5} more warnings")

    # Database statistics
    print("\n📋 DATABASE STATISTICS:")
    for table, count in self.get_database_statistics().items():
        print(f" vehicles.{table}: {count:,} records")

    # Referential integrity
    problems = self.validate_referential_integrity()
    if not problems:
        print("\n✅ REFERENTIAL INTEGRITY: PASSED")
    else:
        print("\n❌ REFERENTIAL INTEGRITY ISSUES:")
        for problem in problems:
            print(f" {problem}")
# Example usage and testing functions
def example_usage():
    """Print a short walkthrough of how JsonManualLoader is meant to be used"""
    print("🚀 JsonManualLoader Example Usage")
    print("=" * 40)

    # This would typically be called after JsonExtractor;
    # for demo purposes only the structure is shown
    print("\n📋 Typical usage flow:")
    steps = (
        "1. Extract data with JsonExtractor",
        "2. Create JsonManualLoader",
        "3. Load data in APPEND or CLEAR mode",
        "4. Validate and report results",
    )
    for step in steps:
        print(step)

    print("\n💡 Example code:")
    snippet = """
# Extract data
extractor = JsonExtractor(make_mapper, engine_parser)
extraction_result = extractor.extract_all_makes('sources/makes')
# Load data
loader = JsonManualLoader()
load_result = loader.load_all_makes(extraction_result.makes, LoadMode.APPEND)
# Report results
loader.print_load_report(load_result)
"""
    print(snippet)


if __name__ == "__main__":
    example_usage()

View File

@@ -0,0 +1,437 @@
#!/usr/bin/env python3
"""
MSSQL Database Loader
Handles loading .bak files into MSSQL Server for ETL processing
"""
import os
import logging
import pyodbc
import time
from pathlib import Path
from typing import Optional, List
from ..config import config
logger = logging.getLogger(__name__)
class MSSQLLoader:
"""Loads database files into MSSQL Server"""
def __init__(self):
    """Copy the MSSQL connection settings from the ETL config onto the loader"""
    # Where to connect
    self.server, self.port = config.MSSQL_HOST, config.MSSQL_PORT
    # Default database used when a method is not given an explicit name
    self.database = config.MSSQL_DATABASE
    # Credentials
    self.username, self.password = config.MSSQL_USER, config.MSSQL_PASSWORD
def get_connection_string(self, database: str = "master") -> str:
    """Build the ODBC connection string for the given database (default: master)"""
    settings = (
        "DRIVER={ODBC Driver 17 for SQL Server}",
        f"SERVER={self.server},{self.port}",
        f"DATABASE={database}",
        f"UID={self.username}",
        f"PWD={self.password}",
        "TrustServerCertificate=yes",
    )
    # Every key/value pair, including the last one, is terminated by ';'
    return ";".join(settings) + ";"
def test_connection(self) -> bool:
    """Check that the server is reachable by asking it for its version string

    Returns:
        True when the query succeeds, False when any error occurs
    """
    try:
        dsn = self.get_connection_string()
        logger.info(f"Testing MSSQL connection to: {self.server}")
        with pyodbc.connect(dsn, timeout=30) as connection:
            cur = connection.cursor()
            cur.execute("SELECT @@VERSION")
            banner = cur.fetchone()[0]
            # Only the first 100 characters of the version banner are logged
            logger.info(f"MSSQL connection successful: {banner[:100]}...")
            return True
    except Exception as e:
        logger.error(f"MSSQL connection failed: {e}")
        return False
def database_exists(self, database_name: str) -> bool:
    """Report whether sys.databases lists a database with this name

    Returns:
        True if at least one row matches; False if none match or the
        lookup itself fails (the failure is logged)
    """
    lookup_sql = "SELECT COUNT(*) FROM sys.databases WHERE name = ?"
    try:
        with pyodbc.connect(self.get_connection_string(), timeout=30) as connection:
            cur = connection.cursor()
            cur.execute(lookup_sql, (database_name,))
            matches = cur.fetchone()[0]
            return matches > 0
    except Exception as e:
        logger.error(f"Failed to check if database exists: {e}")
        return False
def get_database_state(self, database_name: str) -> Optional[str]:
    """Look up state_desc for a database in sys.databases

    Returns:
        The state_desc value, or None when the database is not listed or
        the lookup fails (the failure is logged)
    """
    state_sql = "SELECT state_desc FROM sys.databases WHERE name = ?"
    try:
        with pyodbc.connect(self.get_connection_string(), timeout=30) as connection:
            cur = connection.cursor()
            cur.execute(state_sql, (database_name,))
            found = cur.fetchone()
            if found:
                return found[0]
            return None
    except Exception as e:
        logger.error(f"Failed to get database state: {e}")
        return None
def drop_database(self, database_name: str) -> bool:
    """Drop database if it exists

    Args:
        database_name: Name of the database to drop

    Returns:
        True if the database was dropped or did not exist, False on error
    """
    try:
        if not self.database_exists(database_name):
            logger.info(f"Database {database_name} does not exist, skipping drop")
            return True

        logger.info(f"Dropping database: {database_name}")
        # A ']' inside a bracket-quoted identifier must be doubled
        quoted_name = "[" + database_name.replace("]", "]]") + "]"

        conn_str = self.get_connection_string()
        with pyodbc.connect(conn_str, timeout=30) as conn:
            # ALTER/DROP DATABASE cannot run inside a user transaction
            conn.autocommit = True
            cursor = conn.cursor()
            # Kill existing connections, then drop. The statements are sent
            # separately so that a failure in either one raises here instead of
            # being hidden behind the first statement of a multi-statement batch.
            cursor.execute(f"ALTER DATABASE {quoted_name} SET SINGLE_USER WITH ROLLBACK IMMEDIATE;")
            cursor.execute(f"DROP DATABASE {quoted_name};")

        logger.info(f"Successfully dropped database: {database_name}")
        return True
    except Exception as e:
        logger.error(f"Failed to drop database {database_name}: {e}")
        return False
def get_backup_file_info(self, bak_path: Path) -> Optional[dict]:
    """Get information about backup file

    Reads the first backup header via RESTORE HEADERONLY. Only the file
    name of bak_path is used; the file is expected under /backups as seen
    by the MSSQL server.

    Args:
        bak_path: Path to the .bak file

    Returns:
        Dict with database_name, server_name, backup_start_date,
        backup_finish_date and backup_size, or None when the header cannot
        be read
    """
    try:
        # Use the MSSQL container's mounted backup directory.
        # For now, assume the file is accessible there.
        container_path = f"/backups/{bak_path.name}"
        # Double any single quote so the file name stays inside the SQL literal
        disk_literal = container_path.replace("'", "''")

        conn_str = self.get_connection_string()
        with pyodbc.connect(conn_str, timeout=30) as conn:
            # Same setting restore_database uses for its RESTORE statements:
            # SQL Server rejects backup/restore operations inside a transaction
            conn.autocommit = True
            cursor = conn.cursor()
            # Get backup file information
            cursor.execute(f"RESTORE HEADERONLY FROM DISK = '{disk_literal}'")
            headers = cursor.fetchall()
            if headers:
                header = headers[0]
                return {
                    "database_name": header.DatabaseName,
                    "server_name": header.ServerName,
                    "backup_start_date": header.BackupStartDate,
                    "backup_finish_date": header.BackupFinishDate,
                    "backup_size": header.BackupSize,
                }
    except Exception as e:
        logger.warning(f"Could not get backup file info: {e}")
    # No header rows, or the lookup failed
    return None
def restore_database(self, bak_path: Path, target_database: Optional[str] = None) -> bool:
    """
    Restore database from .bak file

    Steps: copy the backup to the shared volume, force exclusive access if
    the target already exists, read the logical file names from the backup,
    run RESTORE ... WITH MOVE/REPLACE, wait for the database to report
    ONLINE, then switch it back to MULTI_USER.

    Args:
        bak_path: Path to .bak file
        target_database: Target database name (defaults to the configured
            MSSQL database)

    Returns:
        True if successful, False on any failure (failures are logged)
    """
    if target_database is None:
        target_database = self.database

    if not bak_path.exists():
        logger.error(f"Backup file does not exist: {bak_path}")
        return False

    logger.info(f"Starting database restore: {bak_path} -> {target_database}")

    # Built up front so the retry path below can always reuse them, even when
    # the target did not exist at the time of the first exclusivity check.
    kill_sql = f"""
        DECLARE @db sysname = N'{target_database}';
        DECLARE @kill nvarchar(max) = N'';
        SELECT @kill = @kill + N'KILL ' + CONVERT(nvarchar(10), session_id) + N';'
        FROM sys.dm_exec_sessions
        WHERE database_id = DB_ID(@db) AND session_id <> @@SPID;
        IF LEN(@kill) > 0 EXEC (@kill);
    """
    single_user_sql = f"ALTER DATABASE [{target_database}] SET SINGLE_USER WITH ROLLBACK IMMEDIATE;"

    try:
        # Copy backup file to MSSQL container
        container_bak_path = self.copy_backup_to_container(bak_path)
        if not container_bak_path:
            logger.error("Failed to copy backup file to container")
            return False

        # If database exists, note the state; exclusivity is handled in the restore session below
        if self.database_exists(target_database):
            state = self.get_database_state(target_database)
            logger.info(f"Existing database detected: {target_database} (state={state})")
        else:
            logger.info(f"Target database does not exist yet: {target_database} — proceeding with restore")

        # Restore database using a single master connection for exclusivity
        logger.info(f"Restoring database from: {container_bak_path}")
        conn_str = self.get_connection_string()
        with pyodbc.connect(conn_str, timeout=600) as conn:  # 10 minute timeout
            conn.autocommit = True
            cursor = conn.cursor()

            # If DB exists, ensure exclusive access: kill sessions + SINGLE_USER in this session
            if self.database_exists(target_database):
                try:
                    logger.info(f"Preparing exclusive access for restore: killing active sessions on {target_database}")
                    cursor.execute(kill_sql)
                    # Force SINGLE_USER in current session
                    cursor.execute(single_user_sql)
                    logger.info(f"Exclusive access prepared (SINGLE_USER) for {target_database}")
                except Exception as e:
                    # Best effort: the restore itself is still attempted
                    logger.warning(f"Could not fully prepare exclusive access: {e}")

            # Get logical file names from backup
            cursor.execute(f"RESTORE FILELISTONLY FROM DISK = '{container_bak_path}'")
            files = cursor.fetchall()
            if not files:
                logger.error("No files found in backup")
                return False

            # Pick the logical names needed for the MOVE options
            data_file = None
            log_file = None
            for file_info in files:
                logical_name = file_info.LogicalName
                file_type = file_info.Type
                if file_type == 'D':  # Data file
                    data_file = logical_name
                elif file_type == 'L':  # Log file
                    log_file = logical_name

            if not data_file:
                logger.error("No data file found in backup")
                return False

            # Construct restore command
            restore_sql = f"""
                RESTORE DATABASE [{target_database}]
                FROM DISK = '{container_bak_path}'
                WITH
                MOVE '{data_file}' TO '/var/opt/mssql/data/{target_database}.mdf',
            """
            if log_file:
                restore_sql += f" MOVE '{log_file}' TO '/var/opt/mssql/data/{target_database}.ldf',"
            restore_sql += """
                REPLACE,
                RECOVERY,
                STATS = 10
            """

            logger.info(f"Executing restore command for database: {target_database}")
            logger.debug(f"Restore SQL: {restore_sql}")

            def run_restore():
                """Execute the RESTORE and consume everything it sends back."""
                cursor.execute(restore_sql)
                # RESTORE reports progress as additional results; they must be
                # drained so the statement runs to completion and late errors surface.
                while cursor.nextset():
                    pass

            try:
                run_restore()
            except Exception as e:
                # If we hit exclusive access error, retry once after killing sessions again
                if 'Exclusive access could not be obtained' in str(e):
                    logger.warning("Exclusive access error on RESTORE; retrying after killing sessions and reasserting SINGLE_USER...")
                    try:
                        cursor.execute(kill_sql)
                        cursor.execute(single_user_sql)
                    except Exception as e2:
                        logger.warning(f"Retry exclusive prep failed: {e2}")
                    run_restore()
                else:
                    raise

            # Poll for database to be ONLINE
            if not self._wait_for_database_online(target_database):
                logger.error(f"Database did not come ONLINE in time: {target_database}")
                return False

            # Small retry around database_exists to handle late readiness
            if self._retry_database_exists(target_database):
                logger.info(f"Database restore successful and ONLINE: {target_database}")

                # Get basic database info (bound parameter, like the other sys.databases lookups)
                cursor.execute(
                    """
                    SELECT
                        name,
                        create_date,
                        compatibility_level,
                        state_desc
                    FROM sys.databases
                    WHERE name = ?
                    """,
                    (target_database,)
                )
                db_info = cursor.fetchone()
                if db_info:
                    logger.info(f"Database info: Name={db_info.name}, Created={db_info.create_date}, Level={db_info.compatibility_level}, State={db_info.state_desc}")

                # Optional: quick content verification with small retry window
                if not self._retry_verify_content(target_database):
                    logger.warning("Database restored but content verification is inconclusive")

                # Try to set MULTI_USER back in same session
                try:
                    cursor.execute(f"ALTER DATABASE [{target_database}] SET MULTI_USER;")
                    logger.info(f"Set {target_database} back to MULTI_USER")
                except Exception as e:
                    logger.warning(f"Could not set MULTI_USER on {target_database}: {e}")

                return True
            else:
                logger.error(f"Database restore failed - database not found: {target_database}")
                return False
    except Exception as e:
        logger.error(f"Database restore failed: {e}")
        return False
def copy_backup_to_container(self, bak_path: Path) -> Optional[str]:
    """
    Place the backup in the shared volume and return its server-side path

    The file is copied into /app/shared unless it already lives there.

    Args:
        bak_path: Local path to .bak file

    Returns:
        Path of the .bak file under /backups, or None if the copy failed
    """
    try:
        # Use shared volume instead of docker cp
        shared_dir = Path("/app/shared")
        destination = shared_dir / bak_path.name

        already_shared = bak_path.resolve().parent == shared_dir.resolve()
        if not already_shared:
            logger.info(f"Copying {bak_path} to shared volume...")
            import shutil
            shutil.copy2(bak_path, destination)
        else:
            logger.info(f"Backup already in shared volume: {bak_path}")

        # Same file, addressed the way the MSSQL container sees it
        container_path = f"/backups/{destination.name}"
        logger.info(f"Successfully copied to shared volume: {container_path}")
        return container_path
    except Exception as e:
        logger.error(f"Failed to copy backup to shared volume: {e}")
        return None
def _wait_for_database_online(self, database_name: str, timeout_seconds: int = 600, interval_seconds: int = 5) -> bool:
    """Poll MSSQL until the specified database state becomes ONLINE or timeout.

    Args:
        database_name: Database to watch in sys.databases
        timeout_seconds: Maximum time to keep polling
        interval_seconds: Pause between polls

    Returns:
        True if ONLINE, False on timeout/error.
    """
    logger.info(f"Waiting for database to become ONLINE: {database_name}")
    # Monotonic clock: the deadline must not move if the wall clock is adjusted
    deadline = time.monotonic() + timeout_seconds
    last_state = None
    try:
        conn_str = self.get_connection_string()
        while time.monotonic() < deadline:
            conn = pyodbc.connect(conn_str, timeout=30)
            try:
                cursor = conn.cursor()
                cursor.execute("SELECT state_desc FROM sys.databases WHERE name = ?", (database_name,))
                row = cursor.fetchone()
                if row:
                    state = row[0]
                    # Log only state transitions to keep the output short
                    if state != last_state:
                        logger.info(f"Database state: {state}")
                        last_state = state
                    if state == 'ONLINE':
                        # Optional: report updateability; failure here does not change the result
                        try:
                            cursor.execute("SELECT DATABASEPROPERTYEX(?, 'Updateability')", (database_name,))
                            up = cursor.fetchone()[0]
                            logger.info(f"Database updateability: {up}")
                        except Exception as e:
                            logger.debug(f"Could not read updateability for {database_name}: {e}")
                        return True
                else:
                    logger.info("Database entry not found yet in sys.databases")
            finally:
                # pyodbc's context manager does not close the connection, so
                # close each per-poll connection explicitly
                conn.close()
            time.sleep(interval_seconds)
    except Exception as e:
        logger.error(f"Error while waiting for database ONLINE: {e}")
        return False
    logger.error("Timed out waiting for database to become ONLINE")
    return False
def _retry_database_exists(self, database_name: str, attempts: int = 6, delay_seconds: int = 5) -> bool:
    """Retry wrapper for database existence checks.

    Args:
        database_name: Database to look for
        attempts: Maximum number of database_exists() calls
        delay_seconds: Pause between two consecutive attempts

    Returns:
        True as soon as one attempt succeeds, False when all attempts fail
    """
    for i in range(1, attempts + 1):
        if self.database_exists(database_name):
            return True
        # Only announce a retry and wait when another attempt will follow
        if i < attempts:
            logger.info(f"database_exists() false, retrying ({i}/{attempts})...")
            time.sleep(delay_seconds)
    return False
def _retry_verify_content(self, database_name: str, attempts: int = 3, delay_seconds: int = 5) -> bool:
    """Retry wrapper around verify_database_content to allow late readiness.

    Args:
        database_name: Database to verify
        attempts: Maximum number of verification attempts
        delay_seconds: Pause between two consecutive attempts

    Returns:
        True as soon as verify_database_content returns a non-empty result,
        False when every attempt returns nothing or raises
    """
    for i in range(1, attempts + 1):
        try:
            counts = self.verify_database_content(database_name)
            if counts:
                logger.info(f"Content verification counts: {counts}")
                return True
        except Exception as e:
            logger.info(f"Content verification attempt {i} failed: {e}")
        # No point in waiting once the final attempt has been made
        if i < attempts:
            time.sleep(delay_seconds)
    return False
def verify_database_content(self, database_name: Optional[str] = None) -> dict:
    """
    Verify database has expected content

    Counts the rows of the Make, Model, VehicleType and Manufacturer tables.
    A table whose count query fails is reported with a count of 0.

    Args:
        database_name: Database to inspect (defaults to the configured
            MSSQL database)

    Returns:
        Dictionary with table counts; empty if the database could not be
        reached at all
    """
    if database_name is None:
        database_name = self.database

    try:
        conn_str = self.get_connection_string(database_name)
        with pyodbc.connect(conn_str, timeout=30) as conn:
            cursor = conn.cursor()
            # Get table counts for key tables
            tables_to_check = ['Make', 'Model', 'VehicleType', 'Manufacturer']
            counts = {}
            for table in tables_to_check:
                try:
                    cursor.execute(f"SELECT COUNT(*) FROM {table}")
                    count = cursor.fetchone()[0]
                    counts[table] = count
                    logger.info(f"Table {table}: {count:,} rows")
                except Exception as e:
                    # Keep the best-effort zero, but record why and do not
                    # swallow KeyboardInterrupt/SystemExit like a bare except
                    logger.debug(f"Could not count rows in {table}: {e}")
                    counts[table] = 0
            return counts
    except Exception as e:
        logger.error(f"Failed to verify database content: {e}")
        return {}

View File

@@ -0,0 +1,354 @@
import logging
from typing import List, Dict, Optional
from psycopg2.extras import execute_batch
from ..connections import db_connections
from tqdm import tqdm
logger = logging.getLogger(__name__)
class PostgreSQLLoader:
"""Load data into PostgreSQL target database"""
def __init__(self):
    """Set the loader defaults"""
    # Passed as page_size to execute_batch by the bulk load methods
    self.batch_size = 1000
def load_reference_table(self, table_name: str, data: List[Dict],
                         clear_existing: bool = True) -> int:
    """Load data into a reference table

    Source column names are translated to target column names, columns that
    do not exist in vehicles.<table_name> are dropped, and the remaining
    values are inserted with ON CONFLICT DO NOTHING. The column layout is
    taken from the first record.

    Args:
        table_name: Target table inside the vehicles schema
        data: Source records keyed by source column name
        clear_existing: Truncate the target table (CASCADE) before loading

    Returns:
        Number of rows in the target table after the load (0 when there is
        no data or no source column matches the target table)
    """
    if not data:
        logger.warning(f"No data to load for table {table_name}")
        return 0

    logger.info(f"Loading {len(data)} records into vehicles.{table_name}")

    # Column mapping from source (MS SQL) to target (PostgreSQL)
    column_mappings = {
        'Id': 'id',
        'Name': 'name',
        'Code': 'code',
        'MakeId': 'make_id',
        'CreateOn': 'created_at',
        'CreatedOn': 'created_at',
        'UpdateOn': 'updated_at',
        'UpdatedOn': 'updated_at',
        'Wmi': 'wmi',
        'ManufacturerId': 'manufacturer_id',
        'VehicleTypeId': 'vehicle_type_id',
        'TruckTypeId': 'truck_type_id',
        'CountryId': 'country_id',
        'PublicAvailabilityDate': 'public_availability_date',
        'NonCompliant': 'non_compliant',
        'NonCompliantReason': 'non_compliant_reason',
        'ProcessedOn': 'processed_on',
        'DisplayOrder': 'display_order',
        'FormType': 'form_type',
        'Description': 'description',
        'LookupTable': 'lookup_table',
        'IsPrivate': 'is_private',
        'GroupName': 'group_name',
        'DataType': 'data_type',
        'MinAllowedValue': 'min_allowed_value',
        'MaxAllowedValue': 'max_allowed_value',
        'IsQS': 'is_qs',
        'Decode': 'decode',
        'weight': 'weight',
        # ErrorCode specific mappings
        'ErrorCodeName': 'code',
        'ErrorCodeDescription': 'description'
    }

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()

        # Unmapped source columns fall back to their lower-cased name
        source_columns = list(data[0].keys())
        target_columns = [column_mappings.get(col, col.lower()) for col in source_columns]

        # Check which columns exist in target table (table name bound as a parameter)
        cursor.execute(
            """
            SELECT column_name
            FROM information_schema.columns
            WHERE table_schema = 'vehicles' AND table_name = %s
            """,
            (table_name,)
        )
        results = cursor.fetchall()
        existing_columns = {row['column_name'] if isinstance(row, dict) else row[0] for row in results}

        # Filter to only existing columns, remembering which source column feeds each one
        final_columns = []
        final_sources = []
        for source_col, target_col in zip(source_columns, target_columns):
            if target_col in existing_columns:
                final_columns.append(target_col)
                final_sources.append(source_col)

        # Bail out before touching the table when nothing could be inserted
        if not final_columns:
            logger.warning(f"No matching columns found for table {table_name}")
            return 0

        if clear_existing:
            cursor.execute(f"TRUNCATE TABLE vehicles.{table_name} CASCADE")
            logger.info(f"Cleared existing data from vehicles.{table_name}")

        column_str = ','.join(final_columns)
        placeholders = ','.join(['%s'] * len(final_columns))

        # Prepare insert query
        query = f"""
            INSERT INTO vehicles.{table_name} ({column_str})
            VALUES ({placeholders})
            ON CONFLICT DO NOTHING
        """

        # Special case for the error_codes table: records with an empty code are skipped
        required_sources = {'ErrorCodeName', 'Code'} if table_name == 'error_codes' else set()

        # Prepare data tuples with only valid columns
        data_tuples = []
        for record in data:
            values = []
            for source_col in final_sources:
                value = record[source_col]
                if source_col in required_sources and (value is None or value == ''):
                    break
                values.append(value)
            else:
                # No required value was missing
                data_tuples.append(tuple(values))

        # Execute batch insert
        execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
        conn.commit()

        # Get final count
        cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table_name}")
        result = cursor.fetchone()
        final_count = result['count'] if isinstance(result, dict) and 'count' in result else result[0]

        logger.info(f"Successfully loaded {final_count} records into vehicles.{table_name}")
        return final_count
def load_wmi_vin_schema_mappings(self, mappings: List[Dict]) -> int:
    """Replace the contents of vehicles.wmi_vin_schemas with the given mappings

    Missing or falsy year bounds default to 1980 (from) and 2999 (to).

    Returns:
        Row count of the table after the load, or 0 when no mappings are given
    """
    if not mappings:
        return 0

    logger.info(f"Loading {len(mappings)} WMI-VinSchema mappings")

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()

        # Start from an empty table
        cursor.execute("TRUNCATE TABLE vehicles.wmi_vin_schemas CASCADE")

        insert_sql = """
            INSERT INTO vehicles.wmi_vin_schemas
            (wmi_id, vin_schema_id, year_from, year_to)
            VALUES (%s, %s, %s, %s)
            ON CONFLICT DO NOTHING
        """
        rows = [
            (
                item['WmiId'],
                item['VinSchemaId'],
                item['YearFrom'] or 1980,
                item['YearTo'] or 2999
            )
            for item in mappings
        ]
        execute_batch(cursor, insert_sql, rows, page_size=self.batch_size)
        conn.commit()

        # Report what is actually stored
        cursor.execute("SELECT COUNT(*) FROM vehicles.wmi_vin_schemas")
        row = cursor.fetchone()
        if isinstance(row, dict) and 'count' in row:
            final_count = row['count']
        else:
            final_count = row[0]

        logger.info(f"Successfully loaded {final_count} WMI-VinSchema mappings")
        return final_count
def load_make_model_relationships(self, relationships: List[Dict]) -> int:
    """Replace the contents of vehicles.make_models with the given pairs

    Returns:
        Row count of the table after the load, or 0 when no relationships are given
    """
    if not relationships:
        return 0

    logger.info(f"Loading {len(relationships)} Make-Model relationships")

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()

        # Start from an empty table
        cursor.execute("TRUNCATE TABLE vehicles.make_models CASCADE")

        insert_sql = """
            INSERT INTO vehicles.make_models (make_id, model_id)
            VALUES (%s, %s)
            ON CONFLICT DO NOTHING
        """
        pairs = [(item['MakeId'], item['ModelId']) for item in relationships]
        execute_batch(cursor, insert_sql, pairs, page_size=self.batch_size)
        conn.commit()

        # Report what is actually stored
        cursor.execute("SELECT COUNT(*) FROM vehicles.make_models")
        row = cursor.fetchone()
        if isinstance(row, dict) and 'count' in row:
            final_count = row['count']
        else:
            final_count = row[0]

        logger.info(f"Successfully loaded {final_count} Make-Model relationships")
        return final_count
def load_wmi_make_relationships(self, relationships: List[Dict]) -> int:
    """Replace the contents of vehicles.wmi_makes with the given pairs

    Returns:
        Row count of the table after the load, or 0 when no relationships are given
    """
    if not relationships:
        return 0

    logger.info(f"Loading {len(relationships)} WMI-Make relationships")

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()

        # Start from an empty table
        cursor.execute("TRUNCATE TABLE vehicles.wmi_makes CASCADE")

        insert_sql = """
            INSERT INTO vehicles.wmi_makes (wmi_id, make_id)
            VALUES (%s, %s)
            ON CONFLICT DO NOTHING
        """
        pairs = [(item['WmiId'], item['MakeId']) for item in relationships]
        execute_batch(cursor, insert_sql, pairs, page_size=self.batch_size)
        conn.commit()

        # Report what is actually stored
        cursor.execute("SELECT COUNT(*) FROM vehicles.wmi_makes")
        row = cursor.fetchone()
        if isinstance(row, dict) and 'count' in row:
            final_count = row['count']
        else:
            final_count = row[0]

        logger.info(f"Successfully loaded {final_count} WMI-Make relationships")
        return final_count
def load_model_years(self, model_years: List[Dict]) -> int:
    """Insert (model_id, year) rows into vehicles.model_year, skipping existing pairs

    Returns:
        Number of records submitted (not the number actually inserted)
    """
    if not model_years:
        return 0

    logger.info(f"Loading {len(model_years)} model year records")

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()
        insert_sql = """
            INSERT INTO vehicles.model_year (model_id, year)
            VALUES (%s, %s)
            ON CONFLICT (model_id, year) DO NOTHING
        """
        pairs = [(entry['model_id'], entry['year']) for entry in model_years]
        execute_batch(cursor, insert_sql, pairs, page_size=self.batch_size)
        conn.commit()

    return len(model_years)
def load_trims(self, trims: List[Dict]) -> int:
    """Insert (model_year_id, name) rows into vehicles.trim, skipping conflicts

    Returns:
        Number of records submitted (not the number actually inserted)
    """
    if not trims:
        return 0

    logger.info(f"Loading {len(trims)} trim records")

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()
        insert_sql = """
            INSERT INTO vehicles.trim (model_year_id, name)
            VALUES (%s, %s)
            ON CONFLICT DO NOTHING
        """
        pairs = [(entry['model_year_id'], entry['name']) for entry in trims]
        execute_batch(cursor, insert_sql, pairs, page_size=self.batch_size)
        conn.commit()

    return len(trims)
def load_engines(self, engines: List[Dict]) -> int:
    """Load engine data

    Engines are inserted in batches; an engine whose lower-cased name
    already exists is skipped. Only 'name' is required in each record, the
    other attributes default to NULL.

    Args:
        engines: Engine records keyed by column name

    Returns:
        Number of records submitted (not the number actually inserted)
    """
    if not engines:
        return 0

    logger.info(f"Loading {len(engines)} engine records")

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()

        # No RETURNING clause: the generated ids were never read
        query = """
            INSERT INTO vehicles.engine (name, code, displacement_l, cylinders, fuel_type, aspiration)
            VALUES (%s, %s, %s, %s, %s, %s)
            ON CONFLICT (lower(name)) DO NOTHING
        """
        data_tuples = [
            (
                engine['name'],
                engine.get('code'),
                engine.get('displacement_l'),
                engine.get('cylinders'),
                engine.get('fuel_type'),
                engine.get('aspiration')
            )
            for engine in engines
        ]

        # Batched like the other loaders instead of one round trip per engine
        execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
        conn.commit()

    return len(engines)
def load_trim_engine_relationships(self, relationships: List[Dict]) -> int:
    """Insert (trim_id, engine_id) rows into vehicles.trim_engine, skipping existing pairs

    Returns:
        Number of records submitted (not the number actually inserted)
    """
    if not relationships:
        return 0

    logger.info(f"Loading {len(relationships)} trim-engine relationships")

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()
        insert_sql = """
            INSERT INTO vehicles.trim_engine (trim_id, engine_id)
            VALUES (%s, %s)
            ON CONFLICT (trim_id, engine_id) DO NOTHING
        """
        pairs = [(link['trim_id'], link['engine_id']) for link in relationships]
        execute_batch(cursor, insert_sql, pairs, page_size=self.batch_size)
        conn.commit()

    return len(relationships)
def get_table_count(self, table_name: str) -> int:
    """Return the number of rows in vehicles.<table_name>"""
    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table_name}")
        row = cursor.fetchone()
        # Dict-style rows expose the value as 'count'; otherwise use the first column
        if isinstance(row, dict) and 'count' in row:
            return row['count']
        return row[0]

View File

@@ -0,0 +1,348 @@
#!/usr/bin/env python3
import logging
import sys
import os
from datetime import datetime
from pathlib import Path
import click
from .config import config
from .utils.logging import setup_logging
from .scheduler import start_etl_scheduler
from .pipeline import run_etl_pipeline
from .connections import test_connections
# Import manual JSON processing components
try:
from .pipelines.manual_json_pipeline import ManualJsonPipeline, PipelineConfig, default_progress_callback
from .loaders.json_manual_loader import LoadMode
from .utils.make_name_mapper import MakeNameMapper
from .utils.engine_spec_parser import EngineSpecParser
from .extractors.json_extractor import JsonExtractor
except ImportError as e:
# Handle import errors gracefully for existing functionality
ManualJsonPipeline = None
logger = logging.getLogger(__name__)
logger.warning(f"Manual JSON processing components not available: {e}")
logger = logging.getLogger(__name__)
# Root click group; the subcommands below register themselves on it.
@click.group()
def cli():
    """MVP Platform Vehicles ETL Tool"""
    # Runs before any subcommand, so logging is configured for all of them
    setup_logging(config.LOG_LEVEL)
@cli.command()
def build_catalog():
    """Build vehicle catalog from source database"""
    # A falsy pipeline result becomes exit status 1
    if not run_etl_pipeline():
        sys.exit(1)
@cli.command()
def schedule():
    """Start ETL scheduler (default mode)"""
    # Hands control to the scheduler entry point imported from .scheduler
    start_etl_scheduler()
@cli.command()
@click.option('--full', is_flag=True, help='Full reload instead of incremental')
def update(full):
    """Run ETL update"""
    logger.info(f"Starting ETL update (full={full})")
    # NOTE(review): `full` is only logged here; it is not passed on to
    # run_etl_pipeline() - confirm whether the pipeline should receive it.
    if not run_etl_pipeline():
        sys.exit(1)
@cli.command()
def test():
    """Test database connections"""
    if test_connections():
        logger.info("All connections tested successfully")
        return
    # Any failed connection turns into exit status 1
    logger.error("Connection test failed")
    sys.exit(1)
@cli.command()
@click.option('--sources-dir', '-s', default='sources/makes',
              help='Directory containing JSON make files (default: sources/makes)')
@click.option('--mode', '-m', type=click.Choice(['clear', 'append']), default='append',
              help='Loading mode: clear (destructive) or append (safe, default)')
@click.option('--progress/--no-progress', default=True,
              help='Show progress tracking (default: enabled)')
@click.option('--validate/--no-validate', default=True,
              help='Validate referential integrity after loading (default: enabled)')
@click.option('--batch-size', '-b', type=int, default=1000,
              help='Database batch size for inserts (default: 1000)')
@click.option('--dry-run', is_flag=True,
              help='Extract and validate data without loading to database')
@click.option('--verbose', '-v', is_flag=True,
              help='Enable verbose output')
def load_manual(sources_dir, mode, progress, validate, batch_size, dry_run, verbose):
    """Load vehicle data from JSON files manually
    This command processes JSON files in the specified directory and loads
    vehicle data into the PostgreSQL database. It supports two modes:
    • APPEND mode (default): Safely add new data with duplicate detection
    • CLEAR mode: Remove all existing data and reload (destructive)
    Examples:
    python -m etl load-manual
    python -m etl load-manual --mode clear --sources-dir custom/path
    python -m etl load-manual --dry-run --verbose
    """
    # The module-level optional-import guard binds ManualJsonPipeline to None
    # when the manual-JSON components could not be imported.
    if ManualJsonPipeline is None:
        click.echo("❌ Manual JSON processing components are not available", err=True)
        click.echo(" Please check your installation and dependencies", err=True)
        sys.exit(1)

    # Validate sources directory
    sources_path = Path(sources_dir)
    if not sources_path.exists():
        click.echo(f"❌ Sources directory not found: {sources_dir}", err=True)
        click.echo(" Please specify a valid directory with --sources-dir", err=True)
        sys.exit(1)

    # Count JSON files
    json_files = list(sources_path.glob("*.json"))
    if not json_files:
        click.echo(f"❌ No JSON files found in: {sources_dir}", err=True)
        click.echo(" Please ensure the directory contains *.json files", err=True)
        sys.exit(1)

    # Set log level if verbose
    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Create configuration. The local is deliberately NOT called `config`:
    # that name is the module-level application config read by `cli()`, and
    # shadowing it here made the two easy to confuse.
    load_mode_enum = LoadMode.CLEAR if mode == 'clear' else LoadMode.APPEND
    pipeline_config = PipelineConfig(
        sources_directory=str(sources_path),
        load_mode=load_mode_enum,
        enable_progress_tracking=progress,
        validate_integrity=validate,
        batch_size=batch_size,
    )

    click.echo("🚀 Manual JSON Processing Pipeline")
    click.echo(f" Sources: {sources_dir}")
    click.echo(f" Files: {len(json_files)} JSON files")
    click.echo(f" Mode: {mode.upper()}")
    if dry_run:
        click.echo(" Dry run: Validation only (no database changes)")

    try:
        # Create pipeline
        pipeline = ManualJsonPipeline(str(sources_path), pipeline_config)

        # Progress callback for CLI; silent when --no-progress was given.
        def cli_progress_callback(progress_info):
            if not progress:
                return
            percentage = progress_info['percentage']
            phase = progress_info['phase']
            files = f"{progress_info['files_completed']}/{progress_info['total_files']}"
            if progress_info['files_per_second'] > 0:
                rate = f"({progress_info['files_per_second']:.1f} files/sec)"
                eta_min = progress_info['eta_seconds'] / 60
                eta = f"ETA: {eta_min:.1f}min" if eta_min > 0 else ""
                click.echo(f"[{percentage:5.1f}%] {phase}: {files} {rate} {eta}")
            else:
                click.echo(f"[{percentage:5.1f}%] {phase}: {files}")

        if dry_run:
            # Extraction only for validation
            click.echo("\n📋 Running extraction validation...")
            extraction_result = pipeline.run_extraction_only()

            # Report extraction results
            click.echo("\n✅ Extraction Validation Complete")
            click.echo(f" Files processed: {extraction_result.total_files_processed}")
            click.echo(f" Success rate: {extraction_result.success_rate:.1%}")
            click.echo(f" Models extracted: {extraction_result.total_models:,}")
            click.echo(f" Engines extracted: {extraction_result.total_engines:,}")
            click.echo(f" Electric models: {extraction_result.total_electric_models:,}")
            if extraction_result.failed_extractions > 0:
                click.echo(f" ⚠️ Failed extractions: {extraction_result.failed_extractions}")
                # SystemExit is not an Exception subclass, so the handlers
                # below do not intercept this exit.
                sys.exit(1)
        else:
            # Full pipeline execution
            if mode == 'clear':
                click.echo("\n⚠️ WARNING: CLEAR mode will delete all existing vehicle data!")
                if not click.confirm("Are you sure you want to continue?", default=False):
                    click.echo("Operation cancelled")
                    return

            click.echo("\n🔄 Running pipeline...")
            result = pipeline.run(progress_callback=cli_progress_callback)

            # Print comprehensive report
            click.echo("\n" + "=" * 60)
            click.echo("📊 PIPELINE EXECUTION REPORT")
            click.echo("=" * 60)

            # Performance
            click.echo("\n⏱️ PERFORMANCE")
            click.echo(f" Duration: {result.duration_seconds:.1f} seconds ({result.duration_minutes:.1f} minutes)")
            click.echo(f" Processing rate: {result.files_per_second:.1f} files/sec")
            click.echo(f" Loading rate: {result.records_per_second:,.0f} records/sec")

            # Success rates
            click.echo("\n📈 SUCCESS RATES")
            click.echo(f" Extraction: {result.extraction_success_rate:.1%}")
            click.echo(f" Loading: {result.loading_success_rate:.1%}")
            click.echo(f" Overall: {result.overall_success_rate:.1%}")

            # Data loaded
            click.echo("\n💾 DATA LOADED")
            click.echo(f" Makes: {result.load_result.total_makes}")
            click.echo(f" Models: {result.load_result.total_models}")
            click.echo(f" Engines: {result.load_result.total_engines}")
            click.echo(f" Trims: {result.load_result.total_trims}")
            click.echo(f" Total records: {result.total_records_loaded:,}")

            # Issues
            if result.load_result.failed_makes:
                click.echo(f"\n⚠️ FAILED MAKES ({len(result.load_result.failed_makes)}):")
                for make in result.load_result.failed_makes:
                    click.echo(f"{make}")

            if result.integrity_issues:
                click.echo(f"\n❌ INTEGRITY ISSUES ({len(result.integrity_issues)}):")
                for issue in result.integrity_issues:
                    click.echo(f"{issue}")
            else:
                click.echo("\n✅ REFERENTIAL INTEGRITY: PASSED")

            # Final status
            if result.was_successful:
                click.echo("\n🎉 PIPELINE COMPLETED SUCCESSFULLY")
                if verbose:
                    # Show database statistics
                    db_stats = pipeline.loader.get_database_statistics()
                    click.echo("\n📋 DATABASE STATISTICS:")
                    for table, count in db_stats.items():
                        click.echo(f" {table}: {count:,} records")
            else:
                click.echo("\n⚠️ PIPELINE COMPLETED WITH ISSUES")
                sys.exit(1)
    except KeyboardInterrupt:
        click.echo("\n⏸️ Pipeline interrupted by user")
        sys.exit(1)
    except Exception as e:
        # Top-level CLI boundary: report the failure, optionally with the
        # traceback, and exit non-zero.
        click.echo(f"\n❌ Pipeline failed: {str(e)}", err=True)
        if verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
@cli.command()
@click.option('--sources-dir', '-s', default='sources/makes',
              help='Directory containing JSON make files (default: sources/makes)')
@click.option('--verbose', '-v', is_flag=True,
              help='Enable verbose output with detailed statistics')
def validate_json(sources_dir, verbose):
    """Validate JSON files and show extraction statistics
    This command validates the structure and content of JSON files
    without loading data into the database. Useful for:
    • Checking data quality before loading
    • Debugging extraction issues
    • Getting statistics about available data
    Examples:
    python -m etl validate-json
    python -m etl validate-json --sources-dir custom/path --verbose
    """
    # When the optional import block fails, its handler only binds
    # ManualJsonPipeline = None; JsonExtractor, MakeNameMapper and
    # EngineSpecParser are left undefined. Testing `JsonExtractor is None`
    # would therefore raise NameError instead of printing this message, so
    # the sentinel the handler actually sets is checked instead.
    if ManualJsonPipeline is None:
        click.echo("❌ JSON validation components are not available", err=True)
        sys.exit(1)

    # Validate sources directory
    sources_path = Path(sources_dir)
    if not sources_path.exists():
        click.echo(f"❌ Sources directory not found: {sources_dir}", err=True)
        sys.exit(1)

    # Count JSON files
    json_files = list(sources_path.glob("*.json"))
    if not json_files:
        click.echo(f"❌ No JSON files found in: {sources_dir}", err=True)
        sys.exit(1)

    click.echo("🔍 JSON File Validation")
    click.echo(f" Directory: {sources_dir}")
    click.echo(f" Files: {len(json_files)} JSON files")

    try:
        # Initialize components
        make_mapper = MakeNameMapper()
        engine_parser = EngineSpecParser()
        extractor = JsonExtractor(make_mapper, engine_parser)

        # Run extraction validation
        click.echo("\n📋 Validating JSON structure and content...")
        result = extractor.extract_all_makes(str(sources_path))

        # Basic results
        click.echo("\n✅ Validation Complete")
        click.echo(f" Files processed: {result.total_files_processed}")
        click.echo(f" Success rate: {result.success_rate:.1%}")
        click.echo(f" Models found: {result.total_models:,}")
        click.echo(f" Engines found: {result.total_engines:,}")
        click.echo(f" Electric models: {result.total_electric_models:,}")
        if result.failed_extractions > 0:
            click.echo(f" ⚠️ Failed extractions: {result.failed_extractions}")

        # Show top makes by model count
        # NOTE(review): only this list is gated on --verbose; confirm the
        # sections below are meant to print unconditionally.
        if verbose and result.makes:
            click.echo("\n🏆 Top Makes by Model Count:")
            top_makes = sorted(result.makes, key=lambda m: m.total_models, reverse=True)[:10]
            for i, make in enumerate(top_makes, 1):
                click.echo(f" {i:2d}. {make.name}: {make.total_models} models, {make.total_engines} engines")

        # Show makes with issues (first five, then a count of the remainder)
        error_makes = [make for make in result.makes if make.processing_errors]
        if error_makes:
            click.echo(f"\n⚠️ Makes with Processing Errors ({len(error_makes)}):")
            for make in error_makes[:5]:
                click.echo(f"{make.name}: {len(make.processing_errors)} errors")
            if len(error_makes) > 5:
                click.echo(f" ... and {len(error_makes) - 5} more")

        # Show data quality insights
        click.echo("\n📊 Data Quality Insights:")

        # Engine configuration distribution
        config_counts = {}
        for make in result.makes:
            for model in make.models:
                for engine in model.engines:
                    config_counts[engine.configuration] = config_counts.get(engine.configuration, 0) + 1

        if config_counts:
            click.echo(" Engine configurations:")
            # The loop variable is not named `config`, so it does not shadow
            # the module-level application config.
            for engine_config, count in sorted(config_counts.items(), key=lambda x: x[1], reverse=True):
                # Guard the denominator: the reported total is computed by
                # the extractor independently of the tally above.
                percentage = count / result.total_engines * 100 if result.total_engines else 0.0
                click.echo(f" {engine_config}: {count:,} ({percentage:.1f}%)")

        if result.failed_extractions > 0:
            # SystemExit is not an Exception subclass, so the handler below
            # does not intercept this exit.
            sys.exit(1)
    except Exception as e:
        # Top-level CLI boundary: report, optionally dump the traceback, exit 1.
        click.echo(f"❌ Validation failed: {str(e)}", err=True)
        if verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
if __name__ == "__main__":
    # Default to scheduler mode if no command provided.
    # Dispatch through the click group rather than calling
    # start_etl_scheduler() directly: the group callback is where
    # setup_logging() runs, so the direct call started the scheduler without
    # logging configured. Invoking the `schedule` subcommand takes exactly the
    # same path as an explicit `schedule` on the command line.
    if len(sys.argv) == 1:
        cli(args=["schedule"])
    else:
        cli()

View File

@@ -0,0 +1,92 @@
#!/usr/bin/env python3
import logging
from datetime import datetime
from .config import config
from .builders.normalized_vehicle_builder import NormalizedVehicleBuilder
from .utils.make_filter import MakeFilter
from .connections import test_connections
from .downloaders.nhtsa_downloader import NHTSADownloader
from .loaders.mssql_loader import MSSQLLoader
from .extractors.vin_proc_extractor import VinProcExtractor
logger = logging.getLogger(__name__)
def run_etl_pipeline():
    """Run the complete ETL: download, MSSQL restore, PostgreSQL schema build.

    Steps, in order: obtain the NHTSA backup file, restore it into MSSQL,
    inspect the VIN decode stored procedure (best effort), test all
    connections, then build the normalized schema with make filtering.

    Returns:
        True when every required step succeeds. False when a step reports
        failure or an unexpected exception is raised (logged with traceback).
    """
    logger.info("Starting complete ETL pipeline")
    started_at = datetime.now()
    try:
        # Step 1: Download NHTSA database file
        logger.info("Step 1: Downloading NHTSA vPIC database")
        nhtsa = NHTSADownloader()
        backup_file = nhtsa.ensure_database_file(force_download=False)
        if not backup_file:
            logger.error("Failed to obtain NHTSA database file")
            return False
        db_info = nhtsa.get_database_info(backup_file)
        logger.info(f"Using database file: {db_info['name']} ({db_info['size_mb']} MB)")

        # Step 2: Load database into MSSQL
        logger.info("Step 2: Loading database into MSSQL Server")
        sql_server = MSSQLLoader()
        if not sql_server.test_connection():
            logger.error("MSSQL connection test failed")
            return False
        if not sql_server.restore_database(backup_file):
            logger.error("Failed to restore database to MSSQL")
            return False

        # Verify MSSQL database content
        content_info = sql_server.verify_database_content()
        logger.info(f"MSSQL database loaded with tables: {content_info}")

        # Step 2b: Research stored procedure definition/output for parity.
        # Purely informational: any failure here is logged and ignored.
        try:
            logger.info("Step 2b: Inspecting MSSQL VIN decode stored procedure for parity")
            proc_extractor = VinProcExtractor()
            proc_meta = proc_extractor.find_proc()
            if not proc_meta:
                logger.warning("VIN decode proc not found by pattern; continuing with catalog build")
            else:
                logger.info(f"VIN proc found: {proc_meta['schema_name']}.{proc_meta['object_name']} ({proc_meta['type_desc']})")
                definition = proc_extractor.get_definition(proc_meta['schema_name'], proc_meta['object_name'])
                logger.debug(f"VIN proc definition (first 500 chars): {definition[:500]}")
                sample = proc_extractor.sample_execute('1G1YU3D64H5602799')
                if sample is not None:
                    columns = list(sample[0].keys()) if sample else 'no rows'
                    logger.info(f"VIN proc sample output columns: {columns}")
        except Exception as e:
            logger.warning(f"VIN proc inspection failed (non-fatal): {e}")

        # Step 3: Test all connections (MSSQL + PostgreSQL)
        logger.info("Step 3: Testing all database connections")
        if not test_connections():
            logger.error("Connection test failed after database loading")
            return False

        # Step 4: Build normalized PostgreSQL schema from MSSQL with make filtering
        logger.info("Step 4: Building normalized PostgreSQL vehicle schema from MSSQL with make filtering")
        builder = NormalizedVehicleBuilder(MakeFilter())
        success = builder.build()

        elapsed = datetime.now() - started_at
        if not success:
            logger.error(f"ETL pipeline failed during normalized schema building after {elapsed}")
            return False

        logger.info(f"Complete ETL pipeline finished successfully in {elapsed}")
        logger.info("✅ ETL Summary:")
        logger.info(f" - Downloaded: {db_info['name']} ({db_info['size_mb']} MB)")
        logger.info(f" - MSSQL Tables: {content_info}")
        logger.info(" - PostgreSQL normalized schema: Built successfully")
        return True
    except Exception as e:
        elapsed = datetime.now() - started_at
        logger.error(f"ETL pipeline crashed after {elapsed}: {e}", exc_info=True)
        return False

View File

@@ -0,0 +1 @@
# Pipelines package

View File

@@ -0,0 +1,465 @@
"""
Manual JSON Pipeline for Vehicle Data Processing
Coordinates end-to-end processing of JSON vehicle data:
1. Extract data from JSON files
2. Load data into PostgreSQL database
3. Progress tracking and comprehensive reporting
Key Features:
- Full extraction→loading workflow coordination
- Clear/append mode support
- Progress tracking with detailed statistics
- Comprehensive error handling and reporting
- Performance monitoring and optimization
- Referential integrity validation
Usage:
pipeline = ManualJsonPipeline(sources_dir="sources/makes")
result = pipeline.run(mode=LoadMode.APPEND, progress_callback=default_progress_callback)
"""
import logging
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
# Import our components (handle both relative and direct imports)
try:
from ..extractors.json_extractor import JsonExtractor, ExtractionResult
from ..loaders.json_manual_loader import JsonManualLoader, LoadMode, LoadResult
from ..utils.make_name_mapper import MakeNameMapper
from ..utils.engine_spec_parser import EngineSpecParser
except ImportError:
# Fallback for direct execution
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from extractors.json_extractor import JsonExtractor, ExtractionResult
from loaders.json_manual_loader import JsonManualLoader, LoadMode, LoadResult
from utils.make_name_mapper import MakeNameMapper
from utils.engine_spec_parser import EngineSpecParser
logger = logging.getLogger(__name__)
@dataclass
class PipelineConfig:
    """Options for one manual JSON pipeline run.

    Raises:
        ValueError: On construction, when ``sources_directory`` is empty or
            names a path that does not exist.
    """
    # Directory containing JSON files
    sources_directory: str
    # Loading mode
    load_mode: LoadMode = LoadMode.APPEND
    # Enable progress callbacks
    enable_progress_tracking: bool = True
    # Validate referential integrity after loading
    validate_integrity: bool = True
    # Database batch size
    batch_size: int = 1000
    # Logging level
    log_level: str = "INFO"

    def __post_init__(self):
        """Reject an empty or non-existent sources directory."""
        source_dir = self.sources_directory
        if not source_dir:
            raise ValueError("sources_directory is required")
        if Path(source_dir).exists():
            return
        raise ValueError(f"Sources directory does not exist: {self.sources_directory}")
@dataclass
class PipelineResult:
    """Everything recorded about one pipeline execution.

    Holds the configuration used, wall-clock start/end timestamps, the raw
    extraction and load results, and the derived throughput, success-rate and
    integrity figures.
    """
    # Configuration
    config: PipelineConfig

    # Timing
    start_time: float
    end_time: float

    # Extraction results
    extraction_result: ExtractionResult

    # Loading results
    load_result: LoadResult

    # Performance metrics
    total_files_processed: int
    total_records_loaded: int
    files_per_second: float
    records_per_second: float

    # Quality metrics
    extraction_success_rate: float
    loading_success_rate: float
    overall_success_rate: float

    # Validation results
    integrity_issues: List[str]

    @property
    def duration_seconds(self) -> float:
        """Elapsed time between ``start_time`` and ``end_time``."""
        return self.end_time - self.start_time

    @property
    def duration_minutes(self) -> float:
        """``duration_seconds`` expressed in minutes."""
        seconds = self.duration_seconds
        return seconds / 60.0

    @property
    def was_successful(self) -> bool:
        """True if pipeline completed without critical errors"""
        # Any failed extraction, failed make or integrity issue counts as a failure.
        if self.extraction_result.failed_extractions != 0:
            return False
        if len(self.load_result.failed_makes) != 0:
            return False
        return len(self.integrity_issues) == 0
class PipelineProgress:
    """Track the phase, file count and timing of a pipeline run.

    Args:
        total_files: Number of files the run is expected to process.
    """

    def __init__(self, total_files: int):
        self.total_files = total_files
        self.current_file = 0
        self.current_phase = "Starting"
        self.start_time = time.time()
        self.phase_start_time = time.time()

    def update_phase(self, phase: str) -> None:
        """Switch to ``phase`` and restart the per-phase timer."""
        self.current_phase = phase
        self.phase_start_time = time.time()

    def update_file_progress(self, files_completed: int) -> None:
        """Record how many files have been completed so far."""
        self.current_file = files_completed

    def get_progress_info(self) -> Dict[str, Any]:
        """Return a snapshot of the current progress.

        Returns:
            Dict with keys ``phase``, ``files_completed``, ``total_files``,
            ``percentage``, ``elapsed_seconds``, ``phase_elapsed_seconds``,
            ``files_per_second`` and ``eta_seconds``. Rate and ETA are 0
            until at least one file is done and a positive amount of time
            has elapsed.
        """
        now = time.time()
        elapsed = now - self.start_time
        phase_elapsed = now - self.phase_start_time

        # time.time() is coarse on some platforms and is not monotonic, so
        # `elapsed` can be zero (immediate call) or negative (clock stepped
        # back). Either would give a ZeroDivisionError or a nonsensical
        # negative rate, so report "unknown" (0) instead.
        if self.current_file > 0 and elapsed > 0:
            files_per_second = self.current_file / elapsed
            eta_seconds = (self.total_files - self.current_file) / files_per_second
        else:
            files_per_second = 0
            eta_seconds = 0

        return {
            'phase': self.current_phase,
            'files_completed': self.current_file,
            'total_files': self.total_files,
            'percentage': (self.current_file / self.total_files * 100) if self.total_files > 0 else 0,
            'elapsed_seconds': elapsed,
            'phase_elapsed_seconds': phase_elapsed,
            'files_per_second': files_per_second,
            'eta_seconds': eta_seconds
        }
class ManualJsonPipeline:
    """End-to-end JSON processing pipeline.

    Wires a JsonExtractor and a JsonManualLoader together and runs them in
    three phases: extract, load, and (optionally) validate referential
    integrity.
    """

    def __init__(self, sources_dir: str, config: Optional[PipelineConfig] = None):
        """
        Initialize pipeline

        Args:
            sources_dir: Directory containing JSON files
            config: Pipeline configuration (optional); a default
                PipelineConfig for ``sources_dir`` is built when omitted
        """
        self.sources_dir = sources_dir
        self.config = config or PipelineConfig(sources_directory=sources_dir)

        # Initialize components
        self.make_mapper = MakeNameMapper()
        self.engine_parser = EngineSpecParser()
        self.extractor = JsonExtractor(self.make_mapper, self.engine_parser)
        self.loader = JsonManualLoader()

        # Progress tracking
        self.progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None

        logger.info(f"ManualJsonPipeline initialized for {sources_dir}")

    def set_progress_callback(self, callback: Callable[[Dict[str, Any]], None]) -> None:
        """
        Set progress callback function

        Args:
            callback: Function to call with progress updates
        """
        self.progress_callback = callback

    def _update_progress(self, progress: PipelineProgress) -> None:
        """Update progress via callback if configured"""
        if self.progress_callback and self.config.enable_progress_tracking:
            progress_info = progress.get_progress_info()
            self.progress_callback(progress_info)

    def run(self, mode: Optional[LoadMode] = None, progress_callback: Optional[Callable] = None) -> PipelineResult:
        """
        Execute complete pipeline

        Args:
            mode: Loading mode (overrides config)
            progress_callback: Progress callback function (overrides config);
                when given it also replaces any previously set callback

        Returns:
            PipelineResult with complete execution details

        Raises:
            ValueError: If the sources directory contains no ``*.json`` files.
            Exception: Any error from extraction, loading or validation is
                logged and re-raised unchanged.
        """
        start_time = time.time()

        # Override config if specified. Compared against None explicitly so
        # the override never depends on the truthiness of an enum member.
        load_mode = mode if mode is not None else self.config.load_mode
        if progress_callback:
            self.set_progress_callback(progress_callback)

        logger.info(f"Starting manual JSON pipeline in {load_mode.value} mode")
        logger.info(f"Processing directory: {self.sources_dir}")

        try:
            # Count files for progress tracking
            json_files = list(Path(self.sources_dir).glob("*.json"))
            total_files = len(json_files)
            if total_files == 0:
                raise ValueError(f"No JSON files found in {self.sources_dir}")

            progress = PipelineProgress(total_files)

            # Phase 1: Extract data from JSON files
            progress.update_phase("Extracting data from JSON files")
            self._update_progress(progress)
            logger.info(f"Phase 1: Extracting data from {total_files} JSON files")
            extraction_result = self.extractor.extract_all_makes(self.sources_dir)
            progress.update_file_progress(extraction_result.total_files_processed)
            self._update_progress(progress)

            if extraction_result.failed_extractions > 0:
                logger.warning(f"Extraction completed with {extraction_result.failed_extractions} failures")
            else:
                logger.info(f"Extraction completed successfully: {extraction_result.total_models} models, {extraction_result.total_engines} engines")

            # Phase 2: Load data into database
            progress.update_phase("Loading data into database")
            self._update_progress(progress)
            logger.info(f"Phase 2: Loading {len(extraction_result.makes)} makes into database ({load_mode.value} mode)")
            load_result = self.loader.load_all_makes(extraction_result.makes, load_mode)

            if len(load_result.failed_makes) > 0:
                logger.warning(f"Loading completed with {len(load_result.failed_makes)} failures")
            else:
                logger.info(f"Loading completed successfully: {load_result.success_count} makes loaded")

            # Phase 3: Validate referential integrity (if enabled)
            integrity_issues = []
            if self.config.validate_integrity:
                progress.update_phase("Validating referential integrity")
                self._update_progress(progress)
                logger.info("Phase 3: Validating referential integrity")
                integrity_issues = self.loader.validate_referential_integrity()
                if integrity_issues:
                    logger.warning(f"Referential integrity issues found: {len(integrity_issues)}")
                else:
                    logger.info("Referential integrity validation passed")

            # Calculate performance metrics
            end_time = time.time()
            duration = end_time - start_time
            files_per_second = total_files / duration if duration > 0 else 0
            total_records = (load_result.total_models + load_result.total_engines +
                             load_result.total_trims + load_result.total_trim_engine_mappings)
            records_per_second = total_records / duration if duration > 0 else 0

            # Calculate success rates; the overall rate is the weaker of the two.
            extraction_success_rate = extraction_result.success_rate
            loading_success_rate = load_result.success_rate
            overall_success_rate = min(extraction_success_rate, loading_success_rate)

            # Create result
            result = PipelineResult(
                config=self.config,
                start_time=start_time,
                end_time=end_time,
                extraction_result=extraction_result,
                load_result=load_result,
                total_files_processed=total_files,
                total_records_loaded=total_records,
                files_per_second=files_per_second,
                records_per_second=records_per_second,
                extraction_success_rate=extraction_success_rate,
                loading_success_rate=loading_success_rate,
                overall_success_rate=overall_success_rate,
                integrity_issues=integrity_issues
            )

            progress.update_phase("Pipeline complete")
            self._update_progress(progress)

            logger.info(f"Pipeline completed in {result.duration_seconds:.1f} seconds")
            logger.info(f"Performance: {files_per_second:.1f} files/sec, {records_per_second:.0f} records/sec")
            logger.info(f"Overall success rate: {overall_success_rate:.1%}")
            return result
        except Exception as e:
            end_time = time.time()
            logger.error(f"Pipeline failed after {end_time - start_time:.1f} seconds: {str(e)}")
            raise

    def run_extraction_only(self) -> ExtractionResult:
        """
        Run extraction phase only (for testing/validation)

        Returns:
            ExtractionResult with extracted data
        """
        logger.info("Running extraction-only pipeline")
        result = self.extractor.extract_all_makes(self.sources_dir)
        logger.info(f"Extraction complete: {result.total_models} models, {result.total_engines} engines")
        logger.info(f"Success rate: {result.success_rate:.1%}")
        return result

    def get_source_statistics(self) -> Dict[str, Any]:
        """
        Get statistics about source JSON files

        Returns:
            Dictionary with keys ``total_files``, ``total_size_bytes``,
            ``total_size_mb``, ``average_file_size_kb`` (0 when there are no
            files) and ``directory``
        """
        json_files = list(Path(self.sources_dir).glob("*.json"))
        total_size_bytes = sum(f.stat().st_size for f in json_files)
        return {
            'total_files': len(json_files),
            'total_size_bytes': total_size_bytes,
            'total_size_mb': total_size_bytes / (1024 * 1024),
            'average_file_size_kb': (total_size_bytes / len(json_files) / 1024) if json_files else 0,
            'directory': str(self.sources_dir)
        }

    def print_pipeline_report(self, result: PipelineResult) -> None:
        """
        Print comprehensive pipeline execution report

        Args:
            result: PipelineResult from pipeline execution
        """
        print("🚀 MANUAL JSON PIPELINE EXECUTION REPORT")
        print("=" * 60)

        # Configuration
        print("\n⚙️ CONFIGURATION")
        print(f" Sources directory: {result.config.sources_directory}")
        print(f" Load mode: {result.config.load_mode.value.upper()}")
        print(f" Batch size: {result.config.batch_size}")
        print(f" Integrity validation: {'Enabled' if result.config.validate_integrity else 'Disabled'}")

        # Performance
        print("\n⏱️ PERFORMANCE")
        print(f" Total duration: {result.duration_seconds:.1f} seconds ({result.duration_minutes:.1f} minutes)")
        print(f" Files processed: {result.total_files_processed}")
        print(f" Records loaded: {result.total_records_loaded:,}")
        print(f" Processing rate: {result.files_per_second:.1f} files/sec")
        print(f" Loading rate: {result.records_per_second:,.0f} records/sec")

        # Success rates
        print("\n📊 SUCCESS RATES")
        print(f" Extraction: {result.extraction_success_rate:.1%}")
        print(f" Loading: {result.loading_success_rate:.1%}")
        print(f" Overall: {result.overall_success_rate:.1%}")

        # Data summary
        print("\n📈 DATA PROCESSED")
        print(f" Makes: {result.load_result.total_makes}")
        print(f" Models: {result.load_result.total_models}")
        print(f" Model years: {result.load_result.total_model_years}")
        print(f" Trims: {result.load_result.total_trims}")
        print(f" Engines: {result.load_result.total_engines}")
        print(f" Trim-engine mappings: {result.load_result.total_trim_engine_mappings}")

        # Issues
        if result.load_result.failed_makes:
            print(f"\n⚠️ FAILED MAKES ({len(result.load_result.failed_makes)}):")
            for make in result.load_result.failed_makes:
                print(f" {make}")

        if result.integrity_issues:
            print(f"\n❌ REFERENTIAL INTEGRITY ISSUES ({len(result.integrity_issues)}):")
            for issue in result.integrity_issues:
                print(f" {issue}")
        else:
            print("\n✅ REFERENTIAL INTEGRITY: PASSED")

        # Final status
        print(f"\n🎯 PIPELINE STATUS: {'SUCCESS' if result.was_successful else 'COMPLETED WITH ISSUES'}")
def default_progress_callback(progress_info: Dict[str, Any]) -> None:
    """Default progress callback that prints to console

    Args:
        progress_info: Progress snapshot; reads the keys ``percentage``,
            ``phase``, ``files_completed``, ``total_files``,
            ``files_per_second`` and ``eta_seconds``. Rate and ETA are only
            printed once ``files_per_second`` is positive.
    """
    percentage = progress_info['percentage']
    phase = progress_info['phase']
    files_completed = progress_info['files_completed']
    total_files = progress_info['total_files']

    if progress_info['files_per_second'] > 0:
        eta_minutes = progress_info['eta_seconds'] / 60
        print(f"[{percentage:5.1f}%] {phase}: {files_completed}/{total_files} files "
              f"({progress_info['files_per_second']:.1f} files/sec, ETA: {eta_minutes:.1f}min)")
    else:
        print(f"[{percentage:5.1f}%] {phase}: {files_completed}/{total_files} files")
# Example usage and testing functions
def example_usage():
    """Print a usage walkthrough and, if possible, source-directory statistics.

    Returns early when the default ``sources/makes`` directory is missing.
    Statistics are best effort: any error while gathering them is printed as
    a warning instead of being raised.
    """
    print("🚀 ManualJsonPipeline Example Usage")
    print("=" * 40)

    sources_dir = "sources/makes"
    if not Path(sources_dir).exists():
        print(f"❌ Sources directory not found: {sources_dir}")
        return

    print("\n💡 Example pipeline execution:")
    print(f"""
# Create pipeline with configuration
config = PipelineConfig(
sources_directory="{sources_dir}",
load_mode=LoadMode.APPEND,
enable_progress_tracking=True,
validate_integrity=True
)
pipeline = ManualJsonPipeline("{sources_dir}", config)
# Run with progress tracking
result = pipeline.run(progress_callback=default_progress_callback)
# Print comprehensive report
pipeline.print_pipeline_report(result)
""")

    # Show source statistics
    try:
        source_stats = ManualJsonPipeline(sources_dir).get_source_statistics()
        print("\n📊 Source Directory Statistics:")
        print(f" Files: {source_stats['total_files']}")
        print(f" Total size: {source_stats['total_size_mb']:.1f} MB")
        print(f" Average file size: {source_stats['average_file_size_kb']:.1f} KB")
    except Exception as e:
        print(f"⚠️ Could not get source statistics: {e}")


if __name__ == "__main__":
    example_usage()

View File

@@ -0,0 +1,71 @@
import schedule
import time
import logging
from datetime import datetime
# Import locally to avoid circular import
import importlib
from .config import config
logger = logging.getLogger(__name__)
def scheduled_etl_job():
    """Execute the ETL pipeline on schedule

    Runs ``run_etl_pipeline()`` once and logs the outcome with its duration.
    Exceptions are caught and logged (with traceback) rather than
    propagated.
    """
    start_time = datetime.now()
    logger.info(f"Starting scheduled ETL job at {start_time}")

    try:
        # Import dynamically to avoid circular import
        from .pipeline import run_etl_pipeline
        success = run_etl_pipeline()
    except Exception as e:
        duration = datetime.now() - start_time
        # exc_info=True keeps the traceback in the log, matching how
        # run_etl_pipeline reports its own crashes; the message alone is
        # rarely enough to diagnose an unattended failure.
        logger.error(f"ETL job crashed after {duration}: {e}", exc_info=True)
        return

    duration = datetime.now() - start_time
    if success:
        logger.info(f"ETL job completed successfully in {duration}")
    else:
        logger.error(f"ETL job failed after {duration}")
def start_etl_scheduler():
    """Register the ETL job with ``schedule`` and run the scheduler loop.

    Only the cron pattern ``"0 2 * * 0"`` (Sunday 02:00) is recognised; any
    other value of ``config.ETL_SCHEDULE`` falls back to daily at 02:00 with
    a warning. The loop polls once a minute, backs off five minutes after an
    unexpected error, and returns on KeyboardInterrupt.
    """
    logger.info(f"Starting ETL scheduler with cron pattern: {config.ETL_SCHEDULE}")

    # Parse cron pattern (simplified for weekly schedule)
    # Format: minute hour day-of-month month day-of-week
    weekly_sunday_2am = "0 2 * * 0"
    if config.ETL_SCHEDULE != weekly_sunday_2am:
        # Default fallback - run once daily at 2 AM
        schedule.every().day.at("02:00").do(scheduled_etl_job)
        logger.warning(f"Unknown cron pattern {config.ETL_SCHEDULE}, defaulting to daily at 2:00 AM")
    else:
        schedule.every().sunday.at("02:00").do(scheduled_etl_job)
        logger.info("Scheduled ETL to run every Sunday at 2:00 AM")

    # Run scheduler loop
    logger.info("ETL scheduler started")
    while True:
        try:
            schedule.run_pending()
            time.sleep(60)  # Check every minute
        except KeyboardInterrupt:
            logger.info("ETL scheduler stopped by user")
            return
        except Exception as e:
            logger.error(f"ETL scheduler error: {e}")
            time.sleep(300)  # Wait 5 minutes on error
if __name__ == "__main__":
    # Configure logging from the configured level name (e.g. "info" -> logging.INFO)
    level_name = config.LOG_LEVEL.upper()
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=getattr(logging, level_name),
    )

    # Start scheduler
    start_etl_scheduler()

View File

@@ -0,0 +1,64 @@
{
"manufacturers": [
"Acura",
"Alfa Romeo",
"Aston Martin",
"Audi",
"BMW",
"Bentley",
"Buick",
"Cadillac",
"Chevrolet",
"Chrysler",
"Dodge",
"Ferrari",
"Fiat",
"Ford",
"GMC",
"Genesis",
"Geo",
"Honda",
"Hummer",
"Hyundai",
"Infiniti",
"Isuzu",
"Jaguar",
"Jeep",
"Kia",
"Lamborghini",
"Land Rover",
"Lexus",
"Lincoln",
"Lotus",
"Mazda",
"Maserati",
"Maybach",
"McLaren",
"Mercedes-Benz",
"Mercury",
"MINI",
"Mitsubishi",
"Nissan",
"Oldsmobile",
"Plymouth",
"Polestar",
"Pontiac",
"Porsche",
"Ram",
"Rivian",
"Rolls Royce",
"Saab",
"Saturn",
"Scion",
"Smart",
"Subaru",
"Tesla",
"Toyota",
"Volkswagen",
"Volvo",
"Karma",
"Pagani",
"Koenigsegg",
"Lucid"
]
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,506 @@
{
"aston_martin": [
{
"year": "2023",
"models": [
{
"name": "Vantage",
"engines": [
"4.0L V8",
"5.2L V12"
],
"submodels": [
"AMR",
"V12",
"Base"
]
}
]
},
{
"year": "2020",
"models": [
{
"name": "DB11",
"engines": [
"4.0L V8"
],
"submodels": []
},
{
"name": "Dbs",
"engines": [
"5.2L V12"
],
"submodels": []
},
{
"name": "Vantage",
"engines": [
"4.0L V8",
"5.2L V12"
],
"submodels": [
"AMR",
"V12",
"Base"
]
}
]
},
{
"year": "2019",
"models": [
{
"name": "Vantage",
"engines": [
"4.0L V8",
"5.2L V12"
],
"submodels": [
"AMR",
"V12",
"Base"
]
}
]
},
{
"year": "2018",
"models": [
{
"name": "Rapide",
"engines": [
"6.0L V12"
],
"submodels": []
}
]
},
{
"year": "2017",
"models": [
{
"name": "V12 Vantage",
"engines": [
"6.0L V12"
],
"submodels": [
"Base",
"S"
]
},
{
"name": "Vanquish",
"engines": [
"6.0L V12"
],
"submodels": [
"Carbon",
"Base",
"Volante"
]
}
]
},
{
"year": "2016",
"models": [
{
"name": "Rapide",
"engines": [
"6.0L V12"
],
"submodels": []
},
{
"name": "V12 Vantage",
"engines": [
"6.0L V12"
],
"submodels": [
"Base",
"S"
]
},
{
"name": "Vanquish",
"engines": [
"6.0L V12"
],
"submodels": [
"Carbon",
"Base",
"Volante"
]
}
]
},
{
"year": "2015",
"models": [
{
"name": "DB9",
"engines": [
"6.0L V12"
],
"submodels": [
"Volante",
"Base"
]
},
{
"name": "Rapide",
"engines": [
"6.0L V12"
],
"submodels": []
},
{
"name": "V12 Vantage",
"engines": [
"6.0L V12"
],
"submodels": [
"Base",
"S"
]
},
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
},
{
"name": "Vanquish",
"engines": [
"6.0L V12"
],
"submodels": [
"Carbon",
"Base",
"Volante"
]
}
]
},
{
"year": "2014",
"models": [
{
"name": "DB9",
"engines": [
"6.0L V12"
],
"submodels": [
"Volante",
"Base"
]
},
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
},
{
"name": "Vanquish",
"engines": [
"6.0L V12"
],
"submodels": [
"Carbon",
"Base",
"Volante"
]
}
]
},
{
"year": "2013",
"models": [
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
}
]
},
{
"year": "2012",
"models": [
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
}
]
},
{
"year": "2011",
"models": [
{
"name": "V12 Vantage",
"engines": [
"6.0L V12"
],
"submodels": [
"Base",
"S"
]
},
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
}
]
},
{
"year": "2010",
"models": [
{
"name": "DB9",
"engines": [
"6.0L V12"
],
"submodels": [
"Volante",
"Base"
]
},
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
}
]
},
{
"year": "2009",
"models": [
{
"name": "DB9",
"engines": [
"6.0L V12"
],
"submodels": [
"Volante",
"Base"
]
},
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
}
]
},
{
"year": "2008",
"models": [
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
}
]
},
{
"year": "2007",
"models": [
{
"name": "DB9",
"engines": [
"6.0L V12"
],
"submodels": [
"Volante",
"Base"
]
},
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
}
]
},
{
"year": "2006",
"models": [
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
}
]
},
{
"year": "2005",
"models": [
{
"name": "DB9",
"engines": [
"6.0L V12"
],
"submodels": [
"Volante",
"Base"
]
},
{
"name": "Vantage",
"engines": [
"4.0L V8",
"5.2L V12"
],
"submodels": [
"AMR",
"V12",
"Base"
]
}
]
},
{
"year": "2002",
"models": [
{
"name": "DB7",
"engines": [
"6.0L V12"
],
"submodels": [
"Vantage Volante",
"Vantage"
]
}
]
},
{
"year": "2001",
"models": [
{
"name": "DB7",
"engines": [
"6.0L V12"
],
"submodels": [
"Vantage Volante",
"Vantage"
]
}
]
},
{
"year": "1993",
"models": [
{
"name": "Virage",
"engines": [
"5.3L V8"
],
"submodels": [
"Volante"
]
}
]
},
{
"year": "1990",
"models": [
{
"name": "Virage",
"engines": [
"5.3L V8"
],
"submodels": [
"Volante"
]
}
]
},
{
"year": "1983",
"models": [
{
"name": "V 8",
"engines": [
"5.3L V8"
],
"submodels": []
}
]
}
]
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,427 @@
{
"bentley": [
{
"year": "2023",
"models": [
{
"name": "Flying Spur",
"engines": [
"2.9L V6 MILD HYBRID EV- (MHEV)",
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Hybrid",
"V8",
"W12",
"S Hybrid",
"Base"
]
}
]
},
{
"year": "2022",
"models": [
{
"name": "Flying Spur",
"engines": [
"2.9L V6 MILD HYBRID EV- (MHEV)",
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Hybrid",
"V8",
"W12",
"S Hybrid",
"Base"
]
}
]
},
{
"year": "2021",
"models": [
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
},
{
"name": "Flying Spur",
"engines": [
"2.9L V6 MILD HYBRID EV- (MHEV)",
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Hybrid",
"V8",
"W12",
"S Hybrid",
"Base"
]
}
]
},
{
"year": "2018",
"models": [
{
"name": "Bentayga",
"engines": [
"6.0L W12"
],
"submodels": [
"W12 Signature",
"Black Edition"
]
},
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
}
]
},
{
"year": "2017",
"models": [
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
}
]
},
{
"year": "2016",
"models": [
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
},
{
"name": "Flying Spur",
"engines": [
"2.9L V6 MILD HYBRID EV- (MHEV)",
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Hybrid",
"V8",
"W12",
"S Hybrid",
"Base"
]
},
{
"name": "Mulsanne",
"engines": [
"6.8L V8"
],
"submodels": [
"Base",
"Speed"
]
}
]
},
{
"year": "2014",
"models": [
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
},
{
"name": "Mulsanne",
"engines": [
"6.8L V8"
],
"submodels": [
"Base",
"Speed"
]
}
]
},
{
"year": "2013",
"models": [
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
},
{
"name": "Flying Spur",
"engines": [
"2.9L V6 MILD HYBRID EV- (MHEV)",
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Hybrid",
"V8",
"W12",
"S Hybrid",
"Base"
]
}
]
},
{
"year": "2009",
"models": [
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
}
]
},
{
"year": "2008",
"models": [
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
}
]
},
{
"year": "2006",
"models": [
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
}
]
},
{
"year": "2005",
"models": [
{
"name": "Arnage",
"engines": [
"4.4L V8",
"6.8L V8"
],
"submodels": [
"Base",
"R"
]
},
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
}
]
},
{
"year": "1999",
"models": [
{
"name": "Arnage",
"engines": [
"4.4L V8",
"6.8L V8"
],
"submodels": [
"Base",
"R"
]
}
]
},
{
"year": "1997",
"models": [
{
"name": "Brooklands",
"engines": [
"6.8L V8"
],
"submodels": []
}
]
},
{
"year": "1996",
"models": [
{
"name": "Azure",
"engines": [],
"submodels": []
}
]
},
{
"year": "1989",
"models": [
{
"name": "Turbo R",
"engines": [
"6.8L V8"
],
"submodels": []
}
]
},
{
"year": "1963",
"models": [
{
"name": "S3 Series",
"engines": [
"6.2L V8"
],
"submodels": []
}
]
}
]
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,607 @@
{
"ferrari": [
{
"year": "2024",
"models": [
{
"name": "296 Gts",
"engines": [
"3.0L V6 PLUG-IN HYBRID EV- (PHEV)"
],
"submodels": []
}
]
},
{
"year": "2022",
"models": [
{
"name": "F8 Spider",
"engines": [
"3.9L V8"
],
"submodels": []
}
]
},
{
"year": "2019",
"models": [
{
"name": "Portofino",
"engines": [
"3.9L V8"
],
"submodels": []
}
]
},
{
"year": "2018",
"models": [
{
"name": "488 Spider",
"engines": [
"3.9L V8"
],
"submodels": []
}
]
},
{
"year": "2017",
"models": [
{
"name": "Gtc4Lusso",
"engines": [
"6.3L V12"
],
"submodels": []
}
]
},
{
"year": "2016",
"models": [
{
"name": "488 Gtb",
"engines": [
"3.9L V8"
],
"submodels": []
},
{
"name": "Ff",
"engines": [
"6.3L V12"
],
"submodels": []
}
]
},
{
"year": "2015",
"models": [
{
"name": "458 Italia",
"engines": [
"4.5L V8"
],
"submodels": [
"Base"
]
},
{
"name": "458 Spider",
"engines": [
"4.5L V8"
],
"submodels": []
},
{
"name": "California T",
"engines": [
"3.8L V8",
"3.9L V8"
],
"submodels": []
},
{
"name": "F12 Berlinetta",
"engines": [
"6.3L V12"
],
"submodels": []
}
]
},
{
"year": "2014",
"models": [
{
"name": "458 Italia",
"engines": [
"4.5L V8"
],
"submodels": [
"Base"
]
},
{
"name": "California",
"engines": [
"4.3L V8"
],
"submodels": []
},
{
"name": "Laferrari",
"engines": [
"6.3L V12"
],
"submodels": []
}
]
},
{
"year": "2013",
"models": [
{
"name": "458 Italia",
"engines": [
"4.5L V8"
],
"submodels": [
"Base"
]
}
]
},
{
"year": "2012",
"models": [
{
"name": "458 Italia",
"engines": [
"4.5L V8"
],
"submodels": [
"Base"
]
},
{
"name": "Ff",
"engines": [
"6.3L V12"
],
"submodels": []
}
]
},
{
"year": "2010",
"models": [
{
"name": "458 Italia",
"engines": [
"4.5L V8"
],
"submodels": [
"Base"
]
},
{
"name": "California",
"engines": [
"4.3L V8"
],
"submodels": []
}
]
},
{
"year": "2009",
"models": [
{
"name": "599 Gtb",
"engines": [
"6.0L V12"
],
"submodels": []
}
]
},
{
"year": "2008",
"models": [
{
"name": "599 Gtb",
"engines": [
"6.0L V12"
],
"submodels": []
},
{
"name": "F430",
"engines": [
"4.3L V8"
],
"submodels": [
"Spider",
"Base"
]
}
]
},
{
"year": "2007",
"models": [
{
"name": "F430",
"engines": [
"4.3L V8"
],
"submodels": [
"Spider",
"Base"
]
}
]
},
{
"year": "2006",
"models": [
{
"name": "612 Scaglietti",
"engines": [
"5.7L V12"
],
"submodels": []
},
{
"name": "F430",
"engines": [
"4.3L V8"
],
"submodels": [
"Spider",
"Base"
]
}
]
},
{
"year": "2005",
"models": [
{
"name": "F430",
"engines": [
"4.3L V8"
],
"submodels": [
"Spider",
"Base"
]
},
{
"name": "Superamerica",
"engines": [
"5.7L V12"
],
"submodels": []
}
]
},
{
"year": "2004",
"models": [
{
"name": "360",
"engines": [
"3.6L V8"
],
"submodels": [
"Challenge Stradale",
"Modena",
"Spider"
]
},
{
"name": "575 M Maranello",
"engines": [
"5.7L V12"
],
"submodels": []
},
{
"name": "Enzo",
"engines": [],
"submodels": []
}
]
},
{
"year": "2003",
"models": [
{
"name": "360",
"engines": [
"3.6L V8"
],
"submodels": [
"Challenge Stradale",
"Modena",
"Spider"
]
}
]
},
{
"year": "2002",
"models": [
{
"name": "360",
"engines": [
"3.6L V8"
],
"submodels": [
"Challenge Stradale",
"Modena",
"Spider"
]
}
]
},
{
"year": "2001",
"models": [
{
"name": "360",
"engines": [
"3.6L V8"
],
"submodels": [
"Challenge Stradale",
"Modena",
"Spider"
]
}
]
},
{
"year": "2000",
"models": [
{
"name": "360",
"engines": [
"3.6L V8"
],
"submodels": [
"Challenge Stradale",
"Modena",
"Spider"
]
}
]
},
{
"year": "1998",
"models": [
{
"name": "456 Gt",
"engines": [],
"submodels": []
}
]
},
{
"year": "1997",
"models": [
{
"name": "550 Maranello",
"engines": [
"5.5L V12"
],
"submodels": []
},
{
"name": "F355 Spider",
"engines": [
"3.5L V8"
],
"submodels": []
}
]
},
{
"year": "1996",
"models": [
{
"name": "F355 Spider",
"engines": [
"3.5L V8"
],
"submodels": []
}
]
},
{
"year": "1995",
"models": [
{
"name": "F355 Berlinetta",
"engines": [],
"submodels": []
}
]
},
{
"year": "1992",
"models": [
{
"name": "348 Tb",
"engines": [
"3.4L V8"
],
"submodels": []
}
]
},
{
"year": "1991",
"models": [
{
"name": "Mondial T",
"engines": [
"3.4L V8"
],
"submodels": []
},
{
"name": "Testarossa",
"engines": [
"4.9L H12"
],
"submodels": [
"Base"
]
}
]
},
{
"year": "1990",
"models": [
{
"name": "348 Ts",
"engines": [
"3.4L V8"
],
"submodels": []
}
]
},
{
"year": "1987",
"models": [
{
"name": "328 Gts",
"engines": [
"3.2L V8"
],
"submodels": []
},
{
"name": "Mondial 3 2",
"engines": [
"3.2L V8"
],
"submodels": []
},
{
"name": "Testarossa",
"engines": [
"4.9L H12"
],
"submodels": [
"Base"
]
}
]
},
{
"year": "1985",
"models": [
{
"name": "308 Gts",
"engines": [
"3.0L V8"
],
"submodels": [
"Base",
"Quattrovalvole"
]
}
]
},
{
"year": "1983",
"models": [
{
"name": "308 Gts",
"engines": [
"3.0L V8"
],
"submodels": [
"Base",
"Quattrovalvole"
]
}
]
},
{
"year": "1980",
"models": [
{
"name": "308 Gts",
"engines": [
"3.0L V8"
],
"submodels": [
"Base",
"Quattrovalvole"
]
}
]
},
{
"year": "1977",
"models": [
{
"name": "308 Gtb",
"engines": [
"3.0L V8"
],
"submodels": []
}
]
},
{
"year": "1972",
"models": [
{
"name": "365 Gtc 4",
"engines": [
"4.4L V12"
],
"submodels": []
},
{
"name": "Dino 246 Gt",
"engines": [
"2.4L V6"
],
"submodels": []
}
]
},
{
"year": "1966",
"models": [
{
"name": "275 Gtb",
"engines": [
"3.3L V12"
],
"submodels": []
},
{
"name": "500 Superfast",
"engines": [
"5.0L V12"
],
"submodels": []
}
]
}
]
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,414 @@
{
"genesis": [
{
"year": "2024",
"models": [
{
"name": "G90",
"engines": [
"3.3L V6",
"3.5L V6 MILD HYBRID EV- (MHEV)",
"5.0L V8"
],
"submodels": [
"E-Supercharger",
"Ultimate",
"Premium"
]
},
{
"name": "GV70",
"engines": [
"2.5L I4",
"3.5L V6"
],
"submodels": [
"Base",
"1.5 RS CVT Honda SENSING",
"Sport Plus",
"Prestige",
"Select",
"Advanced Plus",
"Sport Advanced",
"Advanced",
"Sport Prestige"
]
}
]
},
{
"year": "2023",
"models": [
{
"name": "G70",
"engines": [
"2.0L I4",
"3.3L V6"
],
"submodels": [
"Launch Edition",
"Base",
"Design",
"3.3T RWD",
"Advanced",
"Sport Prestige",
"Elite",
"Dynamic",
"Prestige",
"1.5 RS CVT Honda SENSING"
]
},
{
"name": "G80",
"engines": [
"2.5L I4",
"3.3L V6",
"3.5L V6",
"3.8L V6",
"5.0L V8"
],
"submodels": [
"Base",
"Advanced",
"Sport Prestige",
"5.0",
"Prestige",
"1.5 RS CVT Honda SENSING",
"3.8"
]
},
{
"name": "GV70",
"engines": [
"2.5L I4",
"3.5L V6"
],
"submodels": [
"Base",
"1.5 RS CVT Honda SENSING",
"Sport Plus",
"Prestige",
"Select",
"Advanced Plus",
"Sport Advanced",
"Advanced",
"Sport Prestige"
]
},
{
"name": "GV80",
"engines": [
"2.5L I4",
"3.5L V6"
],
"submodels": [
"Advanced",
"Advanced+",
"Prestige"
]
}
]
},
{
"year": "2022",
"models": [
{
"name": "G70",
"engines": [
"2.0L I4",
"3.3L V6"
],
"submodels": [
"Launch Edition",
"Base",
"Design",
"3.3T RWD",
"Advanced",
"Sport Prestige",
"Elite",
"Dynamic",
"Prestige",
"1.5 RS CVT Honda SENSING"
]
},
{
"name": "G80",
"engines": [
"2.5L I4",
"3.3L V6",
"3.5L V6",
"3.8L V6",
"5.0L V8"
],
"submodels": [
"Base",
"Advanced",
"Sport Prestige",
"5.0",
"Prestige",
"1.5 RS CVT Honda SENSING",
"3.8"
]
},
{
"name": "GV70",
"engines": [
"2.5L I4",
"3.5L V6"
],
"submodels": [
"Base",
"1.5 RS CVT Honda SENSING",
"Sport Plus",
"Prestige",
"Select",
"Advanced Plus",
"Sport Advanced",
"Advanced",
"Sport Prestige"
]
},
{
"name": "GV80",
"engines": [
"2.5L I4",
"3.5L V6"
],
"submodels": [
"Advanced",
"Advanced+",
"Prestige"
]
}
]
},
{
"year": "2021",
"models": [
{
"name": "G70",
"engines": [
"2.0L I4",
"3.3L V6"
],
"submodels": [
"Launch Edition",
"Base",
"Design",
"3.3T RWD",
"Advanced",
"Sport Prestige",
"Elite",
"Dynamic",
"Prestige",
"1.5 RS CVT Honda SENSING"
]
},
{
"name": "G80",
"engines": [
"2.5L I4",
"3.3L V6",
"3.5L V6",
"3.8L V6",
"5.0L V8"
],
"submodels": [
"Base",
"Advanced",
"Sport Prestige",
"5.0",
"Prestige",
"1.5 RS CVT Honda SENSING",
"3.8"
]
},
{
"name": "GV80",
"engines": [
"2.5L I4",
"3.5L V6"
],
"submodels": [
"Advanced",
"Advanced+",
"Prestige"
]
}
]
},
{
"year": "2020",
"models": [
{
"name": "G70",
"engines": [
"2.0L I4",
"3.3L V6"
],
"submodels": [
"Launch Edition",
"Base",
"Design",
"3.3T RWD",
"Advanced",
"Sport Prestige",
"Elite",
"Dynamic",
"Prestige",
"1.5 RS CVT Honda SENSING"
]
},
{
"name": "G80",
"engines": [
"2.5L I4",
"3.3L V6",
"3.5L V6",
"3.8L V6",
"5.0L V8"
],
"submodels": [
"Base",
"Advanced",
"Sport Prestige",
"5.0",
"Prestige",
"1.5 RS CVT Honda SENSING",
"3.8"
]
}
]
},
{
"year": "2019",
"models": [
{
"name": "G70",
"engines": [
"2.0L I4",
"3.3L V6"
],
"submodels": [
"Launch Edition",
"Base",
"Design",
"3.3T RWD",
"Advanced",
"Sport Prestige",
"Elite",
"Dynamic",
"Prestige",
"1.5 RS CVT Honda SENSING"
]
},
{
"name": "G80",
"engines": [
"2.5L I4",
"3.3L V6",
"3.5L V6",
"3.8L V6",
"5.0L V8"
],
"submodels": [
"Base",
"Advanced",
"Sport Prestige",
"5.0",
"Prestige",
"1.5 RS CVT Honda SENSING",
"3.8"
]
},
{
"name": "G90",
"engines": [
"3.3L V6",
"3.5L V6 MILD HYBRID EV- (MHEV)",
"5.0L V8"
],
"submodels": [
"E-Supercharger",
"Ultimate",
"Premium"
]
}
]
},
{
"year": "2018",
"models": [
{
"name": "G80",
"engines": [
"2.5L I4",
"3.3L V6",
"3.5L V6",
"3.8L V6",
"5.0L V8"
],
"submodels": [
"Base",
"Advanced",
"Sport Prestige",
"5.0",
"Prestige",
"1.5 RS CVT Honda SENSING",
"3.8"
]
},
{
"name": "G90",
"engines": [
"3.3L V6",
"3.5L V6 MILD HYBRID EV- (MHEV)",
"5.0L V8"
],
"submodels": [
"E-Supercharger",
"Ultimate",
"Premium"
]
}
]
},
{
"year": "2017",
"models": [
{
"name": "G80",
"engines": [
"2.5L I4",
"3.3L V6",
"3.5L V6",
"3.8L V6",
"5.0L V8"
],
"submodels": [
"Base",
"Advanced",
"Sport Prestige",
"5.0",
"Prestige",
"1.5 RS CVT Honda SENSING",
"3.8"
]
},
{
"name": "G90",
"engines": [
"3.3L V6",
"3.5L V6 MILD HYBRID EV- (MHEV)",
"5.0L V8"
],
"submodels": [
"E-Supercharger",
"Ultimate",
"Premium"
]
}
]
}
]
}

View File

@@ -0,0 +1,383 @@
{
"geo": [
{
"year": "1997",
"models": [
{
"name": "Metro",
"engines": [
"1.0L L3",
"1.3L I4"
],
"submodels": [
"XFi",
"LSi",
"Base"
]
},
{
"name": "Prizm",
"engines": [
"1.6L I4",
"1.8L I4"
],
"submodels": [
"GSi",
"LSi",
"Base"
]
},
{
"name": "Tracker",
"engines": [
"1.6L I4"
],
"submodels": [
"LSi",
"Base"
]
}
]
},
{
"year": "1996",
"models": [
{
"name": "Metro",
"engines": [
"1.0L L3",
"1.3L I4"
],
"submodels": [
"XFi",
"LSi",
"Base"
]
},
{
"name": "Prizm",
"engines": [
"1.6L I4",
"1.8L I4"
],
"submodels": [
"GSi",
"LSi",
"Base"
]
},
{
"name": "Tracker",
"engines": [
"1.6L I4"
],
"submodels": [
"LSi",
"Base"
]
}
]
},
{
"year": "1995",
"models": [
{
"name": "Metro",
"engines": [
"1.0L L3",
"1.3L I4"
],
"submodels": [
"XFi",
"LSi",
"Base"
]
},
{
"name": "Prizm",
"engines": [
"1.6L I4",
"1.8L I4"
],
"submodels": [
"GSi",
"LSi",
"Base"
]
},
{
"name": "Tracker",
"engines": [
"1.6L I4"
],
"submodels": [
"LSi",
"Base"
]
}
]
},
{
"year": "1994",
"models": [
{
"name": "Metro",
"engines": [
"1.0L L3",
"1.3L I4"
],
"submodels": [
"XFi",
"LSi",
"Base"
]
},
{
"name": "Prizm",
"engines": [
"1.6L I4",
"1.8L I4"
],
"submodels": [
"GSi",
"LSi",
"Base"
]
},
{
"name": "Tracker",
"engines": [
"1.6L I4"
],
"submodels": [
"LSi",
"Base"
]
}
]
},
{
"year": "1993",
"models": [
{
"name": "Metro",
"engines": [
"1.0L L3",
"1.3L I4"
],
"submodels": [
"XFi",
"LSi",
"Base"
]
},
{
"name": "Prizm",
"engines": [
"1.6L I4",
"1.8L I4"
],
"submodels": [
"GSi",
"LSi",
"Base"
]
},
{
"name": "Storm",
"engines": [
"1.6L I4"
],
"submodels": [
"2+2",
"2+2 GSi"
]
},
{
"name": "Tracker",
"engines": [
"1.6L I4"
],
"submodels": [
"LSi",
"Base"
]
}
]
},
{
"year": "1992",
"models": [
{
"name": "Metro",
"engines": [
"1.0L L3",
"1.3L I4"
],
"submodels": [
"XFi",
"LSi",
"Base"
]
},
{
"name": "Prizm",
"engines": [
"1.6L I4",
"1.8L I4"
],
"submodels": [
"GSi",
"LSi",
"Base"
]
},
{
"name": "Storm",
"engines": [
"1.6L I4"
],
"submodels": [
"2+2",
"2+2 GSi"
]
},
{
"name": "Tracker",
"engines": [
"1.6L I4"
],
"submodels": [
"LSi",
"Base"
]
}
]
},
{
"year": "1991",
"models": [
{
"name": "Metro",
"engines": [
"1.0L L3",
"1.3L I4"
],
"submodels": [
"XFi",
"LSi",
"Base"
]
},
{
"name": "Prizm",
"engines": [
"1.6L I4",
"1.8L I4"
],
"submodels": [
"GSi",
"LSi",
"Base"
]
},
{
"name": "Storm",
"engines": [
"1.6L I4"
],
"submodels": [
"2+2",
"2+2 GSi"
]
},
{
"name": "Tracker",
"engines": [
"1.6L I4"
],
"submodels": [
"LSi",
"Base"
]
}
]
},
{
"year": "1990",
"models": [
{
"name": "Metro",
"engines": [
"1.0L L3",
"1.3L I4"
],
"submodels": [
"XFi",
"LSi",
"Base"
]
},
{
"name": "Prizm",
"engines": [
"1.6L I4",
"1.8L I4"
],
"submodels": [
"GSi",
"LSi",
"Base"
]
},
{
"name": "Storm",
"engines": [
"1.6L I4"
],
"submodels": [
"2+2",
"2+2 GSi"
]
},
{
"name": "Tracker",
"engines": [
"1.6L I4"
],
"submodels": [
"LSi",
"Base"
]
}
]
},
{
"year": "1989",
"models": [
{
"name": "Metro",
"engines": [
"1.0L L3",
"1.3L I4"
],
"submodels": [
"XFi",
"LSi",
"Base"
]
},
{
"name": "Tracker",
"engines": [
"1.6L I4"
],
"submodels": [
"LSi",
"Base"
]
}
]
}
]
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,269 @@
{
"hummer": [
{
"year": "2010",
"models": [
{
"name": "H3",
"engines": [
"3.5L L5",
"3.7L L5",
"5.3L V8"
],
"submodels": [
"Championship Series",
"X",
"Alpha",
"Adventure",
"Luxury",
"Base"
]
},
{
"name": "H3T",
"engines": [
"3.7L L5",
"5.3L V8 FLEX",
"5.3L V8"
],
"submodels": [
"Base",
"Alpha"
]
}
]
},
{
"year": "2009",
"models": [
{
"name": "H2",
"engines": [
"6.0L V8",
"6.2L V8 FLEX",
"6.2L V8"
],
"submodels": [
"Special Edition",
"Adventure",
"Base",
"Luxury"
]
},
{
"name": "H3",
"engines": [
"3.5L L5",
"3.7L L5",
"5.3L V8"
],
"submodels": [
"Championship Series",
"X",
"Alpha",
"Adventure",
"Luxury",
"Base"
]
},
{
"name": "H3T",
"engines": [
"3.7L L5",
"5.3L V8 FLEX",
"5.3L V8"
],
"submodels": [
"Base",
"Alpha"
]
}
]
},
{
"year": "2008",
"models": [
{
"name": "H2",
"engines": [
"6.0L V8",
"6.2L V8 FLEX",
"6.2L V8"
],
"submodels": [
"Special Edition",
"Adventure",
"Base",
"Luxury"
]
},
{
"name": "H3",
"engines": [
"3.5L L5",
"3.7L L5",
"5.3L V8"
],
"submodels": [
"Championship Series",
"X",
"Alpha",
"Adventure",
"Luxury",
"Base"
]
}
]
},
{
"year": "2007",
"models": [
{
"name": "H2",
"engines": [
"6.0L V8",
"6.2L V8 FLEX",
"6.2L V8"
],
"submodels": [
"Special Edition",
"Adventure",
"Base",
"Luxury"
]
},
{
"name": "H3",
"engines": [
"3.5L L5",
"3.7L L5",
"5.3L V8"
],
"submodels": [
"Championship Series",
"X",
"Alpha",
"Adventure",
"Luxury",
"Base"
]
}
]
},
{
"year": "2006",
"models": [
{
"name": "H2",
"engines": [
"6.0L V8",
"6.2L V8 FLEX",
"6.2L V8"
],
"submodels": [
"Special Edition",
"Adventure",
"Base",
"Luxury"
]
},
{
"name": "H3",
"engines": [
"3.5L L5",
"3.7L L5",
"5.3L V8"
],
"submodels": [
"Championship Series",
"X",
"Alpha",
"Adventure",
"Luxury",
"Base"
]
}
]
},
{
"year": "2005",
"models": [
{
"name": "H2",
"engines": [
"6.0L V8",
"6.2L V8 FLEX",
"6.2L V8"
],
"submodels": [
"Special Edition",
"Adventure",
"Base",
"Luxury"
]
}
]
},
{
"year": "2004",
"models": [
{
"name": "H2",
"engines": [
"6.0L V8",
"6.2L V8 FLEX",
"6.2L V8"
],
"submodels": [
"Special Edition",
"Adventure",
"Base",
"Luxury"
]
}
]
},
{
"year": "2003",
"models": [
{
"name": "H1",
"engines": [
"6.5L V8"
],
"submodels": [
"Base"
]
},
{
"name": "H2",
"engines": [
"6.0L V8",
"6.2L V8 FLEX",
"6.2L V8"
],
"submodels": [
"Special Edition",
"Adventure",
"Base",
"Luxury"
]
}
]
},
{
"year": "2002",
"models": [
{
"name": "H1",
"engines": [
"6.5L V8"
],
"submodels": [
"Base"
]
}
]
}
]
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,221 @@
{
"lamborghini": [
{
"year": "2023",
"models": [
{
"name": "Urus",
"engines": [
"4.0L V8"
],
"submodels": [
"Base",
"Performante"
]
}
]
},
{
"year": "2021",
"models": [
{
"name": "Huracan",
"engines": [
"5.2L V10"
],
"submodels": []
}
]
},
{
"year": "2018",
"models": [
{
"name": "Urus",
"engines": [
"4.0L V8"
],
"submodels": [
"Base",
"Performante"
]
}
]
},
{
"year": "2011",
"models": [
{
"name": "Gallardo",
"engines": [
"5.0L V10",
"5.2L V10"
],
"submodels": [
"Spyder",
"LP550-2",
"Superleggera",
"Base"
]
}
]
},
{
"year": "2009",
"models": [
{
"name": "Gallardo",
"engines": [
"5.0L V10",
"5.2L V10"
],
"submodels": [
"Spyder",
"LP550-2",
"Superleggera",
"Base"
]
}
]
},
{
"year": "2008",
"models": [
{
"name": "Gallardo",
"engines": [
"5.0L V10",
"5.2L V10"
],
"submodels": [
"Spyder",
"LP550-2",
"Superleggera",
"Base"
]
}
]
},
{
"year": "2007",
"models": [
{
"name": "Gallardo",
"engines": [
"5.0L V10",
"5.2L V10"
],
"submodels": [
"Spyder",
"LP550-2",
"Superleggera",
"Base"
]
}
]
},
{
"year": "2006",
"models": [
{
"name": "Gallardo",
"engines": [
"5.0L V10",
"5.2L V10"
],
"submodels": [
"Spyder",
"LP550-2",
"Superleggera",
"Base"
]
},
{
"name": "Murcielago",
"engines": [
"6.2L V12"
],
"submodels": [
"Base"
]
}
]
},
{
"year": "2004",
"models": [
{
"name": "Murcielago",
"engines": [
"6.2L V12"
],
"submodels": [
"Base"
]
}
]
},
{
"year": "2003",
"models": [
{
"name": "Murcielago",
"engines": [
"6.2L V12"
],
"submodels": [
"Base"
]
}
]
},
{
"year": "2002",
"models": [
{
"name": "Murcielago",
"engines": [
"6.2L V12"
],
"submodels": [
"Base"
]
}
]
},
{
"year": "1992",
"models": [
{
"name": "Diablo",
"engines": [],
"submodels": []
}
]
},
{
"year": "1988",
"models": [
{
"name": "Countach",
"engines": [
"5.2L V12"
],
"submodels": []
}
]
},
{
"year": "1974",
"models": [
{
"name": "Urraco",
"engines": [
"2.5L V8"
],
"submodels": []
}
]
}
]
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,829 @@
{
"lotus": [
{
"year": "2024",
"models": [
{
"name": "Emira",
"engines": [
"2.0L I4",
"3.5L V6"
],
"submodels": [
"First Edition",
"Base"
]
}
]
},
{
"year": "2023",
"models": [
{
"name": "Emira",
"engines": [
"2.0L I4",
"3.5L V6"
],
"submodels": [
"First Edition",
"Base"
]
}
]
},
{
"year": "2021",
"models": [
{
"name": "Evora Gt",
"engines": [
"3.5L V6"
],
"submodels": []
}
]
},
{
"year": "2020",
"models": [
{
"name": "Evora Gt",
"engines": [
"3.5L V6"
],
"submodels": []
}
]
},
{
"year": "2017",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
},
{
"name": "Evora",
"engines": [
"3.5L V6"
],
"submodels": [
"S SR",
"Sports Racer",
"400",
"S",
"Base"
]
}
]
},
{
"year": "2016",
"models": [
{
"name": "Evora",
"engines": [
"3.5L V6"
],
"submodels": [
"S SR",
"Sports Racer",
"400",
"S",
"Base"
]
}
]
},
{
"year": "2015",
"models": [
{
"name": "Evora",
"engines": [
"3.5L V6"
],
"submodels": [
"S SR",
"Sports Racer",
"400",
"S",
"Base"
]
}
]
},
{
"year": "2014",
"models": [
{
"name": "Evora",
"engines": [
"3.5L V6"
],
"submodels": [
"S SR",
"Sports Racer",
"400",
"S",
"Base"
]
},
{
"name": "Exige",
"engines": [
"1.8L I4"
],
"submodels": [
"S",
"Base",
"S 240"
]
}
]
},
{
"year": "2013",
"models": [
{
"name": "Evora",
"engines": [
"3.5L V6"
],
"submodels": [
"S SR",
"Sports Racer",
"400",
"S",
"Base"
]
}
]
},
{
"year": "2012",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
},
{
"name": "Evora",
"engines": [
"3.5L V6"
],
"submodels": [
"S SR",
"Sports Racer",
"400",
"S",
"Base"
]
}
]
},
{
"year": "2011",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
},
{
"name": "Evora",
"engines": [
"3.5L V6"
],
"submodels": [
"S SR",
"Sports Racer",
"400",
"S",
"Base"
]
}
]
},
{
"year": "2010",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
},
{
"name": "Evora",
"engines": [
"3.5L V6"
],
"submodels": [
"S SR",
"Sports Racer",
"400",
"S",
"Base"
]
},
{
"name": "Exige",
"engines": [
"1.8L I4"
],
"submodels": [
"S",
"Base",
"S 240"
]
}
]
},
{
"year": "2009",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
},
{
"name": "Exige",
"engines": [
"1.8L I4"
],
"submodels": [
"S",
"Base",
"S 240"
]
}
]
},
{
"year": "2008",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
},
{
"name": "Exige",
"engines": [
"1.8L I4"
],
"submodels": [
"S",
"Base",
"S 240"
]
}
]
},
{
"year": "2007",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
},
{
"name": "Europa S",
"engines": [],
"submodels": []
},
{
"name": "Exige",
"engines": [
"1.8L I4"
],
"submodels": [
"S",
"Base",
"S 240"
]
}
]
},
{
"year": "2006",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
},
{
"name": "Exige",
"engines": [
"1.8L I4"
],
"submodels": [
"S",
"Base",
"S 240"
]
}
]
},
{
"year": "2005",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
}
]
},
{
"year": "2004",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
}
]
},
{
"year": "2003",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
}
]
},
{
"year": "2002",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
}
]
},
{
"year": "2001",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
}
]
},
{
"year": "2000",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
}
]
},
{
"year": "1999",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
}
]
},
{
"year": "1998",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
}
]
},
{
"year": "1997",
"models": [
{
"name": "Elise",
"engines": [
"1.8L I4"
],
"submodels": [
"R",
"111 S",
"250 Cup",
"111s",
"111R",
"111",
"1.8",
"SC",
"Base"
]
}
]
},
{
"year": "1994",
"models": [
{
"name": "Esprit",
"engines": [
"2.2L I4"
],
"submodels": [
"S4",
"Turbo SE",
"Turbo"
]
}
]
},
{
"year": "1991",
"models": [
{
"name": "Elan",
"engines": [
"1.6L I4"
],
"submodels": [
"S4",
"2",
"2S 130",
"Base"
]
}
]
},
{
"year": "1990",
"models": [
{
"name": "Elan",
"engines": [
"1.6L I4"
],
"submodels": [
"S4",
"2",
"2S 130",
"Base"
]
},
{
"name": "Esprit",
"engines": [
"2.2L I4"
],
"submodels": [
"S4",
"Turbo SE",
"Turbo"
]
}
]
},
{
"year": "1989",
"models": [
{
"name": "Esprit",
"engines": [
"2.2L I4"
],
"submodels": [
"S4",
"Turbo SE",
"Turbo"
]
}
]
},
{
"year": "1987",
"models": [
{
"name": "Esprit",
"engines": [
"2.2L I4"
],
"submodels": [
"S4",
"Turbo SE",
"Turbo"
]
}
]
},
{
"year": "1972",
"models": [
{
"name": "Elan",
"engines": [
"1.6L I4"
],
"submodels": [
"S4",
"2",
"2S 130",
"Base"
]
}
]
},
{
"year": "1971",
"models": [
{
"name": "Elan",
"engines": [
"1.6L I4"
],
"submodels": [
"S4",
"2",
"2S 130",
"Base"
]
}
]
},
{
"year": "1969",
"models": [
{
"name": "Elan",
"engines": [
"1.6L I4"
],
"submodels": [
"S4",
"2",
"2S 130",
"Base"
]
}
]
},
{
"year": "1967",
"models": [
{
"name": "Elan",
"engines": [
"1.6L I4"
],
"submodels": [
"S4",
"2",
"2S 130",
"Base"
]
}
]
},
{
"year": "1966",
"models": [
{
"name": "Elan",
"engines": [
"1.6L I4"
],
"submodels": [
"S4",
"2",
"2S 130",
"Base"
]
}
]
},
{
"year": "1964",
"models": [
{
"name": "Seven",
"engines": [],
"submodels": []
}
]
},
{
"year": "1962",
"models": [
{
"name": "Super Seven",
"engines": [
"1.5L I4"
],
"submodels": []
}
]
}
]
}

View File

@@ -0,0 +1,14 @@
{
"lucid": [
{
"year": "2023",
"models": [
{
"name": "Air",
"engines": [],
"submodels": []
}
]
}
]
}

View File

@@ -0,0 +1,692 @@
{
"maserati": [
{
"year": "2023",
"models": [
{
"name": "Ghibli",
"engines": [
"3.0L V6"
],
"submodels": [
"Modena",
"S",
"Base",
"Modena Q4",
"S Q4"
]
}
]
},
{
"year": "2022",
"models": [
{
"name": "Ghibli",
"engines": [
"3.0L V6"
],
"submodels": [
"Modena",
"S",
"Base",
"Modena Q4",
"S Q4"
]
},
{
"name": "Levante",
"engines": [
"3.0L V6"
],
"submodels": [
"S",
"Modena",
"Base"
]
}
]
},
{
"year": "2021",
"models": [
{
"name": "Ghibli",
"engines": [
"3.0L V6"
],
"submodels": [
"Modena",
"S",
"Base",
"Modena Q4",
"S Q4"
]
}
]
},
{
"year": "2020",
"models": [
{
"name": "Levante",
"engines": [
"3.0L V6"
],
"submodels": [
"S",
"Modena",
"Base"
]
}
]
},
{
"year": "2019",
"models": [
{
"name": "Levante",
"engines": [
"3.0L V6"
],
"submodels": [
"S",
"Modena",
"Base"
]
}
]
},
{
"year": "2018",
"models": [
{
"name": "Ghibli",
"engines": [
"3.0L V6"
],
"submodels": [
"Modena",
"S",
"Base",
"Modena Q4",
"S Q4"
]
},
{
"name": "Levante",
"engines": [
"3.0L V6"
],
"submodels": [
"S",
"Modena",
"Base"
]
},
{
"name": "Quattroporte",
"engines": [
"3.0L V6",
"3.2L V8",
"3.8L V8",
"4.2L V8",
"4.7L V8",
"4.9L V8"
],
"submodels": [
"Executive GT",
"Sport GT S",
"Evoluzione",
"GTS",
"Sport GT",
"S Q4",
"Base"
]
}
]
},
{
"year": "2017",
"models": [
{
"name": "Ghibli",
"engines": [
"3.0L V6"
],
"submodels": [
"Modena",
"S",
"Base",
"Modena Q4",
"S Q4"
]
},
{
"name": "Levante",
"engines": [
"3.0L V6"
],
"submodels": [
"S",
"Modena",
"Base"
]
}
]
},
{
"year": "2016",
"models": [
{
"name": "Ghibli",
"engines": [
"3.0L V6"
],
"submodels": [
"Modena",
"S",
"Base",
"Modena Q4",
"S Q4"
]
},
{
"name": "Quattroporte",
"engines": [
"3.0L V6",
"3.2L V8",
"3.8L V8",
"4.2L V8",
"4.7L V8",
"4.9L V8"
],
"submodels": [
"Executive GT",
"Sport GT S",
"Evoluzione",
"GTS",
"Sport GT",
"S Q4",
"Base"
]
}
]
},
{
"year": "2015",
"models": [
{
"name": "Ghibli",
"engines": [
"3.0L V6"
],
"submodels": [
"Modena",
"S",
"Base",
"Modena Q4",
"S Q4"
]
},
{
"name": "Granturismo",
"engines": [
"4.2L V8",
"4.7L V8"
],
"submodels": [
"Base",
"S",
"MC",
"1.5 RS CVT Honda SENSING"
]
}
]
},
{
"year": "2014",
"models": [
{
"name": "Ghibli",
"engines": [
"3.0L V6"
],
"submodels": [
"Modena",
"S",
"Base",
"Modena Q4",
"S Q4"
]
},
{
"name": "Granturismo",
"engines": [
"4.2L V8",
"4.7L V8"
],
"submodels": [
"Base",
"S",
"MC",
"1.5 RS CVT Honda SENSING"
]
},
{
"name": "Quattroporte",
"engines": [
"3.0L V6",
"3.2L V8",
"3.8L V8",
"4.2L V8",
"4.7L V8",
"4.9L V8"
],
"submodels": [
"Executive GT",
"Sport GT S",
"Evoluzione",
"GTS",
"Sport GT",
"S Q4",
"Base"
]
}
]
},
{
"year": "2013",
"models": [
{
"name": "Granturismo",
"engines": [
"4.2L V8",
"4.7L V8"
],
"submodels": [
"Base",
"S",
"MC",
"1.5 RS CVT Honda SENSING"
]
}
]
},
{
"year": "2012",
"models": [
{
"name": "Granturismo",
"engines": [
"4.2L V8",
"4.7L V8"
],
"submodels": [
"Base",
"S",
"MC",
"1.5 RS CVT Honda SENSING"
]
}
]
},
{
"year": "2011",
"models": [
{
"name": "Granturismo",
"engines": [
"4.2L V8",
"4.7L V8"
],
"submodels": [
"Base",
"S",
"MC",
"1.5 RS CVT Honda SENSING"
]
}
]
},
{
"year": "2010",
"models": [
{
"name": "Granturismo",
"engines": [
"4.2L V8",
"4.7L V8"
],
"submodels": [
"Base",
"S",
"MC",
"1.5 RS CVT Honda SENSING"
]
},
{
"name": "Quattroporte",
"engines": [
"3.0L V6",
"3.2L V8",
"3.8L V8",
"4.2L V8",
"4.7L V8",
"4.9L V8"
],
"submodels": [
"Executive GT",
"Sport GT S",
"Evoluzione",
"GTS",
"Sport GT",
"S Q4",
"Base"
]
}
]
},
{
"year": "2009",
"models": [
{
"name": "Granturismo",
"engines": [
"4.2L V8",
"4.7L V8"
],
"submodels": [
"Base",
"S",
"MC",
"1.5 RS CVT Honda SENSING"
]
},
{
"name": "Quattroporte",
"engines": [
"3.0L V6",
"3.2L V8",
"3.8L V8",
"4.2L V8",
"4.7L V8",
"4.9L V8"
],
"submodels": [
"Executive GT",
"Sport GT S",
"Evoluzione",
"GTS",
"Sport GT",
"S Q4",
"Base"
]
}
]
},
{
"year": "2008",
"models": [
{
"name": "Granturismo",
"engines": [
"4.2L V8",
"4.7L V8"
],
"submodels": [
"Base",
"S",
"MC",
"1.5 RS CVT Honda SENSING"
]
},
{
"name": "Quattroporte",
"engines": [
"3.0L V6",
"3.2L V8",
"3.8L V8",
"4.2L V8",
"4.7L V8",
"4.9L V8"
],
"submodels": [
"Executive GT",
"Sport GT S",
"Evoluzione",
"GTS",
"Sport GT",
"S Q4",
"Base"
]
}
]
},
{
"year": "2007",
"models": [
{
"name": "Gransport",
"engines": [
"4.2L V8"
],
"submodels": [
"Base"
]
},
{
"name": "Quattroporte",
"engines": [
"3.0L V6",
"3.2L V8",
"3.8L V8",
"4.2L V8",
"4.7L V8",
"4.9L V8"
],
"submodels": [
"Executive GT",
"Sport GT S",
"Evoluzione",
"GTS",
"Sport GT",
"S Q4",
"Base"
]
}
]
},
{
"year": "2006",
"models": [
{
"name": "Gransport",
"engines": [
"4.2L V8"
],
"submodels": [
"Base"
]
},
{
"name": "Quattroporte",
"engines": [
"3.0L V6",
"3.2L V8",
"3.8L V8",
"4.2L V8",
"4.7L V8",
"4.9L V8"
],
"submodels": [
"Executive GT",
"Sport GT S",
"Evoluzione",
"GTS",
"Sport GT",
"S Q4",
"Base"
]
}
]
},
{
"year": "2005",
"models": [
{
"name": "Coupe",
"engines": [
"4.2L V8"
],
"submodels": [
"GT"
]
}
]
},
{
"year": "2004",
"models": [
{
"name": "Quattroporte",
"engines": [
"3.0L V6",
"3.2L V8",
"3.8L V8",
"4.2L V8",
"4.7L V8",
"4.9L V8"
],
"submodels": [
"Executive GT",
"Sport GT S",
"Evoluzione",
"GTS",
"Sport GT",
"S Q4",
"Base"
]
},
{
"name": "Spyder",
"engines": [
"4.2L V8"
],
"submodels": [
"GT"
]
}
]
},
{
"year": "2003",
"models": [
{
"name": "Coupe",
"engines": [
"4.2L V8"
],
"submodels": [
"GT"
]
},
{
"name": "Spyder",
"engines": [
"4.2L V8"
],
"submodels": [
"GT"
]
}
]
},
{
"year": "2001",
"models": [
{
"name": "3200GT",
"engines": [],
"submodels": [
"Base"
]
}
]
},
{
"year": "2000",
"models": [
{
"name": "3200GT",
"engines": [],
"submodels": [
"Base"
]
}
]
},
{
"year": "1999",
"models": [
{
"name": "Quattroporte",
"engines": [
"3.0L V6",
"3.2L V8",
"3.8L V8",
"4.2L V8",
"4.7L V8",
"4.9L V8"
],
"submodels": [
"Executive GT",
"Sport GT S",
"Evoluzione",
"GTS",
"Sport GT",
"S Q4",
"Base"
]
}
]
},
{
"year": "1980",
"models": [
{
"name": "Quattroporte",
"engines": [
"3.0L V6",
"3.2L V8",
"3.8L V8",
"4.2L V8",
"4.7L V8",
"4.9L V8"
],
"submodels": [
"Executive GT",
"Sport GT S",
"Evoluzione",
"GTS",
"Sport GT",
"S Q4",
"Base"
]
}
]
},
{
"year": "1964",
"models": [
{
"name": "Sebring",
"engines": [
"3.5L L6"
],
"submodels": []
}
]
}
]
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,159 @@
{
"mclaren": [
{
"year": "2024",
"models": [
{
"name": "Artura",
"engines": [
"3.0L V6 PLUG-IN HYBRID EV- (PHEV)"
],
"submodels": []
}
]
},
{
"year": "2022",
"models": [
{
"name": "720S",
"engines": [
"4.0L V8"
],
"submodels": [
"Spider",
"Base"
]
}
]
},
{
"year": "2021",
"models": [
{
"name": "Gt",
"engines": [
"4.0L V8"
],
"submodels": []
}
]
},
{
"year": "2019",
"models": [
{
"name": "600LT",
"engines": [
"3.8L V8"
],
"submodels": []
},
{
"name": "720S",
"engines": [
"4.0L V8"
],
"submodels": [
"Spider",
"Base"
]
}
]
},
{
"year": "2018",
"models": [
{
"name": "570S",
"engines": [
"3.8L V8"
],
"submodels": []
},
{
"name": "720S",
"engines": [
"4.0L V8"
],
"submodels": [
"Spider",
"Base"
]
}
]
},
{
"year": "2017",
"models": [
{
"name": "570GT",
"engines": [
"3.8L V8"
],
"submodels": []
}
]
},
{
"year": "2016",
"models": [
{
"name": "650S",
"engines": [
"3.8L V8"
],
"submodels": []
}
]
},
{
"year": "2015",
"models": [
{
"name": "650S",
"engines": [
"3.8L V8"
],
"submodels": []
}
]
},
{
"year": "2014",
"models": [
{
"name": "MP4 12C",
"engines": [
"3.8L V8"
],
"submodels": []
}
]
},
{
"year": "2013",
"models": [
{
"name": "MP4 12C",
"engines": [
"3.8L V8"
],
"submodels": []
}
]
},
{
"year": "2012",
"models": [
{
"name": "MP4 12C",
"engines": [
"3.8L V8"
],
"submodels": []
}
]
}
]
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,26 @@
{
"polestar": [
{
"year": "2022",
"models": [
{
"name": "Polestar 2",
"engines": [],
"submodels": []
}
]
},
{
"year": "2021",
"models": [
{
"name": "Polestar 1",
"engines": [
"2.0L I4 PLUG-IN HYBRID EV- (PHEV)"
],
"submodels": []
}
]
}
]
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,14 @@
{
"rivian": [
{
"year": "2024",
"models": [
{
"name": "R1T",
"engines": [],
"submodels": []
}
]
}
]
}

Some files were not shown because too many files have changed in this diff Show More