Initial Commit

This commit is contained in:
Eric Gullickson
2025-09-17 16:09:15 -05:00
parent 0cdb9803de
commit a052040e3a
373 changed files with 437090 additions and 6773 deletions

View File

@@ -0,0 +1,36 @@
# Build stage: installs dependencies and produces the static Vite bundle in /app/dist.
FROM node:18-alpine AS builder
WORKDIR /app
# Copy the package manifests first so the dependency layer stays cached until
# they change. The glob also picks up package-lock.json when one exists, which
# keeps installs reproducible.
COPY package*.json ./
RUN npm install
# Copy source code
COPY . .
# Build arguments for environment variables.
# These are consumed while `npm run build` runs; the final image only serves
# static files, so they must be supplied with --build-arg at build time.
ARG VITE_AUTH0_DOMAIN
ARG VITE_AUTH0_CLIENT_ID
ARG VITE_TENANTS_API_URL
# Set environment variables for build
ENV VITE_AUTH0_DOMAIN=${VITE_AUTH0_DOMAIN}
ENV VITE_AUTH0_CLIENT_ID=${VITE_AUTH0_CLIENT_ID}
ENV VITE_TENANTS_API_URL=${VITE_TENANTS_API_URL}
# Build the application
RUN npm run build
# Production stage: nginx serving the pre-built bundle.
FROM nginx:alpine
# Copy built app to nginx
COPY --from=builder /app/dist /usr/share/nginx/html
# Copy nginx configuration
COPY nginx.conf /etc/nginx/nginx.conf
# Must match the `listen` port in nginx.conf.
EXPOSE 3000
CMD ["nginx", "-g", "daemon off;"]

View File

@@ -0,0 +1,13 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>MotoVaultPro - Vehicle Management Platform</title>
</head>
<body>
<div id="root"></div>
<script type="module" src="/src/main.tsx"></script>
</body>
</html>

View File

@@ -0,0 +1,27 @@
events {
    worker_connections 1024;
}
http {
    include /etc/nginx/mime.types;
    default_type application/octet-stream;
    # Single HTTP server for internal proxying (edge TLS handled by nginx-proxy)
    server {
        listen 3000;
        server_name localhost motovaultpro.com;
        root /usr/share/nginx/html;
        index index.html;
        # Handle React Router (SPA): unknown paths fall back to index.html so
        # client-side routes resolve after a hard refresh.
        location / {
            try_files $uri $uri/ /index.html;
        }
        # Security headers. `always` makes nginx attach them to error responses
        # (4xx/5xx) too; without it they are only added to success and redirect
        # status codes.
        add_header X-Frame-Options DENY always;
        add_header X-Content-Type-Options nosniff always;
        add_header X-XSS-Protection "1; mode=block" always;
    }
}

View File

@@ -0,0 +1,26 @@
events {
    worker_connections 1024;
}
http {
    include /etc/nginx/mime.types;
    default_type application/octet-stream;
    server {
        listen 3000;
        server_name localhost;
        root /usr/share/nginx/html;
        index index.html;
        # Handle React Router (SPA): unknown paths fall back to index.html so
        # client-side routes resolve after a hard refresh.
        location / {
            try_files $uri $uri/ /index.html;
        }
        # Security headers. `always` makes nginx attach them to error responses
        # (4xx/5xx) too; without it they are only added to success and redirect
        # status codes.
        add_header X-Frame-Options DENY always;
        add_header X-Content-Type-Options nosniff always;
        add_header X-XSS-Protection "1; mode=block" always;
    }
}

View File

@@ -0,0 +1,24 @@
{
"name": "mvp-platform-landing",
"version": "1.0.0",
"type": "module",
"scripts": {
"dev": "vite",
"build": "tsc && vite build",
"preview": "vite preview"
},
"dependencies": {
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-router-dom": "^6.8.0",
"@auth0/auth0-react": "^2.2.3",
"axios": "^1.6.2"
},
"devDependencies": {
"@types/react": "^18.2.0",
"@types/react-dom": "^18.2.0",
"@vitejs/plugin-react": "^4.2.0",
"typescript": "^5.6.3",
"vite": "^5.0.6"
}
}

View File

@@ -0,0 +1,19 @@
import { Routes, Route } from 'react-router-dom'
import HomePage from './components/HomePage'
import TenantSignup from './components/TenantSignup'
import CallbackHandler from './components/CallbackHandler'
/**
 * Route table for the landing site.
 *
 * - `/`                  marketing home page
 * - `/signup/:tenantId`  tenant-specific signup page
 * - `/callback`          redirect target used after authentication
 */
const App = () => (
  <div className="App">
    <Routes>
      <Route path="/" element={<HomePage />} />
      <Route path="/signup/:tenantId" element={<TenantSignup />} />
      <Route path="/callback" element={<CallbackHandler />} />
    </Routes>
  </div>
)
export default App

View File

@@ -0,0 +1,22 @@
import React, { useEffect } from 'react'
/**
 * Legacy `/callback` route.
 *
 * The landing page no longer completes the Auth0 flow itself, so this
 * component only forwards the browser to the admin application and shows a
 * short notice while the navigation happens.
 */
const CallbackHandler: React.FC = () => {
  useEffect(() => {
    // replace() swaps the current history entry instead of pushing a new one,
    // so pressing Back does not land on /callback and bounce forward again.
    window.location.replace('https://admin.motovaultpro.com')
  }, [])
  return (
    <div style={{
      padding: '2rem',
      textAlign: 'center',
      fontFamily: 'Arial, sans-serif'
    }}>
      <h2>Redirecting...</h2>
      <p>Please wait while we redirect you to MotoVaultPro.</p>
    </div>
  )
}
export default CallbackHandler

View File

@@ -0,0 +1,55 @@
import React from 'react'
// Inline style objects, hoisted so the JSX below stays readable.
const pageStyle: React.CSSProperties = { padding: '2rem', fontFamily: 'Arial, sans-serif' }
const headerStyle: React.CSSProperties = { textAlign: 'center', marginBottom: '3rem' }
const mainStyle: React.CSSProperties = { maxWidth: '800px', margin: '0 auto' }
const featuresStyle: React.CSSProperties = { marginBottom: '3rem' }
const ctaSectionStyle: React.CSSProperties = { textAlign: 'center' }
const ctaButtonStyle: React.CSSProperties = {
  padding: '1rem 2rem',
  fontSize: '1.1rem',
  backgroundColor: '#007bff',
  color: 'white',
  border: 'none',
  borderRadius: '4px',
  cursor: 'pointer'
}
const inviteHintStyle: React.CSSProperties = { marginTop: '2rem' }

/**
 * Marketing home page: product summary, feature list, and a button that sends
 * existing users to the admin tenant to log in.
 */
const HomePage: React.FC = () => {
  // Login is handled by the admin tenant, so this is a plain navigation.
  function goToDashboard() {
    window.location.href = 'https://admin.motovaultpro.com'
  }

  return (
    <div style={pageStyle}>
      <header style={headerStyle}>
        <h1>MotoVaultPro</h1>
        <p>The complete vehicle management platform for automotive professionals</p>
      </header>
      <main style={mainStyle}>
        <section style={featuresStyle}>
          <h2>Features</h2>
          <ul>
            <li>Vehicle inventory management</li>
            <li>Maintenance tracking and scheduling</li>
            <li>Fuel log analytics</li>
            <li>Service station locator</li>
            <li>Multi-tenant architecture for teams</li>
          </ul>
        </section>
        <section style={ctaSectionStyle}>
          <h2>Get Started</h2>
          <p>Already have an account?</p>
          <button onClick={goToDashboard} style={ctaButtonStyle}>
            Access Your Dashboard
          </button>
          <p style={inviteHintStyle}>
            Need to join a team? Contact your tenant administrator for an invitation.
          </p>
        </section>
      </main>
    </div>
  )
}
export default HomePage

View File

@@ -0,0 +1,109 @@
import React, { useEffect, useState } from 'react'
import { useParams } from 'react-router-dom'
import { useAuth0 } from '@auth0/auth0-react'
import axios from 'axios'
interface TenantInfo {
id: string
name: string
status: string
}
/**
 * Signup page for a single tenant, mounted at `/signup/:tenantId`.
 *
 * Looks the tenant up through the tenants API, then offers a button that
 * starts the Auth0 signup flow. Shows a loading placeholder while the lookup
 * is in flight and a "not found" view when the lookup fails or the tenant is
 * not active.
 *
 * NOTE(review): `useAuth0` needs an `<Auth0Provider>` ancestor; none is
 * mounted in the entry point visible in this change — confirm before relying
 * on `loginWithRedirect`.
 */
const TenantSignup: React.FC = () => {
  const { tenantId } = useParams<{ tenantId: string }>()
  const { loginWithRedirect } = useAuth0()
  const [tenant, setTenant] = useState<TenantInfo | null>(null)
  const [loading, setLoading] = useState(true)
  const [error, setError] = useState<string | null>(null)
  const [signupError, setSignupError] = useState<string | null>(null)

  useEffect(() => {
    if (!tenantId) {
      // Nothing to look up: leave the loading state instead of spinning forever.
      setTenant(null)
      setError('Tenant not found or not accepting signups')
      setLoading(false)
      return
    }

    // Set by the cleanup function so a late response cannot overwrite state
    // after unmount or after tenantId has changed.
    let cancelled = false
    setLoading(true)
    setError(null)
    setTenant(null)

    const fetchTenant = async () => {
      try {
        const response = await axios.get<TenantInfo>(
          `${import.meta.env.VITE_TENANTS_API_URL}/api/v1/tenants/${encodeURIComponent(tenantId)}`
        )
        if (cancelled) return
        // The tenants API rejects signup requests for non-active tenants, so
        // do not offer the signup button for them either.
        if (response.data.status !== 'active') {
          setError('Tenant not found or not accepting signups')
        } else {
          setTenant(response.data)
        }
      } catch {
        if (!cancelled) setError('Tenant not found or not accepting signups')
      } finally {
        if (!cancelled) setLoading(false)
      }
    }
    void fetchTenant()

    return () => {
      cancelled = true
    }
  }, [tenantId])

  const handleSignup = async () => {
    setSignupError(null)
    try {
      await loginWithRedirect({
        authorizationParams: {
          screen_hint: 'signup',
          redirect_uri: `${window.location.origin}/callback`
        }
      })
    } catch {
      // Surface the failure instead of leaving an unhandled promise rejection.
      setSignupError('Unable to start signup. Please try again.')
    }
  }

  if (loading) {
    return <div style={{ padding: '2rem' }}>Loading...</div>
  }
  if (error || !tenant) {
    return (
      <div style={{ padding: '2rem', textAlign: 'center' }}>
        <h2>Tenant Not Found</h2>
        <p>{error}</p>
        <a href="/">Return to Homepage</a>
      </div>
    )
  }
  return (
    <div style={{ padding: '2rem', maxWidth: '600px', margin: '0 auto', fontFamily: 'Arial, sans-serif' }}>
      <header style={{ textAlign: 'center', marginBottom: '2rem' }}>
        <h1>Join {tenant.name}</h1>
        <p>Create your account to get started</p>
      </header>
      <main>
        <div style={{
          border: '1px solid #ddd',
          borderRadius: '8px',
          padding: '2rem',
          backgroundColor: '#f9f9f9'
        }}>
          <h3>What happens next?</h3>
          <ol>
            <li>Create your account with Auth0</li>
            <li>Your signup request will be sent to the tenant administrator</li>
            <li>Once approved, you'll receive access to {tenant.name}</li>
            <li>Login at <code>{tenant.id}.motovaultpro.com</code></li>
          </ol>
          <div style={{ textAlign: 'center', marginTop: '2rem' }}>
            <button
              onClick={handleSignup}
              style={{
                padding: '1rem 2rem',
                fontSize: '1.1rem',
                backgroundColor: '#28a745',
                color: 'white',
                border: 'none',
                borderRadius: '4px',
                cursor: 'pointer'
              }}
            >
              Create Account for {tenant.name}
            </button>
            {signupError && (
              <p role="alert" style={{ color: '#dc3545', marginTop: '1rem' }}>{signupError}</p>
            )}
          </div>
        </div>
        <div style={{ textAlign: 'center', marginTop: '2rem' }}>
          <a href="/"> Back to Homepage</a>
        </div>
      </main>
    </div>
  )
}
export default TenantSignup

View File

@@ -0,0 +1,12 @@
import React from 'react'
import ReactDOM from 'react-dom/client'
import { BrowserRouter } from 'react-router-dom'
import App from './App'
// index.html declares <div id="root">; fail with a clear message if it is
// missing rather than relying on a non-null assertion.
// NOTE(review): no <Auth0Provider> wraps the tree here although TenantSignup
// calls useAuth0() — confirm whether the provider was removed intentionally.
const rootElement = document.getElementById('root')
if (!rootElement) {
  throw new Error('Root element "#root" not found in index.html')
}
ReactDOM.createRoot(rootElement).render(
  <React.StrictMode>
    <BrowserRouter>
      <App />
    </BrowserRouter>
  </React.StrictMode>
)

View File

@@ -0,0 +1,11 @@
/// <reference types="vite/client" />
/**
 * Typed view of the build-time variables this app reads from `import.meta.env`.
 * The same three names are declared as build args in the landing-page Dockerfile.
 */
interface ImportMetaEnv {
/** Auth0 tenant domain. */
readonly VITE_AUTH0_DOMAIN: string
/** Auth0 application (client) id. */
readonly VITE_AUTH0_CLIENT_ID: string
/** Base URL of the tenants API; TenantSignup appends `/api/v1/tenants/{id}` to it. */
readonly VITE_TENANTS_API_URL: string
}
/** Gives `import.meta.env` the typed shape declared above. */
interface ImportMeta {
readonly env: ImportMetaEnv
}

View File

@@ -0,0 +1,21 @@
{
"compilerOptions": {
"target": "ES2020",
"useDefineForClassFields": true,
"lib": ["ES2020", "DOM", "DOM.Iterable"],
"module": "ESNext",
"skipLibCheck": true,
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"resolveJsonModule": true,
"isolatedModules": true,
"noEmit": true,
"jsx": "react-jsx",
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noFallthroughCasesInSwitch": true
},
"include": ["src"],
"references": [{ "path": "./tsconfig.node.json" }]
}

View File

@@ -0,0 +1,9 @@
{
"compilerOptions": {
"composite": true,
"skipLibCheck": true,
"module": "ESNext",
"moduleResolution": "bundler"
},
"include": ["vite.config.ts"]
}

View File

@@ -0,0 +1,14 @@
import { defineConfig } from 'vite'
import react from '@vitejs/plugin-react'
// Vite configuration for the landing page: React plugin, dev server on port
// 3000, production bundle written to dist/.
export default defineConfig({
plugins: [react()],
server: {
// Per the Vite docs, `host: true` listens on all addresses rather than only localhost.
host: true,
port: 3000
},
build: {
// Matches the directory the Dockerfile copies into the nginx image (/app/dist).
outDir: 'dist',
// NOTE(review): source maps are written into dist/, which nginx serves — confirm this is intended for production.
sourcemap: true
}
})

View File

@@ -0,0 +1,333 @@
# Auth0 Multi-Tenant Configuration Guide
This document provides step-by-step instructions for configuring Auth0 for the multi-tenant MotoVaultPro platform.
## Overview
The multi-tenant architecture requires:
- **Landing Page**: `motovaultpro.com` - Entry point with tenant selection
- **Admin Tenant**: `admin.motovaultpro.com` - Admin access to all tenants
- **Regular Tenants**: `{tenant-id}.motovaultpro.com` - Isolated tenant access
- **Signup Workflow**: Tenant-specific signup with admin approval
## Auth0 Application Configuration
### 1. Application Settings
**Application Type**: Single Page Application (SPA)
**Allowed Callback URLs**:
```
# Development URLs
http://localhost:3002/callback
http://admin.motovaultpro.local/callback
http://demo-tenant.motovaultpro.local/callback
# Production URLs
https://motovaultpro.com/callback
https://admin.motovaultpro.com/callback
https://demo-tenant.motovaultpro.com/callback
# Add additional tenant subdomains as needed:
https://{tenant-id}.motovaultpro.com/callback
```
**Allowed Logout URLs**:
```
# Development
http://localhost:3002
http://admin.motovaultpro.local
http://demo-tenant.motovaultpro.local
# Production
https://motovaultpro.com
https://admin.motovaultpro.com
https://demo-tenant.motovaultpro.com
https://{tenant-id}.motovaultpro.com
```
**Allowed Web Origins**:
```
# Development
http://localhost:3002
http://admin.motovaultpro.local:3000
http://demo-tenant.motovaultpro.local:3000
# Production
https://motovaultpro.com
https://admin.motovaultpro.com
https://demo-tenant.motovaultpro.com
https://{tenant-id}.motovaultpro.com
```
### 2. JWT Configuration
**JWT Signature Algorithm**: RS256
**OIDC Conformant**: Enabled
### 3. Advanced Settings
**Grant Types**:
- Authorization Code
- Refresh Token
- Implicit (for development only)
## Auth0 Rules Configuration
### Rule 1: Add Tenant Context to JWT
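Create a new Rule in Auth0 Dashboard > Auth Pipeline > Rules. Rules run in the order they are listed in the dashboard, so place Rule 2 (which assigns `tenant_id` and `signup_status` on first login) **above** this rule; otherwise a brand-new user reaches this rule with no `tenant_id`, is defaulted to `admin`, and skips the approval check. Note that Auth0 has deprecated Rules in favor of Actions, so new Auth0 tenants should port this logic to a post-login Action: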
```javascript
/**
 * Auth0 Rule: copies the user's tenant into namespaced token claims and blocks
 * users of non-admin tenants whose signup has not been approved.
 *
 * Users without a `tenant_id` in their metadata are treated as belonging to
 * the `admin` tenant (the in-memory user object is updated to match).
 */
function addTenantContext(user, context, callback) {
  const claimPrefix = 'https://motovaultpro.com/';

  // Existing users without tenant metadata default to the admin tenant.
  if (!(user.user_metadata && user.user_metadata.tenant_id)) {
    user.user_metadata = user.user_metadata || {};
    user.user_metadata.tenant_id = 'admin';
  }

  const tenantId = user.user_metadata.tenant_id;
  const signupStatus = user.user_metadata.signup_status;

  // Only the admin tenant may log in without an approved signup.
  const isAdminTenant = tenantId === 'admin';
  if (!isAdminTenant && signupStatus !== 'approved') {
    return callback(new UnauthorizedError('Account pending approval'));
  }

  // Add tenant context to tokens
  context.idToken[claimPrefix + 'tenant_id'] = tenantId;
  context.accessToken[claimPrefix + 'tenant_id'] = tenantId;
  context.idToken[claimPrefix + 'signup_status'] = signupStatus || 'approved';
  callback(null, user, context);
}
```
### Rule 2: Tenant-Specific User Metadata
```javascript
/**
 * Auth0 Rule: on a database user's first login, derives the tenant from the
 * `redirect_uri` subdomain and stores `tenant_id` / `signup_status` in the
 * user's metadata.
 *
 * The rule only continues once the metadata update has finished, so later
 * rules and the next login see the persisted values; a failed update is
 * reported through the callback instead of being dropped.
 *
 * NOTE(review): a redirect_uri on the apex domain (e.g.
 * https://motovaultpro.com/callback) has no subdomain and therefore never
 * matches — confirm how signups started from the landing page carry their tenant.
 */
function setTenantMetadata(user, context, callback) {
  const isFirstLogin = context.stats.loginsCount === 1;
  const isDatabaseUser = context.connection === 'Username-Password-Authentication';
  if (!isFirstLogin || !isDatabaseUser) {
    return callback(null, user, context);
  }

  // Extract tenant from the redirect_uri subdomain
  const query = (context.request && context.request.query) || {};
  const redirectUri = query.redirect_uri || '';
  const tenantMatch = redirectUri.match(/([a-z0-9-]+)\.motovaultpro\.(com|local)/);
  if (!tenantMatch) {
    return callback(null, user, context);
  }

  const tenantId = tenantMatch[1];
  user.user_metadata = user.user_metadata || {};
  user.user_metadata.tenant_id = tenantId;
  // Pending for regular tenants, approved for admin
  user.user_metadata.signup_status = tenantId === 'admin' ? 'approved' : 'pending';

  // Persist to Auth0 before continuing the pipeline.
  auth0.users.updateUserMetadata(user.user_id, user.user_metadata)
    .then(function () {
      callback(null, user, context);
    })
    .catch(function (err) {
      callback(err);
    });
}
```
## Tenant Signup Flow Configuration
### 1. Signup URLs
**Tenant-Specific Signup**:
```
https://motovaultpro.com/signup/{tenant-id}
```
**Process**:
1. User visits tenant-specific signup URL
2. Landing page validates tenant exists
3. Redirects to Auth0 with tenant context
4. Auth0 Rule sets tenant_id in user metadata
5. User account created with status="pending"
6. Tenant admin receives notification
7. Admin approves/rejects via tenant management API
### 2. Auth0 Hosted Login Customization
Add custom CSS and JavaScript to Auth0 Universal Login to support tenant context:
**Custom CSS** (Dashboard > Universal Login > Advanced Options):
```css
.tenant-signup-info {
background: #f8f9fa;
padding: 15px;
border-radius: 5px;
margin-bottom: 20px;
border-left: 4px solid #007bff;
}
```
**Custom JavaScript**:
```javascript
// Extract tenant from URL parameters
const urlParams = new URLSearchParams(window.location.search);
const redirectUri = urlParams.get('redirect_uri') || '';
const tenantMatch = redirectUri.match(/([a-z0-9-]+)\.motovaultpro\.(com|local)/);
if (tenantMatch && tenantMatch[1] !== 'admin') {
  // Replace every hyphen, not just the first, so "acme-auto-group" renders as
  // "ACME AUTO GROUP".
  const tenantName = tenantMatch[1].replace(/-/g, ' ').toUpperCase();
  // Add tenant information to signup form
  const container = document.querySelector('.auth0-lock-header');
  if (container) {
    const info = document.createElement('div');
    info.className = 'tenant-signup-info';
    // tenantName only contains characters allowed by the regex above
    // ([a-z0-9-], uppercased), so interpolating it into markup is safe.
    info.innerHTML = `
      <strong>Signing up for: ${tenantName}</strong><br>
      <small>Your account will require admin approval before you can access the system.</small>
    `;
    container.appendChild(info);
  }
}
```
## JWT Token Format
After successful authentication, JWT tokens will include:
**ID Token Claims**:
```json
{
"sub": "auth0|user-123",
"email": "user@example.com",
"https://motovaultpro.com/tenant_id": "demo-tenant",
"https://motovaultpro.com/signup_status": "approved",
"iat": 1699123456,
"exp": 1699127056
}
```
**Access Token Claims**:
```json
{
"sub": "auth0|user-123",
"https://motovaultpro.com/tenant_id": "demo-tenant",
"scope": "openid profile email",
"iat": 1699123456,
"exp": 1699127056
}
```
## Backend JWT Validation
Services should validate JWT tokens and extract tenant context:
```typescript
// Example JWT validation middleware
import jwt from 'jsonwebtoken';
import jwksClient from 'jwks-rsa';
// JWKS client for the Auth0 tenant. The domain is read from the same
// environment variable that validateJWT uses for the issuer check.
const client = jwksClient({
  jwksUri: `https://${process.env.AUTH0_DOMAIN}/.well-known/jwks.json`
});
/**
 * Key-lookup callback for `jwt.verify`: resolves the public key whose `kid`
 * matches the token header.
 *
 * @param header   Decoded JWT header; only `kid` is read.
 * @param callback Receives the lookup error or the PEM public key.
 */
function getKey(
  header: { kid?: string },
  callback: (err: Error | null, key?: string) => void
): void {
  client.getSigningKey(header.kid, (err, key) => {
    // The client may report an error or simply return no key; treat both as failure.
    if (err || !key) {
      return callback(err ?? new Error('Signing key not found'));
    }
    callback(null, key.getPublicKey());
  });
}
/**
 * Verifies an RS256-signed Auth0 token against the configured audience and
 * issuer.
 *
 * @param token Raw JWT string.
 * @returns The decoded payload; rejects with the verification error otherwise.
 */
export const validateJWT = (token: string): Promise<any> =>
  new Promise((resolve, reject) => {
    jwt.verify(
      token,
      getKey,
      {
        audience: process.env.AUTH0_AUDIENCE,
        issuer: `https://${process.env.AUTH0_DOMAIN}/`,
        algorithms: ['RS256']
      },
      (verifyError, payload) => {
        if (verifyError) {
          reject(verifyError);
          return;
        }
        resolve(payload);
      }
    );
  });
/**
 * Extracts the tenant id from a validated JWT payload.
 *
 * Returns the `https://motovaultpro.com/tenant_id` claim when it is a
 * non-empty string; in every other case (missing claim, non-string claim,
 * payload that is not an object) it returns `'admin'`.
 *
 * NOTE(review): falling back to `'admin'` grants the most privileged tenant to
 * tokens without a tenant claim — confirm this default is intended.
 */
export const getTenantFromToken = (decodedToken: unknown): string => {
  if (typeof decodedToken !== 'object' || decodedToken === null) {
    return 'admin';
  }
  const claim = (decodedToken as Record<string, unknown>)['https://motovaultpro.com/tenant_id'];
  return typeof claim === 'string' && claim !== '' ? claim : 'admin';
};
```
## Environment Variables
Configure the following environment variables for each service:
**Platform Services**:
```env
AUTH0_DOMAIN=your-domain.auth0.com
AUTH0_AUDIENCE=https://api.motovaultpro.com
```
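**Landing Page Service** (these `VITE_*` values are inlined into the browser bundle at build time, so `VITE_TENANTS_API_URL` must be a URL the user's browser can reach; a Docker-internal hostname such as the one below only resolves inside the container network):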
```env
VITE_AUTH0_DOMAIN=your-domain.auth0.com
VITE_AUTH0_CLIENT_ID=your-client-id
VITE_TENANTS_API_URL=http://mvp-platform-tenants:8000
```
**Admin/Tenant Services**:
```env
REACT_APP_AUTH0_DOMAIN=your-domain.auth0.com
REACT_APP_AUTH0_CLIENT_ID=your-client-id
REACT_APP_AUTH0_AUDIENCE=https://api.motovaultpro.com
REACT_APP_TENANT_ID=admin # or specific tenant ID
```
## Testing the Configuration
### 1. Test Admin Login
```bash
# Visit admin tenant
open http://admin.motovaultpro.local
# Should redirect to Auth0, login, then return to admin app
```
### 2. Test Tenant Signup
```bash
# Visit tenant signup
open http://motovaultpro.local/signup/demo-tenant
# Complete signup, verify pending status
curl -H "Authorization: Bearer admin-token" \
http://localhost:8001/api/v1/signups
```
### 3. Test Approval Workflow
```bash
# Approve signup
curl -X PUT -H "Authorization: Bearer admin-token" \
http://localhost:8001/api/v1/signups/1/approve
# User should now be able to login to tenant
open http://demo-tenant.motovaultpro.local
```
## Production Deployment Notes
1. **SSL Certificates**: Ensure wildcard SSL certificate for `*.motovaultpro.com`
2. **DNS Configuration**: Set up wildcard DNS or individual A records per tenant
3. **Auth0 Environment**: Use production Auth0 tenant with proper security settings
4. **Rate Limiting**: Configure Auth0 rate limiting for signup endpoints
5. **Monitoring**: Set up Auth0 logs monitoring for failed login attempts
This configuration provides a secure, scalable multi-tenant authentication system with proper tenant isolation and admin approval workflows.

View File

@@ -0,0 +1,525 @@
"""
MVP Platform Tenants Service - FastAPI Application
Handles tenant management, signup approvals, and multi-tenant infrastructure.
"""
from fastapi import FastAPI, HTTPException, Depends, Header
from fastapi.middleware.cors import CORSMiddleware
import asyncpg
import os
import json
import httpx
from typing import Optional, List, Dict
from pydantic import BaseModel
from datetime import datetime
import logging
from jose import jwt, jwk
from jose.exceptions import JWTError, ExpiredSignatureError
# Configure logging
# basicConfig sets the root logger to INFO; `logger` is this module's named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# FastAPI application object; the route decorators further down register on it.
# title/description/version are metadata passed to the FastAPI constructor.
app = FastAPI(
title="MVP Platform Tenants Service",
description="Multi-tenant management and signup approval service",
version="1.0.0"
)
# CORS middleware.
# Allowed origins come from the comma-separated CORS_ALLOWED_ORIGINS environment
# variable. When it is unset or empty, every origin is allowed, as before.
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOWED_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # Credentialed cross-origin requests are only enabled for an explicit origin
    # list; pairing them with the "*" wildcard would extend them to any website.
    # Bearer tokens sent in the Authorization header do not depend on this flag.
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Auth0 configuration
# AUTH0_DOMAIN has no default, so it is None when the variable is unset;
# AUTH0_AUDIENCE falls back to the API identifier below.
AUTH0_DOMAIN = os.getenv("AUTH0_DOMAIN")
AUTH0_AUDIENCE = os.getenv("AUTH0_AUDIENCE", "https://api.motovaultpro.com")
# Cache for JWKS keys (in production, use Redis)
# Process-local: get_jwks() stores the fetched key set and its expiry
# (a time.time() value) in these two globals.
_jwks_cache = {}
_jwks_cache_expiry = 0
# Database connection
# NOTE(review): the fallback DSN embeds a username and password — confirm it is
# meant for local development only and that deployments always set DATABASE_URL.
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://platform_user:platform_pass@platform-postgres:5432/platform")
# Helper function to parse JSON settings
def parse_json_field(value):
    """Normalise a JSON column value for use as a settings mapping.

    Strings are decoded as JSON; undecodable strings yield ``{}``. Any falsy
    result — ``None``, an empty container, or a JSON ``null`` that decodes to
    ``None`` — is also returned as ``{}``, so callers never receive ``None``.
    Non-string, truthy inputs are returned unchanged.
    """
    if isinstance(value, str):
        try:
            value = json.loads(value)
        except json.JSONDecodeError:
            return {}
    # Apply the same "falsy becomes {}" rule to decoded strings and raw values.
    return value or {}
# Models
class TenantCreate(BaseModel):
    """Request body for creating or updating a tenant."""

    # Primary key of the tenant row.
    id: str
    # Display name.
    name: str
    # Checked for uniqueness together with `id` when a tenant is created.
    subdomain: str
    # Optional identifier of the tenant's administrator.
    admin_user_id: Optional[str] = None
    # Free-form configuration, serialised with json.dumps before being stored.
    settings: dict = {}
class TenantResponse(BaseModel):
    """Tenant as returned by the API; mirrors the columns of the tenants table."""

    id: str
    name: str
    subdomain: str
    status: str
    admin_user_id: Optional[str]
    settings: dict
    created_at: datetime
    updated_at: datetime

    @classmethod
    def from_db_row(cls, row):
        """Build a response from a database row, decoding `settings` via parse_json_field."""
        record = dict(row)
        return cls(**{**record, 'settings': parse_json_field(record.get('settings'))})
class SignupRequest(BaseModel):
    """Request body for asking to join a tenant."""

    # Email of the person requesting access; used for the duplicate-request check.
    user_email: str
    # Optional Auth0 identifier stored with the request.
    user_auth0_id: Optional[str] = None
class SignupResponse(BaseModel):
    """Signup request as returned by the API; built from a tenant_signups row."""

    id: int
    tenant_id: str
    user_email: str
    user_auth0_id: Optional[str]
    status: str
    requested_at: datetime
    # Set by both the approve and the reject endpoints to the acting user's `sub`.
    approved_by: Optional[str] = None
    approved_at: Optional[datetime] = None
    rejected_at: Optional[datetime] = None
    rejection_reason: Optional[str] = None
class SignupApproval(BaseModel):
    """Request body for rejecting a signup."""

    # Optional explanation; the reject endpoint substitutes a default when omitted.
    reason: Optional[str] = None
# JWT Authentication functions
async def get_jwks() -> Dict:
    """Return the Auth0 JSON Web Key Set, fetching it at most once per hour.

    The key set and its expiry time are kept in the module-level
    ``_jwks_cache`` / ``_jwks_cache_expiry`` globals.

    Raises:
        HTTPException: 500 when AUTH0_DOMAIN is not configured or the fetch fails.
    """
    global _jwks_cache, _jwks_cache_expiry
    import time

    now = time.time()
    cache_is_fresh = bool(_jwks_cache) and now < _jwks_cache_expiry
    if cache_is_fresh:
        return _jwks_cache

    if not AUTH0_DOMAIN:
        raise HTTPException(status_code=500, detail="Auth0 configuration missing")

    jwks_url = f"https://{AUTH0_DOMAIN}/.well-known/jwks.json"
    try:
        async with httpx.AsyncClient() as http:
            reply = await http.get(jwks_url)
            reply.raise_for_status()
            key_set = reply.json()
            # Keep the key set for one hour (3600 seconds).
            _jwks_cache, _jwks_cache_expiry = key_set, now + 3600
            return key_set
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to fetch JWKS: {str(e)}")
async def get_signing_key(kid: str) -> Dict:
    """Return the JWK from the Auth0 key set whose ``kid`` matches.

    The JWK dict is returned as-is: python-jose's ``jwt.decode`` accepts an
    individual JWK as its key argument. The previous implementation returned
    ``jwk.construct(key).key``, but python-jose ``Key`` objects do not appear
    to expose a ``.key`` attribute, so that lookup would raise before any
    token could be verified.

    Raises:
        HTTPException: 401 when no key with the given ``kid`` exists.
    """
    jwks = await get_jwks()
    for key in jwks.get("keys", []):
        if key.get("kid") == kid:
            return key
    raise HTTPException(status_code=401, detail="Unable to find appropriate key")
async def verify_jwt(token: str) -> Dict:
    """Verify an Auth0-issued RS256 JWT and return its claims.

    Checks the signature against the key named by the token's ``kid`` header
    and validates audience and issuer.

    Raises:
        HTTPException: 500 when Auth0 is not configured or the key set cannot
            be fetched; 401 when the token is expired, malformed, signed by an
            unknown key, or otherwise fails validation.
    """
    if not AUTH0_DOMAIN or not AUTH0_AUDIENCE:
        raise HTTPException(status_code=500, detail="Auth0 configuration missing")
    try:
        # Get the kid from token header
        unverified_header = jwt.get_unverified_header(token)
        kid = unverified_header.get("kid")
        if not kid:
            raise HTTPException(status_code=401, detail="Token header missing kid")
        # Get the signing key
        signing_key = await get_signing_key(kid)
        # Verify and decode the token
        payload = jwt.decode(
            token,
            signing_key,
            algorithms=["RS256"],
            audience=AUTH0_AUDIENCE,
            issuer=f"https://{AUTH0_DOMAIN}/"
        )
        return payload
    except HTTPException:
        # Raised deliberately above or by the key lookup; keep its status code
        # and detail rather than rewrapping it as a generic 401.
        raise
    except ExpiredSignatureError:
        raise HTTPException(status_code=401, detail="Token has expired")
    except JWTError as e:
        raise HTTPException(status_code=401, detail=f"Invalid token: {str(e)}")
    except Exception as e:
        raise HTTPException(status_code=401, detail=f"Token validation failed: {str(e)}")
# Mock authentication for development/testing
async def mock_auth_user(authorization: str) -> Dict:
    """Translate a fixed test token into fake JWT claims (development/testing only).

    Accepted values of the Authorization header:
      * ``Bearer admin-token``        -> claims for the ``admin`` tenant
      * ``Bearer tenant-<id>-token``  -> claims for tenant ``<id>``

    Raises:
        HTTPException: 401 when the header is missing, lacks the ``Bearer ``
            prefix, or carries any other token.
    """
    has_bearer_prefix = bool(authorization) and authorization.startswith("Bearer ")
    if not has_bearer_prefix:
        raise HTTPException(status_code=401, detail="Authorization header required")

    token = authorization.split(" ")[1]

    if token == "admin-token":
        subject, email, tenant_id = "admin-user", "admin@motovaultpro.com", "admin"
    elif token.startswith("tenant-"):
        # Strip the "tenant-" prefix once, then drop the "-token" marker.
        tenant_id = token.replace("tenant-", "", 1).replace("-token", "")
        subject, email = f"{tenant_id}-admin", f"admin@{tenant_id}.com"
    else:
        raise HTTPException(status_code=401, detail="Invalid token")

    return {
        "sub": subject,
        "email": email,
        "https://motovaultpro.com/tenant_id": tenant_id,
        "https://motovaultpro.com/signup_status": "approved"
    }
async def get_current_user(authorization: str = Header(None)):
    """Authenticate the request from its ``Authorization: Bearer <token>`` header.

    When Auth0 is configured (AUTH0_DOMAIN and AUTH0_AUDIENCE are set) the
    token must pass real JWT verification and a failure is final. The mock
    tokens are only honoured when Auth0 is not configured. Previously any
    verification failure fell back to mock authentication, so a request with
    ``Bearer admin-token`` was accepted as an admin even with Auth0 enabled.

    Returns:
        dict with ``sub``, ``tenant_id``, ``email`` and the raw ``payload``.

    Raises:
        HTTPException: 401 for a missing or malformed header, a non-bearer
            scheme, or an invalid token; other statuses propagate from verify_jwt.
    """
    if not authorization:
        raise HTTPException(status_code=401, detail="Authorization header required")

    try:
        scheme, token = authorization.split(" ", 1)
    except ValueError:
        raise HTTPException(status_code=401, detail="Invalid authorization header format")
    if scheme.lower() != "bearer":
        raise HTTPException(status_code=401, detail="Invalid authentication scheme")

    if AUTH0_DOMAIN and AUTH0_AUDIENCE:
        # Real validation; do NOT fall back to mock tokens on failure.
        payload = await verify_jwt(token)
    else:
        # Development/testing only: Auth0 is not configured for this process.
        payload = await mock_auth_user(authorization)

    # Extract tenant info from JWT claims.
    # NOTE(review): a token without a tenant claim is treated as the "admin"
    # tenant — confirm this default is intended, since it grants admin access.
    tenant_id = payload.get("https://motovaultpro.com/tenant_id", "admin")
    user_id = payload.get("sub", "")
    email = payload.get("email", "")
    return {
        "sub": user_id,
        "tenant_id": tenant_id,
        "email": email,
        "payload": payload
    }
async def get_admin_user(current_user: dict = Depends(get_current_user)):
    """Dependency that admits only users whose tenant is ``admin``.

    Raises:
        HTTPException: 403 for users of any other tenant.
    """
    is_platform_admin = current_user.get("tenant_id") == "admin"
    if is_platform_admin:
        return current_user
    raise HTTPException(status_code=403, detail="Admin access required")
async def get_tenant_admin(current_user: dict = Depends(get_current_user)):
    """Dependency that admits any authenticated user who has a tenant id.

    NOTE(review): despite the name, no admin role is checked here — only that
    a tenant id is present. Confirm whether a role check is expected.

    Raises:
        HTTPException: 401 when the user has no tenant id.
    """
    if current_user.get("tenant_id"):
        return current_user
    raise HTTPException(status_code=401, detail="Tenant authentication required")
# Health check
@app.get("/health")
async def health_check():
    """Liveness/readiness probe: opens a database connection and runs ``SELECT 1``.

    Raises:
        HTTPException: 503 when the database cannot be reached or queried.
    """
    try:
        conn = await asyncpg.connect(DATABASE_URL)
        try:
            await conn.execute("SELECT 1")
        finally:
            # Close the connection even when the probe query fails.
            await conn.close()
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        raise HTTPException(status_code=503, detail="Service unavailable")
    return {
        "status": "healthy",
        "database": "connected",
        "service": "mvp-platform-tenants",
        "version": "1.0.0"
    }
# Tenant management endpoints
@app.post("/api/v1/tenants", response_model=TenantResponse)
async def create_tenant(
    tenant_data: TenantCreate,
    current_user: dict = Depends(get_admin_user)
):
    """Create new tenant (admin only).

    Raises:
        HTTPException: 409 when the tenant id or subdomain is already taken,
            whether detected by the pre-check or by the database's uniqueness
            constraints during the insert.
    """
    conn = await asyncpg.connect(DATABASE_URL)
    try:
        # Check if tenant already exists
        existing = await conn.fetchrow(
            "SELECT id FROM tenants WHERE id = $1 OR subdomain = $2",
            tenant_data.id, tenant_data.subdomain
        )
        if existing:
            raise HTTPException(status_code=409, detail="Tenant ID or subdomain already exists")
        # Insert new tenant. Two concurrent requests can both pass the check
        # above, so a uniqueness violation is reported as the same 409 rather
        # than surfacing as an unhandled error.
        try:
            result = await conn.fetchrow(
                """
                INSERT INTO tenants (id, name, subdomain, admin_user_id, settings)
                VALUES ($1, $2, $3, $4, $5)
                RETURNING *
                """,
                tenant_data.id,
                tenant_data.name,
                tenant_data.subdomain,
                tenant_data.admin_user_id,
                json.dumps(tenant_data.settings)
            )
        except asyncpg.UniqueViolationError:
            raise HTTPException(status_code=409, detail="Tenant ID or subdomain already exists")
        return TenantResponse.from_db_row(result)
    finally:
        await conn.close()
@app.get("/api/v1/tenants", response_model=List[TenantResponse])
async def list_tenants(current_user: dict = Depends(get_admin_user)):
    """Return every tenant, newest first (admin only)."""
    db = await asyncpg.connect(DATABASE_URL)
    try:
        rows = await db.fetch("SELECT * FROM tenants ORDER BY created_at DESC")
        tenants = []
        for record in rows:
            tenants.append(TenantResponse.from_db_row(record))
        return tenants
    finally:
        await db.close()
@app.get("/api/v1/tenants/{tenant_id}", response_model=TenantResponse)
async def get_tenant(tenant_id: str):
    """Look up one tenant by id. Public: no authentication dependency is attached.

    Raises:
        HTTPException: 404 when no tenant has this id.
    """
    db = await asyncpg.connect(DATABASE_URL)
    try:
        record = await db.fetchrow("SELECT * FROM tenants WHERE id = $1", tenant_id)
        if record:
            return TenantResponse.from_db_row(record)
        raise HTTPException(status_code=404, detail="Tenant not found")
    finally:
        await db.close()
@app.put("/api/v1/tenants/{tenant_id}", response_model=TenantResponse)
async def update_tenant(
    tenant_id: str,
    tenant_data: TenantCreate,
    current_user: dict = Depends(get_admin_user)
):
    """Update a tenant's name, admin user and settings (admin only).

    The ``id`` and ``subdomain`` fields of the body are not written; the row is
    selected by the path parameter.

    Raises:
        HTTPException: 404 when no tenant has this id.
    """
    db = await asyncpg.connect(DATABASE_URL)
    try:
        serialized_settings = json.dumps(tenant_data.settings)
        updated = await db.fetchrow(
            """
            UPDATE tenants
            SET name = $2, admin_user_id = $3, settings = $4, updated_at = CURRENT_TIMESTAMP
            WHERE id = $1
            RETURNING *
            """,
            tenant_id,
            tenant_data.name,
            tenant_data.admin_user_id,
            serialized_settings
        )
        if updated:
            return TenantResponse.from_db_row(updated)
        raise HTTPException(status_code=404, detail="Tenant not found")
    finally:
        await db.close()
# Signup management endpoints
@app.post("/api/v1/tenants/{tenant_id}/signups", response_model=SignupResponse)
async def request_signup(tenant_id: str, signup_data: SignupRequest):
    """Record a request to join a tenant. Public: no authentication dependency is attached.

    Raises:
        HTTPException: 404 when the tenant does not exist, 400 when its status
            is not ``active``, 409 when this email already has a request for
            the tenant.
    """
    db = await asyncpg.connect(DATABASE_URL)
    try:
        # The tenant must exist and be active to accept signups.
        tenant_row = await db.fetchrow(
            "SELECT id, status FROM tenants WHERE id = $1", tenant_id
        )
        if not tenant_row:
            raise HTTPException(status_code=404, detail="Tenant not found")
        if tenant_row['status'] != 'active':
            raise HTTPException(status_code=400, detail="Tenant not accepting signups")

        # One request per (tenant, email).
        email = signup_data.user_email
        duplicate = await db.fetchrow(
            "SELECT id FROM tenant_signups WHERE tenant_id = $1 AND user_email = $2",
            tenant_id, email
        )
        if duplicate:
            raise HTTPException(status_code=409, detail="Signup request already exists")

        created = await db.fetchrow(
            """
            INSERT INTO tenant_signups (tenant_id, user_email, user_auth0_id)
            VALUES ($1, $2, $3)
            RETURNING *
            """,
            tenant_id,
            email,
            signup_data.user_auth0_id
        )
        logger.info(f"New signup request: {signup_data.user_email} for tenant {tenant_id}")
        return SignupResponse(**dict(created))
    finally:
        await db.close()
@app.get("/api/v1/tenants/{tenant_id}/signups", response_model=List[SignupResponse])
async def get_tenant_signups(
    tenant_id: str,
    status: Optional[str] = "pending",
    current_user: dict = Depends(get_tenant_admin)
):
    """List a tenant's signup requests, newest first.

    Only callers from the same tenant or from the ``admin`` tenant are allowed.
    ``status`` filters the result (default ``pending``); an empty value
    disables the filter.

    Raises:
        HTTPException: 403 when the caller belongs to a different tenant.
    """
    caller_tenant = current_user.get("tenant_id")
    if caller_tenant not in (tenant_id, "admin"):
        raise HTTPException(status_code=403, detail="Access denied to this tenant")

    db = await asyncpg.connect(DATABASE_URL)
    try:
        conditions = ["tenant_id = $1"]
        args = [tenant_id]
        if status:
            conditions.append("status = $2")
            args.append(status)
        sql = (
            "SELECT * FROM tenant_signups WHERE "
            + " AND ".join(conditions)
            + " ORDER BY requested_at DESC"
        )
        rows = await db.fetch(sql, *args)
        return [SignupResponse(**dict(record)) for record in rows]
    finally:
        await db.close()
@app.get("/api/v1/signups", response_model=List[SignupResponse])
async def get_all_signups(
    status: Optional[str] = "pending",
    current_user: dict = Depends(get_admin_user)
):
    """List signup requests across all tenants, newest first (admin only).

    ``status`` filters the result (default ``pending``); an empty value
    disables the filter.
    """
    db = await asyncpg.connect(DATABASE_URL)
    try:
        where_clause = " WHERE status = $1" if status else ""
        args = [status] if status else []
        sql = f"SELECT * FROM tenant_signups{where_clause} ORDER BY requested_at DESC"
        rows = await db.fetch(sql, *args)
        signups = []
        for record in rows:
            signups.append(SignupResponse(**dict(record)))
        return signups
    finally:
        await db.close()
@app.put("/api/v1/signups/{signup_id}/approve")
async def approve_signup(
    signup_id: int,
    current_user: dict = Depends(get_tenant_admin)
):
    """Mark a pending signup request as approved.

    Allowed for callers from the signup's tenant or from the ``admin`` tenant.
    The caller's ``sub`` is recorded in ``approved_by``.

    Raises:
        HTTPException: 404 when the signup does not exist or is no longer
            pending; 403 when the caller belongs to a different tenant.
    """
    db = await asyncpg.connect(DATABASE_URL)
    try:
        # Load the signup first so tenant access can be checked.
        pending = await db.fetchrow(
            "SELECT * FROM tenant_signups WHERE id = $1", signup_id
        )
        if not pending:
            raise HTTPException(status_code=404, detail="Signup not found")

        caller_tenant = current_user.get("tenant_id")
        if caller_tenant not in (pending['tenant_id'], "admin"):
            raise HTTPException(status_code=403, detail="Access denied to this tenant")

        # The status guard in WHERE means only a still-pending row is updated.
        approver = current_user['sub']
        approved = await db.fetchrow(
            """
            UPDATE tenant_signups
            SET status = 'approved', approved_by = $2, approved_at = CURRENT_TIMESTAMP
            WHERE id = $1 AND status = 'pending'
            RETURNING *
            """,
            signup_id,
            approver
        )
        if not approved:
            raise HTTPException(status_code=404, detail="Signup not found or already processed")

        # TODO: Update Auth0 user metadata to set signup_status = 'approved'
        logger.info(f"Approved signup {signup_id} for user {approved['user_email']} by {current_user['sub']}")
        return {"status": "approved", "signup_id": signup_id}
    finally:
        await db.close()
@app.put("/api/v1/signups/{signup_id}/reject")
async def reject_signup(
    signup_id: int,
    approval_data: SignupApproval,
    current_user: dict = Depends(get_tenant_admin)
):
    """Mark a pending signup request as rejected.

    Allowed for callers from the signup's tenant or from the ``admin`` tenant.
    The caller's ``sub`` is stored in ``approved_by`` and the reason (or a
    default text) in ``rejection_reason``.

    Raises:
        HTTPException: 404 when the signup does not exist or is no longer
            pending; 403 when the caller belongs to a different tenant.
    """
    db = await asyncpg.connect(DATABASE_URL)
    try:
        # Load the signup first so tenant access can be checked.
        pending = await db.fetchrow(
            "SELECT * FROM tenant_signups WHERE id = $1", signup_id
        )
        if not pending:
            raise HTTPException(status_code=404, detail="Signup not found")

        caller_tenant = current_user.get("tenant_id")
        if caller_tenant not in (pending['tenant_id'], "admin"):
            raise HTTPException(status_code=403, detail="Access denied to this tenant")

        reason = approval_data.reason if approval_data.reason else "No reason provided"
        # The status guard in WHERE means only a still-pending row is updated.
        rejected = await db.fetchrow(
            """
            UPDATE tenant_signups
            SET status = 'rejected', approved_by = $2, rejected_at = CURRENT_TIMESTAMP, rejection_reason = $3
            WHERE id = $1 AND status = 'pending'
            RETURNING *
            """,
            signup_id,
            current_user['sub'],
            reason
        )
        if not rejected:
            raise HTTPException(status_code=404, detail="Signup not found or already processed")

        logger.info(f"Rejected signup {signup_id} for user {rejected['user_email']}: {reason}")
        return {"status": "rejected", "signup_id": signup_id, "reason": reason}
    finally:
        await db.close()
# Direct execution (`python main.py`): serve the app on all interfaces, port 8000.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)

View File

@@ -0,0 +1,21 @@
# Image for the tenants API: a FastAPI app (main:app) served by uvicorn on port 8000.
FROM python:3.11-slim
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    && rm -rf /var/lib/apt/lists/*
# Copy requirements and install Python dependencies
# (done before copying the code so this layer is reused when only code changes)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY api/ .
# Expose port
EXPOSE 8000
# Run the application
# --reload is intentionally not passed: it is a development aid (file watcher
# plus supervisor process) and should not run in the built image. For local
# hot-reload, override the command instead.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -0,0 +1,7 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
asyncpg==0.29.0
pydantic==2.5.0
python-jose[cryptography]==3.3.0
python-multipart==0.0.6
httpx==0.25.2

View File

@@ -0,0 +1,41 @@
-- Tenant registry schema for MVP Platform Tenants Service
-- Creates core tenant management tables
-- Tenant registry: one row per tenant, keyed by a short string id.
-- Both statements below use IF NOT EXISTS, so re-running this file is safe.
-- Note: updated_at is only filled by its column default; nothing in this file
-- (e.g. a trigger) refreshes it on UPDATE.
CREATE TABLE IF NOT EXISTS tenants (
id VARCHAR(100) PRIMARY KEY, -- 'admin', 'acme-corp', etc.
name VARCHAR(255) NOT NULL, -- Display name
subdomain VARCHAR(100) UNIQUE NOT NULL, -- Same as id for simplicity
status VARCHAR(50) DEFAULT 'active', -- active, pending, suspended
admin_user_id VARCHAR(255), -- Auth0 user ID of tenant admin
settings JSONB DEFAULT '{}', -- Tenant-specific configuration
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Create indexes for performance
-- (filtering tenants by status, and finding a tenant by its admin user)
CREATE INDEX IF NOT EXISTS idx_tenants_status ON tenants(status);
CREATE INDEX IF NOT EXISTS idx_tenants_admin_user ON tenants(admin_user_id);
-- Tenant signup approval workflow
-- Each row is one user's request to join a tenant. Rows start as 'pending'
-- (column default) and move to 'approved' or 'rejected'.
CREATE TABLE IF NOT EXISTS tenant_signups (
id SERIAL PRIMARY KEY,
tenant_id VARCHAR(100) REFERENCES tenants(id),
user_email VARCHAR(255) NOT NULL,
user_auth0_id VARCHAR(255), -- Auth0 user ID after signup
status VARCHAR(50) DEFAULT 'pending', -- pending, approved, rejected
requested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
approved_by VARCHAR(255), -- Auth0 ID of approving admin
approved_at TIMESTAMP,
rejected_at TIMESTAMP,
rejection_reason TEXT
);
-- Create indexes for signup queries
-- (per-tenant listings filtered by status, and lookups by applicant email)
CREATE INDEX IF NOT EXISTS idx_tenant_signups_tenant_status ON tenant_signups(tenant_id, status);
CREATE INDEX IF NOT EXISTS idx_tenant_signups_user_email ON tenant_signups(user_email);
-- Initial admin tenant data
-- Seeds the 'admin' tenant with no admin user assigned yet (admin_user_id NULL).
-- ON CONFLICT (id) DO NOTHING leaves an existing 'admin' row untouched on re-runs.
INSERT INTO tenants (id, name, subdomain, status, admin_user_id)
VALUES ('admin', 'Admin Tenant', 'admin', 'active', NULL)
ON CONFLICT (id) DO NOTHING;

View File

@@ -0,0 +1,42 @@
# MVP Platform Vehicles Service
## Schema Bootstrapping (Docker-First)
- Database: PostgreSQL, service `mvp-platform-vehicles-db`.
- On first start, schema files from `mvp-platform-services/vehicles/sql/schema` are executed automatically because the folder is mounted to `/docker-entrypoint-initdb.d` in `docker-compose.yml`.
- Files run in lexicographic order:
- `001_schema.sql` creates `vehicles` schema and tables
- `002_constraints_indexes.sql` adds unique constraints and indexes
- `003_seed_minimal.sql` seeds minimal dropdown data for sanity checks
## When Do Files Run?
- Only on the initial database initialization (i.e., when the Postgres data volume is empty).
- Subsequent `make start` runs will not reapply these files unless you reset the volume.
## Applying Schema Changes
- Option 1 (fresh reset):
1. `make clean` to remove volumes
2. `make start` (the `.sql` files will be reapplied)
- Option 2 (manual apply to existing DB):
- Exec into the DB container and run the SQL files in order:
```bash
docker compose exec mvp-platform-vehicles-db bash -lc "psql -U mvp_platform_user -d vehicles -f /docker-entrypoint-initdb.d/001_schema.sql"
docker compose exec mvp-platform-vehicles-db bash -lc "psql -U mvp_platform_user -d vehicles -f /docker-entrypoint-initdb.d/002_constraints_indexes.sql"
docker compose exec mvp-platform-vehicles-db bash -lc "psql -U mvp_platform_user -d vehicles -f /docker-entrypoint-initdb.d/003_seed_minimal.sql"
```
## Quick Start
```bash
make start
make logs-platform-vehicles # View API + DB logs
```
## Endpoint Summary (Auth Required: Authorization: Bearer <API_KEY>)
- `GET /api/v1/vehicles/years` → `[number]`
- `GET /api/v1/vehicles/makes?year=YYYY` → `{ makes: [{id,name}] }`
- `GET /api/v1/vehicles/models?year=YYYY&make_id=ID` → `{ models: [...] }`
- `GET /api/v1/vehicles/trims?year=YYYY&make_id=ID&model_id=ID` → `{ trims: [...] }`
- `GET /api/v1/vehicles/engines?year=YYYY&make_id=ID&model_id=ID&trim_id=ID` → `{ engines: [...] }`
## Notes
- Transmissions and performance tables exist for future use; no endpoints yet.
- VIN decode endpoints are pending rebuild and not documented here.

View File

@@ -0,0 +1,43 @@
import os
from pydantic_settings import BaseSettings
from typing import List
class Settings(BaseSettings):
    """Application configuration.

    Each field is populated by pydantic-settings from the environment variable
    with the same (case-sensitive) name; the value written here is only the
    fallback. Wrapping the defaults in ``os.getenv`` is therefore unnecessary
    and would run ``int()`` conversions at import time, outside pydantic's
    validation.

    The credential-like defaults (``POSTGRES_PASSWORD``, ``API_KEY``) are
    development placeholders and should be overridden through the environment
    in any real deployment.
    """

    # Database settings
    POSTGRES_HOST: str = "localhost"
    POSTGRES_PORT: int = 5432
    POSTGRES_USER: str = "mvp_platform_user"
    POSTGRES_PASSWORD: str = "platform123"
    POSTGRES_DATABASE: str = "vpic"

    # Redis settings
    REDIS_HOST: str = "localhost"
    REDIS_PORT: int = 6379
    REDIS_DB: int = 0

    # Database connection pool settings
    DATABASE_MIN_CONNECTIONS: int = 5
    DATABASE_MAX_CONNECTIONS: int = 20

    # Cache settings
    CACHE_TTL: int = 3600  # 1 hour default

    # Security: shared bearer token for service-to-service calls
    API_KEY: str = "mvp-platform-vehicles-secret-key"

    # Application settings
    DEBUG: bool = False
    CORS_ORIGINS: List[str] = [
        "http://localhost:3000",
        "https://motovaultpro.com",
        "http://localhost:3001"
    ]

    class Config:
        # Environment variable names must match the field names exactly.
        case_sensitive = True
def get_settings() -> Settings:
    """Build and return a fresh ``Settings`` instance (a new object on every call)."""
    current_settings = Settings()
    return current_settings

View File

@@ -0,0 +1,40 @@
import asyncpg
import redis.asyncio as redis
from fastapi import Request, Depends, HTTPException
import logging
from .config import get_settings
logger = logging.getLogger(__name__)
settings = get_settings()
async def get_db_pool(request: Request) -> asyncpg.Pool:
    """Return the connection pool stored on ``request.app.state.db_pool``."""
    app_state = request.app.state
    return app_state.db_pool
async def get_db(request: Request) -> asyncpg.Connection:
    """Yield a connection acquired from the app's pool.

    Written as an async generator: the connection is handed to the caller at
    ``yield`` and released back to the pool when the ``async with`` exits.
    """
    db_pool = await get_db_pool(request)
    async with db_pool.acquire() as connection:
        yield connection
async def get_redis_client(request: Request) -> redis.Redis:
    """Return whatever is stored on ``request.app.state.redis_client``."""
    app_state = request.app.state
    return app_state.redis_client
async def get_cache(request: Request):
    """Return the cache service stored on ``request.app.state.cache_service``."""
    app_state = request.app.state
    return app_state.cache_service
async def verify_bearer_token(request: Request) -> str:
    """Verify the Bearer token used for service-to-service authentication.

    Expects the header ``Authorization: Bearer <token>`` and compares the token
    to ``settings.API_KEY``.

    Returns:
        The presented token when it matches.

    Raises:
        HTTPException: 401 when the header is missing, is not a Bearer
            credential, or the token does not match.
    """
    # Local import keeps this dependency self-contained within the function.
    import hmac

    auth_header = request.headers.get("Authorization", "")
    if not auth_header.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Missing or invalid Authorization header")

    token = auth_header.split(" ", 1)[1].strip()

    # Constant-time comparison so response timing does not leak how much of the
    # key matched. Compare bytes: compare_digest rejects non-ASCII str inputs,
    # and header values are attacker-controlled.
    if not hmac.compare_digest(token.encode("utf-8"), settings.API_KEY.encode("utf-8")):
        raise HTTPException(status_code=401, detail="Invalid service token")
    return token

View File

@@ -0,0 +1,202 @@
import logging
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request, HTTPException, Depends
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import asyncpg
import redis.asyncio as redis
import time
from .config import get_settings
from .dependencies import get_db_pool, get_redis_client, get_cache, verify_bearer_token
from .routes import vehicles, vin
from .models.responses import HealthResponse
from .services.cache_service import CacheService
# Configure logging
# Root logging: INFO level, "<time> - <logger> - <level> - <message>" lines.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Module-level logger and settings shared by everything below.
logger = logging.getLogger(__name__)
settings = get_settings()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan manager.

    Startup creates the PostgreSQL pool (a failure aborts startup), then tries
    to connect to Redis (a failure is logged and caching is disabled), then
    builds the cache service. Shutdown closes both resources and now runs in a
    ``finally`` block, so it also happens when an exception or cancellation is
    thrown into the generator at ``yield``.
    """
    # Startup
    logger.info("Starting MVP Platform Vehicles API...")

    # Initialize database pool
    try:
        app.state.db_pool = await asyncpg.create_pool(
            host=settings.POSTGRES_HOST,
            port=settings.POSTGRES_PORT,
            user=settings.POSTGRES_USER,
            password=settings.POSTGRES_PASSWORD,
            database=settings.POSTGRES_DATABASE,
            min_size=settings.DATABASE_MIN_CONNECTIONS,
            max_size=settings.DATABASE_MAX_CONNECTIONS,
            command_timeout=30
        )
        logger.info("Database pool initialized")
    except Exception as e:
        logger.error(f"Failed to initialize database pool: {e}")
        raise

    # Initialize Redis client (optional: the API runs without a cache)
    try:
        app.state.redis_client = redis.Redis(
            host=settings.REDIS_HOST,
            port=settings.REDIS_PORT,
            db=settings.REDIS_DB,
            decode_responses=False,
            socket_connect_timeout=5,
            socket_timeout=5
        )
        # Test connection
        await app.state.redis_client.ping()
        logger.info("Redis client initialized")
    except Exception as e:
        logger.warning(f"Failed to initialize Redis client: {e}")
        app.state.redis_client = None

    # Initialize cache service; disabled when Redis is unavailable
    app.state.cache_service = CacheService(
        app.state.redis_client,
        enabled=bool(app.state.redis_client),
        default_ttl=settings.CACHE_TTL
    )

    try:
        yield
    finally:
        # Shutdown
        logger.info("Shutting down MVP Platform Vehicles API...")
        try:
            if getattr(app.state, 'db_pool', None):
                await app.state.db_pool.close()
                logger.info("Database pool closed")
        finally:
            # Still close Redis if closing the pool raised.
            if getattr(app.state, 'redis_client', None):
                await app.state.redis_client.aclose()
                logger.info("Redis client closed")
# Create FastAPI app
# The interactive docs (/docs, /redoc) are mounted only when settings.DEBUG is
# true; otherwise both URLs are disabled by passing None.
app = FastAPI(
title="MVP Platform Vehicles API",
description="Hierarchical Vehicle API with VIN decoding for MotoVaultPro platform services",
version="1.0.0",
lifespan=lifespan,
docs_url="/docs" if settings.DEBUG else None,
redoc_url="/redoc" if settings.DEBUG else None
)
# Add CORS middleware
# Only the origins listed in settings.CORS_ORIGINS are allowed; for those
# origins credentials, every method and every header are permitted.
app.add_middleware(
CORSMiddleware,
allow_origins=settings.CORS_ORIGINS,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Request timing middleware
@app.middleware("http")
async def add_process_time_header(request: Request, call_next):
    """Attach the request's processing time (seconds) as ``X-Process-Time``.

    Uses ``time.perf_counter()`` rather than ``time.time()``: the wall clock
    can jump (NTP or manual adjustments) and yield negative or inflated
    durations, whereas ``perf_counter`` is monotonic and high-resolution.
    """
    start_time = time.perf_counter()
    response = await call_next(request)
    process_time = time.perf_counter() - start_time
    response.headers["X-Process-Time"] = str(process_time)
    return response
# Global exception handler
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Turn any unhandled exception into a generic JSON 500 response.

    The client only sees "Internal server error"; the server log gets the
    method, path, message and, via ``exc_info``, the full traceback, so the
    failure can actually be diagnosed.
    """
    logger.error(
        f"Unhandled exception in {request.method} {request.url.path}: {exc}",
        exc_info=exc,
    )
    return JSONResponse(
        status_code=500,
        content={"detail": "Internal server error"}
    )
# Include routers
# Mount both routers under /api/v1; every route in them requires the service bearer token.
for _api_router in (vehicles.router, vin.router):
    app.include_router(_api_router, prefix="/api/v1", dependencies=[Depends(verify_bearer_token)])
# Health check endpoint
@app.api_route("/health", methods=["GET", "HEAD"], response_model=HealthResponse)
async def health_check(request: Request):
    """Report database and cache health.

    The overall status depends on the database only: a failing or disabled
    cache is reported in the ``cache`` field but still yields status "ok".
    """
    # Database probe: acquire a connection and run a trivial query.
    db_status = "ok"
    try:
        async with request.app.state.db_pool.acquire() as conn:
            await conn.fetchval("SELECT 1")
    except Exception as e:
        logger.error(f"Database health check failed: {e}")
        db_status = "error"

    # Cache probe: ping Redis only when the cache service is enabled.
    cache_status = "ok"
    try:
        cache = request.app.state.cache_service
        if not (cache and cache.enabled):
            cache_status = "disabled"
        else:
            await cache.redis.ping()
    except Exception as e:
        logger.error(f"Cache health check failed: {e}")
        cache_status = "error"

    return HealthResponse(
        status="degraded" if db_status != "ok" else "ok",
        database=db_status,
        cache=cache_status,
        version="1.0.0"
    )
# Root endpoint
@app.get("/")
async def root():
    """Root endpoint with API information.

    The ``endpoints`` map lists only routes that the mounted routers actually
    serve, with example URLs that carry every required query parameter:
    the engines example includes ``trim_id``, the years endpoint is listed,
    and no transmissions URL is advertised because no such route exists.
    """
    return {
        "name": "MVP Platform Vehicles API",
        "version": "1.0.0",
        "description": "Hierarchical Vehicle API with VIN decoding",
        "docs_url": "/docs" if settings.DEBUG else "Contact administrator for documentation",
        "endpoints": {
            "health": "/health",
            "years": "/api/v1/vehicles/years",
            "makes": "/api/v1/vehicles/makes?year=2024",
            "models": "/api/v1/vehicles/models?year=2024&make_id=1",
            "trims": "/api/v1/vehicles/trims?year=2024&make_id=1&model_id=1",
            "engines": "/api/v1/vehicles/engines?year=2024&make_id=1&model_id=1&trim_id=1",
            "vin_decode": "/api/v1/vehicles/vindecode"
        }
    }
# Cache stats endpoint
@app.get("/api/v1/cache/stats")
async def cache_stats(request: Request, token: str = Depends(verify_bearer_token)):
    """Return the cache service's statistics (bearer token required).

    Any failure is logged and surfaced as a generic 500.
    """
    try:
        return await request.app.state.cache_service.get_stats()
    except Exception as e:
        logger.error(f"Failed to get cache stats: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve cache statistics")
if __name__ == "__main__":
    # Direct-execution entry point; auto-reload follows settings.DEBUG.
    import uvicorn

    server_options = {
        "host": "0.0.0.0",
        "port": 8000,
        "reload": settings.DEBUG,
        "log_level": "info",
    }
    uvicorn.run("api.main:app", **server_options)

View File

@@ -0,0 +1,84 @@
from pydantic import BaseModel
from typing import List, Optional
class MakeItem(BaseModel):
    """One vehicle make as returned in dropdown lists."""

    id: int
    name: str


class ModelItem(BaseModel):
    """One vehicle model as returned in dropdown lists."""

    id: int
    name: str


class TrimItem(BaseModel):
    """One trim level as returned in dropdown lists."""

    id: int
    name: str


class EngineItem(BaseModel):
    """One engine option as returned in dropdown lists."""

    id: int
    name: str


class TransmissionItem(BaseModel):
    """One transmission option; unlike the other items it carries no id."""

    name: str
class MakesResponse(BaseModel):
    """Envelope for a list of makes."""

    makes: list[MakeItem]


class YearsResponse(BaseModel):
    """Envelope for a list of model years."""

    years: list[int]


class ModelsResponse(BaseModel):
    """Envelope for a list of models."""

    models: list[ModelItem]


class TrimsResponse(BaseModel):
    """Envelope for a list of trims."""

    trims: list[TrimItem]


class EnginesResponse(BaseModel):
    """Envelope for a list of engines."""

    engines: list[EngineItem]


class TransmissionsResponse(BaseModel):
    """Envelope for a list of transmissions."""

    transmissions: list[TransmissionItem]
class VINDecodeResult(BaseModel):
    """Decoded vehicle attributes; every field is optional and defaults to None."""

    make: str | None = None
    model: str | None = None
    year: int | None = None
    trim_name: str | None = None
    engine_description: str | None = None
    transmission_description: str | None = None
    horsepower: float | None = None
    torque: float | None = None
    top_speed: float | None = None
    fuel: str | None = None
    confidence_score: float | None = None
    vehicle_type: str | None = None


class VINDecodeRequest(BaseModel):
    """Request body for VIN decoding."""

    vin: str


class VINDecodeResponse(BaseModel):
    """Outcome of a VIN decode attempt."""

    vin: str
    # No default: callers must pass ``result`` explicitly, but it may be None.
    result: VINDecodeResult | None
    success: bool
    error: str | None = None
class HealthResponse(BaseModel):
    """Payload of the health check endpoint."""

    status: str
    database: str
    cache: str
    version: str
    etl_last_run: str | None = None

View File

@@ -0,0 +1,79 @@
import asyncpg
from typing import List, Dict
class VehiclesRepository:
    """Read-only queries for the year/make/model/trim/engine hierarchy.

    Every method takes the connection to run on and returns plain Python
    values (ints or ``{"id", "name"}`` dicts) built from the fetched rows.
    """

    @staticmethod
    def _as_items(records) -> List[Dict]:
        """Project fetched rows onto ``{"id": ..., "name": ...}`` dicts."""
        return [{"id": record["id"], "name": record["name"]} for record in records]

    async def get_years(self, db: asyncpg.Connection) -> List[int]:
        """Return all distinct model years, newest first."""
        sql = """
            SELECT DISTINCT year
            FROM vehicles.model_year
            ORDER BY year DESC
            """
        records = await db.fetch(sql)
        return [record["year"] for record in records]

    async def get_makes(self, db: asyncpg.Connection, year: int) -> List[Dict]:
        """Return makes that have at least one model in ``year``, ordered by name."""
        sql = """
            SELECT DISTINCT ma.id, ma.name
            FROM vehicles.make ma
            JOIN vehicles.model mo ON mo.make_id = ma.id
            JOIN vehicles.model_year my ON my.model_id = mo.id AND my.year = $1
            ORDER BY ma.name
            """
        return self._as_items(await db.fetch(sql, year))

    async def get_models(self, db: asyncpg.Connection, year: int, make_id: int) -> List[Dict]:
        """Return the models of ``make_id`` available in ``year``, ordered by name."""
        sql = """
            SELECT DISTINCT mo.id, mo.name
            FROM vehicles.model mo
            JOIN vehicles.model_year my ON my.model_id = mo.id AND my.year = $1
            WHERE mo.make_id = $2
            ORDER BY mo.name
            """
        return self._as_items(await db.fetch(sql, year, make_id))

    async def get_trims(self, db: asyncpg.Connection, year: int, model_id: int) -> List[Dict]:
        """Return the trims of ``model_id`` in ``year``, ordered by name."""
        sql = """
            SELECT t.id, t.name
            FROM vehicles.trim t
            JOIN vehicles.model_year my ON my.id = t.model_year_id
            WHERE my.year = $1 AND my.model_id = $2
            ORDER BY t.name
            """
        return self._as_items(await db.fetch(sql, year, model_id))

    async def get_engines(
        self, db: asyncpg.Connection, year: int, model_id: int, trim_id: int
    ) -> List[Dict]:
        """Return the engines linked to ``trim_id`` for the model and year, ordered by name."""
        sql = """
            SELECT DISTINCT e.id, e.name
            FROM vehicles.engine e
            JOIN vehicles.trim_engine te ON te.engine_id = e.id
            JOIN vehicles.trim t ON t.id = te.trim_id
            JOIN vehicles.model_year my ON my.id = t.model_year_id
            WHERE my.year = $1
            AND my.model_id = $2
            AND t.id = $3
            ORDER BY e.name
            """
        return self._as_items(await db.fetch(sql, year, model_id, trim_id))

View File

@@ -0,0 +1,116 @@
from fastapi import APIRouter, Depends, Query, HTTPException
import asyncpg
from ..dependencies import get_db, get_cache
# DropdownService deprecated; using normalized schema service
from ..services.vehicles_service import VehiclesService
from ..repositories.vehicles_repository import VehiclesRepository
from ..services.cache_service import CacheService
from ..models.responses import (
MakesResponse, ModelsResponse, TrimsResponse,
EnginesResponse,
MakeItem, ModelItem, TrimItem, EngineItem
)
import logging
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/vehicles", tags=["Vehicles"])
@router.get("/years", response_model=list[int])
async def get_years(
    db: asyncpg.Connection = Depends(get_db),
    cache: CacheService = Depends(get_cache),
):
    """Return the available model years (distinct, newest first)."""
    vehicles_service = VehiclesService(cache, VehiclesRepository())
    years = await vehicles_service.get_years(db)
    return years
@router.get("/makes", response_model=MakesResponse)
async def get_makes(
    year: int = Query(..., description="Model year", ge=1980, le=2050),
    db: asyncpg.Connection = Depends(get_db),
    cache: CacheService = Depends(get_cache)
):
    """Return the makes available for a model year.

    First level of the hierarchy: only ``year`` is required. Any failure is
    logged and reported as a 500.
    """
    try:
        vehicles_service = VehiclesService(cache, VehiclesRepository())
        rows = await vehicles_service.get_makes(db, year)
        items = [MakeItem(**row) for row in rows]
        return MakesResponse(makes=items)
    except Exception as e:
        logger.error(f"Failed to get makes for year {year}: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to retrieve makes for year {year}"
        )
@router.get("/models", response_model=ModelsResponse)
async def get_models(
    year: int = Query(..., description="Model year", ge=1980, le=2050),
    make_id: int = Query(..., description="Make ID", ge=1),
    db: asyncpg.Connection = Depends(get_db),
    cache: CacheService = Depends(get_cache)
):
    """Return the models of a make for a model year.

    Second level of the hierarchy: requires ``year`` and ``make_id``. Any
    failure is logged and reported as a 500.
    """
    try:
        vehicles_service = VehiclesService(cache, VehiclesRepository())
        rows = await vehicles_service.get_models(db, year, make_id)
        items = [ModelItem(**row) for row in rows]
        return ModelsResponse(models=items)
    except Exception as e:
        logger.error(f"Failed to get models for year {year}, make {make_id}: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to retrieve models for year {year}, make {make_id}"
        )
@router.get("/trims", response_model=TrimsResponse)
async def get_trims(
    year: int = Query(..., description="Model year", ge=1980, le=2050),
    make_id: int = Query(..., description="Make ID", ge=1),
    model_id: int = Query(..., description="Model ID", ge=1),
    db: asyncpg.Connection = Depends(get_db),
    cache: CacheService = Depends(get_cache)
):
    """Return the trims of a model for a model year.

    Third level of the hierarchy: ``year``, ``make_id`` and ``model_id`` are
    all required, but only ``year`` and ``model_id`` are passed to the
    service; ``make_id`` appears in log and error messages only.
    """
    try:
        vehicles_service = VehiclesService(cache, VehiclesRepository())
        rows = await vehicles_service.get_trims(db, year, model_id)
        items = [TrimItem(**row) for row in rows]
        return TrimsResponse(trims=items)
    except Exception as e:
        logger.error(f"Failed to get trims for year {year}, make {make_id}, model {model_id}: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to retrieve trims for year {year}, make {make_id}, model {model_id}"
        )
@router.get("/engines", response_model=EnginesResponse)
async def get_engines(
    year: int = Query(..., description="Model year", ge=1980, le=2050),
    make_id: int = Query(..., description="Make ID", ge=1),
    model_id: int = Query(..., description="Model ID", ge=1),
    trim_id: int = Query(..., description="Trim ID", ge=1),
    db: asyncpg.Connection = Depends(get_db),
    cache: CacheService = Depends(get_cache)
):
    """Return the engines of a trim for a model and model year.

    All four parameters are required; ``make_id`` appears in log and error
    messages only and is not passed to the service.
    """
    try:
        vehicles_service = VehiclesService(cache, VehiclesRepository())
        rows = await vehicles_service.get_engines(db, year, model_id, trim_id)
        items = [EngineItem(**row) for row in rows]
        return EnginesResponse(engines=items)
    except Exception as e:
        logger.error(
            f"Failed to get engines for year {year}, make {make_id}, model {model_id}, trim {trim_id}: {e}"
        )
        failure_detail = (
            f"Failed to retrieve engines for year {year}, make {make_id}, model {model_id}, trim {trim_id}"
        )
        raise HTTPException(status_code=500, detail=failure_detail)

View File

@@ -0,0 +1,110 @@
from fastapi import APIRouter, Depends, HTTPException
import asyncpg
from ..dependencies import get_db, get_cache
from ..services.cache_service import CacheService
from ..models.responses import VINDecodeRequest, VINDecodeResponse, VINDecodeResult
import logging
import re
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/vehicles", tags=["VIN Decoding"])
def validate_vin(vin: str) -> bool:
    """Validate VIN format.

    A VIN is accepted when it is exactly 17 ASCII characters, each a digit or
    a letter other than I, O and Q (case-insensitive).

    Non-ASCII input is rejected up front: some non-ASCII characters uppercase
    to plain ASCII letters (e.g. U+017F "ſ" becomes "S"), so checking only the
    uppercased form would let them through.

    Args:
        vin: Candidate VIN string.

    Returns:
        True if ``vin`` has a valid format, False otherwise.
    """
    if len(vin) != 17 or not vin.isascii():
        return False
    # The character class already excludes I, O and Q, so no separate check is needed.
    return re.fullmatch(r'[A-HJ-NPR-Z0-9]{17}', vin.upper()) is not None
@router.post("/vindecode", response_model=VINDecodeResponse)
async def decode_vin(
    request: VINDecodeRequest,
    db: asyncpg.Connection = Depends(get_db),
    cache: CacheService = Depends(get_cache)
):
    """Decode a VIN with the ``vehicles.f_decode_vin()`` PostgreSQL function.

    The VIN is upper-cased and stripped, validated, looked up in the cache and
    only then decoded in the database. Successful decodes are cached for
    30 days, "not found" results for 1 hour.

    Failures are reported in the response body (``success=False`` plus an
    ``error`` message) rather than as HTTP errors, including invalid VINs and
    database errors.
    """
    vin = request.vin.upper().strip()

    # Validate VIN format
    if not validate_vin(vin):
        return VINDecodeResponse(
            vin=vin,
            result=None,
            success=False,
            error="Invalid VIN format"
        )

    # Check cache first
    cache_key = f"vin:decode:{vin}"
    cached_result = await cache.get(cache_key)
    if cached_result:
        logger.debug(f"VIN decode result for {vin} retrieved from cache")
        return VINDecodeResponse(**cached_result)

    try:
        # Call PostgreSQL VIN decode function
        query = """
            SELECT * FROM vehicles.f_decode_vin($1)
            """
        row = await db.fetchrow(query, vin)

        if row:
            result = VINDecodeResult(
                make=row['make'],
                model=row['model'],
                year=row['year'],
                trim_name=row['trim_name'],
                engine_description=row['engine_description'],
                transmission_description=row['transmission_description'],
                # .get(): these columns may be absent from the function's result
                horsepower=row.get('horsepower'),
                torque=row.get('torque'),
                top_speed=row.get('top_speed'),
                fuel=row.get('fuel'),
                confidence_score=float(row['confidence_score']) if row['confidence_score'] else 0.0,
                vehicle_type=row.get('vehicle_type')
            )
            response = VINDecodeResponse(
                vin=vin,
                result=result,
                success=True
            )
            # Cache successful decode for 30 days.
            # model_dump() replaces the .dict() method deprecated in Pydantic v2.
            await cache.set(cache_key, response.model_dump(), ttl=30*24*3600)
            logger.info(f"Successfully decoded VIN {vin}: {result.make} {result.model} {result.year}")
            return response

        # No result found
        response = VINDecodeResponse(
            vin=vin,
            result=None,
            success=False,
            error="VIN not found in database"
        )
        # Cache negative result for 1 hour
        await cache.set(cache_key, response.model_dump(), ttl=3600)
        return response
    except Exception as e:
        # Include the traceback; the client only gets a generic message.
        logger.error(f"Failed to decode VIN {vin}: {e}", exc_info=True)
        return VINDecodeResponse(
            vin=vin,
            result=None,
            success=False,
            error="Internal server error during VIN decoding"
        )

View File

@@ -0,0 +1,88 @@
import redis.asyncio as redis
import json
import logging
from typing import Any, Optional
logger = logging.getLogger(__name__)
class CacheService:
    """Redis cache service with JSON serialization.

    Every operation is best-effort: when the service is disabled, or Redis
    raises, the method logs (where applicable) and returns a neutral value
    (``None``, ``False``, ``0``) instead of propagating the error.
    """

    # Number of keys requested per SCAN step and deleted per DEL call.
    _SCAN_BATCH_SIZE = 500

    def __init__(self, redis_client: Optional[redis.Redis], enabled: bool = True, default_ttl: int = 3600):
        """Wrap ``redis_client``; the service is disabled when no client is given."""
        self.redis = redis_client
        self.enabled = enabled and redis_client is not None
        self.default_ttl = default_ttl

    async def get(self, key: str) -> Optional[Any]:
        """Return the JSON-decoded value for ``key``, or None on miss or error."""
        if not self.enabled:
            return None
        try:
            value = await self.redis.get(key)
            if value:
                return json.loads(value)
            return None
        except Exception as e:
            logger.error(f"Cache get error for key {key}: {e}")
            return None

    async def set(self, key: str, value: Any, ttl: Optional[int] = None) -> bool:
        """Store ``value`` as JSON under ``key``; returns True on success.

        A falsy ``ttl`` (None or 0) falls back to ``default_ttl``.
        """
        if not self.enabled:
            return False
        try:
            ttl = ttl or self.default_ttl
            json_value = json.dumps(value, default=str)  # Handle datetime objects
            await self.redis.setex(key, ttl, json_value)
            return True
        except Exception as e:
            logger.error(f"Cache set error for key {key}: {e}")
            return False

    async def delete(self, key: str) -> bool:
        """Delete ``key``; returns True only if a key was actually removed."""
        if not self.enabled:
            return False
        try:
            deleted = await self.redis.delete(key)
            return deleted > 0
        except Exception as e:
            logger.error(f"Cache delete error for key {key}: {e}")
            return False

    async def invalidate_dropdown_cache(self) -> int:
        """Delete every ``dropdown:*`` entry and return how many were removed.

        Keys are discovered with incremental SCAN instead of KEYS, which walks
        the whole keyspace in one blocking command, and are deleted in batches
        to bound the size of each DEL.
        """
        if not self.enabled:
            return 0
        try:
            pattern = "dropdown:*"
            deleted = 0
            batch = []
            async for key in self.redis.scan_iter(match=pattern, count=self._SCAN_BATCH_SIZE):
                batch.append(key)
                if len(batch) >= self._SCAN_BATCH_SIZE:
                    deleted += await self.redis.delete(*batch)
                    batch = []
            if batch:
                deleted += await self.redis.delete(*batch)
            if deleted:
                logger.info(f"Invalidated {deleted} dropdown cache entries")
            return deleted
        except Exception as e:
            logger.error(f"Cache invalidation error: {e}")
            return 0

    async def get_stats(self) -> dict:
        """Return memory statistics, or ``{"enabled": False}`` when disabled."""
        if not self.enabled:
            return {"enabled": False}
        try:
            info = await self.redis.info("memory")
            return {
                "enabled": True,
                "used_memory": info.get("used_memory_human"),
                "used_memory_peak": info.get("used_memory_peak_human"),
                # NOTE(review): client_list() returns one entry per client, not a
                # count, despite the key name. Confirm what consumers expect
                # before changing the shape.
                "connected_clients": await self.redis.client_list()
            }
        except Exception as e:
            logger.error(f"Cache stats error: {e}")
            return {"enabled": True, "error": str(e)}

View File

@@ -0,0 +1,58 @@
import asyncpg
from typing import List, Dict
from ..services.cache_service import CacheService
from ..repositories.vehicles_repository import VehiclesRepository
class VehiclesService:
    """Cache-aside access to the hierarchical vehicle dropdown data.

    Each getter looks up a ``dropdown:*`` key in the cache and, on a miss,
    loads the data from the repository and stores it for six hours.
    """

    # Lifetime of cached dropdown data, in seconds.
    _DROPDOWN_TTL = 6 * 3600

    def __init__(self, cache: CacheService, repo: VehiclesRepository | None = None):
        """Use ``cache`` for lookups and ``repo`` (default: a new repository) for loads."""
        self.cache = cache
        self.repo = repo or VehiclesRepository()

    async def _get_or_load(self, cache_key: str, load):
        """Return the cached value for ``cache_key`` or load and cache it.

        ``load`` is a zero-argument callable returning an awaitable.

        A hit is detected with ``is not None`` rather than truthiness, so an
        empty list that was cached earlier is served from the cache instead of
        triggering a database query on every request.
        """
        cached = await self.cache.get(cache_key)
        if cached is not None:
            return cached
        value = await load()
        await self.cache.set(cache_key, value, ttl=self._DROPDOWN_TTL)
        return value

    async def get_years(self, db: asyncpg.Connection) -> List[int]:
        """Return the available model years."""
        return await self._get_or_load(
            "dropdown:years",
            lambda: self.repo.get_years(db),
        )

    async def get_makes(self, db: asyncpg.Connection, year: int) -> List[Dict]:
        """Return the makes available in ``year``."""
        return await self._get_or_load(
            f"dropdown:makes:{year}",
            lambda: self.repo.get_makes(db, year),
        )

    async def get_models(self, db: asyncpg.Connection, year: int, make_id: int) -> List[Dict]:
        """Return the models of ``make_id`` available in ``year``."""
        return await self._get_or_load(
            f"dropdown:models:{year}:{make_id}",
            lambda: self.repo.get_models(db, year, make_id),
        )

    async def get_trims(self, db: asyncpg.Connection, year: int, model_id: int) -> List[Dict]:
        """Return the trims of ``model_id`` in ``year``."""
        return await self._get_or_load(
            f"dropdown:trims:{year}:{model_id}",
            lambda: self.repo.get_trims(db, year, model_id),
        )

    async def get_engines(
        self, db: asyncpg.Connection, year: int, model_id: int, trim_id: int
    ) -> List[Dict]:
        """Return the engines of ``trim_id`` for the model and year."""
        return await self._get_or_load(
            f"dropdown:engines:{year}:{model_id}:{trim_id}",
            lambda: self.repo.get_engines(db, year, model_id, trim_id),
        )

View File

@@ -0,0 +1,30 @@
# Image for the vehicles API: runs api.main:app under uvicorn on port 8000.
FROM python:3.11-slim
# Set working directory
WORKDIR /app
# Install system dependencies
# (wget is used by the HEALTHCHECK below; curl is not referenced in this file)
RUN apt-get update && apt-get install -y \
wget \
curl \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements and install Python dependencies
# (done before copying the code so this layer is reused when only code changes)
COPY requirements-api.txt .
RUN pip install --no-cache-dir -r requirements-api.txt
# Copy application code
COPY api/ ./api/
# Set Python path
# (so the "api" package copied above is importable from /app)
ENV PYTHONPATH=/app
# Expose port
EXPOSE 8000
# Health check
# Probes the app's /health endpoint; a non-zero wget exit marks the check as failed.
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
CMD wget --quiet --tries=1 --spider http://localhost:8000/health || exit 1
# Run application
CMD ["python", "-m", "uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -0,0 +1,46 @@
# Image for the ETL job: Python 3.11 plus the Microsoft ODBC driver and SQL
# Server command-line tools; starts the etl.main module.
FROM python:3.11-slim
# Set working directory
WORKDIR /app
# Install system dependencies and ODBC drivers
# The Microsoft package signing key and apt repository are registered first,
# then msodbcsql17 and mssql-tools are installed with the EULA accepted.
RUN apt-get update && apt-get install -y \
curl \
apt-transport-https \
gnupg2 \
unixodbc-dev \
unixodbc \
&& curl -sSL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor -o /usr/share/keyrings/microsoft-prod.gpg \
&& echo "deb [arch=amd64,arm64,armhf signed-by=/usr/share/keyrings/microsoft-prod.gpg] https://packages.microsoft.com/debian/12/prod bookworm main" > /etc/apt/sources.list.d/mssql-release.list \
&& apt-get update \
&& ACCEPT_EULA=Y apt-get install -y msodbcsql17 mssql-tools \
&& rm -rf /var/lib/apt/lists/*
# Add SQL Server tools to PATH
ENV PATH="$PATH:/opt/mssql-tools/bin"
# Copy requirements and install Python dependencies
# (done before copying the code so this layer is reused when only code changes)
COPY requirements-etl.txt .
RUN pip install --no-cache-dir -r requirements-etl.txt
# Copy ETL code
COPY etl/ ./etl/
# Copy make configuration for filtering
COPY makes.json /app/makes.json
# Create logs and data directories
RUN mkdir -p /app/logs /app/data
# Set Python path
# (so the "etl" package copied above is importable from /app)
ENV PYTHONPATH=/app
# Expose port for health check
# NOTE(review): the HEALTHCHECK below does not connect to this port; it calls
# etl.connections.test_connections() in-process. Confirm 8001 is still needed.
EXPOSE 8001
# Health check
# Healthy when test_connections() returns a truthy value.
HEALTHCHECK --interval=60s --timeout=10s --start-period=60s --retries=3 \
CMD python -c "import sys; import os; sys.path.append('/app'); from etl.connections import test_connections; exit(0 if test_connections() else 1)" || exit 1
# Run ETL scheduler
CMD ["python", "-m", "etl.main"]

View File

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python3
"""
ETL Package Main Entry Point
Allows running ETL package as a module: python -m etl
"""
from .main import cli
if __name__ == "__main__":
    # Hand control to the command-line interface imported from .main.
    cli()

View File

@@ -0,0 +1,376 @@
import logging
from typing import Dict, List, Set, Optional
from datetime import datetime
from dateutil import tz
from tqdm import tqdm
from ..connections import db_connections
from ..extractors.mssql_extractor import MSSQLExtractor
from ..loaders.postgres_loader import PostgreSQLLoader
from ..config import config
from ..utils.make_filter import MakeFilter
logger = logging.getLogger(__name__)
class NormalizedVehicleBuilder:
"""Build normalized vehicle schema from pattern-based NHTSA source data"""
def __init__(self, make_filter: Optional[MakeFilter] = None):
    """Wire up the make filter, the MSSQL extractor and the PostgreSQL loader.

    Args:
        make_filter: Filter restricting which makes are processed; a default
            ``MakeFilter()`` is created when none (or a falsy one) is given.
    """
    if not make_filter:
        make_filter = MakeFilter()
    self.make_filter = make_filter
    self.extractor = MSSQLExtractor(self.make_filter)
    self.loader = PostgreSQLLoader()

    allowed_count = len(self.make_filter.get_allowed_makes())
    logger.info(
        f"Initialized normalized vehicle builder with make filtering: {allowed_count} allowed makes"
    )
def build(self):
    """Run the three build steps in order.

    Returns:
        True when every step completes.

    Raises:
        Exception: Any error raised by a step is logged and re-raised.
    """
    logger.info("Starting normalized vehicle schema build")
    try:
        # Each entry pairs the log line announcing a step with the step itself.
        steps = (
            # Step 1: Clear and load reference tables
            ("Step 1: Loading reference tables (makes, models, relationships)",
             self._load_reference_tables),
            # Step 2: Extract year availability from WMI data
            ("Step 2: Building model-year availability from WMI data",
             self._build_model_year_availability),
            # Step 3: Extract trims and engines from pattern analysis
            ("Step 3: Extracting trims and engines from pattern data",
             self._extract_trims_and_engines),
        )
        for announcement, run_step in steps:
            logger.info(announcement)
            run_step()
        logger.info("Normalized vehicle schema build completed successfully")
        return True
    except Exception as e:
        logger.error(f"Normalized schema build failed: {e}")
        raise e
def _load_reference_tables(self):
"""Load basic reference tables: makes, models with proper relationships"""
# Load makes (filtered by make_filter)
makes_data = self.extractor.extract_reference_table('Make')
if makes_data:
self.loader.load_reference_table('make', makes_data)
logger.info(f"Loaded {len(makes_data)} makes")
# Get make-model relationships first
make_model_rels = self.extractor.extract_make_model_relationships()
# Load models with make_id populated from relationships
models_data = self.extractor.extract_reference_table('Model')
if models_data and make_model_rels:
# Create mapping: model_id -> make_id
model_to_make = {}
for rel in make_model_rels:
model_to_make[rel['ModelId']] = rel['MakeId']
# Add make_id to each model record
for model in models_data:
model['MakeId'] = model_to_make.get(model['Id'])
# Filter out models without make_id (orphaned models)
valid_models = [m for m in models_data if m.get('MakeId') is not None]
self.loader.load_reference_table('model', valid_models)
logger.info(f"Loaded {len(valid_models)} models with make relationships")
logger.info(f"Filtered out {len(models_data) - len(valid_models)} orphaned models")
else:
logger.warning("No models or relationships loaded")
def _build_model_year_availability(self):
"""Build model-year availability from WMI year ranges with realistic constraints"""
logger.info("Extracting model-year availability from WMI data with realistic year bounds")
# Define realistic year constraints
current_year = datetime.now().year
max_year = current_year + 1 # Allow next model year
min_year = current_year - 40 # Reasonable historical range (40 years back)
logger.info(f"Using realistic year range: {min_year} to {max_year}")
# Get WMI data with year ranges
wmi_data = self.extractor.extract_wmi_vin_schema_mappings()
# Get make-model relationships to map WMI to models
make_model_rels = self.extractor.extract_make_model_relationships()
wmi_make_rels = self.extractor.extract_wmi_make_relationships()
# Build mapping: WMI -> Make -> Models
wmi_to_models = {}
make_to_models = {}
# Build make -> models mapping
for rel in make_model_rels:
make_id = rel['MakeId']
if make_id not in make_to_models:
make_to_models[make_id] = []
make_to_models[make_id].append(rel['ModelId'])
# Build WMI -> models mapping via makes
for wmi_make in wmi_make_rels:
wmi_id = wmi_make['WmiId']
make_id = wmi_make['MakeId']
if make_id in make_to_models:
if wmi_id not in wmi_to_models:
wmi_to_models[wmi_id] = []
wmi_to_models[wmi_id].extend(make_to_models[make_id])
# Extremely conservative approach: Only allow models with explicit recent year ranges
logger.info("Building model-year availability - using only models with EXPLICIT recent VIN pattern evidence")
model_years = []
current_year = datetime.now().year
# Strategy: Only include models that have VIN patterns with explicit recent year ranges (not open-ended)
recent_threshold = current_year - 5 # Only patterns from last 5 years
# Find models that have EXPLICIT recent VIN pattern evidence (both YearFrom and YearTo defined)
recent_models_with_years = {} # model_id -> set of years with evidence
for wmi_mapping in wmi_data:
year_from = wmi_mapping['YearFrom']
year_to = wmi_mapping['YearTo']
# Skip patterns without explicit year ranges (YearTo=None means open-ended, likely old discontinued models)
if year_from is None or year_to is None:
continue
# Only consider WMI patterns that have recent, explicit activity
if year_to >= recent_threshold and year_from <= current_year + 1:
wmi_id = wmi_mapping['WmiId']
if wmi_id in wmi_to_models:
models = wmi_to_models[wmi_id]
for model_id in models:
if model_id not in recent_models_with_years:
recent_models_with_years[model_id] = set()
# Add the actual years with evidence (constrained to reasonable range)
evidence_start = max(year_from, recent_threshold)
evidence_end = min(year_to, current_year + 1)
for year in range(evidence_start, evidence_end + 1):
recent_models_with_years[model_id].add(year)
logger.info(f"Found {len(recent_models_with_years)} models with explicit recent VIN pattern evidence (patterns with defined year ranges since {recent_threshold})")
# Create model-year combinations only for years with actual VIN pattern evidence
# Apply business rules to exclude historically discontinued models
discontinued_models = self._get_discontinued_models()
for model_id, years_with_evidence in recent_models_with_years.items():
# Check if this model is in our discontinued list
if model_id in discontinued_models:
max_year = discontinued_models[model_id]
logger.info(f"Applying discontinuation rule: Model ID {model_id} discontinued after {max_year}")
# Only include years up to discontinuation year
years_with_evidence = {y for y in years_with_evidence if y <= max_year}
for year in years_with_evidence:
model_years.append({
'model_id': model_id,
'year': year
})
logger.info(f"Created {len(model_years)} model-year combinations based on explicit VIN pattern evidence")
# Remove duplicates
unique_model_years = []
seen = set()
for my in model_years:
key = (my['model_id'], my['year'])
if key not in seen:
seen.add(key)
unique_model_years.append(my)
# Load to database
if unique_model_years:
self.loader.load_model_years(unique_model_years)
logger.info(f"Generated {len(unique_model_years)} model-year availability records")
def _extract_trims_and_engines(self):
"""Extract trims and engines from pattern analysis"""
logger.info("Extracting trims and engines from pattern data")
# Get model-year IDs for mapping
model_year_mapping = self._get_model_year_mapping()
trims_data = []
engines_data = []
engine_names = set()
# Process patterns in batches
total_trims = 0
total_engines = 0
for pattern_batch in self.extractor.extract_patterns_data():
logger.info(f"Processing pattern batch: {len(pattern_batch)} patterns")
# Group patterns by (year, make, model) combination
vehicle_combinations = {}
for pattern in pattern_batch:
element_id = pattern['ElementId']
attribute_id = pattern.get('AttributeId', '')
make_name = pattern.get('MakeName', '')
# Skip if not allowed make
if not self.make_filter.is_make_allowed(make_name):
continue
# Create vehicle combination key
# We'll derive year from WMI data associated with this pattern
vin_schema_id = pattern['VinSchemaId']
key = (vin_schema_id, make_name)
if key not in vehicle_combinations:
vehicle_combinations[key] = {
'make_name': make_name,
'vin_schema_id': vin_schema_id,
'trims': set(),
'engines': set()
}
# Extract trim and engine data
if element_id == 28 and attribute_id: # Trim
vehicle_combinations[key]['trims'].add(attribute_id)
elif element_id == 18 and attribute_id: # Engine
vehicle_combinations[key]['engines'].add(attribute_id)
# Convert to trim/engine records
for combo in vehicle_combinations.values():
make_name = combo['make_name']
# For now, create generic records
# In a full implementation, you'd map these to specific model-years
for trim_name in combo['trims']:
if trim_name and len(trim_name.strip()) > 0:
# We'll need to associate these with specific model_year_ids
# For now, create a placeholder structure
trims_data.append({
'name': trim_name.strip(),
'make_name': make_name, # temporary for mapping
'source_schema': combo['vin_schema_id']
})
total_trims += 1
for engine_name in combo['engines']:
if engine_name and len(engine_name.strip()) > 0 and engine_name not in engine_names:
engine_names.add(engine_name)
engines_data.append({
'name': engine_name.strip(),
'code': None,
'displacement_l': None,
'cylinders': None,
'fuel_type': None,
'aspiration': None
})
total_engines += 1
# Load engines first (they're independent)
if engines_data:
self.loader.load_engines(engines_data)
logger.info(f"Loaded {total_engines} unique engines")
# For trims, we need to map them to actual model_year records
# This is a simplified approach - in practice you'd need more sophisticated mapping
if trims_data:
simplified_trims = self._map_trims_to_model_years(trims_data, model_year_mapping)
if simplified_trims:
self.loader.load_trims(simplified_trims)
logger.info(f"Loaded {len(simplified_trims)} trims")
def _get_model_year_mapping(self) -> Dict:
"""Get mapping of model_year records for trim association"""
with db_connections.postgres_connection() as conn:
cursor = conn.cursor()
query = """
SELECT my.id, my.model_id, my.year, m.name as model_name, mk.name as make_name
FROM vehicles.model_year my
JOIN vehicles.model m ON my.model_id = m.id
JOIN vehicles.make mk ON m.make_id = mk.id
"""
cursor.execute(query)
rows = cursor.fetchall()
mapping = {}
for row in rows:
key = (row['make_name'] if isinstance(row, dict) else row[4],
row['year'] if isinstance(row, dict) else row[2])
mapping[key] = row['id'] if isinstance(row, dict) else row[0]
return mapping
def _map_trims_to_model_years(self, trims_data: List[Dict], model_year_mapping: Dict) -> List[Dict]:
"""Map extracted trims to actual model_year records"""
mapped_trims = []
# For now, create a simplified mapping
# Associate trims with all model_years of the same make
for trim in trims_data:
make_name = trim['make_name']
trim_name = trim['name']
# Find all model_year_ids for this make
model_year_ids = []
for (mapped_make, year), model_year_id in model_year_mapping.items():
if mapped_make == make_name:
model_year_ids.append(model_year_id)
# Create trim record for each model_year (simplified approach)
# In practice, you'd need more sophisticated pattern-to-vehicle mapping
for model_year_id in model_year_ids[:5]: # Limit to avoid explosion
mapped_trims.append({
'model_year_id': model_year_id,
'name': trim_name
})
return mapped_trims
def _get_discontinued_models(self) -> Dict[int, int]:
"""Get mapping of discontinued model IDs to their last production year
This method identifies models that were historically discontinued
and should not appear in recent model year combinations.
"""
with db_connections.postgres_connection() as conn:
cursor = conn.cursor()
# Query for specific discontinued models by name patterns
# These are well-known discontinued models that should not appear in recent years
discontinued_patterns = [
('Jimmy%', 1991), # GMC Jimmy discontinued 1991
('S-10%', 2004), # Chevrolet S-10 discontinued 2004
('Blazer%', 2005), # Chevrolet Blazer discontinued 2005 (before recent revival)
('Astro%', 2005), # Chevrolet Astro discontinued 2005
('Safari%', 2005), # GMC Safari discontinued 2005
('Jimmy Utility%', 1991), # GMC Jimmy Utility discontinued 1991
]
discontinued_models = {}
for pattern, last_year in discontinued_patterns:
query = """
SELECT m.id, m.name, mk.name as make_name
FROM vehicles.model m
JOIN vehicles.make mk ON m.make_id = mk.id
WHERE m.name ILIKE %s
AND mk.name IN ('Chevrolet', 'GMC')
"""
cursor.execute(query, (pattern,))
rows = cursor.fetchall()
for row in rows:
model_id = row['id'] if isinstance(row, dict) else row[0]
model_name = row['name'] if isinstance(row, dict) else row[1]
make_name = row['make_name'] if isinstance(row, dict) else row[2]
discontinued_models[model_id] = last_year
logger.info(f"Marked {make_name} {model_name} (ID: {model_id}) as discontinued after {last_year}")
return discontinued_models

View File

@@ -0,0 +1,39 @@
import os
from typing import Optional
def _int_from_env(var_name: str, fallback: str) -> int:
    """Read ``var_name`` from the environment and convert it to ``int``."""
    return int(os.getenv(var_name, fallback))


def _flag_from_env(var_name: str, fallback: str) -> bool:
    """Read ``var_name`` and treat "1", "true" or "yes" (any case) as True."""
    return os.getenv(var_name, fallback).lower() in ("1", "true", "yes")


class ETLConfig:
    """ETL settings resolved from environment variables when the class is defined.

    Every attribute falls back to the default shown below when its variable
    is unset. Integer settings raise ``ValueError`` at import time if the
    variable holds a non-numeric value.
    """

    # --- MS SQL Server (source) ---
    MSSQL_HOST: str = os.getenv("MSSQL_HOST", "mvp-platform-vehicles-mssql")
    MSSQL_PORT: int = _int_from_env("MSSQL_PORT", "1433")
    MSSQL_DATABASE: str = os.getenv("MSSQL_DATABASE", "VPICList")
    MSSQL_USER: str = os.getenv("MSSQL_USER", "sa")
    # NOTE(review): fallback credentials are hard-coded here and below -
    # confirm they are only ever used for local development.
    MSSQL_PASSWORD: str = os.getenv("MSSQL_PASSWORD", "Platform123!")

    # --- PostgreSQL (target) ---
    POSTGRES_HOST: str = os.getenv("POSTGRES_HOST", "mvp-platform-vehicles-db")
    POSTGRES_PORT: int = _int_from_env("POSTGRES_PORT", "5432")
    POSTGRES_DATABASE: str = os.getenv("POSTGRES_DATABASE", "vehicles")
    POSTGRES_USER: str = os.getenv("POSTGRES_USER", "mvp_platform_user")
    POSTGRES_PASSWORD: str = os.getenv("POSTGRES_PASSWORD", "platform123")

    # --- Redis ---
    REDIS_HOST: str = os.getenv("REDIS_HOST", "mvp-platform-vehicles-redis")
    REDIS_PORT: int = _int_from_env("REDIS_PORT", "6379")
    REDIS_DB: int = _int_from_env("REDIS_DB", "0")

    # --- Scheduling (cron expression) ---
    ETL_SCHEDULE: str = os.getenv("ETL_SCHEDULE", "0 2 * * 0")  # Weekly at 2 AM on Sunday

    # --- Processing ---
    BATCH_SIZE: int = _int_from_env("BATCH_SIZE", "10000")
    PARALLEL_WORKERS: int = _int_from_env("PARALLEL_WORKERS", "4")
    LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")

    # --- Confidence thresholds ---
    MIN_CONFIDENCE_SCORE: int = _int_from_env("MIN_CONFIDENCE_SCORE", "50")

    # --- Behavior toggles ---
    DISABLE_ALL_MODELS_FALLBACK: bool = _flag_from_env("DISABLE_ALL_MODELS_FALLBACK", "true")


config = ETLConfig()

View File

@@ -0,0 +1,152 @@
import pyodbc
import psycopg2
from psycopg2.extras import RealDictCursor
import asyncpg
import redis
from contextlib import contextmanager
import logging
import time
from typing import Optional
from .config import config
logger = logging.getLogger(__name__)
class DatabaseConnections:
    """Manage database connections with retry logic and timeouts"""

    # pyodbc timeouts in seconds. They are applied through the pyodbc API
    # because "Connection Timeout" / "Command Timeout" are not connection
    # string keywords of the SQL Server ODBC driver.
    MSSQL_LOGIN_TIMEOUT = 30
    MSSQL_QUERY_TIMEOUT = 300

    def __init__(self):
        """Start with no open connections and the default retry policy."""
        self.mssql_conn = None
        self.postgres_conn = None
        self.redis_client = None
        self.pg_pool = None
        self.max_retries = 3
        self.retry_delay = 2  # seconds

    def _retry_connection(self, connection_func, connection_type: str, max_retries: Optional[int] = None):
        """Retry connection with exponential backoff.

        Args:
            connection_func: Zero-argument callable that opens the connection.
            connection_type: Label used in log messages.
            max_retries: Number of attempts; falls back to ``self.max_retries``
                when omitted (or 0).

        Returns:
            Whatever ``connection_func`` returns on its first success.

        Raises:
            Exception: the error of the final attempt once all attempts failed.
        """
        max_retries = max_retries or self.max_retries
        for attempt in range(max_retries):
            try:
                return connection_func()
            except Exception as e:
                if attempt == max_retries - 1:
                    logger.error(f"Failed to connect to {connection_type} after {max_retries} attempts: {e}")
                    raise
                # Delay doubles on every failed attempt.
                wait_time = self.retry_delay * (2 ** attempt)
                logger.warning(f"{connection_type} connection failed (attempt {attempt + 1}/{max_retries}): {e}")
                logger.info(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)

    @contextmanager
    def mssql_connection(self):
        """Context manager for MS SQL connection using pyodbc with retry logic.

        Yields a pyodbc connection opened with a 30 second login timeout and
        a 300 second query timeout; the connection is closed on exit.
        """
        def _connect():
            connection_string = (
                f"DRIVER={{ODBC Driver 17 for SQL Server}};"
                f"SERVER={config.MSSQL_HOST},{config.MSSQL_PORT};"
                f"DATABASE={config.MSSQL_DATABASE};"
                f"UID={config.MSSQL_USER};"
                f"PWD={config.MSSQL_PASSWORD};"
                f"TrustServerCertificate=yes;"
            )
            # `timeout` on connect() is the login timeout ...
            conn = pyodbc.connect(connection_string, timeout=self.MSSQL_LOGIN_TIMEOUT)
            # ... while the connection attribute bounds each SQL statement.
            conn.timeout = self.MSSQL_QUERY_TIMEOUT
            return conn

        conn = self._retry_connection(_connect, "MSSQL")
        try:
            yield conn
        finally:
            try:
                conn.close()
            except Exception as e:
                logger.warning(f"Error closing MSSQL connection: {e}")

    @contextmanager
    def postgres_connection(self):
        """Context manager for PostgreSQL connection with retry logic.

        Yields a psycopg2 connection using ``RealDictCursor``; the connection
        is closed on exit. No commit is issued here, so callers that write
        must commit themselves.
        """
        def _connect():
            return psycopg2.connect(
                host=config.POSTGRES_HOST,
                port=config.POSTGRES_PORT,
                database=config.POSTGRES_DATABASE,
                user=config.POSTGRES_USER,
                password=config.POSTGRES_PASSWORD,
                cursor_factory=RealDictCursor,
                connect_timeout=30,
                options='-c statement_timeout=300000'  # 5 minutes
            )

        conn = self._retry_connection(_connect, "PostgreSQL")
        try:
            yield conn
        finally:
            try:
                conn.close()
            except Exception as e:
                logger.warning(f"Error closing PostgreSQL connection: {e}")

    async def create_pg_pool(self):
        """Create async PostgreSQL connection pool and remember it on the instance."""
        self.pg_pool = await asyncpg.create_pool(
            host=config.POSTGRES_HOST,
            port=config.POSTGRES_PORT,
            database=config.POSTGRES_DATABASE,
            user=config.POSTGRES_USER,
            password=config.POSTGRES_PASSWORD,
            min_size=10,
            max_size=20
        )
        return self.pg_pool

    def get_redis_client(self):
        """Get Redis client, creating it on first use and reusing it afterwards."""
        if self.redis_client is None:
            self.redis_client = redis.Redis(
                host=config.REDIS_HOST,
                port=config.REDIS_PORT,
                db=config.REDIS_DB,
                decode_responses=True
            )
        return self.redis_client
def test_connections():
"""Test all database connections for health check"""
try:
# Test MSSQL connection (use master DB to avoid failures before restore)
db = DatabaseConnections()
mssql_master_conn_str = (
f"DRIVER={{ODBC Driver 17 for SQL Server}};"
f"SERVER={config.MSSQL_HOST},{config.MSSQL_PORT};"
f"DATABASE=master;"
f"UID={config.MSSQL_USER};"
f"PWD={config.MSSQL_PASSWORD};"
f"TrustServerCertificate=yes;"
)
import pyodbc as _pyodbc
with _pyodbc.connect(mssql_master_conn_str) as conn:
cursor = conn.cursor()
cursor.execute("SELECT 1")
cursor.fetchone()
logger.info("MSSQL connection successful (master)")
# Test PostgreSQL connection
with db.postgres_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT 1")
cursor.fetchone()
logger.info("PostgreSQL connection successful")
# Test Redis connection
redis_client = db.get_redis_client()
redis_client.ping()
logger.info("Redis connection successful")
return True
except Exception as e:
logger.error(f"Connection test failed: {e}")
return False
# Module-level connection manager instance; other ETL modules import this
# name instead of constructing their own DatabaseConnections.
db_connections = DatabaseConnections()

View File

@@ -0,0 +1 @@
# ETL Downloaders

View File

@@ -0,0 +1,180 @@
#!/usr/bin/env python3
"""
NHTSA vPIC Database Downloader
Downloads and prepares the NHTSA vPIC database file for ETL processing
"""
import os
import logging
import requests
import zipfile
from pathlib import Path
from datetime import datetime
from typing import Optional
logger = logging.getLogger(__name__)
class NHTSADownloader:
    """Downloads and manages NHTSA vPIC database files"""

    # (connect, read) timeouts in seconds so a stalled server cannot hang the ETL.
    REQUEST_TIMEOUT = (30, 300)
    CHUNK_SIZE = 8192
    # Emit a progress line roughly every 10 MB.
    PROGRESS_LOG_INTERVAL = 10 * 1024 * 1024

    def __init__(self, download_dir: str = "/app/data"):
        """Remember the download directory and create it (with parents) if needed.

        Args:
            download_dir: Directory that receives downloaded and extracted files.
        """
        self.download_dir = Path(download_dir)
        self.download_dir.mkdir(parents=True, exist_ok=True)

    def get_latest_database_url(self) -> str:
        """
        Get the latest NHTSA vPIC database URL

        Uses July 2025 version as specified
        """
        return "https://vpic.nhtsa.dot.gov/api/vPICList_lite_2025_07.bak.zip"

    def download_database(self, url: Optional[str] = None) -> Optional[Path]:
        """
        Download NHTSA vPIC database file

        The zip archive is streamed to the download directory, the .bak file
        is extracted next to it and the archive is removed afterwards. On any
        failure the (possibly partial) archive is removed as well.

        Args:
            url: Database URL (defaults to latest)
        Returns:
            Path to downloaded .bak file or None if failed
        """
        if url is None:
            url = self.get_latest_database_url()
        logger.info(f"Starting download of NHTSA vPIC database from: {url}")

        zip_path = None
        try:
            # Extract filename from URL
            zip_filename = url.split('/')[-1]
            zip_path = self.download_dir / zip_filename

            # The `with` block releases the streamed connection when done.
            with requests.get(url, stream=True, timeout=self.REQUEST_TIMEOUT) as response:
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))
                logger.info(f"Downloading {zip_filename} ({total_size:,} bytes)")

                downloaded = 0
                next_log_at = self.PROGRESS_LOG_INTERVAL
                with open(zip_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=self.CHUNK_SIZE):
                        if not chunk:
                            continue
                        f.write(chunk)
                        downloaded += len(chunk)
                        # Threshold based: chunk sizes are not guaranteed, so an
                        # exact-multiple test could never fire.
                        if total_size > 0 and downloaded >= next_log_at:
                            progress = (downloaded / total_size) * 100
                            logger.info(f"Download progress: {progress:.1f}% ({downloaded:,}/{total_size:,} bytes)")
                            while next_log_at <= downloaded:
                                next_log_at += self.PROGRESS_LOG_INTERVAL
            logger.info(f"Successfully downloaded: {zip_path}")

            # Extract the .bak file
            bak_path = self.extract_bak_file(zip_path)

            # Clean up zip file
            zip_path.unlink()
            logger.info(f"Cleaned up zip file: {zip_path}")
            return bak_path
        except Exception as e:
            logger.error(f"Failed to download database: {e}")
            # Do not leave a truncated or unusable archive behind.
            if zip_path is not None:
                try:
                    if zip_path.exists():
                        zip_path.unlink()
                except OSError as cleanup_error:
                    logger.warning(f"Could not remove incomplete download {zip_path}: {cleanup_error}")
            return None

    def extract_bak_file(self, zip_path: Path) -> Path:
        """
        Extract .bak file from zip archive

        Args:
            zip_path: Path to zip file
        Returns:
            Path to extracted .bak file
        Raises:
            ValueError: if the archive contains no .bak member
        """
        logger.info(f"Extracting .bak file from: {zip_path}")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Find the .bak file (extension compared case-insensitively)
            bak_files = [name for name in zip_ref.namelist() if name.lower().endswith('.bak')]
            if not bak_files:
                raise ValueError("No .bak file found in zip archive")
            if len(bak_files) > 1:
                logger.warning(f"Multiple .bak files found, using first: {bak_files}")
            bak_filename = bak_files[0]
            logger.info(f"Extracting: {bak_filename}")
            # extract() sanitises the member name and returns the path it
            # actually wrote, which can differ from download_dir / member name.
            bak_path = Path(zip_ref.extract(bak_filename, self.download_dir))
        logger.info(f"Successfully extracted: {bak_path}")
        return bak_path

    def get_existing_bak_file(self) -> Optional[Path]:
        """
        Find an existing .bak file in preferred locations.

        Searches both the shared mount (/app/shared) and local download dir.
        Only the top level of each directory is searched.

        Returns:
            Path to most recent .bak file or None
        """
        search_dirs = [Path("/app/shared"), self.download_dir]
        candidates = []
        for d in search_dirs:
            try:
                if d.exists():
                    # Character classes keep the match case-insensitive.
                    candidates.extend(d.glob("*.[bB][aA][kK]"))
            except Exception as e:
                logger.debug(f"Skipping directory {d}: {e}")
        if candidates:
            latest_bak = max(candidates, key=lambda p: p.stat().st_mtime)
            logger.info(f"Found existing .bak file: {latest_bak}")
            return latest_bak
        return None

    def ensure_database_file(self, force_download: bool = False) -> Optional[Path]:
        """
        Ensure we have a database file - download if needed

        Args:
            force_download: Force download even if file exists
        Returns:
            Path to .bak file or None if failed
        """
        if not force_download:
            existing_file = self.get_existing_bak_file()
            if existing_file:
                logger.info(f"Using existing database file: {existing_file}")
                return existing_file
        logger.info("Downloading fresh database file...")
        return self.download_database()

    def get_database_info(self, bak_path: Path) -> dict:
        """
        Get information about the database file

        Args:
            bak_path: Path to .bak file
        Returns:
            ``{"exists": False}`` when the file is missing, otherwise a dict
            with ``exists``, ``path``, ``size_mb``, ``modified`` (ISO format)
            and ``name``
        """
        if not bak_path.exists():
            return {"exists": False}
        stat = bak_path.stat()
        return {
            "exists": True,
            "path": str(bak_path),
            "size_mb": round(stat.st_size / (1024 * 1024), 1),
            "modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
            "name": bak_path.name
        }

View File

@@ -0,0 +1,629 @@
"""
JSON Extractor for Manual Vehicle Data Processing
Extracts and normalizes vehicle data from JSON files into database-ready structures.
Integrates with MakeNameMapper and EngineSpecParser utilities for comprehensive
data processing with L→I normalization and make name conversion.
Key Features:
- Extract make/model/year/trim/engine data from JSON files
- Handle electric vehicles (empty engines → default motor)
- Data validation and quality assurance
- Progress tracking and error reporting
Usage:
extractor = JsonExtractor(make_mapper, engine_parser)
make_data = extractor.extract_make_data('sources/makes/toyota.json')
all_data = extractor.extract_all_makes('sources/makes/')
"""
import json
import os
import glob
import logging
from typing import List, Dict, Optional, Generator, Tuple
from dataclasses import dataclass
from pathlib import Path
# Import our utilities (handle both relative and direct imports)
try:
from ..utils.make_name_mapper import MakeNameMapper
from ..utils.engine_spec_parser import EngineSpecParser, EngineSpec
except ImportError:
# Fallback for direct execution
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from utils.make_name_mapper import MakeNameMapper
from utils.engine_spec_parser import EngineSpecParser, EngineSpec
logger = logging.getLogger(__name__)
@dataclass
class ValidationResult:
    """Outcome of validating one JSON file: overall verdict plus collected messages"""
    is_valid: bool
    errors: List[str]
    warnings: List[str]

    @property
    def has_errors(self) -> bool:
        """True when at least one error was recorded."""
        return bool(self.errors)

    @property
    def has_warnings(self) -> bool:
        """True when at least one warning was recorded."""
        return bool(self.warnings)
@dataclass
class ModelData:
    """Extracted model data with normalized engines and trims"""
    name: str  # Model name from JSON
    years: List[int]  # Years this model appears in
    engines: List["EngineSpec"]  # Parsed and normalized engines
    trims: List[str]  # Trim names (from submodels)
    is_electric: bool = False  # True if empty engines array detected

    @property
    def total_trims(self) -> int:
        """Number of trims recorded for this model."""
        return len(self.trims)

    @property
    def total_engines(self) -> int:
        """Number of engines recorded for this model."""
        return len(self.engines)

    @property
    def year_range(self) -> str:
        """Years covered, as "first-last", a single year, or "Unknown" when empty.

        A single distinct year is rendered once even if it is listed several
        times, matching how ``MakeData.year_range`` treats repeated years.
        """
        if not self.years:
            return "Unknown"
        first, last = min(self.years), max(self.years)
        return str(first) if first == last else f"{first}-{last}"
@dataclass
class MakeData:
    """Everything extracted for one make: its models plus extraction diagnostics"""
    name: str  # Normalized display name (e.g., "Alfa Romeo")
    filename: str  # Original JSON filename
    models: List[ModelData]
    processing_errors: List[str]  # Any errors during extraction
    processing_warnings: List[str]  # Any warnings during extraction

    @property
    def total_models(self) -> int:
        """Number of models extracted for this make."""
        return len(self.models)

    @property
    def total_engines(self) -> int:
        """Engine count summed over all models."""
        return sum(entry.total_engines for entry in self.models)

    @property
    def total_trims(self) -> int:
        """Trim count summed over all models."""
        return sum(entry.total_trims for entry in self.models)

    @property
    def electric_models_count(self) -> int:
        """Number of models flagged as electric."""
        return sum(1 for entry in self.models if entry.is_electric)

    @property
    def year_range(self) -> str:
        """Years covered by all models: "first-last", a single year, or "Unknown"."""
        all_years = [year for entry in self.models for year in entry.years]
        if not all_years:
            return "Unknown"
        if len(set(all_years)) > 1:
            return f"{min(all_years)}-{max(all_years)}"
        return str(all_years[0])
@dataclass
class ExtractionResult:
    """Aggregate outcome of extracting every make file in a directory"""
    makes: List[MakeData]
    total_files_processed: int
    successful_extractions: int
    failed_extractions: int
    total_models: int
    total_engines: int
    total_electric_models: int

    @property
    def success_rate(self) -> float:
        """Fraction of processed files extracted successfully; 0.0 when none were processed."""
        if self.total_files_processed > 0:
            return self.successful_extractions / self.total_files_processed
        return 0.0
class JsonExtractor:
"""Extract normalized vehicle data from JSON files"""
def __init__(self, make_mapper: MakeNameMapper, engine_parser: EngineSpecParser):
    """
    Store the two helper utilities the extractor relies on.

    Args:
        make_mapper: Used to turn a JSON filename into a normalized make name
        engine_parser: Used to parse engine strings and to create the
            default electric motor spec
    """
    self.engine_parser = engine_parser
    self.make_mapper = make_mapper
    logger.info("JsonExtractor initialized with MakeNameMapper and EngineSpecParser")
def validate_json_structure(self, json_data: dict, filename: str) -> "ValidationResult":
    """
    Validate JSON structure before processing

    Expected shape: a dictionary with exactly one key (the make) whose value
    is a list of year entries, each a dictionary with ``year`` and ``models``.
    Every model is a dictionary with a ``name``; ``engines`` and
    ``submodels`` are optional but must be lists when present.

    Args:
        json_data: Loaded JSON data
        filename: Source filename for error context
    Returns:
        ValidationResult with validity status and any issues
    """
    errors = []
    warnings = []
    try:
        # Check top-level structure
        if not isinstance(json_data, dict):
            errors.append("JSON must be a dictionary")
            return ValidationResult(False, errors, warnings)

        # Should have exactly one key (the make name)
        if len(json_data.keys()) != 1:
            errors.append(f"JSON should have exactly one top-level key, found {len(json_data.keys())}")
            return ValidationResult(False, errors, warnings)

        make_key = list(json_data.keys())[0]
        make_data = json_data[make_key]

        # Make data should be a list of year entries
        if not isinstance(make_data, list):
            errors.append(f"Make data for '{make_key}' must be a list")
            return ValidationResult(False, errors, warnings)
        if len(make_data) == 0:
            warnings.append(f"Make '{make_key}' has no year entries")

        for i, year_entry in enumerate(make_data):
            if not isinstance(year_entry, dict):
                errors.append(f"Year entry {i} must be a dictionary")
                continue

            # Check required fields
            has_year = 'year' in year_entry
            if not has_year:
                errors.append(f"Year entry {i} missing 'year' field")
            if 'models' not in year_entry:
                errors.append(f"Year entry {i} missing 'models' field")
                continue

            # Validate the year only when present: the missing field has
            # already been reported, and indexing it would abort validation
            # of all remaining entries.
            if has_year:
                try:
                    year = int(year_entry['year'])
                    if year < 1900 or year > 2030:
                        warnings.append(f"Unusual year value: {year}")
                except (ValueError, TypeError):
                    errors.append(f"Invalid year value in entry {i}: {year_entry.get('year')}")

            # Validate models
            models = year_entry['models']
            if not isinstance(models, list):
                errors.append(f"Models in year entry {i} must be a list")
                continue
            for j, model in enumerate(models):
                if not isinstance(model, dict):
                    errors.append(f"Model {j} in year {year_entry.get('year')} must be a dictionary")
                    continue
                if 'name' not in model:
                    errors.append(f"Model {j} in year {year_entry.get('year')} missing 'name' field")
                # Engines and submodels are optional but should be lists if present
                if 'engines' in model and not isinstance(model['engines'], list):
                    errors.append(f"Engines for model {model.get('name')} must be a list")
                if 'submodels' in model and not isinstance(model['submodels'], list):
                    errors.append(f"Submodels for model {model.get('name')} must be a list")
    except Exception as e:
        errors.append(f"Unexpected error during validation: {str(e)}")

    is_valid = len(errors) == 0
    if errors:
        logger.warning(f"JSON validation failed for {filename}: {len(errors)} errors")
    elif warnings:
        logger.info(f"JSON validation for {filename}: {len(warnings)} warnings")
    else:
        logger.debug(f"JSON validation passed for {filename}")
    return ValidationResult(is_valid, errors, warnings)
def extract_make_data(self, json_file_path: str) -> "MakeData":
    """
    Extract complete make data from a single JSON file

    The file is loaded and validated, models are merged by name across all
    year entries, engine strings are parsed and de-duplicated, and models
    without any engine are treated as electric. Failures never raise: they
    are reported through ``processing_errors`` on the returned object.

    Args:
        json_file_path: Path to JSON file
    Returns:
        MakeData with extracted and normalized data (``models`` is empty when
        validation fails or a fatal error occurs)
    """
    filename = os.path.basename(json_file_path)
    logger.info(f"Extracting make data from {filename}")
    processing_errors = []
    processing_warnings = []

    def _empty_result():
        # Shared result for "validation failed" and "fatal error".
        return MakeData(
            name=self.make_mapper.normalize_make_name(filename),
            filename=filename,
            models=[],
            processing_errors=processing_errors,
            processing_warnings=processing_warnings
        )

    def _collect_strings(values, target, label, model_name, year):
        # Add stripped, non-empty strings to `target`; report anything else
        # that is not simply empty instead of failing the whole file.
        for value in values:
            if isinstance(value, str):
                if value.strip():
                    target.add(value.strip())
            elif value:
                processing_warnings.append(
                    f"Ignoring non-string {label} for {model_name} in year {year}: {value!r}"
                )

    try:
        # Load and validate JSON
        with open(json_file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
        validation = self.validate_json_structure(json_data, filename)
        processing_errors.extend(validation.errors)
        processing_warnings.extend(validation.warnings)
        if not validation.is_valid:
            logger.error(f"JSON validation failed for {filename}")
            return _empty_result()

        # Get normalized make name
        make_name = self.make_mapper.normalize_make_name(filename)
        logger.debug(f"Normalized make name: {filename} -> {make_name}")

        # Extract data
        make_key = list(json_data.keys())[0]
        year_entries = json_data[make_key]

        # Group models by name across all years
        models_by_name = {}  # model_name -> {years: set, engines: set, trims: set}
        for year_entry in year_entries:
            try:
                year = int(year_entry['year'])
                for model_entry in year_entry.get('models', []):
                    raw_name = model_entry.get('name', '')
                    if not isinstance(raw_name, str):
                        processing_warnings.append(f"Invalid model name in year {year}: {raw_name!r}")
                        continue
                    model_name = raw_name.strip()
                    if not model_name:
                        processing_warnings.append(f"Empty model name in year {year}")
                        continue

                    model_info = models_by_name.setdefault(model_name, {
                        'years': set(),
                        'engines': set(),
                        'trims': set()
                    })
                    model_info['years'].add(year)
                    _collect_strings(model_entry.get('engines', []), model_info['engines'],
                                     "engine", model_name, year)
                    # Trims come from the 'submodels' list
                    _collect_strings(model_entry.get('submodels', []), model_info['trims'],
                                     "trim", model_name, year)
            except (ValueError, TypeError) as e:
                processing_errors.append(f"Error processing year entry: {str(e)}")
                continue

        # Convert to ModelData objects
        models = []
        for model_name, model_info in models_by_name.items():
            try:
                is_electric = False
                if not model_info['engines']:
                    # Empty engines array - electric vehicle
                    is_electric = True
                    engine_specs = [self.engine_parser.create_electric_motor()]
                    logger.debug(f"Created electric motor for {make_name} {model_name}")
                else:
                    # Sorted so the resulting engine order is reproducible.
                    engine_specs = [
                        self.engine_parser.parse_engine_string(engine_str)
                        for engine_str in sorted(model_info['engines'])
                    ]

                # Remove duplicate engines based on key attributes
                unique_engines = self.engine_parser.get_unique_engines(engine_specs)
                models.append(ModelData(
                    name=model_name,
                    years=sorted(model_info['years']),
                    engines=unique_engines,
                    trims=sorted(model_info['trims']),
                    is_electric=is_electric
                ))
            except Exception as e:
                processing_errors.append(f"Error processing model {model_name}: {str(e)}")
                continue

        # Sort models by name
        models.sort(key=lambda m: m.name)
        make_data = MakeData(
            name=make_name,
            filename=filename,
            models=models,
            processing_errors=processing_errors,
            processing_warnings=processing_warnings
        )
        logger.info(f"Extracted {filename}: {len(models)} models, "
                    f"{make_data.total_engines} engines, {make_data.electric_models_count} electric models")
        return make_data
    except Exception as e:
        logger.error(f"Failed to extract make data from {filename}: {str(e)}")
        processing_errors.append(f"Fatal error: {str(e)}")
        return _empty_result()
def extract_all_makes(self, sources_dir: str) -> ExtractionResult:
    """
    Extract every ``*.json`` file found directly inside a directory.

    Files are handled in sorted path order. A file counts as failed when its
    MakeData carries processing errors, or when ``extract_make_data`` raises;
    in the latter case an empty placeholder MakeData holding the error text
    is recorded so the file still appears in the result.

    Args:
        sources_dir: Directory containing JSON make files

    Returns:
        ExtractionResult with all extracted makes and aggregate counters
    """
    logger.info(f"Starting extraction of all makes from {sources_dir}")

    # Discover candidate files
    json_files = glob.glob(os.path.join(sources_dir, '*.json'))
    if not json_files:
        logger.warning(f"No JSON files found in {sources_dir}")
        return ExtractionResult(
            makes=[],
            total_files_processed=0,
            successful_extractions=0,
            failed_extractions=0,
            total_models=0,
            total_engines=0,
            total_electric_models=0
        )

    file_count = len(json_files)
    logger.info(f"Found {file_count} JSON files to process")

    makes = []
    ok_count = 0
    failed_count = 0

    # Sorted iteration keeps the processing order reproducible
    for path in sorted(json_files):
        try:
            extracted = self.extract_make_data(path)
            makes.append(extracted)
            if not extracted.processing_errors:
                ok_count += 1
                logger.debug(f"Extraction successful for {extracted.filename}")
            else:
                failed_count += 1
                logger.error(f"Extraction completed with errors for {extracted.filename}")
        except Exception as e:
            logger.error(f"Fatal error processing {os.path.basename(path)}: {str(e)}")
            failed_count += 1
            # Record a placeholder so the failed file is still represented
            base_name = os.path.basename(path)
            makes.append(MakeData(
                name=self.make_mapper.normalize_make_name(base_name),
                filename=base_name,
                models=[],
                processing_errors=[f"Fatal extraction error: {str(e)}"],
                processing_warnings=[]
            ))

    # Aggregate totals across all makes
    model_total = sum(m.total_models for m in makes)
    engine_total = sum(m.total_engines for m in makes)
    electric_total = sum(m.electric_models_count for m in makes)

    result = ExtractionResult(
        makes=makes,
        total_files_processed=file_count,
        successful_extractions=ok_count,
        failed_extractions=failed_count,
        total_models=model_total,
        total_engines=engine_total,
        total_electric_models=electric_total
    )
    logger.info(f"Extraction complete: {ok_count}/{file_count} successful, "
                f"{model_total} models, {engine_total} engines, {electric_total} electric models")
    return result
def get_extraction_statistics(self, result: ExtractionResult) -> Dict[str, any]:
    """
    Summarise an extraction run as a nested dictionary.

    The returned dictionary has four keys, in this order: ``files`` (file
    counters and success rate), ``data`` (entity totals), ``quality``
    (error/warning counts) and ``makes`` (one summary dict per make).

    NOTE(review): the return annotation uses the builtin ``any``;
    ``typing.Any`` is presumably what was intended - confirm before changing.

    Args:
        result: ExtractionResult from extract_all_makes

    Returns:
        Dictionary with detailed statistics
    """
    makes = result.makes

    file_stats = {
        'total_processed': result.total_files_processed,
        'successful': result.successful_extractions,
        'failed': result.failed_extractions,
        'success_rate': result.success_rate
    }
    data_stats = {
        'total_makes': len(makes),
        'total_models': result.total_models,
        'total_engines': result.total_engines,
        'electric_models': result.total_electric_models
    }
    quality_stats = {
        'makes_with_errors': len([m for m in makes if m.processing_errors]),
        'makes_with_warnings': len([m for m in makes if m.processing_warnings]),
        'total_errors': sum(len(m.processing_errors) for m in makes),
        'total_warnings': sum(len(m.processing_warnings) for m in makes)
    }

    # One flat summary per make
    per_make = [
        {
            'name': make.name,
            'filename': make.filename,
            'models': make.total_models,
            'engines': make.total_engines,
            'trims': make.total_trims,
            'electric_models': make.electric_models_count,
            'year_range': make.year_range,
            'errors': len(make.processing_errors),
            'warnings': len(make.processing_warnings)
        }
        for make in makes
    ]

    return {
        'files': file_stats,
        'data': data_stats,
        'quality': quality_stats,
        'makes': per_make
    }
def print_extraction_report(self, result: ExtractionResult) -> None:
    """
    Write a human-readable extraction report to standard output.

    The report is built from ``get_extraction_statistics`` and lists file
    counters, data totals, quality counters, every make that has errors, and
    the ten makes with the most models.

    Args:
        result: ExtractionResult from extract_all_makes
    """
    stats = self.get_extraction_statistics(result)
    files = stats['files']
    data = stats['data']
    quality = stats['quality']

    print("🚀 JSON EXTRACTION REPORT")
    print("=" * 50)

    # File processing summary
    print("\n📁 FILE PROCESSING")
    print(f" Files processed: {files['total_processed']}")
    print(f" Successful: {files['successful']}")
    print(f" Failed: {files['failed']}")
    print(f" Success rate: {files['success_rate']:.1%}")

    # Data summary
    print("\n📊 DATA EXTRACTED")
    print(f" Makes: {data['total_makes']}")
    print(f" Models: {data['total_models']}")
    print(f" Engines: {data['total_engines']}")
    print(f" Electric models: {data['electric_models']}")

    # Quality summary
    print("\n🔍 QUALITY ASSESSMENT")
    print(f" Makes with errors: {quality['makes_with_errors']}")
    print(f" Makes with warnings: {quality['makes_with_warnings']}")
    print(f" Total errors: {quality['total_errors']}")
    print(f" Total warnings: {quality['total_warnings']}")

    # Problematic makes, only when there are any
    if quality['makes_with_errors'] > 0:
        print("\n⚠️ MAKES WITH ERRORS:")
        for make in (m for m in result.makes if m.processing_errors):
            print(f" {make.name} ({make.filename}): {len(make.processing_errors)} errors")

    # Ten largest makes by model count
    print("\n🏆 TOP MAKES BY MODEL COUNT:")
    ranked = sorted(result.makes, key=lambda m: m.total_models, reverse=True)
    for make in ranked[:10]:
        print(f" {make.name}: {make.total_models} models, {make.total_engines} engines")
# Example usage and testing functions
def example_usage():
    """Demonstrate JsonExtractor usage.

    Extracts ``sources/makes/toyota.json`` when it exists, then every make in
    ``sources/makes``, and prints the resulting report. Prints a notice
    instead when the sources directory is missing.
    """
    # Import these up front. Importing `os` only inside the ImportError
    # fallback made `os` a function-local name, so whenever the relative
    # imports succeeded the later `os.path` calls raised UnboundLocalError.
    import os
    import sys

    print("🚀 JsonExtractor Example Usage")
    print("=" * 40)

    # Use direct imports for example usage
    try:
        from ..utils.make_name_mapper import MakeNameMapper
        from ..utils.engine_spec_parser import EngineSpecParser
    except ImportError:
        # Fallback for direct execution: make the parent directory importable
        sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
        from utils.make_name_mapper import MakeNameMapper
        from utils.engine_spec_parser import EngineSpecParser

    # Initialize utilities
    make_mapper = MakeNameMapper()
    engine_parser = EngineSpecParser()

    # Create extractor
    extractor = JsonExtractor(make_mapper, engine_parser)

    sources_dir = "sources/makes"
    if not os.path.exists(sources_dir):
        print(f"Sources directory not found: {sources_dir}")
        return

    # Extract single make
    toyota_file = os.path.join(sources_dir, "toyota.json")
    if os.path.exists(toyota_file):
        print("\n📄 Extracting from toyota.json...")
        toyota_data = extractor.extract_make_data(toyota_file)
        print(f" Make: {toyota_data.name}")
        print(f" Models: {toyota_data.total_models}")
        print(f" Engines: {toyota_data.total_engines}")
        print(f" Electric models: {toyota_data.electric_models_count}")
        print(f" Year range: {toyota_data.year_range}")
        if toyota_data.processing_errors:
            print(f" Errors: {len(toyota_data.processing_errors)}")
        if toyota_data.processing_warnings:
            print(f" Warnings: {len(toyota_data.processing_warnings)}")

    # Extract all makes
    print("\n🔄 Extracting all makes...")
    result = extractor.extract_all_makes(sources_dir)
    extractor.print_extraction_report(result)
# Run the demonstration only when this module is executed as a script.
if __name__ == "__main__":
    example_usage()

View File

@@ -0,0 +1,337 @@
import logging
from typing import List, Dict, Optional, Generator
from ..connections import db_connections
from ..utils.make_filter import MakeFilter
from tqdm import tqdm
logger = logging.getLogger(__name__)
class MSSQLExtractor:
    """Extract data from MS SQL Server source database.

    Most queries are restricted to the makes accepted by ``make_filter``,
    which supplies a ready-made SQL predicate through
    ``get_sql_filter(column)``.
    """

    def __init__(self, make_filter: Optional[MakeFilter] = None):
        """
        Args:
            make_filter: Filter describing the allowed makes; a default
                ``MakeFilter()`` is created when omitted.
        """
        self.batch_size = 10000  # rows per page in extract_patterns_data
        self.make_filter = make_filter or MakeFilter()
        logger.info(f"Initialized MSSQL extractor with {len(self.make_filter.get_allowed_makes())} allowed makes")

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    def _allowed_manufacturer_ids_sql(self) -> str:
        """Return a subquery selecting Ids of manufacturers linked to an allowed make."""
        return f"""
                SELECT DISTINCT mfr.Id
                FROM dbo.Manufacturer mfr
                JOIN dbo.Manufacturer_Make mfm ON mfr.Id = mfm.ManufacturerId
                JOIN dbo.Make mk ON mfm.MakeId = mk.Id
                WHERE {self.make_filter.get_sql_filter('mk.Name')}
        """

    def _allowed_make_ids_sql(self) -> str:
        """Return a subquery selecting the Ids of allowed makes."""
        return f"""
                SELECT Id FROM dbo.Make
                WHERE {self.make_filter.get_sql_filter('Name')}
        """

    def _fetch_all(self, query: str) -> List[Dict]:
        """Run ``query`` on a fresh MSSQL connection and return all rows as dicts."""
        with db_connections.mssql_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(query)
            rows = cursor.fetchall()
            return self._rows_to_dicts(cursor, rows)

    # ------------------------------------------------------------------
    # Extraction queries
    # ------------------------------------------------------------------
    def extract_wmi_data(self) -> List[Dict]:
        """Extract WMI (World Manufacturer Identifier) data with make filtering"""
        logger.info("Extracting WMI data from source database with make filtering")
        query = f"""
            SELECT
                w.Id,
                w.Wmi,
                w.ManufacturerId,
                w.MakeId,
                w.VehicleTypeId,
                w.TruckTypeId,
                w.CountryId,
                w.PublicAvailabilityDate,
                w.NonCompliant,
                w.NonCompliantReason,
                w.CreatedOn,
                w.UpdatedOn,
                w.ProcessedOn
            FROM dbo.Wmi w
            WHERE w.PublicAvailabilityDate <= GETDATE()
              AND w.ManufacturerId IN ({self._allowed_manufacturer_ids_sql()})
            ORDER BY w.Id
        """
        results = self._fetch_all(query)
        logger.info(f"Extracted {len(results)} WMI records")
        return results

    def extract_wmi_vin_schema_mappings(self) -> List[Dict]:
        """Extract WMI to VIN Schema mappings with year ranges and make filtering"""
        logger.info("Extracting WMI-VinSchema mappings with make filtering")
        query = f"""
            SELECT
                wvs.WmiId,
                wvs.VinSchemaId,
                wvs.YearFrom,
                wvs.YearTo,
                w.Wmi,
                vs.Name as SchemaName
            FROM dbo.Wmi_VinSchema wvs
            JOIN dbo.Wmi w ON wvs.WmiId = w.Id
            JOIN dbo.VinSchema vs ON wvs.VinSchemaId = vs.Id
            WHERE w.PublicAvailabilityDate <= GETDATE()
              AND w.ManufacturerId IN ({self._allowed_manufacturer_ids_sql()})
              AND w.MakeId IN ({self._allowed_make_ids_sql()})
            ORDER BY wvs.WmiId, wvs.VinSchemaId
        """
        results = self._fetch_all(query)
        logger.info(f"Extracted {len(results)} WMI-VinSchema mappings (filtered by allowed makes)")
        return results

    def extract_patterns_data(self) -> Generator[List[Dict], None, None]:
        """Extract pattern data in batches with make filtering.

        Yields:
            Lists of row dicts, at most ``self.batch_size`` rows each.
        """
        logger.info("Extracting pattern data from source database with make filtering")

        # FROM/WHERE part shared by the count query and the paged query
        from_where = f"""
            FROM dbo.Pattern p
            JOIN dbo.Element e ON p.ElementId = e.Id
            JOIN dbo.VinSchema vs ON p.VinSchemaId = vs.Id
            JOIN dbo.Wmi_VinSchema wvs ON vs.Id = wvs.VinSchemaId
            JOIN dbo.Wmi w ON wvs.WmiId = w.Id
            JOIN dbo.Wmi_Make wm ON w.Id = wm.WmiId
            JOIN dbo.Make m ON wm.MakeId = m.Id
            WHERE {self.make_filter.get_sql_filter('m.Name')}
              AND e.Id IN (26, 27, 28, 18, 24)
        """

        # First get the total count with filtering
        count_query = "SELECT COUNT(*) as total" + from_where
        with db_connections.mssql_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(count_query)
            total_row = self._row_to_dict(cursor, cursor.fetchone())
            total_count = total_row.get('total', 0)
        logger.info(f"Total patterns to extract (filtered): {total_count}")

        # The joins can return several rows per pattern, so p.Id alone is not
        # a unique sort key; w.Id and m.Id are added as tie-breakers so that
        # OFFSET/FETCH pages do not overlap or skip rows between requests.
        select_query = """
            SELECT
                p.Id,
                p.VinSchemaId,
                p.Keys,
                p.ElementId,
                p.AttributeId,
                e.Name as ElementName,
                e.weight,
                e.GroupName,
                vs.Name as SchemaName,
                w.Wmi,
                m.Name as MakeName
        """ + from_where + """
            ORDER BY p.Id, w.Id, m.Id
        """

        with db_connections.mssql_connection() as conn:
            cursor = conn.cursor()
            for offset in tqdm(range(0, total_count, self.batch_size), desc="Extracting filtered patterns"):
                # Append the paging clause by concatenation instead of calling
                # str.format() on the whole query: the make filter text is
                # opaque and may contain literal braces.
                paged_query = (
                    select_query
                    + f" OFFSET {int(offset)} ROWS FETCH NEXT {int(self.batch_size)} ROWS ONLY"
                )
                cursor.execute(paged_query)
                rows = cursor.fetchall()
                if not rows:
                    break
                yield self._rows_to_dicts(cursor, rows)

    def extract_elements_data(self) -> List[Dict]:
        """Extract element definitions"""
        logger.info("Extracting element data")
        query = """
            SELECT
                Id,
                Name,
                Code,
                LookupTable,
                Description,
                IsPrivate,
                GroupName,
                DataType,
                MinAllowedValue,
                MaxAllowedValue,
                IsQS,
                Decode,
                weight
            FROM dbo.Element
            ORDER BY Id
        """
        results = self._fetch_all(query)
        logger.info(f"Extracted {len(results)} element definitions")
        return results

    def extract_reference_table(self, table_name: str) -> List[Dict]:
        """Extract data from a reference table, make-filtered where applicable.

        ``Manufacturer``, ``Make``, ``Model`` and ``Wmi`` are restricted to the
        allowed makes; any other table is read in full, ordered by ``Id``.

        Args:
            table_name: Unqualified table name in the ``dbo`` schema.

        Returns:
            All selected rows as dicts.

        Raises:
            ValueError: If ``table_name`` is not a plain identifier. The name
                is interpolated into the SQL text, so anything else is refused.
        """
        if not isinstance(table_name, str) or not table_name.isidentifier():
            raise ValueError(f"Invalid reference table name: {table_name!r}")

        logger.info(f"Extracting data from {table_name} with make filtering")
        filtered = True
        # Apply make filtering - filter by Make brand names (simpler and more efficient)
        if table_name == 'Manufacturer':
            # Extract manufacturers linked to filtered makes only
            query = f"""
                SELECT DISTINCT mfr.* FROM dbo.Manufacturer mfr
                JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
                JOIN dbo.Make m ON mm.MakeId = m.Id
                WHERE {self.make_filter.get_sql_filter('m.Name')}
                ORDER BY mfr.Id
            """
        elif table_name == 'Make':
            # Filter makes directly by brand names (GMC, Ford, Toyota, etc.)
            query = f"""
                SELECT * FROM dbo.Make
                WHERE {self.make_filter.get_sql_filter('Name')}
                ORDER BY Id
            """
        elif table_name == 'Model':
            # Filter models by allowed make brand names
            query = f"""
                SELECT md.* FROM dbo.Model md
                JOIN dbo.Make_Model mm ON md.Id = mm.ModelId
                JOIN dbo.Make m ON mm.MakeId = m.Id
                WHERE {self.make_filter.get_sql_filter('m.Name')}
                ORDER BY md.Id
            """
        elif table_name == 'Wmi':
            # Filter WMI records by allowed manufacturers (linked to makes) AND makes directly
            query = f"""
                SELECT w.* FROM dbo.Wmi w
                WHERE w.PublicAvailabilityDate <= GETDATE()
                  AND w.ManufacturerId IN ({self._allowed_manufacturer_ids_sql()})
                  AND w.MakeId IN ({self._allowed_make_ids_sql()})
                ORDER BY w.Id
            """
        else:
            # No filtering for other reference tables
            filtered = False
            query = f"SELECT * FROM dbo.{table_name} ORDER BY Id"

        results = self._fetch_all(query)
        if filtered:
            logger.info(f"Extracted {len(results)} records from {table_name} (filtered by allowed makes)")
        else:
            logger.info(f"Extracted {len(results)} records from {table_name}")
        return results

    def extract_make_model_relationships(self) -> List[Dict]:
        """Extract Make-Model relationships with make filtering"""
        logger.info("Extracting Make-Model relationships with make filtering")
        query = f"""
            SELECT
                mm.MakeId,
                mm.ModelId,
                m.Name as MakeName,
                md.Name as ModelName
            FROM dbo.Make_Model mm
            JOIN dbo.Make m ON mm.MakeId = m.Id
            JOIN dbo.Model md ON mm.ModelId = md.Id
            WHERE {self.make_filter.get_sql_filter('m.Name')}
            ORDER BY mm.MakeId, mm.ModelId
        """
        results = self._fetch_all(query)
        logger.info(f"Extracted {len(results)} Make-Model relationships (filtered by allowed makes)")
        return results

    def extract_wmi_make_relationships(self) -> List[Dict]:
        """Extract WMI-Make relationships with make filtering"""
        logger.info("Extracting WMI-Make relationships with make filtering")
        query = f"""
            SELECT
                wm.WmiId,
                wm.MakeId,
                w.Wmi,
                m.Name as MakeName
            FROM dbo.Wmi_Make wm
            JOIN dbo.Wmi w ON wm.WmiId = w.Id
            JOIN dbo.Make m ON wm.MakeId = m.Id
            WHERE w.PublicAvailabilityDate <= GETDATE()
              AND w.ManufacturerId IN ({self._allowed_manufacturer_ids_sql()})
              AND w.MakeId IN ({self._allowed_make_ids_sql()})
              AND m.Id IN ({self._allowed_make_ids_sql()})
            ORDER BY wm.WmiId, wm.MakeId
        """
        results = self._fetch_all(query)
        logger.info(f"Extracted {len(results)} WMI-Make relationships (filtered by allowed makes)")
        return results

    # ------------------------------------------------------------------
    # Row conversion
    # ------------------------------------------------------------------
    def _rows_to_dicts(self, cursor, rows) -> List[Dict]:
        """Convert pyodbc rows to list of dicts using cursor description."""
        if not rows:
            return []
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in rows]

    def _row_to_dict(self, cursor, row) -> Dict:
        """Convert single pyodbc row to dict; ``None`` becomes an empty dict."""
        if row is None:
            return {}
        columns = [col[0] for col in cursor.description]
        return dict(zip(columns, row))

View File

@@ -0,0 +1,63 @@
import logging
from typing import Optional, Dict, Any, List
from ..connections import db_connections
logger = logging.getLogger(__name__)
class VinProcExtractor:
    """Utilities to inspect and sample the MSSQL VIN decode stored procedure."""

    def __init__(self, proc_name: str = 'dbo.spVinDecode'):
        """
        Args:
            proc_name: Name of the procedure that ``sample_execute`` runs.
                It is placed in the SQL text verbatim, so it must come from
                trusted configuration.
        """
        self.proc_name = proc_name

    def find_proc(self) -> Optional[Dict[str, Any]]:
        """Locate the VIN decode proc by name pattern, return basic metadata.

        Returns:
            Dict with ``object_name``, ``schema_name`` and ``type_desc`` for
            the most recently created object whose name matches
            ``%Vin%Decode%``, or ``None`` when nothing matches.
        """
        query = """
            SELECT TOP 1
                o.name AS object_name,
                s.name AS schema_name,
                o.type_desc
            FROM sys.objects o
            JOIN sys.schemas s ON s.schema_id = o.schema_id
            WHERE o.name LIKE '%Vin%Decode%'
            ORDER BY o.create_date DESC
        """
        with db_connections.mssql_connection() as conn:
            cur = conn.cursor()
            cur.execute(query)
            row = cur.fetchone()
            if not row:
                logger.warning("VIN decode stored procedure not found by pattern")
                return None
            return {'object_name': row[0], 'schema_name': row[1], 'type_desc': row[2]}

    def get_definition(self, schema: str, name: str) -> str:
        """Return the text definition of the proc using sp_helptext semantics.

        Args:
            schema: Schema that owns the object.
            name: Object name within that schema.

        Returns:
            The concatenated text segments returned by ``sp_helptext``.
        """
        # Bind the object name as a parameter instead of splicing it into the
        # statement, and call sp_helptext unqualified rather than as
        # "<schema>.sp_helptext".
        sql = "EXEC sp_helptext @objname = ?"
        definition_lines: List[str] = []
        with db_connections.mssql_connection() as conn:
            cur = conn.cursor()
            cur.execute(sql, (f"{schema}.{name}",))
            for row in cur.fetchall():
                # sp_helptext returns a single NVARCHAR column with line segments
                definition_lines.append(row[0])
        return ''.join(definition_lines)

    def sample_execute(self, vin: str) -> Optional[List[Dict[str, Any]]]:
        """Execute the VIN decode proc with a VIN to capture output shape.

        Args:
            vin: VIN passed as the ``@VIN`` parameter.

        Returns:
            Result rows as dicts (empty when the call produced no result
            set), or ``None`` when execution failed.
        """
        # Prefer proc signature with @VIN only; if it requires year, MSSQL will error.
        sql = f"EXEC {self.proc_name} @VIN=?"
        with db_connections.mssql_connection() as conn:
            cur = conn.cursor()
            try:
                cur.execute(sql, (vin,))
                if not cur.description:
                    return []
                columns = [c[0] for c in cur.description]
                return [dict(zip(columns, r)) for r in cur.fetchall()]
            except Exception as e:
                logger.warning(f"VIN proc sample execution failed: {e}")
                return None

View File

@@ -0,0 +1 @@
# ETL Loaders

View File

@@ -0,0 +1,716 @@
"""
JSON Manual Loader for Vehicles ETL
Loads extracted JSON data into PostgreSQL database with referential integrity.
Supports clear/append modes with duplicate handling and comprehensive progress tracking.
Database Schema:
- vehicles.make (id, name)
- vehicles.model (id, make_id, name)
- vehicles.model_year (id, model_id, year)
- vehicles.trim (id, model_year_id, name)
- vehicles.engine (id, name, code, displacement_l, cylinders, fuel_type, aspiration)
- vehicles.trim_engine (trim_id, engine_id)
Load Modes:
- CLEAR: Truncate all tables and reload (destructive)
- APPEND: Insert with conflict resolution (safe)
Usage:
loader = JsonManualLoader(postgres_loader)
result = loader.load_all_makes(extraction_result.makes, LoadMode.APPEND)
"""
import logging
from typing import List, Dict, Optional, Tuple
from enum import Enum
from dataclasses import dataclass
from psycopg2.extras import execute_batch
# Import our components (handle both relative and direct imports)
try:
from .postgres_loader import PostgreSQLLoader
from ..extractors.json_extractor import MakeData, ModelData, ExtractionResult
from ..utils.engine_spec_parser import EngineSpec
from ..connections import db_connections
except ImportError:
# Fallback for direct execution
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
# Import with fallback handling for nested imports
try:
from loaders.postgres_loader import PostgreSQLLoader
except ImportError:
# Mock PostgreSQLLoader for testing
class PostgreSQLLoader:
def __init__(self):
self.batch_size = 1000
from extractors.json_extractor import MakeData, ModelData, ExtractionResult
from utils.engine_spec_parser import EngineSpec
try:
from connections import db_connections
except ImportError:
# Mock db_connections for testing
class MockDBConnections:
def postgres_connection(self):
raise NotImplementedError("Database connection not available in test mode")
db_connections = MockDBConnections()
logger = logging.getLogger(__name__)
class LoadMode(Enum):
    """Data loading modes.

    CLEAR  - truncate and reload (destructive)
    APPEND - insert with conflict handling (safe)
    """
    CLEAR = "clear"
    APPEND = "append"
@dataclass
class LoadResult:
    """Result of loading operations"""
    total_makes: int
    total_models: int
    total_model_years: int
    total_trims: int
    total_engines: int
    total_trim_engine_mappings: int
    failed_makes: List[str]
    warnings: List[str]
    load_mode: LoadMode

    @property
    def success_count(self) -> int:
        """Number of makes that are not listed in ``failed_makes``."""
        failed = len(self.failed_makes)
        return self.total_makes - failed

    @property
    def success_rate(self) -> float:
        """Fraction of makes loaded successfully; 0.0 when there were no makes."""
        if self.total_makes > 0:
            return self.success_count / self.total_makes
        return 0.0
@dataclass
class LoadStatistics:
    """Detailed loading statistics"""
    makes_processed: int = 0
    makes_skipped: int = 0
    models_inserted: int = 0
    model_years_inserted: int = 0
    skipped_model_years: int = 0
    trims_inserted: int = 0
    engines_inserted: int = 0
    trim_engine_mappings_inserted: int = 0
    duplicate_makes: int = 0
    duplicate_models: int = 0
    duplicate_engines: int = 0
    # None is a placeholder; __post_init__ swaps it for a fresh list
    errors: List[str] = None
    warnings: List[str] = None

    def __post_init__(self):
        """Give each instance its own message lists when none were supplied."""
        self.errors = [] if self.errors is None else self.errors
        self.warnings = [] if self.warnings is None else self.warnings
class JsonManualLoader:
"""Load JSON-extracted vehicle data into PostgreSQL"""
def _get_id_from_result(self, result, column_name='id'):
"""Helper to extract ID from query result, handling both tuple and dict cursors"""
if result is None:
return None
if isinstance(result, tuple):
return result[0]
# For RealDictCursor, try the column name first, fall back to key access
if column_name in result:
return result[column_name]
# For COUNT(*) queries, the key might be 'count'
if 'count' in result:
return result['count']
# Fall back to first value
return list(result.values())[0] if result else None
def __init__(self, postgres_loader: Optional[PostgreSQLLoader] = None):
"""
Initialize JSON manual loader
Args:
postgres_loader: Existing PostgreSQL loader instance
"""
self.postgres_loader = postgres_loader or PostgreSQLLoader()
self.batch_size = 1000
logger.info("JsonManualLoader initialized")
def clear_all_tables(self) -> None:
    """
    Clear all vehicles tables in dependency order

    WARNING: This is destructive and will remove all data

    Each table is truncated and committed on its own. When a TRUNCATE fails
    the transaction is rolled back before moving on: PostgreSQL rejects every
    further statement in a failed transaction, so without the rollback one
    failing table would prevent all later tables from being cleared and the
    final commit would persist nothing.
    """
    logger.warning("CLEARING ALL VEHICLES TABLES - This is destructive!")
    tables_to_clear = [
        'trim_engine',        # Many-to-many mappings first
        'trim_transmission',
        'performance',        # Tables with foreign keys
        'trim',
        'model_year',
        'model',
        'make',
        'engine',             # Independent tables last
        'transmission'
    ]
    failed_tables = []
    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()
        for table in tables_to_clear:
            try:
                cursor.execute(f"TRUNCATE TABLE vehicles.{table} CASCADE")
                conn.commit()
                logger.info(f"Cleared vehicles.{table}")
            except Exception as e:
                # Best effort per table: reset the transaction and carry on
                conn.rollback()
                failed_tables.append(table)
                logger.warning(f"Failed to clear vehicles.{table}: {str(e)}")

    if failed_tables:
        logger.warning(f"Vehicles tables cleared with failures: {', '.join(failed_tables)}")
    else:
        logger.info("All vehicles tables cleared")
def load_make(self, make_data: MakeData, mode: LoadMode, stats: LoadStatistics) -> int:
    """
    Load a single make with all related data

    The make row is looked up case-insensitively and inserted only when
    missing; all of its models are then loaded on the same cursor and the
    whole make is committed once.

    Args:
        make_data: Extracted make data
        mode: Loading mode (clear/append), passed through to ``load_model``
        stats: Statistics accumulator

    Returns:
        Make ID in database

    Raises:
        Exception: Any failure is logged, appended to ``stats.errors`` and
            re-raised.
    """
    logger.debug(f"Loading make: {make_data.name}")
    lookup_sql = "SELECT id FROM vehicles.make WHERE lower(name) = lower(%s)"
    try:
        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()

            # 1. Insert or get make (always check for existing to avoid constraint violations)
            # Check if make exists (case-insensitive to match database constraint)
            cursor.execute(lookup_sql, (make_data.name,))
            existing = cursor.fetchone()
            if existing:
                make_id = self._get_id_from_result(existing)
                stats.duplicate_makes += 1
                logger.debug(f"Make {make_data.name} already exists with ID {make_id}")
            else:
                # ON CONFLICT DO NOTHING instead of catching a duplicate-key
                # error: once a statement fails, PostgreSQL rejects every
                # further statement in the transaction, so a follow-up SELECT
                # after the error could never succeed.
                cursor.execute(
                    "INSERT INTO vehicles.make (name) VALUES (%s) "
                    "ON CONFLICT DO NOTHING RETURNING id",
                    (make_data.name,)
                )
                inserted = cursor.fetchone()
                if inserted:
                    make_id = self._get_id_from_result(inserted)
                    logger.debug(f"Inserted make {make_data.name} with ID {make_id}")
                else:
                    # No row returned: the insert was skipped by a conflict,
                    # so look the row up again.
                    cursor.execute(lookup_sql, (make_data.name,))
                    existing = cursor.fetchone()
                    if not existing:
                        raise RuntimeError(
                            f"Make {make_data.name} conflicted on insert but could not be found"
                        )
                    make_id = self._get_id_from_result(existing)
                    stats.duplicate_makes += 1
                    logger.debug(f"Make {make_data.name} found after retry with ID {make_id}")

            # 2. Process models
            for model_data in make_data.models:
                self.load_model(cursor, make_id, model_data, mode, stats)

            conn.commit()
            stats.makes_processed += 1
            return make_id
    except Exception as e:
        error_msg = f"Failed to load make {make_data.name}: {str(e)}"
        logger.error(error_msg)
        stats.errors.append(error_msg)
        raise
def load_model(self, cursor, make_id: int, model_data: ModelData, mode: LoadMode, stats: LoadStatistics) -> int:
    """
    Load a single model with all related data

    In APPEND mode an existing (make_id, name) row is reused; otherwise, and
    always in CLEAR mode, a new row is inserted. Every year of the model is
    then loaded through ``load_model_year``.

    Args:
        cursor: Database cursor
        make_id: Parent make ID
        model_data: Extracted model data
        mode: Loading mode
        stats: Statistics accumulator

    Returns:
        Model ID in database
    """
    # 1. Insert or get model
    existing = None
    if mode == LoadMode.APPEND:
        cursor.execute(
            "SELECT id FROM vehicles.model WHERE make_id = %s AND name = %s",
            (make_id, model_data.name)
        )
        existing = cursor.fetchone()

    if existing:
        # Same row-decoding helper as the rest of the loader, so tuple and
        # dict cursors are handled identically everywhere.
        model_id = self._get_id_from_result(existing)
        stats.duplicate_models += 1
    else:
        cursor.execute(
            "INSERT INTO vehicles.model (make_id, name) VALUES (%s, %s) RETURNING id",
            (make_id, model_data.name)
        )
        model_id = self._get_id_from_result(cursor.fetchone())
        stats.models_inserted += 1

    # 2. Insert model years and related data (out-of-range years are skipped
    #    inside load_model_year, which then returns None)
    for year in model_data.years:
        self.load_model_year(cursor, model_id, year, model_data, mode, stats)

    return model_id
def load_model_year(self, cursor, model_id: int, year: int, model_data: ModelData, mode: LoadMode, stats: LoadStatistics) -> Optional[int]:
    """
    Load model year and associated trims/engines

    Args:
        cursor: Database cursor
        model_id: Parent model ID
        year: Model year
        model_data: Model data with trims and engines
        mode: Loading mode
        stats: Statistics accumulator

    Returns:
        Model year ID in database, or ``None`` when ``year`` is outside
        1950-2100 and the year was skipped
    """
    # Skip years that don't meet database constraints (must be 1950-2100)
    if year < 1950 or year > 2100:
        logger.warning(f"Skipping year {year} - outside valid range (1950-2100)")
        stats.skipped_model_years += 1
        return None

    # 1. Insert or get model year
    existing = None
    if mode == LoadMode.APPEND:
        cursor.execute(
            "SELECT id FROM vehicles.model_year WHERE model_id = %s AND year = %s",
            (model_id, year)
        )
        existing = cursor.fetchone()

    if existing:
        model_year_id = self._get_id_from_result(existing)
    else:
        cursor.execute(
            "INSERT INTO vehicles.model_year (model_id, year) VALUES (%s, %s) RETURNING id",
            (model_id, year)
        )
        model_year_id = self._get_id_from_result(cursor.fetchone())
        stats.model_years_inserted += 1

    # 2. Load engines and get their IDs
    engine_ids = [
        self.load_engine(cursor, engine_spec, mode, stats)
        for engine_spec in model_data.engines
    ]

    # 3. Load trims and connect to engines
    for trim_name in model_data.trims:
        self.load_trim(cursor, model_year_id, trim_name, engine_ids, mode, stats)

    return model_year_id
def load_engine(self, cursor, engine_spec: EngineSpec, mode: LoadMode, stats: LoadStatistics) -> int:
    """
    Load engine specification

    An engine is reused when a row with the same name (case-insensitive) or
    the same code already exists, regardless of ``mode``.

    Args:
        cursor: Database cursor
        engine_spec: Parsed engine specification
        mode: Loading mode (not consulted here)
        stats: Statistics accumulator

    Returns:
        Engine ID in database

    Raises:
        RuntimeError: If the insert was skipped by a conflict and the
            conflicting row cannot be found by name or code.
    """
    # Create a canonical engine name for database storage
    if engine_spec.displacement_l and engine_spec.configuration != "Unknown" and engine_spec.cylinders:
        engine_name = f"{engine_spec.displacement_l}L {engine_spec.configuration}{engine_spec.cylinders}"
    else:
        engine_name = engine_spec.raw_string

    # Generate engine code from name (remove spaces, lowercase)
    engine_code = engine_name.replace(" ", "").lower()

    lookup_sql = """
        SELECT id FROM vehicles.engine
        WHERE lower(name) = lower(%s) OR (code IS NOT NULL AND code = %s)
    """
    lookup_args = (engine_name, engine_code)

    # Always check for existing engine by name or code to avoid constraint violations
    cursor.execute(lookup_sql, lookup_args)
    existing = cursor.fetchone()
    if existing:
        engine_id = self._get_id_from_result(existing)
        stats.duplicate_engines += 1
        return engine_id

    # Insert new engine. ON CONFLICT DO NOTHING replaces catching a
    # duplicate-key error: after a failed statement PostgreSQL rejects all
    # further statements in the transaction, so a retry lookup issued after
    # the error could never run.
    cursor.execute("""
        INSERT INTO vehicles.engine (name, code, displacement_l, cylinders, fuel_type, aspiration)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT DO NOTHING
        RETURNING id
    """, (
        engine_name,
        engine_code,
        engine_spec.displacement_l,
        engine_spec.cylinders,
        engine_spec.fuel_type if engine_spec.fuel_type != "Unknown" else None,
        engine_spec.aspiration if engine_spec.aspiration != "Natural" else None
    ))
    inserted = cursor.fetchone()
    if inserted:
        engine_id = self._get_id_from_result(inserted)
        stats.engines_inserted += 1
        return engine_id

    # No row returned: the insert was skipped by a conflict, so look again.
    cursor.execute(lookup_sql, lookup_args)
    existing = cursor.fetchone()
    if existing:
        engine_id = self._get_id_from_result(existing)
        stats.duplicate_engines += 1
        return engine_id
    raise RuntimeError(f"Engine {engine_name} conflicted on insert but could not be found")
def load_trim(self, cursor, model_year_id: int, trim_name: str, engine_ids: List[int], mode: LoadMode, stats: LoadStatistics) -> int:
    """
    Load trim and connect to engines

    Args:
        cursor: Database cursor
        model_year_id: Parent model year ID
        trim_name: Trim name
        engine_ids: List of engine IDs to connect (duplicates are ignored)
        mode: Loading mode
        stats: Statistics accumulator

    Returns:
        Trim ID in database
    """
    # 1. Insert or get trim
    existing = None
    if mode == LoadMode.APPEND:
        cursor.execute(
            "SELECT id FROM vehicles.trim WHERE model_year_id = %s AND name = %s",
            (model_year_id, trim_name)
        )
        existing = cursor.fetchone()

    if existing:
        trim_id = self._get_id_from_result(existing)
    else:
        cursor.execute(
            "INSERT INTO vehicles.trim (model_year_id, name) VALUES (%s, %s) RETURNING id",
            (model_year_id, trim_name)
        )
        trim_id = self._get_id_from_result(cursor.fetchone())
        stats.trims_inserted += 1

    # 2. Connect trim to engines.
    # dict.fromkeys removes repeated IDs while keeping their first-seen order,
    # so the statements are issued in a reproducible sequence.
    for engine_id in dict.fromkeys(engine_ids):
        # ON CONFLICT DO NOTHING lets the database skip an existing mapping.
        # Catching a duplicate-key error instead would leave the transaction
        # aborted, and every later statement on this cursor would fail.
        cursor.execute(
            "INSERT INTO vehicles.trim_engine (trim_id, engine_id) VALUES (%s, %s) "
            "ON CONFLICT DO NOTHING",
            (trim_id, engine_id)
        )
        if cursor.rowcount > 0:
            stats.trim_engine_mappings_inserted += 1
        else:
            logger.debug(f"Trim-engine mapping ({trim_id}, {engine_id}) already exists, skipping")

    return trim_id
def load_all_makes(self, makes_data: List[MakeData], mode: LoadMode) -> LoadResult:
    """
    Load every extracted make and summarise the outcome.

    Makes that carry extraction errors are skipped, and makes whose load
    raises are recorded as failed; neither stops the remaining makes from
    being processed.

    Args:
        makes_data: List of extracted make data
        mode: Loading mode (clear/append)
    Returns:
        LoadResult with comprehensive statistics
    """
    logger.info(f"Starting bulk load of {len(makes_data)} makes in {mode.value} mode")

    # CLEAR mode empties the tables once, before the first make is loaded
    if mode == LoadMode.CLEAR:
        self.clear_all_tables()

    stats = LoadStatistics()
    failed_makes = []

    for entry in makes_data:
        try:
            if not entry.processing_errors:
                make_id = self.load_make(entry, mode, stats)
                logger.info(f"Successfully loaded make {entry.name} (ID: {make_id})")
            else:
                logger.warning(f"Skipping make {entry.name} due to extraction errors")
                stats.makes_skipped += 1
                failed_makes.append(entry.name)
        except Exception as exc:
            # One bad make must not abort the whole bulk load
            logger.error(f"Failed to load make {entry.name}: {str(exc)}")
            failed_makes.append(entry.name)

    # Fold the accumulated counters into the result object
    summary = LoadResult(
        total_makes=len(makes_data),
        total_models=stats.models_inserted,
        total_model_years=stats.model_years_inserted,
        total_trims=stats.trims_inserted,
        total_engines=stats.engines_inserted,
        total_trim_engine_mappings=stats.trim_engine_mappings_inserted,
        failed_makes=failed_makes,
        warnings=stats.warnings,
        load_mode=mode
    )
    logger.info(f"Bulk load complete: {summary.success_count}/{summary.total_makes} makes loaded successfully")
    logger.info(f"Data loaded: {summary.total_models} models, {summary.total_engines} engines, {summary.total_trims} trims")
    return summary
def get_database_statistics(self) -> Dict[str, int]:
    """
    Count the rows currently stored in each vehicles table.

    Returns:
        Dictionary mapping table name (without schema) to its row count
    """
    counts = {}
    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()
        # Table names are fixed literals, so interpolating them is safe here
        for table in ('make', 'model', 'model_year', 'trim', 'engine', 'trim_engine'):
            cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table}")
            row = cursor.fetchone()
            # Tuple-style cursors are indexed by position, anything else by the 'count' key
            if isinstance(row, tuple):
                counts[table] = row[0]
            else:
                counts[table] = row['count']
    return counts
def validate_referential_integrity(self) -> List[str]:
    """
    Validate referential integrity of loaded data.

    Runs four COUNT(*) queries over one connection, each counting child rows
    whose parent row is missing, and reports every check with a non-zero count.

    Returns:
        List of integrity issues found (empty if all good)
    """
    # (description used in the message, query counting the offending rows)
    checks = (
        ("orphaned models", """
            SELECT COUNT(*) FROM vehicles.model m
            LEFT JOIN vehicles.make mk ON m.make_id = mk.id
            WHERE mk.id IS NULL
        """),
        ("orphaned model_years", """
            SELECT COUNT(*) FROM vehicles.model_year my
            LEFT JOIN vehicles.model m ON my.model_id = m.id
            WHERE m.id IS NULL
        """),
        ("orphaned trims", """
            SELECT COUNT(*) FROM vehicles.trim t
            LEFT JOIN vehicles.model_year my ON t.model_year_id = my.id
            WHERE my.id IS NULL
        """),
        ("broken trim_engine mappings", """
            SELECT COUNT(*) FROM vehicles.trim_engine te
            LEFT JOIN vehicles.trim t ON te.trim_id = t.id
            LEFT JOIN vehicles.engine e ON te.engine_id = e.id
            WHERE t.id IS NULL OR e.id IS NULL
        """),
    )

    issues = []
    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()
        for description, query in checks:
            cursor.execute(query)
            # Every check reads a COUNT(*) row, so the 'count' column is named
            # explicitly each time instead of relying on the helper's default
            # (presumably 'id' — verify against _get_id_from_result).
            found = self._get_id_from_result(cursor.fetchone(), 'count')
            if found > 0:
                issues.append(f"Found {found} {description}")

    if issues:
        logger.warning(f"Referential integrity issues found: {issues}")
    else:
        logger.info("Referential integrity validation passed")
    return issues
def print_load_report(self, result: LoadResult) -> None:
    """
    Write a human-readable load report to stdout.

    The report covers the load summary, inserted-row counts, failed makes,
    up to five warnings, live table counts and a referential-integrity check.
    The last two sections query the database.

    Args:
        result: LoadResult from load operation
    """
    print("🚀 JSON MANUAL LOADING REPORT")
    print("=" * 50)

    # Load summary
    print("\n📊 LOADING SUMMARY")
    print(f" Mode: {result.load_mode.value.upper()}")
    print(f" Makes processed: {result.success_count}/{result.total_makes}")
    print(f" Success rate: {result.success_rate:.1%}")

    # Data counts, read lazily in display order
    print("\n📈 DATA LOADED")
    for label, attribute in (
        ("Models", "total_models"),
        ("Model years", "total_model_years"),
        ("Trims", "total_trims"),
        ("Engines", "total_engines"),
        ("Trim-engine mappings", "total_trim_engine_mappings"),
    ):
        print(f" {label}: {getattr(result, attribute)}")

    # Issues
    if result.failed_makes:
        print(f"\n⚠️ FAILED MAKES ({len(result.failed_makes)}):")
        for failed in result.failed_makes:
            print(f" {failed}")

    if result.warnings:
        print(f"\n⚠️ WARNINGS ({len(result.warnings)}):")
        for message in result.warnings[:5]:  # Show first 5
            print(f" {message}")
        remaining = len(result.warnings) - 5
        if remaining > 0:
            print(f" ... and {remaining} more warnings")

    # Database statistics
    print("\n📋 DATABASE STATISTICS:")
    for table, count in self.get_database_statistics().items():
        print(f" vehicles.{table}: {count:,} records")

    # Referential integrity
    integrity_issues = self.validate_referential_integrity()
    if not integrity_issues:
        print("\n✅ REFERENTIAL INTEGRITY: PASSED")
    else:
        print("\n❌ REFERENTIAL INTEGRITY ISSUES:")
        for issue in integrity_issues:
            print(f" {issue}")
# Example usage and testing functions
def example_usage():
    """Demonstrate JsonManualLoader usage"""
    print("🚀 JsonManualLoader Example Usage")
    print("=" * 40)

    # This would typically be called after JsonExtractor.
    # For demo purposes only the structure is shown; nothing is loaded.
    print("\n📋 Typical usage flow:")
    for step in (
        "1. Extract data with JsonExtractor",
        "2. Create JsonManualLoader",
        "3. Load data in APPEND or CLEAR mode",
        "4. Validate and report results",
    ):
        print(step)

    print("\n💡 Example code:")
    # Leading and trailing empty entries reproduce the blank line on each side
    snippet = (
        "",
        "# Extract data",
        "extractor = JsonExtractor(make_mapper, engine_parser)",
        "extraction_result = extractor.extract_all_makes('sources/makes')",
        "# Load data",
        "loader = JsonManualLoader()",
        "load_result = loader.load_all_makes(extraction_result.makes, LoadMode.APPEND)",
        "# Report results",
        "loader.print_load_report(load_result)",
        "",
    )
    print("\n".join(snippet))
# Print the usage walkthrough only when this module is executed directly,
# never as a side effect of importing it.
if __name__ == "__main__":
    example_usage()

View File

@@ -0,0 +1,437 @@
#!/usr/bin/env python3
"""
MSSQL Database Loader
Handles loading .bak files into MSSQL Server for ETL processing
"""
import os
import logging
import pyodbc
import time
from pathlib import Path
from typing import Optional, List
from ..config import config
logger = logging.getLogger(__name__)
class MSSQLLoader:
    """Loads database files into MSSQL Server

    Connection settings come from the project ``config`` object. Administrative
    statements (existence checks, DROP, RESTORE) run over connections to the
    ``master`` database; backups are handed to the server through a directory
    shared with the MSSQL container.
    """

    # Shared volume as seen by this process
    SHARED_DIR = Path("/app/shared")
    # The backup directory as seen from the MSSQL container
    CONTAINER_BACKUP_DIR = "/backups"
    # Where restored data/log files are written, as seen from the MSSQL container
    CONTAINER_DATA_DIR = "/var/opt/mssql/data"

    def __init__(self):
        self.server = config.MSSQL_HOST
        self.port = config.MSSQL_PORT
        self.database = config.MSSQL_DATABASE
        self.username = config.MSSQL_USER
        self.password = config.MSSQL_PASSWORD

    # ------------------------------------------------------------------
    # Small helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _quote_literal(value) -> str:
        """Escape ``value`` for use inside a single-quoted T-SQL string literal."""
        return str(value).replace("'", "''")

    @staticmethod
    def _quote_identifier(name) -> str:
        """Escape ``name`` for use inside a [bracket-delimited] T-SQL identifier."""
        return str(name).replace("]", "]]")

    def _connect(self, database: str = "master", timeout: int = 30):
        """Open a pyodbc connection to ``database``; ``timeout`` is passed to pyodbc.connect."""
        return pyodbc.connect(self.get_connection_string(database), timeout=timeout)

    def _build_kill_sessions_sql(self, database_name: str) -> str:
        """Return a T-SQL batch that KILLs every other session using ``database_name``."""
        return f"""
            DECLARE @db sysname = N'{self._quote_literal(database_name)}';
            DECLARE @kill nvarchar(max) = N'';
            SELECT @kill = @kill + N'KILL ' + CONVERT(nvarchar(10), session_id) + N';'
            FROM sys.dm_exec_sessions
            WHERE database_id = DB_ID(@db) AND session_id <> @@SPID;
            IF LEN(@kill) > 0 EXEC (@kill);
        """

    def _build_restore_sql(self, target_database: str, container_bak_path: str,
                           data_file: str, log_file: Optional[str] = None) -> str:
        """Return the RESTORE DATABASE statement that relocates the backup's files."""
        quote = self._quote_literal
        file_stem = quote(f"{self.CONTAINER_DATA_DIR}/{target_database}")
        options = [f"MOVE '{quote(data_file)}' TO '{file_stem}.mdf'"]
        if log_file:
            options.append(f"MOVE '{quote(log_file)}' TO '{file_stem}.ldf'")
        options.extend(["REPLACE", "RECOVERY", "STATS = 10"])
        return (
            f"RESTORE DATABASE [{self._quote_identifier(target_database)}]\n"
            f"FROM DISK = '{quote(container_bak_path)}'\n"
            "WITH\n    " + ",\n    ".join(options)
        )

    # ------------------------------------------------------------------
    # Connection and metadata
    # ------------------------------------------------------------------
    def get_connection_string(self, database: str = "master") -> str:
        """Get MSSQL connection string"""
        return (
            f"DRIVER={{ODBC Driver 17 for SQL Server}};"
            f"SERVER={self.server},{self.port};"
            f"DATABASE={database};"
            f"UID={self.username};"
            f"PWD={self.password};"
            f"TrustServerCertificate=yes;"
        )

    def test_connection(self) -> bool:
        """Test MSSQL connection

        Returns:
            True when ``SELECT @@VERSION`` succeeds, False on any error (logged).
        """
        try:
            logger.info(f"Testing MSSQL connection to: {self.server}")
            with self._connect() as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT @@VERSION")
                version = cursor.fetchone()[0]
                logger.info(f"MSSQL connection successful: {version[:100]}...")
                return True
        except Exception as e:
            logger.error(f"MSSQL connection failed: {e}")
            return False

    def database_exists(self, database_name: str) -> bool:
        """Check if database exists

        Returns False both when the database is absent and when the check
        itself fails (the failure is logged).
        """
        try:
            with self._connect() as conn:
                cursor = conn.cursor()
                cursor.execute(
                    "SELECT COUNT(*) FROM sys.databases WHERE name = ?",
                    (database_name,)
                )
                return cursor.fetchone()[0] > 0
        except Exception as e:
            logger.error(f"Failed to check if database exists: {e}")
            return False

    def get_database_state(self, database_name: str) -> Optional[str]:
        """Return the state_desc for a database or None if not found"""
        try:
            with self._connect() as conn:
                cursor = conn.cursor()
                cursor.execute(
                    "SELECT state_desc FROM sys.databases WHERE name = ?",
                    (database_name,)
                )
                row = cursor.fetchone()
                return row[0] if row else None
        except Exception as e:
            logger.error(f"Failed to get database state: {e}")
            return None

    def drop_database(self, database_name: str) -> bool:
        """Drop database if it exists

        Returns:
            True when the database is gone afterwards (including when it never
            existed), False when the drop failed (logged).
        """
        try:
            if not self.database_exists(database_name):
                logger.info(f"Database {database_name} does not exist, skipping drop")
                return True

            logger.info(f"Dropping database: {database_name}")
            db_ident = self._quote_identifier(database_name)
            with self._connect() as conn:
                # These statements are run with autocommit on, outside a transaction
                conn.autocommit = True
                cursor = conn.cursor()
                # SINGLE_USER WITH ROLLBACK IMMEDIATE disconnects existing sessions
                cursor.execute(f"""
                    ALTER DATABASE [{db_ident}] SET SINGLE_USER WITH ROLLBACK IMMEDIATE;
                    DROP DATABASE [{db_ident}];
                """)
            logger.info(f"Successfully dropped database: {database_name}")
            return True
        except Exception as e:
            logger.error(f"Failed to drop database {database_name}: {e}")
            return False

    def get_backup_file_info(self, bak_path: Path) -> Optional[dict]:
        """Get information about backup file

        Only the file name of ``bak_path`` is used; the file is assumed to be
        present already in the container's backup directory.

        Returns:
            Dict built from the first RESTORE HEADERONLY row, or None when the
            backup has no header rows or the query fails.
        """
        try:
            # Use the MSSQL container's mounted backup directory
            container_path = f"{self.CONTAINER_BACKUP_DIR}/{bak_path.name}"
            with self._connect() as conn:
                cursor = conn.cursor()
                # The path is embedded as a literal, so quotes in the file name are escaped
                cursor.execute(
                    f"RESTORE HEADERONLY FROM DISK = '{self._quote_literal(container_path)}'"
                )
                headers = cursor.fetchall()
                if headers:
                    header = headers[0]
                    return {
                        "database_name": header.DatabaseName,
                        "server_name": header.ServerName,
                        "backup_start_date": header.BackupStartDate,
                        "backup_finish_date": header.BackupFinishDate,
                        "backup_size": header.BackupSize,
                    }
        except Exception as e:
            logger.warning(f"Could not get backup file info: {e}")
        return None

    # ------------------------------------------------------------------
    # Restore
    # ------------------------------------------------------------------
    def restore_database(self, bak_path: Path, target_database: Optional[str] = None) -> bool:
        """
        Restore database from .bak file

        Steps: copy the backup into the shared volume, take exclusive access
        if the target already exists, RESTORE with the data/log files moved
        into the container data directory, wait for the database to come
        ONLINE, log basic info and switch it back to MULTI_USER.

        Args:
            bak_path: Path to .bak file
            target_database: Target database name (defaults to the configured database)
        Returns:
            True if successful, False otherwise (the reason is logged)
        """
        if target_database is None:
            target_database = self.database

        if not bak_path.exists():
            logger.error(f"Backup file does not exist: {bak_path}")
            return False

        logger.info(f"Starting database restore: {bak_path} -> {target_database}")
        try:
            # Copy backup file to MSSQL container
            container_bak_path = self.copy_backup_to_container(bak_path)
            if not container_bak_path:
                logger.error("Failed to copy backup file to container")
                return False

            # If database exists, note the state; exclusivity is handled in the restore session
            target_exists = self.database_exists(target_database)
            if target_exists:
                state = self.get_database_state(target_database)
                logger.info(f"Existing database detected: {target_database} (state={state})")
            else:
                logger.info(f"Target database does not exist yet: {target_database} — proceeding with restore")

            # Built up front so the retry path below can always use them,
            # even when the database did not exist at the first check.
            kill_sql = self._build_kill_sessions_sql(target_database)
            db_ident = self._quote_identifier(target_database)
            single_user_sql = f"ALTER DATABASE [{db_ident}] SET SINGLE_USER WITH ROLLBACK IMMEDIATE;"
            bak_literal = self._quote_literal(container_bak_path)

            # Restore database using a single master connection for exclusivity
            logger.info(f"Restoring database from: {container_bak_path}")
            # NOTE(review): timeout=600 is handed to pyodbc.connect; confirm whether it
            # bounds the RESTORE statement itself or only connection establishment.
            with self._connect(timeout=600) as conn:
                conn.autocommit = True
                cursor = conn.cursor()

                # If DB exists, ensure exclusive access: kill sessions + SINGLE_USER in this session
                if target_exists:
                    try:
                        logger.info(f"Preparing exclusive access for restore: killing active sessions on {target_database}")
                        cursor.execute(kill_sql)
                        cursor.execute(single_user_sql)
                        logger.info(f"Exclusive access prepared (SINGLE_USER) for {target_database}")
                    except Exception as e:
                        # Best effort: the RESTORE below retries once on an exclusivity error
                        logger.warning(f"Could not fully prepare exclusive access: {e}")

                # Get logical file names from backup
                cursor.execute(f"RESTORE FILELISTONLY FROM DISK = '{bak_literal}'")
                files = cursor.fetchall()
                if not files:
                    logger.error("No files found in backup")
                    return False

                # NOTE(review): only one data file and one log file are relocated (the last
                # of each type listed); a multi-file backup would need one MOVE per file.
                data_file = None
                log_file = None
                for file_info in files:
                    if file_info.Type == 'D':  # Data file
                        data_file = file_info.LogicalName
                    elif file_info.Type == 'L':  # Log file
                        log_file = file_info.LogicalName

                if not data_file:
                    logger.error("No data file found in backup")
                    return False

                restore_sql = self._build_restore_sql(
                    target_database, container_bak_path, data_file, log_file
                )
                logger.info(f"Executing restore command for database: {target_database}")
                logger.debug(f"Restore SQL: {restore_sql}")

                try:
                    cursor.execute(restore_sql)
                except Exception as e:
                    # Only an exclusive-access failure is retried, once, after
                    # killing sessions again
                    if 'Exclusive access could not be obtained' not in str(e):
                        raise
                    logger.warning("Exclusive access error on RESTORE; retrying after killing sessions and reasserting SINGLE_USER...")
                    try:
                        cursor.execute(kill_sql)
                        cursor.execute(single_user_sql)
                    except Exception as e2:
                        logger.warning(f"Retry exclusive prep failed: {e2}")
                    cursor.execute(restore_sql)

                # Poll for database to be ONLINE
                if not self._wait_for_database_online(target_database):
                    logger.error(f"Database did not come ONLINE in time: {target_database}")
                    return False

                # Small retry around database_exists to handle late readiness
                if not self._retry_database_exists(target_database):
                    logger.error(f"Database restore failed - database not found: {target_database}")
                    return False

                logger.info(f"Database restore successful and ONLINE: {target_database}")

                # Get basic database info (bound parameter, like the other sys.databases lookups)
                cursor.execute(
                    """
                    SELECT
                        name,
                        create_date,
                        compatibility_level,
                        state_desc
                    FROM sys.databases
                    WHERE name = ?
                    """,
                    (target_database,)
                )
                db_info = cursor.fetchone()
                if db_info:
                    logger.info(f"Database info: Name={db_info.name}, Created={db_info.create_date}, Level={db_info.compatibility_level}, State={db_info.state_desc}")

                # Optional: quick content verification with small retry window
                if not self._retry_verify_content(target_database):
                    logger.warning("Database restored but content verification is inconclusive")

                # Try to set MULTI_USER back in same session
                try:
                    cursor.execute(f"ALTER DATABASE [{db_ident}] SET MULTI_USER;")
                    logger.info(f"Set {target_database} back to MULTI_USER")
                except Exception as e:
                    logger.warning(f"Could not set MULTI_USER on {target_database}: {e}")
                return True
        except Exception as e:
            logger.error(f"Database restore failed: {e}")
            return False

    def copy_backup_to_container(self, bak_path: Path) -> Optional[str]:
        """
        Copy backup file to shared volume accessible by MSSQL container

        Args:
            bak_path: Local path to .bak file
        Returns:
            Container path to .bak file or None if failed
        """
        try:
            # Use shared volume instead of docker cp
            shared_dir = Path(self.SHARED_DIR)
            shared_bak_path = shared_dir / bak_path.name

            # If the file is already in the shared dir, skip copying
            if bak_path.resolve().parent == shared_dir.resolve():
                logger.info(f"Backup already in shared volume: {bak_path}")
            else:
                logger.info(f"Copying {bak_path} to shared volume...")
                import shutil
                shutil.copy2(bak_path, shared_bak_path)

            # Container path from MSSQL perspective
            container_path = f"{self.CONTAINER_BACKUP_DIR}/{shared_bak_path.name}"
            logger.info(f"Successfully copied to shared volume: {container_path}")
            return container_path
        except Exception as e:
            logger.error(f"Failed to copy backup to shared volume: {e}")
            return None

    # ------------------------------------------------------------------
    # Post-restore polling and verification
    # ------------------------------------------------------------------
    def _wait_for_database_online(self, database_name: str, timeout_seconds: int = 600, interval_seconds: int = 5) -> bool:
        """Poll MSSQL until the specified database state becomes ONLINE or timeout.

        Returns True if ONLINE, False on timeout/error.
        """
        logger.info(f"Waiting for database to become ONLINE: {database_name}")
        # monotonic clock: the deadline is unaffected by wall-clock adjustments
        deadline = time.monotonic() + timeout_seconds
        last_state = None
        try:
            while time.monotonic() < deadline:
                with self._connect() as conn:
                    cursor = conn.cursor()
                    cursor.execute("SELECT state_desc FROM sys.databases WHERE name = ?", (database_name,))
                    row = cursor.fetchone()
                    if row:
                        state = row[0]
                        if state != last_state:
                            # Log transitions only, not every poll
                            logger.info(f"Database state: {state}")
                            last_state = state
                        if state == 'ONLINE':
                            # Optional: report updateability; failure here is not fatal
                            try:
                                cursor.execute("SELECT DATABASEPROPERTYEX(?, 'Updateability')", (database_name,))
                                up = cursor.fetchone()[0]
                                logger.info(f"Database updateability: {up}")
                            except Exception as e:
                                logger.debug(f"Could not read database updateability: {e}")
                            return True
                    else:
                        logger.info("Database entry not found yet in sys.databases")
                time.sleep(interval_seconds)
        except Exception as e:
            logger.error(f"Error while waiting for database ONLINE: {e}")
            return False
        logger.error("Timed out waiting for database to become ONLINE")
        return False

    def _retry_database_exists(self, database_name: str, attempts: int = 6, delay_seconds: int = 5) -> bool:
        """Retry wrapper for database existence checks."""
        for i in range(1, attempts + 1):
            if self.database_exists(database_name):
                return True
            logger.info(f"database_exists() false, retrying ({i}/{attempts})...")
            # No point in waiting after the final attempt
            if i < attempts:
                time.sleep(delay_seconds)
        return False

    def _retry_verify_content(self, database_name: str, attempts: int = 3, delay_seconds: int = 5) -> bool:
        """Retry wrapper around verify_database_content to allow late readiness."""
        for i in range(1, attempts + 1):
            try:
                counts = self.verify_database_content(database_name)
                if counts:
                    logger.info(f"Content verification counts: {counts}")
                    return True
            except Exception as e:
                logger.info(f"Content verification attempt {i} failed: {e}")
            # No point in waiting after the final attempt
            if i < attempts:
                time.sleep(delay_seconds)
        return False

    def verify_database_content(self, database_name: Optional[str] = None) -> dict:
        """
        Verify database has expected content

        Counts the rows of a fixed set of key tables. A table that cannot be
        counted is reported as 0; a connection failure yields an empty dict.

        Returns:
            Dictionary with table counts
        """
        if database_name is None:
            database_name = self.database
        try:
            with self._connect(database_name) as conn:
                cursor = conn.cursor()
                # Get table counts for key tables (fixed literals, safe to interpolate)
                counts = {}
                for table in ('Make', 'Model', 'VehicleType', 'Manufacturer'):
                    try:
                        cursor.execute(f"SELECT COUNT(*) FROM {table}")
                        count = cursor.fetchone()[0]
                        counts[table] = count
                        logger.info(f"Table {table}: {count:,} rows")
                    except Exception as e:
                        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
                        logger.debug(f"Could not count rows in {table}: {e}")
                        counts[table] = 0
                return counts
        except Exception as e:
            logger.error(f"Failed to verify database content: {e}")
            return {}

View File

@@ -0,0 +1,354 @@
import logging
from typing import List, Dict, Optional
from psycopg2.extras import execute_batch
from ..connections import db_connections
from tqdm import tqdm
logger = logging.getLogger(__name__)
class PostgreSQLLoader:
    """Load data into PostgreSQL target database

    Every method opens its own connection through ``db_connections``, inserts
    with ``execute_batch`` in pages of ``batch_size`` rows and commits.
    """

    # Column mapping from source (MS SQL) to target (PostgreSQL).
    # Source columns not listed here fall back to their lower-cased name.
    COLUMN_MAPPINGS = {
        'Id': 'id',
        'Name': 'name',
        'Code': 'code',
        'MakeId': 'make_id',
        'CreateOn': 'created_at',
        'CreatedOn': 'created_at',
        'UpdateOn': 'updated_at',
        'UpdatedOn': 'updated_at',
        'Wmi': 'wmi',
        'ManufacturerId': 'manufacturer_id',
        'VehicleTypeId': 'vehicle_type_id',
        'TruckTypeId': 'truck_type_id',
        'CountryId': 'country_id',
        'PublicAvailabilityDate': 'public_availability_date',
        'NonCompliant': 'non_compliant',
        'NonCompliantReason': 'non_compliant_reason',
        'ProcessedOn': 'processed_on',
        'DisplayOrder': 'display_order',
        'FormType': 'form_type',
        'Description': 'description',
        'LookupTable': 'lookup_table',
        'IsPrivate': 'is_private',
        'GroupName': 'group_name',
        'DataType': 'data_type',
        'MinAllowedValue': 'min_allowed_value',
        'MaxAllowedValue': 'max_allowed_value',
        'IsQS': 'is_qs',
        'Decode': 'decode',
        'weight': 'weight',
        # ErrorCode specific mappings
        'ErrorCodeName': 'code',
        'ErrorCodeDescription': 'description'
    }

    def __init__(self):
        self.batch_size = 1000

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _count_from_row(row) -> int:
        """Read COUNT(*) from a row returned by a dict-style or tuple-style cursor."""
        if isinstance(row, dict) and 'count' in row:
            return row['count']
        return row[0]

    @classmethod
    def _map_columns(cls, source_columns: List[str]) -> List[str]:
        """Translate source column names to target column names, preserving order."""
        return [cls.COLUMN_MAPPINGS.get(col, col.lower()) for col in source_columns]

    def _insert_rows(self, insert_sql: str, rows: List[tuple]) -> None:
        """Batch-insert ``rows`` with ``insert_sql`` and commit."""
        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()
            execute_batch(cursor, insert_sql, rows, page_size=self.batch_size)
            conn.commit()

    def _replace_table_rows(self, table_name: str, insert_sql: str, rows: List[tuple]) -> int:
        """Truncate ``vehicles.<table_name>``, batch-insert ``rows``, commit and return the row count.

        ``table_name`` is interpolated into SQL, so callers must pass a
        trusted, fixed table name.
        """
        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(f"TRUNCATE TABLE vehicles.{table_name} CASCADE")
            execute_batch(cursor, insert_sql, rows, page_size=self.batch_size)
            conn.commit()
            cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table_name}")
            return self._count_from_row(cursor.fetchone())

    # ------------------------------------------------------------------
    # Reference tables
    # ------------------------------------------------------------------
    def load_reference_table(self, table_name: str, data: List[Dict],
                             clear_existing: bool = True) -> int:
        """Load data into a reference table

        The keys of the first record define the source columns. They are
        mapped to target names, and only columns that exist in
        ``vehicles.<table_name>`` are inserted.

        Args:
            table_name: Target table inside the ``vehicles`` schema. It is
                interpolated into SQL, so it must be a trusted name.
            data: Source records, one dict per row
            clear_existing: TRUNCATE ... CASCADE the table before inserting
        Returns:
            Row count of the table after loading, or 0 when there was nothing
            to load or no column matched.
        """
        if not data:
            logger.warning(f"No data to load for table {table_name}")
            return 0

        logger.info(f"Loading {len(data)} records into vehicles.{table_name}")

        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()
            if clear_existing:
                cursor.execute(f"TRUNCATE TABLE vehicles.{table_name} CASCADE")
                logger.info(f"Cleared existing data from vehicles.{table_name}")

            # Get source columns and map them to target columns
            source_columns = list(data[0].keys())
            target_columns = self._map_columns(source_columns)

            # Check which columns exist in target table (table name bound, not interpolated)
            cursor.execute(
                """
                SELECT column_name
                FROM information_schema.columns
                WHERE table_schema = 'vehicles' AND table_name = %s
                """,
                (table_name,)
            )
            existing_columns = {
                row['column_name'] if isinstance(row, dict) else row[0]
                for row in cursor.fetchall()
            }

            # Keep only columns present in the target. Two source columns can map
            # to the same target (e.g. CreateOn/CreatedOn); listing a column twice
            # makes the INSERT invalid, so the first one wins.
            selected = []  # (source column, target column)
            seen_targets = set()
            for source_col, target_col in zip(source_columns, target_columns):
                if target_col not in existing_columns:
                    continue
                if target_col in seen_targets:
                    logger.warning(
                        f"Ignoring source column {source_col}: target column {target_col} is already populated"
                    )
                    continue
                seen_targets.add(target_col)
                selected.append((source_col, target_col))

            if not selected:
                logger.warning(f"No matching columns found for table {table_name}")
                return 0

            column_str = ','.join(target_col for _, target_col in selected)
            placeholders = ','.join(['%s'] * len(selected))

            # Prepare insert query
            query = f"""
                INSERT INTO vehicles.{table_name} ({column_str})
                VALUES ({placeholders})
                ON CONFLICT DO NOTHING
            """

            # error_codes rows without a code are dropped instead of inserted
            required_sources = ('ErrorCodeName', 'Code') if table_name == 'error_codes' else ()

            # Prepare data tuples with only valid columns
            data_tuples = []
            for record in data:
                values = []
                for source_col, _ in selected:
                    value = record[source_col]
                    if source_col in required_sources and (value is None or value == ''):
                        break  # skip this record
                    values.append(value)
                else:
                    data_tuples.append(tuple(values))

            # Execute batch insert
            execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
            conn.commit()

            # Get final count
            cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table_name}")
            final_count = self._count_from_row(cursor.fetchone())
            logger.info(f"Successfully loaded {final_count} records into vehicles.{table_name}")
            return final_count

    # ------------------------------------------------------------------
    # Relationship tables that are replaced wholesale
    # ------------------------------------------------------------------
    def load_wmi_vin_schema_mappings(self, mappings: List[Dict]) -> int:
        """Load WMI to VIN Schema mappings

        Replaces the content of ``vehicles.wmi_vin_schemas``. A falsy
        ``YearFrom``/``YearTo`` becomes 1980/2999 respectively.

        Returns:
            Row count of the table after loading (0 for empty input).
        """
        if not mappings:
            return 0
        logger.info(f"Loading {len(mappings)} WMI-VinSchema mappings")
        query = """
            INSERT INTO vehicles.wmi_vin_schemas
            (wmi_id, vin_schema_id, year_from, year_to)
            VALUES (%s, %s, %s, %s)
            ON CONFLICT DO NOTHING
        """
        data_tuples = [
            (
                mapping['WmiId'],
                mapping['VinSchemaId'],
                mapping['YearFrom'] or 1980,
                mapping['YearTo'] or 2999
            )
            for mapping in mappings
        ]
        final_count = self._replace_table_rows('wmi_vin_schemas', query, data_tuples)
        logger.info(f"Successfully loaded {final_count} WMI-VinSchema mappings")
        return final_count

    def load_make_model_relationships(self, relationships: List[Dict]) -> int:
        """Load Make-Model relationships

        Replaces the content of ``vehicles.make_models``.

        Returns:
            Row count of the table after loading (0 for empty input).
        """
        if not relationships:
            return 0
        logger.info(f"Loading {len(relationships)} Make-Model relationships")
        query = """
            INSERT INTO vehicles.make_models (make_id, model_id)
            VALUES (%s, %s)
            ON CONFLICT DO NOTHING
        """
        data_tuples = [(rel['MakeId'], rel['ModelId']) for rel in relationships]
        final_count = self._replace_table_rows('make_models', query, data_tuples)
        logger.info(f"Successfully loaded {final_count} Make-Model relationships")
        return final_count

    def load_wmi_make_relationships(self, relationships: List[Dict]) -> int:
        """Load WMI-Make relationships

        Replaces the content of ``vehicles.wmi_makes``.

        Returns:
            Row count of the table after loading (0 for empty input).
        """
        if not relationships:
            return 0
        logger.info(f"Loading {len(relationships)} WMI-Make relationships")
        query = """
            INSERT INTO vehicles.wmi_makes (wmi_id, make_id)
            VALUES (%s, %s)
            ON CONFLICT DO NOTHING
        """
        data_tuples = [(rel['WmiId'], rel['MakeId']) for rel in relationships]
        final_count = self._replace_table_rows('wmi_makes', query, data_tuples)
        logger.info(f"Successfully loaded {final_count} WMI-Make relationships")
        return final_count

    # ------------------------------------------------------------------
    # Append-only loaders (existing rows are kept, conflicts are skipped)
    # ------------------------------------------------------------------
    def load_model_years(self, model_years: List[Dict]) -> int:
        """Load model year availability data

        Returns:
            Number of records submitted, not the number actually inserted.
        """
        if not model_years:
            return 0
        logger.info(f"Loading {len(model_years)} model year records")
        query = """
            INSERT INTO vehicles.model_year (model_id, year)
            VALUES (%s, %s)
            ON CONFLICT (model_id, year) DO NOTHING
        """
        self._insert_rows(query, [(my['model_id'], my['year']) for my in model_years])
        return len(model_years)

    def load_trims(self, trims: List[Dict]) -> int:
        """Load trim data

        Returns:
            Number of records submitted, not the number actually inserted.
        """
        if not trims:
            return 0
        logger.info(f"Loading {len(trims)} trim records")
        query = """
            INSERT INTO vehicles.trim (model_year_id, name)
            VALUES (%s, %s)
            ON CONFLICT DO NOTHING
        """
        self._insert_rows(query, [(t['model_year_id'], t['name']) for t in trims])
        return len(trims)

    def load_engines(self, engines: List[Dict]) -> int:
        """Load engine data

        Only ``name`` is required in each record; the other fields default to NULL.

        Returns:
            Number of records submitted, not the number actually inserted.
        """
        if not engines:
            return 0
        logger.info(f"Loading {len(engines)} engine records")
        # Batched like the sibling loaders; no RETURNING because the ids were never read
        query = """
            INSERT INTO vehicles.engine (name, code, displacement_l, cylinders, fuel_type, aspiration)
            VALUES (%s, %s, %s, %s, %s, %s)
            ON CONFLICT (lower(name)) DO NOTHING
        """
        data_tuples = [
            (
                engine['name'],
                engine.get('code'),
                engine.get('displacement_l'),
                engine.get('cylinders'),
                engine.get('fuel_type'),
                engine.get('aspiration')
            )
            for engine in engines
        ]
        self._insert_rows(query, data_tuples)
        return len(engines)

    def load_trim_engine_relationships(self, relationships: List[Dict]) -> int:
        """Load trim-engine relationships

        Returns:
            Number of records submitted, not the number actually inserted.
        """
        if not relationships:
            return 0
        logger.info(f"Loading {len(relationships)} trim-engine relationships")
        query = """
            INSERT INTO vehicles.trim_engine (trim_id, engine_id)
            VALUES (%s, %s)
            ON CONFLICT (trim_id, engine_id) DO NOTHING
        """
        self._insert_rows(query, [(rel['trim_id'], rel['engine_id']) for rel in relationships])
        return len(relationships)

    def get_table_count(self, table_name: str) -> int:
        """Get count of records in a table

        ``table_name`` is interpolated into SQL, so it must be a trusted name.
        """
        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table_name}")
            return self._count_from_row(cursor.fetchone())

View File

@@ -0,0 +1,348 @@
#!/usr/bin/env python3
import logging
import sys
import os
from datetime import datetime
from pathlib import Path
import click
from .config import config
from .utils.logging import setup_logging
from .scheduler import start_etl_scheduler
from .pipeline import run_etl_pipeline
from .connections import test_connections
logger = logging.getLogger(__name__)

# Import manual JSON processing components.
# They are optional: when any of them is missing, the remaining commands must
# keep working, so every imported name is bound to None instead of being left
# undefined. load_manual checks ManualJsonPipeline for None before using them.
try:
    from .pipelines.manual_json_pipeline import ManualJsonPipeline, PipelineConfig, default_progress_callback
    from .loaders.json_manual_loader import LoadMode
    from .utils.make_name_mapper import MakeNameMapper
    from .utils.engine_spec_parser import EngineSpecParser
    from .extractors.json_extractor import JsonExtractor
except ImportError as e:
    # Handle import errors gracefully for existing functionality
    ManualJsonPipeline = PipelineConfig = default_progress_callback = None
    LoadMode = MakeNameMapper = EngineSpecParser = JsonExtractor = None
    logger.warning(f"Manual JSON processing components not available: {e}")
@click.group()
def cli():
    """MVP Platform Vehicles ETL Tool"""
    # The docstring above is the CLI help text. This group callback configures
    # logging from config.LOG_LEVEL for the commands registered on the group.
    setup_logging(config.LOG_LEVEL)
@cli.command()
def build_catalog():
    """Build vehicle catalog from source database"""
    # A falsy pipeline result becomes exit status 1 for the calling process
    if not run_etl_pipeline():
        sys.exit(1)
@cli.command()
def schedule():
    """Start ETL scheduler (default mode)"""
    # Hands control to the scheduler; its return value is not inspected here.
    start_etl_scheduler()
@cli.command()
@click.option('--full', is_flag=True, help='Full reload instead of incremental')
def update(full):
    """Run ETL update"""
    logger.info(f"Starting ETL update (full={full})")
    # NOTE(review): `full` is only logged; the pipeline is invoked the same way
    # for both values. Confirm whether run_etl_pipeline should receive it.
    if not run_etl_pipeline():
        sys.exit(1)
@cli.command()
def test():
    """Test database connections"""
    if test_connections():
        logger.info("All connections tested successfully")
        return
    # Any failed connection is reported and turned into exit status 1
    logger.error("Connection test failed")
    sys.exit(1)
def _echo_extraction_summary(extraction_result):
    """Print the dry-run (extraction only) summary to stdout."""
    click.echo(f"\n✅ Extraction Validation Complete")
    click.echo(f" Files processed: {extraction_result.total_files_processed}")
    click.echo(f" Success rate: {extraction_result.success_rate:.1%}")
    click.echo(f" Models extracted: {extraction_result.total_models:,}")
    click.echo(f" Engines extracted: {extraction_result.total_engines:,}")
    click.echo(f" Electric models: {extraction_result.total_electric_models:,}")
    if extraction_result.failed_extractions > 0:
        click.echo(f" ⚠️ Failed extractions: {extraction_result.failed_extractions}")


def _echo_pipeline_report(result):
    """Print performance, success-rate, data and issue sections for a full pipeline run."""
    click.echo(f"\n" + "="*60)
    click.echo(f"📊 PIPELINE EXECUTION REPORT")
    click.echo(f"="*60)
    # Performance
    click.echo(f"\n⏱️ PERFORMANCE")
    click.echo(f" Duration: {result.duration_seconds:.1f} seconds ({result.duration_minutes:.1f} minutes)")
    click.echo(f" Processing rate: {result.files_per_second:.1f} files/sec")
    click.echo(f" Loading rate: {result.records_per_second:,.0f} records/sec")
    # Success rates
    click.echo(f"\n📈 SUCCESS RATES")
    click.echo(f" Extraction: {result.extraction_success_rate:.1%}")
    click.echo(f" Loading: {result.loading_success_rate:.1%}")
    click.echo(f" Overall: {result.overall_success_rate:.1%}")
    # Data loaded
    click.echo(f"\n💾 DATA LOADED")
    click.echo(f" Makes: {result.load_result.total_makes}")
    click.echo(f" Models: {result.load_result.total_models}")
    click.echo(f" Engines: {result.load_result.total_engines}")
    click.echo(f" Trims: {result.load_result.total_trims}")
    click.echo(f" Total records: {result.total_records_loaded:,}")
    # Issues
    if result.load_result.failed_makes:
        click.echo(f"\n⚠️ FAILED MAKES ({len(result.load_result.failed_makes)}):")
        for make in result.load_result.failed_makes:
            click.echo(f"{make}")
    if result.integrity_issues:
        click.echo(f"\n❌ INTEGRITY ISSUES ({len(result.integrity_issues)}):")
        for issue in result.integrity_issues:
            click.echo(f"{issue}")
    else:
        click.echo(f"\n✅ REFERENTIAL INTEGRITY: PASSED")


@cli.command()
@click.option('--sources-dir', '-s', default='sources/makes',
              help='Directory containing JSON make files (default: sources/makes)')
@click.option('--mode', '-m', type=click.Choice(['clear', 'append']), default='append',
              help='Loading mode: clear (destructive) or append (safe, default)')
@click.option('--progress/--no-progress', default=True,
              help='Show progress tracking (default: enabled)')
@click.option('--validate/--no-validate', default=True,
              help='Validate referential integrity after loading (default: enabled)')
@click.option('--batch-size', '-b', type=int, default=1000,
              help='Database batch size for inserts (default: 1000)')
@click.option('--dry-run', is_flag=True,
              help='Extract and validate data without loading to database')
@click.option('--verbose', '-v', is_flag=True,
              help='Enable verbose output')
def load_manual(sources_dir, mode, progress, validate, batch_size, dry_run, verbose):
    """Load vehicle data from JSON files manually
    This command processes JSON files in the specified directory and loads
    vehicle data into the PostgreSQL database. It supports two modes:
    • APPEND mode (default): Safely add new data with duplicate detection
    • CLEAR mode: Remove all existing data and reload (destructive)
    Examples:
    python -m etl load-manual
    python -m etl load-manual --mode clear --sources-dir custom/path
    python -m etl load-manual --dry-run --verbose
    """
    # The module-level import fallback sets ManualJsonPipeline to None when the
    # manual JSON components could not be imported.
    if ManualJsonPipeline is None:
        click.echo("❌ Manual JSON processing components are not available", err=True)
        click.echo(" Please check your installation and dependencies", err=True)
        sys.exit(1)

    # Validate sources directory
    sources_path = Path(sources_dir)
    if not sources_path.exists():
        click.echo(f"❌ Sources directory not found: {sources_dir}", err=True)
        click.echo(" Please specify a valid directory with --sources-dir", err=True)
        sys.exit(1)

    # Count JSON files (non-recursive, *.json only)
    json_files = list(sources_path.glob("*.json"))
    if not json_files:
        click.echo(f"❌ No JSON files found in: {sources_dir}", err=True)
        click.echo(" Please ensure the directory contains *.json files", err=True)
        sys.exit(1)

    # Set log level if verbose (affects the root logger)
    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Create configuration. Named pipeline_config so the module-level `config`
    # object is not shadowed inside this command.
    load_mode_enum = LoadMode.CLEAR if mode == 'clear' else LoadMode.APPEND
    pipeline_config = PipelineConfig(
        sources_directory=str(sources_path),
        load_mode=load_mode_enum,
        enable_progress_tracking=progress,
        validate_integrity=validate,
        batch_size=batch_size
    )

    click.echo(f"🚀 Manual JSON Processing Pipeline")
    click.echo(f" Sources: {sources_dir}")
    click.echo(f" Files: {len(json_files)} JSON files")
    click.echo(f" Mode: {mode.upper()}")
    if dry_run:
        click.echo(f" Dry run: Validation only (no database changes)")

    try:
        # Create pipeline
        pipeline = ManualJsonPipeline(str(sources_path), pipeline_config)

        # Progress callback for CLI; closes over the --progress flag
        def cli_progress_callback(progress_info):
            if progress:
                percentage = progress_info['percentage']
                phase = progress_info['phase']
                files = f"{progress_info['files_completed']}/{progress_info['total_files']}"
                if progress_info['files_per_second'] > 0:
                    rate = f"({progress_info['files_per_second']:.1f} files/sec)"
                    eta_min = progress_info['eta_seconds'] / 60
                    eta = f"ETA: {eta_min:.1f}min" if eta_min > 0 else ""
                    click.echo(f"[{percentage:5.1f}%] {phase}: {files} {rate} {eta}")
                else:
                    click.echo(f"[{percentage:5.1f}%] {phase}: {files}")

        if dry_run:
            # Extraction only for validation
            click.echo("\n📋 Running extraction validation...")
            extraction_result = pipeline.run_extraction_only()
            _echo_extraction_summary(extraction_result)
            if extraction_result.failed_extractions > 0:
                sys.exit(1)
        else:
            # Full pipeline execution
            if mode == 'clear':
                click.echo("\n⚠️ WARNING: CLEAR mode will delete all existing vehicle data!")
                if not click.confirm("Are you sure you want to continue?", default=False):
                    click.echo("Operation cancelled")
                    return

            click.echo(f"\n🔄 Running pipeline...")
            result = pipeline.run(progress_callback=cli_progress_callback)

            # Print comprehensive report
            _echo_pipeline_report(result)

            # Final status
            if result.was_successful:
                click.echo(f"\n🎉 PIPELINE COMPLETED SUCCESSFULLY")
                if verbose:
                    # Show database statistics
                    db_stats = pipeline.loader.get_database_statistics()
                    click.echo(f"\n📋 DATABASE STATISTICS:")
                    for table, count in db_stats.items():
                        click.echo(f" {table}: {count:,} records")
            else:
                click.echo(f"\n⚠️ PIPELINE COMPLETED WITH ISSUES")
                sys.exit(1)
    except click.Abort:
        # click.confirm() turns Ctrl-C / EOF at the prompt into click.Abort,
        # which subclasses RuntimeError. Re-raise so click reports the abort
        # itself instead of the generic handler below printing
        # "Pipeline failed: " with an empty message.
        raise
    except KeyboardInterrupt:
        click.echo(f"\n⏸️ Pipeline interrupted by user")
        sys.exit(1)
    except Exception as e:
        click.echo(f"\n❌ Pipeline failed: {str(e)}", err=True)
        if verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
@cli.command()
@click.option('--sources-dir', '-s', default='sources/makes',
              help='Directory containing JSON make files (default: sources/makes)')
@click.option('--verbose', '-v', is_flag=True,
              help='Enable verbose output with detailed statistics')
def validate_json(sources_dir, verbose):
    """Validate JSON files and show extraction statistics
    This command validates the structure and content of JSON files
    without loading data into the database. Useful for:
    • Checking data quality before loading
    • Debugging extraction issues
    • Getting statistics about available data
    Examples:
    python -m etl validate-json
    python -m etl validate-json --sources-dir custom/path --verbose
    """
    # The module-level import fallback only assigns ManualJsonPipeline = None,
    # so after a failed import these names are unbound rather than None.
    # Look them up via globals() so the friendly message is shown instead of
    # a NameError.
    required_components = ('JsonExtractor', 'MakeNameMapper', 'EngineSpecParser')
    if any(globals().get(name) is None for name in required_components):
        click.echo("❌ JSON validation components are not available", err=True)
        sys.exit(1)

    # Validate sources directory
    sources_path = Path(sources_dir)
    if not sources_path.exists():
        click.echo(f"❌ Sources directory not found: {sources_dir}", err=True)
        sys.exit(1)

    # Count JSON files (non-recursive, *.json only)
    json_files = list(sources_path.glob("*.json"))
    if not json_files:
        click.echo(f"❌ No JSON files found in: {sources_dir}", err=True)
        sys.exit(1)

    click.echo(f"🔍 JSON File Validation")
    click.echo(f" Directory: {sources_dir}")
    click.echo(f" Files: {len(json_files)} JSON files")

    try:
        # Initialize components
        make_mapper = MakeNameMapper()
        engine_parser = EngineSpecParser()
        extractor = JsonExtractor(make_mapper, engine_parser)

        # Run extraction validation
        click.echo(f"\n📋 Validating JSON structure and content...")
        result = extractor.extract_all_makes(str(sources_path))

        # Basic results
        click.echo(f"\n✅ Validation Complete")
        click.echo(f" Files processed: {result.total_files_processed}")
        click.echo(f" Success rate: {result.success_rate:.1%}")
        click.echo(f" Models found: {result.total_models:,}")
        click.echo(f" Engines found: {result.total_engines:,}")
        click.echo(f" Electric models: {result.total_electric_models:,}")
        if result.failed_extractions > 0:
            click.echo(f" ⚠️ Failed extractions: {result.failed_extractions}")

        # NOTE(review): block nesting below was reconstructed from a
        # whitespace-stripped copy of this file; confirm that only the
        # top-makes listing is meant to be gated on --verbose.

        # Show top makes by model count
        if verbose and result.makes:
            click.echo(f"\n🏆 Top Makes by Model Count:")
            top_makes = sorted(result.makes, key=lambda m: m.total_models, reverse=True)[:10]
            for i, make in enumerate(top_makes, 1):
                click.echo(f" {i:2d}. {make.name}: {make.total_models} models, {make.total_engines} engines")

        # Show makes with issues (first five, then a count of the rest)
        error_makes = [make for make in result.makes if make.processing_errors]
        if error_makes:
            click.echo(f"\n⚠️ Makes with Processing Errors ({len(error_makes)}):")
            for make in error_makes[:5]:
                click.echo(f"{make.name}: {len(make.processing_errors)} errors")
            if len(error_makes) > 5:
                click.echo(f" ... and {len(error_makes) - 5} more")

        # Show data quality insights
        click.echo(f"\n📊 Data Quality Insights:")

        # Engine configuration distribution
        config_counts = {}
        for make in result.makes:
            for model in make.models:
                for engine in model.engines:
                    config_counts[engine.configuration] = config_counts.get(engine.configuration, 0) + 1

        if config_counts:
            click.echo(f" Engine configurations:")
            # Fall back to the counted total if the reported total is 0 so the
            # percentage can never divide by zero.
            engine_total = result.total_engines or sum(config_counts.values())
            # Loop variable deliberately not called `config`, which would
            # shadow the module-level config object.
            for engine_config, count in sorted(config_counts.items(), key=lambda x: x[1], reverse=True):
                percentage = count / engine_total * 100
                click.echo(f" {engine_config}: {count:,} ({percentage:.1f}%)")

        if result.failed_extractions > 0:
            sys.exit(1)
    except Exception as e:
        click.echo(f"❌ Validation failed: {str(e)}", err=True)
        if verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
if __name__ == "__main__":
    # With any command-line argument, dispatch through the click group;
    # a bare invocation defaults to scheduler mode.
    # NOTE(review): the bare-invocation path does not go through cli(), so the
    # setup_logging() call in the group callback is not reached here — confirm
    # logging is configured elsewhere for this mode.
    if len(sys.argv) != 1:
        cli()
    else:
        start_etl_scheduler()

View File

@@ -0,0 +1,92 @@
#!/usr/bin/env python3
import logging
from datetime import datetime
from .config import config
from .builders.normalized_vehicle_builder import NormalizedVehicleBuilder
from .utils.make_filter import MakeFilter
from .connections import test_connections
from .downloaders.nhtsa_downloader import NHTSADownloader
from .loaders.mssql_loader import MSSQLLoader
from .extractors.vin_proc_extractor import VinProcExtractor
logger = logging.getLogger(__name__)
def _inspect_vin_decode_proc():
    """Best-effort inspection of the MSSQL VIN decode stored procedure.

    Logs what it finds; any exception is downgraded to a warning so the
    caller's pipeline continues.
    """
    try:
        logger.info("Step 2b: Inspecting MSSQL VIN decode stored procedure for parity")
        proc_extractor = VinProcExtractor()
        proc_meta = proc_extractor.find_proc()
        if not proc_meta:
            logger.warning("VIN decode proc not found by pattern; continuing with catalog build")
            return
        logger.info(f"VIN proc found: {proc_meta['schema_name']}.{proc_meta['object_name']} ({proc_meta['type_desc']})")
        definition = proc_extractor.get_definition(proc_meta['schema_name'], proc_meta['object_name'])
        logger.debug(f"VIN proc definition (first 500 chars): {definition[:500]}")
        # Execute the procedure once with a fixed sample VIN to log its output columns.
        sample = proc_extractor.sample_execute('1G1YU3D64H5602799')
        if sample is not None:
            logger.info(f"VIN proc sample output columns: {list(sample[0].keys()) if sample else 'no rows'}")
    except Exception as e:
        logger.warning(f"VIN proc inspection failed (non-fatal): {e}")


def run_etl_pipeline():
    """Run the whole ETL: obtain the NHTSA backup, restore it into MSSQL,
    check connections, then build the normalized PostgreSQL schema.

    Returns:
        True when every step succeeds; False when any step reports failure
        or an exception is raised (the exception is logged, not propagated).
    """
    logger.info("Starting complete ETL pipeline")
    started_at = datetime.now()
    try:
        # Step 1: Download NHTSA database file
        logger.info("Step 1: Downloading NHTSA vPIC database")
        nhtsa = NHTSADownloader()
        backup_path = nhtsa.ensure_database_file(force_download=False)
        if not backup_path:
            logger.error("Failed to obtain NHTSA database file")
            return False
        backup_info = nhtsa.get_database_info(backup_path)
        logger.info(f"Using database file: {backup_info['name']} ({backup_info['size_mb']} MB)")

        # Step 2: Load database into MSSQL
        logger.info("Step 2: Loading database into MSSQL Server")
        loader = MSSQLLoader()
        if not loader.test_connection():
            logger.error("MSSQL connection test failed")
            return False
        if not loader.restore_database(backup_path):
            logger.error("Failed to restore database to MSSQL")
            return False

        # Verify MSSQL database content
        mssql_tables = loader.verify_database_content()
        logger.info(f"MSSQL database loaded with tables: {mssql_tables}")

        # Step 2b: Research stored procedure definition/output for parity
        _inspect_vin_decode_proc()

        # Step 3: Test all connections (MSSQL + PostgreSQL)
        logger.info("Step 3: Testing all database connections")
        if not test_connections():
            logger.error("Connection test failed after database loading")
            return False

        # Step 4: Build normalized PostgreSQL schema from MSSQL with make filtering
        logger.info("Step 4: Building normalized PostgreSQL vehicle schema from MSSQL with make filtering")
        schema_builder = NormalizedVehicleBuilder(MakeFilter())
        built = schema_builder.build()

        elapsed = datetime.now() - started_at
        if not built:
            logger.error(f"ETL pipeline failed during normalized schema building after {elapsed}")
            return False

        logger.info(f"Complete ETL pipeline finished successfully in {elapsed}")
        logger.info("✅ ETL Summary:")
        logger.info(f" - Downloaded: {backup_info['name']} ({backup_info['size_mb']} MB)")
        logger.info(f" - MSSQL Tables: {mssql_tables}")
        logger.info(f" - PostgreSQL normalized schema: Built successfully")
        return True
    except Exception as e:
        elapsed = datetime.now() - started_at
        logger.error(f"ETL pipeline crashed after {elapsed}: {e}", exc_info=True)
        return False

View File

@@ -0,0 +1 @@
# Pipelines package

View File

@@ -0,0 +1,465 @@
"""
Manual JSON Pipeline for Vehicle Data Processing
Coordinates end-to-end processing of JSON vehicle data:
1. Extract data from JSON files
2. Load data into PostgreSQL database
3. Progress tracking and comprehensive reporting
Key Features:
- Full extraction→loading workflow coordination
- Clear/append mode support
- Progress tracking with detailed statistics
- Comprehensive error handling and reporting
- Performance monitoring and optimization
- Referential integrity validation
Usage:
pipeline = ManualJsonPipeline(sources_dir="sources/makes")
result = pipeline.run(mode=LoadMode.APPEND, progress_callback=print_progress)
"""
import logging
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
# Import our components (handle both relative and direct imports)
try:
from ..extractors.json_extractor import JsonExtractor, ExtractionResult
from ..loaders.json_manual_loader import JsonManualLoader, LoadMode, LoadResult
from ..utils.make_name_mapper import MakeNameMapper
from ..utils.engine_spec_parser import EngineSpecParser
except ImportError:
# Fallback for direct execution
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from extractors.json_extractor import JsonExtractor, ExtractionResult
from loaders.json_manual_loader import JsonManualLoader, LoadMode, LoadResult
from utils.make_name_mapper import MakeNameMapper
from utils.engine_spec_parser import EngineSpecParser
logger = logging.getLogger(__name__)
@dataclass
class PipelineConfig:
    """Settings for one manual JSON pipeline run.

    Raises:
        ValueError: from ``__post_init__`` when ``sources_directory`` is empty
            or does not exist on disk.
    """
    # Directory containing JSON files
    sources_directory: str
    # Loading mode
    load_mode: LoadMode = LoadMode.APPEND
    # Enable progress callbacks
    enable_progress_tracking: bool = True
    # Validate referential integrity after loading
    validate_integrity: bool = True
    # Database batch size
    batch_size: int = 1000
    # Logging level
    log_level: str = "INFO"

    def __post_init__(self):
        """Reject a missing or non-existent sources directory."""
        directory = self.sources_directory
        if not directory:
            raise ValueError("sources_directory is required")
        if Path(directory).exists():
            return
        raise ValueError(f"Sources directory does not exist: {directory}")
@dataclass
class PipelineResult:
    """Everything a finished pipeline run reports back to its caller."""
    # Configuration the run used
    config: PipelineConfig
    # Timing (time.time() stamps taken by the pipeline)
    start_time: float
    end_time: float
    # Extraction results
    extraction_result: ExtractionResult
    # Loading results
    load_result: LoadResult
    # Performance metrics
    total_files_processed: int
    total_records_loaded: int
    files_per_second: float
    records_per_second: float
    # Quality metrics
    extraction_success_rate: float
    loading_success_rate: float
    overall_success_rate: float
    # Validation results
    integrity_issues: List[str]

    @property
    def duration_seconds(self) -> float:
        """Elapsed time between start_time and end_time, in seconds."""
        return self.end_time - self.start_time

    @property
    def duration_minutes(self) -> float:
        """duration_seconds expressed in minutes."""
        return self.duration_seconds / 60.0

    @property
    def was_successful(self) -> bool:
        """True only when extraction, loading and integrity checks all came back clean."""
        if self.extraction_result.failed_extractions != 0:
            return False
        if len(self.load_result.failed_makes) != 0:
            return False
        return len(self.integrity_issues) == 0
class PipelineProgress:
    """Progress tracking for pipeline execution.

    Keeps the current phase name, the number of files completed and two
    wall-clock start stamps (overall and per phase), and turns them into a
    plain dict snapshot for progress callbacks.
    """

    def __init__(self, total_files: int):
        """
        Args:
            total_files: Number of files the pipeline expects to process.
        """
        self.total_files = total_files
        self.current_file = 0
        self.current_phase = "Starting"
        self.start_time = time.time()
        self.phase_start_time = time.time()

    def update_phase(self, phase: str) -> None:
        """Switch to a new phase and restart the per-phase timer."""
        self.current_phase = phase
        self.phase_start_time = time.time()

    def update_file_progress(self, files_completed: int) -> None:
        """Record how many files have been completed so far."""
        self.current_file = files_completed

    def get_progress_info(self) -> Dict[str, Any]:
        """Build a snapshot of the current progress.

        Returns:
            Dict with keys ``phase``, ``files_completed``, ``total_files``,
            ``percentage``, ``elapsed_seconds``, ``phase_elapsed_seconds``,
            ``files_per_second`` and ``eta_seconds``. Rate and ETA are 0
            until at least one file is completed and some time has elapsed.
        """
        now = time.time()
        elapsed = now - self.start_time
        phase_elapsed = now - self.phase_start_time

        # Guard on elapsed as well as the file count: a coarse clock can
        # report zero elapsed time, which would otherwise divide by zero.
        if self.current_file > 0 and elapsed > 0:
            files_per_second = self.current_file / elapsed
            eta_seconds = (self.total_files - self.current_file) / files_per_second
        else:
            files_per_second = 0
            eta_seconds = 0

        return {
            'phase': self.current_phase,
            'files_completed': self.current_file,
            'total_files': self.total_files,
            'percentage': (self.current_file / self.total_files * 100) if self.total_files > 0 else 0,
            'elapsed_seconds': elapsed,
            'phase_elapsed_seconds': phase_elapsed,
            'files_per_second': files_per_second,
            'eta_seconds': eta_seconds
        }
class ManualJsonPipeline:
    """End-to-end JSON processing pipeline.

    Wires a JsonExtractor and a JsonManualLoader together: extract every
    ``*.json`` file in the sources directory, load the extracted makes,
    optionally validate referential integrity, and return a PipelineResult
    with timing and success metrics.
    """

    def __init__(self, sources_dir: str, config: Optional[PipelineConfig] = None):
        """
        Initialize pipeline

        Args:
            sources_dir: Directory containing JSON files
            config: Pipeline configuration (optional); a default
                PipelineConfig for ``sources_dir`` is created when omitted
        """
        self.sources_dir = sources_dir
        self.config = config or PipelineConfig(sources_directory=sources_dir)

        # Initialize components
        self.make_mapper = MakeNameMapper()
        self.engine_parser = EngineSpecParser()
        self.extractor = JsonExtractor(self.make_mapper, self.engine_parser)
        # NOTE(review): the loader is built without arguments, so
        # config.batch_size is not forwarded to it from this class.
        self.loader = JsonManualLoader()

        # Progress tracking
        self.progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None

        logger.info(f"ManualJsonPipeline initialized for {sources_dir}")

    def set_progress_callback(self, callback: Callable[[Dict[str, Any]], None]) -> None:
        """
        Set progress callback function

        Args:
            callback: Function to call with progress updates (receives the
                dict produced by PipelineProgress.get_progress_info)
        """
        self.progress_callback = callback

    def _update_progress(self, progress: PipelineProgress) -> None:
        """Update progress via callback if one is set and tracking is enabled in config"""
        if self.progress_callback and self.config.enable_progress_tracking:
            progress_info = progress.get_progress_info()
            self.progress_callback(progress_info)

    def run(self, mode: Optional[LoadMode] = None,
            progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None) -> PipelineResult:
        """
        Execute complete pipeline

        Args:
            mode: Loading mode (overrides config)
            progress_callback: Progress callback function; when given it
                replaces any callback set earlier and stays set afterwards

        Returns:
            PipelineResult with complete execution details

        Raises:
            ValueError: when the sources directory contains no ``*.json`` files
            Exception: any error from extraction, loading or validation is
                logged and re-raised
        """
        start_time = time.time()

        # Override config if specified
        load_mode = mode or self.config.load_mode
        if progress_callback:
            self.set_progress_callback(progress_callback)

        logger.info(f"Starting manual JSON pipeline in {load_mode.value} mode")
        logger.info(f"Processing directory: {self.sources_dir}")

        try:
            # Count files for progress tracking
            json_files = list(Path(self.sources_dir).glob("*.json"))
            total_files = len(json_files)
            if total_files == 0:
                raise ValueError(f"No JSON files found in {self.sources_dir}")

            progress = PipelineProgress(total_files)

            # Phase 1: Extract data from JSON files
            progress.update_phase("Extracting data from JSON files")
            self._update_progress(progress)
            logger.info(f"Phase 1: Extracting data from {total_files} JSON files")
            extraction_result = self.extractor.extract_all_makes(self.sources_dir)
            progress.update_file_progress(extraction_result.total_files_processed)
            self._update_progress(progress)
            if extraction_result.failed_extractions > 0:
                logger.warning(f"Extraction completed with {extraction_result.failed_extractions} failures")
            else:
                logger.info(f"Extraction completed successfully: {extraction_result.total_models} models, {extraction_result.total_engines} engines")

            # Phase 2: Load data into database
            progress.update_phase("Loading data into database")
            self._update_progress(progress)
            logger.info(f"Phase 2: Loading {len(extraction_result.makes)} makes into database ({load_mode.value} mode)")
            load_result = self.loader.load_all_makes(extraction_result.makes, load_mode)
            if len(load_result.failed_makes) > 0:
                logger.warning(f"Loading completed with {len(load_result.failed_makes)} failures")
            else:
                logger.info(f"Loading completed successfully: {load_result.success_count} makes loaded")

            # Phase 3: Validate referential integrity (if enabled)
            integrity_issues = []
            if self.config.validate_integrity:
                progress.update_phase("Validating referential integrity")
                self._update_progress(progress)
                logger.info("Phase 3: Validating referential integrity")
                integrity_issues = self.loader.validate_referential_integrity()
                if integrity_issues:
                    logger.warning(f"Referential integrity issues found: {len(integrity_issues)}")
                else:
                    logger.info("Referential integrity validation passed")

            # Calculate performance metrics
            end_time = time.time()
            duration = end_time - start_time
            files_per_second = total_files / duration if duration > 0 else 0
            # NOTE(review): makes and model years are not part of this total —
            # confirm that is intended.
            total_records = (load_result.total_models + load_result.total_engines +
                             load_result.total_trims + load_result.total_trim_engine_mappings)
            records_per_second = total_records / duration if duration > 0 else 0

            # Calculate success rates; overall is the weaker of the two phases
            extraction_success_rate = extraction_result.success_rate
            loading_success_rate = load_result.success_rate
            overall_success_rate = min(extraction_success_rate, loading_success_rate)

            # Create result
            result = PipelineResult(
                config=self.config,
                start_time=start_time,
                end_time=end_time,
                extraction_result=extraction_result,
                load_result=load_result,
                total_files_processed=total_files,
                total_records_loaded=total_records,
                files_per_second=files_per_second,
                records_per_second=records_per_second,
                extraction_success_rate=extraction_success_rate,
                loading_success_rate=loading_success_rate,
                overall_success_rate=overall_success_rate,
                integrity_issues=integrity_issues
            )

            progress.update_phase("Pipeline complete")
            self._update_progress(progress)

            logger.info(f"Pipeline completed in {result.duration_seconds:.1f} seconds")
            logger.info(f"Performance: {files_per_second:.1f} files/sec, {records_per_second:.0f} records/sec")
            logger.info(f"Overall success rate: {overall_success_rate:.1%}")
            return result
        except Exception as e:
            end_time = time.time()
            logger.error(f"Pipeline failed after {end_time - start_time:.1f} seconds: {str(e)}")
            raise

    def run_extraction_only(self) -> ExtractionResult:
        """
        Run extraction phase only (for testing/validation)

        Returns:
            ExtractionResult with extracted data
        """
        logger.info("Running extraction-only pipeline")
        result = self.extractor.extract_all_makes(self.sources_dir)
        logger.info(f"Extraction complete: {result.total_models} models, {result.total_engines} engines")
        logger.info(f"Success rate: {result.success_rate:.1%}")
        return result

    def get_source_statistics(self) -> Dict[str, Any]:
        """
        Get statistics about source JSON files

        Returns:
            Dictionary with ``total_files``, ``total_size_bytes``,
            ``total_size_mb``, ``average_file_size_kb`` (0 when there are no
            files) and ``directory``
        """
        json_files = list(Path(self.sources_dir).glob("*.json"))
        total_size_bytes = sum(f.stat().st_size for f in json_files)
        return {
            'total_files': len(json_files),
            'total_size_bytes': total_size_bytes,
            'total_size_mb': total_size_bytes / (1024 * 1024),
            'average_file_size_kb': (total_size_bytes / len(json_files) / 1024) if json_files else 0,
            'directory': str(self.sources_dir)
        }

    def print_pipeline_report(self, result: PipelineResult) -> None:
        """
        Print comprehensive pipeline execution report to stdout

        Args:
            result: PipelineResult from pipeline execution
        """
        print(f"🚀 MANUAL JSON PIPELINE EXECUTION REPORT")
        print(f"=" * 60)

        # Configuration
        print(f"\n⚙️ CONFIGURATION")
        print(f" Sources directory: {result.config.sources_directory}")
        print(f" Load mode: {result.config.load_mode.value.upper()}")
        print(f" Batch size: {result.config.batch_size}")
        print(f" Integrity validation: {'Enabled' if result.config.validate_integrity else 'Disabled'}")

        # Performance
        print(f"\n⏱️ PERFORMANCE")
        print(f" Total duration: {result.duration_seconds:.1f} seconds ({result.duration_minutes:.1f} minutes)")
        print(f" Files processed: {result.total_files_processed}")
        print(f" Records loaded: {result.total_records_loaded:,}")
        print(f" Processing rate: {result.files_per_second:.1f} files/sec")
        print(f" Loading rate: {result.records_per_second:,.0f} records/sec")

        # Success rates
        print(f"\n📊 SUCCESS RATES")
        print(f" Extraction: {result.extraction_success_rate:.1%}")
        print(f" Loading: {result.loading_success_rate:.1%}")
        print(f" Overall: {result.overall_success_rate:.1%}")

        # Data summary
        print(f"\n📈 DATA PROCESSED")
        print(f" Makes: {result.load_result.total_makes}")
        print(f" Models: {result.load_result.total_models}")
        print(f" Model years: {result.load_result.total_model_years}")
        print(f" Trims: {result.load_result.total_trims}")
        print(f" Engines: {result.load_result.total_engines}")
        print(f" Trim-engine mappings: {result.load_result.total_trim_engine_mappings}")

        # Issues
        if result.load_result.failed_makes:
            print(f"\n⚠️ FAILED MAKES ({len(result.load_result.failed_makes)}):")
            for make in result.load_result.failed_makes:
                print(f" {make}")

        if result.integrity_issues:
            print(f"\n❌ REFERENTIAL INTEGRITY ISSUES ({len(result.integrity_issues)}):")
            for issue in result.integrity_issues:
                print(f" {issue}")
        else:
            print(f"\n✅ REFERENTIAL INTEGRITY: PASSED")

        # Final status
        print(f"\n🎯 PIPELINE STATUS: {'SUCCESS' if result.was_successful else 'COMPLETED WITH ISSUES'}")
def default_progress_callback(progress_info: Dict[str, Any]) -> None:
    """Default progress callback that prints one status line to the console.

    Args:
        progress_info: Progress snapshot; reads the keys ``percentage``,
            ``phase``, ``files_completed``, ``total_files``,
            ``files_per_second`` and ``eta_seconds``.
    """
    percentage = progress_info['percentage']
    phase = progress_info['phase']
    files_completed = progress_info['files_completed']
    total_files = progress_info['total_files']

    # Rate and ETA are only shown once a positive processing rate is known.
    if progress_info['files_per_second'] > 0:
        eta_minutes = progress_info['eta_seconds'] / 60
        print(f"[{percentage:5.1f}%] {phase}: {files_completed}/{total_files} files "
              f"({progress_info['files_per_second']:.1f} files/sec, ETA: {eta_minutes:.1f}min)")
    else:
        print(f"[{percentage:5.1f}%] {phase}: {files_completed}/{total_files} files")
# Example usage and testing functions
def example_usage():
    """Demonstrate ManualJsonPipeline usage.

    Prints a sample code snippet and, if the default ``sources/makes``
    directory exists, statistics about the JSON files it contains. Does not
    run the pipeline itself.
    """
    print("🚀 ManualJsonPipeline Example Usage")
    print("=" * 40)
    sources_dir = "sources/makes"
    # Nothing to demonstrate without the default sources directory.
    if not Path(sources_dir).exists():
        print(f"❌ Sources directory not found: {sources_dir}")
        return
    print(f"\n💡 Example pipeline execution:")
    # The snippet below is printed, not executed.
    print(f"""
# Create pipeline with configuration
config = PipelineConfig(
sources_directory="{sources_dir}",
load_mode=LoadMode.APPEND,
enable_progress_tracking=True,
validate_integrity=True
)
pipeline = ManualJsonPipeline("{sources_dir}", config)
# Run with progress tracking
result = pipeline.run(progress_callback=default_progress_callback)
# Print comprehensive report
pipeline.print_pipeline_report(result)
""")
    # Show source statistics
    try:
        pipeline = ManualJsonPipeline(sources_dir)
        stats = pipeline.get_source_statistics()
        print(f"\n📊 Source Directory Statistics:")
        print(f" Files: {stats['total_files']}")
        print(f" Total size: {stats['total_size_mb']:.1f} MB")
        print(f" Average file size: {stats['average_file_size_kb']:.1f} KB")
    except Exception as e:
        # Constructing the pipeline or reading file sizes failed; report and move on.
        print(f"⚠️ Could not get source statistics: {e}")
if __name__ == "__main__":
    # Direct execution only prints the usage demonstration.
    example_usage()

View File

@@ -0,0 +1,71 @@
import schedule
import time
import logging
from datetime import datetime
# Import locally to avoid circular import
import importlib
from .config import config
logger = logging.getLogger(__name__)
def scheduled_etl_job():
    """Execute the ETL pipeline on schedule.

    Runs run_etl_pipeline() once and logs the outcome with the elapsed time.
    Exceptions are logged (with traceback) and swallowed so a crashing job
    does not propagate into the scheduler loop.
    """
    start_time = datetime.now()
    logger.info(f"Starting scheduled ETL job at {start_time}")
    try:
        # Import dynamically to avoid circular import
        from .pipeline import run_etl_pipeline
        success = run_etl_pipeline()
        duration = datetime.now() - start_time
        if success:
            logger.info(f"ETL job completed successfully in {duration}")
        else:
            logger.error(f"ETL job failed after {duration}")
    except Exception as e:
        duration = datetime.now() - start_time
        # exc_info=True keeps the traceback in the log, matching how
        # run_etl_pipeline() reports its own crashes.
        logger.error(f"ETL job crashed after {duration}: {e}", exc_info=True)
def start_etl_scheduler():
    """Register the ETL job with the scheduler and run the polling loop.

    Only the exact cron string "0 2 * * 0" (Sunday 02:00) is recognised;
    every other value falls back to a daily 02:00 run with a warning. The
    loop polls once a minute and ends on KeyboardInterrupt.
    """
    cron_pattern = config.ETL_SCHEDULE
    logger.info(f"Starting ETL scheduler with cron pattern: {cron_pattern}")

    # Cron format: minute hour day-of-month month day-of-week.
    # No real parsing is done — the pattern is compared literally.
    if cron_pattern != "0 2 * * 0":
        # Default fallback - run once daily at 2 AM
        schedule.every().day.at("02:00").do(scheduled_etl_job)
        logger.warning(f"Unknown cron pattern {cron_pattern}, defaulting to daily at 2:00 AM")
    else:
        schedule.every().sunday.at("02:00").do(scheduled_etl_job)
        logger.info("Scheduled ETL to run every Sunday at 2:00 AM")

    # Run scheduler loop
    logger.info("ETL scheduler started")
    keep_running = True
    while keep_running:
        try:
            schedule.run_pending()
            time.sleep(60)  # Check every minute
        except KeyboardInterrupt:
            logger.info("ETL scheduler stopped by user")
            keep_running = False
        except Exception as e:
            logger.error(f"ETL scheduler error: {e}")
            time.sleep(300)  # Wait 5 minutes on error
if __name__ == "__main__":
    # Configure logging: resolve the configured level name (e.g. "info")
    # to the matching logging module constant.
    level_name = config.LOG_LEVEL.upper()
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=getattr(logging, level_name),
    )
    # Start scheduler
    start_etl_scheduler()

View File

@@ -0,0 +1,64 @@
{
"manufacturers": [
"Acura",
"Alfa Romeo",
"Aston Martin",
"Audi",
"BMW",
"Bentley",
"Buick",
"Cadillac",
"Chevrolet",
"Chrysler",
"Dodge",
"Ferrari",
"Fiat",
"Ford",
"GMC",
"Genesis",
"Geo",
"Honda",
"Hummer",
"Hyundai",
"Infiniti",
"Isuzu",
"Jaguar",
"Jeep",
"Kia",
"Lamborghini",
"Land Rover",
"Lexus",
"Lincoln",
"Lotus",
"Mazda",
"Maserati",
"Maybach",
"McLaren",
"Mercedes-Benz",
"Mercury",
"MINI",
"Mitsubishi",
"Nissan",
"Oldsmobile",
"Plymouth",
"Polestar",
"Pontiac",
"Porsche",
"Ram",
"Rivian",
"Rolls Royce",
"Saab",
"Saturn",
"Scion",
"Smart",
"Subaru",
"Tesla",
"Toyota",
"Volkswagen",
"Volvo",
"Karma",
"Pagani",
"Koenigsegg",
"Lucid"
]
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,506 @@
{
"aston_martin": [
{
"year": "2023",
"models": [
{
"name": "Vantage",
"engines": [
"4.0L V8",
"5.2L V12"
],
"submodels": [
"AMR",
"V12",
"Base"
]
}
]
},
{
"year": "2020",
"models": [
{
"name": "DB11",
"engines": [
"4.0L V8"
],
"submodels": []
},
{
"name": "Dbs",
"engines": [
"5.2L V12"
],
"submodels": []
},
{
"name": "Vantage",
"engines": [
"4.0L V8",
"5.2L V12"
],
"submodels": [
"AMR",
"V12",
"Base"
]
}
]
},
{
"year": "2019",
"models": [
{
"name": "Vantage",
"engines": [
"4.0L V8",
"5.2L V12"
],
"submodels": [
"AMR",
"V12",
"Base"
]
}
]
},
{
"year": "2018",
"models": [
{
"name": "Rapide",
"engines": [
"6.0L V12"
],
"submodels": []
}
]
},
{
"year": "2017",
"models": [
{
"name": "V12 Vantage",
"engines": [
"6.0L V12"
],
"submodels": [
"Base",
"S"
]
},
{
"name": "Vanquish",
"engines": [
"6.0L V12"
],
"submodels": [
"Carbon",
"Base",
"Volante"
]
}
]
},
{
"year": "2016",
"models": [
{
"name": "Rapide",
"engines": [
"6.0L V12"
],
"submodels": []
},
{
"name": "V12 Vantage",
"engines": [
"6.0L V12"
],
"submodels": [
"Base",
"S"
]
},
{
"name": "Vanquish",
"engines": [
"6.0L V12"
],
"submodels": [
"Carbon",
"Base",
"Volante"
]
}
]
},
{
"year": "2015",
"models": [
{
"name": "DB9",
"engines": [
"6.0L V12"
],
"submodels": [
"Volante",
"Base"
]
},
{
"name": "Rapide",
"engines": [
"6.0L V12"
],
"submodels": []
},
{
"name": "V12 Vantage",
"engines": [
"6.0L V12"
],
"submodels": [
"Base",
"S"
]
},
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
},
{
"name": "Vanquish",
"engines": [
"6.0L V12"
],
"submodels": [
"Carbon",
"Base",
"Volante"
]
}
]
},
{
"year": "2014",
"models": [
{
"name": "DB9",
"engines": [
"6.0L V12"
],
"submodels": [
"Volante",
"Base"
]
},
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
},
{
"name": "Vanquish",
"engines": [
"6.0L V12"
],
"submodels": [
"Carbon",
"Base",
"Volante"
]
}
]
},
{
"year": "2013",
"models": [
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
}
]
},
{
"year": "2012",
"models": [
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
}
]
},
{
"year": "2011",
"models": [
{
"name": "V12 Vantage",
"engines": [
"6.0L V12"
],
"submodels": [
"Base",
"S"
]
},
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
}
]
},
{
"year": "2010",
"models": [
{
"name": "DB9",
"engines": [
"6.0L V12"
],
"submodels": [
"Volante",
"Base"
]
},
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
}
]
},
{
"year": "2009",
"models": [
{
"name": "DB9",
"engines": [
"6.0L V12"
],
"submodels": [
"Volante",
"Base"
]
},
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
}
]
},
{
"year": "2008",
"models": [
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
}
]
},
{
"year": "2007",
"models": [
{
"name": "DB9",
"engines": [
"6.0L V12"
],
"submodels": [
"Volante",
"Base"
]
},
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
}
]
},
{
"year": "2006",
"models": [
{
"name": "V8 Vantage",
"engines": [
"4.3L V8",
"4.7L V8"
],
"submodels": [
"GT",
"S",
"Base"
]
}
]
},
{
"year": "2005",
"models": [
{
"name": "DB9",
"engines": [
"6.0L V12"
],
"submodels": [
"Volante",
"Base"
]
},
{
"name": "Vantage",
"engines": [
"4.0L V8",
"5.2L V12"
],
"submodels": [
"AMR",
"V12",
"Base"
]
}
]
},
{
"year": "2002",
"models": [
{
"name": "DB7",
"engines": [
"6.0L V12"
],
"submodels": [
"Vantage Volante",
"Vantage"
]
}
]
},
{
"year": "2001",
"models": [
{
"name": "DB7",
"engines": [
"6.0L V12"
],
"submodels": [
"Vantage Volante",
"Vantage"
]
}
]
},
{
"year": "1993",
"models": [
{
"name": "Virage",
"engines": [
"5.3L V8"
],
"submodels": [
"Volante"
]
}
]
},
{
"year": "1990",
"models": [
{
"name": "Virage",
"engines": [
"5.3L V8"
],
"submodels": [
"Volante"
]
}
]
},
{
"year": "1983",
"models": [
{
"name": "V 8",
"engines": [
"5.3L V8"
],
"submodels": []
}
]
}
]
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,427 @@
{
"bentley": [
{
"year": "2023",
"models": [
{
"name": "Flying Spur",
"engines": [
"2.9L V6 MILD HYBRID EV- (MHEV)",
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Hybrid",
"V8",
"W12",
"S Hybrid",
"Base"
]
}
]
},
{
"year": "2022",
"models": [
{
"name": "Flying Spur",
"engines": [
"2.9L V6 MILD HYBRID EV- (MHEV)",
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Hybrid",
"V8",
"W12",
"S Hybrid",
"Base"
]
}
]
},
{
"year": "2021",
"models": [
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
},
{
"name": "Flying Spur",
"engines": [
"2.9L V6 MILD HYBRID EV- (MHEV)",
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Hybrid",
"V8",
"W12",
"S Hybrid",
"Base"
]
}
]
},
{
"year": "2018",
"models": [
{
"name": "Bentayga",
"engines": [
"6.0L W12"
],
"submodels": [
"W12 Signature",
"Black Edition"
]
},
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
}
]
},
{
"year": "2017",
"models": [
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
}
]
},
{
"year": "2016",
"models": [
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
},
{
"name": "Flying Spur",
"engines": [
"2.9L V6 MILD HYBRID EV- (MHEV)",
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Hybrid",
"V8",
"W12",
"S Hybrid",
"Base"
]
},
{
"name": "Mulsanne",
"engines": [
"6.8L V8"
],
"submodels": [
"Base",
"Speed"
]
}
]
},
{
"year": "2014",
"models": [
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
},
{
"name": "Mulsanne",
"engines": [
"6.8L V8"
],
"submodels": [
"Base",
"Speed"
]
}
]
},
{
"year": "2013",
"models": [
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
},
{
"name": "Flying Spur",
"engines": [
"2.9L V6 MILD HYBRID EV- (MHEV)",
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Hybrid",
"V8",
"W12",
"S Hybrid",
"Base"
]
}
]
},
{
"year": "2009",
"models": [
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
}
]
},
{
"year": "2008",
"models": [
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
}
]
},
{
"year": "2006",
"models": [
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
}
]
},
{
"year": "2005",
"models": [
{
"name": "Arnage",
"engines": [
"4.4L V8",
"6.8L V8"
],
"submodels": [
"Base",
"R"
]
},
{
"name": "Continental",
"engines": [
"4.0L V8",
"6.0L W12 FLEX",
"6.0L W12"
],
"submodels": [
"Base",
"GTC",
"Flying Spur Speed",
"GT V8 S",
"GTC V8 S",
"Flying Spur",
"GT",
"GT Speed"
]
}
]
},
{
"year": "1999",
"models": [
{
"name": "Arnage",
"engines": [
"4.4L V8",
"6.8L V8"
],
"submodels": [
"Base",
"R"
]
}
]
},
{
"year": "1997",
"models": [
{
"name": "Brooklands",
"engines": [
"6.8L V8"
],
"submodels": []
}
]
},
{
"year": "1996",
"models": [
{
"name": "Azure",
"engines": [],
"submodels": []
}
]
},
{
"year": "1989",
"models": [
{
"name": "Turbo R",
"engines": [
"6.8L V8"
],
"submodels": []
}
]
},
{
"year": "1963",
"models": [
{
"name": "S3 Series",
"engines": [
"6.2L V8"
],
"submodels": []
}
]
}
]
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More