Initial Commit
This commit is contained in:
36
mvp-platform-services/landing/Dockerfile
Normal file
36
mvp-platform-services/landing/Dockerfile
Normal file
@@ -0,0 +1,36 @@
|
||||
# Multi-stage image: stage "builder" compiles the Vite app, the final stage
# serves the compiled static files with nginx.

# Build stage (keyword casing kept consistent: FROM ... AS ...)
FROM node:18-alpine AS builder

WORKDIR /app

# Copy package files and install dependencies
# (done before copying the sources so this layer is reused until package.json changes)
COPY package.json ./
RUN npm install

# Copy source code
COPY . .

# Build arguments for environment variables
ARG VITE_AUTH0_DOMAIN
ARG VITE_AUTH0_CLIENT_ID
ARG VITE_TENANTS_API_URL

# Set environment variables for build
ENV VITE_AUTH0_DOMAIN=${VITE_AUTH0_DOMAIN}
ENV VITE_AUTH0_CLIENT_ID=${VITE_AUTH0_CLIENT_ID}
ENV VITE_TENANTS_API_URL=${VITE_TENANTS_API_URL}

# Build the application
RUN npm run build

# Production stage
FROM nginx:alpine

# Copy built app to nginx
COPY --from=builder /app/dist /usr/share/nginx/html

# Copy nginx configuration
COPY nginx.conf /etc/nginx/nginx.conf

# nginx.conf listens on 3000
EXPOSE 3000

CMD ["nginx", "-g", "daemon off;"]
|
||||
13
mvp-platform-services/landing/index.html
Normal file
13
mvp-platform-services/landing/index.html
Normal file
@@ -0,0 +1,13 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>MotoVaultPro - Vehicle Management Platform</title>
|
||||
</head>
|
||||
<body>
|
||||
<div id="root"></div>
|
||||
<script type="module" src="/src/main.tsx"></script>
|
||||
</body>
|
||||
</html>
|
||||
27
mvp-platform-services/landing/nginx.conf
Normal file
27
mvp-platform-services/landing/nginx.conf
Normal file
@@ -0,0 +1,27 @@
|
||||
events {
    worker_connections 1024;
}

http {
    include /etc/nginx/mime.types;
    default_type application/octet-stream;

    # Single HTTP server for internal proxying (edge TLS handled by nginx-proxy)
    server {
        listen 3000;
        server_name localhost motovaultpro.com;

        root /usr/share/nginx/html;
        index index.html;

        # Handle React Router (SPA): unknown paths fall back to index.html
        location / {
            try_files $uri $uri/ /index.html;
        }

        # Security headers.
        # "always" attaches them to every response; without it add_header
        # only applies to a fixed set of success/redirect status codes, so
        # error pages would be served without these headers.
        add_header X-Frame-Options DENY always;
        add_header X-Content-Type-Options nosniff always;
        add_header X-XSS-Protection "1; mode=block" always;
    }
}
|
||||
26
mvp-platform-services/landing/nginx.conf.backup
Normal file
26
mvp-platform-services/landing/nginx.conf.backup
Normal file
@@ -0,0 +1,26 @@
|
||||
events {
    worker_connections 1024;
}

http {
    include /etc/nginx/mime.types;
    default_type application/octet-stream;

    server {
        listen 3000;
        server_name localhost;

        root /usr/share/nginx/html;
        index index.html;

        # Handle React Router (SPA): unknown paths fall back to index.html
        location / {
            try_files $uri $uri/ /index.html;
        }

        # Security headers.
        # "always" attaches them to every response; without it add_header
        # only applies to a fixed set of success/redirect status codes.
        add_header X-Frame-Options DENY always;
        add_header X-Content-Type-Options nosniff always;
        add_header X-XSS-Protection "1; mode=block" always;
    }
}
|
||||
24
mvp-platform-services/landing/package.json
Normal file
24
mvp-platform-services/landing/package.json
Normal file
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"name": "mvp-platform-landing",
|
||||
"version": "1.0.0",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "vite",
|
||||
"build": "tsc && vite build",
|
||||
"preview": "vite preview"
|
||||
},
|
||||
"dependencies": {
|
||||
"react": "^18.3.1",
|
||||
"react-dom": "^18.3.1",
|
||||
"react-router-dom": "^6.8.0",
|
||||
"@auth0/auth0-react": "^2.2.3",
|
||||
"axios": "^1.6.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/react": "^18.2.0",
|
||||
"@types/react-dom": "^18.2.0",
|
||||
"@vitejs/plugin-react": "^4.2.0",
|
||||
"typescript": "^5.6.3",
|
||||
"vite": "^5.0.6"
|
||||
}
|
||||
}
|
||||
19
mvp-platform-services/landing/src/App.tsx
Normal file
19
mvp-platform-services/landing/src/App.tsx
Normal file
@@ -0,0 +1,19 @@
|
||||
import { Auth0Provider } from '@auth0/auth0-react'
import { Routes, Route } from 'react-router-dom'
import HomePage from './components/HomePage'
import TenantSignup from './components/TenantSignup'
import CallbackHandler from './components/CallbackHandler'
|
||||
|
||||
/**
 * Route table for the landing site.
 *
 * - `/`                  marketing home page
 * - `/signup/:tenantId`  tenant-specific signup
 * - `/callback`          post-login redirect target
 *
 * TenantSignup calls `useAuth0()`, which only works beneath an
 * `Auth0Provider`. The provider is mounted around that route alone so the
 * other pages keep rendering without any Auth0 client being created.
 */
function App() {
  return (
    <div className="App">
      <Routes>
        <Route path="/" element={<HomePage />} />
        <Route
          path="/signup/:tenantId"
          element={
            <Auth0Provider
              domain={import.meta.env.VITE_AUTH0_DOMAIN}
              clientId={import.meta.env.VITE_AUTH0_CLIENT_ID}
            >
              <TenantSignup />
            </Auth0Provider>
          }
        />
        <Route path="/callback" element={<CallbackHandler />} />
      </Routes>
    </div>
  )
}
|
||||
|
||||
export default App
|
||||
@@ -0,0 +1,22 @@
|
||||
import React, { useEffect } from 'react'
|
||||
|
||||
/**
 * Placeholder for the `/callback` route: shows a short notice and sends the
 * browser on to the admin application.
 */
const CallbackHandler: React.FC = () => {
  useEffect(() => {
    // This component is no longer needed since we removed Auth0 from landing page
    // Redirect to main app. `replace` (rather than assigning `href`) keeps
    // /callback out of the history stack, so pressing Back does not land on
    // this page again and trigger the redirect a second time.
    window.location.replace('https://admin.motovaultpro.com')
  }, [])

  return (
    <div style={{
      padding: '2rem',
      textAlign: 'center',
      fontFamily: 'Arial, sans-serif'
    }}>
      <h2>Redirecting...</h2>
      <p>Please wait while we redirect you to MotoVaultPro.</p>
    </div>
  )
}
|
||||
|
||||
export default CallbackHandler
|
||||
55
mvp-platform-services/landing/src/components/HomePage.tsx
Normal file
55
mvp-platform-services/landing/src/components/HomePage.tsx
Normal file
@@ -0,0 +1,55 @@
|
||||
import React from 'react'
|
||||
|
||||
/**
 * Landing page: product headline, feature list and a button that sends
 * existing users to the admin application.
 */
const HomePage: React.FC = () => {
  // Inline styles, named so the markup below stays readable
  const pageStyle: React.CSSProperties = { padding: '2rem', fontFamily: 'Arial, sans-serif' }
  const headerStyle: React.CSSProperties = { textAlign: 'center', marginBottom: '3rem' }
  const mainStyle: React.CSSProperties = { maxWidth: '800px', margin: '0 auto' }
  const loginButtonStyle: React.CSSProperties = {
    padding: '1rem 2rem',
    fontSize: '1.1rem',
    backgroundColor: '#007bff',
    color: 'white',
    border: 'none',
    borderRadius: '4px',
    cursor: 'pointer'
  }

  const featureList = [
    'Vehicle inventory management',
    'Maintenance tracking and scheduling',
    'Fuel log analytics',
    'Service station locator',
    'Multi-tenant architecture for teams'
  ]

  // Redirect directly to admin tenant for login
  const handleLogin = () => {
    window.location.href = 'https://admin.motovaultpro.com'
  }

  return (
    <div style={pageStyle}>
      <header style={headerStyle}>
        <h1>MotoVaultPro</h1>
        <p>The complete vehicle management platform for automotive professionals</p>
      </header>

      <main style={mainStyle}>
        <section style={{ marginBottom: '3rem' }}>
          <h2>Features</h2>
          <ul>
            {featureList.map((feature) => (
              <li key={feature}>{feature}</li>
            ))}
          </ul>
        </section>

        <section style={{ textAlign: 'center' }}>
          <h2>Get Started</h2>
          <p>Already have an account?</p>
          <button onClick={handleLogin} style={loginButtonStyle}>
            Access Your Dashboard
          </button>

          <p style={{ marginTop: '2rem' }}>
            Need to join a team? Contact your tenant administrator for an invitation.
          </p>
        </section>
      </main>
    </div>
  )
}
|
||||
|
||||
export default HomePage
|
||||
109
mvp-platform-services/landing/src/components/TenantSignup.tsx
Normal file
109
mvp-platform-services/landing/src/components/TenantSignup.tsx
Normal file
@@ -0,0 +1,109 @@
|
||||
import React, { useEffect, useState } from 'react'
|
||||
import { useParams } from 'react-router-dom'
|
||||
import { useAuth0 } from '@auth0/auth0-react'
|
||||
import axios from 'axios'
|
||||
|
||||
/** Tenant fields this page reads from the tenants API response. */
type TenantInfo = {
  /** Tenant identifier; shown as the `{id}.motovaultpro.com` login host. */
  id: string
  /** Display name used in headings and on the signup button. */
  name: string
  status: string
}
|
||||
|
||||
/**
 * Tenant-specific signup page (`/signup/:tenantId`).
 *
 * Loads the tenant from the tenants API, then offers a button that starts the
 * Auth0 signup flow. Shows a "not found" view when the tenant cannot be
 * loaded or the route carries no tenant id.
 */
const TenantSignup: React.FC = () => {
  const { tenantId } = useParams<{ tenantId: string }>()
  const { loginWithRedirect } = useAuth0()
  const [tenant, setTenant] = useState<TenantInfo | null>(null)
  const [loading, setLoading] = useState(true)
  const [error, setError] = useState<string | null>(null)

  useEffect(() => {
    // Without a tenant id there is nothing to fetch: leave the loading state
    // instead of showing "Loading..." forever.
    if (!tenantId) {
      setTenant(null)
      setError('Tenant not found or not accepting signups')
      setLoading(false)
      return
    }

    // Set by the cleanup function so a response that arrives after unmount,
    // or after tenantId changed, never overwrites newer state.
    let cancelled = false

    // Reset state left over from a previously displayed tenant
    setLoading(true)
    setError(null)
    setTenant(null)

    const fetchTenant = async (id: string) => {
      try {
        const response = await axios.get<TenantInfo>(
          // The id comes from the URL, so encode it before using it as a path segment
          `${import.meta.env.VITE_TENANTS_API_URL}/api/v1/tenants/${encodeURIComponent(id)}`
        )
        if (!cancelled) {
          setTenant(response.data)
        }
      } catch {
        if (!cancelled) {
          setError('Tenant not found or not accepting signups')
        }
      } finally {
        if (!cancelled) {
          setLoading(false)
        }
      }
    }

    void fetchTenant(tenantId)

    return () => {
      cancelled = true
    }
  }, [tenantId])

  // Start the Auth0 flow on its signup screen; Auth0 returns to /callback
  const handleSignup = async () => {
    await loginWithRedirect({
      authorizationParams: {
        screen_hint: 'signup',
        redirect_uri: `${window.location.origin}/callback`
      }
    })
  }

  if (loading) {
    return <div style={{ padding: '2rem' }}>Loading...</div>
  }

  if (error || !tenant) {
    return (
      <div style={{ padding: '2rem', textAlign: 'center' }}>
        <h2>Tenant Not Found</h2>
        <p>{error}</p>
        <a href="/">Return to Homepage</a>
      </div>
    )
  }

  return (
    <div style={{ padding: '2rem', maxWidth: '600px', margin: '0 auto', fontFamily: 'Arial, sans-serif' }}>
      <header style={{ textAlign: 'center', marginBottom: '2rem' }}>
        <h1>Join {tenant.name}</h1>
        <p>Create your account to get started</p>
      </header>

      <main>
        <div style={{
          border: '1px solid #ddd',
          borderRadius: '8px',
          padding: '2rem',
          backgroundColor: '#f9f9f9'
        }}>
          <h3>What happens next?</h3>
          <ol>
            <li>Create your account with Auth0</li>
            <li>Your signup request will be sent to the tenant administrator</li>
            <li>Once approved, you'll receive access to {tenant.name}</li>
            <li>Login at <code>{tenant.id}.motovaultpro.com</code></li>
          </ol>

          <div style={{ textAlign: 'center', marginTop: '2rem' }}>
            <button
              onClick={handleSignup}
              style={{
                padding: '1rem 2rem',
                fontSize: '1.1rem',
                backgroundColor: '#28a745',
                color: 'white',
                border: 'none',
                borderRadius: '4px',
                cursor: 'pointer'
              }}
            >
              Create Account for {tenant.name}
            </button>
          </div>
        </div>

        <div style={{ textAlign: 'center', marginTop: '2rem' }}>
          <a href="/">← Back to Homepage</a>
        </div>
      </main>
    </div>
  )
}
|
||||
|
||||
export default TenantSignup
|
||||
12
mvp-platform-services/landing/src/main.tsx
Normal file
12
mvp-platform-services/landing/src/main.tsx
Normal file
@@ -0,0 +1,12 @@
|
||||
import React from 'react'
|
||||
import ReactDOM from 'react-dom/client'
|
||||
import { BrowserRouter } from 'react-router-dom'
|
||||
import App from './App'
|
||||
|
||||
// Application entry point: mount <App /> inside a router on the #root element.
// The element is looked up explicitly (no non-null assertion) so a missing
// mount point fails with a message that names the actual problem.
const rootElement = document.getElementById('root')
if (!rootElement) {
  throw new Error('Cannot start the app: element with id "root" was not found')
}

ReactDOM.createRoot(rootElement).render(
  <React.StrictMode>
    <BrowserRouter>
      <App />
    </BrowserRouter>
  </React.StrictMode>
)
|
||||
11
mvp-platform-services/landing/src/vite-env.d.ts
vendored
Normal file
11
mvp-platform-services/landing/src/vite-env.d.ts
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
/// <reference types="vite/client" />
|
||||
|
||||
/**
 * Typed view of the custom `VITE_*` variables available on `import.meta.env`.
 * All three are declared as read-only strings.
 */
interface ImportMetaEnv {
|
||||
readonly VITE_AUTH0_DOMAIN: string
|
||||
readonly VITE_AUTH0_CLIENT_ID: string
|
||||
readonly VITE_TENANTS_API_URL: string
|
||||
}
|
||||
|
||||
/** Declares `import.meta.env` with the `ImportMetaEnv` type. */
interface ImportMeta {
|
||||
readonly env: ImportMetaEnv
|
||||
}
|
||||
21
mvp-platform-services/landing/tsconfig.json
Normal file
21
mvp-platform-services/landing/tsconfig.json
Normal file
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES2020",
|
||||
"useDefineForClassFields": true,
|
||||
"lib": ["ES2020", "DOM", "DOM.Iterable"],
|
||||
"module": "ESNext",
|
||||
"skipLibCheck": true,
|
||||
"moduleResolution": "bundler",
|
||||
"allowImportingTsExtensions": true,
|
||||
"resolveJsonModule": true,
|
||||
"isolatedModules": true,
|
||||
"noEmit": true,
|
||||
"jsx": "react-jsx",
|
||||
"strict": true,
|
||||
"noUnusedLocals": true,
|
||||
"noUnusedParameters": true,
|
||||
"noFallthroughCasesInSwitch": true
|
||||
},
|
||||
"include": ["src"],
|
||||
"references": [{ "path": "./tsconfig.node.json" }]
|
||||
}
|
||||
9
mvp-platform-services/landing/tsconfig.node.json
Normal file
9
mvp-platform-services/landing/tsconfig.node.json
Normal file
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"composite": true,
|
||||
"skipLibCheck": true,
|
||||
"module": "ESNext",
|
||||
"moduleResolution": "bundler"
|
||||
},
|
||||
"include": ["vite.config.ts"]
|
||||
}
|
||||
14
mvp-platform-services/landing/vite.config.ts
Normal file
14
mvp-platform-services/landing/vite.config.ts
Normal file
@@ -0,0 +1,14 @@
|
||||
import { defineConfig } from 'vite'
|
||||
import react from '@vitejs/plugin-react'
|
||||
|
||||
// Dev server options: `host: true`, port 3000
const devServerOptions = {
  host: true,
  port: 3000
}

// Build options: output goes to dist/ and source maps are emitted
const buildOptions = {
  outDir: 'dist',
  sourcemap: true
}

/** Vite configuration for the landing site (React plugin, dev server, build output). */
export default defineConfig({
  plugins: [react()],
  server: devServerOptions,
  build: buildOptions
})
|
||||
333
mvp-platform-services/tenants/AUTH0-CONFIG.md
Normal file
333
mvp-platform-services/tenants/AUTH0-CONFIG.md
Normal file
@@ -0,0 +1,333 @@
|
||||
# Auth0 Multi-Tenant Configuration Guide
|
||||
|
||||
This document provides step-by-step instructions for configuring Auth0 for the multi-tenant MotoVaultPro platform.
|
||||
|
||||
## Overview
|
||||
|
||||
The multi-tenant architecture requires:
|
||||
- **Landing Page**: `motovaultpro.com` - Entry point with tenant selection
|
||||
- **Admin Tenant**: `admin.motovaultpro.com` - Admin access to all tenants
|
||||
- **Regular Tenants**: `{tenant-id}.motovaultpro.com` - Isolated tenant access
|
||||
- **Signup Workflow**: Tenant-specific signup with admin approval
|
||||
|
||||
## Auth0 Application Configuration
|
||||
|
||||
### 1. Application Settings
|
||||
|
||||
**Application Type**: Single Page Application (SPA)
|
||||
|
||||
**Allowed Callback URLs**:
|
||||
```
|
||||
# Development URLs
|
||||
http://localhost:3002/callback
|
||||
http://admin.motovaultpro.local/callback
|
||||
http://demo-tenant.motovaultpro.local/callback
|
||||
|
||||
# Production URLs
|
||||
https://motovaultpro.com/callback
|
||||
https://admin.motovaultpro.com/callback
|
||||
https://demo-tenant.motovaultpro.com/callback
|
||||
|
||||
# Add additional tenant subdomains as needed:
|
||||
https://{tenant-id}.motovaultpro.com/callback
|
||||
```
|
||||
|
||||
**Allowed Logout URLs**:
|
||||
```
|
||||
# Development
|
||||
http://localhost:3002
|
||||
http://admin.motovaultpro.local
|
||||
http://demo-tenant.motovaultpro.local
|
||||
|
||||
# Production
|
||||
https://motovaultpro.com
|
||||
https://admin.motovaultpro.com
|
||||
https://demo-tenant.motovaultpro.com
|
||||
https://{tenant-id}.motovaultpro.com
|
||||
```
|
||||
|
||||
**Allowed Web Origins**:
|
||||
```
|
||||
# Development
|
||||
http://localhost:3002
|
||||
http://admin.motovaultpro.local:3000
|
||||
http://demo-tenant.motovaultpro.local:3000
|
||||
|
||||
# Production
|
||||
https://motovaultpro.com
|
||||
https://admin.motovaultpro.com
|
||||
https://demo-tenant.motovaultpro.com
|
||||
https://{tenant-id}.motovaultpro.com
|
||||
```
|
||||
|
||||
### 2. JWT Configuration
|
||||
|
||||
**JWT Signature Algorithm**: RS256
|
||||
|
||||
**OIDC Conformant**: Enabled
|
||||
|
||||
### 3. Advanced Settings
|
||||
|
||||
**Grant Types**:
|
||||
- Authorization Code
|
||||
- Refresh Token
|
||||
- Implicit (for development only)
|
||||
|
||||
## Auth0 Rules Configuration
|
||||
|
||||
### Rule 1: Add Tenant Context to JWT
|
||||
|
||||
Create a new Rule in Auth0 Dashboard > Auth Pipeline > Rules:
|
||||
|
||||
```javascript
|
||||
function addTenantContext(user, context, callback) {
|
||||
const namespace = 'https://motovaultpro.com/';
|
||||
|
||||
// Extract tenant_id from user metadata (set during signup)
|
||||
let tenantId = user.user_metadata && user.user_metadata.tenant_id;
|
||||
|
||||
// For existing users without tenant metadata, default to admin
|
||||
if (!tenantId) {
|
||||
tenantId = 'admin';
|
||||
// Optionally update user metadata
|
||||
user.user_metadata = user.user_metadata || {};
|
||||
user.user_metadata.tenant_id = tenantId;
|
||||
}
|
||||
|
||||
// Check signup status for non-admin tenants
|
||||
const signupStatus = user.user_metadata && user.user_metadata.signup_status;
|
||||
|
||||
if (tenantId !== 'admin' && signupStatus !== 'approved') {
|
||||
// Block login for unapproved users
|
||||
return callback(new UnauthorizedError('Account pending approval'));
|
||||
}
|
||||
|
||||
// Add tenant context to tokens
|
||||
context.idToken[namespace + 'tenant_id'] = tenantId;
|
||||
context.accessToken[namespace + 'tenant_id'] = tenantId;
|
||||
context.idToken[namespace + 'signup_status'] = signupStatus || 'approved';
|
||||
|
||||
callback(null, user, context);
|
||||
}
|
||||
```
|
||||
|
||||
### Rule 2: Tenant-Specific User Metadata
|
||||
|
||||
```javascript
|
||||
/**
 * Auth0 Rule: on a user's first database-connection login, derive the tenant
 * from the redirect_uri subdomain and store tenant_id / signup_status in
 * user_metadata.
 *
 * The pipeline continues only after the metadata update has finished, and a
 * failed update is reported through the callback instead of being dropped.
 */
function setTenantMetadata(user, context, callback) {
  // If this is a signup and connection is Username-Password-Authentication
  const isFirstDatabaseLogin =
    context.stats.loginsCount === 1 &&
    context.connection === 'Username-Password-Authentication';

  if (!isFirstDatabaseLogin) {
    return callback(null, user, context);
  }

  // Extract tenant from redirect_uri
  const redirectUri = context.request.query.redirect_uri || '';
  const tenantMatch = redirectUri.match(/([a-z0-9-]+)\.motovaultpro\.(com|local)/);

  if (!tenantMatch) {
    return callback(null, user, context);
  }

  const tenantId = tenantMatch[1];

  // Set initial user metadata
  user.user_metadata = user.user_metadata || {};
  user.user_metadata.tenant_id = tenantId;

  // Set signup status (pending for regular tenants, approved for admin)
  user.user_metadata.signup_status = tenantId === 'admin' ? 'approved' : 'pending';

  // Update user metadata in Auth0, then continue the pipeline
  auth0.users.updateUserMetadata(user.user_id, user.user_metadata)
    .then(function () {
      callback(null, user, context);
    })
    .catch(function (err) {
      callback(err);
    });
}
|
||||
```
|
||||
|
||||
## Tenant Signup Flow Configuration
|
||||
|
||||
### 1. Signup URLs
|
||||
|
||||
**Tenant-Specific Signup**:
|
||||
```
|
||||
https://motovaultpro.com/signup/{tenant-id}
|
||||
```
|
||||
|
||||
**Process**:
|
||||
1. User visits tenant-specific signup URL
|
||||
2. Landing page validates tenant exists
|
||||
3. Redirects to Auth0 with tenant context
|
||||
4. Auth0 Rule sets tenant_id in user metadata
|
||||
5. User account created with status="pending"
|
||||
6. Tenant admin receives notification
|
||||
7. Admin approves/rejects via tenant management API
|
||||
|
||||
### 2. Auth0 Hosted Login Customization
|
||||
|
||||
Add custom CSS and JavaScript to Auth0 Universal Login to support tenant context:
|
||||
|
||||
**Custom CSS** (Dashboard > Universal Login > Advanced Options):
|
||||
```css
|
||||
.tenant-signup-info {
|
||||
background: #f8f9fa;
|
||||
padding: 15px;
|
||||
border-radius: 5px;
|
||||
margin-bottom: 20px;
|
||||
border-left: 4px solid #007bff;
|
||||
}
|
||||
```
|
||||
|
||||
**Custom JavaScript**:
|
||||
```javascript
|
||||
// Extract tenant from URL parameters
const urlParams = new URLSearchParams(window.location.search);
const redirectUri = urlParams.get('redirect_uri') || '';
const tenantMatch = redirectUri.match(/([a-z0-9-]+)\.motovaultpro\.(com|local)/);

if (tenantMatch && tenantMatch[1] !== 'admin') {
  // Turn the subdomain slug into a display name. The global regex replaces
  // every hyphen; a string pattern would only replace the first one
  // ("north-east-fleet" -> "NORTH EAST FLEET").
  const tenantName = tenantMatch[1].replace(/-/g, ' ').toUpperCase();

  // Add tenant information to signup form
  const container = document.querySelector('.auth0-lock-header');
  if (container) {
    const info = document.createElement('div');
    info.className = 'tenant-signup-info';
    // tenantName can only contain characters matched by [a-z0-9-] above
    // (upper-cased, hyphens turned into spaces), so it carries no markup.
    info.innerHTML = `
      <strong>Signing up for: ${tenantName}</strong><br>
      <small>Your account will require admin approval before you can access the system.</small>
    `;
    container.appendChild(info);
  }
}
|
||||
```
|
||||
|
||||
## JWT Token Format
|
||||
|
||||
After successful authentication, JWT tokens will include:
|
||||
|
||||
**ID Token Claims**:
|
||||
```json
|
||||
{
|
||||
"sub": "auth0|user-123",
|
||||
"email": "user@example.com",
|
||||
"https://motovaultpro.com/tenant_id": "demo-tenant",
|
||||
"https://motovaultpro.com/signup_status": "approved",
|
||||
"iat": 1699123456,
|
||||
"exp": 1699127056
|
||||
}
|
||||
```
|
||||
|
||||
**Access Token Claims**:
|
||||
```json
|
||||
{
|
||||
"sub": "auth0|user-123",
|
||||
"https://motovaultpro.com/tenant_id": "demo-tenant",
|
||||
"scope": "openid profile email",
|
||||
"iat": 1699123456,
|
||||
"exp": 1699127056
|
||||
}
|
||||
```
|
||||
|
||||
## Backend JWT Validation
|
||||
|
||||
Services should validate JWT tokens and extract tenant context:
|
||||
|
||||
```typescript
|
||||
// Example JWT validation middleware
|
||||
import jwt from 'jsonwebtoken';
|
||||
import jwksClient from 'jwks-rsa';
|
||||
|
||||
// JWKS client for the Auth0 tenant. The domain is read from process.env,
// the same source validateJWT uses for the issuer check.
const client = jwksClient({
  jwksUri: `https://${process.env.AUTH0_DOMAIN}/.well-known/jwks.json`
});
|
||||
|
||||
function getKey(header: any, callback: any) {
|
||||
client.getSigningKey(header.kid, (err, key) => {
|
||||
if (err) return callback(err);
|
||||
const signingKey = key.getPublicKey();
|
||||
callback(null, signingKey);
|
||||
});
|
||||
}
|
||||
|
||||
export const validateJWT = (token: string): Promise<any> => {
|
||||
return new Promise((resolve, reject) => {
|
||||
jwt.verify(token, getKey, {
|
||||
audience: process.env.AUTH0_AUDIENCE,
|
||||
issuer: `https://${process.env.AUTH0_DOMAIN}/`,
|
||||
algorithms: ['RS256']
|
||||
}, (err, decoded) => {
|
||||
if (err) return reject(err);
|
||||
resolve(decoded);
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
// Extract tenant from validated JWT
|
||||
export const getTenantFromToken = (decodedToken: any): string => {
|
||||
return decodedToken['https://motovaultpro.com/tenant_id'] || 'admin';
|
||||
};
|
||||
```
|
||||
|
||||
## Environment Variables
|
||||
|
||||
Configure the following environment variables for each service:
|
||||
|
||||
**Platform Services**:
|
||||
```env
|
||||
AUTH0_DOMAIN=your-domain.auth0.com
|
||||
AUTH0_AUDIENCE=https://api.motovaultpro.com
|
||||
```
|
||||
|
||||
**Landing Page Service**:
|
||||
```env
|
||||
VITE_AUTH0_DOMAIN=your-domain.auth0.com
|
||||
VITE_AUTH0_CLIENT_ID=your-client-id
|
||||
VITE_TENANTS_API_URL=http://mvp-platform-tenants:8000
|
||||
```
|
||||
|
||||
**Admin/Tenant Services**:
|
||||
```env
|
||||
REACT_APP_AUTH0_DOMAIN=your-domain.auth0.com
|
||||
REACT_APP_AUTH0_CLIENT_ID=your-client-id
|
||||
REACT_APP_AUTH0_AUDIENCE=https://api.motovaultpro.com
|
||||
REACT_APP_TENANT_ID=admin # or specific tenant ID
|
||||
```
|
||||
|
||||
## Testing the Configuration
|
||||
|
||||
### 1. Test Admin Login
|
||||
```bash
|
||||
# Visit admin tenant
|
||||
open http://admin.motovaultpro.local
|
||||
|
||||
# Should redirect to Auth0, login, then return to admin app
|
||||
```
|
||||
|
||||
### 2. Test Tenant Signup
|
||||
```bash
|
||||
# Visit tenant signup
|
||||
open http://motovaultpro.local/signup/demo-tenant
|
||||
|
||||
# Complete signup, verify pending status
|
||||
curl -H "Authorization: Bearer admin-token" \
|
||||
http://localhost:8001/api/v1/signups
|
||||
```
|
||||
|
||||
### 3. Test Approval Workflow
|
||||
```bash
|
||||
# Approve signup
|
||||
curl -X PUT -H "Authorization: Bearer admin-token" \
|
||||
http://localhost:8001/api/v1/signups/1/approve
|
||||
|
||||
# User should now be able to login to tenant
|
||||
open http://demo-tenant.motovaultpro.local
|
||||
```
|
||||
|
||||
## Production Deployment Notes
|
||||
|
||||
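1. **SSL Certificates**: Provision a wildcard SSL certificate for `*.motovaultpro.com`, and make sure the apex domain `motovaultpro.com` is covered as well (a wildcard certificate does not match the bare domain)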
|
||||
2. **DNS Configuration**: Set up wildcard DNS or individual A records per tenant
|
||||
3. **Auth0 Environment**: Use production Auth0 tenant with proper security settings
|
||||
4. **Rate Limiting**: Configure Auth0 rate limiting for signup endpoints
|
||||
5. **Monitoring**: Set up Auth0 logs monitoring for failed login attempts
|
||||
|
||||
This configuration provides a secure, scalable multi-tenant authentication system with proper tenant isolation and admin approval workflows.
|
||||
525
mvp-platform-services/tenants/api/main.py
Normal file
525
mvp-platform-services/tenants/api/main.py
Normal file
@@ -0,0 +1,525 @@
|
||||
"""
|
||||
MVP Platform Tenants Service - FastAPI Application
|
||||
Handles tenant management, signup approvals, and multi-tenant infrastructure.
|
||||
"""
|
||||
|
||||
from fastapi import FastAPI, HTTPException, Depends, Header
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
import asyncpg
|
||||
import os
|
||||
import json
|
||||
import httpx
|
||||
from typing import Optional, List, Dict
|
||||
from pydantic import BaseModel
|
||||
from datetime import datetime
|
||||
import logging
|
||||
from jose import jwt, jwk
|
||||
from jose.exceptions import JWTError, ExpiredSignatureError
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
app = FastAPI(
|
||||
title="MVP Platform Tenants Service",
|
||||
description="Multi-tenant management and signup approval service",
|
||||
version="1.0.0"
|
||||
)
|
||||
|
||||
# CORS middleware
# Allowed origins come from CORS_ALLOWED_ORIGINS (comma-separated list);
# when the variable is unset every origin is allowed, as before.
_cors_allowed_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOWED_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_allowed_origins,
    # Credentialed cross-origin requests are only enabled for an explicit
    # origin list; combining them with a wildcard would let any site call
    # this API with the visitor's credentials.
    allow_credentials="*" not in _cors_allowed_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
# Auth0 configuration
|
||||
AUTH0_DOMAIN = os.getenv("AUTH0_DOMAIN")
|
||||
AUTH0_AUDIENCE = os.getenv("AUTH0_AUDIENCE", "https://api.motovaultpro.com")
|
||||
|
||||
# Cache for JWKS keys (in production, use Redis)
|
||||
_jwks_cache = {}
|
||||
_jwks_cache_expiry = 0
|
||||
|
||||
# Database connection
|
||||
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://platform_user:platform_pass@platform-postgres:5432/platform")
|
||||
|
||||
# Helper function to parse JSON settings
|
||||
def parse_json_field(value):
    """Normalise a JSON column value to a Python object, defaulting to ``{}``.

    Args:
        value: Either a JSON-encoded string or an already-decoded value.

    Returns:
        The decoded value. An empty dict is returned when the string is not
        valid JSON or when the (decoded) value is falsy, e.g. ``None``,
        ``""`` or the JSON literal ``null``.
    """
    if isinstance(value, str):
        try:
            value = json.loads(value)
        except json.JSONDecodeError:
            return {}
    # Same fallback for decoded strings and pre-decoded values, so a stored
    # "null" yields {} exactly like a None value does.
    return value or {}
|
||||
|
||||
# Models
|
||||
class TenantCreate(BaseModel):
    """Request body for creating a tenant."""

    # Required identity fields
    id: str
    name: str
    subdomain: str

    # Optional fields with defaults
    admin_user_id: Optional[str] = None
    settings: dict = {}
|
||||
|
||||
class TenantResponse(BaseModel):
    """Tenant representation returned by the API."""

    id: str
    name: str
    subdomain: str
    status: str
    admin_user_id: Optional[str]
    settings: dict
    created_at: datetime
    updated_at: datetime

    @classmethod
    def from_db_row(cls, row):
        """Build a response from a database row.

        The row is converted to a dict and its ``settings`` entry is passed
        through ``parse_json_field`` before the model is constructed.
        """
        record = dict(row)
        decoded_settings = parse_json_field(record.get('settings'))
        return cls(**{**record, 'settings': decoded_settings})
|
||||
|
||||
class SignupRequest(BaseModel):
    """Signup request payload: the user's email plus an optional Auth0 id."""

    user_email: str
    # Left as None when no Auth0 id is supplied
    user_auth0_id: Optional[str] = None
|
||||
|
||||
class SignupResponse(BaseModel):
    """Signup record returned by the API, including its review outcome fields."""

    # Request details
    id: int
    tenant_id: str
    user_email: str
    user_auth0_id: Optional[str]
    status: str
    requested_at: datetime

    # Review outcome; each field defaults to None
    approved_by: Optional[str] = None
    approved_at: Optional[datetime] = None
    rejected_at: Optional[datetime] = None
    rejection_reason: Optional[str] = None
|
||||
|
||||
class SignupApproval(BaseModel):
    """Body carrying an optional free-text reason for a signup decision."""

    reason: Optional[str] = None
|
||||
|
||||
# JWT Authentication functions
|
||||
async def get_jwks() -> Dict:
    """Return the Auth0 JSON Web Key Set, using a module-level one-hour cache.

    Raises:
        HTTPException: 500 when AUTH0_DOMAIN is not configured or the JWKS
            document cannot be fetched.
    """
    global _jwks_cache, _jwks_cache_expiry
    import time

    now = time.time()

    # Serve from the cache while it is populated and not yet expired
    cache_is_fresh = _jwks_cache and now < _jwks_cache_expiry
    if cache_is_fresh:
        return _jwks_cache

    if not AUTH0_DOMAIN:
        raise HTTPException(status_code=500, detail="Auth0 configuration missing")

    try:
        async with httpx.AsyncClient() as http_client:
            reply = await http_client.get(f"https://{AUTH0_DOMAIN}/.well-known/jwks.json")
            reply.raise_for_status()
            key_set = reply.json()

            # Remember the key set for the next hour (3600 seconds)
            _jwks_cache, _jwks_cache_expiry = key_set, now + 3600

            return key_set
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to fetch JWKS: {str(e)}")
|
||||
|
||||
async def get_signing_key(kid: str) -> str:
    """Return the PEM-encoded public key whose JWKS entry has the given ``kid``.

    Args:
        kid: Key id taken from the token header.

    Returns:
        The matching key serialised as a PEM string, a form ``jwt.decode``
        accepts as its key argument.

    Raises:
        HTTPException: 401 when no key in the JWKS carries this ``kid``.
    """
    jwks = await get_jwks()

    for key in jwks.get("keys", []):
        if key.get("kid") == kid:
            # Serialise through the documented Key API (to_pem) rather than
            # reading an attribute of the constructed key object.
            return jwk.construct(key).to_pem().decode("utf-8")

    raise HTTPException(status_code=401, detail="Unable to find appropriate key")
|
||||
|
||||
async def verify_jwt(token: str) -> Dict:
    """Verify an RS256 Auth0 token and return its decoded claims.

    The signing key is selected by the ``kid`` in the token header; signature,
    audience and issuer are checked by ``jwt.decode``.

    Raises:
        HTTPException: 500 when Auth0 settings are missing; 401 when the token
            is expired, malformed or otherwise fails validation. An
            HTTPException raised while looking up the key is passed through
            with its original status and detail.
    """
    if not AUTH0_DOMAIN or not AUTH0_AUDIENCE:
        raise HTTPException(status_code=500, detail="Auth0 configuration missing")

    try:
        # Get the kid from token header
        unverified_header = jwt.get_unverified_header(token)
        kid = unverified_header.get("kid")

        if not kid:
            raise HTTPException(status_code=401, detail="Token header missing kid")

        # Get the signing key
        signing_key = await get_signing_key(kid)

        # Verify and decode the token
        return jwt.decode(
            token,
            signing_key,
            algorithms=["RS256"],
            audience=AUTH0_AUDIENCE,
            issuer=f"https://{AUTH0_DOMAIN}/"
        )

    except HTTPException:
        # Already a well-formed API error (missing kid, unknown key, JWKS
        # fetch failure): do not re-wrap it in the generic handler below.
        raise
    except ExpiredSignatureError:
        raise HTTPException(status_code=401, detail="Token has expired")
    except JWTError as e:
        raise HTTPException(status_code=401, detail=f"Invalid token: {str(e)}")
    except Exception as e:
        raise HTTPException(status_code=401, detail=f"Token validation failed: {str(e)}")
|
||||
|
||||
# Mock authentication for development/testing
|
||||
async def mock_auth_user(authorization: str) -> Dict:
    """Return fake token claims for the fixed development tokens.

    Recognised values after the ``Bearer `` prefix:
      * ``admin-token``: claims for the admin tenant
      * ``tenant-...``: claims for the tenant id left after removing the
        leading ``tenant-`` and every ``-token`` occurrence

    Raises:
        HTTPException: 401 when the header is missing, is not a ``Bearer``
            header, or the token is not one of the recognised forms.
    """
    has_bearer_prefix = bool(authorization) and authorization.startswith("Bearer ")
    if not has_bearer_prefix:
        raise HTTPException(status_code=401, detail="Authorization header required")

    token = authorization.split(" ")[1]

    if token == "admin-token":
        subject = "admin-user"
        email = "admin@motovaultpro.com"
        tenant_id = "admin"
    elif token.startswith("tenant-"):
        tenant_id = token.replace("tenant-", "", 1).replace("-token", "")
        subject = f"{tenant_id}-admin"
        email = f"admin@{tenant_id}.com"
    else:
        raise HTTPException(status_code=401, detail="Invalid token")

    return {
        "sub": subject,
        "email": email,
        "https://motovaultpro.com/tenant_id": tenant_id,
        "https://motovaultpro.com/signup_status": "approved"
    }
|
||||
|
||||
async def get_current_user(authorization: str = Header(None)):
    """Authenticate the request from its ``Authorization: Bearer <token>`` header.

    When Auth0 is configured the token must pass real JWT validation. The mock
    tokens are honoured only when Auth0 is not configured, so a rejected JWT
    can no longer be retried as a mock token.

    Returns:
        dict with ``sub``, ``tenant_id``, ``email`` and the full ``payload``.

    Raises:
        HTTPException: 401 for a missing or malformed header, a non-bearer
            scheme, or a token that fails validation.
    """
    if not authorization:
        raise HTTPException(status_code=401, detail="Authorization header required")

    try:
        scheme, token = authorization.split(" ", 1)
    except ValueError:
        raise HTTPException(status_code=401, detail="Invalid authorization header format")

    if scheme.lower() != "bearer":
        raise HTTPException(status_code=401, detail="Invalid authentication scheme")

    if AUTH0_DOMAIN and AUTH0_AUDIENCE:
        # Real validation; failures propagate to the client as-is
        payload = await verify_jwt(token)
    else:
        # Development/testing only: no Auth0 settings present
        payload = await mock_auth_user(authorization)

    # Extract tenant info from JWT claims
    # NOTE(review): a token without the tenant claim is treated as "admin";
    # confirm this default is intended for every token type the API accepts.
    tenant_id = payload.get("https://motovaultpro.com/tenant_id", "admin")
    user_id = payload.get("sub", "")
    email = payload.get("email", "")

    return {
        "sub": user_id,
        "tenant_id": tenant_id,
        "email": email,
        "payload": payload
    }
|
||||
|
||||
async def get_admin_user(current_user: dict = Depends(get_current_user)):
    """Dependency that admits only users whose tenant is ``admin``.

    Returns the authenticated user dict unchanged; raises a 403
    HTTPException for every other tenant.
    """
    belongs_to_admin_tenant = current_user.get("tenant_id") == "admin"
    if belongs_to_admin_tenant:
        return current_user
    raise HTTPException(status_code=403, detail="Admin access required")
|
||||
|
||||
async def get_tenant_admin(current_user: dict = Depends(get_current_user)):
    """Dependency that requires the authenticated user to carry a tenant id.

    Only the presence of ``tenant_id`` is checked here; endpoints compare it
    against the tenant being accessed themselves. Raises a 401 HTTPException
    when the value is missing or empty.
    """
    caller_tenant = current_user.get("tenant_id")
    if caller_tenant:
        return current_user
    raise HTTPException(status_code=401, detail="Tenant authentication required")
|
||||
|
||||
# Health check
|
||||
@app.get("/health")
async def health_check():
    """Report service health by running ``SELECT 1`` on a fresh connection.

    Returns a status document when the database answers.

    Raises:
        HTTPException: 503 when connecting, querying or closing fails.
    """
    try:
        conn = await asyncpg.connect(DATABASE_URL)
        try:
            await conn.execute("SELECT 1")
        finally:
            # Close even when the probe query fails so repeated failing
            # health checks do not leak connections.
            await conn.close()
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        raise HTTPException(status_code=503, detail="Service unavailable")

    return {
        "status": "healthy",
        "database": "connected",
        "service": "mvp-platform-tenants",
        "version": "1.0.0"
    }
|
||||
|
||||
# Tenant management endpoints
|
||||
@app.post("/api/v1/tenants", response_model=TenantResponse)
async def create_tenant(
    tenant_data: TenantCreate,
    current_user: dict = Depends(get_admin_user)
):
    """Create new tenant (admin only).

    Returns the inserted row as a ``TenantResponse``.

    Raises:
        HTTPException: 409 when the tenant id or subdomain is already taken,
            whether detected by the pre-check or by the database's unique
            constraints at insert time.
    """
    conn = await asyncpg.connect(DATABASE_URL)
    try:
        # Fast path: give a clean 409 without attempting the insert.
        existing = await conn.fetchrow(
            "SELECT id FROM tenants WHERE id = $1 OR subdomain = $2",
            tenant_data.id, tenant_data.subdomain
        )
        if existing:
            raise HTTPException(status_code=409, detail="Tenant ID or subdomain already exists")

        try:
            result = await conn.fetchrow(
                """
                INSERT INTO tenants (id, name, subdomain, admin_user_id, settings)
                VALUES ($1, $2, $3, $4, $5)
                RETURNING *
                """,
                tenant_data.id,
                tenant_data.name,
                tenant_data.subdomain,
                tenant_data.admin_user_id,
                json.dumps(tenant_data.settings)
            )
        except asyncpg.UniqueViolationError:
            # Another request created the same id/subdomain between the check
            # above and this insert; report it as a conflict, not a 500.
            raise HTTPException(status_code=409, detail="Tenant ID or subdomain already exists")

        return TenantResponse.from_db_row(result)
    finally:
        await conn.close()
|
||||
|
||||
@app.get("/api/v1/tenants", response_model=List[TenantResponse])
async def list_tenants(current_user: dict = Depends(get_admin_user)):
    """Return every tenant, most recently created first (admin only)."""
    db = await asyncpg.connect(DATABASE_URL)
    try:
        rows = await db.fetch("SELECT * FROM tenants ORDER BY created_at DESC")
        tenants = []
        for row in rows:
            tenants.append(TenantResponse.from_db_row(row))
        return tenants
    finally:
        # The connection is per-request, so always release it.
        await db.close()
|
||||
|
||||
@app.get("/api/v1/tenants/{tenant_id}", response_model=TenantResponse)
async def get_tenant(tenant_id: str):
    """Look up one tenant by id (public, used for validation).

    Raises a 404 HTTPException when no tenant has this id.
    """
    db = await asyncpg.connect(DATABASE_URL)
    try:
        row = await db.fetchrow("SELECT * FROM tenants WHERE id = $1", tenant_id)
        if row:
            return TenantResponse.from_db_row(row)
        raise HTTPException(status_code=404, detail="Tenant not found")
    finally:
        await db.close()
|
||||
|
||||
@app.put("/api/v1/tenants/{tenant_id}", response_model=TenantResponse)
async def update_tenant(
    tenant_id: str,
    tenant_data: TenantCreate,
    current_user: dict = Depends(get_admin_user)
):
    """Overwrite a tenant's name, admin user and settings (admin only).

    The tenant is addressed by the path parameter; ``tenant_data.id`` and
    ``tenant_data.subdomain`` are not written. Raises a 404 HTTPException when
    the tenant does not exist.
    """
    update_sql = """
        UPDATE tenants
        SET name = $2, admin_user_id = $3, settings = $4, updated_at = CURRENT_TIMESTAMP
        WHERE id = $1
        RETURNING *
        """
    db = await asyncpg.connect(DATABASE_URL)
    try:
        settings_json = json.dumps(tenant_data.settings)
        updated = await db.fetchrow(
            update_sql,
            tenant_id,
            tenant_data.name,
            tenant_data.admin_user_id,
            settings_json
        )
        if updated:
            return TenantResponse.from_db_row(updated)
        # No row returned means the WHERE clause matched nothing.
        raise HTTPException(status_code=404, detail="Tenant not found")
    finally:
        await db.close()
|
||||
|
||||
# Signup management endpoints
|
||||
@app.post("/api/v1/tenants/{tenant_id}/signups", response_model=SignupResponse)
async def request_signup(tenant_id: str, signup_data: SignupRequest):
    """Record a signup request for a tenant (public endpoint).

    Raises:
        HTTPException: 404 for an unknown tenant, 400 when the tenant's status
            is not ``active``, 409 when this email already has a request for
            the tenant.
    """
    db = await asyncpg.connect(DATABASE_URL)
    try:
        # The tenant must exist and be active.
        tenant_row = await db.fetchrow(
            "SELECT id, status FROM tenants WHERE id = $1", tenant_id
        )
        if not tenant_row:
            raise HTTPException(status_code=404, detail="Tenant not found")
        if tenant_row['status'] != 'active':
            raise HTTPException(status_code=400, detail="Tenant not accepting signups")

        # One request per (tenant, email).
        duplicate = await db.fetchrow(
            "SELECT id FROM tenant_signups WHERE tenant_id = $1 AND user_email = $2",
            tenant_id, signup_data.user_email
        )
        if duplicate:
            raise HTTPException(status_code=409, detail="Signup request already exists")

        created = await db.fetchrow(
            """
            INSERT INTO tenant_signups (tenant_id, user_email, user_auth0_id)
            VALUES ($1, $2, $3)
            RETURNING *
            """,
            tenant_id,
            signup_data.user_email,
            signup_data.user_auth0_id
        )

        logger.info(f"New signup request: {signup_data.user_email} for tenant {tenant_id}")
        created_fields = dict(created)
        return SignupResponse(**created_fields)
    finally:
        await db.close()
|
||||
|
||||
@app.get("/api/v1/tenants/{tenant_id}/signups", response_model=List[SignupResponse])
async def get_tenant_signups(
    tenant_id: str,
    status: Optional[str] = "pending",
    current_user: dict = Depends(get_tenant_admin)
):
    """List a tenant's signup requests, newest first (tenant admin only).

    ``status`` filters by signup status; an empty value disables the filter.
    Callers may only read their own tenant unless they belong to the
    ``admin`` tenant, otherwise a 403 HTTPException is raised.
    """
    caller_tenant = current_user.get("tenant_id")
    if caller_tenant not in (tenant_id, "admin"):
        raise HTTPException(status_code=403, detail="Access denied to this tenant")

    db = await asyncpg.connect(DATABASE_URL)
    try:
        sql_parts = ["SELECT * FROM tenant_signups WHERE tenant_id = $1"]
        args = [tenant_id]
        if status:
            sql_parts.append(" AND status = $2")
            args.append(status)
        sql_parts.append(" ORDER BY requested_at DESC")

        rows = await db.fetch("".join(sql_parts), *args)
        signups = []
        for row in rows:
            signups.append(SignupResponse(**dict(row)))
        return signups
    finally:
        await db.close()
|
||||
|
||||
@app.get("/api/v1/signups", response_model=List[SignupResponse])
async def get_all_signups(
    status: Optional[str] = "pending",
    current_user: dict = Depends(get_admin_user)
):
    """List signup requests across every tenant, newest first (admin only).

    ``status`` filters by signup status; an empty value returns all rows.
    """
    db = await asyncpg.connect(DATABASE_URL)
    try:
        if status:
            sql = "SELECT * FROM tenant_signups" + " WHERE status = $1"
            args = [status]
        else:
            sql = "SELECT * FROM tenant_signups"
            args = []
        sql = sql + " ORDER BY requested_at DESC"

        rows = await db.fetch(sql, *args)
        signups = []
        for row in rows:
            signups.append(SignupResponse(**dict(row)))
        return signups
    finally:
        await db.close()
|
||||
|
||||
@app.put("/api/v1/signups/{signup_id}/approve")
async def approve_signup(
    signup_id: int,
    current_user: dict = Depends(get_tenant_admin)
):
    """Mark a pending signup as approved (tenant admin only).

    Raises:
        HTTPException: 404 when the signup does not exist or is no longer
            pending, 403 when the caller belongs to neither the signup's
            tenant nor the ``admin`` tenant.
    """
    db = await asyncpg.connect(DATABASE_URL)
    try:
        # Load the signup first so the caller's tenant can be checked.
        pending = await db.fetchrow(
            "SELECT * FROM tenant_signups WHERE id = $1", signup_id
        )
        if not pending:
            raise HTTPException(status_code=404, detail="Signup not found")

        caller_tenant = current_user.get("tenant_id")
        if caller_tenant not in (pending['tenant_id'], "admin"):
            raise HTTPException(status_code=403, detail="Access denied to this tenant")

        approver = current_user['sub']
        # The status guard in WHERE makes the transition happen at most once.
        approved = await db.fetchrow(
            """
            UPDATE tenant_signups
            SET status = 'approved', approved_by = $2, approved_at = CURRENT_TIMESTAMP
            WHERE id = $1 AND status = 'pending'
            RETURNING *
            """,
            signup_id,
            approver
        )
        if not approved:
            raise HTTPException(status_code=404, detail="Signup not found or already processed")

        # TODO: Update Auth0 user metadata to set signup_status = 'approved'
        logger.info(f"Approved signup {signup_id} for user {approved['user_email']} by {current_user['sub']}")

        return {"status": "approved", "signup_id": signup_id}
    finally:
        await db.close()
|
||||
|
||||
@app.put("/api/v1/signups/{signup_id}/reject")
async def reject_signup(
    signup_id: int,
    approval_data: SignupApproval,
    current_user: dict = Depends(get_tenant_admin)
):
    """Mark a pending signup as rejected (tenant admin only).

    The rejection reason comes from ``approval_data.reason`` and falls back to
    ``"No reason provided"``. The rejecting user is stored in ``approved_by``.

    Raises:
        HTTPException: 404 when the signup does not exist or is no longer
            pending, 403 when the caller belongs to neither the signup's
            tenant nor the ``admin`` tenant.
    """
    db = await asyncpg.connect(DATABASE_URL)
    try:
        # Load the signup first so the caller's tenant can be checked.
        pending = await db.fetchrow(
            "SELECT * FROM tenant_signups WHERE id = $1", signup_id
        )
        if not pending:
            raise HTTPException(status_code=404, detail="Signup not found")

        caller_tenant = current_user.get("tenant_id")
        if caller_tenant not in (pending['tenant_id'], "admin"):
            raise HTTPException(status_code=403, detail="Access denied to this tenant")

        if approval_data.reason:
            reason = approval_data.reason
        else:
            reason = "No reason provided"

        # The status guard in WHERE makes the transition happen at most once.
        rejected = await db.fetchrow(
            """
            UPDATE tenant_signups
            SET status = 'rejected', approved_by = $2, rejected_at = CURRENT_TIMESTAMP, rejection_reason = $3
            WHERE id = $1 AND status = 'pending'
            RETURNING *
            """,
            signup_id,
            current_user['sub'],
            reason
        )
        if not rejected:
            raise HTTPException(status_code=404, detail="Signup not found or already processed")

        logger.info(f"Rejected signup {signup_id} for user {rejected['user_email']}: {reason}")

        return {"status": "rejected", "signup_id": signup_id, "reason": reason}
    finally:
        await db.close()
|
||||
|
||||
if __name__ == "__main__":
    # Direct execution (`python main.py`): serve the app on all interfaces, port 8000.
    import uvicorn

    uvicorn.run(app=app, host="0.0.0.0", port=8000)
|
||||
21
mvp-platform-services/tenants/docker/Dockerfile.api
Normal file
21
mvp-platform-services/tenants/docker/Dockerfile.api
Normal file
@@ -0,0 +1,21 @@
|
||||
# Runtime image for the tenants API
FROM python:3.11-slim

WORKDIR /app

# System packages: gcc (presumably needed to build Python wheels with C
# extensions - confirm). The apt package lists are deleted in the same layer.
RUN apt-get update && apt-get install -y \
    gcc \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the code so this layer is reused
# while requirements.txt is unchanged
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Application code (contents of api/ land directly in /app)
COPY api/ .

# Port uvicorn listens on
EXPOSE 8000

# Start the API
# NOTE(review): --reload enables uvicorn's file watcher, which is a development
# feature; confirm whether this image is also used for production.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
|
||||
7
mvp-platform-services/tenants/requirements.txt
Normal file
7
mvp-platform-services/tenants/requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
fastapi==0.104.1
|
||||
uvicorn[standard]==0.24.0
|
||||
asyncpg==0.29.0
|
||||
pydantic==2.5.0
|
||||
python-jose[cryptography]==3.3.0
|
||||
python-multipart==0.0.6
|
||||
httpx==0.25.2
|
||||
@@ -0,0 +1,41 @@
|
||||
-- MVP Platform Tenants Service: tenant registry schema.
-- Every statement is idempotent (IF NOT EXISTS / ON CONFLICT DO NOTHING).

-- Tenant registry
CREATE TABLE IF NOT EXISTS tenants (
    -- Tenant key, e.g. 'admin', 'acme-corp'
    id VARCHAR(100) PRIMARY KEY,
    -- Display name
    name VARCHAR(255) NOT NULL,
    -- Same as id for simplicity
    subdomain VARCHAR(100) UNIQUE NOT NULL,
    -- One of: active, pending, suspended
    status VARCHAR(50) DEFAULT 'active',
    -- Auth0 user ID of tenant admin
    admin_user_id VARCHAR(255),
    -- Tenant-specific configuration
    settings JSONB DEFAULT '{}',
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Lookup indexes on tenants
CREATE INDEX IF NOT EXISTS idx_tenants_status ON tenants(status);
CREATE INDEX IF NOT EXISTS idx_tenants_admin_user ON tenants(admin_user_id);

-- Signup approval workflow: one row per signup request
CREATE TABLE IF NOT EXISTS tenant_signups (
    id SERIAL PRIMARY KEY,
    tenant_id VARCHAR(100) REFERENCES tenants(id),
    user_email VARCHAR(255) NOT NULL,
    -- Auth0 user ID after signup
    user_auth0_id VARCHAR(255),
    -- One of: pending, approved, rejected
    status VARCHAR(50) DEFAULT 'pending',
    requested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Auth0 ID of approving admin
    approved_by VARCHAR(255),
    approved_at TIMESTAMP,
    rejected_at TIMESTAMP,
    rejection_reason TEXT
);

-- Indexes backing the signup listing and duplicate-check queries
CREATE INDEX IF NOT EXISTS idx_tenant_signups_tenant_status ON tenant_signups(tenant_id, status);
CREATE INDEX IF NOT EXISTS idx_tenant_signups_user_email ON tenant_signups(user_email);

-- Seed the built-in admin tenant (skipped when it already exists)
INSERT INTO tenants (id, name, subdomain, status, admin_user_id)
VALUES ('admin', 'Admin Tenant', 'admin', 'active', NULL)
ON CONFLICT (id) DO NOTHING;
|
||||
42
mvp-platform-services/vehicles/README.md
Normal file
42
mvp-platform-services/vehicles/README.md
Normal file
@@ -0,0 +1,42 @@
|
||||
# MVP Platform Vehicles Service
|
||||
|
||||
## Schema Bootstrapping (Docker-First)
|
||||
- Database: PostgreSQL, service `mvp-platform-vehicles-db`.
|
||||
- On first start, schema files from `mvp-platform-services/vehicles/sql/schema` are executed automatically because the folder is mounted to `/docker-entrypoint-initdb.d` in `docker-compose.yml`.
|
||||
- Files run in lexicographic order:
|
||||
- `001_schema.sql` – creates `vehicles` schema and tables
|
||||
- `002_constraints_indexes.sql` – adds uniques and indexes
|
||||
- `003_seed_minimal.sql` – seeds minimal dropdown data for sanity checks
|
||||
|
||||
## When Do Files Run?
|
||||
- Only on the initial database initialization (i.e., when the Postgres data volume is empty).
|
||||
- Subsequent `make start` runs will not reapply these files unless you reset the volume.
|
||||
|
||||
## Applying Schema Changes
|
||||
- Option 1 (fresh reset):
|
||||
1. `make clean` to remove volumes
|
||||
2. `make start` (the `.sql` files will be reapplied)
|
||||
- Option 2 (manual apply to existing DB):
|
||||
- Exec into the DB container and run the SQL files in order:
|
||||
```bash
|
||||
docker compose exec mvp-platform-vehicles-db bash -lc "psql -U mvp_platform_user -d vehicles -f /docker-entrypoint-initdb.d/001_schema.sql"
|
||||
docker compose exec mvp-platform-vehicles-db bash -lc "psql -U mvp_platform_user -d vehicles -f /docker-entrypoint-initdb.d/002_constraints_indexes.sql"
|
||||
docker compose exec mvp-platform-vehicles-db bash -lc "psql -U mvp_platform_user -d vehicles -f /docker-entrypoint-initdb.d/003_seed_minimal.sql"
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
```bash
|
||||
make start
|
||||
make logs-platform-vehicles # View API + DB logs
|
||||
```
|
||||
|
||||
## Endpoint Summary (Auth Required: Authorization: Bearer <API_KEY>)
|
||||
- `GET /api/v1/vehicles/years` → `[number]`
|
||||
- `GET /api/v1/vehicles/makes?year=YYYY` → `{ makes: [{id,name}] }`
|
||||
- `GET /api/v1/vehicles/models?year=YYYY&make_id=ID` → `{ models: [...] }`
|
||||
- `GET /api/v1/vehicles/trims?year=YYYY&make_id=ID&model_id=ID` → `{ trims: [...] }`
|
||||
- `GET /api/v1/vehicles/engines?year=YYYY&make_id=ID&model_id=ID&trim_id=ID` → `{ engines: [...] }`
|
||||
|
||||
## Notes
|
||||
- Transmissions and performance tables exist for future use; no endpoints yet.
|
||||
- VIN decode endpoints are pending rebuild and not documented here.
|
||||
0
mvp-platform-services/vehicles/api/__init__.py
Normal file
0
mvp-platform-services/vehicles/api/__init__.py
Normal file
43
mvp-platform-services/vehicles/api/config.py
Normal file
43
mvp-platform-services/vehicles/api/config.py
Normal file
@@ -0,0 +1,43 @@
|
||||
import os
|
||||
from pydantic_settings import BaseSettings
|
||||
from typing import List
|
||||
|
||||
class Settings(BaseSettings):
    """Application configuration.

    ``BaseSettings`` populates each field from the environment variable with
    the same name (matched case-sensitively, see ``Config``) and validates it
    against the annotated type. The literal on each line is only the fallback
    used when the variable is unset, so no explicit ``os.getenv`` lookup or
    manual ``int(...)`` cast is needed; a malformed value is reported as a
    validation error when ``Settings()`` is instantiated instead of raising
    at class-definition time.
    """

    # Database settings
    POSTGRES_HOST: str = "localhost"
    POSTGRES_PORT: int = 5432
    POSTGRES_USER: str = "mvp_platform_user"
    # NOTE(review): hard-coded fallback credential; set POSTGRES_PASSWORD in
    # the environment for anything other than local development.
    POSTGRES_PASSWORD: str = "platform123"
    POSTGRES_DATABASE: str = "vpic"

    # Redis settings
    REDIS_HOST: str = "localhost"
    REDIS_PORT: int = 6379
    REDIS_DB: int = 0

    # Database connection pool settings
    DATABASE_MIN_CONNECTIONS: int = 5
    DATABASE_MAX_CONNECTIONS: int = 20

    # Cache settings
    CACHE_TTL: int = 3600  # 1 hour default

    # Security
    # NOTE(review): hard-coded fallback service token; set API_KEY in the
    # environment for anything other than local development.
    API_KEY: str = "mvp-platform-vehicles-secret-key"

    # Application settings
    DEBUG: bool = False
    CORS_ORIGINS: List[str] = [
        "http://localhost:3000",
        "https://motovaultpro.com",
        "http://localhost:3001"
    ]

    class Config:
        case_sensitive = True
|
||||
|
||||
def get_settings() -> Settings:
    """Build and return a new ``Settings`` instance on every call."""
    current_settings = Settings()
    return current_settings
|
||||
40
mvp-platform-services/vehicles/api/dependencies.py
Normal file
40
mvp-platform-services/vehicles/api/dependencies.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import asyncpg
|
||||
import redis.asyncio as redis
|
||||
from fastapi import Request, Depends, HTTPException
|
||||
import logging
|
||||
from .config import get_settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
settings = get_settings()
|
||||
|
||||
async def get_db_pool(request: Request) -> asyncpg.Pool:
    """Return the database pool stored on the application state as ``db_pool``."""
    app_state = request.app.state
    return app_state.db_pool
|
||||
|
||||
async def get_db(request: Request) -> asyncpg.Connection:
    """Yield one connection acquired from the application's pool.

    This is an async generator: the connection is handed to the consumer at
    the ``yield`` and released back to the pool when the generator resumes
    and the ``async with`` block exits.
    """
    db_pool = request.app.state.db_pool
    async with db_pool.acquire() as connection:
        yield connection
|
||||
|
||||
async def get_redis_client(request: Request) -> redis.Redis:
    """Return the Redis client stored on the application state as ``redis_client``."""
    app_state = request.app.state
    return app_state.redis_client
|
||||
|
||||
async def get_cache(request: Request):
    """Return the cache service stored on the application state as ``cache_service``."""
    app_state = request.app.state
    return app_state.cache_service
|
||||
|
||||
async def verify_bearer_token(request: Request) -> str:
    """Verify Bearer token for service-to-service authentication

    Expects header: Authorization: Bearer <token>
    Compares token to settings.API_KEY

    Returns:
        The presented token when it matches.

    Raises:
        HTTPException: 401 when the header is absent, does not start with
            ``Bearer ``, or the token does not match.
    """
    import hmac

    auth_header = request.headers.get("Authorization", "")
    if not auth_header.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Missing or invalid Authorization header")
    token = auth_header.split(" ", 1)[1].strip()
    # Constant-time comparison so response timing does not reveal how much of
    # the secret a guess got right. Encoding to bytes keeps compare_digest
    # usable for non-ASCII input, which it rejects for str arguments.
    token_matches = hmac.compare_digest(
        token.encode("utf-8"), settings.API_KEY.encode("utf-8")
    )
    if not token_matches:
        raise HTTPException(status_code=401, detail="Invalid service token")
    return token
|
||||
202
mvp-platform-services/vehicles/api/main.py
Normal file
202
mvp-platform-services/vehicles/api/main.py
Normal file
@@ -0,0 +1,202 @@
|
||||
import logging
|
||||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI, Request, HTTPException, Depends
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse
|
||||
import asyncpg
|
||||
import redis.asyncio as redis
|
||||
import time
|
||||
|
||||
from .config import get_settings
|
||||
from .dependencies import get_db_pool, get_redis_client, get_cache, verify_bearer_token
|
||||
from .routes import vehicles, vin
|
||||
from .models.responses import HealthResponse
|
||||
from .services.cache_service import CacheService
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
settings = get_settings()
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Set up and tear down resources that live as long as the application.

    Startup: create the asyncpg pool (a failure is logged and re-raised, so
    the app does not start), create and ping the Redis client (a failure is
    logged as a warning and the client is set to ``None``), then build the
    ``CacheService``, enabled only when a Redis client is available.

    Shutdown (after the ``yield``): close the pool and the Redis client when
    they are present.
    """
    state = app.state
    logger.info("Starting MVP Platform Vehicles API...")

    # PostgreSQL pool - required
    try:
        state.db_pool = await asyncpg.create_pool(
            host=settings.POSTGRES_HOST,
            port=settings.POSTGRES_PORT,
            user=settings.POSTGRES_USER,
            password=settings.POSTGRES_PASSWORD,
            database=settings.POSTGRES_DATABASE,
            min_size=settings.DATABASE_MIN_CONNECTIONS,
            max_size=settings.DATABASE_MAX_CONNECTIONS,
            command_timeout=30
        )
        logger.info("Database pool initialized")
    except Exception as e:
        logger.error(f"Failed to initialize database pool: {e}")
        raise

    # Redis client - optional
    try:
        state.redis_client = redis.Redis(
            host=settings.REDIS_HOST,
            port=settings.REDIS_PORT,
            db=settings.REDIS_DB,
            decode_responses=False,
            socket_connect_timeout=5,
            socket_timeout=5
        )
        # Constructing the client does not prove the server is reachable; ping does.
        await state.redis_client.ping()
        logger.info("Redis client initialized")
    except Exception as e:
        logger.warning(f"Failed to initialize Redis client: {e}")
        state.redis_client = None

    # Cache service wraps the (possibly missing) Redis client
    cache_enabled = bool(state.redis_client)
    state.cache_service = CacheService(
        state.redis_client,
        enabled=cache_enabled,
        default_ttl=settings.CACHE_TTL
    )

    yield

    logger.info("Shutting down MVP Platform Vehicles API...")

    pool = getattr(state, 'db_pool', None)
    if pool:
        await pool.close()
        logger.info("Database pool closed")

    redis_client = getattr(state, 'redis_client', None)
    if redis_client:
        await redis_client.aclose()
        logger.info("Redis client closed")
|
||||
|
||||
# Create FastAPI app
|
||||
# Interactive documentation routes exist only when DEBUG is enabled
_docs_url = "/docs" if settings.DEBUG else None
_redoc_url = "/redoc" if settings.DEBUG else None

app = FastAPI(
    title="MVP Platform Vehicles API",
    description="Hierarchical Vehicle API with VIN decoding for MotoVaultPro platform services",
    version="1.0.0",
    lifespan=lifespan,
    docs_url=_docs_url,
    redoc_url=_redoc_url
)

# CORS: only the configured origins, with credentials, any method and header
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.CORS_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
# Request timing middleware
|
||||
@app.middleware("http")
async def add_process_time_header(request: Request, call_next):
    """Attach the request's processing time, in seconds, as ``X-Process-Time``.

    The duration is measured with ``time.perf_counter()``, a monotonic clock,
    so the value cannot be negative or skewed by system clock adjustments
    that happen while the request is being handled.
    """
    start_time = time.perf_counter()
    response = await call_next(request)
    process_time = time.perf_counter() - start_time
    response.headers["X-Process-Time"] = str(process_time)
    return response
|
||||
|
||||
# Global exception handler
|
||||
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Log an otherwise unhandled exception and answer with a generic 500.

    The traceback is written to the log via ``exc_info`` so the failure can
    be diagnosed; the client only ever sees "Internal server error".
    """
    logger.error(
        f"Unhandled exception in {request.method} {request.url.path}: {exc}",
        exc_info=exc,
    )
    return JSONResponse(
        status_code=500,
        content={"detail": "Internal server error"}
    )
|
||||
|
||||
# Include routers
|
||||
# Every versioned router is mounted under /api/v1 and guarded by the service token
for _route_module in (vehicles, vin):
    app.include_router(
        _route_module.router,
        prefix="/api/v1",
        dependencies=[Depends(verify_bearer_token)],
    )
|
||||
|
||||
# Health check endpoint
|
||||
@app.api_route("/health", methods=["GET", "HEAD"], response_model=HealthResponse)
async def health_check(request: Request):
    """Probe the database and the cache and report their state.

    The database is checked with ``SELECT 1`` and the cache with a Redis
    ping. The overall status is ``ok`` only when the database check passes;
    a cache that is disabled or failing does not degrade it.
    """
    state = request.app.state

    database_state = "ok"
    try:
        async with state.db_pool.acquire() as connection:
            await connection.fetchval("SELECT 1")
    except Exception as e:
        logger.error(f"Database health check failed: {e}")
        database_state = "error"

    cache_state = "ok"
    try:
        cache_service = state.cache_service
        if not (cache_service and cache_service.enabled):
            cache_state = "disabled"
        else:
            await cache_service.redis.ping()
    except Exception as e:
        logger.error(f"Cache health check failed: {e}")
        cache_state = "error"

    if database_state == "ok":
        overall = "ok"
    else:
        overall = "degraded"

    return HealthResponse(
        status=overall,
        database=database_state,
        cache=cache_state,
        version="1.0.0"
    )
|
||||
|
||||
# Root endpoint
|
||||
@app.get("/")
async def root():
    """Describe the API: name, version and example URLs for its endpoints.

    The documentation URL is only revealed when DEBUG is enabled.
    """
    if settings.DEBUG:
        docs_location = "/docs"
    else:
        docs_location = "Contact administrator for documentation"

    example_endpoints = {
        "health": "/health",
        "makes": "/api/v1/vehicles/makes?year=2024",
        "models": "/api/v1/vehicles/models?year=2024&make_id=1",
        "trims": "/api/v1/vehicles/trims?year=2024&make_id=1&model_id=1",
        "engines": "/api/v1/vehicles/engines?year=2024&make_id=1&model_id=1",
        "transmissions": "/api/v1/vehicles/transmissions?year=2024&make_id=1&model_id=1",
        "vin_decode": "/api/v1/vehicles/vindecode"
    }

    return {
        "name": "MVP Platform Vehicles API",
        "version": "1.0.0",
        "description": "Hierarchical Vehicle API with VIN decoding",
        "docs_url": docs_location,
        "endpoints": example_endpoints
    }
|
||||
|
||||
# Cache stats endpoint
|
||||
@app.get("/api/v1/cache/stats")
async def cache_stats(request: Request, token: str = Depends(verify_bearer_token)):
    """Return whatever the cache service's ``get_stats()`` reports.

    Requires the service bearer token. Any failure is logged and converted
    into a 500 HTTPException.
    """
    try:
        cache_service = request.app.state.cache_service
        return await cache_service.get_stats()
    except Exception as e:
        logger.error(f"Failed to get cache stats: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve cache statistics")
|
||||
|
||||
if __name__ == "__main__":
    # Direct execution: serve on all interfaces, port 8000. The app is passed
    # as an import string; auto-reload follows the DEBUG setting.
    import uvicorn

    server_options = {
        "host": "0.0.0.0",
        "port": 8000,
        "reload": settings.DEBUG,
        "log_level": "info",
    }
    uvicorn.run("api.main:app", **server_options)
|
||||
84
mvp-platform-services/vehicles/api/models/responses.py
Normal file
84
mvp-platform-services/vehicles/api/models/responses.py
Normal file
@@ -0,0 +1,84 @@
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Optional
|
||||
|
||||
class MakeItem(BaseModel):
    """One vehicle make, as returned inside ``MakesResponse``."""
    id: int
    name: str


class ModelItem(BaseModel):
    """One vehicle model, as returned inside ``ModelsResponse``."""
    id: int
    name: str


class TrimItem(BaseModel):
    """One trim level, as returned inside ``TrimsResponse``."""
    id: int
    name: str


class EngineItem(BaseModel):
    """One engine option, as returned inside ``EnginesResponse``."""
    id: int
    name: str


class TransmissionItem(BaseModel):
    """One transmission option; unlike the other items it carries no id."""
    name: str
|
||||
|
||||
class MakesResponse(BaseModel):
    """Envelope for a list of makes."""
    makes: List[MakeItem]


class YearsResponse(BaseModel):
    """Envelope for a list of model years."""
    years: List[int]


class ModelsResponse(BaseModel):
    """Envelope for a list of models."""
    models: List[ModelItem]


class TrimsResponse(BaseModel):
    """Envelope for a list of trims."""
    trims: List[TrimItem]


class EnginesResponse(BaseModel):
    """Envelope for a list of engines."""
    engines: List[EngineItem]


class TransmissionsResponse(BaseModel):
    """Envelope for a list of transmissions."""
    transmissions: List[TransmissionItem]
|
||||
|
||||
class VINDecodeResult(BaseModel):
    """Attributes decoded from a VIN; every field is optional and defaults to None."""
    # Identification
    make: Optional[str] = None
    model: Optional[str] = None
    year: Optional[int] = None
    trim_name: Optional[str] = None
    # Drivetrain descriptions
    engine_description: Optional[str] = None
    transmission_description: Optional[str] = None
    # Performance figures (units are not specified here)
    horsepower: Optional[float] = None
    torque: Optional[float] = None
    top_speed: Optional[float] = None
    fuel: Optional[str] = None
    # Decode metadata
    confidence_score: Optional[float] = None
    vehicle_type: Optional[str] = None


class VINDecodeRequest(BaseModel):
    """Request body carrying the VIN to decode."""
    vin: str


class VINDecodeResponse(BaseModel):
    """Outcome of a VIN decode: the VIN, the result (may be null), a success flag and an optional error."""
    vin: str
    # Declared without a default, unlike ``error`` below.
    result: Optional[VINDecodeResult]
    success: bool
    error: Optional[str] = None
|
||||
|
||||
class HealthResponse(BaseModel):
    """Body of the health endpoint: overall status plus per-dependency state."""
    status: str
    database: str
    cache: str
    version: str
    # Optional; defaults to None when not supplied.
    etl_last_run: Optional[str] = None
|
||||
@@ -0,0 +1,79 @@
|
||||
import asyncpg
|
||||
from typing import List, Dict
|
||||
|
||||
|
||||
class VehiclesRepository:
    """Read-only queries over the normalized ``vehicles`` schema.

    Each method narrows the year -> make -> model -> trim -> engine hierarchy
    by one level. All methods take an open asyncpg connection and, apart from
    ``get_years``, return a list of ``{"id", "name"}`` dicts ordered by name.
    """

    @staticmethod
    def _id_name_pairs(records) -> List[Dict]:
        """Convert fetched records into plain ``{"id", "name"}`` dicts."""
        return [{"id": record["id"], "name": record["name"]} for record in records]

    async def get_years(self, db: asyncpg.Connection) -> List[int]:
        """Return the distinct model years, newest first."""
        records = await db.fetch(
            """
            SELECT DISTINCT year
            FROM vehicles.model_year
            ORDER BY year DESC
            """
        )
        years = []
        for record in records:
            years.append(record["year"])
        return years

    async def get_makes(self, db: asyncpg.Connection, year: int) -> List[Dict]:
        """Return the makes that have at least one model in ``year``."""
        records = await db.fetch(
            """
            SELECT DISTINCT ma.id, ma.name
            FROM vehicles.make ma
            JOIN vehicles.model mo ON mo.make_id = ma.id
            JOIN vehicles.model_year my ON my.model_id = mo.id AND my.year = $1
            ORDER BY ma.name
            """,
            year,
        )
        return self._id_name_pairs(records)

    async def get_models(self, db: asyncpg.Connection, year: int, make_id: int) -> List[Dict]:
        """Return the models of ``make_id`` that exist in ``year``."""
        records = await db.fetch(
            """
            SELECT DISTINCT mo.id, mo.name
            FROM vehicles.model mo
            JOIN vehicles.model_year my ON my.model_id = mo.id AND my.year = $1
            WHERE mo.make_id = $2
            ORDER BY mo.name
            """,
            year,
            make_id,
        )
        return self._id_name_pairs(records)

    async def get_trims(self, db: asyncpg.Connection, year: int, model_id: int) -> List[Dict]:
        """Return the trims attached to the model-year of ``model_id`` in ``year``."""
        records = await db.fetch(
            """
            SELECT t.id, t.name
            FROM vehicles.trim t
            JOIN vehicles.model_year my ON my.id = t.model_year_id
            WHERE my.year = $1 AND my.model_id = $2
            ORDER BY t.name
            """,
            year,
            model_id,
        )
        return self._id_name_pairs(records)

    async def get_engines(
        self, db: asyncpg.Connection, year: int, model_id: int, trim_id: int
    ) -> List[Dict]:
        """Return the engines linked to ``trim_id``.

        The year and model filters mean a trim id belonging to a different
        model-year yields an empty list.
        """
        records = await db.fetch(
            """
            SELECT DISTINCT e.id, e.name
            FROM vehicles.engine e
            JOIN vehicles.trim_engine te ON te.engine_id = e.id
            JOIN vehicles.trim t ON t.id = te.trim_id
            JOIN vehicles.model_year my ON my.id = t.model_year_id
            WHERE my.year = $1
              AND my.model_id = $2
              AND t.id = $3
            ORDER BY e.name
            """,
            year,
            model_id,
            trim_id,
        )
        return self._id_name_pairs(records)
|
||||
|
||||
116
mvp-platform-services/vehicles/api/routes/vehicles.py
Normal file
116
mvp-platform-services/vehicles/api/routes/vehicles.py
Normal file
@@ -0,0 +1,116 @@
|
||||
from fastapi import APIRouter, Depends, Query, HTTPException
|
||||
import asyncpg
|
||||
from ..dependencies import get_db, get_cache
|
||||
# DropdownService deprecated; using normalized schema service
|
||||
from ..services.vehicles_service import VehiclesService
|
||||
from ..repositories.vehicles_repository import VehiclesRepository
|
||||
from ..services.cache_service import CacheService
|
||||
from ..models.responses import (
|
||||
MakesResponse, ModelsResponse, TrimsResponse,
|
||||
EnginesResponse,
|
||||
MakeItem, ModelItem, TrimItem, EngineItem
|
||||
)
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(prefix="/vehicles", tags=["Vehicles"])
|
||||
|
||||
@router.get("/years", response_model=list[int])
async def get_years(
    db: asyncpg.Connection = Depends(get_db),
    cache: CacheService = Depends(get_cache),
):
    """Get available model years (distinct, desc)

    Takes no query parameters; the result comes from ``VehiclesService.get_years``.

    Raises:
        HTTPException: 500 if the lookup fails, the same contract as the
            /makes, /models, /trims and /engines endpoints.
    """
    try:
        service = VehiclesService(cache, VehiclesRepository())
        return await service.get_years(db)
    except Exception as e:
        # logger.exception records the traceback; `from e` keeps the cause chained.
        logger.exception(f"Failed to get years: {e}")
        raise HTTPException(
            status_code=500,
            detail="Failed to retrieve years"
        ) from e
|
||||
|
||||
@router.get("/makes", response_model=MakesResponse)
async def get_makes(
    year: int = Query(..., description="Model year", ge=1980, le=2050),
    db: asyncpg.Connection = Depends(get_db),
    cache: CacheService = Depends(get_cache)
):
    """Get makes for a specific year

    Hierarchical API: First level - requires year parameter only

    Raises:
        HTTPException: 500 if the service call or the response construction fails.
    """
    try:
        service = VehiclesService(cache, VehiclesRepository())
        makes = await service.get_makes(db, year)
        return MakesResponse(makes=[MakeItem(**m) for m in makes])
    except Exception as e:
        # logger.exception records the traceback; `from e` keeps the cause chained.
        logger.exception(f"Failed to get makes for year {year}: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to retrieve makes for year {year}"
        ) from e
|
||||
|
||||
@router.get("/models", response_model=ModelsResponse)
async def get_models(
    year: int = Query(..., description="Model year", ge=1980, le=2050),
    make_id: int = Query(..., description="Make ID", ge=1),
    db: asyncpg.Connection = Depends(get_db),
    cache: CacheService = Depends(get_cache)
):
    """Get models for year and make

    Hierarchical API: Second level - requires year and make_id parameters

    Raises:
        HTTPException: 500 if the service call or the response construction fails.
    """
    try:
        service = VehiclesService(cache, VehiclesRepository())
        models = await service.get_models(db, year, make_id)
        return ModelsResponse(models=[ModelItem(**m) for m in models])
    except Exception as e:
        # logger.exception records the traceback; `from e` keeps the cause chained.
        logger.exception(f"Failed to get models for year {year}, make {make_id}: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to retrieve models for year {year}, make {make_id}"
        ) from e
|
||||
|
||||
@router.get("/trims", response_model=TrimsResponse)
async def get_trims(
    year: int = Query(..., description="Model year", ge=1980, le=2050),
    make_id: int = Query(..., description="Make ID", ge=1),
    model_id: int = Query(..., description="Model ID", ge=1),
    db: asyncpg.Connection = Depends(get_db),
    cache: CacheService = Depends(get_cache)
):
    """Get trims for year, make, and model

    Hierarchical API: Third level - requires year, make_id, and model_id parameters

    ``make_id`` is validated and included in error messages but is not passed
    to the service: the lookup is keyed by ``year`` and ``model_id`` only.

    Raises:
        HTTPException: 500 if the service call or the response construction fails.
    """
    try:
        service = VehiclesService(cache, VehiclesRepository())
        trims = await service.get_trims(db, year, model_id)
        return TrimsResponse(trims=[TrimItem(**t) for t in trims])
    except Exception as e:
        # logger.exception records the traceback; `from e` keeps the cause chained.
        logger.exception(f"Failed to get trims for year {year}, make {make_id}, model {model_id}: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to retrieve trims for year {year}, make {make_id}, model {model_id}"
        ) from e
|
||||
|
||||
@router.get("/engines", response_model=EnginesResponse)
async def get_engines(
    year: int = Query(..., description="Model year", ge=1980, le=2050),
    make_id: int = Query(..., description="Make ID", ge=1),
    model_id: int = Query(..., description="Model ID", ge=1),
    trim_id: int = Query(..., description="Trim ID", ge=1),
    db: asyncpg.Connection = Depends(get_db),
    cache: CacheService = Depends(get_cache)
):
    """Get engines for year, make, model, and trim

    ``make_id`` is validated and included in error messages but is not passed
    to the service: the lookup is keyed by ``year``, ``model_id`` and ``trim_id``.

    Raises:
        HTTPException: 500 if the service call or the response construction fails.
    """
    try:
        service = VehiclesService(cache, VehiclesRepository())
        engines = await service.get_engines(db, year, model_id, trim_id)
        return EnginesResponse(engines=[EngineItem(**e) for e in engines])
    except Exception as e:
        # logger.exception records the traceback; `from e` keeps the cause chained.
        logger.exception(
            f"Failed to get engines for year {year}, make {make_id}, model {model_id}, trim {trim_id}: {e}"
        )
        raise HTTPException(
            status_code=500,
            detail=(
                f"Failed to retrieve engines for year {year}, make {make_id}, model {model_id}, trim {trim_id}"
            )
        ) from e
|
||||
110
mvp-platform-services/vehicles/api/routes/vin.py
Normal file
110
mvp-platform-services/vehicles/api/routes/vin.py
Normal file
@@ -0,0 +1,110 @@
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
import asyncpg
|
||||
from ..dependencies import get_db, get_cache
|
||||
from ..services.cache_service import CacheService
|
||||
from ..models.responses import VINDecodeRequest, VINDecodeResponse, VINDecodeResult
|
||||
import logging
|
||||
import re
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(prefix="/vehicles", tags=["VIN Decoding"])
|
||||
|
||||
def validate_vin(vin: str) -> bool:
    """Validate VIN format

    A VIN is accepted when it is exactly 17 ASCII characters, each a digit or
    a letter other than I, O and Q. Letters may be given in either case.

    Args:
        vin: Candidate VIN; surrounding whitespace is NOT stripped here.

    Returns:
        True if the format is valid, False otherwise. No check digit is verified.
    """
    if len(vin) != 17:
        return False

    # Reject non-ASCII input up front: some non-ASCII characters uppercase to
    # ASCII letters (U+017F -> "S") and would otherwise slip through the
    # pattern check performed on the uppercased text.
    if not vin.isascii():
        return False

    # The character class already excludes I, O and Q; fullmatch also refuses
    # the trailing newline that a "$"-anchored match would tolerate.
    return re.fullmatch(r'[A-HJ-NPR-Z0-9]{17}', vin.upper()) is not None
|
||||
|
||||
@router.post("/vindecode", response_model=VINDecodeResponse)
async def decode_vin(
    request: VINDecodeRequest,
    db: asyncpg.Connection = Depends(get_db),
    cache: CacheService = Depends(get_cache)
):
    """Decode VIN using PostgreSQL function with MSSQL parity

    Uses the vehicles.f_decode_vin() function to decode VIN with confidence scoring

    Flow: normalise the VIN (upper-case, stripped), reject bad formats, serve
    from cache when possible, otherwise query the database and cache the
    outcome (30 days for a decode, 1 hour for "not found").

    Failures are reported in the response body (``success=False`` plus
    ``error``) rather than as an HTTP error status.
    """
    vin = request.vin.upper().strip()

    # Validate VIN format
    if not validate_vin(vin):
        return VINDecodeResponse(
            vin=vin,
            result=None,
            success=False,
            error="Invalid VIN format"
        )

    # Check cache first
    cache_key = f"vin:decode:{vin}"
    cached_result = await cache.get(cache_key)
    if cached_result:
        logger.debug(f"VIN decode result for {vin} retrieved from cache")
        return VINDecodeResponse(**cached_result)

    try:
        # Call PostgreSQL VIN decode function
        query = """
            SELECT * FROM vehicles.f_decode_vin($1)
            """

        row = await db.fetchrow(query, vin)

        if not row:
            # No result found
            response = VINDecodeResponse(
                vin=vin,
                result=None,
                success=False,
                error="VIN not found in database"
            )

            # Cache negative result for 1 hour
            await cache.set(cache_key, response.dict(), ttl=3600)
            return response

        result = VINDecodeResult(
            make=row['make'],
            model=row['model'],
            year=row['year'],
            trim_name=row['trim_name'],
            engine_description=row['engine_description'],
            transmission_description=row['transmission_description'],
            # Read with .get(), which yields None if the column is absent.
            horsepower=row.get('horsepower'),
            torque=row.get('torque'),
            top_speed=row.get('top_speed'),
            fuel=row.get('fuel'),
            # A NULL (or zero) score is reported as 0.0.
            confidence_score=float(row['confidence_score']) if row['confidence_score'] else 0.0,
            vehicle_type=row.get('vehicle_type')
        )

        response = VINDecodeResponse(
            vin=vin,
            result=result,
            success=True
        )

        # Cache successful decode for 30 days
        await cache.set(cache_key, response.dict(), ttl=30*24*3600)

        logger.info(f"Successfully decoded VIN {vin}: {result.make} {result.model} {result.year}")
        return response

    except Exception as e:
        # logger.exception keeps the traceback; the client only sees a generic message.
        logger.exception(f"Failed to decode VIN {vin}: {e}")
        return VINDecodeResponse(
            vin=vin,
            result=None,
            success=False,
            error="Internal server error during VIN decoding"
        )
|
||||
88
mvp-platform-services/vehicles/api/services/cache_service.py
Normal file
88
mvp-platform-services/vehicles/api/services/cache_service.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import redis.asyncio as redis
|
||||
import json
|
||||
import logging
|
||||
from typing import Any, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class CacheService:
    """Redis cache service with JSON serialization

    Every operation is best-effort: when caching is disabled or Redis raises,
    the method logs the problem and returns a neutral value (``None``,
    ``False``, ``0``) instead of propagating the error.
    """

    # Keys requested per SCAN round-trip and deleted per DEL call when invalidating.
    _SCAN_BATCH_SIZE = 500

    def __init__(self, redis_client: Optional["redis.Redis"], enabled: bool = True, default_ttl: int = 3600):
        """Wrap ``redis_client``; caching is active only if a client is given and ``enabled`` is true.

        Args:
            redis_client: Async Redis client, or None to run without a cache.
            enabled: Switch that can turn caching off even when a client exists.
            default_ttl: Expiry in seconds used when ``set`` gets no ``ttl``.
        """
        self.redis = redis_client
        self.enabled = enabled and redis_client is not None
        self.default_ttl = default_ttl

    async def get(self, key: str) -> Optional[Any]:
        """Get value from cache

        Returns:
            The JSON-decoded value, or None on a miss, when disabled, or on any error.
        """
        if not self.enabled:
            return None

        try:
            value = await self.redis.get(key)
            if value:
                return json.loads(value)
            return None
        except Exception as e:
            logger.error(f"Cache get error for key {key}: {e}")
            return None

    async def set(self, key: str, value: Any, ttl: Optional[int] = None) -> bool:
        """Set value in cache

        Args:
            key: Cache key.
            value: JSON-serialisable value; anything else is stored via ``str()``.
            ttl: Expiry in seconds; None or 0 falls back to ``default_ttl``.

        Returns:
            True if the value was written, False when disabled or on any error.
        """
        if not self.enabled:
            return False

        try:
            ttl = ttl or self.default_ttl
            json_value = json.dumps(value, default=str)  # Handle datetime objects
            await self.redis.setex(key, ttl, json_value)
            return True
        except Exception as e:
            logger.error(f"Cache set error for key {key}: {e}")
            return False

    async def delete(self, key: str) -> bool:
        """Delete key from cache

        Returns:
            True if a key was removed, False if it did not exist, when disabled, or on error.
        """
        if not self.enabled:
            return False

        try:
            deleted = await self.redis.delete(key)
            return deleted > 0
        except Exception as e:
            logger.error(f"Cache delete error for key {key}: {e}")
            return False

    async def invalidate_dropdown_cache(self) -> int:
        """Invalidate all dropdown cache entries

        Keys matching ``dropdown:*`` are discovered with incremental SCAN
        rather than KEYS, so a large keyspace does not block the Redis
        server, and are deleted in batches.

        Returns:
            Number of keys deleted; 0 when disabled or if an error interrupts
            the sweep (keys deleted before the error stay deleted).
        """
        if not self.enabled:
            return 0

        try:
            pattern = "dropdown:*"
            deleted = 0
            batch = []
            async for key in self.redis.scan_iter(match=pattern, count=self._SCAN_BATCH_SIZE):
                batch.append(key)
                if len(batch) >= self._SCAN_BATCH_SIZE:
                    deleted += await self.redis.delete(*batch)
                    batch = []
            if batch:
                deleted += await self.redis.delete(*batch)

            if deleted:
                logger.info(f"Invalidated {deleted} dropdown cache entries")
            return deleted
        except Exception as e:
            logger.error(f"Cache invalidation error: {e}")
            return 0

    async def get_stats(self) -> dict:
        """Get cache statistics

        Returns:
            ``{"enabled": False}`` when disabled; otherwise memory usage
            strings from ``INFO memory`` and the number of connected clients,
            or ``{"enabled": True, "error": ...}`` if Redis raises.
        """
        if not self.enabled:
            return {"enabled": False}

        try:
            info = await self.redis.info("memory")
            # Report a count; the raw CLIENT LIST rows hold per-connection details.
            clients = await self.redis.client_list()
            return {
                "enabled": True,
                "used_memory": info.get("used_memory_human"),
                "used_memory_peak": info.get("used_memory_peak_human"),
                "connected_clients": len(clients)
            }
        except Exception as e:
            logger.error(f"Cache stats error: {e}")
            return {"enabled": True, "error": str(e)}
|
||||
@@ -0,0 +1,58 @@
|
||||
import asyncpg
|
||||
from typing import List, Dict
|
||||
from ..services.cache_service import CacheService
|
||||
from ..repositories.vehicles_repository import VehiclesRepository
|
||||
|
||||
|
||||
class VehiclesService:
    """Cache-aside facade over ``VehiclesRepository`` for the dropdown endpoints.

    Each getter first looks up a ``dropdown:*`` cache key and only queries the
    repository on a miss, storing the result for six hours.
    """

    # Lifetime of every dropdown cache entry, in seconds.
    _DROPDOWN_TTL = 6 * 3600

    def __init__(self, cache: "CacheService", repo: "VehiclesRepository | None" = None):
        """Store the cache and repository; a default repository is created when none is given."""
        self.cache = cache
        self.repo = repo or VehiclesRepository()

    async def _get_or_load(self, cache_key: str, load):
        """Return the cached value for ``cache_key`` or call ``load()`` and cache its result.

        The hit test is ``is not None`` so that a cached empty list counts as
        a hit; a truthiness test would re-query the database on every request
        for combinations that legitimately have no rows.
        """
        cached = await self.cache.get(cache_key)
        if cached is not None:
            return cached
        value = await load()
        await self.cache.set(cache_key, value, ttl=self._DROPDOWN_TTL)
        return value

    async def get_years(self, db: "asyncpg.Connection") -> List[int]:
        """Return the available model years (cache key ``dropdown:years``)."""
        return await self._get_or_load(
            "dropdown:years",
            lambda: self.repo.get_years(db),
        )

    async def get_makes(self, db: "asyncpg.Connection", year: int) -> List[Dict]:
        """Return the makes for ``year`` (cache key ``dropdown:makes:{year}``)."""
        return await self._get_or_load(
            f"dropdown:makes:{year}",
            lambda: self.repo.get_makes(db, year),
        )

    async def get_models(self, db: "asyncpg.Connection", year: int, make_id: int) -> List[Dict]:
        """Return the models for ``year`` and ``make_id`` (cache key ``dropdown:models:{year}:{make_id}``)."""
        return await self._get_or_load(
            f"dropdown:models:{year}:{make_id}",
            lambda: self.repo.get_models(db, year, make_id),
        )

    async def get_trims(self, db: "asyncpg.Connection", year: int, model_id: int) -> List[Dict]:
        """Return the trims for ``year`` and ``model_id`` (cache key ``dropdown:trims:{year}:{model_id}``)."""
        return await self._get_or_load(
            f"dropdown:trims:{year}:{model_id}",
            lambda: self.repo.get_trims(db, year, model_id),
        )

    async def get_engines(
        self, db: "asyncpg.Connection", year: int, model_id: int, trim_id: int
    ) -> List[Dict]:
        """Return the engines for a trim (cache key ``dropdown:engines:{year}:{model_id}:{trim_id}``)."""
        return await self._get_or_load(
            f"dropdown:engines:{year}:{model_id}:{trim_id}",
            lambda: self.repo.get_engines(db, year, model_id, trim_id),
        )
|
||||
|
||||
30
mvp-platform-services/vehicles/docker/Dockerfile.api
Normal file
30
mvp-platform-services/vehicles/docker/Dockerfile.api
Normal file
@@ -0,0 +1,30 @@
|
||||
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies.
# wget backs the HEALTHCHECK below; curl is not used in this file
# (NOTE(review): presumably kept for debugging or external probes - confirm).
# --no-install-recommends keeps optional packages out of the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    wget \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies
# (done before copying the code so this layer is reused when only code changes)
COPY requirements-api.txt .
RUN pip install --no-cache-dir -r requirements-api.txt

# Copy application code
COPY api/ ./api/

# Set Python path so the "api" package resolves from /app
ENV PYTHONPATH=/app

# Expose port
EXPOSE 8000

# Health check: probe the /health endpoint served by uvicorn on port 8000
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
    CMD wget --quiet --tries=1 --spider http://localhost:8000/health || exit 1

# Run application
CMD ["python", "-m", "uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
46
mvp-platform-services/vehicles/docker/Dockerfile.etl
Normal file
46
mvp-platform-services/vehicles/docker/Dockerfile.etl
Normal file
@@ -0,0 +1,46 @@
|
||||
# Pinned to the Debian 12 (bookworm) variant: the Microsoft apt repository
# configured below is hard-coded to debian/12 "bookworm", so the base image
# must not drift to another Debian release when the floating tag moves.
FROM python:3.11-slim-bookworm

# Set working directory
WORKDIR /app

# Install system dependencies and ODBC drivers.
# The Microsoft signing key is stored as a dedicated keyring and referenced
# through signed-by, then msodbcsql17 / mssql-tools are installed from that repo.
RUN apt-get update && apt-get install -y \
    curl \
    apt-transport-https \
    gnupg2 \
    unixodbc-dev \
    unixodbc \
    && curl -sSL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor -o /usr/share/keyrings/microsoft-prod.gpg \
    && echo "deb [arch=amd64,arm64,armhf signed-by=/usr/share/keyrings/microsoft-prod.gpg] https://packages.microsoft.com/debian/12/prod bookworm main" > /etc/apt/sources.list.d/mssql-release.list \
    && apt-get update \
    && ACCEPT_EULA=Y apt-get install -y msodbcsql17 mssql-tools \
    && rm -rf /var/lib/apt/lists/*

# Add SQL Server tools to PATH
ENV PATH="$PATH:/opt/mssql-tools/bin"

# Copy requirements and install Python dependencies
COPY requirements-etl.txt .
RUN pip install --no-cache-dir -r requirements-etl.txt

# Copy ETL code
COPY etl/ ./etl/

# Copy make configuration for filtering
COPY makes.json /app/makes.json

# Create logs and data directories
RUN mkdir -p /app/logs /app/data

# Set Python path so the "etl" package resolves from /app
ENV PYTHONPATH=/app

# Expose port for health check
# NOTE(review): the HEALTHCHECK below runs in-process and opens no port - confirm 8001 is still needed.
EXPOSE 8001

# Health check: healthy when etl.connections.test_connections() returns a truthy value
HEALTHCHECK --interval=60s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import sys; import os; sys.path.append('/app'); from etl.connections import test_connections; exit(0 if test_connections() else 1)" || exit 1

# Run ETL scheduler
CMD ["python", "-m", "etl.main"]
|
||||
0
mvp-platform-services/vehicles/etl/__init__.py
Executable file
0
mvp-platform-services/vehicles/etl/__init__.py
Executable file
10
mvp-platform-services/vehicles/etl/__main__.py
Normal file
10
mvp-platform-services/vehicles/etl/__main__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env python3
"""
ETL Package Main Entry Point

Allows running ETL package as a module: python -m etl
"""
from .main import cli

if __name__ == "__main__":
    # Start the CLI only when executed as a script/module, not when imported.
    cli()
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
mvp-platform-services/vehicles/etl/__pycache__/config.cpython-312.pyc
Executable file
BIN
mvp-platform-services/vehicles/etl/__pycache__/config.cpython-312.pyc
Executable file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
0
mvp-platform-services/vehicles/etl/analyzers/__init__.py
Executable file
0
mvp-platform-services/vehicles/etl/analyzers/__init__.py
Executable file
0
mvp-platform-services/vehicles/etl/builders/__init__.py
Executable file
0
mvp-platform-services/vehicles/etl/builders/__init__.py
Executable file
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,376 @@
|
||||
import logging
|
||||
from typing import Dict, List, Set, Optional
|
||||
from datetime import datetime
|
||||
from dateutil import tz
|
||||
from tqdm import tqdm
|
||||
from ..connections import db_connections
|
||||
from ..extractors.mssql_extractor import MSSQLExtractor
|
||||
from ..loaders.postgres_loader import PostgreSQLLoader
|
||||
from ..config import config
|
||||
from ..utils.make_filter import MakeFilter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class NormalizedVehicleBuilder:
|
||||
"""Build normalized vehicle schema from pattern-based NHTSA source data"""
|
||||
|
||||
def __init__(self, make_filter: Optional[MakeFilter] = None):
|
||||
self.make_filter = make_filter or MakeFilter()
|
||||
self.extractor = MSSQLExtractor(self.make_filter)
|
||||
self.loader = PostgreSQLLoader()
|
||||
|
||||
logger.info(
|
||||
f"Initialized normalized vehicle builder with make filtering: {len(self.make_filter.get_allowed_makes())} allowed makes"
|
||||
)
|
||||
|
||||
def build(self):
|
||||
"""Main normalized vehicle schema building process"""
|
||||
logger.info("Starting normalized vehicle schema build")
|
||||
|
||||
try:
|
||||
# Step 1: Clear and load reference tables
|
||||
logger.info("Step 1: Loading reference tables (makes, models, relationships)")
|
||||
self._load_reference_tables()
|
||||
|
||||
# Step 2: Extract year availability from WMI data
|
||||
logger.info("Step 2: Building model-year availability from WMI data")
|
||||
self._build_model_year_availability()
|
||||
|
||||
# Step 3: Extract trims and engines from pattern analysis
|
||||
logger.info("Step 3: Extracting trims and engines from pattern data")
|
||||
self._extract_trims_and_engines()
|
||||
|
||||
logger.info("Normalized vehicle schema build completed successfully")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Normalized schema build failed: {e}")
|
||||
raise e
|
||||
|
||||
def _load_reference_tables(self):
|
||||
"""Load basic reference tables: makes, models with proper relationships"""
|
||||
|
||||
# Load makes (filtered by make_filter)
|
||||
makes_data = self.extractor.extract_reference_table('Make')
|
||||
if makes_data:
|
||||
self.loader.load_reference_table('make', makes_data)
|
||||
logger.info(f"Loaded {len(makes_data)} makes")
|
||||
|
||||
# Get make-model relationships first
|
||||
make_model_rels = self.extractor.extract_make_model_relationships()
|
||||
|
||||
# Load models with make_id populated from relationships
|
||||
models_data = self.extractor.extract_reference_table('Model')
|
||||
if models_data and make_model_rels:
|
||||
# Create mapping: model_id -> make_id
|
||||
model_to_make = {}
|
||||
for rel in make_model_rels:
|
||||
model_to_make[rel['ModelId']] = rel['MakeId']
|
||||
|
||||
# Add make_id to each model record
|
||||
for model in models_data:
|
||||
model['MakeId'] = model_to_make.get(model['Id'])
|
||||
|
||||
# Filter out models without make_id (orphaned models)
|
||||
valid_models = [m for m in models_data if m.get('MakeId') is not None]
|
||||
|
||||
self.loader.load_reference_table('model', valid_models)
|
||||
logger.info(f"Loaded {len(valid_models)} models with make relationships")
|
||||
logger.info(f"Filtered out {len(models_data) - len(valid_models)} orphaned models")
|
||||
else:
|
||||
logger.warning("No models or relationships loaded")
|
||||
|
||||
def _build_model_year_availability(self):
|
||||
"""Build model-year availability from WMI year ranges with realistic constraints"""
|
||||
logger.info("Extracting model-year availability from WMI data with realistic year bounds")
|
||||
|
||||
# Define realistic year constraints
|
||||
current_year = datetime.now().year
|
||||
max_year = current_year + 1 # Allow next model year
|
||||
min_year = current_year - 40 # Reasonable historical range (40 years back)
|
||||
|
||||
logger.info(f"Using realistic year range: {min_year} to {max_year}")
|
||||
|
||||
# Get WMI data with year ranges
|
||||
wmi_data = self.extractor.extract_wmi_vin_schema_mappings()
|
||||
|
||||
# Get make-model relationships to map WMI to models
|
||||
make_model_rels = self.extractor.extract_make_model_relationships()
|
||||
wmi_make_rels = self.extractor.extract_wmi_make_relationships()
|
||||
|
||||
# Build mapping: WMI -> Make -> Models
|
||||
wmi_to_models = {}
|
||||
make_to_models = {}
|
||||
|
||||
# Build make -> models mapping
|
||||
for rel in make_model_rels:
|
||||
make_id = rel['MakeId']
|
||||
if make_id not in make_to_models:
|
||||
make_to_models[make_id] = []
|
||||
make_to_models[make_id].append(rel['ModelId'])
|
||||
|
||||
# Build WMI -> models mapping via makes
|
||||
for wmi_make in wmi_make_rels:
|
||||
wmi_id = wmi_make['WmiId']
|
||||
make_id = wmi_make['MakeId']
|
||||
|
||||
if make_id in make_to_models:
|
||||
if wmi_id not in wmi_to_models:
|
||||
wmi_to_models[wmi_id] = []
|
||||
wmi_to_models[wmi_id].extend(make_to_models[make_id])
|
||||
|
||||
# Extremely conservative approach: Only allow models with explicit recent year ranges
|
||||
logger.info("Building model-year availability - using only models with EXPLICIT recent VIN pattern evidence")
|
||||
|
||||
model_years = []
|
||||
current_year = datetime.now().year
|
||||
|
||||
# Strategy: Only include models that have VIN patterns with explicit recent year ranges (not open-ended)
|
||||
recent_threshold = current_year - 5 # Only patterns from last 5 years
|
||||
|
||||
# Find models that have EXPLICIT recent VIN pattern evidence (both YearFrom and YearTo defined)
|
||||
recent_models_with_years = {} # model_id -> set of years with evidence
|
||||
|
||||
for wmi_mapping in wmi_data:
|
||||
year_from = wmi_mapping['YearFrom']
|
||||
year_to = wmi_mapping['YearTo']
|
||||
|
||||
# Skip patterns without explicit year ranges (YearTo=None means open-ended, likely old discontinued models)
|
||||
if year_from is None or year_to is None:
|
||||
continue
|
||||
|
||||
# Only consider WMI patterns that have recent, explicit activity
|
||||
if year_to >= recent_threshold and year_from <= current_year + 1:
|
||||
wmi_id = wmi_mapping['WmiId']
|
||||
if wmi_id in wmi_to_models:
|
||||
models = wmi_to_models[wmi_id]
|
||||
for model_id in models:
|
||||
if model_id not in recent_models_with_years:
|
||||
recent_models_with_years[model_id] = set()
|
||||
# Add the actual years with evidence (constrained to reasonable range)
|
||||
evidence_start = max(year_from, recent_threshold)
|
||||
evidence_end = min(year_to, current_year + 1)
|
||||
for year in range(evidence_start, evidence_end + 1):
|
||||
recent_models_with_years[model_id].add(year)
|
||||
|
||||
logger.info(f"Found {len(recent_models_with_years)} models with explicit recent VIN pattern evidence (patterns with defined year ranges since {recent_threshold})")
|
||||
|
||||
# Create model-year combinations only for years with actual VIN pattern evidence
|
||||
# Apply business rules to exclude historically discontinued models
|
||||
discontinued_models = self._get_discontinued_models()
|
||||
|
||||
for model_id, years_with_evidence in recent_models_with_years.items():
|
||||
# Check if this model is in our discontinued list
|
||||
if model_id in discontinued_models:
|
||||
max_year = discontinued_models[model_id]
|
||||
logger.info(f"Applying discontinuation rule: Model ID {model_id} discontinued after {max_year}")
|
||||
# Only include years up to discontinuation year
|
||||
years_with_evidence = {y for y in years_with_evidence if y <= max_year}
|
||||
|
||||
for year in years_with_evidence:
|
||||
model_years.append({
|
||||
'model_id': model_id,
|
||||
'year': year
|
||||
})
|
||||
|
||||
logger.info(f"Created {len(model_years)} model-year combinations based on explicit VIN pattern evidence")
|
||||
|
||||
# Remove duplicates
|
||||
unique_model_years = []
|
||||
seen = set()
|
||||
for my in model_years:
|
||||
key = (my['model_id'], my['year'])
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
unique_model_years.append(my)
|
||||
|
||||
# Load to database
|
||||
if unique_model_years:
|
||||
self.loader.load_model_years(unique_model_years)
|
||||
logger.info(f"Generated {len(unique_model_years)} model-year availability records")
|
||||
|
||||
|
||||
def _extract_trims_and_engines(self):
|
||||
"""Extract trims and engines from pattern analysis"""
|
||||
logger.info("Extracting trims and engines from pattern data")
|
||||
|
||||
# Get model-year IDs for mapping
|
||||
model_year_mapping = self._get_model_year_mapping()
|
||||
|
||||
trims_data = []
|
||||
engines_data = []
|
||||
engine_names = set()
|
||||
|
||||
# Process patterns in batches
|
||||
total_trims = 0
|
||||
total_engines = 0
|
||||
|
||||
for pattern_batch in self.extractor.extract_patterns_data():
|
||||
logger.info(f"Processing pattern batch: {len(pattern_batch)} patterns")
|
||||
|
||||
# Group patterns by (year, make, model) combination
|
||||
vehicle_combinations = {}
|
||||
|
||||
for pattern in pattern_batch:
|
||||
element_id = pattern['ElementId']
|
||||
attribute_id = pattern.get('AttributeId', '')
|
||||
make_name = pattern.get('MakeName', '')
|
||||
|
||||
# Skip if not allowed make
|
||||
if not self.make_filter.is_make_allowed(make_name):
|
||||
continue
|
||||
|
||||
# Create vehicle combination key
|
||||
# We'll derive year from WMI data associated with this pattern
|
||||
vin_schema_id = pattern['VinSchemaId']
|
||||
key = (vin_schema_id, make_name)
|
||||
|
||||
if key not in vehicle_combinations:
|
||||
vehicle_combinations[key] = {
|
||||
'make_name': make_name,
|
||||
'vin_schema_id': vin_schema_id,
|
||||
'trims': set(),
|
||||
'engines': set()
|
||||
}
|
||||
|
||||
# Extract trim and engine data
|
||||
if element_id == 28 and attribute_id: # Trim
|
||||
vehicle_combinations[key]['trims'].add(attribute_id)
|
||||
elif element_id == 18 and attribute_id: # Engine
|
||||
vehicle_combinations[key]['engines'].add(attribute_id)
|
||||
|
||||
# Convert to trim/engine records
|
||||
for combo in vehicle_combinations.values():
|
||||
make_name = combo['make_name']
|
||||
|
||||
# For now, create generic records
|
||||
# In a full implementation, you'd map these to specific model-years
|
||||
for trim_name in combo['trims']:
|
||||
if trim_name and len(trim_name.strip()) > 0:
|
||||
# We'll need to associate these with specific model_year_ids
|
||||
# For now, create a placeholder structure
|
||||
trims_data.append({
|
||||
'name': trim_name.strip(),
|
||||
'make_name': make_name, # temporary for mapping
|
||||
'source_schema': combo['vin_schema_id']
|
||||
})
|
||||
total_trims += 1
|
||||
|
||||
for engine_name in combo['engines']:
|
||||
if engine_name and len(engine_name.strip()) > 0 and engine_name not in engine_names:
|
||||
engine_names.add(engine_name)
|
||||
engines_data.append({
|
||||
'name': engine_name.strip(),
|
||||
'code': None,
|
||||
'displacement_l': None,
|
||||
'cylinders': None,
|
||||
'fuel_type': None,
|
||||
'aspiration': None
|
||||
})
|
||||
total_engines += 1
|
||||
|
||||
# Load engines first (they're independent)
|
||||
if engines_data:
|
||||
self.loader.load_engines(engines_data)
|
||||
logger.info(f"Loaded {total_engines} unique engines")
|
||||
|
||||
# For trims, we need to map them to actual model_year records
|
||||
# This is a simplified approach - in practice you'd need more sophisticated mapping
|
||||
if trims_data:
|
||||
simplified_trims = self._map_trims_to_model_years(trims_data, model_year_mapping)
|
||||
if simplified_trims:
|
||||
self.loader.load_trims(simplified_trims)
|
||||
logger.info(f"Loaded {len(simplified_trims)} trims")
|
||||
|
||||
def _get_model_year_mapping(self) -> Dict:
|
||||
"""Get mapping of model_year records for trim association"""
|
||||
with db_connections.postgres_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
|
||||
query = """
|
||||
SELECT my.id, my.model_id, my.year, m.name as model_name, mk.name as make_name
|
||||
FROM vehicles.model_year my
|
||||
JOIN vehicles.model m ON my.model_id = m.id
|
||||
JOIN vehicles.make mk ON m.make_id = mk.id
|
||||
"""
|
||||
|
||||
cursor.execute(query)
|
||||
rows = cursor.fetchall()
|
||||
|
||||
mapping = {}
|
||||
for row in rows:
|
||||
key = (row['make_name'] if isinstance(row, dict) else row[4],
|
||||
row['year'] if isinstance(row, dict) else row[2])
|
||||
mapping[key] = row['id'] if isinstance(row, dict) else row[0]
|
||||
|
||||
return mapping
|
||||
|
||||
def _map_trims_to_model_years(self, trims_data: List[Dict], model_year_mapping: Dict) -> List[Dict]:
|
||||
"""Map extracted trims to actual model_year records"""
|
||||
mapped_trims = []
|
||||
|
||||
# For now, create a simplified mapping
|
||||
# Associate trims with all model_years of the same make
|
||||
for trim in trims_data:
|
||||
make_name = trim['make_name']
|
||||
trim_name = trim['name']
|
||||
|
||||
# Find all model_year_ids for this make
|
||||
model_year_ids = []
|
||||
for (mapped_make, year), model_year_id in model_year_mapping.items():
|
||||
if mapped_make == make_name:
|
||||
model_year_ids.append(model_year_id)
|
||||
|
||||
# Create trim record for each model_year (simplified approach)
|
||||
# In practice, you'd need more sophisticated pattern-to-vehicle mapping
|
||||
for model_year_id in model_year_ids[:5]: # Limit to avoid explosion
|
||||
mapped_trims.append({
|
||||
'model_year_id': model_year_id,
|
||||
'name': trim_name
|
||||
})
|
||||
|
||||
return mapped_trims
|
||||
|
||||
def _get_discontinued_models(self) -> Dict[int, int]:
    """Map discontinued model ids to their last production year.

    Looks up Chevrolet and GMC models whose names match a fixed list of
    ILIKE patterns and records, for every match, the year after which the
    model should no longer appear in model-year combinations. Each match is
    also logged.
    """
    # (name pattern, last production year) for well-known discontinued models.
    # NOTE(review): 'Jimmy%' already matches everything 'Jimmy Utility%' does,
    # so the final entry only repeats the same year for those rows.
    discontinued_patterns = [
        ('Jimmy%', 1991),          # GMC Jimmy discontinued 1991
        ('S-10%', 2004),           # Chevrolet S-10 discontinued 2004
        ('Blazer%', 2005),         # Chevrolet Blazer discontinued 2005 (before recent revival)
        ('Astro%', 2005),          # Chevrolet Astro discontinued 2005
        ('Safari%', 2005),         # GMC Safari discontinued 2005
        ('Jimmy Utility%', 1991),  # GMC Jimmy Utility discontinued 1991
    ]

    # The statement is the same for every pattern; only the bound value changes.
    query = """
        SELECT m.id, m.name, mk.name as make_name
        FROM vehicles.model m
        JOIN vehicles.make mk ON m.make_id = mk.id
        WHERE m.name ILIKE %s
        AND mk.name IN ('Chevrolet', 'GMC')
    """

    def _unpack(record):
        # Rows are read by column name when dict-like, by position otherwise.
        if isinstance(record, dict):
            return record['id'], record['name'], record['make_name']
        return record[0], record[1], record[2]

    discontinued_models = {}
    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()
        for pattern, last_year in discontinued_patterns:
            cursor.execute(query, (pattern,))
            for record in cursor.fetchall():
                model_id, model_name, make_name = _unpack(record)
                discontinued_models[model_id] = last_year
                logger.info(f"Marked {make_name} {model_name} (ID: {model_id}) as discontinued after {last_year}")

    return discontinued_models
|
||||
39
mvp-platform-services/vehicles/etl/config.py
Executable file
39
mvp-platform-services/vehicles/etl/config.py
Executable file
@@ -0,0 +1,39 @@
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
def _env_str(name: str, default: str) -> str:
    """Return environment variable ``name`` or ``default`` when unset."""
    return os.getenv(name, default)


def _env_int(name: str, default: str) -> int:
    """Return environment variable ``name`` (or ``default``) parsed as int."""
    return int(os.getenv(name, default))


def _env_flag(name: str, default: str) -> bool:
    """Return True when the variable (or ``default``) is '1', 'true' or 'yes', ignoring case."""
    return os.getenv(name, default).lower() in ("1", "true", "yes")


class ETLConfig:
    """ETL settings resolved from environment variables when the class is defined.

    Every attribute falls back to the literal default shown when its
    variable is unset. Integer settings raise ValueError at import time if
    the variable holds a non-numeric value.
    NOTE(review): fallback credentials are embedded below; presumably meant
    for local development only - confirm before deploying.
    """

    # MS SQL Server settings
    MSSQL_HOST: str = _env_str("MSSQL_HOST", "mvp-platform-vehicles-mssql")
    MSSQL_PORT: int = _env_int("MSSQL_PORT", "1433")
    MSSQL_DATABASE: str = _env_str("MSSQL_DATABASE", "VPICList")
    MSSQL_USER: str = _env_str("MSSQL_USER", "sa")
    MSSQL_PASSWORD: str = _env_str("MSSQL_PASSWORD", "Platform123!")

    # PostgreSQL settings
    POSTGRES_HOST: str = _env_str("POSTGRES_HOST", "mvp-platform-vehicles-db")
    POSTGRES_PORT: int = _env_int("POSTGRES_PORT", "5432")
    POSTGRES_DATABASE: str = _env_str("POSTGRES_DATABASE", "vehicles")
    POSTGRES_USER: str = _env_str("POSTGRES_USER", "mvp_platform_user")
    POSTGRES_PASSWORD: str = _env_str("POSTGRES_PASSWORD", "platform123")

    # Redis settings
    REDIS_HOST: str = _env_str("REDIS_HOST", "mvp-platform-vehicles-redis")
    REDIS_PORT: int = _env_int("REDIS_PORT", "6379")
    REDIS_DB: int = _env_int("REDIS_DB", "0")

    # ETL Scheduling
    ETL_SCHEDULE: str = _env_str("ETL_SCHEDULE", "0 2 * * 0")  # Weekly at 2 AM on Sunday

    # ETL settings
    BATCH_SIZE: int = _env_int("BATCH_SIZE", "10000")
    PARALLEL_WORKERS: int = _env_int("PARALLEL_WORKERS", "4")
    LOG_LEVEL: str = _env_str("LOG_LEVEL", "INFO")

    # Confidence thresholds
    MIN_CONFIDENCE_SCORE: int = _env_int("MIN_CONFIDENCE_SCORE", "50")

    # ETL behavior toggles
    DISABLE_ALL_MODELS_FALLBACK: bool = _env_flag("DISABLE_ALL_MODELS_FALLBACK", "true")


# Shared settings instance.
config = ETLConfig()
|
||||
152
mvp-platform-services/vehicles/etl/connections.py
Executable file
152
mvp-platform-services/vehicles/etl/connections.py
Executable file
@@ -0,0 +1,152 @@
|
||||
import pyodbc
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
import asyncpg
|
||||
import redis
|
||||
from contextlib import contextmanager
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
from .config import config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DatabaseConnections:
    """Manage database connections with retry logic and timeouts"""

    def __init__(self):
        # Client/pool slots; the Redis client and pg pool are filled lazily.
        self.mssql_conn = None
        self.postgres_conn = None
        self.redis_client = None
        self.pg_pool = None
        # Retry policy used by _retry_connection.
        self.max_retries = 3
        self.retry_delay = 2  # seconds

    def _retry_connection(self, connection_func, connection_type: str, max_retries: Optional[int] = None):
        """Call ``connection_func`` until it succeeds, backing off exponentially.

        Args:
            connection_func: Zero-argument callable that opens the connection.
            connection_type: Label used in log messages.
            max_retries: Number of attempts; a falsy value (None or 0) means
                ``self.max_retries``.

        Returns:
            Whatever ``connection_func`` returns on its first success.

        Raises:
            Exception: The error of the final attempt, re-raised unchanged.
        """
        attempts = max_retries or self.max_retries
        final_attempt = attempts - 1

        for attempt in range(attempts):
            try:
                return connection_func()
            except Exception as e:
                if attempt == final_attempt:
                    logger.error(f"Failed to connect to {connection_type} after {attempts} attempts: {e}")
                    raise

                # Delay doubles each round: retry_delay, 2x, 4x, ...
                wait_time = self.retry_delay * (2 ** attempt)
                logger.warning(f"{connection_type} connection failed (attempt {attempt + 1}/{attempts}): {e}")
                logger.info(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)

    @staticmethod
    def _close_quietly(conn, label: str):
        """Close ``conn``; log a warning instead of raising if closing fails."""
        try:
            conn.close()
        except Exception as e:
            logger.warning(f"Error closing {label} connection: {e}")

    @contextmanager
    def mssql_connection(self):
        """Context manager for MS SQL connection using pyodbc with retry logic"""
        def _connect():
            parts = [
                "DRIVER={ODBC Driver 17 for SQL Server};",
                f"SERVER={config.MSSQL_HOST},{config.MSSQL_PORT};",
                f"DATABASE={config.MSSQL_DATABASE};",
                f"UID={config.MSSQL_USER};",
                f"PWD={config.MSSQL_PASSWORD};",
                "TrustServerCertificate=yes;",
                "Connection Timeout=30;",
                "Command Timeout=300;",
            ]
            return pyodbc.connect("".join(parts))

        conn = self._retry_connection(_connect, "MSSQL")
        try:
            yield conn
        finally:
            self._close_quietly(conn, "MSSQL")

    @contextmanager
    def postgres_connection(self):
        """Context manager for PostgreSQL connection with retry logic"""
        def _connect():
            params = dict(
                host=config.POSTGRES_HOST,
                port=config.POSTGRES_PORT,
                database=config.POSTGRES_DATABASE,
                user=config.POSTGRES_USER,
                password=config.POSTGRES_PASSWORD,
                cursor_factory=RealDictCursor,
                connect_timeout=30,
                options='-c statement_timeout=300000',  # 5 minutes
            )
            return psycopg2.connect(**params)

        conn = self._retry_connection(_connect, "PostgreSQL")
        try:
            yield conn
        finally:
            self._close_quietly(conn, "PostgreSQL")

    async def create_pg_pool(self):
        """Create the async PostgreSQL pool, store it on ``self.pg_pool`` and return it"""
        pool = await asyncpg.create_pool(
            host=config.POSTGRES_HOST,
            port=config.POSTGRES_PORT,
            database=config.POSTGRES_DATABASE,
            user=config.POSTGRES_USER,
            password=config.POSTGRES_PASSWORD,
            min_size=10,
            max_size=20
        )
        self.pg_pool = pool
        return self.pg_pool

    def get_redis_client(self):
        """Return the Redis client, creating it on first use"""
        if self.redis_client:
            return self.redis_client

        self.redis_client = redis.Redis(
            host=config.REDIS_HOST,
            port=config.REDIS_PORT,
            db=config.REDIS_DB,
            decode_responses=True
        )
        return self.redis_client
|
||||
|
||||
def test_connections():
    """Test all database connections for health check.

    Runs ``SELECT 1`` against MSSQL (the ``master`` database) and
    PostgreSQL, then pings Redis.

    Returns:
        True when all three checks succeed; False as soon as any of them
        raises (the error is logged, never propagated).
    """
    try:
        # Test MSSQL connection (use master DB to avoid failures before restore)
        db = DatabaseConnections()
        mssql_master_conn_str = (
            f"DRIVER={{ODBC Driver 17 for SQL Server}};"
            f"SERVER={config.MSSQL_HOST},{config.MSSQL_PORT};"
            f"DATABASE=master;"
            f"UID={config.MSSQL_USER};"
            f"PWD={config.MSSQL_PASSWORD};"
            f"TrustServerCertificate=yes;"
        )
        # A pyodbc connection used as a context manager only commits on exit;
        # it is not closed. Close explicitly so repeated health checks do not
        # leak connections.
        mssql_conn = pyodbc.connect(mssql_master_conn_str)
        try:
            cursor = mssql_conn.cursor()
            cursor.execute("SELECT 1")
            cursor.fetchone()
        finally:
            mssql_conn.close()
        logger.info("MSSQL connection successful (master)")

        # Test PostgreSQL connection
        with db.postgres_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT 1")
            cursor.fetchone()
            logger.info("PostgreSQL connection successful")

        # Test Redis connection
        redis_client = db.get_redis_client()
        redis_client.ping()
        logger.info("Redis connection successful")

        return True
    except Exception as e:
        logger.error(f"Connection test failed: {e}")
        return False


# Module-level instance shared by importers of this module.
db_connections = DatabaseConnections()
|
||||
@@ -0,0 +1 @@
|
||||
# ETL Downloaders
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,180 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
NHTSA vPIC Database Downloader
|
||||
Downloads and prepares the NHTSA vPIC database file for ETL processing
|
||||
"""
|
||||
import os
|
||||
import logging
|
||||
import requests
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class NHTSADownloader:
    """Downloads and manages NHTSA vPIC database files"""

    def __init__(self, download_dir: str = "/app/data"):
        """Remember the download directory and create it if needed.

        Args:
            download_dir: Directory that receives downloaded/extracted files.
        """
        self.download_dir = Path(download_dir)
        # parents=True: a nested directory whose parent is missing must not fail.
        self.download_dir.mkdir(parents=True, exist_ok=True)

    def get_latest_database_url(self) -> str:
        """
        Get the latest NHTSA vPIC database URL
        Uses July 2025 version as specified
        """
        return "https://vpic.nhtsa.dot.gov/api/vPICList_lite_2025_07.bak.zip"

    def download_database(self, url: Optional[str] = None) -> Optional[Path]:
        """
        Download NHTSA vPIC database file

        Args:
            url: Database URL (defaults to latest)

        Returns:
            Path to downloaded .bak file or None if failed
        """
        if url is None:
            url = self.get_latest_database_url()

        logger.info(f"Starting download of NHTSA vPIC database from: {url}")

        zip_path = None
        try:
            # Extract filename from URL
            zip_filename = url.split('/')[-1]
            zip_path = self.download_dir / zip_filename

            # (connect, read) timeouts so a stalled server cannot hang the ETL
            # forever; the context manager releases the HTTP connection.
            with requests.get(url, stream=True, timeout=(30, 300)) as response:
                response.raise_for_status()

                total_size = int(response.headers.get('content-length', 0))
                logger.info(f"Downloading {zip_filename} ({total_size:,} bytes)")

                log_step = 1024 * 1024 * 10  # Log every 10MB
                next_log_at = log_step
                downloaded = 0

                with open(zip_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if not chunk:
                            continue
                        f.write(chunk)
                        downloaded += len(chunk)

                        # Threshold-based: chunks are not guaranteed to be exactly
                        # 8192 bytes, so an exact-multiple test may never fire.
                        if total_size > 0 and downloaded >= next_log_at:
                            progress = (downloaded / total_size) * 100
                            logger.info(f"Download progress: {progress:.1f}% ({downloaded:,}/{total_size:,} bytes)")
                            next_log_at = (downloaded // log_step + 1) * log_step

            logger.info(f"Successfully downloaded: {zip_path}")

            # Extract the .bak file
            bak_path = self.extract_bak_file(zip_path)

            # Clean up zip file
            zip_path.unlink()
            logger.info(f"Cleaned up zip file: {zip_path}")

            return bak_path

        except Exception as e:
            logger.error(f"Failed to download database: {e}")
            # Do not leave a partial or unusable archive behind.
            if zip_path is not None:
                try:
                    zip_path.unlink()
                except OSError as cleanup_error:
                    logger.debug(f"Could not remove {zip_path}: {cleanup_error}")
            return None

    def extract_bak_file(self, zip_path: Path) -> Path:
        """
        Extract .bak file from zip archive

        Args:
            zip_path: Path to zip file

        Returns:
            Path to extracted .bak file

        Raises:
            ValueError: If the archive contains no member ending in '.bak'
        """
        logger.info(f"Extracting .bak file from: {zip_path}")

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Find the .bak file
            bak_files = [name for name in zip_ref.namelist() if name.endswith('.bak')]

            if not bak_files:
                raise ValueError("No .bak file found in zip archive")

            if len(bak_files) > 1:
                logger.warning(f"Multiple .bak files found, using first: {bak_files}")

            bak_filename = bak_files[0]
            logger.info(f"Extracting: {bak_filename}")

            # extract() returns the normalized path it actually wrote, which can
            # differ from download_dir / member name when the name is sanitized.
            bak_path = Path(zip_ref.extract(bak_filename, self.download_dir))

        logger.info(f"Successfully extracted: {bak_path}")
        return bak_path

    def get_existing_bak_file(self) -> Optional[Path]:
        """
        Find an existing .bak file in preferred locations.
        Searches both the shared mount (/app/shared) and local download dir.

        Returns:
            Path to most recent .bak file or None
        """
        search_dirs = [Path("/app/shared"), self.download_dir]
        candidates = []

        for d in search_dirs:
            try:
                if d.exists():
                    candidates.extend(list(d.glob("*.bak")))
            except Exception as e:
                logger.debug(f"Skipping directory {d}: {e}")

        if candidates:
            # Most recently modified file wins.
            latest_bak = max(candidates, key=lambda p: p.stat().st_mtime)
            logger.info(f"Found existing .bak file: {latest_bak}")
            return latest_bak

        return None

    def ensure_database_file(self, force_download: bool = False) -> Optional[Path]:
        """
        Ensure we have a database file - download if needed

        Args:
            force_download: Force download even if file exists

        Returns:
            Path to .bak file or None if failed
        """
        if not force_download:
            existing_file = self.get_existing_bak_file()
            if existing_file:
                logger.info(f"Using existing database file: {existing_file}")
                return existing_file

        logger.info("Downloading fresh database file...")
        return self.download_database()

    def get_database_info(self, bak_path: Path) -> dict:
        """
        Get information about the database file

        Args:
            bak_path: Path to .bak file

        Returns:
            Dictionary with file info; just {"exists": False} when the
            file is missing
        """
        if not bak_path.exists():
            return {"exists": False}

        stat = bak_path.stat()
        return {
            "exists": True,
            "path": str(bak_path),
            "size_mb": round(stat.st_size / (1024 * 1024), 1),
            "modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
            "name": bak_path.name
        }
|
||||
0
mvp-platform-services/vehicles/etl/extractors/__init__.py
Executable file
0
mvp-platform-services/vehicles/etl/extractors/__init__.py
Executable file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
629
mvp-platform-services/vehicles/etl/extractors/json_extractor.py
Normal file
629
mvp-platform-services/vehicles/etl/extractors/json_extractor.py
Normal file
@@ -0,0 +1,629 @@
|
||||
"""
|
||||
JSON Extractor for Manual Vehicle Data Processing
|
||||
|
||||
Extracts and normalizes vehicle data from JSON files into database-ready structures.
|
||||
Integrates with MakeNameMapper and EngineSpecParser utilities for comprehensive
|
||||
data processing with L→I normalization and make name conversion.
|
||||
|
||||
Key Features:
|
||||
- Extract make/model/year/trim/engine data from JSON files
|
||||
- Handle electric vehicles (empty engines → default motor)
|
||||
- Data validation and quality assurance
|
||||
- Progress tracking and error reporting
|
||||
|
||||
Usage:
|
||||
extractor = JsonExtractor(make_mapper, engine_parser)
|
||||
make_data = extractor.extract_make_data('sources/makes/toyota.json')
|
||||
all_data = extractor.extract_all_makes('sources/makes/')
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import glob
|
||||
import logging
|
||||
from typing import Any, List, Dict, Optional, Generator, Tuple
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
# Import our utilities (handle both relative and direct imports)
|
||||
try:
|
||||
from ..utils.make_name_mapper import MakeNameMapper
|
||||
from ..utils.engine_spec_parser import EngineSpecParser, EngineSpec
|
||||
except ImportError:
|
||||
# Fallback for direct execution
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||
from utils.make_name_mapper import MakeNameMapper
|
||||
from utils.engine_spec_parser import EngineSpecParser, EngineSpec
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ValidationResult:
    """Outcome of validating one JSON document: a verdict plus collected messages."""
    is_valid: bool
    errors: List[str]
    warnings: List[str]

    @property
    def has_errors(self) -> bool:
        # True when at least one error message was recorded.
        return bool(self.errors)

    @property
    def has_warnings(self) -> bool:
        # True when at least one warning message was recorded.
        return bool(self.warnings)
|
||||
|
||||
|
||||
@dataclass
class ModelData:
    """One model's extracted data: years, parsed engines and trim names."""
    name: str                   # Model name from JSON
    years: List[int]            # Years this model appears in
    engines: List[EngineSpec]   # Parsed and normalized engines
    trims: List[str]            # Trim names (from submodels)
    is_electric: bool = False   # True if empty engines array detected

    @property
    def total_trims(self) -> int:
        """Number of trim names held."""
        return len(self.trims)

    @property
    def total_engines(self) -> int:
        """Number of engine specs held."""
        return len(self.engines)

    @property
    def year_range(self) -> str:
        """'min-max' for several years, the single year as text, or 'Unknown' when empty."""
        if not self.years:
            return "Unknown"
        if len(self.years) > 1:
            return f"{min(self.years)}-{max(self.years)}"
        return str(self.years[0])
|
||||
|
||||
|
||||
@dataclass
class MakeData:
    """Everything extracted for one make: its models plus extraction diagnostics."""
    name: str                       # Normalized display name (e.g., "Alfa Romeo")
    filename: str                   # Original JSON filename
    models: List[ModelData]
    processing_errors: List[str]    # Any errors during extraction
    processing_warnings: List[str]  # Any warnings during extraction

    @property
    def total_models(self) -> int:
        """Number of models in this make."""
        return len(self.models)

    @property
    def total_engines(self) -> int:
        """Engine count summed over all models."""
        return sum(entry.total_engines for entry in self.models)

    @property
    def total_trims(self) -> int:
        """Trim count summed over all models."""
        return sum(entry.total_trims for entry in self.models)

    @property
    def electric_models_count(self) -> int:
        """Number of models flagged as electric."""
        return len([entry for entry in self.models if entry.is_electric])

    @property
    def year_range(self) -> str:
        """'min-max' across all models, a single year as text, or 'Unknown' when no years exist."""
        all_years = [year for entry in self.models for year in entry.years]
        if not all_years:
            return "Unknown"
        if len(set(all_years)) > 1:
            return f"{min(all_years)}-{max(all_years)}"
        return str(all_years[0])
|
||||
|
||||
|
||||
@dataclass
class ExtractionResult:
    """Aggregate outcome of extracting every make file in a directory."""
    makes: List[MakeData]
    total_files_processed: int
    successful_extractions: int
    failed_extractions: int
    total_models: int
    total_engines: int
    total_electric_models: int

    @property
    def success_rate(self) -> float:
        """Fraction of processed files that succeeded; 0.0 when none were processed."""
        if self.total_files_processed > 0:
            return self.successful_extractions / self.total_files_processed
        return 0.0
|
||||
|
||||
|
||||
class JsonExtractor:
|
||||
"""Extract normalized vehicle data from JSON files"""
|
||||
|
||||
def __init__(self, make_mapper: MakeNameMapper, engine_parser: EngineSpecParser):
    """
    Store the helper utilities this extractor delegates to.

    Args:
        make_mapper: Turns a JSON filename into a normalized make name
        engine_parser: Parses engine strings and creates the default electric motor spec
    """
    self.engine_parser = engine_parser
    self.make_mapper = make_mapper

    logger.info("JsonExtractor initialized with MakeNameMapper and EngineSpecParser")
|
||||
|
||||
def validate_json_structure(self, json_data: dict, filename: str) -> ValidationResult:
    """
    Validate JSON structure before processing

    The expected shape is ``{make_key: [ {"year": ..., "models": [ {"name": ...,
    "engines": [...], "submodels": [...]}, ... ]}, ... ]}``. Structural
    problems are collected as errors; suspicious but usable values (empty
    make, year outside 1900-2030) as warnings.

    Args:
        json_data: Loaded JSON data
        filename: Source filename for error context (used in log messages)

    Returns:
        ValidationResult with validity status and any issues
    """
    errors = []
    warnings = []

    try:
        # Check top-level structure
        if not isinstance(json_data, dict):
            errors.append("JSON must be a dictionary")
            return ValidationResult(False, errors, warnings)

        # Should have exactly one key (the make name)
        if len(json_data.keys()) != 1:
            errors.append(f"JSON should have exactly one top-level key, found {len(json_data.keys())}")
            return ValidationResult(False, errors, warnings)

        make_key = list(json_data.keys())[0]
        make_data = json_data[make_key]

        # Make data should be a list of year entries
        if not isinstance(make_data, list):
            errors.append(f"Make data for '{make_key}' must be a list")
            return ValidationResult(False, errors, warnings)

        if len(make_data) == 0:
            warnings.append(f"Make '{make_key}' has no year entries")

        # Validate year entries
        for i, year_entry in enumerate(make_data):
            if not isinstance(year_entry, dict):
                errors.append(f"Year entry {i} must be a dictionary")
                continue

            # Check required fields
            has_year = 'year' in year_entry
            if not has_year:
                errors.append(f"Year entry {i} missing 'year' field")

            if 'models' not in year_entry:
                errors.append(f"Year entry {i} missing 'models' field")
                continue

            # Validate year. Skipped when the field is absent: that is already
            # reported above, and indexing it would raise KeyError and abort
            # validation of every remaining entry.
            if has_year:
                try:
                    year = int(year_entry['year'])
                    if year < 1900 or year > 2030:
                        warnings.append(f"Unusual year value: {year}")
                except (ValueError, TypeError):
                    errors.append(f"Invalid year value in entry {i}: {year_entry.get('year')}")

            # Validate models
            models = year_entry['models']
            if not isinstance(models, list):
                errors.append(f"Models in year entry {i} must be a list")
                continue

            for j, model in enumerate(models):
                if not isinstance(model, dict):
                    errors.append(f"Model {j} in year {year_entry.get('year')} must be a dictionary")
                    continue

                if 'name' not in model:
                    errors.append(f"Model {j} in year {year_entry.get('year')} missing 'name' field")

                # Engines and submodels are optional but should be lists if present
                if 'engines' in model and not isinstance(model['engines'], list):
                    errors.append(f"Engines for model {model.get('name')} must be a list")

                if 'submodels' in model and not isinstance(model['submodels'], list):
                    errors.append(f"Submodels for model {model.get('name')} must be a list")

    except Exception as e:
        errors.append(f"Unexpected error during validation: {str(e)}")

    is_valid = len(errors) == 0

    if errors:
        logger.warning(f"JSON validation failed for {filename}: {len(errors)} errors")
    elif warnings:
        logger.info(f"JSON validation for {filename}: {len(warnings)} warnings")
    else:
        logger.debug(f"JSON validation passed for {filename}")

    return ValidationResult(is_valid, errors, warnings)
|
||||
|
||||
def extract_make_data(self, json_file_path: str) -> MakeData:
    """
    Extract complete make data from a single JSON file

    The file is loaded and validated, models are grouped by name across all
    year entries, and each model's engine strings are parsed into specs.
    Failures never propagate from the extraction itself: they are recorded
    in ``processing_errors`` and a MakeData with no models is returned.

    Args:
        json_file_path: Path to JSON file

    Returns:
        MakeData with extracted and normalized data
    """
    filename = os.path.basename(json_file_path)
    logger.info(f"Extracting make data from {filename}")

    processing_errors = []
    processing_warnings = []

    def _empty_make() -> MakeData:
        # Result used when the file cannot be processed at all.
        return MakeData(
            name=self.make_mapper.normalize_make_name(filename),
            filename=filename,
            models=[],
            processing_errors=processing_errors,
            processing_warnings=processing_warnings
        )

    try:
        # Load and validate JSON
        with open(json_file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        validation = self.validate_json_structure(json_data, filename)
        processing_errors.extend(validation.errors)
        processing_warnings.extend(validation.warnings)

        if not validation.is_valid:
            logger.error(f"JSON validation failed for {filename}")
            return _empty_make()

        # Get normalized make name
        make_name = self.make_mapper.normalize_make_name(filename)
        logger.debug(f"Normalized make name: {filename} → {make_name}")

        # Extract data (validation guarantees exactly one top-level key)
        make_key = list(json_data.keys())[0]
        year_entries = json_data[make_key]

        models_by_name = self._collect_models_by_name(
            year_entries, processing_errors, processing_warnings
        )

        # Convert to ModelData objects; a failing model is recorded and skipped
        models = []
        for model_name, model_info in models_by_name.items():
            try:
                models.append(self._build_model_data(make_name, model_name, model_info))
            except Exception as e:
                processing_errors.append(f"Error processing model {model_name}: {str(e)}")
                continue

        # Sort models by name
        models.sort(key=lambda m: m.name)

        make_data = MakeData(
            name=make_name,
            filename=filename,
            models=models,
            processing_errors=processing_errors,
            processing_warnings=processing_warnings
        )

        logger.info(f"Extracted {filename}: {len(models)} models, "
                    f"{make_data.total_engines} engines, {make_data.electric_models_count} electric models")

        return make_data

    except Exception as e:
        logger.error(f"Failed to extract make data from {filename}: {str(e)}")
        processing_errors.append(f"Fatal error: {str(e)}")
        return _empty_make()

def _collect_models_by_name(self, year_entries, processing_errors, processing_warnings):
    """Group years, engine strings and trims by model name across all year entries.

    Returns ``{model_name: {'years': set, 'engines': set, 'trims': set}}``.
    Appends to the two message lists in place; a year entry that raises
    ValueError/TypeError is recorded as an error and skipped.
    """
    models_by_name = {}

    for year_entry in year_entries:
        try:
            year = int(year_entry['year'])

            for model_entry in year_entry.get('models', []):
                model_name = model_entry.get('name', '').strip()
                if not model_name:
                    processing_warnings.append(f"Empty model name in year {year}")
                    continue

                # Initialize model data if not seen before
                if model_name not in models_by_name:
                    models_by_name[model_name] = {
                        'years': set(),
                        'engines': set(),
                        'trims': set()
                    }
                model_info = models_by_name[model_name]

                model_info['years'].add(year)

                # Blank engine strings are ignored
                for engine_str in model_entry.get('engines', []):
                    if engine_str and engine_str.strip():
                        model_info['engines'].add(engine_str.strip())

                # Trims come from submodels; blank names are ignored
                for trim in model_entry.get('submodels', []):
                    if trim and trim.strip():
                        model_info['trims'].add(trim.strip())

        except (ValueError, TypeError) as e:
            processing_errors.append(f"Error processing year entry: {str(e)}")
            continue

    return models_by_name

def _build_model_data(self, make_name, model_name, model_info):
    """Turn one grouped model entry into a ModelData with parsed, de-duplicated engines."""
    if not model_info['engines']:
        # Empty engines array - electric vehicle
        is_electric = True
        engine_specs = [self.engine_parser.create_electric_motor()]
        logger.debug(f"Created electric motor for {make_name} {model_name}")
    else:
        is_electric = False
        # sorted(): iterating the set directly yields an order that changes
        # between runs (string hash randomization), which made the engine
        # list non-reproducible; years and trims are already sorted.
        engine_specs = [
            self.engine_parser.parse_engine_string(engine_str)
            for engine_str in sorted(model_info['engines'])
        ]

    # Remove duplicate engines based on key attributes
    unique_engines = self.engine_parser.get_unique_engines(engine_specs)

    return ModelData(
        name=model_name,
        years=sorted(list(model_info['years'])),
        engines=unique_engines,
        trims=sorted(list(model_info['trims'])),
        is_electric=is_electric
    )
|
||||
|
||||
def extract_all_makes(self, sources_dir: str) -> ExtractionResult:
    """
    Run extraction over every ``*.json`` file directly inside a directory

    Files are handled in sorted path order. A file counts as failed when its
    MakeData carries any processing error or when extraction raises; in the
    latter case a MakeData with no models is recorded for it.

    Args:
        sources_dir: Directory containing JSON make files

    Returns:
        ExtractionResult with all extracted data and statistics
    """
    logger.info(f"Starting extraction of all makes from {sources_dir}")

    # Sorted so the processing order is consistent.
    json_files = sorted(glob.glob(os.path.join(sources_dir, '*.json')))

    if not json_files:
        logger.warning(f"No JSON files found in {sources_dir}")
        return ExtractionResult(
            makes=[],
            total_files_processed=0,
            successful_extractions=0,
            failed_extractions=0,
            total_models=0,
            total_engines=0,
            total_electric_models=0
        )

    file_count = len(json_files)
    logger.info(f"Found {file_count} JSON files to process")

    makes = []
    succeeded = 0
    failed = 0

    for json_file in json_files:
        try:
            make_data = self.extract_make_data(json_file)
            makes.append(make_data)

            if not make_data.processing_errors:
                succeeded += 1
                logger.debug(f"Extraction successful for {make_data.filename}")
            else:
                failed += 1
                logger.error(f"Extraction completed with errors for {make_data.filename}")

        except Exception as e:
            logger.error(f"Fatal error processing {os.path.basename(json_file)}: {str(e)}")
            failed += 1

            # Keep a minimal record so the failed file still shows up in the result
            filename = os.path.basename(json_file)
            makes.append(MakeData(
                name=self.make_mapper.normalize_make_name(filename),
                filename=filename,
                models=[],
                processing_errors=[f"Fatal extraction error: {str(e)}"],
                processing_warnings=[]
            ))

    # Aggregate counts over every make, including failed ones
    total_models = sum(entry.total_models for entry in makes)
    total_engines = sum(entry.total_engines for entry in makes)
    total_electric_models = sum(entry.electric_models_count for entry in makes)

    logger.info(f"Extraction complete: {succeeded}/{file_count} successful, "
                f"{total_models} models, {total_engines} engines, {total_electric_models} electric models")

    return ExtractionResult(
        makes=makes,
        total_files_processed=file_count,
        successful_extractions=succeeded,
        failed_extractions=failed,
        total_models=total_models,
        total_engines=total_engines,
        total_electric_models=total_electric_models
    )
|
||||
|
||||
def get_extraction_statistics(self, result: ExtractionResult) -> Dict[str, any]:
    """
    Summarise an extraction run as a nested dictionary.

    Args:
        result: ExtractionResult from extract_all_makes

    Returns:
        Dictionary with four top-level keys:
        'files'   - processed / successful / failed counts and success rate
        'data'    - make, model, engine and electric-model totals
        'quality' - how many makes carry errors / warnings, and the totals
        'makes'   - one summary dict per make, in the order of result.makes
    """
    all_makes = result.makes

    file_summary = {
        'total_processed': result.total_files_processed,
        'successful': result.successful_extractions,
        'failed': result.failed_extractions,
        'success_rate': result.success_rate
    }

    data_summary = {
        'total_makes': len(all_makes),
        'total_models': result.total_models,
        'total_engines': result.total_engines,
        'electric_models': result.total_electric_models
    }

    # A make counts as "with errors"/"with warnings" when its list is non-empty.
    quality_summary = {
        'makes_with_errors': len([m for m in all_makes if m.processing_errors]),
        'makes_with_warnings': len([m for m in all_makes if m.processing_warnings]),
        'total_errors': sum(len(m.processing_errors) for m in all_makes),
        'total_warnings': sum(len(m.processing_warnings) for m in all_makes)
    }

    # Per-make breakdown
    per_make = [
        {
            'name': m.name,
            'filename': m.filename,
            'models': m.total_models,
            'engines': m.total_engines,
            'trims': m.total_trims,
            'electric_models': m.electric_models_count,
            'year_range': m.year_range,
            'errors': len(m.processing_errors),
            'warnings': len(m.processing_warnings)
        }
        for m in all_makes
    ]

    return {
        'files': file_summary,
        'data': data_summary,
        'quality': quality_summary,
        'makes': per_make
    }
|
||||
|
||||
def print_extraction_report(self, result: ExtractionResult) -> None:
    """
    Write a human-readable extraction report to stdout.

    The numbers come from get_extraction_statistics(result); the report has a
    file-processing section, a data section, a quality section, an optional
    list of makes that have errors, and the ten makes with the most models.

    Args:
        result: ExtractionResult from extract_all_makes
    """
    stats = self.get_extraction_statistics(result)
    files = stats['files']
    data = stats['data']
    quality = stats['quality']

    print(f"🚀 JSON EXTRACTION REPORT")
    print(f"=" * 50)

    # File processing summary
    print(f"\n📁 FILE PROCESSING")
    print(f" Files processed: {files['total_processed']}")
    print(f" Successful: {files['successful']}")
    print(f" Failed: {files['failed']}")
    print(f" Success rate: {files['success_rate']:.1%}")

    # Data summary
    print(f"\n📊 DATA EXTRACTED")
    print(f" Makes: {data['total_makes']}")
    print(f" Models: {data['total_models']}")
    print(f" Engines: {data['total_engines']}")
    print(f" Electric models: {data['electric_models']}")

    # Quality summary
    print(f"\n🔍 QUALITY ASSESSMENT")
    print(f" Makes with errors: {quality['makes_with_errors']}")
    print(f" Makes with warnings: {quality['makes_with_warnings']}")
    print(f" Total errors: {quality['total_errors']}")
    print(f" Total warnings: {quality['total_warnings']}")

    # List every make that recorded at least one error
    if quality['makes_with_errors'] > 0:
        print(f"\n⚠️ MAKES WITH ERRORS:")
        failing = [m for m in result.makes if m.processing_errors]
        for make in failing:
            print(f" {make.name} ({make.filename}): {len(make.processing_errors)} errors")

    # Ten largest makes by model count (stable sort keeps input order on ties)
    print(f"\n🏆 TOP MAKES BY MODEL COUNT:")
    ranked = sorted(result.makes, key=lambda m: m.total_models, reverse=True)
    for make in ranked[:10]:
        print(f" {make.name}: {make.total_models} models, {make.total_engines} engines")
|
||||
|
||||
|
||||
# Example usage and testing functions
|
||||
def example_usage():
    """Demonstrate JsonExtractor usage.

    Builds a JsonExtractor from a MakeNameMapper and an EngineSpecParser,
    extracts ``sources/makes/toyota.json`` if it exists, then extracts every
    make under ``sources/makes`` and prints the extraction report. Prints a
    message and does nothing else when the sources directory is missing.
    """
    # Import os/sys unconditionally at the top of the function. An import
    # statement anywhere in a function makes the name function-local, so
    # importing them only inside the ``except ImportError`` branch left ``os``
    # unbound (UnboundLocalError) whenever the relative imports succeeded.
    import os
    import sys

    print("🚀 JsonExtractor Example Usage")
    print("=" * 40)

    # Use direct imports for example usage
    try:
        from ..utils.make_name_mapper import MakeNameMapper
        from ..utils.engine_spec_parser import EngineSpecParser
    except ImportError:
        # Fallback for direct execution: make the parent directory importable
        sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
        from utils.make_name_mapper import MakeNameMapper
        from utils.engine_spec_parser import EngineSpecParser

    # Initialize utilities
    make_mapper = MakeNameMapper()
    engine_parser = EngineSpecParser()

    # Create extractor
    extractor = JsonExtractor(make_mapper, engine_parser)

    sources_dir = "sources/makes"
    if not os.path.exists(sources_dir):
        print(f"Sources directory not found: {sources_dir}")
        return

    # Extract single make
    toyota_file = os.path.join(sources_dir, "toyota.json")
    if os.path.exists(toyota_file):
        print(f"\n📄 Extracting from toyota.json...")
        toyota_data = extractor.extract_make_data(toyota_file)

        print(f" Make: {toyota_data.name}")
        print(f" Models: {toyota_data.total_models}")
        print(f" Engines: {toyota_data.total_engines}")
        print(f" Electric models: {toyota_data.electric_models_count}")
        print(f" Year range: {toyota_data.year_range}")

        if toyota_data.processing_errors:
            print(f" Errors: {len(toyota_data.processing_errors)}")
        if toyota_data.processing_warnings:
            print(f" Warnings: {len(toyota_data.processing_warnings)}")

    # Extract all makes
    print(f"\n🔄 Extracting all makes...")
    result = extractor.extract_all_makes(sources_dir)
    extractor.print_extraction_report(result)


if __name__ == "__main__":
    example_usage()
|
||||
337
mvp-platform-services/vehicles/etl/extractors/mssql_extractor.py
Executable file
337
mvp-platform-services/vehicles/etl/extractors/mssql_extractor.py
Executable file
@@ -0,0 +1,337 @@
|
||||
import logging
|
||||
from typing import List, Dict, Optional, Generator
|
||||
from ..connections import db_connections
|
||||
from ..utils.make_filter import MakeFilter
|
||||
from tqdm import tqdm
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class MSSQLExtractor:
    """Extract data from MS SQL Server source database.

    Queries that touch makes, manufacturers or WMIs are restricted with the
    SQL predicate returned by ``self.make_filter.get_sql_filter(column)``.
    Rows are returned as dictionaries keyed by the cursor's column names.
    """

    def __init__(self, make_filter: Optional[MakeFilter] = None):
        """
        Args:
            make_filter: Filter supplying the allowed makes; a default
                MakeFilter() is created when omitted.
        """
        self.batch_size = 10000  # rows per page in extract_patterns_data
        self.make_filter = make_filter or MakeFilter()
        logger.info(f"Initialized MSSQL extractor with {len(self.make_filter.get_allowed_makes())} allowed makes")

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _fetch_all(self, query: str) -> List[Dict]:
        """Run ``query`` on a fresh MSSQL connection and return all rows as dicts."""
        with db_connections.mssql_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(query)
            rows = cursor.fetchall()
            return self._rows_to_dicts(cursor, rows)

    def _allowed_manufacturer_ids_sql(self, make_alias: str = 'm') -> str:
        """Sub-query selecting ids of manufacturers linked to an allowed make.

        ``make_alias`` is the alias given to dbo.Make inside the sub-query; it
        exists so callers whose outer query already uses ``m`` can avoid a clash.
        """
        return f"""
                SELECT DISTINCT mfr.Id
                FROM dbo.Manufacturer mfr
                JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
                JOIN dbo.Make {make_alias} ON mm.MakeId = {make_alias}.Id
                WHERE {self.make_filter.get_sql_filter(make_alias + '.Name')}
        """

    def _allowed_make_ids_sql(self) -> str:
        """Sub-query selecting ids of the allowed makes."""
        return f"""
                SELECT Id FROM dbo.Make
                WHERE {self.make_filter.get_sql_filter('Name')}
        """

    # ------------------------------------------------------------------
    # Extraction API
    # ------------------------------------------------------------------
    def extract_wmi_data(self) -> List[Dict]:
        """Extract WMI (World Manufacturer Identifier) data with make filtering"""
        logger.info("Extracting WMI data from source database with make filtering")

        query = f"""
            SELECT
                w.Id,
                w.Wmi,
                w.ManufacturerId,
                w.MakeId,
                w.VehicleTypeId,
                w.TruckTypeId,
                w.CountryId,
                w.PublicAvailabilityDate,
                w.NonCompliant,
                w.NonCompliantReason,
                w.CreatedOn,
                w.UpdatedOn,
                w.ProcessedOn
            FROM dbo.Wmi w
            WHERE w.PublicAvailabilityDate <= GETDATE()
            AND w.ManufacturerId IN (
                {self._allowed_manufacturer_ids_sql()}
            )
            ORDER BY w.Id
        """

        results = self._fetch_all(query)
        logger.info(f"Extracted {len(results)} WMI records")
        return results

    def extract_wmi_vin_schema_mappings(self) -> List[Dict]:
        """Extract WMI to VIN Schema mappings with year ranges and make filtering"""
        logger.info("Extracting WMI-VinSchema mappings with make filtering")

        query = f"""
            SELECT
                wvs.WmiId,
                wvs.VinSchemaId,
                wvs.YearFrom,
                wvs.YearTo,
                w.Wmi,
                vs.Name as SchemaName
            FROM dbo.Wmi_VinSchema wvs
            JOIN dbo.Wmi w ON wvs.WmiId = w.Id
            JOIN dbo.VinSchema vs ON wvs.VinSchemaId = vs.Id
            WHERE w.PublicAvailabilityDate <= GETDATE()
            AND w.ManufacturerId IN (
                {self._allowed_manufacturer_ids_sql()}
            )
            AND w.MakeId IN (
                {self._allowed_make_ids_sql()}
            )
            ORDER BY wvs.WmiId, wvs.VinSchemaId
        """

        results = self._fetch_all(query)
        logger.info(f"Extracted {len(results)} WMI-VinSchema mappings (filtered by allowed makes)")
        return results

    def extract_patterns_data(self) -> Generator[List[Dict], None, None]:
        """Extract pattern data in batches with make filtering.

        Yields:
            Lists of at most ``self.batch_size`` row dicts. Iteration stops
            after the counted number of rows, or earlier if a page is empty.
        """
        logger.info("Extracting pattern data from source database with make filtering")

        # FROM/WHERE shared by the count query and the paged query so both
        # always describe the same row set.
        from_where = f"""
            FROM dbo.Pattern p
            JOIN dbo.Element e ON p.ElementId = e.Id
            JOIN dbo.VinSchema vs ON p.VinSchemaId = vs.Id
            JOIN dbo.Wmi_VinSchema wvs ON vs.Id = wvs.VinSchemaId
            JOIN dbo.Wmi w ON wvs.WmiId = w.Id
            JOIN dbo.Wmi_Make wm ON w.Id = wm.WmiId
            JOIN dbo.Make m ON wm.MakeId = m.Id
            WHERE {self.make_filter.get_sql_filter('m.Name')}
            AND e.Id IN (26, 27, 28, 18, 24)
        """

        # First get the total count with filtering
        count_query = f"""
            SELECT COUNT(*) as total
            {from_where}
        """

        with db_connections.mssql_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(count_query)
            total_row = self._row_to_dict(cursor, cursor.fetchone())
            total_count = total_row.get('total', 0)

        logger.info(f"Total patterns to extract (filtered): {total_count}")

        # Extract in batches. The joins fan one pattern out to several
        # (WMI, make) rows, so ordering by p.Id alone leaves ties whose order
        # may differ between page queries, letting OFFSET/FETCH skip or repeat
        # rows. w.Id and m.Id are added as tie-breakers: rows that still tie
        # have identical selected columns, so page contents are deterministic.
        query = f"""
            SELECT
                p.Id,
                p.VinSchemaId,
                p.Keys,
                p.ElementId,
                p.AttributeId,
                e.Name as ElementName,
                e.weight,
                e.GroupName,
                vs.Name as SchemaName,
                w.Wmi,
                m.Name as MakeName
            {from_where}
            ORDER BY p.Id, w.Id, m.Id
            OFFSET {{}} ROWS FETCH NEXT {{}} ROWS ONLY
        """

        with db_connections.mssql_connection() as conn:
            cursor = conn.cursor()

            for offset in tqdm(range(0, total_count, self.batch_size), desc="Extracting filtered patterns"):
                cursor.execute(query.format(offset, self.batch_size))
                rows = cursor.fetchall()

                if not rows:
                    break
                yield self._rows_to_dicts(cursor, rows)

    def extract_elements_data(self) -> List[Dict]:
        """Extract element definitions"""
        logger.info("Extracting element data")

        query = """
            SELECT
                Id,
                Name,
                Code,
                LookupTable,
                Description,
                IsPrivate,
                GroupName,
                DataType,
                MinAllowedValue,
                MaxAllowedValue,
                IsQS,
                Decode,
                weight
            FROM dbo.Element
            ORDER BY Id
        """

        results = self._fetch_all(query)
        logger.info(f"Extracted {len(results)} element definitions")
        return results

    def extract_reference_table(self, table_name: str) -> List[Dict]:
        """Extract data from a reference table with make filtering.

        'Manufacturer', 'Make', 'Model' and 'Wmi' get make-filtered queries;
        any other table is read in full, ordered by Id.

        Args:
            table_name: Name of a table in the ``dbo`` schema.

        Raises:
            ValueError: If an unfiltered ``table_name`` contains anything other
                than letters, digits and underscores (it is interpolated into
                the SQL text, so it must be a plain identifier).
        """
        logger.info(f"Extracting data from {table_name} with make filtering")

        # Apply make filtering - filter by Make brand names (simpler and more efficient)
        if table_name == 'Manufacturer':
            # Extract manufacturers linked to filtered makes only
            query = f"""
                SELECT DISTINCT mfr.* FROM dbo.Manufacturer mfr
                JOIN dbo.Manufacturer_Make mm ON mfr.Id = mm.ManufacturerId
                JOIN dbo.Make m ON mm.MakeId = m.Id
                WHERE {self.make_filter.get_sql_filter('m.Name')}
                ORDER BY mfr.Id
            """
        elif table_name == 'Make':
            # Filter makes directly by brand names (GMC, Ford, Toyota, etc.)
            query = f"""
                SELECT * FROM dbo.Make
                WHERE {self.make_filter.get_sql_filter('Name')}
                ORDER BY Id
            """
        elif table_name == 'Model':
            # Filter models by allowed make brand names
            query = f"""
                SELECT md.* FROM dbo.Model md
                JOIN dbo.Make_Model mm ON md.Id = mm.ModelId
                JOIN dbo.Make m ON mm.MakeId = m.Id
                WHERE {self.make_filter.get_sql_filter('m.Name')}
                ORDER BY md.Id
            """
        elif table_name == 'Wmi':
            # Filter WMI records by allowed manufacturers (linked to makes) AND makes directly
            query = f"""
                SELECT w.* FROM dbo.Wmi w
                WHERE w.PublicAvailabilityDate <= GETDATE()
                AND w.ManufacturerId IN (
                    {self._allowed_manufacturer_ids_sql()}
                )
                AND w.MakeId IN (
                    {self._allowed_make_ids_sql()}
                )
                ORDER BY w.Id
            """
        else:
            # No filtering for other reference tables. The name goes straight
            # into the SQL text, so accept only plain identifiers.
            if not table_name or not all(ch.isalnum() or ch == '_' for ch in table_name):
                raise ValueError(f"Invalid reference table name: {table_name!r}")
            query = f"SELECT * FROM dbo.{table_name} ORDER BY Id"

        results = self._fetch_all(query)
        logger.info(f"Extracted {len(results)} records from {table_name} (filtered by allowed makes)")
        return results

    def extract_make_model_relationships(self) -> List[Dict]:
        """Extract Make-Model relationships with make filtering"""
        logger.info("Extracting Make-Model relationships with make filtering")

        query = f"""
            SELECT
                mm.MakeId,
                mm.ModelId,
                m.Name as MakeName,
                md.Name as ModelName
            FROM dbo.Make_Model mm
            JOIN dbo.Make m ON mm.MakeId = m.Id
            JOIN dbo.Model md ON mm.ModelId = md.Id
            WHERE {self.make_filter.get_sql_filter('m.Name')}
            ORDER BY mm.MakeId, mm.ModelId
        """

        results = self._fetch_all(query)
        logger.info(f"Extracted {len(results)} Make-Model relationships (filtered by allowed makes)")
        return results

    def extract_wmi_make_relationships(self) -> List[Dict]:
        """Extract WMI-Make relationships with make filtering"""
        logger.info("Extracting WMI-Make relationships with make filtering")

        # The manufacturer sub-query aliases dbo.Make as ``mk`` because ``m``
        # is already taken by the outer query.
        query = f"""
            SELECT
                wm.WmiId,
                wm.MakeId,
                w.Wmi,
                m.Name as MakeName
            FROM dbo.Wmi_Make wm
            JOIN dbo.Wmi w ON wm.WmiId = w.Id
            JOIN dbo.Make m ON wm.MakeId = m.Id
            WHERE w.PublicAvailabilityDate <= GETDATE()
            AND w.ManufacturerId IN (
                {self._allowed_manufacturer_ids_sql('mk')}
            )
            AND w.MakeId IN (
                {self._allowed_make_ids_sql()}
            )
            AND m.Id IN (
                {self._allowed_make_ids_sql()}
            )
            ORDER BY wm.WmiId, wm.MakeId
        """

        results = self._fetch_all(query)
        logger.info(f"Extracted {len(results)} WMI-Make relationships (filtered by allowed makes)")
        return results

    # ------------------------------------------------------------------
    # Row conversion
    # ------------------------------------------------------------------
    def _rows_to_dicts(self, cursor, rows) -> List[Dict]:
        """Convert pyodbc rows to list of dicts using cursor description."""
        if not rows:
            return []
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in rows]

    def _row_to_dict(self, cursor, row) -> Dict:
        """Convert single pyodbc row to dict; a missing row (None) gives {}."""
        if row is None:
            return {}
        columns = [col[0] for col in cursor.description]
        return dict(zip(columns, row))
|
||||
@@ -0,0 +1,63 @@
|
||||
import logging
|
||||
from typing import Optional, Dict, Any, List
|
||||
from ..connections import db_connections
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class VinProcExtractor:
    """Utilities to inspect and sample the MSSQL VIN decode stored procedure."""

    def __init__(self, proc_name: str = 'dbo.spVinDecode'):
        """
        Args:
            proc_name: Name of the stored procedure that sample_execute() runs.
        """
        self.proc_name = proc_name

    def find_proc(self) -> Optional[Dict[str, Any]]:
        """Locate the VIN decode proc by name pattern, return basic metadata.

        Searches sys.objects for the most recently created object whose name
        matches '%Vin%Decode%' (``self.proc_name`` is not consulted).

        Returns:
            Dict with 'object_name', 'schema_name' and 'type_desc', or None
            when nothing matches.
        """
        query = """
            SELECT TOP 1
                o.name AS object_name,
                s.name AS schema_name,
                o.type_desc
            FROM sys.objects o
            JOIN sys.schemas s ON s.schema_id = o.schema_id
            WHERE o.name LIKE '%Vin%Decode%'
            ORDER BY o.create_date DESC
        """
        with db_connections.mssql_connection() as conn:
            cur = conn.cursor()
            cur.execute(query)
            row = cur.fetchone()
            if not row:
                logger.warning("VIN decode stored procedure not found by pattern")
                return None
            return {'object_name': row[0], 'schema_name': row[1], 'type_desc': row[2]}

    def get_definition(self, schema: str, name: str) -> str:
        """Return the text definition of the proc using sp_helptext semantics.

        Args:
            schema: Schema that owns the object.
            name: Object name within that schema.

        Returns:
            The rows returned by sp_helptext concatenated into one string.
        """
        # Pass the qualified name as a bound parameter (as sample_execute does
        # for the VIN) instead of splicing it into the SQL text, so names
        # containing quotes cannot break or alter the statement.
        sql = "EXEC sp_helptext @objname = ?"
        qualified_name = f"{schema}.{name}"
        definition_lines: List[str] = []
        with db_connections.mssql_connection() as conn:
            cur = conn.cursor()
            cur.execute(sql, (qualified_name,))
            for row in cur.fetchall():
                # sp_helptext returns a single NVARCHAR column with line segments
                definition_lines.append(row[0])
        return ''.join(definition_lines)

    def sample_execute(self, vin: str) -> Optional[List[Dict[str, Any]]]:
        """Execute the VIN decode proc with a VIN to capture output shape.

        Returns:
            One dict per result row keyed by column name (an empty list when
            the proc produced no result set), or None if execution raised.
        """
        # Prefer proc signature with @VIN only; if it requires year, MSSQL will error.
        sql = f"EXEC {self.proc_name} @VIN=?"
        with db_connections.mssql_connection() as conn:
            cur = conn.cursor()
            try:
                cur.execute(sql, (vin,))
                # cur.description is falsy when the statement returned no result set
                if not cur.description:
                    return []
                columns = [c[0] for c in cur.description]
                return [
                    {columns[i]: r[i] for i in range(len(columns))}
                    for r in cur.fetchall()
                ]
            except Exception as e:
                logger.warning(f"VIN proc sample execution failed: {e}")
                return None
|
||||
|
||||
1
mvp-platform-services/vehicles/etl/loaders/__init__.py
Executable file
1
mvp-platform-services/vehicles/etl/loaders/__init__.py
Executable file
@@ -0,0 +1 @@
|
||||
# ETL Loaders
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
716
mvp-platform-services/vehicles/etl/loaders/json_manual_loader.py
Normal file
716
mvp-platform-services/vehicles/etl/loaders/json_manual_loader.py
Normal file
@@ -0,0 +1,716 @@
|
||||
"""
|
||||
JSON Manual Loader for Vehicles ETL
|
||||
|
||||
Loads extracted JSON data into PostgreSQL database with referential integrity.
|
||||
Supports clear/append modes with duplicate handling and comprehensive progress tracking.
|
||||
|
||||
Database Schema:
|
||||
- vehicles.make (id, name)
|
||||
- vehicles.model (id, make_id, name)
|
||||
- vehicles.model_year (id, model_id, year)
|
||||
- vehicles.trim (id, model_year_id, name)
|
||||
- vehicles.engine (id, name, code, displacement_l, cylinders, fuel_type, aspiration)
|
||||
- vehicles.trim_engine (trim_id, engine_id)
|
||||
|
||||
Load Modes:
|
||||
- CLEAR: Truncate all tables and reload (destructive)
|
||||
- APPEND: Insert with conflict resolution (safe)
|
||||
|
||||
Usage:
|
||||
loader = JsonManualLoader(postgres_loader)
|
||||
result = loader.load_all_makes(extraction_result.makes, LoadMode.APPEND)
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List, Dict, Optional, Tuple
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass
|
||||
from psycopg2.extras import execute_batch
|
||||
|
||||
# Import our components (handle both relative and direct imports)
|
||||
try:
|
||||
from .postgres_loader import PostgreSQLLoader
|
||||
from ..extractors.json_extractor import MakeData, ModelData, ExtractionResult
|
||||
from ..utils.engine_spec_parser import EngineSpec
|
||||
from ..connections import db_connections
|
||||
except ImportError:
|
||||
# Fallback for direct execution
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||
|
||||
# Import with fallback handling for nested imports
|
||||
try:
|
||||
from loaders.postgres_loader import PostgreSQLLoader
|
||||
except ImportError:
|
||||
# Mock PostgreSQLLoader for testing
|
||||
class PostgreSQLLoader:
|
||||
def __init__(self):
|
||||
self.batch_size = 1000
|
||||
|
||||
from extractors.json_extractor import MakeData, ModelData, ExtractionResult
|
||||
from utils.engine_spec_parser import EngineSpec
|
||||
|
||||
try:
|
||||
from connections import db_connections
|
||||
except ImportError:
|
||||
# Mock db_connections for testing
|
||||
class MockDBConnections:
|
||||
def postgres_connection(self):
|
||||
raise NotImplementedError("Database connection not available in test mode")
|
||||
db_connections = MockDBConnections()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LoadMode(Enum):
    """How data is written into the vehicles tables."""

    # Truncate and reload (destructive)
    CLEAR = "clear"

    # Insert with conflict handling (safe)
    APPEND = "append"
|
||||
|
||||
|
||||
@dataclass
class LoadResult:
    """Aggregate outcome of a load run: row totals, failed makes, warnings and mode."""
    total_makes: int
    total_models: int
    total_model_years: int
    total_trims: int
    total_engines: int
    total_trim_engine_mappings: int
    failed_makes: List[str]
    warnings: List[str]
    load_mode: LoadMode

    @property
    def success_count(self) -> int:
        """Number of makes not listed in failed_makes."""
        failed = len(self.failed_makes)
        return self.total_makes - failed

    @property
    def success_rate(self) -> float:
        """success_count / total_makes, or 0.0 when total_makes is not positive."""
        if self.total_makes > 0:
            return self.success_count / self.total_makes
        return 0.0
|
||||
|
||||
|
||||
@dataclass
class LoadStatistics:
    """Mutable counters and message lists accumulated while loading."""
    makes_processed: int = 0
    makes_skipped: int = 0
    models_inserted: int = 0
    model_years_inserted: int = 0
    skipped_model_years: int = 0
    trims_inserted: int = 0
    engines_inserted: int = 0
    trim_engine_mappings_inserted: int = 0
    duplicate_makes: int = 0
    duplicate_models: int = 0
    duplicate_engines: int = 0
    errors: List[str] = None
    warnings: List[str] = None

    def __post_init__(self):
        # None is only a placeholder default: give each instance its own
        # fresh list so instances never share message lists.
        for list_field in ('errors', 'warnings'):
            if getattr(self, list_field) is None:
                setattr(self, list_field, [])
|
||||
|
||||
|
||||
class JsonManualLoader:
|
||||
"""Load JSON-extracted vehicle data into PostgreSQL"""
|
||||
|
||||
def _get_id_from_result(self, result, column_name='id'):
|
||||
"""Helper to extract ID from query result, handling both tuple and dict cursors"""
|
||||
if result is None:
|
||||
return None
|
||||
if isinstance(result, tuple):
|
||||
return result[0]
|
||||
# For RealDictCursor, try the column name first, fall back to key access
|
||||
if column_name in result:
|
||||
return result[column_name]
|
||||
# For COUNT(*) queries, the key might be 'count'
|
||||
if 'count' in result:
|
||||
return result['count']
|
||||
# Fall back to first value
|
||||
return list(result.values())[0] if result else None
|
||||
|
||||
def __init__(self, postgres_loader: Optional[PostgreSQLLoader] = None):
    """
    Set up the loader.

    Args:
        postgres_loader: Existing PostgreSQL loader instance; a new
            PostgreSQLLoader() is created when a falsy value is given.
    """
    if postgres_loader:
        self.postgres_loader = postgres_loader
    else:
        self.postgres_loader = PostgreSQLLoader()
    self.batch_size = 1000

    logger.info("JsonManualLoader initialized")
|
||||
|
||||
def clear_all_tables(self) -> None:
    """
    Clear all vehicles tables in dependency order

    WARNING: This is destructive and will remove all data

    Each TRUNCATE is best-effort: a failure for one table is logged as a
    warning and the remaining tables are still processed.
    """
    logger.warning("CLEARING ALL VEHICLES TABLES - This is destructive!")

    tables_to_clear = [
        'trim_engine',        # Many-to-many mappings first
        'trim_transmission',
        'performance',        # Tables with foreign keys
        'trim',
        'model_year',
        'model',
        'make',
        'engine',             # Independent tables last
        'transmission'
    ]

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()

        # Inside a transaction, PostgreSQL rejects every statement after the
        # first error until a rollback. Without a savepoint one failed
        # TRUNCATE (e.g. a table that does not exist) would make all later
        # TRUNCATEs fail and the final commit would discard the earlier ones.
        # Savepoints are only valid inside a transaction, so skip them when
        # the connection reports autocommit.
        use_savepoints = not getattr(conn, 'autocommit', False)

        for table in tables_to_clear:
            if use_savepoints:
                cursor.execute("SAVEPOINT clear_table")
            try:
                cursor.execute(f"TRUNCATE TABLE vehicles.{table} CASCADE")
            except Exception as e:
                if use_savepoints:
                    # Undo only the failed statement; earlier truncates stay pending
                    cursor.execute("ROLLBACK TO SAVEPOINT clear_table")
                logger.warning(f"Failed to clear vehicles.{table}: {str(e)}")
            else:
                if use_savepoints:
                    cursor.execute("RELEASE SAVEPOINT clear_table")
                logger.info(f"Cleared vehicles.{table}")

        conn.commit()

    logger.info("All vehicles tables cleared")
|
||||
|
||||
def load_make(self, make_data: MakeData, mode: LoadMode, stats: LoadStatistics) -> int:
    """
    Load a single make with all related data

    The make is looked up case-insensitively and inserted only when missing;
    its models are then loaded through load_model() and the whole make is
    committed as one transaction.

    Args:
        make_data: Extracted make data
        mode: Loading mode (clear/append)
        stats: Statistics accumulator

    Returns:
        Make ID in database

    Raises:
        Exception: Any failure is logged, appended to stats.errors and re-raised.
    """
    logger.debug(f"Loading make: {make_data.name}")

    lookup_sql = "SELECT id FROM vehicles.make WHERE lower(name) = lower(%s)"

    try:
        with db_connections.postgres_connection() as conn:
            cursor = conn.cursor()

            def find_existing():
                """Return the row of the make with this name (case-insensitive) or None."""
                cursor.execute(lookup_sql, (make_data.name,))
                return cursor.fetchone()

            # 1. Insert or get make (always check for existing to avoid constraint violations)
            result = find_existing()

            if result:
                make_id = self._get_id_from_result(result)
                stats.duplicate_makes += 1
                logger.debug(f"Make {make_data.name} already exists with ID {make_id}")
            else:
                # A failed INSERT aborts the surrounding PostgreSQL transaction,
                # so the retry lookup below can only work if the failure is
                # rolled back to a savepoint first. Savepoints need an open
                # transaction, hence the autocommit check.
                use_savepoint = not getattr(conn, 'autocommit', False)
                if use_savepoint:
                    cursor.execute("SAVEPOINT insert_make")
                try:
                    cursor.execute(
                        "INSERT INTO vehicles.make (name) VALUES (%s) RETURNING id",
                        (make_data.name,)
                    )
                    result = cursor.fetchone()
                    make_id = self._get_id_from_result(result)
                    logger.debug(f"Inserted make {make_data.name} with ID {make_id}")
                except Exception as e:
                    if "duplicate key value violates unique constraint" not in str(e):
                        raise
                    if use_savepoint:
                        cursor.execute("ROLLBACK TO SAVEPOINT insert_make")
                    # Retry the lookup in case of race condition
                    result = find_existing()
                    if not result:
                        # Still not visible: surface the original insert error
                        raise
                    make_id = self._get_id_from_result(result)
                    stats.duplicate_makes += 1
                    logger.debug(f"Make {make_data.name} found after retry with ID {make_id}")
                else:
                    if use_savepoint:
                        cursor.execute("RELEASE SAVEPOINT insert_make")

            # 2. Process models
            for model_data in make_data.models:
                self.load_model(cursor, make_id, model_data, mode, stats)

            conn.commit()
            stats.makes_processed += 1

            return make_id

    except Exception as e:
        error_msg = f"Failed to load make {make_data.name}: {str(e)}"
        logger.error(error_msg)
        stats.errors.append(error_msg)
        raise
|
||||
|
||||
def load_model(self, cursor, make_id: int, model_data: ModelData, mode: LoadMode, stats: LoadStatistics) -> int:
    """
    Store one model under a make, then load each of its model years.

    In APPEND mode an existing (make_id, name) row is reused and counted as
    a duplicate; otherwise (and always in CLEAR mode) a new row is inserted.

    Args:
        cursor: Database cursor
        make_id: Parent make ID
        model_data: Extracted model data
        mode: Loading mode
        stats: Statistics accumulator

    Returns:
        Model ID in database
    """
    model_key = (make_id, model_data.name)

    # 1. Look for an existing row only when appending
    existing_row = None
    if mode == LoadMode.APPEND:
        cursor.execute(
            "SELECT id FROM vehicles.model WHERE make_id = %s AND name = %s",
            model_key
        )
        existing_row = cursor.fetchone()

    if existing_row:
        if isinstance(existing_row, tuple):
            model_id = existing_row[0]
        else:
            model_id = existing_row['id']
        stats.duplicate_models += 1
    else:
        cursor.execute(
            "INSERT INTO vehicles.model (make_id, name) VALUES (%s, %s) RETURNING id",
            model_key
        )
        model_id = self._get_id_from_result(cursor.fetchone())
        stats.models_inserted += 1

    # 2. Model years and everything below them. load_model_year returns None
    #    for years it skips; nothing further is done with its result here.
    for year in model_data.years:
        self.load_model_year(cursor, model_id, year, model_data, mode, stats)

    return model_id
|
||||
|
||||
def load_model_year(self, cursor, model_id: int, year: int, model_data: ModelData, mode: LoadMode, stats: LoadStatistics) -> int:
    """
    Store one model year, then load the model's engines and trims for it.

    Args:
        cursor: Database cursor
        model_id: Parent model ID
        year: Model year
        model_data: Model data with trims and engines
        mode: Loading mode
        stats: Statistics accumulator

    Returns:
        Model year ID in database, or None when the year lies outside
        1950-2100 and was skipped.
    """
    # Years outside the range accepted by the database are counted and skipped
    if year < 1950 or year > 2100:
        logger.warning(f"Skipping year {year} - outside valid range (1950-2100)")
        stats.skipped_model_years += 1
        return None

    year_key = (model_id, year)

    # 1. Reuse an existing row when appending, otherwise insert
    existing_row = None
    if mode == LoadMode.APPEND:
        cursor.execute(
            "SELECT id FROM vehicles.model_year WHERE model_id = %s AND year = %s",
            year_key
        )
        existing_row = cursor.fetchone()

    if existing_row:
        if isinstance(existing_row, tuple):
            model_year_id = existing_row[0]
        else:
            model_year_id = existing_row['id']
    else:
        cursor.execute(
            "INSERT INTO vehicles.model_year (model_id, year) VALUES (%s, %s) RETURNING id",
            year_key
        )
        model_year_id = self._get_id_from_result(cursor.fetchone())
        stats.model_years_inserted += 1

    # 2. Load every engine of the model and collect the resulting IDs
    engine_ids = [
        self.load_engine(cursor, engine_spec, mode, stats)
        for engine_spec in model_data.engines
    ]

    # 3. Load every trim for this model year, passing the engine IDs along
    for trim_name in model_data.trims:
        self.load_trim(cursor, model_year_id, trim_name, engine_ids, mode, stats)

    return model_year_id
|
||||
|
||||
def load_engine(self, cursor, engine_spec: EngineSpec, mode: LoadMode, stats: LoadStatistics) -> int:
    """
    Load an engine specification, reusing an existing row when one matches.

    Args:
        cursor: Database cursor
        engine_spec: Parsed engine specification
        mode: Loading mode (not consulted; existing engines are always reused)
        stats: Statistics accumulator

    Returns:
        Engine ID in database

    Raises:
        RuntimeError: If the insert was skipped because of a conflict but no
            matching engine row can be found afterwards.
    """
    # Create a canonical engine name for database storage
    if engine_spec.displacement_l and engine_spec.configuration != "Unknown" and engine_spec.cylinders:
        engine_name = f"{engine_spec.displacement_l}L {engine_spec.configuration}{engine_spec.cylinders}"
    else:
        engine_name = engine_spec.raw_string

    # Generate engine code from name (remove spaces, lowercase)
    engine_code = engine_name.replace(" ", "").lower()

    lookup_sql = """
        SELECT id FROM vehicles.engine
        WHERE lower(name) = lower(%s) OR (code IS NOT NULL AND code = %s)
    """
    lookup_params = (engine_name, engine_code)

    # Always check for existing engine by name or code to avoid constraint violations
    cursor.execute(lookup_sql, lookup_params)
    result = cursor.fetchone()
    if result:
        stats.duplicate_engines += 1
        return self._get_id_from_result(result)

    # Insert new engine. ON CONFLICT DO NOTHING turns a lost race into
    # "no row returned" instead of an exception: once a statement fails,
    # PostgreSQL rejects further statements in that transaction, so a
    # lookup issued from an except-handler could not succeed.
    cursor.execute("""
        INSERT INTO vehicles.engine (name, code, displacement_l, cylinders, fuel_type, aspiration)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT DO NOTHING
        RETURNING id
    """, (
        engine_name,
        engine_code,
        engine_spec.displacement_l,
        engine_spec.cylinders,
        engine_spec.fuel_type if engine_spec.fuel_type != "Unknown" else None,
        engine_spec.aspiration if engine_spec.aspiration != "Natural" else None
    ))
    inserted = cursor.fetchone()
    if inserted:
        stats.engines_inserted += 1
        return self._get_id_from_result(inserted)

    # Another writer inserted the same engine first; pick up its row.
    cursor.execute(lookup_sql, lookup_params)
    result = cursor.fetchone()
    if result:
        stats.duplicate_engines += 1
        return self._get_id_from_result(result)

    raise RuntimeError(
        f"Engine insert for {engine_name!r} conflicted with an existing row that could not be found"
    )
|
||||
|
||||
def load_trim(self, cursor, model_year_id: int, trim_name: str, engine_ids: List[int], mode: LoadMode, stats: LoadStatistics) -> int:
    """
    Load a trim and connect it to engines.

    Args:
        cursor: Database cursor
        model_year_id: Parent model year ID
        trim_name: Trim name
        engine_ids: List of engine IDs to connect (duplicates are ignored)
        mode: Loading mode
        stats: Statistics accumulator

    Returns:
        Trim ID in database
    """
    # 1. Insert or get trim. Only APPEND mode looks for an existing row
    #    first; every other mode inserts unconditionally.
    existing = None
    if mode == LoadMode.APPEND:
        cursor.execute(
            "SELECT id FROM vehicles.trim WHERE model_year_id = %s AND name = %s",
            (model_year_id, trim_name)
        )
        existing = cursor.fetchone()

    if existing:
        trim_id = self._get_id_from_result(existing)
    else:
        cursor.execute(
            "INSERT INTO vehicles.trim (model_year_id, name) VALUES (%s, %s) RETURNING id",
            (model_year_id, trim_name)
        )
        trim_id = self._get_id_from_result(cursor.fetchone())
        stats.trims_inserted += 1

    # 2. Connect trim to engines (always check for existing to avoid duplicates).
    #    dict.fromkeys de-duplicates while keeping the caller's order, so the
    #    insert order is deterministic.
    for engine_id in dict.fromkeys(engine_ids):
        cursor.execute(
            "SELECT 1 FROM vehicles.trim_engine WHERE trim_id = %s AND engine_id = %s",
            (trim_id, engine_id)
        )
        if cursor.fetchone():
            continue

        # ON CONFLICT DO NOTHING absorbs a concurrent insert of the same pair
        # without raising; a raised error would leave the surrounding
        # PostgreSQL transaction unusable for the rest of the load.
        cursor.execute(
            "INSERT INTO vehicles.trim_engine (trim_id, engine_id) VALUES (%s, %s) "
            "ON CONFLICT DO NOTHING",
            (trim_id, engine_id)
        )
        if cursor.rowcount == 0:
            # Another process may have inserted it, skip
            logger.debug(f"Trim-engine mapping ({trim_id}, {engine_id}) already exists, skipping")
        else:
            stats.trim_engine_mappings_inserted += 1

    return trim_id
|
||||
|
||||
def load_all_makes(self, makes_data: List[MakeData], mode: LoadMode) -> LoadResult:
    """
    Bulk-load every extracted make and summarize the outcome.

    Makes that carry extraction errors are skipped and reported as failed;
    a make whose load raises is logged, reported as failed, and does not
    stop the remaining makes from loading.

    Args:
        makes_data: List of extracted make data
        mode: Loading mode (clear/append)

    Returns:
        LoadResult with comprehensive statistics
    """
    logger.info(f"Starting bulk load of {len(makes_data)} makes in {mode.value} mode")

    # CLEAR mode wipes the target tables once, before anything is loaded
    if mode == LoadMode.CLEAR:
        self.clear_all_tables()

    stats = LoadStatistics()
    failures = []

    for entry in makes_data:
        try:
            if not entry.processing_errors:
                loaded_id = self.load_make(entry, mode, stats)
                logger.info(f"Successfully loaded make {entry.name} (ID: {loaded_id})")
                continue

            logger.warning(f"Skipping make {entry.name} due to extraction errors")
            stats.makes_skipped += 1
            failures.append(entry.name)
        except Exception as exc:
            logger.error(f"Failed to load make {entry.name}: {str(exc)}")
            failures.append(entry.name)

    # Fold the accumulated counters into the result object
    summary = LoadResult(
        total_makes=len(makes_data),
        total_models=stats.models_inserted,
        total_model_years=stats.model_years_inserted,
        total_trims=stats.trims_inserted,
        total_engines=stats.engines_inserted,
        total_trim_engine_mappings=stats.trim_engine_mappings_inserted,
        failed_makes=failures,
        warnings=stats.warnings,
        load_mode=mode
    )

    logger.info(f"Bulk load complete: {summary.success_count}/{summary.total_makes} makes loaded successfully")
    logger.info(f"Data loaded: {summary.total_models} models, {summary.total_engines} engines, {summary.total_trims} trims")

    return summary
|
||||
|
||||
def get_database_statistics(self) -> Dict[str, int]:
    """
    Count the rows currently stored in each vehicles table.

    Returns:
        Dictionary mapping table name (without schema) to its row count
    """
    table_names = ('make', 'model', 'model_year', 'trim', 'engine', 'trim_engine')
    counts = {}

    with db_connections.postgres_connection() as conn:
        cur = conn.cursor()

        for name in table_names:
            cur.execute(f"SELECT COUNT(*) FROM vehicles.{name}")
            row = cur.fetchone()
            # Rows may come back as plain tuples or as mapping-style rows
            if isinstance(row, tuple):
                counts[name] = row[0]
            else:
                counts[name] = row['count']

    return counts
|
||||
|
||||
def validate_referential_integrity(self) -> List[str]:
    """
    Validate referential integrity of loaded data.

    Runs one COUNT(*) query per parent/child relationship and records a
    message for every relationship that has dangling rows.

    Returns:
        List of integrity issues found (empty if all good)
    """
    # (description used in the issue message, query counting offending rows)
    checks = [
        ("orphaned models", """
            SELECT COUNT(*) FROM vehicles.model m
            LEFT JOIN vehicles.make mk ON m.make_id = mk.id
            WHERE mk.id IS NULL
        """),
        ("orphaned model_years", """
            SELECT COUNT(*) FROM vehicles.model_year my
            LEFT JOIN vehicles.model m ON my.model_id = m.id
            WHERE m.id IS NULL
        """),
        ("orphaned trims", """
            SELECT COUNT(*) FROM vehicles.trim t
            LEFT JOIN vehicles.model_year my ON t.model_year_id = my.id
            WHERE my.id IS NULL
        """),
        ("broken trim_engine mappings", """
            SELECT COUNT(*) FROM vehicles.trim_engine te
            LEFT JOIN vehicles.trim t ON te.trim_id = t.id
            LEFT JOIN vehicles.engine e ON te.engine_id = e.id
            WHERE t.id IS NULL OR e.id IS NULL
        """),
    ]

    issues = []

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()

        for description, sql in checks:
            cursor.execute(sql)
            # Every query returns a COUNT(*) row, so the 'count' column is
            # requested for all of them (previously only the first check did).
            # NOTE(review): assumes the helper's second argument is the column
            # name used for mapping-style rows - confirm against its definition.
            found = self._get_id_from_result(cursor.fetchone(), 'count')
            if found > 0:
                issues.append(f"Found {found} {description}")

    if issues:
        logger.warning(f"Referential integrity issues found: {issues}")
    else:
        logger.info("Referential integrity validation passed")

    return issues
|
||||
|
||||
def print_load_report(self, result: LoadResult) -> None:
    """
    Write a human-readable summary of a load run to standard output.

    The report covers the load summary, inserted record counts, failed makes,
    the first five warnings, live table counts and a referential integrity
    check (the last two query the database).

    Args:
        result: LoadResult from load operation
    """
    print("🚀 JSON MANUAL LOADING REPORT")
    print("=" * 50)

    # Load summary
    print("\n📊 LOADING SUMMARY")
    print(f" Mode: {result.load_mode.value.upper()}")
    print(f" Makes processed: {result.success_count}/{result.total_makes}")
    print(f" Success rate: {result.success_rate:.1%}")

    # Data counts
    print("\n📈 DATA LOADED")
    loaded_counts = (
        ("Models", result.total_models),
        ("Model years", result.total_model_years),
        ("Trims", result.total_trims),
        ("Engines", result.total_engines),
        ("Trim-engine mappings", result.total_trim_engine_mappings),
    )
    for label, value in loaded_counts:
        print(f" {label}: {value}")

    # Issues
    if result.failed_makes:
        print(f"\n⚠️ FAILED MAKES ({len(result.failed_makes)}):")
        for failed in result.failed_makes:
            print(f" {failed}")

    if result.warnings:
        warning_total = len(result.warnings)
        print(f"\n⚠️ WARNINGS ({warning_total}):")
        for message in result.warnings[:5]:  # Show first 5
            print(f" {message}")
        if warning_total > 5:
            print(f" ... and {warning_total - 5} more warnings")

    # Database statistics
    print("\n📋 DATABASE STATISTICS:")
    for table, count in self.get_database_statistics().items():
        print(f" vehicles.{table}: {count:,} records")

    # Referential integrity
    integrity_issues = self.validate_referential_integrity()
    if not integrity_issues:
        print("\n✅ REFERENTIAL INTEGRITY: PASSED")
    else:
        print("\n❌ REFERENTIAL INTEGRITY ISSUES:")
        for issue in integrity_issues:
            print(f" {issue}")
|
||||
|
||||
|
||||
# Example usage and testing functions
|
||||
def example_usage():
    """Print a short walkthrough showing how JsonManualLoader is meant to be used."""
    print("🚀 JsonManualLoader Example Usage")
    print("=" * 40)

    # The loader normally runs after JsonExtractor; this demo only prints
    # the intended flow and does not touch a database.
    print("\n📋 Typical usage flow:")
    flow_steps = (
        "1. Extract data with JsonExtractor",
        "2. Create JsonManualLoader",
        "3. Load data in APPEND or CLEAR mode",
        "4. Validate and report results",
    )
    for step in flow_steps:
        print(step)

    print("\n💡 Example code:")
    snippet_lines = [
        "",
        "# Extract data",
        "extractor = JsonExtractor(make_mapper, engine_parser)",
        "extraction_result = extractor.extract_all_makes('sources/makes')",
        "",
        "# Load data",
        "loader = JsonManualLoader()",
        "load_result = loader.load_all_makes(extraction_result.makes, LoadMode.APPEND)",
        "",
        "# Report results",
        "loader.print_load_report(load_result)",
        "",
    ]
    print("\n".join(snippet_lines))


if __name__ == "__main__":
    example_usage()
|
||||
437
mvp-platform-services/vehicles/etl/loaders/mssql_loader.py
Normal file
437
mvp-platform-services/vehicles/etl/loaders/mssql_loader.py
Normal file
@@ -0,0 +1,437 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
MSSQL Database Loader
|
||||
Handles loading .bak files into MSSQL Server for ETL processing
|
||||
"""
|
||||
import os
|
||||
import logging
|
||||
import pyodbc
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional, List
|
||||
from ..config import config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class MSSQLLoader:
    """Loads database files into MSSQL Server.

    Connection settings are read from the project ``config`` object when the
    loader is constructed. Administrative statements (existence checks, drop,
    restore) are issued over connections to the ``master`` database.
    """

    def __init__(self):
        self.server = config.MSSQL_HOST
        self.port = config.MSSQL_PORT
        self.database = config.MSSQL_DATABASE
        self.username = config.MSSQL_USER
        self.password = config.MSSQL_PASSWORD

    # ------------------------------------------------------------------
    # SQL text helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _quote_identifier(name: str) -> str:
        """Bracket-quote a SQL Server identifier, doubling any ``]`` inside it."""
        return "[" + str(name).replace("]", "]]") + "]"

    @staticmethod
    def _quote_literal(value: str) -> str:
        """Single-quote a SQL Server string literal, doubling embedded quotes."""
        return "'" + str(value).replace("'", "''") + "'"

    @classmethod
    def _build_restore_sql(cls, target_database: str, container_bak_path: str,
                           data_file: str, log_file: Optional[str]) -> str:
        """Build the RESTORE DATABASE statement, relocating the data/log files."""
        data_dir = "/var/opt/mssql/data"
        mdf_path = f"{data_dir}/{target_database}.mdf"
        options = [f"MOVE {cls._quote_literal(data_file)} TO {cls._quote_literal(mdf_path)}"]
        if log_file:
            ldf_path = f"{data_dir}/{target_database}.ldf"
            options.append(f"MOVE {cls._quote_literal(log_file)} TO {cls._quote_literal(ldf_path)}")
        options.extend(["REPLACE", "RECOVERY", "STATS = 10"])
        return (
            f"RESTORE DATABASE {cls._quote_identifier(target_database)}\n"
            f"FROM DISK = {cls._quote_literal(container_bak_path)}\n"
            "WITH\n    " + ",\n    ".join(options)
        )

    @staticmethod
    def _execute_and_drain(cursor, sql: str) -> None:
        """Execute a statement and consume every result set/message it produces.

        RESTORE reports progress as additional result sets; reading them all
        makes the call block until the server has finished and surfaces any
        error raised part-way through.
        """
        cursor.execute(sql)
        while cursor.nextset():
            pass

    def _force_single_user(self, cursor, database_name: str) -> None:
        """Kill other sessions on the database and switch it to SINGLE_USER."""
        kill_sql = f"""
            DECLARE @db sysname = N{self._quote_literal(database_name)};
            DECLARE @kill nvarchar(max) = N'';
            SELECT @kill = @kill + N'KILL ' + CONVERT(nvarchar(10), session_id) + N';'
            FROM sys.dm_exec_sessions
            WHERE database_id = DB_ID(@db) AND session_id <> @@SPID;
            IF LEN(@kill) > 0 EXEC (@kill);
        """
        cursor.execute(kill_sql)
        # Force SINGLE_USER in current session
        cursor.execute(
            f"ALTER DATABASE {self._quote_identifier(database_name)} SET SINGLE_USER WITH ROLLBACK IMMEDIATE;"
        )

    # ------------------------------------------------------------------
    # Connection helpers and simple queries
    # ------------------------------------------------------------------

    def get_connection_string(self, database: str = "master") -> str:
        """Get MSSQL connection string for the given database (default ``master``)"""
        return (
            f"DRIVER={{ODBC Driver 17 for SQL Server}};"
            f"SERVER={self.server},{self.port};"
            f"DATABASE={database};"
            f"UID={self.username};"
            f"PWD={self.password};"
            f"TrustServerCertificate=yes;"
        )

    def test_connection(self) -> bool:
        """Test MSSQL connection; returns True when ``SELECT @@VERSION`` succeeds"""
        try:
            conn_str = self.get_connection_string()
            logger.info(f"Testing MSSQL connection to: {self.server}")

            with pyodbc.connect(conn_str, timeout=30) as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT @@VERSION")
                version = cursor.fetchone()[0]
                logger.info(f"MSSQL connection successful: {version[:100]}...")
                return True

        except Exception as e:
            logger.error(f"MSSQL connection failed: {e}")
            return False

    def database_exists(self, database_name: str) -> bool:
        """Check if database exists (False is also returned when the check itself fails)"""
        try:
            conn_str = self.get_connection_string()
            with pyodbc.connect(conn_str, timeout=30) as conn:
                cursor = conn.cursor()
                cursor.execute(
                    "SELECT COUNT(*) FROM sys.databases WHERE name = ?",
                    (database_name,)
                )
                count = cursor.fetchone()[0]
                return count > 0

        except Exception as e:
            logger.error(f"Failed to check if database exists: {e}")
            return False

    def get_database_state(self, database_name: str) -> Optional[str]:
        """Return the state_desc for a database or None if not found"""
        try:
            conn_str = self.get_connection_string()
            with pyodbc.connect(conn_str, timeout=30) as conn:
                cursor = conn.cursor()
                cursor.execute(
                    "SELECT state_desc FROM sys.databases WHERE name = ?",
                    (database_name,)
                )
                row = cursor.fetchone()
                return row[0] if row else None
        except Exception as e:
            logger.error(f"Failed to get database state: {e}")
            return None

    def drop_database(self, database_name: str) -> bool:
        """Drop database if it exists; returns True when it is gone afterwards"""
        try:
            if not self.database_exists(database_name):
                logger.info(f"Database {database_name} does not exist, skipping drop")
                return True

            logger.info(f"Dropping database: {database_name}")
            conn_str = self.get_connection_string()
            db_ident = self._quote_identifier(database_name)

            with pyodbc.connect(conn_str, timeout=30) as conn:
                # DROP DATABASE cannot run inside a user transaction
                conn.autocommit = True
                cursor = conn.cursor()

                # Kill existing connections
                cursor.execute(f"""
                    ALTER DATABASE {db_ident} SET SINGLE_USER WITH ROLLBACK IMMEDIATE;
                    DROP DATABASE {db_ident};
                """)

            logger.info(f"Successfully dropped database: {database_name}")
            return True

        except Exception as e:
            logger.error(f"Failed to drop database {database_name}: {e}")
            return False

    def get_backup_file_info(self, bak_path: Path) -> Optional[dict]:
        """Get header information about a backup file, or None if it cannot be read"""
        try:
            # Use the MSSQL container's mounted backup directory
            container_path = f"/backups/{bak_path.name}"

            # For now, assume the file is accessible
            # In production, this would copy the file into the MSSQL container

            conn_str = self.get_connection_string()
            with pyodbc.connect(conn_str, timeout=30) as conn:
                cursor = conn.cursor()

                # Get backup file information
                cursor.execute(f"RESTORE HEADERONLY FROM DISK = {self._quote_literal(container_path)}")
                headers = cursor.fetchall()

                if headers:
                    header = headers[0]
                    return {
                        "database_name": header.DatabaseName,
                        "server_name": header.ServerName,
                        "backup_start_date": header.BackupStartDate,
                        "backup_finish_date": header.BackupFinishDate,
                        "backup_size": header.BackupSize,
                    }

        except Exception as e:
            logger.warning(f"Could not get backup file info: {e}")

        return None

    # ------------------------------------------------------------------
    # Restore
    # ------------------------------------------------------------------

    def restore_database(self, bak_path: Path, target_database: Optional[str] = None) -> bool:
        """
        Restore database from .bak file

        Args:
            bak_path: Path to .bak file
            target_database: Target database name (defaults to the configured database)

        Returns:
            True if successful
        """
        if target_database is None:
            target_database = self.database

        if not bak_path.exists():
            logger.error(f"Backup file does not exist: {bak_path}")
            return False

        logger.info(f"Starting database restore: {bak_path} -> {target_database}")

        try:
            # Copy backup file to MSSQL container
            container_bak_path = self.copy_backup_to_container(bak_path)

            if not container_bak_path:
                logger.error("Failed to copy backup file to container")
                return False

            # If database exists, note the state; we will handle exclusivity in the same session below
            if self.database_exists(target_database):
                state = self.get_database_state(target_database)
                logger.info(f"Existing database detected: {target_database} (state={state})")
            else:
                logger.info(f"Target database does not exist yet: {target_database} — proceeding with restore")

            # Restore database using a single master connection for exclusivity
            logger.info(f"Restoring database from: {container_bak_path}")

            conn_str = self.get_connection_string()
            # NOTE(review): pyodbc documents connect(timeout=...) as the
            # connection/login timeout rather than a statement timeout - confirm.
            with pyodbc.connect(conn_str, timeout=600) as conn:
                # RESTORE cannot run inside a user transaction
                conn.autocommit = True
                cursor = conn.cursor()

                # Inspect the backup BEFORE taking exclusive access, so that a
                # bad backup does not leave an existing database in SINGLE_USER.
                cursor.execute(f"RESTORE FILELISTONLY FROM DISK = {self._quote_literal(container_bak_path)}")
                files = cursor.fetchall()

                if not files:
                    logger.error("No files found in backup")
                    return False

                # Pick the logical names to relocate (the last file of each type wins)
                data_file = None
                log_file = None

                for file_info in files:
                    if file_info.Type == 'D':  # Data file
                        data_file = file_info.LogicalName
                    elif file_info.Type == 'L':  # Log file
                        log_file = file_info.LogicalName

                if not data_file:
                    logger.error("No data file found in backup")
                    return False

                # If DB exists, ensure exclusive access: kill sessions + SINGLE_USER in this session
                if self.database_exists(target_database):
                    try:
                        logger.info(f"Preparing exclusive access for restore: killing active sessions on {target_database}")
                        self._force_single_user(cursor, target_database)
                        logger.info(f"Exclusive access prepared (SINGLE_USER) for {target_database}")
                    except Exception as e:
                        logger.warning(f"Could not fully prepare exclusive access: {e}")

                restore_sql = self._build_restore_sql(target_database, container_bak_path, data_file, log_file)

                logger.info(f"Executing restore command for database: {target_database}")
                logger.debug(f"Restore SQL: {restore_sql}")

                try:
                    self._execute_and_drain(cursor, restore_sql)
                except Exception as e:
                    # If we hit exclusive access error, retry once after killing sessions again
                    if 'Exclusive access could not be obtained' in str(e):
                        logger.warning("Exclusive access error on RESTORE; retrying after killing sessions and reasserting SINGLE_USER...")
                        try:
                            self._force_single_user(cursor, target_database)
                        except Exception as e2:
                            logger.warning(f"Retry exclusive prep failed: {e2}")
                        self._execute_and_drain(cursor, restore_sql)
                    else:
                        raise

                # Poll for database to be ONLINE
                if not self._wait_for_database_online(target_database):
                    logger.error(f"Database did not come ONLINE in time: {target_database}")
                    return False

                # Small retry around database_exists to handle late readiness
                if self._retry_database_exists(target_database):
                    logger.info(f"Database restore successful and ONLINE: {target_database}")

                    # Get basic database info
                    cursor.execute(
                        """
                        SELECT
                            name,
                            create_date,
                            compatibility_level,
                            state_desc
                        FROM sys.databases
                        WHERE name = ?
                        """,
                        (target_database,)
                    )

                    db_info = cursor.fetchone()
                    if db_info:
                        logger.info(f"Database info: Name={db_info.name}, Created={db_info.create_date}, Level={db_info.compatibility_level}, State={db_info.state_desc}")

                    # Optional: quick content verification with small retry window
                    if not self._retry_verify_content(target_database):
                        logger.warning("Database restored but content verification is inconclusive")

                    # Try to set MULTI_USER back in same session
                    try:
                        cursor.execute(f"ALTER DATABASE {self._quote_identifier(target_database)} SET MULTI_USER;")
                        logger.info(f"Set {target_database} back to MULTI_USER")
                    except Exception as e:
                        logger.warning(f"Could not set MULTI_USER on {target_database}: {e}")

                    return True
                else:
                    logger.error(f"Database restore failed - database not found: {target_database}")
                    return False

        except Exception as e:
            logger.error(f"Database restore failed: {e}")
            return False

    def copy_backup_to_container(self, bak_path: Path) -> Optional[str]:
        """
        Copy backup file to shared volume accessible by MSSQL container

        Args:
            bak_path: Local path to .bak file

        Returns:
            Container path to .bak file or None if failed
        """
        try:
            # Use shared volume instead of docker cp
            shared_dir = Path("/app/shared")
            shared_bak_path = shared_dir / bak_path.name

            # If the file is already in the shared dir, skip copying
            if bak_path.resolve().parent == shared_dir.resolve():
                logger.info(f"Backup already in shared volume: {bak_path}")
            else:
                logger.info(f"Copying {bak_path} to shared volume...")
                import shutil
                shutil.copy2(bak_path, shared_bak_path)

            # Container path from MSSQL perspective
            container_path = f"/backups/{shared_bak_path.name}"

            logger.info(f"Successfully copied to shared volume: {container_path}")
            return container_path

        except Exception as e:
            logger.error(f"Failed to copy backup to shared volume: {e}")
            return None

    # ------------------------------------------------------------------
    # Post-restore polling and verification
    # ------------------------------------------------------------------

    def _wait_for_database_online(self, database_name: str, timeout_seconds: int = 600, interval_seconds: int = 5) -> bool:
        """Poll MSSQL until the specified database state becomes ONLINE or timeout.

        Returns True if ONLINE, False on timeout/error.
        """
        logger.info(f"Waiting for database to become ONLINE: {database_name}")
        deadline = time.time() + timeout_seconds
        last_state = None
        try:
            conn_str = self.get_connection_string()
            while time.time() < deadline:
                with pyodbc.connect(conn_str, timeout=30) as conn:
                    cursor = conn.cursor()
                    cursor.execute("SELECT state_desc FROM sys.databases WHERE name = ?", (database_name,))
                    row = cursor.fetchone()
                    if row:
                        state = row[0]
                        # Only log transitions, not every poll
                        if state != last_state:
                            logger.info(f"Database state: {state}")
                            last_state = state
                        if state == 'ONLINE':
                            # Optional: verify updateability is READ_WRITE
                            try:
                                cursor.execute("SELECT DATABASEPROPERTYEX(?, 'Updateability')", (database_name,))
                                up = cursor.fetchone()[0]
                                logger.info(f"Database updateability: {up}")
                            except Exception as e:
                                # Informational only; never fails the wait
                                logger.debug(f"Could not read database updateability: {e}")
                            return True
                    else:
                        logger.info("Database entry not found yet in sys.databases")
                time.sleep(interval_seconds)
        except Exception as e:
            logger.error(f"Error while waiting for database ONLINE: {e}")
            return False
        logger.error("Timed out waiting for database to become ONLINE")
        return False

    def _retry_database_exists(self, database_name: str, attempts: int = 6, delay_seconds: int = 5) -> bool:
        """Retry wrapper for database existence checks."""
        for i in range(1, attempts + 1):
            if self.database_exists(database_name):
                return True
            logger.info(f"database_exists() false, retrying ({i}/{attempts})...")
            # No point sleeping after the final attempt
            if i < attempts:
                time.sleep(delay_seconds)
        return False

    def _retry_verify_content(self, database_name: str, attempts: int = 3, delay_seconds: int = 5) -> bool:
        """Retry wrapper around verify_database_content to allow late readiness."""
        for i in range(1, attempts + 1):
            try:
                counts = self.verify_database_content(database_name)
                if counts:
                    logger.info(f"Content verification counts: {counts}")
                    return True
            except Exception as e:
                logger.info(f"Content verification attempt {i} failed: {e}")
            # No point sleeping after the final attempt
            if i < attempts:
                time.sleep(delay_seconds)
        return False

    def verify_database_content(self, database_name: Optional[str] = None) -> dict:
        """
        Verify database has expected content

        Args:
            database_name: Database to inspect (defaults to the configured database)

        Returns:
            Dictionary with table counts; a table that cannot be counted is
            reported as 0. An empty dict means the database could not be reached.
        """
        if database_name is None:
            database_name = self.database

        try:
            conn_str = self.get_connection_string(database_name)
            with pyodbc.connect(conn_str, timeout=30) as conn:
                cursor = conn.cursor()

                # Get table counts for key tables
                tables_to_check = ['Make', 'Model', 'VehicleType', 'Manufacturer']
                counts = {}

                for table in tables_to_check:
                    try:
                        cursor.execute(f"SELECT COUNT(*) FROM {table}")
                        count = cursor.fetchone()[0]
                        counts[table] = count
                        logger.info(f"Table {table}: {count:,} rows")
                    except Exception as e:
                        # Missing/unreadable table is reported as empty rather than failing
                        logger.debug(f"Could not count rows in {table}: {e}")
                        counts[table] = 0

                return counts

        except Exception as e:
            logger.error(f"Failed to verify database content: {e}")
            return {}
|
||||
354
mvp-platform-services/vehicles/etl/loaders/postgres_loader.py
Executable file
354
mvp-platform-services/vehicles/etl/loaders/postgres_loader.py
Executable file
@@ -0,0 +1,354 @@
|
||||
import logging
|
||||
from typing import List, Dict, Optional
|
||||
from psycopg2.extras import execute_batch
|
||||
from ..connections import db_connections
|
||||
from tqdm import tqdm
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class PostgreSQLLoader:
|
||||
"""Load data into PostgreSQL target database"""
|
||||
|
||||
def __init__(self):
|
||||
self.batch_size = 1000
|
||||
|
||||
def load_reference_table(self, table_name: str, data: List[Dict],
                         clear_existing: bool = True) -> int:
    """Load data into a reference table of the ``vehicles`` schema.

    Source column names (taken from the first record) are translated to
    target column names; columns that do not exist in the target table are
    dropped. Rows are inserted with ``ON CONFLICT DO NOTHING``.

    Args:
        table_name: Target table name, without schema. It is interpolated
            into SQL as an identifier, so it must come from trusted code.
        data: Source records; every record is expected to have the same
            keys as the first one.
        clear_existing: Truncate the table (CASCADE) before loading.

    Returns:
        Total number of rows in the table after the load (not the number of
        rows inserted by this call); 0 when there is nothing to load or no
        source column matches the target table.
    """
    if not data:
        logger.warning(f"No data to load for table {table_name}")
        return 0

    logger.info(f"Loading {len(data)} records into vehicles.{table_name}")

    # Column mapping from source (MS SQL) to target (PostgreSQL)
    column_mappings = {
        'Id': 'id',
        'Name': 'name',
        'Code': 'code',
        'MakeId': 'make_id',
        'CreateOn': 'created_at',
        'CreatedOn': 'created_at',
        'UpdateOn': 'updated_at',
        'UpdatedOn': 'updated_at',
        'Wmi': 'wmi',
        'ManufacturerId': 'manufacturer_id',
        'VehicleTypeId': 'vehicle_type_id',
        'TruckTypeId': 'truck_type_id',
        'CountryId': 'country_id',
        'PublicAvailabilityDate': 'public_availability_date',
        'NonCompliant': 'non_compliant',
        'NonCompliantReason': 'non_compliant_reason',
        'ProcessedOn': 'processed_on',
        'DisplayOrder': 'display_order',
        'FormType': 'form_type',
        'Description': 'description',
        'LookupTable': 'lookup_table',
        'IsPrivate': 'is_private',
        'GroupName': 'group_name',
        'DataType': 'data_type',
        'MinAllowedValue': 'min_allowed_value',
        'MaxAllowedValue': 'max_allowed_value',
        'IsQS': 'is_qs',
        'Decode': 'decode',
        'weight': 'weight',
        # ErrorCode specific mappings
        'ErrorCodeName': 'code',
        'ErrorCodeDescription': 'description'
    }

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()

        if clear_existing:
            cursor.execute(f"TRUNCATE TABLE vehicles.{table_name} CASCADE")
            logger.info(f"Cleared existing data from vehicles.{table_name}")

        # Get source columns and map them to target columns; unmapped
        # columns fall back to their lower-cased source name
        source_columns = list(data[0].keys())
        target_columns = [column_mappings.get(col, col.lower()) for col in source_columns]

        # Check which columns exist in target table (table name passed as a
        # bound parameter rather than interpolated into the statement)
        cursor.execute(
            """
            SELECT column_name
            FROM information_schema.columns
            WHERE table_schema = 'vehicles' AND table_name = %s
            """,
            (table_name,)
        )
        results = cursor.fetchall()
        existing_columns = {row['column_name'] if isinstance(row, dict) else row[0] for row in results}

        # Filter to only existing columns. Several source columns can map to
        # the same target (e.g. CreateOn/CreatedOn); naming a column twice
        # would make the INSERT invalid, so only the first one is kept.
        final_columns = []
        final_indices = []
        for i, col in enumerate(target_columns):
            if col in existing_columns and col not in final_columns:
                final_columns.append(col)
                final_indices.append(i)

        if not final_columns:
            logger.warning(f"No matching columns found for table {table_name}")
            return 0

        column_str = ','.join(final_columns)
        placeholders = ','.join(['%s'] * len(final_columns))

        # Prepare insert query
        query = f"""
            INSERT INTO vehicles.{table_name} ({column_str})
            VALUES ({placeholders})
            ON CONFLICT DO NOTHING
        """

        # Prepare data tuples with only valid columns
        data_tuples = []
        for record in data:
            values = []
            skip_record = False

            for i in final_indices:
                source_col = source_columns[i]
                value = record[source_col]

                # Handle special cases for error_codes table: rows without a
                # code are dropped entirely
                if table_name == 'error_codes' and source_col in ['ErrorCodeName', 'Code'] and (value is None or value == ''):
                    skip_record = True
                    break

                values.append(value)

            if not skip_record:
                data_tuples.append(tuple(values))

        # Execute batch insert
        execute_batch(cursor, query, data_tuples, page_size=self.batch_size)
        conn.commit()

        # Get final count
        cursor.execute(f"SELECT COUNT(*) FROM vehicles.{table_name}")
        result = cursor.fetchone()
        final_count = result['count'] if isinstance(result, dict) and 'count' in result else result[0]

        logger.info(f"Successfully loaded {final_count} records into vehicles.{table_name}")
        return final_count
|
||||
|
||||
def load_wmi_vin_schema_mappings(self, mappings: List[Dict]) -> int:
    """Replace the contents of vehicles.wmi_vin_schemas with ``mappings``.

    Args:
        mappings: Dicts keyed by 'WmiId', 'VinSchemaId', 'YearFrom' and
            'YearTo'. Falsy year bounds fall back to 1980 and 2999.

    Returns:
        The table's row count after the load, or 0 when ``mappings`` is
        empty (in which case the database is not touched).
    """
    if not mappings:
        return 0

    logger.info(f"Loading {len(mappings)} WMI-VinSchema mappings")

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()

        # The table is rebuilt from scratch on every call.
        cursor.execute("TRUNCATE TABLE vehicles.wmi_vin_schemas CASCADE")

        insert_sql = """
            INSERT INTO vehicles.wmi_vin_schemas
            (wmi_id, vin_schema_id, year_from, year_to)
            VALUES (%s, %s, %s, %s)
            ON CONFLICT DO NOTHING
        """
        rows = [
            (m['WmiId'], m['VinSchemaId'], m['YearFrom'] or 1980, m['YearTo'] or 2999)
            for m in mappings
        ]
        execute_batch(cursor, insert_sql, rows, page_size=self.batch_size)
        conn.commit()

        cursor.execute("SELECT COUNT(*) FROM vehicles.wmi_vin_schemas")
        count_row = cursor.fetchone()
        # fetchone() may hand back a dict-style row or a plain sequence.
        if isinstance(count_row, dict) and 'count' in count_row:
            final_count = count_row['count']
        else:
            final_count = count_row[0]

        logger.info(f"Successfully loaded {final_count} WMI-VinSchema mappings")
        return final_count
|
||||
|
||||
def load_make_model_relationships(self, relationships: List[Dict]) -> int:
    """Rebuild vehicles.make_models from ``relationships``.

    Args:
        relationships: Dicts providing 'MakeId' and 'ModelId'.

    Returns:
        Row count of vehicles.make_models after the load; 0, with no
        database access, when ``relationships`` is empty.
    """
    if not relationships:
        return 0

    logger.info(f"Loading {len(relationships)} Make-Model relationships")

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()

        # Existing links are discarded before the new set is inserted.
        cursor.execute("TRUNCATE TABLE vehicles.make_models CASCADE")

        insert_sql = """
            INSERT INTO vehicles.make_models (make_id, model_id)
            VALUES (%s, %s)
            ON CONFLICT DO NOTHING
        """
        pairs = [(entry['MakeId'], entry['ModelId']) for entry in relationships]
        execute_batch(cursor, insert_sql, pairs, page_size=self.batch_size)
        conn.commit()

        cursor.execute("SELECT COUNT(*) FROM vehicles.make_models")
        row = cursor.fetchone()
        # Support both dict-style and positional result rows.
        named = isinstance(row, dict) and 'count' in row
        final_count = row['count'] if named else row[0]

        logger.info(f"Successfully loaded {final_count} Make-Model relationships")
        return final_count
|
||||
|
||||
def load_wmi_make_relationships(self, relationships: List[Dict]) -> int:
    """Rebuild vehicles.wmi_makes from ``relationships``.

    Args:
        relationships: Dicts providing 'WmiId' and 'MakeId'.

    Returns:
        Row count of vehicles.wmi_makes after the load; 0, with no
        database access, when ``relationships`` is empty.
    """
    if not relationships:
        return 0

    logger.info(f"Loading {len(relationships)} WMI-Make relationships")

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()

        # Start from an empty table so stale links never survive a reload.
        cursor.execute("TRUNCATE TABLE vehicles.wmi_makes CASCADE")

        insert_sql = """
            INSERT INTO vehicles.wmi_makes (wmi_id, make_id)
            VALUES (%s, %s)
            ON CONFLICT DO NOTHING
        """
        pairs = [(link['WmiId'], link['MakeId']) for link in relationships]
        execute_batch(cursor, insert_sql, pairs, page_size=self.batch_size)
        conn.commit()

        cursor.execute("SELECT COUNT(*) FROM vehicles.wmi_makes")
        row = cursor.fetchone()
        # Result rows may be dict-style or positional.
        if isinstance(row, dict) and 'count' in row:
            final_count = row['count']
        else:
            final_count = row[0]

        logger.info(f"Successfully loaded {final_count} WMI-Make relationships")
        return final_count
|
||||
|
||||
def load_model_years(self, model_years: List[Dict]) -> int:
    """Insert (model_id, year) rows into vehicles.model_year.

    Rows that collide on (model_id, year) are skipped by the database.

    Args:
        model_years: Dicts providing 'model_id' and 'year'.

    Returns:
        ``len(model_years)``: the number of records submitted, which can
        exceed the number actually inserted when conflicts are skipped.
    """
    if not model_years:
        return 0

    logger.info(f"Loading {len(model_years)} model year records")

    insert_sql = """
        INSERT INTO vehicles.model_year (model_id, year)
        VALUES (%s, %s)
        ON CONFLICT (model_id, year) DO NOTHING
    """

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()
        rows = []
        for record in model_years:
            rows.append((record['model_id'], record['year']))
        execute_batch(cursor, insert_sql, rows, page_size=self.batch_size)
        conn.commit()

    return len(model_years)
|
||||
|
||||
def load_trims(self, trims: List[Dict]) -> int:
    """Insert (model_year_id, name) rows into vehicles.trim.

    Conflicting rows are skipped by the database (ON CONFLICT DO NOTHING).

    Args:
        trims: Dicts providing 'model_year_id' and 'name'.

    Returns:
        ``len(trims)``: the number of records submitted, not necessarily
        the number inserted.
    """
    if not trims:
        return 0

    logger.info(f"Loading {len(trims)} trim records")

    insert_sql = """
        INSERT INTO vehicles.trim (model_year_id, name)
        VALUES (%s, %s)
        ON CONFLICT DO NOTHING
    """

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()
        rows = []
        for trim in trims:
            rows.append((trim['model_year_id'], trim['name']))
        execute_batch(cursor, insert_sql, rows, page_size=self.batch_size)
        conn.commit()

    return len(trims)
|
||||
|
||||
def load_engines(self, engines: List[Dict]) -> int:
    """Insert engine rows into vehicles.engine, skipping name conflicts.

    Rows are sent with ``execute_batch`` in pages of ``self.batch_size``,
    matching the other loaders, instead of one round trip per engine. The
    statement no longer carries ``RETURNING id`` because the returned ids
    were never read.

    Args:
        engines: Dicts with a required 'name' key and optional 'code',
            'displacement_l', 'cylinders', 'fuel_type' and 'aspiration'
            keys; missing optional keys are passed as None.

    Returns:
        ``len(engines)``: the number of records submitted, not the number
        inserted (rows conflicting on lower(name) are skipped).

    Raises:
        KeyError: If a record has no 'name' key. This is raised while the
            rows are being prepared, before any database work starts.
    """
    if not engines:
        return 0

    logger.info(f"Loading {len(engines)} engine records")

    insert_sql = """
        INSERT INTO vehicles.engine (name, code, displacement_l, cylinders, fuel_type, aspiration)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT (lower(name)) DO NOTHING
    """

    # Build every parameter tuple up front so malformed input fails fast.
    rows = [
        (
            engine['name'],
            engine.get('code'),
            engine.get('displacement_l'),
            engine.get('cylinders'),
            engine.get('fuel_type'),
            engine.get('aspiration'),
        )
        for engine in engines
    ]

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()
        execute_batch(cursor, insert_sql, rows, page_size=self.batch_size)
        conn.commit()

    return len(engines)
|
||||
|
||||
def load_trim_engine_relationships(self, relationships: List[Dict]) -> int:
    """Insert (trim_id, engine_id) links into vehicles.trim_engine.

    Links that already exist are skipped by the database.

    Args:
        relationships: Dicts providing 'trim_id' and 'engine_id'.

    Returns:
        ``len(relationships)``: the number of links submitted, not
        necessarily the number inserted.
    """
    if not relationships:
        return 0

    logger.info(f"Loading {len(relationships)} trim-engine relationships")

    insert_sql = """
        INSERT INTO vehicles.trim_engine (trim_id, engine_id)
        VALUES (%s, %s)
        ON CONFLICT (trim_id, engine_id) DO NOTHING
    """

    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()
        links = []
        for link in relationships:
            links.append((link['trim_id'], link['engine_id']))
        execute_batch(cursor, insert_sql, links, page_size=self.batch_size)
        conn.commit()

    return len(relationships)
|
||||
|
||||
def get_table_count(self, table_name: str) -> int:
    """Return the number of rows in ``vehicles.<table_name>``.

    Args:
        table_name: Unqualified table name. It is interpolated directly
            into the SQL text, so it must come from trusted code.

    Returns:
        The COUNT(*) value reported by the database.
    """
    count_sql = f"SELECT COUNT(*) FROM vehicles.{table_name}"
    with db_connections.postgres_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(count_sql)
        row = cursor.fetchone()
        # Rows may be dict-style (keyed by 'count') or positional.
        if isinstance(row, dict) and 'count' in row:
            return row['count']
        return row[0]
|
||||
348
mvp-platform-services/vehicles/etl/main.py
Executable file
348
mvp-platform-services/vehicles/etl/main.py
Executable file
@@ -0,0 +1,348 @@
|
||||
#!/usr/bin/env python3
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
import click
|
||||
from .config import config
|
||||
from .utils.logging import setup_logging
|
||||
from .scheduler import start_etl_scheduler
|
||||
from .pipeline import run_etl_pipeline
|
||||
from .connections import test_connections
|
||||
|
||||
# Import manual JSON processing components
|
||||
# Created before the optional imports so the fallback branch can log with it.
logger = logging.getLogger(__name__)

try:
    from .pipelines.manual_json_pipeline import ManualJsonPipeline, PipelineConfig, default_progress_callback
    from .loaders.json_manual_loader import LoadMode
    from .utils.make_name_mapper import MakeNameMapper
    from .utils.engine_spec_parser import EngineSpecParser
    from .extractors.json_extractor import JsonExtractor
except ImportError as e:
    # Handle import errors gracefully for existing functionality.
    # Every optional name is bound to None, not just ManualJsonPipeline:
    # the commands below test e.g. `JsonExtractor is None`, which would
    # otherwise raise NameError instead of printing their friendly message.
    ManualJsonPipeline = None
    PipelineConfig = None
    default_progress_callback = None
    LoadMode = None
    MakeNameMapper = None
    EngineSpecParser = None
    JsonExtractor = None
    logger.warning(f"Manual JSON processing components not available: {e}")
|
||||
|
||||
# Root click group. Its body configures logging from config.LOG_LEVEL; the
# docstring doubles as the --help text, so it is kept verbatim.
@click.group()
def cli():
    """MVP Platform Vehicles ETL Tool"""
    log_level = config.LOG_LEVEL
    setup_logging(log_level)
|
||||
|
||||
# Runs the ETL pipeline once and exits with status 1 when it reports failure.
@cli.command()
def build_catalog():
    """Build vehicle catalog from source database"""
    if run_etl_pipeline():
        return
    sys.exit(1)
|
||||
|
||||
# Thin command wrapper: hands control to start_etl_scheduler().
@cli.command()
def schedule():
    """Start ETL scheduler (default mode)"""
    start_etl_scheduler()
|
||||
|
||||
# Runs the ETL pipeline once and exits with status 1 when it reports failure.
@cli.command()
@click.option('--full', is_flag=True, help='Full reload instead of incremental')
def update(full):
    """Run ETL update"""
    logger.info(f"Starting ETL update (full={full})")
    # NOTE(review): `full` is only logged here; run_etl_pipeline() is called
    # without arguments, so the flag does not change what runs. Confirm intent.
    pipeline_ok = run_etl_pipeline()
    if pipeline_ok:
        return
    sys.exit(1)
|
||||
|
||||
# Logs the outcome of test_connections() and exits with status 1 on failure.
@cli.command()
def test():
    """Test database connections"""
    if test_connections():
        logger.info("All connections tested successfully")
        return
    logger.error("Connection test failed")
    sys.exit(1)
|
||||
|
||||
def _echo_extraction_summary(extraction_result):
    """Print the dry-run extraction summary, including a failure line if any."""
    click.echo("\n✅ Extraction Validation Complete")
    click.echo(f" Files processed: {extraction_result.total_files_processed}")
    click.echo(f" Success rate: {extraction_result.success_rate:.1%}")
    click.echo(f" Models extracted: {extraction_result.total_models:,}")
    click.echo(f" Engines extracted: {extraction_result.total_engines:,}")
    click.echo(f" Electric models: {extraction_result.total_electric_models:,}")

    if extraction_result.failed_extractions > 0:
        click.echo(f" ⚠️ Failed extractions: {extraction_result.failed_extractions}")


def _echo_pipeline_report(result):
    """Print the performance, success-rate, data and issue sections of a run."""
    click.echo("\n" + "=" * 60)
    click.echo("📊 PIPELINE EXECUTION REPORT")
    click.echo("=" * 60)

    # Performance
    click.echo("\n⏱️ PERFORMANCE")
    click.echo(f" Duration: {result.duration_seconds:.1f} seconds ({result.duration_minutes:.1f} minutes)")
    click.echo(f" Processing rate: {result.files_per_second:.1f} files/sec")
    click.echo(f" Loading rate: {result.records_per_second:,.0f} records/sec")

    # Success rates
    click.echo("\n📈 SUCCESS RATES")
    click.echo(f" Extraction: {result.extraction_success_rate:.1%}")
    click.echo(f" Loading: {result.loading_success_rate:.1%}")
    click.echo(f" Overall: {result.overall_success_rate:.1%}")

    # Data loaded
    click.echo("\n💾 DATA LOADED")
    click.echo(f" Makes: {result.load_result.total_makes}")
    click.echo(f" Models: {result.load_result.total_models}")
    click.echo(f" Engines: {result.load_result.total_engines}")
    click.echo(f" Trims: {result.load_result.total_trims}")
    click.echo(f" Total records: {result.total_records_loaded:,}")

    # Issues
    if result.load_result.failed_makes:
        click.echo(f"\n⚠️ FAILED MAKES ({len(result.load_result.failed_makes)}):")
        for failed_make in result.load_result.failed_makes:
            click.echo(f" • {failed_make}")

    if result.integrity_issues:
        click.echo(f"\n❌ INTEGRITY ISSUES ({len(result.integrity_issues)}):")
        for issue in result.integrity_issues:
            click.echo(f" • {issue}")
    else:
        click.echo("\n✅ REFERENTIAL INTEGRITY: PASSED")


# The docstring below is the command's --help text and is kept verbatim.
# Exit status is 1 for: unavailable components, a missing or empty sources
# directory, failed extractions in a dry run, a run that is not fully
# successful, a keyboard interrupt, or any other exception. Declining the
# CLEAR-mode confirmation returns normally.
@cli.command()
@click.option('--sources-dir', '-s', default='sources/makes',
              help='Directory containing JSON make files (default: sources/makes)')
@click.option('--mode', '-m', type=click.Choice(['clear', 'append']), default='append',
              help='Loading mode: clear (destructive) or append (safe, default)')
@click.option('--progress/--no-progress', default=True,
              help='Show progress tracking (default: enabled)')
@click.option('--validate/--no-validate', default=True,
              help='Validate referential integrity after loading (default: enabled)')
@click.option('--batch-size', '-b', type=int, default=1000,
              help='Database batch size for inserts (default: 1000)')
@click.option('--dry-run', is_flag=True,
              help='Extract and validate data without loading to database')
@click.option('--verbose', '-v', is_flag=True,
              help='Enable verbose output')
def load_manual(sources_dir, mode, progress, validate, batch_size, dry_run, verbose):
    """Load vehicle data from JSON files manually

    This command processes JSON files in the specified directory and loads
    vehicle data into the PostgreSQL database. It supports two modes:

    • APPEND mode (default): Safely add new data with duplicate detection
    • CLEAR mode: Remove all existing data and reload (destructive)

    Examples:
    python -m etl load-manual
    python -m etl load-manual --mode clear --sources-dir custom/path
    python -m etl load-manual --dry-run --verbose
    """
    if ManualJsonPipeline is None:
        click.echo("❌ Manual JSON processing components are not available", err=True)
        click.echo(" Please check your installation and dependencies", err=True)
        sys.exit(1)

    # Validate sources directory
    sources_path = Path(sources_dir)
    if not sources_path.exists():
        click.echo(f"❌ Sources directory not found: {sources_dir}", err=True)
        click.echo(" Please specify a valid directory with --sources-dir", err=True)
        sys.exit(1)

    # Count JSON files
    json_files = list(sources_path.glob("*.json"))
    if not json_files:
        click.echo(f"❌ No JSON files found in: {sources_dir}", err=True)
        click.echo(" Please ensure the directory contains *.json files", err=True)
        sys.exit(1)

    # Set log level if verbose
    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Create configuration (named so it does not hide the imported `config`)
    pipeline_config = PipelineConfig(
        sources_directory=str(sources_path),
        load_mode=LoadMode.CLEAR if mode == 'clear' else LoadMode.APPEND,
        enable_progress_tracking=progress,
        validate_integrity=validate,
        batch_size=batch_size,
    )

    click.echo("🚀 Manual JSON Processing Pipeline")
    click.echo(f" Sources: {sources_dir}")
    click.echo(f" Files: {len(json_files)} JSON files")
    click.echo(f" Mode: {mode.upper()}")
    if dry_run:
        click.echo(" Dry run: Validation only (no database changes)")

    try:
        # Create pipeline
        pipeline = ManualJsonPipeline(str(sources_path), pipeline_config)

        # Progress callback for CLI
        def cli_progress_callback(progress_info):
            if not progress:
                return

            percentage = progress_info['percentage']
            phase = progress_info['phase']
            files = f"{progress_info['files_completed']}/{progress_info['total_files']}"

            if progress_info['files_per_second'] > 0:
                rate = f"({progress_info['files_per_second']:.1f} files/sec)"
                eta_min = progress_info['eta_seconds'] / 60
                eta = f"ETA: {eta_min:.1f}min" if eta_min > 0 else ""
                click.echo(f"[{percentage:5.1f}%] {phase}: {files} {rate} {eta}")
            else:
                click.echo(f"[{percentage:5.1f}%] {phase}: {files}")

        if dry_run:
            # Extraction only for validation
            click.echo("\n📋 Running extraction validation...")
            extraction_result = pipeline.run_extraction_only()

            # Report extraction results
            _echo_extraction_summary(extraction_result)
            if extraction_result.failed_extractions > 0:
                sys.exit(1)
        else:
            # Full pipeline execution
            if mode == 'clear':
                click.echo("\n⚠️ WARNING: CLEAR mode will delete all existing vehicle data!")
                if not click.confirm("Are you sure you want to continue?", default=False):
                    click.echo("Operation cancelled")
                    return

            click.echo("\n🔄 Running pipeline...")
            result = pipeline.run(progress_callback=cli_progress_callback)

            # Print comprehensive report
            _echo_pipeline_report(result)

            # Final status
            if result.was_successful:
                click.echo("\n🎉 PIPELINE COMPLETED SUCCESSFULLY")
                if verbose:
                    # Show database statistics
                    db_stats = pipeline.loader.get_database_statistics()
                    click.echo("\n📋 DATABASE STATISTICS:")
                    for table, count in db_stats.items():
                        click.echo(f" {table}: {count:,} records")
            else:
                click.echo("\n⚠️ PIPELINE COMPLETED WITH ISSUES")
                sys.exit(1)

    except KeyboardInterrupt:
        click.echo("\n⏸️ Pipeline interrupted by user")
        sys.exit(1)
    except Exception as e:
        click.echo(f"\n❌ Pipeline failed: {str(e)}", err=True)
        if verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
|
||||
|
||||
# The docstring below is the command's --help text and is kept verbatim.
# Exit status is 1 when components are unavailable, the directory is missing
# or has no *.json files, any extraction failed, or an exception is raised.
@cli.command()
@click.option('--sources-dir', '-s', default='sources/makes',
              help='Directory containing JSON make files (default: sources/makes)')
@click.option('--verbose', '-v', is_flag=True,
              help='Enable verbose output with detailed statistics')
def validate_json(sources_dir, verbose):
    """Validate JSON files and show extraction statistics

    This command validates the structure and content of JSON files
    without loading data into the database. Useful for:

    • Checking data quality before loading
    • Debugging extraction issues
    • Getting statistics about available data

    Examples:
    python -m etl validate-json
    python -m etl validate-json --sources-dir custom/path --verbose
    """
    if JsonExtractor is None:
        click.echo("❌ JSON validation components are not available", err=True)
        sys.exit(1)

    # Validate sources directory
    sources_path = Path(sources_dir)
    if not sources_path.exists():
        click.echo(f"❌ Sources directory not found: {sources_dir}", err=True)
        sys.exit(1)

    # Count JSON files
    json_files = list(sources_path.glob("*.json"))
    if not json_files:
        click.echo(f"❌ No JSON files found in: {sources_dir}", err=True)
        sys.exit(1)

    click.echo("🔍 JSON File Validation")
    click.echo(f" Directory: {sources_dir}")
    click.echo(f" Files: {len(json_files)} JSON files")

    try:
        # Initialize components
        extractor = JsonExtractor(MakeNameMapper(), EngineSpecParser())

        # Run extraction validation
        click.echo("\n📋 Validating JSON structure and content...")
        result = extractor.extract_all_makes(str(sources_path))

        # Basic results
        click.echo("\n✅ Validation Complete")
        click.echo(f" Files processed: {result.total_files_processed}")
        click.echo(f" Success rate: {result.success_rate:.1%}")
        click.echo(f" Models found: {result.total_models:,}")
        click.echo(f" Engines found: {result.total_engines:,}")
        click.echo(f" Electric models: {result.total_electric_models:,}")

        if result.failed_extractions > 0:
            click.echo(f" ⚠️ Failed extractions: {result.failed_extractions}")

        # NOTE(review): the nesting below is an assumption. The detailed
        # sections (top makes, makes with errors, data-quality insights) are
        # all treated as part of the verbose-only block; confirm against the
        # original file's indentation before merging.
        if verbose and result.makes:
            # Show top makes by model count
            click.echo("\n🏆 Top Makes by Model Count:")
            top_makes = sorted(result.makes, key=lambda m: m.total_models, reverse=True)[:10]
            for rank, make in enumerate(top_makes, 1):
                click.echo(f" {rank:2d}. {make.name}: {make.total_models} models, {make.total_engines} engines")

            # Show makes with issues
            error_makes = [make for make in result.makes if make.processing_errors]
            if error_makes:
                click.echo(f"\n⚠️ Makes with Processing Errors ({len(error_makes)}):")
                for make in error_makes[:5]:
                    click.echo(f" • {make.name}: {len(make.processing_errors)} errors")
                if len(error_makes) > 5:
                    click.echo(f" ... and {len(error_makes) - 5} more")

            # Show data quality insights
            click.echo("\n📊 Data Quality Insights:")

            # Engine configuration distribution
            config_counts = {}
            for make in result.makes:
                for model in make.models:
                    for engine in model.engines:
                        key = engine.configuration
                        config_counts[key] = config_counts.get(key, 0) + 1

            if config_counts:
                click.echo(" Engine configurations:")
                ranked = sorted(config_counts.items(), key=lambda item: item[1], reverse=True)
                for engine_config, count in ranked:
                    percentage = count / result.total_engines * 100
                    click.echo(f" {engine_config}: {count:,} ({percentage:.1f}%)")

        if result.failed_extractions > 0:
            sys.exit(1)

    except Exception as e:
        click.echo(f"❌ Validation failed: {str(e)}", err=True)
        if verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
    # With any command-line argument, dispatch through the click group;
    # with none, fall back to scheduler mode.
    # NOTE(review): the no-argument path calls start_etl_scheduler() directly,
    # so the setup_logging() call in cli() does not run on it. Confirm the
    # scheduler configures logging itself.
    if len(sys.argv) != 1:
        cli()
    else:
        start_etl_scheduler()
|
||||
92
mvp-platform-services/vehicles/etl/pipeline.py
Normal file
92
mvp-platform-services/vehicles/etl/pipeline.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env python3
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from .config import config
|
||||
from .builders.normalized_vehicle_builder import NormalizedVehicleBuilder
|
||||
from .utils.make_filter import MakeFilter
|
||||
from .connections import test_connections
|
||||
from .downloaders.nhtsa_downloader import NHTSADownloader
|
||||
from .loaders.mssql_loader import MSSQLLoader
|
||||
from .extractors.vin_proc_extractor import VinProcExtractor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def run_etl_pipeline():
|
||||
"""Complete ETL pipeline execution including download and database loading"""
|
||||
logger.info("Starting complete ETL pipeline")
|
||||
start_time = datetime.now()
|
||||
|
||||
try:
|
||||
# Step 1: Download NHTSA database file
|
||||
logger.info("Step 1: Downloading NHTSA vPIC database")
|
||||
downloader = NHTSADownloader()
|
||||
|
||||
bak_file = downloader.ensure_database_file(force_download=False)
|
||||
if not bak_file:
|
||||
logger.error("Failed to obtain NHTSA database file")
|
||||
return False
|
||||
|
||||
db_info = downloader.get_database_info(bak_file)
|
||||
logger.info(f"Using database file: {db_info['name']} ({db_info['size_mb']} MB)")
|
||||
|
||||
# Step 2: Load database into MSSQL
|
||||
logger.info("Step 2: Loading database into MSSQL Server")
|
||||
mssql_loader = MSSQLLoader()
|
||||
|
||||
if not mssql_loader.test_connection():
|
||||
logger.error("MSSQL connection test failed")
|
||||
return False
|
||||
|
||||
if not mssql_loader.restore_database(bak_file):
|
||||
logger.error("Failed to restore database to MSSQL")
|
||||
return False
|
||||
|
||||
# Verify MSSQL database content
|
||||
content_info = mssql_loader.verify_database_content()
|
||||
logger.info(f"MSSQL database loaded with tables: {content_info}")
|
||||
|
||||
# Step 2b: Research stored procedure definition/output for parity
|
||||
try:
|
||||
logger.info("Step 2b: Inspecting MSSQL VIN decode stored procedure for parity")
|
||||
vpe = VinProcExtractor()
|
||||
meta = vpe.find_proc()
|
||||
if meta:
|
||||
logger.info(f"VIN proc found: {meta['schema_name']}.{meta['object_name']} ({meta['type_desc']})")
|
||||
definition = vpe.get_definition(meta['schema_name'], meta['object_name'])
|
||||
logger.debug(f"VIN proc definition (first 500 chars): {definition[:500]}")
|
||||
sample = vpe.sample_execute('1G1YU3D64H5602799')
|
||||
if sample is not None:
|
||||
logger.info(f"VIN proc sample output columns: {list(sample[0].keys()) if sample else 'no rows'}")
|
||||
else:
|
||||
logger.warning("VIN decode proc not found by pattern; continuing with catalog build")
|
||||
except Exception as e:
|
||||
logger.warning(f"VIN proc inspection failed (non-fatal): {e}")
|
||||
|
||||
# Step 3: Test all connections (MSSQL + PostgreSQL)
|
||||
logger.info("Step 3: Testing all database connections")
|
||||
if not test_connections():
|
||||
logger.error("Connection test failed after database loading")
|
||||
return False
|
||||
|
||||
# Step 4: Build normalized PostgreSQL schema from MSSQL with make filtering
|
||||
logger.info("Step 4: Building normalized PostgreSQL vehicle schema from MSSQL with make filtering")
|
||||
make_filter = MakeFilter()
|
||||
builder = NormalizedVehicleBuilder(make_filter)
|
||||
success = builder.build()
|
||||
|
||||
elapsed = datetime.now() - start_time
|
||||
if success:
|
||||
logger.info(f"Complete ETL pipeline finished successfully in {elapsed}")
|
||||
logger.info("✅ ETL Summary:")
|
||||
logger.info(f" - Downloaded: {db_info['name']} ({db_info['size_mb']} MB)")
|
||||
logger.info(f" - MSSQL Tables: {content_info}")
|
||||
logger.info(f" - PostgreSQL normalized schema: Built successfully")
|
||||
return True
|
||||
else:
|
||||
logger.error(f"ETL pipeline failed during normalized schema building after {elapsed}")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
elapsed = datetime.now() - start_time
|
||||
logger.error(f"ETL pipeline crashed after {elapsed}: {e}", exc_info=True)
|
||||
return False
|
||||
1
mvp-platform-services/vehicles/etl/pipelines/__init__.py
Normal file
1
mvp-platform-services/vehicles/etl/pipelines/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Pipelines package
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,465 @@
|
||||
"""
|
||||
Manual JSON Pipeline for Vehicle Data Processing
|
||||
|
||||
Coordinates end-to-end processing of JSON vehicle data:
|
||||
1. Extract data from JSON files
|
||||
2. Load data into PostgreSQL database
|
||||
3. Progress tracking and comprehensive reporting
|
||||
|
||||
Key Features:
|
||||
- Full extraction→loading workflow coordination
|
||||
- Clear/append mode support
|
||||
- Progress tracking with detailed statistics
|
||||
- Comprehensive error handling and reporting
|
||||
- Performance monitoring and optimization
|
||||
- Referential integrity validation
|
||||
|
||||
Usage:
|
||||
pipeline = ManualJsonPipeline(sources_dir="sources/makes")
|
||||
result = pipeline.run(mode=LoadMode.APPEND, progress_callback=print_progress)
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from typing import List, Dict, Optional, Callable, Tuple
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
# Import our components (handle both relative and direct imports)
|
||||
try:
|
||||
from ..extractors.json_extractor import JsonExtractor, ExtractionResult
|
||||
from ..loaders.json_manual_loader import JsonManualLoader, LoadMode, LoadResult
|
||||
from ..utils.make_name_mapper import MakeNameMapper
|
||||
from ..utils.engine_spec_parser import EngineSpecParser
|
||||
except ImportError:
|
||||
# Fallback for direct execution
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||
from extractors.json_extractor import JsonExtractor, ExtractionResult
|
||||
from loaders.json_manual_loader import JsonManualLoader, LoadMode, LoadResult
|
||||
from utils.make_name_mapper import MakeNameMapper
|
||||
from utils.engine_spec_parser import EngineSpecParser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class PipelineConfig:
    """Settings for a ManualJsonPipeline run.

    Construction raises ValueError when ``sources_directory`` is empty or
    does not exist on disk.
    """
    # Directory containing JSON files
    sources_directory: str
    # Loading mode
    load_mode: LoadMode = LoadMode.APPEND
    # Enable progress callbacks
    enable_progress_tracking: bool = True
    # Validate referential integrity after loading
    validate_integrity: bool = True
    # Database batch size
    batch_size: int = 1000
    # Logging level
    log_level: str = "INFO"

    def __post_init__(self):
        """Validate configuration"""
        directory = self.sources_directory
        if not directory:
            raise ValueError("sources_directory is required")

        if Path(directory).exists():
            return
        raise ValueError(f"Sources directory does not exist: {self.sources_directory}")
|
||||
|
||||
|
||||
@dataclass
class PipelineResult:
    """Everything recorded about one pipeline execution.

    ``start_time`` and ``end_time`` are subtracted to give the duration in
    seconds, so both must be expressed in seconds on the same clock.
    """
    # Configuration the run was executed with
    config: PipelineConfig

    # Timing
    start_time: float
    end_time: float

    # Extraction results
    extraction_result: ExtractionResult

    # Loading results
    load_result: LoadResult

    # Performance metrics
    total_files_processed: int
    total_records_loaded: int
    files_per_second: float
    records_per_second: float

    # Quality metrics
    extraction_success_rate: float
    loading_success_rate: float
    overall_success_rate: float

    # Validation results
    integrity_issues: List[str]

    @property
    def duration_seconds(self) -> float:
        """Elapsed time between ``start_time`` and ``end_time``."""
        return self.end_time - self.start_time

    @property
    def duration_minutes(self) -> float:
        """``duration_seconds`` expressed in minutes."""
        return self.duration_seconds / 60.0

    @property
    def was_successful(self) -> bool:
        """True if pipeline completed without critical errors"""
        # Success needs no failed extractions, no failed makes and no
        # integrity issues; checks stop at the first failing condition.
        if not (self.extraction_result.failed_extractions == 0):
            return False
        if not (len(self.load_result.failed_makes) == 0):
            return False
        return len(self.integrity_issues) == 0
|
||||
|
||||
|
||||
class PipelineProgress:
    """Progress tracking for pipeline execution.

    Holds the file counters and wall-clock start times from which
    ``get_progress_info`` derives percentage, throughput and ETA.
    """

    def __init__(self, total_files: int):
        """Start tracking a run that is expected to process ``total_files`` files."""
        self.total_files = total_files
        self.current_file = 0
        self.current_phase = "Starting"
        self.start_time = time.time()
        self.phase_start_time = time.time()

    def update_phase(self, phase: str) -> None:
        """Update current phase and restart the per-phase timer."""
        self.current_phase = phase
        self.phase_start_time = time.time()

    def update_file_progress(self, files_completed: int) -> None:
        """Record how many files have been completed so far."""
        self.current_file = files_completed

    def get_progress_info(self) -> Dict[str, object]:
        """Get current progress information.

        Returns:
            A dict with keys 'phase', 'files_completed', 'total_files',
            'percentage', 'elapsed_seconds', 'phase_elapsed_seconds',
            'files_per_second' and 'eta_seconds'. Throughput and ETA are 0
            until at least one file is complete and some time has elapsed.
        """
        now = time.time()
        elapsed = now - self.start_time
        phase_elapsed = now - self.phase_start_time

        # Guard on elapsed as well as the file count: if the clock has not
        # advanced since start, dividing by elapsed raised ZeroDivisionError.
        if self.current_file > 0 and elapsed > 0:
            files_per_second = self.current_file / elapsed
            remaining = self.total_files - self.current_file
            eta_seconds = remaining / files_per_second if files_per_second > 0 else 0
        else:
            files_per_second = 0
            eta_seconds = 0

        if self.total_files > 0:
            percentage = self.current_file / self.total_files * 100
        else:
            percentage = 0

        return {
            'phase': self.current_phase,
            'files_completed': self.current_file,
            'total_files': self.total_files,
            'percentage': percentage,
            'elapsed_seconds': elapsed,
            'phase_elapsed_seconds': phase_elapsed,
            'files_per_second': files_per_second,
            'eta_seconds': eta_seconds,
        }
|
||||
|
||||
|
||||
class ManualJsonPipeline:
    """End-to-end JSON processing pipeline.

    Extracts vehicle data from the ``*.json`` files of a sources directory,
    loads it through the loader, optionally validates referential integrity,
    and reports progress through an optional callback.
    """

    def __init__(self, sources_dir: str, config: Optional[PipelineConfig] = None):
        """
        Initialize pipeline

        Args:
            sources_dir: Directory containing JSON files
            config: Pipeline configuration (optional); when omitted a default
                PipelineConfig is created for ``sources_dir``
        """
        self.sources_dir = sources_dir
        self.config = config or PipelineConfig(sources_directory=sources_dir)

        # Initialize components
        self.make_mapper = MakeNameMapper()
        self.engine_parser = EngineSpecParser()
        self.extractor = JsonExtractor(self.make_mapper, self.engine_parser)
        self.loader = JsonManualLoader()

        # Progress tracking
        self.progress_callback: Optional[Callable[[Dict[str, any]], None]] = None

        logger.info(f"ManualJsonPipeline initialized for {sources_dir}")

    def set_progress_callback(self, callback: Callable[[Dict[str, any]], None]) -> None:
        """
        Set progress callback function

        Args:
            callback: Function to call with progress updates (receives the
                dictionary produced by ``progress.get_progress_info()``)
        """
        self.progress_callback = callback

    def _update_progress(self, progress: PipelineProgress) -> None:
        """Forward a progress snapshot to the callback, if one is set and tracking is enabled."""
        if not (self.progress_callback and self.config.enable_progress_tracking):
            return
        self.progress_callback(progress.get_progress_info())

    def _list_json_files(self) -> list:
        """Return the ``*.json`` paths located directly inside the sources directory."""
        return list(Path(self.sources_dir).glob("*.json"))

    def _run_extraction_phase(self, progress: PipelineProgress, total_files: int) -> ExtractionResult:
        """Phase 1: extract every make from the sources directory and report progress."""
        progress.update_phase("Extracting data from JSON files")
        self._update_progress(progress)

        logger.info(f"Phase 1: Extracting data from {total_files} JSON files")
        extraction_result = self.extractor.extract_all_makes(self.sources_dir)

        progress.update_file_progress(extraction_result.total_files_processed)
        self._update_progress(progress)

        if extraction_result.failed_extractions > 0:
            logger.warning(f"Extraction completed with {extraction_result.failed_extractions} failures")
        else:
            logger.info(f"Extraction completed successfully: {extraction_result.total_models} models, {extraction_result.total_engines} engines")
        return extraction_result

    def _run_loading_phase(self, progress: PipelineProgress, extraction_result: ExtractionResult, load_mode: LoadMode):
        """Phase 2: hand the extracted makes to the loader and return its result."""
        progress.update_phase("Loading data into database")
        self._update_progress(progress)

        logger.info(f"Phase 2: Loading {len(extraction_result.makes)} makes into database ({load_mode.value} mode)")
        load_result = self.loader.load_all_makes(extraction_result.makes, load_mode)

        if len(load_result.failed_makes) > 0:
            logger.warning(f"Loading completed with {len(load_result.failed_makes)} failures")
        else:
            logger.info(f"Loading completed successfully: {load_result.success_count} makes loaded")
        return load_result

    def _run_integrity_phase(self, progress: PipelineProgress) -> list:
        """Phase 3: run the loader's integrity validation; returns [] when validation is disabled."""
        if not self.config.validate_integrity:
            return []

        progress.update_phase("Validating referential integrity")
        self._update_progress(progress)

        logger.info("Phase 3: Validating referential integrity")
        integrity_issues = self.loader.validate_referential_integrity()

        if integrity_issues:
            logger.warning(f"Referential integrity issues found: {len(integrity_issues)}")
        else:
            logger.info("Referential integrity validation passed")
        return integrity_issues

    def run(self, mode: Optional[LoadMode] = None, progress_callback: Optional[Callable] = None) -> PipelineResult:
        """
        Execute complete pipeline

        Args:
            mode: Loading mode (overrides config)
            progress_callback: Progress callback function; when given it is
                installed via set_progress_callback() and stays installed
                after this call

        Returns:
            PipelineResult with complete execution details

        Raises:
            ValueError: If the sources directory contains no ``*.json`` files.
            Exception: Any failure raised by a phase is logged and re-raised.
        """
        start_time = time.time()

        # Override config if specified
        load_mode = mode or self.config.load_mode
        if progress_callback:
            self.set_progress_callback(progress_callback)

        logger.info(f"Starting manual JSON pipeline in {load_mode.value} mode")
        logger.info(f"Processing directory: {self.sources_dir}")

        try:
            # Count files for progress tracking
            total_files = len(self._list_json_files())
            if total_files == 0:
                raise ValueError(f"No JSON files found in {self.sources_dir}")

            progress = PipelineProgress(total_files)

            extraction_result = self._run_extraction_phase(progress, total_files)
            load_result = self._run_loading_phase(progress, extraction_result, load_mode)
            integrity_issues = self._run_integrity_phase(progress)

            # Calculate performance metrics
            end_time = time.time()
            duration = end_time - start_time

            files_per_second = total_files / duration if duration > 0 else 0
            total_records = (load_result.total_models + load_result.total_engines +
                             load_result.total_trims + load_result.total_trim_engine_mappings)
            records_per_second = total_records / duration if duration > 0 else 0

            # The pipeline is only as successful as its weakest phase
            extraction_success_rate = extraction_result.success_rate
            loading_success_rate = load_result.success_rate
            overall_success_rate = min(extraction_success_rate, loading_success_rate)

            result = PipelineResult(
                config=self.config,
                start_time=start_time,
                end_time=end_time,
                extraction_result=extraction_result,
                load_result=load_result,
                total_files_processed=total_files,
                total_records_loaded=total_records,
                files_per_second=files_per_second,
                records_per_second=records_per_second,
                extraction_success_rate=extraction_success_rate,
                loading_success_rate=loading_success_rate,
                overall_success_rate=overall_success_rate,
                integrity_issues=integrity_issues
            )

            progress.update_phase("Pipeline complete")
            self._update_progress(progress)

            logger.info(f"Pipeline completed in {result.duration_seconds:.1f} seconds")
            logger.info(f"Performance: {files_per_second:.1f} files/sec, {records_per_second:.0f} records/sec")
            logger.info(f"Overall success rate: {overall_success_rate:.1%}")

            return result

        except Exception as e:
            # Log how long the run lasted, then let the caller handle the error
            end_time = time.time()
            logger.error(f"Pipeline failed after {end_time - start_time:.1f} seconds: {str(e)}")
            raise

    def run_extraction_only(self) -> ExtractionResult:
        """
        Run extraction phase only (for testing/validation)

        Returns:
            ExtractionResult with extracted data
        """
        logger.info("Running extraction-only pipeline")

        result = self.extractor.extract_all_makes(self.sources_dir)

        logger.info(f"Extraction complete: {result.total_models} models, {result.total_engines} engines")
        logger.info(f"Success rate: {result.success_rate:.1%}")

        return result

    def get_source_statistics(self) -> Dict[str, any]:
        """
        Get statistics about source JSON files

        Returns:
            Dictionary with the keys 'total_files', 'total_size_bytes',
            'total_size_mb', 'average_file_size_kb' (0 when there are no
            files) and 'directory'
        """
        json_files = self._list_json_files()
        total_size_bytes = sum(f.stat().st_size for f in json_files)

        return {
            'total_files': len(json_files),
            'total_size_bytes': total_size_bytes,
            'total_size_mb': total_size_bytes / (1024 * 1024),
            'average_file_size_kb': (total_size_bytes / len(json_files) / 1024) if json_files else 0,
            'directory': str(self.sources_dir)
        }

    def print_pipeline_report(self, result: PipelineResult) -> None:
        """
        Print comprehensive pipeline execution report

        Args:
            result: PipelineResult from pipeline execution
        """
        print("🚀 MANUAL JSON PIPELINE EXECUTION REPORT")
        print("=" * 60)

        # Configuration
        print("\n⚙️ CONFIGURATION")
        print(f" Sources directory: {result.config.sources_directory}")
        print(f" Load mode: {result.config.load_mode.value.upper()}")
        print(f" Batch size: {result.config.batch_size}")
        print(f" Integrity validation: {'Enabled' if result.config.validate_integrity else 'Disabled'}")

        # Performance
        print("\n⏱️ PERFORMANCE")
        print(f" Total duration: {result.duration_seconds:.1f} seconds ({result.duration_minutes:.1f} minutes)")
        print(f" Files processed: {result.total_files_processed}")
        print(f" Records loaded: {result.total_records_loaded:,}")
        print(f" Processing rate: {result.files_per_second:.1f} files/sec")
        print(f" Loading rate: {result.records_per_second:,.0f} records/sec")

        # Success rates
        print("\n📊 SUCCESS RATES")
        print(f" Extraction: {result.extraction_success_rate:.1%}")
        print(f" Loading: {result.loading_success_rate:.1%}")
        print(f" Overall: {result.overall_success_rate:.1%}")

        # Data summary
        print("\n📈 DATA PROCESSED")
        print(f" Makes: {result.load_result.total_makes}")
        print(f" Models: {result.load_result.total_models}")
        print(f" Model years: {result.load_result.total_model_years}")
        print(f" Trims: {result.load_result.total_trims}")
        print(f" Engines: {result.load_result.total_engines}")
        print(f" Trim-engine mappings: {result.load_result.total_trim_engine_mappings}")

        # Issues
        if result.load_result.failed_makes:
            print(f"\n⚠️ FAILED MAKES ({len(result.load_result.failed_makes)}):")
            for make in result.load_result.failed_makes:
                print(f" {make}")

        if result.integrity_issues:
            print(f"\n❌ REFERENTIAL INTEGRITY ISSUES ({len(result.integrity_issues)}):")
            for issue in result.integrity_issues:
                print(f" {issue}")
        elif result.config.validate_integrity:
            print("\n✅ REFERENTIAL INTEGRITY: PASSED")
        else:
            # An empty issue list only means "passed" if validation actually ran
            print("\n⏭️ REFERENTIAL INTEGRITY: SKIPPED (validation disabled)")

        # Final status
        print(f"\n🎯 PIPELINE STATUS: {'SUCCESS' if result.was_successful else 'COMPLETED WITH ISSUES'}")
|
||||
|
||||
|
||||
def default_progress_callback(progress_info: Dict[str, any]) -> None:
    """Print a one-line progress summary to the console.

    Args:
        progress_info: Progress dictionary; the keys 'percentage', 'phase',
            'files_completed', 'total_files' and 'files_per_second' are
            always read, and 'eta_seconds' is read when the rate is positive.
    """
    summary = (f"[{progress_info['percentage']:5.1f}%] {progress_info['phase']}: "
               f"{progress_info['files_completed']}/{progress_info['total_files']} files")

    rate = progress_info['files_per_second']
    if rate > 0:
        # Throughput is known, so append the rate and the ETA in minutes
        eta_minutes = progress_info['eta_seconds'] / 60
        summary += f" ({rate:.1f} files/sec, ETA: {eta_minutes:.1f}min)"

    print(summary)
|
||||
|
||||
|
||||
# Example usage and testing functions
|
||||
def example_usage(sources_dir: str = "sources/makes"):
    """Demonstrate ManualJsonPipeline usage.

    Prints a sample code snippet and, when possible, statistics about the
    JSON files found in ``sources_dir``. Nothing is extracted or loaded.

    Args:
        sources_dir: Directory expected to contain the per-make JSON files.
            Defaults to "sources/makes", resolved against the current
            working directory.
    """
    print("🚀 ManualJsonPipeline Example Usage")
    print("=" * 40)

    if not Path(sources_dir).exists():
        print(f"❌ Sources directory not found: {sources_dir}")
        return

    print("\n💡 Example pipeline execution:")
    print(f"""
# Create pipeline with configuration
config = PipelineConfig(
    sources_directory="{sources_dir}",
    load_mode=LoadMode.APPEND,
    enable_progress_tracking=True,
    validate_integrity=True
)

pipeline = ManualJsonPipeline("{sources_dir}", config)

# Run with progress tracking
result = pipeline.run(progress_callback=default_progress_callback)

# Print comprehensive report
pipeline.print_pipeline_report(result)
""")

    # Show source statistics; this is a best-effort demo, so failures are
    # reported on the console instead of being raised
    try:
        pipeline = ManualJsonPipeline(sources_dir)
        stats = pipeline.get_source_statistics()

        print("\n📊 Source Directory Statistics:")
        print(f" Files: {stats['total_files']}")
        print(f" Total size: {stats['total_size_mb']:.1f} MB")
        print(f" Average file size: {stats['average_file_size_kb']:.1f} KB")

    except Exception as e:
        print(f"⚠️ Could not get source statistics: {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Executed as a script: print the usage walkthrough
    example_usage()
|
||||
71
mvp-platform-services/vehicles/etl/scheduler.py
Normal file
71
mvp-platform-services/vehicles/etl/scheduler.py
Normal file
@@ -0,0 +1,71 @@
|
||||
import schedule
|
||||
import time
|
||||
import logging
|
||||
from datetime import datetime
|
||||
# The pipeline module is imported lazily inside scheduled_etl_job() to avoid a circular import
|
||||
import importlib
|
||||
from .config import config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def scheduled_etl_job():
    """Execute the ETL pipeline on schedule.

    Runs ``run_etl_pipeline()`` and logs the outcome together with the wall
    clock duration. Exceptions raised by the pipeline (or by importing it)
    are logged with their traceback and swallowed, so one failed run does
    not propagate to the caller.
    """
    start_time = datetime.now()
    logger.info(f"Starting scheduled ETL job at {start_time}")

    try:
        # Import dynamically to avoid circular import
        from .pipeline import run_etl_pipeline
        success = run_etl_pipeline()
        duration = datetime.now() - start_time

        if success:
            logger.info(f"ETL job completed successfully in {duration}")
        else:
            logger.error(f"ETL job failed after {duration}")

    except Exception as e:
        duration = datetime.now() - start_time
        # exc_info keeps the traceback; the message alone rarely says where
        # the pipeline crashed
        logger.error(f"ETL job crashed after {duration}: {e}", exc_info=True)
|
||||
|
||||
# Cron day-of-week numbers (0 and 7 both mean Sunday) -> schedule unit names
_CRON_WEEKDAYS = ("sunday", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday")


def _parse_simple_cron(pattern):
    """Translate a "M H * * DOW" cron pattern into (schedule unit, "HH:MM"), or None if unsupported."""
    fields = str(pattern).split()
    if len(fields) != 5:
        return None

    minute, hour, day_of_month, month, day_of_week = fields
    # Only fixed-time daily/weekly schedules are supported
    if day_of_month != "*" or month != "*":
        return None
    if not (minute.isdecimal() and hour.isdecimal()):
        return None
    if int(minute) > 59 or int(hour) > 23:
        return None

    if day_of_week == "*":
        unit = "day"
    elif day_of_week.isdecimal() and int(day_of_week) <= 7:
        unit = _CRON_WEEKDAYS[int(day_of_week) % 7]
    else:
        return None

    return unit, f"{int(hour):02d}:{int(minute):02d}"


def start_etl_scheduler():
    """Start the ETL scheduler.

    Registers ``scheduled_etl_job`` according to ``config.ETL_SCHEDULE`` and
    then blocks, polling for due jobs once a minute, until interrupted with
    Ctrl+C.

    Supported cron patterns have the form "M H * * DOW" with a numeric
    minute and hour and a day-of-week of ``*`` (daily) or 0-7 (weekly, 0 and
    7 are Sunday). For example "0 2 * * 0" runs every Sunday at 02:00. Any
    other pattern falls back to daily at 02:00 with a warning.
    """
    logger.info(f"Starting ETL scheduler with cron pattern: {config.ETL_SCHEDULE}")

    parsed = _parse_simple_cron(config.ETL_SCHEDULE)
    if parsed is None:
        # Default fallback - run once daily at 2 AM
        unit, at_time = "day", "02:00"
        logger.warning(f"Unknown cron pattern {config.ETL_SCHEDULE}, defaulting to daily at 2:00 AM")
    else:
        unit, at_time = parsed
        logger.info(f"Scheduled ETL to run every {unit} at {at_time}")

    # schedule exposes "day" and the weekday names as attributes of a job
    getattr(schedule.every(), unit).at(at_time).do(scheduled_etl_job)

    # Run scheduler loop
    logger.info("ETL scheduler started")

    while True:
        try:
            schedule.run_pending()
            time.sleep(60)  # Check every minute
        except KeyboardInterrupt:
            logger.info("ETL scheduler stopped by user")
            break
        except Exception as e:
            logger.error(f"ETL scheduler error: {e}")
            time.sleep(300)  # Wait 5 minutes on error
|
||||
|
||||
if __name__ == "__main__":
    # Resolve the configured level name (e.g. "info") to its logging constant
    log_level = getattr(logging, config.LOG_LEVEL.upper())
    logging.basicConfig(level=log_level, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Blocks until the scheduler loop is interrupted
    start_etl_scheduler()
|
||||
64
mvp-platform-services/vehicles/etl/sources/makes.json
Normal file
64
mvp-platform-services/vehicles/etl/sources/makes.json
Normal file
@@ -0,0 +1,64 @@
|
||||
{
|
||||
"manufacturers": [
|
||||
"Acura",
|
||||
"Alfa Romeo",
|
||||
"Aston Martin",
|
||||
"Audi",
|
||||
"BMW",
|
||||
"Bentley",
|
||||
"Buick",
|
||||
"Cadillac",
|
||||
"Chevrolet",
|
||||
"Chrysler",
|
||||
"Dodge",
|
||||
"Ferrari",
|
||||
"Fiat",
|
||||
"Ford",
|
||||
"GMC",
|
||||
"Genesis",
|
||||
"Geo",
|
||||
"Honda",
|
||||
"Hummer",
|
||||
"Hyundai",
|
||||
"Infiniti",
|
||||
"Isuzu",
|
||||
"Jaguar",
|
||||
"Jeep",
|
||||
"Kia",
|
||||
"Lamborghini",
|
||||
"Land Rover",
|
||||
"Lexus",
|
||||
"Lincoln",
|
||||
"Lotus",
|
||||
"Mazda",
|
||||
"Maserati",
|
||||
"Maybach",
|
||||
"McLaren",
|
||||
"Mercedes-Benz",
|
||||
"Mercury",
|
||||
"MINI",
|
||||
"Mitsubishi",
|
||||
"Nissan",
|
||||
"Oldsmobile",
|
||||
"Plymouth",
|
||||
"Polestar",
|
||||
"Pontiac",
|
||||
"Porsche",
|
||||
"Ram",
|
||||
"Rivian",
|
||||
"Rolls Royce",
|
||||
"Saab",
|
||||
"Saturn",
|
||||
"Scion",
|
||||
"Smart",
|
||||
"Subaru",
|
||||
"Tesla",
|
||||
"Toyota",
|
||||
"Volkswagen",
|
||||
"Volvo",
|
||||
"Karma",
|
||||
"Pagani",
|
||||
"Koenigsegg",
|
||||
"Lucid"
|
||||
]
|
||||
}
|
||||
3162
mvp-platform-services/vehicles/etl/sources/makes/acura.json
Normal file
3162
mvp-platform-services/vehicles/etl/sources/makes/acura.json
Normal file
File diff suppressed because it is too large
Load Diff
3446
mvp-platform-services/vehicles/etl/sources/makes/alfa_romeo.json
Normal file
3446
mvp-platform-services/vehicles/etl/sources/makes/alfa_romeo.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,506 @@
|
||||
{
|
||||
"aston_martin": [
|
||||
{
|
||||
"year": "2023",
|
||||
"models": [
|
||||
{
|
||||
"name": "Vantage",
|
||||
"engines": [
|
||||
"4.0L V8",
|
||||
"5.2L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"AMR",
|
||||
"V12",
|
||||
"Base"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2020",
|
||||
"models": [
|
||||
{
|
||||
"name": "DB11",
|
||||
"engines": [
|
||||
"4.0L V8"
|
||||
],
|
||||
"submodels": []
|
||||
},
|
||||
{
|
||||
"name": "Dbs",
|
||||
"engines": [
|
||||
"5.2L V12"
|
||||
],
|
||||
"submodels": []
|
||||
},
|
||||
{
|
||||
"name": "Vantage",
|
||||
"engines": [
|
||||
"4.0L V8",
|
||||
"5.2L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"AMR",
|
||||
"V12",
|
||||
"Base"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2019",
|
||||
"models": [
|
||||
{
|
||||
"name": "Vantage",
|
||||
"engines": [
|
||||
"4.0L V8",
|
||||
"5.2L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"AMR",
|
||||
"V12",
|
||||
"Base"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2018",
|
||||
"models": [
|
||||
{
|
||||
"name": "Rapide",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": []
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2017",
|
||||
"models": [
|
||||
{
|
||||
"name": "V12 Vantage",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"S"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Vanquish",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"Carbon",
|
||||
"Base",
|
||||
"Volante"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2016",
|
||||
"models": [
|
||||
{
|
||||
"name": "Rapide",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": []
|
||||
},
|
||||
{
|
||||
"name": "V12 Vantage",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"S"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Vanquish",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"Carbon",
|
||||
"Base",
|
||||
"Volante"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2015",
|
||||
"models": [
|
||||
{
|
||||
"name": "DB9",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"Volante",
|
||||
"Base"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Rapide",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": []
|
||||
},
|
||||
{
|
||||
"name": "V12 Vantage",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"S"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "V8 Vantage",
|
||||
"engines": [
|
||||
"4.3L V8",
|
||||
"4.7L V8"
|
||||
],
|
||||
"submodels": [
|
||||
"GT",
|
||||
"S",
|
||||
"Base"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Vanquish",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"Carbon",
|
||||
"Base",
|
||||
"Volante"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2014",
|
||||
"models": [
|
||||
{
|
||||
"name": "DB9",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"Volante",
|
||||
"Base"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "V8 Vantage",
|
||||
"engines": [
|
||||
"4.3L V8",
|
||||
"4.7L V8"
|
||||
],
|
||||
"submodels": [
|
||||
"GT",
|
||||
"S",
|
||||
"Base"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Vanquish",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"Carbon",
|
||||
"Base",
|
||||
"Volante"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2013",
|
||||
"models": [
|
||||
{
|
||||
"name": "V8 Vantage",
|
||||
"engines": [
|
||||
"4.3L V8",
|
||||
"4.7L V8"
|
||||
],
|
||||
"submodels": [
|
||||
"GT",
|
||||
"S",
|
||||
"Base"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2012",
|
||||
"models": [
|
||||
{
|
||||
"name": "V8 Vantage",
|
||||
"engines": [
|
||||
"4.3L V8",
|
||||
"4.7L V8"
|
||||
],
|
||||
"submodels": [
|
||||
"GT",
|
||||
"S",
|
||||
"Base"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2011",
|
||||
"models": [
|
||||
{
|
||||
"name": "V12 Vantage",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"S"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "V8 Vantage",
|
||||
"engines": [
|
||||
"4.3L V8",
|
||||
"4.7L V8"
|
||||
],
|
||||
"submodels": [
|
||||
"GT",
|
||||
"S",
|
||||
"Base"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2010",
|
||||
"models": [
|
||||
{
|
||||
"name": "DB9",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"Volante",
|
||||
"Base"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "V8 Vantage",
|
||||
"engines": [
|
||||
"4.3L V8",
|
||||
"4.7L V8"
|
||||
],
|
||||
"submodels": [
|
||||
"GT",
|
||||
"S",
|
||||
"Base"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2009",
|
||||
"models": [
|
||||
{
|
||||
"name": "DB9",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"Volante",
|
||||
"Base"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "V8 Vantage",
|
||||
"engines": [
|
||||
"4.3L V8",
|
||||
"4.7L V8"
|
||||
],
|
||||
"submodels": [
|
||||
"GT",
|
||||
"S",
|
||||
"Base"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2008",
|
||||
"models": [
|
||||
{
|
||||
"name": "V8 Vantage",
|
||||
"engines": [
|
||||
"4.3L V8",
|
||||
"4.7L V8"
|
||||
],
|
||||
"submodels": [
|
||||
"GT",
|
||||
"S",
|
||||
"Base"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2007",
|
||||
"models": [
|
||||
{
|
||||
"name": "DB9",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"Volante",
|
||||
"Base"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "V8 Vantage",
|
||||
"engines": [
|
||||
"4.3L V8",
|
||||
"4.7L V8"
|
||||
],
|
||||
"submodels": [
|
||||
"GT",
|
||||
"S",
|
||||
"Base"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2006",
|
||||
"models": [
|
||||
{
|
||||
"name": "V8 Vantage",
|
||||
"engines": [
|
||||
"4.3L V8",
|
||||
"4.7L V8"
|
||||
],
|
||||
"submodels": [
|
||||
"GT",
|
||||
"S",
|
||||
"Base"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2005",
|
||||
"models": [
|
||||
{
|
||||
"name": "DB9",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"Volante",
|
||||
"Base"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Vantage",
|
||||
"engines": [
|
||||
"4.0L V8",
|
||||
"5.2L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"AMR",
|
||||
"V12",
|
||||
"Base"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2002",
|
||||
"models": [
|
||||
{
|
||||
"name": "DB7",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"Vantage Volante",
|
||||
"Vantage"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2001",
|
||||
"models": [
|
||||
{
|
||||
"name": "DB7",
|
||||
"engines": [
|
||||
"6.0L V12"
|
||||
],
|
||||
"submodels": [
|
||||
"Vantage Volante",
|
||||
"Vantage"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "1993",
|
||||
"models": [
|
||||
{
|
||||
"name": "Virage",
|
||||
"engines": [
|
||||
"5.3L V8"
|
||||
],
|
||||
"submodels": [
|
||||
"Volante"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "1990",
|
||||
"models": [
|
||||
{
|
||||
"name": "Virage",
|
||||
"engines": [
|
||||
"5.3L V8"
|
||||
],
|
||||
"submodels": [
|
||||
"Volante"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "1983",
|
||||
"models": [
|
||||
{
|
||||
"name": "V 8",
|
||||
"engines": [
|
||||
"5.3L V8"
|
||||
],
|
||||
"submodels": []
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
15979
mvp-platform-services/vehicles/etl/sources/makes/audi.json
Normal file
15979
mvp-platform-services/vehicles/etl/sources/makes/audi.json
Normal file
File diff suppressed because it is too large
Load Diff
427
mvp-platform-services/vehicles/etl/sources/makes/bentley.json
Normal file
427
mvp-platform-services/vehicles/etl/sources/makes/bentley.json
Normal file
@@ -0,0 +1,427 @@
|
||||
{
|
||||
"bentley": [
|
||||
{
|
||||
"year": "2023",
|
||||
"models": [
|
||||
{
|
||||
"name": "Flying Spur",
|
||||
"engines": [
|
||||
"2.9L V6 MILD HYBRID EV- (MHEV)",
|
||||
"4.0L V8",
|
||||
"6.0L W12 FLEX",
|
||||
"6.0L W12"
|
||||
],
|
||||
"submodels": [
|
||||
"Hybrid",
|
||||
"V8",
|
||||
"W12",
|
||||
"S Hybrid",
|
||||
"Base"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2022",
|
||||
"models": [
|
||||
{
|
||||
"name": "Flying Spur",
|
||||
"engines": [
|
||||
"2.9L V6 MILD HYBRID EV- (MHEV)",
|
||||
"4.0L V8",
|
||||
"6.0L W12 FLEX",
|
||||
"6.0L W12"
|
||||
],
|
||||
"submodels": [
|
||||
"Hybrid",
|
||||
"V8",
|
||||
"W12",
|
||||
"S Hybrid",
|
||||
"Base"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2021",
|
||||
"models": [
|
||||
{
|
||||
"name": "Continental",
|
||||
"engines": [
|
||||
"4.0L V8",
|
||||
"6.0L W12 FLEX",
|
||||
"6.0L W12"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"GTC",
|
||||
"Flying Spur Speed",
|
||||
"GT V8 S",
|
||||
"GTC V8 S",
|
||||
"Flying Spur",
|
||||
"GT",
|
||||
"GT Speed"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Flying Spur",
|
||||
"engines": [
|
||||
"2.9L V6 MILD HYBRID EV- (MHEV)",
|
||||
"4.0L V8",
|
||||
"6.0L W12 FLEX",
|
||||
"6.0L W12"
|
||||
],
|
||||
"submodels": [
|
||||
"Hybrid",
|
||||
"V8",
|
||||
"W12",
|
||||
"S Hybrid",
|
||||
"Base"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2018",
|
||||
"models": [
|
||||
{
|
||||
"name": "Bentayga",
|
||||
"engines": [
|
||||
"6.0L W12"
|
||||
],
|
||||
"submodels": [
|
||||
"W12 Signature",
|
||||
"Black Edition"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Continental",
|
||||
"engines": [
|
||||
"4.0L V8",
|
||||
"6.0L W12 FLEX",
|
||||
"6.0L W12"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"GTC",
|
||||
"Flying Spur Speed",
|
||||
"GT V8 S",
|
||||
"GTC V8 S",
|
||||
"Flying Spur",
|
||||
"GT",
|
||||
"GT Speed"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2017",
|
||||
"models": [
|
||||
{
|
||||
"name": "Continental",
|
||||
"engines": [
|
||||
"4.0L V8",
|
||||
"6.0L W12 FLEX",
|
||||
"6.0L W12"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"GTC",
|
||||
"Flying Spur Speed",
|
||||
"GT V8 S",
|
||||
"GTC V8 S",
|
||||
"Flying Spur",
|
||||
"GT",
|
||||
"GT Speed"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2016",
|
||||
"models": [
|
||||
{
|
||||
"name": "Continental",
|
||||
"engines": [
|
||||
"4.0L V8",
|
||||
"6.0L W12 FLEX",
|
||||
"6.0L W12"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"GTC",
|
||||
"Flying Spur Speed",
|
||||
"GT V8 S",
|
||||
"GTC V8 S",
|
||||
"Flying Spur",
|
||||
"GT",
|
||||
"GT Speed"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Flying Spur",
|
||||
"engines": [
|
||||
"2.9L V6 MILD HYBRID EV- (MHEV)",
|
||||
"4.0L V8",
|
||||
"6.0L W12 FLEX",
|
||||
"6.0L W12"
|
||||
],
|
||||
"submodels": [
|
||||
"Hybrid",
|
||||
"V8",
|
||||
"W12",
|
||||
"S Hybrid",
|
||||
"Base"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Mulsanne",
|
||||
"engines": [
|
||||
"6.8L V8"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"Speed"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2014",
|
||||
"models": [
|
||||
{
|
||||
"name": "Continental",
|
||||
"engines": [
|
||||
"4.0L V8",
|
||||
"6.0L W12 FLEX",
|
||||
"6.0L W12"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"GTC",
|
||||
"Flying Spur Speed",
|
||||
"GT V8 S",
|
||||
"GTC V8 S",
|
||||
"Flying Spur",
|
||||
"GT",
|
||||
"GT Speed"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Mulsanne",
|
||||
"engines": [
|
||||
"6.8L V8"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"Speed"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2013",
|
||||
"models": [
|
||||
{
|
||||
"name": "Continental",
|
||||
"engines": [
|
||||
"4.0L V8",
|
||||
"6.0L W12 FLEX",
|
||||
"6.0L W12"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"GTC",
|
||||
"Flying Spur Speed",
|
||||
"GT V8 S",
|
||||
"GTC V8 S",
|
||||
"Flying Spur",
|
||||
"GT",
|
||||
"GT Speed"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Flying Spur",
|
||||
"engines": [
|
||||
"2.9L V6 MILD HYBRID EV- (MHEV)",
|
||||
"4.0L V8",
|
||||
"6.0L W12 FLEX",
|
||||
"6.0L W12"
|
||||
],
|
||||
"submodels": [
|
||||
"Hybrid",
|
||||
"V8",
|
||||
"W12",
|
||||
"S Hybrid",
|
||||
"Base"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2009",
|
||||
"models": [
|
||||
{
|
||||
"name": "Continental",
|
||||
"engines": [
|
||||
"4.0L V8",
|
||||
"6.0L W12 FLEX",
|
||||
"6.0L W12"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"GTC",
|
||||
"Flying Spur Speed",
|
||||
"GT V8 S",
|
||||
"GTC V8 S",
|
||||
"Flying Spur",
|
||||
"GT",
|
||||
"GT Speed"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2008",
|
||||
"models": [
|
||||
{
|
||||
"name": "Continental",
|
||||
"engines": [
|
||||
"4.0L V8",
|
||||
"6.0L W12 FLEX",
|
||||
"6.0L W12"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"GTC",
|
||||
"Flying Spur Speed",
|
||||
"GT V8 S",
|
||||
"GTC V8 S",
|
||||
"Flying Spur",
|
||||
"GT",
|
||||
"GT Speed"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2006",
|
||||
"models": [
|
||||
{
|
||||
"name": "Continental",
|
||||
"engines": [
|
||||
"4.0L V8",
|
||||
"6.0L W12 FLEX",
|
||||
"6.0L W12"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"GTC",
|
||||
"Flying Spur Speed",
|
||||
"GT V8 S",
|
||||
"GTC V8 S",
|
||||
"Flying Spur",
|
||||
"GT",
|
||||
"GT Speed"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "2005",
|
||||
"models": [
|
||||
{
|
||||
"name": "Arnage",
|
||||
"engines": [
|
||||
"4.4L V8",
|
||||
"6.8L V8"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"R"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Continental",
|
||||
"engines": [
|
||||
"4.0L V8",
|
||||
"6.0L W12 FLEX",
|
||||
"6.0L W12"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"GTC",
|
||||
"Flying Spur Speed",
|
||||
"GT V8 S",
|
||||
"GTC V8 S",
|
||||
"Flying Spur",
|
||||
"GT",
|
||||
"GT Speed"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "1999",
|
||||
"models": [
|
||||
{
|
||||
"name": "Arnage",
|
||||
"engines": [
|
||||
"4.4L V8",
|
||||
"6.8L V8"
|
||||
],
|
||||
"submodels": [
|
||||
"Base",
|
||||
"R"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "1997",
|
||||
"models": [
|
||||
{
|
||||
"name": "Brooklands",
|
||||
"engines": [
|
||||
"6.8L V8"
|
||||
],
|
||||
"submodels": []
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "1996",
|
||||
"models": [
|
||||
{
|
||||
"name": "Azure",
|
||||
"engines": [],
|
||||
"submodels": []
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "1989",
|
||||
"models": [
|
||||
{
|
||||
"name": "Turbo R",
|
||||
"engines": [
|
||||
"6.8L V8"
|
||||
],
|
||||
"submodels": []
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"year": "1963",
|
||||
"models": [
|
||||
{
|
||||
"name": "S3 Series",
|
||||
"engines": [
|
||||
"6.2L V8"
|
||||
],
|
||||
"submodels": []
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
21551
mvp-platform-services/vehicles/etl/sources/makes/bmw.json
Normal file
21551
mvp-platform-services/vehicles/etl/sources/makes/bmw.json
Normal file
File diff suppressed because it is too large
Load Diff
4951
mvp-platform-services/vehicles/etl/sources/makes/buick.json
Normal file
4951
mvp-platform-services/vehicles/etl/sources/makes/buick.json
Normal file
File diff suppressed because it is too large
Load Diff
4843
mvp-platform-services/vehicles/etl/sources/makes/cadillac.json
Normal file
4843
mvp-platform-services/vehicles/etl/sources/makes/cadillac.json
Normal file
File diff suppressed because it is too large
Load Diff
32077
mvp-platform-services/vehicles/etl/sources/makes/chevrolet.json
Normal file
32077
mvp-platform-services/vehicles/etl/sources/makes/chevrolet.json
Normal file
File diff suppressed because it is too large
Load Diff
4041
mvp-platform-services/vehicles/etl/sources/makes/chrysler.json
Normal file
4041
mvp-platform-services/vehicles/etl/sources/makes/chrysler.json
Normal file
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user