Possible working ETL

This commit is contained in:
Eric Gullickson
2025-12-15 18:19:55 -06:00
parent 1fc69b7779
commit 1e599e334f
110 changed files with 4843 additions and 2078706 deletions

View File

@@ -0,0 +1,238 @@
#!/usr/bin/env python3
"""
Generate SQL import files from a VehAPI snapshot SQLite database.
Reads observed compatibility pairs from the snapshot (trim-filtered engine<->transmission pairs)
and produces:
- output/01_engines.sql
- output/02_transmissions.sql
- output/03_vehicle_options.sql
No legacy JSON or network calls are used. The snapshot path is provided via CLI flag.
"""
import argparse
import os
import sqlite3
from pathlib import Path
from typing import Dict, Iterable, List, Sequence
BATCH_SIZE = 1000
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the SQL generator.

    Returns:
        A namespace with two attributes:

        * ``snapshot_path`` -- ``Path`` taken from ``--snapshot-path`` or, when
          the flag is absent, from a non-empty ``SNAPSHOT_PATH`` environment
          variable; ``None`` when neither is supplied.
        * ``output_dir`` -- ``Path`` of the directory that receives the SQL
          files (``output`` unless ``--output-dir`` is given).
    """
    parser = argparse.ArgumentParser(
        description="Generate SQL files from a VehAPI snapshot (SQLite).",
    )
    parser.add_argument(
        "--snapshot-path",
        type=Path,
        # argparse applies ``type`` to string defaults, so an exported-but-empty
        # SNAPSHOT_PATH would become Path("."), which exists and would slip past
        # the "snapshot path is required" check in main(). Treat it as unset.
        default=os.environ.get("SNAPSHOT_PATH") or None,
        help="Path to snapshots/<date>/snapshot.sqlite produced by vehapi_fetch_snapshot.py (or env SNAPSHOT_PATH)",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("output"),
        help="Directory to write SQL output files (default: output)",
    )
    return parser.parse_args()
def load_pairs(snapshot_path: Path) -> List[sqlite3.Row]:
    """Read every row of the snapshot's ``pairs`` table.

    Rows are returned ordered by year, make, model, trim, engine_canon and
    trans_canon, and can be indexed by column name.

    Args:
        snapshot_path: Location of the snapshot SQLite file.

    Returns:
        The non-empty list of ``sqlite3.Row`` objects read from ``pairs``.

    Raises:
        FileNotFoundError: ``snapshot_path`` does not exist.
        RuntimeError: SQLite could not open or query the snapshot (for example
            the path is a directory, the file is not a database, or the
            ``pairs`` table is missing).
        ValueError: The ``pairs`` table is empty.
    """
    if not snapshot_path.exists():
        raise FileNotFoundError(f"Snapshot not found: {snapshot_path}")

    conn = None
    try:
        # Opening the database is inside the try block so that open failures
        # are reported the same way as query failures.
        conn = sqlite3.connect(snapshot_path)
        conn.row_factory = sqlite3.Row
        cursor = conn.execute(
            """
            SELECT
                year,
                make,
                model,
                trim,
                engine_display,
                engine_canon,
                engine_bucket,
                trans_display,
                trans_canon,
                trans_bucket
            FROM pairs
            ORDER BY year, make, model, trim, engine_canon, trans_canon
            """
        )
        rows = cursor.fetchall()
    except sqlite3.Error as exc:
        raise RuntimeError(f"Failed to read pairs from snapshot: {exc}") from exc
    finally:
        if conn is not None:
            conn.close()

    if not rows:
        raise ValueError("Snapshot contains no rows in pairs table.")
    return rows
def choose_engine_label(engine_display: str, engine_bucket: str, engine_canon: str) -> str:
    """
    Pick the name to store for an engine.

    The first non-empty value wins, in this order: the VehAPI display string,
    the bucket label, the canonical key. The canonical key is returned as-is
    when the other two are empty, so the name is only empty if all three are.
    """
    return engine_display or engine_bucket or engine_canon
def choose_trans_label(trans_display: str, trans_bucket: str, trans_canon: str) -> str:
    """
    Pick the label to store for a transmission.

    Returns the display string when it is non-empty, otherwise the bucket
    label when that is non-empty, otherwise the canonical key unchanged.
    """
    for candidate in (trans_display, trans_bucket):
        if candidate:
            return candidate
    return trans_canon
def build_engine_dimension(rows: Sequence[sqlite3.Row]) -> Dict[str, Dict]:
    """
    Build the engine dimension keyed by ``engine_canon``.

    Each distinct canonical key gets one entry holding a sequential ``id``
    (starting at 1, in first-seen order), a ``name`` chosen by
    ``choose_engine_label`` and a ``fuel_type`` taken from ``engine_bucket``
    (``None`` when the bucket is empty). Only the first row seen for a key
    contributes its display and bucket values; later rows are ignored.

    Raises:
        ValueError: A row has a ``None`` or empty ``engine_canon``.
    """
    # NOTE(review): the migration puts a case-insensitive unique index on
    # engines.name; two canonical keys resolving to the same label would fail
    # at import time -- confirm the snapshot rules that out.
    dimension: Dict[str, Dict] = {}
    for record in rows:
        key = record["engine_canon"]
        if key is None or key == "":
            raise ValueError(f"Missing engine_canon for row: {dict(record)}")
        if key not in dimension:
            next_id = len(dimension) + 1
            label = choose_engine_label(record["engine_display"], record["engine_bucket"], key)
            dimension[key] = {
                "id": next_id,
                "name": label,
                "fuel_type": record["engine_bucket"] or None,
            }
    return dimension
def build_transmission_dimension(rows: Sequence[sqlite3.Row]) -> Dict[str, Dict]:
    """
    Build the transmission dimension keyed by ``trans_canon``.

    Each distinct canonical key gets one entry holding a sequential ``id``
    (starting at 1, in first-seen order) and a ``type`` label chosen by
    ``choose_trans_label``. Only the first row seen for a key contributes its
    display and bucket values.

    Raises:
        ValueError: A row has a ``None`` or empty ``trans_canon``.
    """
    dimension: Dict[str, Dict] = {}
    for record in rows:
        key = record["trans_canon"]
        if key is None or key == "":
            raise ValueError(f"Missing trans_canon for row: {dict(record)}")
        if key not in dimension:
            next_id = len(dimension) + 1
            label = choose_trans_label(record["trans_display"], record["trans_bucket"], key)
            dimension[key] = {"id": next_id, "type": label}
    return dimension
def build_vehicle_options(
    rows: Sequence[sqlite3.Row],
    engine_map: Dict[str, Dict],
    trans_map: Dict[str, Dict],
) -> List[Dict]:
    """
    Turn every snapshot row into a ``vehicle_options`` record.

    Args:
        rows: Snapshot rows exposing year, make, model, trim, engine_canon
            and trans_canon by name.
        engine_map: Engine dimension keyed by ``engine_canon``; each value
            must carry an ``"id"``.
        trans_map: Transmission dimension keyed by ``trans_canon``; each value
            must carry an ``"id"``.

    Returns:
        One dict per input row, in input order, with ``year`` coerced to
        ``int`` and the canonical keys replaced by the dimension ids.
    """

    def as_option(row) -> Dict:
        # Resolve the canonical keys first, then assemble the record.
        engine_key = row["engine_canon"]
        trans_key = row["trans_canon"]
        return {
            "year": int(row["year"]),
            "make": row["make"],
            "model": row["model"],
            "trim": row["trim"],
            "engine_id": engine_map[engine_key]["id"],
            "transmission_id": trans_map[trans_key]["id"],
        }

    return [as_option(row) for row in rows]
def sql_value(value):
    """
    Render a Python value as a SQL literal.

    Strings are wrapped in single quotes with embedded single quotes doubled,
    ``None`` becomes ``NULL`` and anything else is rendered with ``str()``.
    """
    if isinstance(value, str):
        escaped = value.replace("'", "''")
        return f"'{escaped}'"
    return "NULL" if value is None else str(value)
def chunked(seq: Iterable[Dict], size: int) -> Iterable[List[Dict]]:
    """
    Yield the items of ``seq`` in consecutive lists of at most ``size`` items.

    A list is yielded as soon as it holds ``size`` items; a shorter final
    list is yielded only when it is non-empty. Items are pulled lazily.
    """
    source = iter(seq)
    while True:
        batch: List[Dict] = []
        for item in source:
            batch.append(item)
            if len(batch) >= size:
                break
        if not batch:
            return
        yield batch
def write_insert_file(
    path: Path,
    table: str,
    columns: Sequence[str],
    rows: Sequence[Dict],
):
    """
    Write ``rows`` to ``path`` as batched multi-row INSERT statements.

    The parent directory is created if needed and the file is overwritten.
    The file starts with an "auto-generated" comment; when ``rows`` is empty
    only a "no rows" comment follows. Otherwise one INSERT statement is
    written per batch of up to ``BATCH_SIZE`` rows.

    Args:
        path: Destination SQL file.
        table: Table name used in the INSERT statements.
        columns: Column names, which are also the keys read from each row.
        rows: Records to write; each must provide every column as a key.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    column_list = ", ".join(columns)
    with path.open("w", encoding="utf-8") as f:
        f.write("-- Auto-generated by etl_generate_sql.py\n")
        if not rows:
            f.write(f"-- No rows for {table}\n")
            return
        for batch in chunked(rows, BATCH_SIZE):
            # Render the whole batch before writing so that a bad row never
            # leaves a half-written statement for that batch.
            tuples = []
            for row in batch:
                literals = [sql_value(row[col]) for col in columns]
                tuples.append("(" + ",".join(literals) + ")")
            values_sql = ",\n".join(tuples)
            f.write(f"INSERT INTO {table} ({column_list}) VALUES\n{values_sql};\n\n")
def main():
    """
    Generate the three SQL import files from a snapshot.

    Reads the snapshot named on the command line (or in ``SNAPSHOT_PATH``),
    builds the engine and transmission dimensions and the vehicle option
    records, writes one SQL file per table into the output directory and
    prints a short summary. Exits with a message when no snapshot path is
    available.
    """
    args = parse_args()
    snapshot_path: Path = args.snapshot_path
    output_dir: Path = args.output_dir
    if snapshot_path is None:
        raise SystemExit("Snapshot path is required. Pass --snapshot-path or set SNAPSHOT_PATH.")

    print(f"Reading snapshot: {snapshot_path}")
    pairs = load_pairs(snapshot_path)
    covered_years = sorted({int(pair["year"]) for pair in pairs})
    print(f" Loaded {len(pairs):,} observed engine<->transmission pairs across {len(covered_years)} years")

    engines = build_engine_dimension(pairs)
    transmissions = build_transmission_dimension(pairs)
    vehicle_options = build_vehicle_options(pairs, engines, transmissions)
    print(f"Engines: {len(engines):,}")
    print(f"Transmissions: {len(transmissions):,}")
    print(f"Vehicle options (observed pairs): {len(vehicle_options):,}")

    # (file name, table, columns, records) -- written in this order; the
    # numeric prefixes keep the dimension files ahead of vehicle_options.
    outputs = (
        ("01_engines.sql", "engines", ["id", "name", "fuel_type"], engines.values()),
        ("02_transmissions.sql", "transmissions", ["id", "type"], transmissions.values()),
        (
            "03_vehicle_options.sql",
            "vehicle_options",
            ["year", "make", "model", "trim", "engine_id", "transmission_id"],
            vehicle_options,
        ),
    )
    for filename, table, columns, records in outputs:
        write_insert_file(output_dir / filename, table, columns, records)

    print("\nSQL files generated:")
    for filename, _table, _columns, _records in outputs:
        print(f" - {output_dir / filename}")
    print(f"\nYear coverage: {covered_years[0]}-{covered_years[-1]}")


if __name__ == "__main__":
    main()

71
data/vehicle-etl/import_data.sh Executable file
View File

@@ -0,0 +1,71 @@
#!/bin/bash
# Offline import of generated SQL files into PostgreSQL (no network).
#
# Verifies the container and input files, runs the schema migration, truncates
# the target tables, loads engines, transmissions and vehicle options (in that
# order, because vehicle_options references the other two tables), and finally
# prints a few verification queries.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

CONTAINER="mvp-postgres"
MIGRATION="migrations/001_create_vehicle_database.sql"

# Without ON_ERROR_STOP psql carries on after a failed statement and still
# exits 0, so `set -e` would never notice a broken import and the script would
# print a success message. With it, psql exits non-zero on the first error.
PSQL=(psql -v ON_ERROR_STOP=1 -U postgres -d motovaultpro)

echo "=========================================="
echo "📥 Automotive Database Import (offline)"
echo "=========================================="
echo ""

# Abort with a message when a required input file is missing.
require_file() {
  if [ ! -f "$1" ]; then
    echo "❌ Missing required file: $1"
    exit 1
  fi
}

# Capture the listing before matching: piping straight into `grep -q` under
# `pipefail` can report failure when grep exits early. `-Fx` requires a whole
# line to equal the container name, since docker's name filter also matches
# containers whose names merely contain it.
running_containers="$(docker ps --filter "name=$CONTAINER" --format "{{.Names}}" || true)"
if ! grep -Fxq "$CONTAINER" <<<"$running_containers"; then
  echo "❌ Error: mvp-postgres container is not running"
  exit 1
fi

require_file "$MIGRATION"
require_file "output/01_engines.sql"
require_file "output/02_transmissions.sql"
require_file "output/03_vehicle_options.sql"

echo "📋 Step 1: Running database schema migration..."
docker exec -i "$CONTAINER" "${PSQL[@]}" < "$MIGRATION"
echo "✓ Schema migration completed"
echo ""

echo "🧹 Step 2: Truncating existing data..."
docker exec -i "$CONTAINER" "${PSQL[@]}" <<'EOF'
TRUNCATE TABLE vehicle_options RESTART IDENTITY CASCADE;
TRUNCATE TABLE engines RESTART IDENTITY CASCADE;
TRUNCATE TABLE transmissions RESTART IDENTITY CASCADE;
EOF
echo "✓ Tables truncated"
echo ""

echo "📥 Step 3: Importing engines..."
docker exec -i "$CONTAINER" "${PSQL[@]}" < output/01_engines.sql
echo "✓ Engines imported"
echo ""

echo "📥 Step 4: Importing transmissions..."
docker exec -i "$CONTAINER" "${PSQL[@]}" < output/02_transmissions.sql
echo "✓ Transmissions imported"
echo ""

echo "📥 Step 5: Importing vehicle options (observed pairs only)..."
docker exec -i "$CONTAINER" "${PSQL[@]}" < output/03_vehicle_options.sql
echo "✓ Vehicle options imported"
echo ""

echo "=========================================="
echo "✅ Import completed"
echo "=========================================="
echo ""

echo "🔍 Database verification:"
docker exec "$CONTAINER" "${PSQL[@]}" -c "SELECT COUNT(*) as engines FROM engines;"
docker exec "$CONTAINER" "${PSQL[@]}" -c "SELECT COUNT(*) as transmissions FROM transmissions;"
docker exec "$CONTAINER" "${PSQL[@]}" -c "SELECT COUNT(*) as vehicle_options FROM vehicle_options;"
docker exec "$CONTAINER" "${PSQL[@]}" -c "SELECT MIN(year) as min_year, MAX(year) as max_year FROM vehicle_options;"
docker exec "$CONTAINER" "${PSQL[@]}" -c "SELECT DISTINCT year FROM vehicle_options ORDER BY year LIMIT 5;"
docker exec "$CONTAINER" "${PSQL[@]}" -c "SELECT DISTINCT year FROM vehicle_options ORDER BY year DESC LIMIT 5;"
echo ""
echo "✓ Database ready for dropdown use."

View File

@@ -0,0 +1,286 @@
-- Migration: Create Automotive Vehicle Selection Database
-- Optimized for dropdown cascade queries
-- Date: 2025-11-10
-- Drop existing tables if they exist
-- This is a destructive reset: every run of this migration discards the
-- current contents of these three tables before they are recreated below.
DROP TABLE IF EXISTS vehicle_options CASCADE;
DROP TABLE IF EXISTS engines CASCADE;
DROP TABLE IF EXISTS transmissions CASCADE;
-- NOTE(review): these index drops run after the owning table is already gone,
-- and nothing in this file creates idx_vehicle_composite; they look like
-- leftovers from an earlier revision -- confirm before removing.
DROP INDEX IF EXISTS idx_vehicle_year;
DROP INDEX IF EXISTS idx_vehicle_make;
DROP INDEX IF EXISTS idx_vehicle_model;
DROP INDEX IF EXISTS idx_vehicle_trim;
DROP INDEX IF EXISTS idx_vehicle_composite;
-- Create engines table with detailed specifications
-- The ETL-generated 01_engines.sql fills only id, name and fuel_type; the
-- remaining specification columns are nullable.
CREATE TABLE engines (
    id            SERIAL PRIMARY KEY,
    name          VARCHAR(255) NOT NULL,
    displacement  VARCHAR(50),
    configuration VARCHAR(50),
    horsepower    VARCHAR(100),
    torque        VARCHAR(100),
    fuel_type     VARCHAR(100),
    fuel_system   VARCHAR(255),
    aspiration    VARCHAR(100),
    specs_json    JSONB,
    created_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Prevent duplicate engine display names (case-insensitive)
CREATE UNIQUE INDEX IF NOT EXISTS uq_engines_name_lower
    ON engines (LOWER(name));

CREATE INDEX idx_engines_displacement
    ON engines(displacement);
CREATE INDEX idx_engines_config
    ON engines(configuration);
-- Create transmissions table
-- The ETL-generated 02_transmissions.sql fills only id and type; speeds and
-- drive_type are nullable.
CREATE TABLE transmissions (
    id         SERIAL PRIMARY KEY,
    type       VARCHAR(100) NOT NULL,
    speeds     VARCHAR(50),
    drive_type VARCHAR(100),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Prevent duplicate transmission display names (case-insensitive)
CREATE UNIQUE INDEX IF NOT EXISTS uq_transmissions_type_lower
    ON transmissions (LOWER(type));

-- Plain index for exact (case-sensitive) matches on type
CREATE INDEX idx_transmissions_type
    ON transmissions(type);
-- Create denormalized vehicle_options table optimized for dropdown queries
-- One row per year/make/model/trim and engine/transmission pairing. Deleting
-- an engine or transmission keeps the row and nulls the reference.
CREATE TABLE vehicle_options (
    id              SERIAL PRIMARY KEY,
    year            INTEGER NOT NULL,
    make            VARCHAR(100) NOT NULL,
    model           VARCHAR(255) NOT NULL,
    trim            VARCHAR(255) NOT NULL,
    engine_id       INTEGER REFERENCES engines(id) ON DELETE SET NULL,
    transmission_id INTEGER REFERENCES transmissions(id) ON DELETE SET NULL,
    created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Prevent duplicate vehicle option rows
CREATE UNIQUE INDEX IF NOT EXISTS uq_vehicle_options_full
    ON vehicle_options (
        year, make, model, trim, engine_id, transmission_id
    );

-- Indexes for cascading dropdown performance
-- NOTE(review): several of these are leading-column prefixes of the longer
-- composite indexes below and may be redundant -- check query plans before
-- dropping any of them.
CREATE INDEX idx_vehicle_year
    ON vehicle_options(year);
CREATE INDEX idx_vehicle_make
    ON vehicle_options(make);
CREATE INDEX idx_vehicle_model
    ON vehicle_options(model);
CREATE INDEX idx_vehicle_trim
    ON vehicle_options(trim);
CREATE INDEX idx_vehicle_year_make
    ON vehicle_options(year, make);
CREATE INDEX idx_vehicle_year_make_model
    ON vehicle_options(year, make, model);
CREATE INDEX idx_vehicle_year_make_model_trim
    ON vehicle_options(year, make, model, trim);
CREATE INDEX idx_vehicle_year_make_model_trim_engine
    ON vehicle_options(year, make, model, trim, engine_id);
CREATE INDEX idx_vehicle_year_make_model_trim_trans
    ON vehicle_options(year, make, model, trim, transmission_id);
-- Views for dropdown queries
-- View: Get all available years
-- One row per distinct year in vehicle_options, newest first.
CREATE OR REPLACE VIEW available_years AS
    SELECT DISTINCT vo.year
    FROM vehicle_options AS vo
    ORDER BY vo.year DESC;
-- View: Get makes by year
-- One row per distinct (year, make), newest year first, makes A-Z.
CREATE OR REPLACE VIEW makes_by_year AS
    SELECT DISTINCT vo.year, vo.make
    FROM vehicle_options AS vo
    ORDER BY vo.year DESC, vo.make ASC;
-- View: Get models by year and make
-- One row per distinct (year, make, model), newest year first, then A-Z.
CREATE OR REPLACE VIEW models_by_year_make AS
    SELECT DISTINCT vo.year, vo.make, vo.model
    FROM vehicle_options AS vo
    ORDER BY vo.year DESC, vo.make ASC, vo.model ASC;
-- View: Get trims by year, make, and model
-- One row per distinct (year, make, model, trim), newest year first, then A-Z.
CREATE OR REPLACE VIEW trims_by_year_make_model AS
    SELECT DISTINCT vo.year, vo.make, vo.model, vo.trim
    FROM vehicle_options AS vo
    ORDER BY vo.year DESC, vo.make ASC, vo.model ASC, vo.trim ASC;
-- View: Get complete vehicle configurations with engine and transmission details
-- Outer joins keep vehicle_options rows whose engine or transmission
-- reference is NULL; their detail columns are then NULL.
CREATE OR REPLACE VIEW complete_vehicle_configs AS
    SELECT
        vo.id,
        vo.year,
        vo.make,
        vo.model,
        vo.trim,
        e.name   AS engine_name,
        e.displacement,
        e.configuration,
        e.horsepower,
        e.torque,
        e.fuel_type,
        t.type   AS transmission_type,
        t.speeds AS transmission_speeds,
        t.drive_type
    FROM vehicle_options AS vo
    LEFT OUTER JOIN engines AS e
        ON vo.engine_id = e.id
    LEFT OUTER JOIN transmissions AS t
        ON vo.transmission_id = t.id
    ORDER BY vo.year DESC, vo.make ASC, vo.model ASC, vo.trim ASC;
-- Function to get makes for a specific year
-- Returns the distinct makes that have at least one vehicle_options row for
-- p_year, in ascending order.
-- Declared STABLE because the function only reads data; the default
-- (VOLATILE) would needlessly restrict how the planner may use it.
-- Columns are table-qualified because the RETURNS TABLE column "make" is
-- also visible as a variable inside the function body.
CREATE OR REPLACE FUNCTION get_makes_for_year(p_year INTEGER)
RETURNS TABLE(make VARCHAR) AS $$
BEGIN
    RETURN QUERY
    SELECT DISTINCT vehicle_options.make
    FROM vehicle_options
    WHERE vehicle_options.year = p_year
    ORDER BY vehicle_options.make ASC;
END;
$$ LANGUAGE plpgsql STABLE;
-- Function to get models for a specific year and make
-- Returns the distinct models that have at least one vehicle_options row for
-- p_year and p_make (exact, case-sensitive match), in ascending order.
-- Declared STABLE because the function only reads data.
-- Columns are table-qualified because the RETURNS TABLE column "model" is
-- also visible as a variable inside the function body.
CREATE OR REPLACE FUNCTION get_models_for_year_make(p_year INTEGER, p_make VARCHAR)
RETURNS TABLE(model VARCHAR) AS $$
BEGIN
    RETURN QUERY
    SELECT DISTINCT vehicle_options.model
    FROM vehicle_options
    WHERE vehicle_options.year = p_year
      AND vehicle_options.make = p_make
    ORDER BY vehicle_options.model ASC;
END;
$$ LANGUAGE plpgsql STABLE;
-- Function to get trims for a specific year, make, and model
-- Returns the distinct trims (as trim_name) that have at least one
-- vehicle_options row for the given year, make and model (exact,
-- case-sensitive matches), in ascending order.
-- Declared STABLE because the function only reads data.
CREATE OR REPLACE FUNCTION get_trims_for_year_make_model(p_year INTEGER, p_make VARCHAR, p_model VARCHAR)
RETURNS TABLE(trim_name VARCHAR) AS $$
BEGIN
    RETURN QUERY
    SELECT DISTINCT vehicle_options.trim
    FROM vehicle_options
    WHERE vehicle_options.year = p_year
      AND vehicle_options.make = p_make
      AND vehicle_options.model = p_model
    ORDER BY vehicle_options.trim ASC;
END;
$$ LANGUAGE plpgsql STABLE;
-- Function to get engine and transmission options for a specific vehicle
-- Returns one row per vehicle_options row matching the given year, make,
-- model and trim, with the engine and transmission details joined in. The
-- joins are outer joins, so a row whose engine or transmission reference is
-- NULL is still returned with NULL detail columns. No ordering is applied.
-- Declared STABLE because the function only reads data.
CREATE OR REPLACE FUNCTION get_options_for_vehicle(p_year INTEGER, p_make VARCHAR, p_model VARCHAR, p_trim VARCHAR)
RETURNS TABLE(
    engine_name VARCHAR,
    engine_displacement VARCHAR,
    engine_horsepower VARCHAR,
    transmission_type VARCHAR,
    transmission_speeds VARCHAR,
    drive_type VARCHAR
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        e.name,
        e.displacement,
        e.horsepower,
        t.type,
        t.speeds,
        t.drive_type
    FROM vehicle_options vo
    LEFT JOIN engines e ON vo.engine_id = e.id
    LEFT JOIN transmissions t ON vo.transmission_id = t.id
    WHERE vo.year = p_year
      AND vo.make = p_make
      AND vo.model = p_model
      AND vo.trim = p_trim;
END;
$$ LANGUAGE plpgsql STABLE;
-- Helper functions for trim-level options and pair-safe filtering

-- Returns the distinct transmissions (id and type) observed for the given
-- year, make, model and trim, ordered by type. Rows without a transmission
-- reference are excluded by the inner join.
-- Declared STABLE because the function only reads data.
CREATE OR REPLACE FUNCTION get_transmissions_for_vehicle(p_year INTEGER, p_make VARCHAR, p_model VARCHAR, p_trim VARCHAR)
RETURNS TABLE(
    transmission_id INTEGER,
    transmission_type VARCHAR
) AS $$
BEGIN
    RETURN QUERY
    SELECT DISTINCT
        t.id,
        t.type
    FROM vehicle_options vo
    JOIN transmissions t ON vo.transmission_id = t.id
    WHERE vo.year = p_year
      AND vo.make = p_make
      AND vo.model = p_model
      AND vo.trim = p_trim
    ORDER BY t.type ASC;
END;
$$ LANGUAGE plpgsql STABLE;
-- Returns the distinct engines (id and name) observed for the given year,
-- make, model and trim, ordered by name. Rows without an engine reference
-- are excluded by the inner join.
-- Declared STABLE because the function only reads data.
CREATE OR REPLACE FUNCTION get_engines_for_vehicle(p_year INTEGER, p_make VARCHAR, p_model VARCHAR, p_trim VARCHAR)
RETURNS TABLE(
    engine_id INTEGER,
    engine_name VARCHAR
) AS $$
BEGIN
    RETURN QUERY
    SELECT DISTINCT
        e.id,
        e.name
    FROM vehicle_options vo
    JOIN engines e ON vo.engine_id = e.id
    WHERE vo.year = p_year
      AND vo.make = p_make
      AND vo.model = p_model
      AND vo.trim = p_trim
    ORDER BY e.name ASC;
END;
$$ LANGUAGE plpgsql STABLE;
-- Returns the distinct transmissions (id and type) that were observed
-- together with the engine named p_engine_name for the given year, make,
-- model and trim, ordered by type. The engine name comparison is exact
-- (case-sensitive).
-- Declared STABLE because the function only reads data.
CREATE OR REPLACE FUNCTION get_transmissions_for_vehicle_engine(p_year INTEGER, p_make VARCHAR, p_model VARCHAR, p_trim VARCHAR, p_engine_name VARCHAR)
RETURNS TABLE(
    transmission_id INTEGER,
    transmission_type VARCHAR
) AS $$
BEGIN
    RETURN QUERY
    SELECT DISTINCT
        t.id,
        t.type
    FROM vehicle_options vo
    JOIN engines e ON vo.engine_id = e.id
    JOIN transmissions t ON vo.transmission_id = t.id
    WHERE vo.year = p_year
      AND vo.make = p_make
      AND vo.model = p_model
      AND vo.trim = p_trim
      AND e.name = p_engine_name
    ORDER BY t.type ASC;
END;
$$ LANGUAGE plpgsql STABLE;
-- Returns the distinct engines (id and name) that were observed together
-- with the transmission whose type is p_trans_type for the given year, make,
-- model and trim, ordered by name. The transmission type comparison is exact
-- (case-sensitive).
-- Declared STABLE because the function only reads data.
CREATE OR REPLACE FUNCTION get_engines_for_vehicle_trans(p_year INTEGER, p_make VARCHAR, p_model VARCHAR, p_trim VARCHAR, p_trans_type VARCHAR)
RETURNS TABLE(
    engine_id INTEGER,
    engine_name VARCHAR
) AS $$
BEGIN
    RETURN QUERY
    SELECT DISTINCT
        e.id,
        e.name
    FROM vehicle_options vo
    JOIN engines e ON vo.engine_id = e.id
    JOIN transmissions t ON vo.transmission_id = t.id
    WHERE vo.year = p_year
      AND vo.make = p_make
      AND vo.model = p_model
      AND vo.trim = p_trim
      AND t.type = p_trans_type
    ORDER BY e.name ASC;
END;
$$ LANGUAGE plpgsql STABLE;
-- Catalog descriptions for the tables and views created by this migration.
COMMENT ON TABLE vehicle_options IS 'Denormalized table optimized for cascading dropdown queries';
COMMENT ON TABLE engines IS 'Engine specifications with detailed technical data';
COMMENT ON TABLE transmissions IS 'Transmission specifications';
COMMENT ON VIEW available_years IS 'Returns all distinct years available in the database';
COMMENT ON VIEW makes_by_year IS 'Returns makes grouped by year for dropdown population';
COMMENT ON VIEW models_by_year_make IS 'Returns models grouped by year and make';
COMMENT ON VIEW trims_by_year_make_model IS 'Returns trims grouped by year, make, and model';
COMMENT ON VIEW complete_vehicle_configs IS 'Complete vehicle configurations with all details';

View File

@@ -0,0 +1,22 @@
-- Auto-generated by etl_generate_sql.py
INSERT INTO engines (id, name, fuel_type) VALUES
(1,'Gas','Gas'),
(2,'2.0L 150 hp I4','Gas'),
(3,'2.4L 201 hp I4','Gas'),
(4,'3.5L 290 hp V6','Gas'),
(5,'3.5L 273 hp V6','Gas'),
(6,'3.5L 310 hp V6','Gas'),
(7,'2.4L 206 hp I4','Gas'),
(8,'2.0L 220 hp I4','Gas'),
(9,'1.8L 170 hp I4','Gas'),
(10,'Diesel','Diesel'),
(11,'2.0L 150 hp I4 Diesel','Diesel'),
(12,'2.0L 220 hp I4 Flex Fuel Vehicle','Gas'),
(13,'3.0L 310 hp V6','Gas'),
(14,'3.0L 240 hp V6 Diesel','Diesel'),
(15,'4.0L 435 hp V8','Diesel'),
(16,'3.0L 333 hp V6','Gas'),
(17,'6.3L 500 hp W12','Gas'),
(18,'2.0L 200 hp I4','Gas'),
(19,'3.0L 272 hp V6','Gas');

View File

@@ -0,0 +1,13 @@
-- Auto-generated by etl_generate_sql.py
INSERT INTO transmissions (id, type) VALUES
(1,'Automatic'),
(2,'Manual'),
(3,'5-Speed Automatic'),
(4,'6-Speed Manual'),
(5,'6-Speed Automatic'),
(6,'8-Speed Dual Clutch'),
(7,'9-Speed Automatic'),
(8,'6-Speed Dual Clutch'),
(9,'8-Speed Automatic'),
(10,'Continuously Variable Transmission');

View File

@@ -0,0 +1,281 @@
-- Auto-generated by etl_generate_sql.py
INSERT INTO vehicle_options (year, make, model, trim, engine_id, transmission_id) VALUES
(2015,'Acura','ILX','2.0L',1,1),
(2015,'Acura','ILX','2.0L',1,2),
(2015,'Acura','ILX','2.0L FWD',2,3),
(2015,'Acura','ILX','2.0L FWD with Premium Package',2,3),
(2015,'Acura','ILX','2.0L FWD with Technology Package',2,3),
(2015,'Acura','ILX','2.0L Technology',1,1),
(2015,'Acura','ILX','2.0L Technology',1,2),
(2015,'Acura','ILX','2.0L w/Premium Package',1,1),
(2015,'Acura','ILX','2.0L w/Premium Package',1,2),
(2015,'Acura','ILX','2.4L FWD with Premium Package',2,3),
(2015,'Acura','ILX','2.4L FWD with Premium Package',3,4),
(2015,'Acura','ILX','2.4L w/Premium Package',1,1),
(2015,'Acura','ILX','2.4L w/Premium Package',1,2),
(2015,'Acura','ILX','FWD with Dynamic Package',2,3),
(2015,'Acura','MDX','3.5L',1,1),
(2015,'Acura','MDX','3.5L',1,2),
(2015,'Acura','MDX','3.5L Advance Pkg w/Entertainment Pkg',1,1),
(2015,'Acura','MDX','3.5L Advance Pkg w/Entertainment Pkg',1,2),
(2015,'Acura','MDX','3.5L Technology Package',1,1),
(2015,'Acura','MDX','3.5L Technology Package',1,2),
(2015,'Acura','MDX','3.5L Technology Pkg/w Entertainment Pkg',1,1),
(2015,'Acura','MDX','3.5L Technology Pkg/w Entertainment Pkg',1,2),
(2015,'Acura','MDX','3.5L w/Technology & Entertainment Pkgs',1,1),
(2015,'Acura','MDX','3.5L w/Technology & Entertainment Pkgs',1,2),
(2015,'Acura','MDX','FWD',4,5),
(2015,'Acura','MDX','FWD with Advance and Entertainment Package',4,5),
(2015,'Acura','MDX','FWD with Technology Package',4,5),
(2015,'Acura','MDX','FWD with Technology and Entertainment Package',4,5),
(2015,'Acura','MDX','SH-AWD',4,5),
(2015,'Acura','MDX','SH-AWD with Advance and Entertainment Package',4,5),
(2015,'Acura','MDX','SH-AWD with Elite Package',4,5),
(2015,'Acura','MDX','SH-AWD with Navigation',4,5),
(2015,'Acura','MDX','SH-AWD with Technology Package',4,5),
(2015,'Acura','MDX','SH-AWD with Technology and Entertainment Package',4,5),
(2015,'Acura','RDX','AWD',5,5),
(2015,'Acura','RDX','AWD with Technology Package',5,5),
(2015,'Acura','RDX','Base',1,1),
(2015,'Acura','RDX','Base',1,2),
(2015,'Acura','RDX','FWD',5,5),
(2015,'Acura','RDX','FWD with Technology Package',5,5),
(2015,'Acura','RDX','Technology Package',1,1),
(2015,'Acura','RDX','Technology Package',1,2),
(2015,'Acura','RLX','Advance Package',1,1),
(2015,'Acura','RLX','Advance Package',1,2),
(2015,'Acura','RLX','Base',1,1),
(2015,'Acura','RLX','Base',1,2),
(2015,'Acura','RLX','FWD',6,5),
(2015,'Acura','RLX','FWD',1,1),
(2015,'Acura','RLX','FWD',1,2),
(2015,'Acura','RLX','FWD with Advance Package',6,5),
(2015,'Acura','RLX','FWD with Elite Package',6,5),
(2015,'Acura','RLX','FWD with Krell Audio Package',6,5),
(2015,'Acura','RLX','FWD with Navigation',6,5),
(2015,'Acura','RLX','FWD with Technology Package',6,5),
(2015,'Acura','RLX','Navigation',1,1),
(2015,'Acura','RLX','Navigation',1,2),
(2015,'Acura','RLX','Technology Package',1,1),
(2015,'Acura','RLX','Technology Package',1,2),
(2015,'Acura','RLX Hybrid Sport','SH-AWD',1,1),
(2015,'Acura','RLX Hybrid Sport','SH-AWD',1,2),
(2015,'Acura','TLX','Base',1,1),
(2015,'Acura','TLX','Base',1,2),
(2015,'Acura','TLX','FWD',7,6),
(2015,'Acura','TLX','FWD with Technology Package',7,6),
(2015,'Acura','TLX','SH-AWD with Elite Package',4,7),
(2015,'Acura','TLX','Tech',1,1),
(2015,'Acura','TLX','Tech',1,2),
(2015,'Acura','TLX','V6',1,1),
(2015,'Acura','TLX','V6',1,2),
(2015,'Acura','TLX','V6 Advance',1,1),
(2015,'Acura','TLX','V6 Advance',1,2),
(2015,'Acura','TLX','V6 FWD',4,7),
(2015,'Acura','TLX','V6 FWD with Advance Package',4,7),
(2015,'Acura','TLX','V6 FWD with Technology Package',4,7),
(2015,'Acura','TLX','V6 SH-AWD',4,7),
(2015,'Acura','TLX','V6 SH-AWD with Advance Package',4,7),
(2015,'Acura','TLX','V6 SH-AWD with Technology Package',4,7),
(2015,'Acura','TLX','V6 Tech',1,1),
(2015,'Acura','TLX','V6 Tech',1,2),
(2015,'Acura','TLX','V6 with Elite Package',4,7),
(2015,'Audi','A3','1.8T Komfort Sedan FWD',8,8),
(2015,'Audi','A3','1.8T Premium',1,1),
(2015,'Audi','A3','1.8T Premium',1,2),
(2015,'Audi','A3','1.8T Premium Cabriolet FWD',9,8),
(2015,'Audi','A3','1.8T Premium Plus',1,1),
(2015,'Audi','A3','1.8T Premium Plus',1,2),
(2015,'Audi','A3','1.8T Premium Plus Cabriolet FWD',9,8),
(2015,'Audi','A3','1.8T Premium Plus Sedan FWD',9,8),
(2015,'Audi','A3','1.8T Premium Sedan FWD',9,8),
(2015,'Audi','A3','1.8T Prestige',1,1),
(2015,'Audi','A3','1.8T Prestige',1,2),
(2015,'Audi','A3','1.8T Prestige Cabriolet FWD',9,8),
(2015,'Audi','A3','1.8T Prestige Sedan FWD',9,8),
(2015,'Audi','A3','1.8T Prestige Sedan FWD',8,8),
(2015,'Audi','A3','1.8T Progressiv Sedan FWD',8,8),
(2015,'Audi','A3','2.0 TDI Premium',10,1),
(2015,'Audi','A3','2.0 TDI Premium',10,2),
(2015,'Audi','A3','2.0 TDI Premium Plus',10,1),
(2015,'Audi','A3','2.0 TDI Premium Plus',10,2),
(2015,'Audi','A3','2.0 TDI Premium Plus Sedan FWD',11,8),
(2015,'Audi','A3','2.0 TDI Premium Sedan FWD',11,8),
(2015,'Audi','A3','2.0 TDI Prestige',10,1),
(2015,'Audi','A3','2.0 TDI Prestige',10,2),
(2015,'Audi','A3','2.0 TDI Prestige Sedan FWD',11,8),
(2015,'Audi','A3','2.0T Premium',1,1),
(2015,'Audi','A3','2.0T Premium',1,2),
(2015,'Audi','A3','2.0T Premium Plus',1,1),
(2015,'Audi','A3','2.0T Premium Plus',1,2),
(2015,'Audi','A3','2.0T Prestige',1,1),
(2015,'Audi','A3','2.0T Prestige',1,2),
(2015,'Audi','A3','2.0T quattro Komfort Cabriolet AWD',8,8),
(2015,'Audi','A3','2.0T quattro Komfort Sedan AWD',8,8),
(2015,'Audi','A3','2.0T quattro Premium Cabriolet AWD',8,8),
(2015,'Audi','A3','2.0T quattro Premium Plus Cabriolet AWD',8,8),
(2015,'Audi','A3','2.0T quattro Premium Plus Sedan AWD',8,8),
(2015,'Audi','A3','2.0T quattro Premium Sedan AWD',8,8),
(2015,'Audi','A3','2.0T quattro Prestige Cabriolet AWD',8,8),
(2015,'Audi','A3','2.0T quattro Prestige Sedan AWD',8,8),
(2015,'Audi','A3','2.0T quattro Progressiv Cabriolet AWD',8,8),
(2015,'Audi','A3','2.0T quattro Progressiv Sedan AWD',8,8),
(2015,'Audi','A3','2.0T quattro Technik Cabriolet AWD',8,8),
(2015,'Audi','A3','2.0T quattro Technik FWD',1,1),
(2015,'Audi','A3','2.0T quattro Technik FWD',1,2),
(2015,'Audi','A3','2.0T quattro Technik Sedan AWD',8,8),
(2015,'Audi','A3','TDI Komfort Sedan FWD',8,8),
(2015,'Audi','A3','TDI Progressiv Sedan FWD',8,8),
(2015,'Audi','A3','TDI Technik Sedan FWD',8,8),
(2015,'Audi','A4','2.0T FrontTrak Komfort FWD',8,4),
(2015,'Audi','A4','2.0T FrontTrak Komfort FWD',8,9),
(2015,'Audi','A4','2.0T Premium',1,1),
(2015,'Audi','A4','2.0T Premium',1,2),
(2015,'Audi','A4','2.0T Premium FWD',8,10),
(2015,'Audi','A4','2.0T Premium Plus',1,1),
(2015,'Audi','A4','2.0T Premium Plus',1,2),
(2015,'Audi','A4','2.0T Premium Plus FWD',8,10),
(2015,'Audi','A4','2.0T Premium Plus Sedan FWD',8,10),
(2015,'Audi','A4','2.0T Premium Sedan FWD',8,10),
(2015,'Audi','A4','2.0T Prestige',1,1),
(2015,'Audi','A4','2.0T Prestige',1,2),
(2015,'Audi','A4','2.0T Prestige FWD',8,4),
(2015,'Audi','A4','2.0T Prestige FWD',8,9),
(2015,'Audi','A4','2.0T Prestige Sedan FWD',8,10),
(2015,'Audi','A4','2.0T quattro Komfort AWD',8,4),
(2015,'Audi','A4','2.0T quattro Komfort AWD',8,9),
(2015,'Audi','A4','2.0T quattro Premium AWD',8,4),
(2015,'Audi','A4','2.0T quattro Premium AWD',8,9),
(2015,'Audi','A4','2.0T quattro Premium Plus AWD',8,4),
(2015,'Audi','A4','2.0T quattro Premium Plus AWD',8,9),
(2015,'Audi','A4','2.0T quattro Premium Plus Sedan AWD',8,4),
(2015,'Audi','A4','2.0T quattro Premium Plus Sedan AWD',8,9),
(2015,'Audi','A4','2.0T quattro Premium Sedan AWD',8,4),
(2015,'Audi','A4','2.0T quattro Premium Sedan AWD',8,9),
(2015,'Audi','A4','2.0T quattro Prestige AWD',8,4),
(2015,'Audi','A4','2.0T quattro Prestige AWD',8,9),
(2015,'Audi','A4','2.0T quattro Prestige Sedan AWD',8,4),
(2015,'Audi','A4','2.0T quattro Prestige Sedan AWD',8,9),
(2015,'Audi','A4','2.0T quattro Progressiv AWD',8,4),
(2015,'Audi','A4','2.0T quattro Progressiv AWD',8,9),
(2015,'Audi','A4','2.0T quattro Technik AWD',8,4),
(2015,'Audi','A4','2.0T quattro Technik AWD',8,9),
(2015,'Audi','A4 Allroad','2.0T quattro Komfort AWD',12,9),
(2015,'Audi','A4 Allroad','2.0T quattro Premium AWD',8,9),
(2015,'Audi','A4 Allroad','2.0T quattro Premium AWD',12,9),
(2015,'Audi','A4 Allroad','2.0T quattro Premium Plus AWD',8,9),
(2015,'Audi','A4 Allroad','2.0T quattro Premium Plus AWD',12,9),
(2015,'Audi','A4 Allroad','2.0T quattro Prestige AWD',8,9),
(2015,'Audi','A4 Allroad','2.0T quattro Prestige AWD',12,9),
(2015,'Audi','A4 Allroad','2.0T quattro Progressiv AWD',12,9),
(2015,'Audi','A4 Allroad','2.0T quattro Technik AWD',12,9),
(2015,'Audi','A5','2.0T Premium',1,1),
(2015,'Audi','A5','2.0T Premium',1,2),
(2015,'Audi','A5','2.0T Premium Plus',1,1),
(2015,'Audi','A5','2.0T Premium Plus',1,2),
(2015,'Audi','A5','2.0T quattro Komfort Coupe AWD',8,4),
(2015,'Audi','A5','2.0T quattro Komfort Coupe AWD',8,9),
(2015,'Audi','A5','2.0T quattro Premium Cabriolet AWD',8,9),
(2015,'Audi','A5','2.0T quattro Premium Cabriolet AWD',12,9),
(2015,'Audi','A5','2.0T quattro Premium Coupe AWD',8,4),
(2015,'Audi','A5','2.0T quattro Premium Coupe AWD',8,9),
(2015,'Audi','A5','2.0T quattro Premium Plus Cabriolet AWD',8,9),
(2015,'Audi','A5','2.0T quattro Premium Plus Cabriolet AWD',12,9),
(2015,'Audi','A5','2.0T quattro Premium Plus Coupe AWD',8,4),
(2015,'Audi','A5','2.0T quattro Premium Plus Coupe AWD',8,9),
(2015,'Audi','A5','2.0T quattro Prestige Cabriolet AWD',8,4),
(2015,'Audi','A5','2.0T quattro Prestige Cabriolet AWD',8,9),
(2015,'Audi','A5','2.0T quattro Prestige Coupe AWD',8,4),
(2015,'Audi','A5','2.0T quattro Prestige Coupe AWD',8,9),
(2015,'Audi','A5','2.0T quattro Progressiv Cabriolet AWD',8,4),
(2015,'Audi','A5','2.0T quattro Progressiv Cabriolet AWD',8,9),
(2015,'Audi','A5','2.0T quattro Progressiv Coupe AWD',8,4),
(2015,'Audi','A5','2.0T quattro Progressiv Coupe AWD',8,9),
(2015,'Audi','A5','2.0T quattro Progressiv Coupe AWD',1,1),
(2015,'Audi','A5','2.0T quattro Progressiv Coupe AWD',1,2),
(2015,'Audi','A5','2.0T quattro Technik Cabriolet AWD',8,4),
(2015,'Audi','A5','2.0T quattro Technik Cabriolet AWD',8,9),
(2015,'Audi','A5','2.0T quattro Technik Coupe AWD',8,4),
(2015,'Audi','A5','2.0T quattro Technik Coupe AWD',8,9),
(2015,'Audi','A6','2.0T Premium',1,1),
(2015,'Audi','A6','2.0T Premium',1,2),
(2015,'Audi','A6','2.0T Premium Plus Sedan FWD',8,10),
(2015,'Audi','A6','2.0T Premium Sedan FWD',8,10),
(2015,'Audi','A6','2.0T Premium Sedan FWD',13,9),
(2015,'Audi','A6','2.0T quattro Premium Plus Sedan AWD',8,9),
(2015,'Audi','A6','2.0T quattro Premium Sedan AWD',8,9),
(2015,'Audi','A6','2.0T quattro Progressiv Sedan AWD',13,9),
(2015,'Audi','A6','2.0T quattro Technik Sedan AWD',13,9),
(2015,'Audi','A6','3.0 TDI Premium Plus',10,1),
(2015,'Audi','A6','3.0 TDI Premium Plus',10,2),
(2015,'Audi','A6','3.0 TDI quattro Premium Plus Sedan AWD',14,9),
(2015,'Audi','A6','3.0 TDI quattro Prestige Sedan AWD',14,9),
(2015,'Audi','A6','3.0 TDI quattro Progressiv Sedan AWD',13,9),
(2015,'Audi','A6','3.0 TDI quattro Technik Sedan AWD',13,9),
(2015,'Audi','A6','3.0T Premium Plus',1,1),
(2015,'Audi','A6','3.0T Premium Plus',1,2),
(2015,'Audi','A6','3.0T Prestige',1,1),
(2015,'Audi','A6','3.0T Prestige',1,2),
(2015,'Audi','A6','3.0T quattro Premium Plus Sedan AWD',13,9),
(2015,'Audi','A6','3.0T quattro Prestige Sedan AWD',13,9),
(2015,'Audi','A6','3.0T quattro Progressiv Sedan AWD',13,9),
(2015,'Audi','A6','3.0T quattro Technik Sedan AWD',13,9),
(2015,'Audi','A7','3.0 TDI Premium Plus',10,1),
(2015,'Audi','A7','3.0 TDI Premium Plus',10,2),
(2015,'Audi','A7','3.0 TDI quattro Premium Plus AWD',14,9),
(2015,'Audi','A7','3.0 TDI quattro Premium Plus AWD',13,9),
(2015,'Audi','A7','3.0 TDI quattro Prestige AWD',14,9),
(2015,'Audi','A7','3.0 TDI quattro Progressiv AWD',13,9),
(2015,'Audi','A7','3.0 TDI quattro Technik AWD',13,9),
(2015,'Audi','A7','3.0T Premium Plus',1,1),
(2015,'Audi','A7','3.0T Premium Plus',1,2),
(2015,'Audi','A7','3.0T Prestige',1,1),
(2015,'Audi','A7','3.0T Prestige',1,2),
(2015,'Audi','A7','3.0T quattro Premium Plus AWD',13,9),
(2015,'Audi','A7','3.0T quattro Prestige AWD',13,9),
(2015,'Audi','A7','3.0T quattro Progressiv AWD',13,9),
(2015,'Audi','A7','3.0T quattro Technik AWD',13,9),
(2015,'Audi','A8','3.0 TDI quattro AWD',15,9),
(2015,'Audi','A8','3.0T',1,1),
(2015,'Audi','A8','3.0T',1,2),
(2015,'Audi','A8','3.0T quattro AWD',16,9),
(2015,'Audi','A8','4.0T',1,1),
(2015,'Audi','A8','4.0T',1,2),
(2015,'Audi','A8','4.0T quattro AWD',15,9),
(2015,'Audi','A8','L 3.0 TDI',10,1),
(2015,'Audi','A8','L 3.0 TDI',10,2),
(2015,'Audi','A8','L 3.0 TDI quattro AWD',14,9),
(2015,'Audi','A8','L 3.0T',1,1),
(2015,'Audi','A8','L 3.0T',1,2),
(2015,'Audi','A8','L 3.0T quattro AWD',16,9),
(2015,'Audi','A8','L 4.0T',1,1),
(2015,'Audi','A8','L 4.0T',1,2),
(2015,'Audi','A8','L 4.0T quattro AWD',15,9),
(2015,'Audi','A8','L W12 6.3',1,1),
(2015,'Audi','A8','L W12 6.3',1,2),
(2015,'Audi','A8','L W12 quattro AWD',15,9),
(2015,'Audi','A8','L W12 quattro AWD',17,9),
(2015,'Audi','Q3','2.0T Premium Plus',1,1),
(2015,'Audi','Q3','2.0T Premium Plus',1,2),
(2015,'Audi','Q3','2.0T Premium Plus FWD',18,5),
(2015,'Audi','Q3','2.0T Prestige',1,1),
(2015,'Audi','Q3','2.0T Prestige',1,2),
(2015,'Audi','Q3','2.0T Prestige FWD',18,5),
(2015,'Audi','Q3','2.0T Progressiv FWD',18,5),
(2015,'Audi','Q3','2.0T Technik FWD',18,5),
(2015,'Audi','Q3','2.0T quattro Premium Plus AWD',18,5),
(2015,'Audi','Q3','2.0T quattro Prestige AWD',18,5),
(2015,'Audi','Q3','3.0T quattro Progressiv AWD',18,5),
(2015,'Audi','Q3','3.0T quattro Technik AWD',18,5),
(2015,'Audi','Q5','2.0T Premium',1,1),
(2015,'Audi','Q5','2.0T Premium',1,2),
(2015,'Audi','Q5','2.0T Premium Plus',1,1),
(2015,'Audi','Q5','2.0T Premium Plus',1,2),
(2015,'Audi','Q5','2.0T quattro Komfort AWD',19,9),
(2015,'Audi','allroad','2.0T Premium',1,1),
(2015,'Audi','allroad','2.0T Premium',1,2),
(2015,'Audi','allroad','2.0T Premium Plus',1,1),
(2015,'Audi','allroad','2.0T Premium Plus',1,2),
(2015,'Audi','allroad','2.0T Prestige',1,1),
(2015,'Audi','allroad','2.0T Prestige',1,2);

190
data/vehicle-etl/qa_validate.py Executable file
View File

@@ -0,0 +1,190 @@
#!/usr/bin/env python3
"""
Post-import QA validation for vehicle dropdown data.
Runs basic duplicate and range checks against the motovaultpro Postgres container.
"""
import os
import subprocess
import sys
def run_psql(query: str) -> str:
    """Run *query* through psql inside the mvp-postgres container.

    psql is invoked with ``-At`` against the ``motovaultpro`` database as the
    ``postgres`` user, and whatever it writes to stdout is returned as text.

    Raises:
        subprocess.CalledProcessError: if the docker/psql command exits non-zero.
    """
    docker_prefix = ["docker", "exec", "mvp-postgres"]
    psql_args = ["psql", "-U", "postgres", "-d", "motovaultpro", "-At", "-c", query]
    return subprocess.check_output(docker_prefix + psql_args, text=True)
def check_container():
    """Exit with status 1 unless Docker responds and mvp-postgres is listed by ``docker ps``."""

    def _abort(message):
        # Every failure path reports on stdout and terminates the script.
        print(message)
        sys.exit(1)

    try:
        subprocess.check_output(["docker", "ps"], text=True)
    except Exception:
        _abort("❌ Docker not available.")

    list_cmd = ["docker", "ps", "--filter", "name=mvp-postgres", "--format", "{{.Names}}"]
    try:
        running = subprocess.check_output(list_cmd, text=True).strip()
    except Exception as exc:
        _abort(f"❌ Failed to check containers: {exc}")
    else:
        # An empty listing means no running container matched the name filter.
        if not running:
            _abort("❌ mvp-postgres container not running.")
def check_invalid_combinations():
    """Return a message for every known-impossible year/make/model/trim found in vehicle_options.

    The combinations are a fixed, hard-coded list; each one is counted with a
    separate psql query, and any non-zero count yields one entry in the result.
    """
    known_bad = (
        (1992, "Chevrolet", "Corvette", "Z06"),  # Z06 started 2001
        (2000, "Chevrolet", "Corvette", "35th Anniversary Edition"),  # Was 1988
        (2000, "Chevrolet", "Corvette", "Stingray"),  # Stingray started 2014
        (1995, "Ford", "Mustang", "Mach-E"),  # Mach-E is 2021+
        (2020, "Tesla", "Cybertruck", "Base"),  # Not in production until later
    )
    problems = []
    for year, make, model, trim in known_bad:
        sql = f"""
            SELECT COUNT(*) FROM vehicle_options
            WHERE year = {year}
            AND make = '{make}'
            AND model = '{model}'
            AND trim = '{trim}'
        """
        if int(run_psql(sql).strip()) > 0:
            problems.append(f"Invalid combo found: {year} {make} {model} {trim}")
    return problems
def check_trim_coverage():
    """Print trim coverage counts for vehicle_options (informational only).

    The raw psql output is printed as-is; nothing is returned or validated.
    """
    # NOTE(review): the ``base_only`` column counts year/make/model groups that
    # have at least one 'Base' row, not groups whose only trim is 'Base'.
    coverage_sql = """
        SELECT
        COUNT(DISTINCT (year, make, model)) as total_models,
        COUNT(DISTINCT (year, make, model)) FILTER (WHERE trim = 'Base') as base_only,
        COUNT(DISTINCT (year, make, model)) FILTER (WHERE trim != 'Base') as has_specific_trims
        FROM vehicle_options
    """
    summary = run_psql(coverage_sql).strip()
    print(f"Trim coverage (total/base_only/has_specific_trims): {summary}")
def main():
    """Run all QA checks against the imported data and report the outcome.

    Prints one line per check. The process exits with status 1 when Docker or
    the container is unavailable, when a query fails, or when any check
    reports a problem, so the script can gate an automated pipeline. It
    returns normally (exit status 0) only when every check passes.
    """
    check_container()
    print("🔍 Running QA checks...\n")
    queries = {
        "engine_duplicate_names": """
            SELECT COUNT(*) FROM (
            SELECT LOWER(name) as n, COUNT(*) c
            FROM engines
            GROUP BY 1 HAVING COUNT(*) > 1
            ) t;
        """,
        "transmission_duplicate_types": """
            SELECT COUNT(*) FROM (
            SELECT LOWER(type) as t, COUNT(*) c
            FROM transmissions
            GROUP BY 1 HAVING COUNT(*) > 1
            ) t;
        """,
        "vehicle_option_duplicates": """
            SELECT COUNT(*) FROM (
            SELECT year, make, model, trim, engine_id, transmission_id, COUNT(*) c
            FROM vehicle_options
            GROUP BY 1,2,3,4,5,6 HAVING COUNT(*) > 1
            ) t;
        """,
        "year_range": """
            SELECT MIN(year) || ' - ' || MAX(year) FROM vehicle_options;
        """,
        "year_range_valid": """
            SELECT COUNT(*) FROM (
            SELECT 1 FROM vehicle_options WHERE year < 2015 OR year > 2022 LIMIT 1
            ) t;
        """,
        "counts": """
            SELECT
            (SELECT COUNT(*) FROM engines) AS engines,
            (SELECT COUNT(*) FROM transmissions) AS transmissions,
            (SELECT COUNT(*) FROM vehicle_options) AS vehicle_options;
        """,
        "cross_join_gaps": """
            SELECT COUNT(*) FROM (
            SELECT base.year, base.make, base.model, base.trim, e.engine_id, t.transmission_id
            FROM (
            SELECT DISTINCT year, make, model, trim FROM vehicle_options
            ) base
            JOIN (
            SELECT DISTINCT year, make, model, trim, engine_id FROM vehicle_options
            ) e ON base.year = e.year AND base.make = e.make AND base.model = e.model AND base.trim = e.trim
            JOIN (
            SELECT DISTINCT year, make, model, trim, transmission_id FROM vehicle_options
            ) t ON base.year = t.year AND base.make = t.make AND base.model = t.model AND base.trim = t.trim
            EXCEPT
            SELECT year, make, model, trim, engine_id, transmission_id FROM vehicle_options
            ) gap;
        """,
    }
    results = {}
    for key, query in queries.items():
        try:
            results[key] = run_psql(query).strip()
        except subprocess.CalledProcessError as exc:
            print(f"❌ Query failed ({key}): {exc}")
            sys.exit(1)

    print(f"Engine duplicate names: {results['engine_duplicate_names']}")
    print(f"Transmission duplicate types: {results['transmission_duplicate_types']}")
    print(f"Vehicle option duplicates: {results['vehicle_option_duplicates']}")
    print(f"Year range: {results['year_range']}")
    print(f"Out-of-range years (should be 0): {results['year_range_valid']}")
    print(f"Counts (engines, transmissions, vehicle_options): {results['counts']}")
    print(f"Cross-join gaps (should be 0 to avoid impossible pairs): {results['cross_join_gaps']}")

    # These checks pass only when psql reports exactly "0"; the remaining
    # queries ("year_range", "counts") are informational.
    must_be_zero = (
        "engine_duplicate_names",
        "transmission_duplicate_types",
        "vehicle_option_duplicates",
        "year_range_valid",
        "cross_join_gaps",
    )
    issues_found = any(results[key] != "0" for key in must_be_zero)

    invalids = check_invalid_combinations()
    if invalids:
        issues_found = True
        print("\n❌ Invalid combinations detected:")
        for issue in invalids:
            print(f" - {issue}")
    else:
        print("\n✅ No known invalid year/make/model/trim combos found.")

    check_trim_coverage()

    if issues_found:
        print("\n❌ QA checks found issues.")
        # Signal failure to the caller, matching the exit status already used
        # for query and container failures.
        sys.exit(1)
    print("\n✅ QA checks passed.")
# Run the QA checks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

Binary file not shown.

View File

@@ -0,0 +1,53 @@
acura
alfa_romeo
aston_martin
audi
bentley
bmw
buick
cadillac
chevrolet
chrysler
dodge
ferrari
fiat
ford
genesis
gmc
honda
hummer
hyundai
infiniti
isuzu
jaguar
jeep
kia
lamborghini
land_rover
lexus
lincoln
lotus
lucid
maserati
mazda
mclaren
mercury
mini
mitsubishi
nissan
oldsmobile
plymouth
polestar
pontiac
porsche
ram
rivian
rolls_royce
saab
scion
smart
subaru
tesla
toyota
volkswagen
volvo

View File

@@ -0,0 +1 @@
N9ZTsICa0gprFoXxgYRK6UApGCIVLeJlu0XR0leN

View File

@@ -0,0 +1,515 @@
#!/usr/bin/env python3
"""
Fetches VehAPI data into an offline snapshot (SQLite + meta.json).
Workflow:
1. Walks Year -> Make -> Model -> Trim -> Transmission -> Engine using VehAPI.
2. Persists observed compatibility pairs to snapshot.sqlite (no Cartesian products).
3. Stores request/response cache for resume; obeys rate limits and 429 retry-after.
"""
from __future__ import annotations
import argparse
import hashlib
import json
import random
import sqlite3
import sys
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence
from urllib.parse import quote
try:
import requests
except ImportError: # pragma: no cover - env guard
print("[error] Missing dependency 'requests'. Install with `pip install requests`.", file=sys.stderr)
sys.exit(1)
# Identifier written into the snapshot metadata as "script_version".
SCRIPT_VERSION = "vehapi_fetch_snapshot.py@1.1.0"
# Inclusive model-year bounds used when neither CLI flags nor env vars override them.
DEFAULT_MIN_YEAR = 2015
DEFAULT_MAX_YEAR = 2022
DEFAULT_RATE_PER_SEC = 55 # stays under the 60 req/sec ceiling
# Upper bound on tries per request (network errors, 429s and 5xx all count).
MAX_ATTEMPTS = 5
# Substitutes used when the API returns no trims / no transmissions.
FALLBACK_TRIMS = ["Base"]
FALLBACK_TRANSMISSIONS = ["Manual", "Automatic"]
# Default value for --base-url; endpoint path segments are appended to it.
DEFAULT_BASE_URL = "https://vehapi.com/api/v1/car-lists/get/car"
def canonicalize(value: str) -> str:
    """Build a dedupe key: trimmed, lowercased, with whitespace and dash runs normalized.

    Falsy input (``None`` or ``""``) yields ``""``. Runs of whitespace
    (including non-breaking spaces) become a single space, and runs of ASCII
    or Unicode (U+2010..U+2015) dashes become a single ``-``.
    """
    import re

    text = (value or "").strip()
    rewrites = (
        (r"[\s\u00A0]+", " "),
        (r"[-\u2010-\u2015]+", "-"),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text.lower()
def infer_trans_bucket(trans_str: str) -> str:
    """Classify a transmission description as ``"Manual"`` or ``"Automatic"``.

    Anything containing "manual", "mt" or "m/t" (case-insensitive substring
    match) is Manual; everything else, including empty or ``None`` input, is
    Automatic.
    """
    text = (trans_str or "").lower()
    manual_markers = ("manual", "mt", "m/t")
    return "Manual" if any(marker in text for marker in manual_markers) else "Automatic"
def infer_fuel_bucket(engine_str: str, trans_str: str, trim_str: str) -> str:
    """Guess a fuel bucket from the engine, transmission and trim descriptions.

    The three strings (``None`` is treated as empty) are joined and lowercased,
    then tested in order: Hybrid, Electric, Diesel. ``"Gas"`` is the default
    when nothing matches.

    Hybrid is tested before Electric so that descriptions such as "PHEV" or
    "Hybrid Electric" are not swallowed by the Electric tokens. The short
    acronyms "ev", "hev" and "phev" only match when not surrounded by other
    letters, so ordinary words that merely contain them (e.g. the trim
    "Elevation") no longer count as electric.

    Returns:
        One of ``"Hybrid"``, ``"Electric"``, ``"Diesel"`` or ``"Gas"``.
    """
    import re

    target = " ".join([engine_str or "", trans_str or "", trim_str or ""]).lower()

    def mentions(substrings, acronyms=()):
        # Long, unambiguous tokens may appear anywhere (e.g. "turbodiesel",
        # "75kwh"); acronyms must not be embedded in a longer run of letters.
        if any(token in target for token in substrings):
            return True
        return any(
            re.search(rf"(?<![a-z]){re.escape(acronym)}(?![a-z])", target)
            for acronym in acronyms
        )

    if mentions(["hybrid", "plug-in", "e-hybrid"], ["phev", "hev"]):
        return "Hybrid"
    if mentions(["electric", "battery", "motor", "kwh"], ["ev"]):
        return "Electric"
    if mentions(["diesel", "tdi", "dci", "duramax", "power stroke", "cummins"]):
        return "Diesel"
    return "Gas"
def read_text_file(path: Path) -> str:
    """Return the full contents of *path*, decoded as UTF-8."""
    return path.read_text(encoding="utf-8")
def read_lines(path: Path) -> List[str]:
    """Return the non-blank lines of *path*, each stripped of surrounding whitespace."""
    stripped = (raw.strip() for raw in read_text_file(path).splitlines())
    return [line for line in stripped if line]
def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*, read in 8 KiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
def ensure_snapshot_dir(root: Path, custom_dir: Optional[str]) -> Path:
    """Create (if needed) and return the snapshot directory.

    A non-empty *custom_dir* is used verbatim; otherwise the directory is
    ``root/<today>`` where today is the current UTC date in ISO format.
    """
    target = (
        Path(custom_dir)
        if custom_dir
        else root / datetime.now(timezone.utc).date().isoformat()
    )
    target.mkdir(parents=True, exist_ok=True)
    return target
class RateLimiter:
    """Sliding-window limiter that keeps calls below the VehAPI threshold.

    At most ``max_per_sec`` calls to :meth:`acquire` are let through in any
    one-second window, measured with ``time.monotonic``.
    """

    def __init__(self, max_per_sec: int) -> None:
        """Create a limiter allowing *max_per_sec* acquisitions per second.

        Raises:
            ValueError: if *max_per_sec* is less than 1; such a limit could
                never be satisfied by ``acquire``.
        """
        if max_per_sec < 1:
            raise ValueError(f"max_per_sec must be >= 1, got {max_per_sec}")
        self.max_per_sec = max_per_sec
        # Monotonic timestamps of acquisitions made within the last second.
        self._history: List[float] = []

    def acquire(self) -> None:
        """Block until another request is allowed, then record it."""
        while True:
            now = time.monotonic()
            window_start = now - 1
            # Forget acquisitions that have aged out of the one-second window.
            self._history = [ts for ts in self._history if ts >= window_start]
            if len(self._history) < self.max_per_sec:
                break
            # Sleep until the oldest remaining entry leaves the window.
            sleep_for = max(self._history[0] - window_start, 0.001)
            time.sleep(sleep_for)
        self._history.append(time.monotonic())
@dataclass
class FetchCounts:
    """Counters accumulated during a snapshot fetch and reported in the metadata."""

    # Rows newly inserted into the pairs table.
    pairs_inserted: int = 0
    # Requests answered from the responses cache instead of the network.
    cache_hits: int = 0
    # Placeholder transmissions recorded because the API returned none.
    fallback_transmissions: int = 0
    # Placeholder engines recorded because the API returned none.
    fallback_engines: int = 0
class VehapiFetcher:
    """Walk the VehAPI hierarchy and persist observed engine/transmission pairs.

    The walk goes Year -> Make -> Model -> Trim -> Transmission -> Engine.
    Results are written to the ``pairs`` table of the snapshot SQLite file;
    successful JSON responses are stored in the ``responses`` table so an
    interrupted run can resume without re-requesting them.
    """

    def __init__(
        self,
        session: requests.Session,
        base_url: str,
        token: str,
        min_year: int,
        max_year: int,
        allowed_makes: Sequence[str],
        snapshot_path: Path,
        responses_cache: bool = True,
        rate_per_sec: int = DEFAULT_RATE_PER_SEC,
    ) -> None:
        """Open the snapshot database and prepare the fetcher.

        Args:
            session: HTTP session used for every request.
            base_url: API root; a trailing slash is removed.
            token: Bearer token sent in the Authorization header.
            min_year: First model year to fetch (inclusive).
            max_year: Last model year to fetch (inclusive).
            allowed_makes: Makes to keep; compared case- and
                separator-insensitively against the names the API returns.
            snapshot_path: SQLite file to create or extend.
            responses_cache: When False, cached responses are never read
                (responses are still written).
            rate_per_sec: Maximum requests per second.
        """
        self.session = session
        self.base_url = base_url.rstrip("/")
        self.token = token
        self.min_year = min_year
        self.max_year = max_year
        # Keyed by a separator-insensitive form so list entries such as
        # "land_rover" match API names such as "Land Rover".
        self.allowed_makes = {self._make_key(m): m for m in allowed_makes}
        self.snapshot_path = snapshot_path
        self.conn = sqlite3.connect(self.snapshot_path)
        self.conn.execute("PRAGMA journal_mode=WAL;")
        self.conn.execute("PRAGMA synchronous=NORMAL;")
        self._init_schema()
        self.responses_cache = responses_cache
        self.rate_limiter = RateLimiter(rate_per_sec)
        self.counts = FetchCounts()

    @staticmethod
    def _make_key(make: str) -> str:
        """Return a comparison key for a make name that ignores case and `_`/`-`/space differences."""
        canon = canonicalize(make)
        return " ".join(canon.replace("_", " ").replace("-", " ").split())

    def _init_schema(self) -> None:
        """Create the pairs, meta and responses tables if they do not exist."""
        self.conn.execute(
            """
            CREATE TABLE IF NOT EXISTS pairs(
                year INT,
                make TEXT,
                model TEXT,
                trim TEXT,
                engine_display TEXT,
                engine_canon TEXT,
                engine_bucket TEXT,
                trans_display TEXT,
                trans_canon TEXT,
                trans_bucket TEXT,
                PRIMARY KEY(year, make, model, trim, engine_canon, trans_canon)
            )
            """
        )
        self.conn.execute(
            """
            CREATE TABLE IF NOT EXISTS meta(
                key TEXT PRIMARY KEY,
                value TEXT
            )
            """
        )
        self.conn.execute(
            """
            CREATE TABLE IF NOT EXISTS responses(
                request_key TEXT PRIMARY KEY,
                url TEXT,
                status INT,
                headers_json TEXT,
                body_json TEXT,
                fetched_at TEXT
            )
            """
        )
        self.conn.commit()

    def _store_meta(self, meta: Dict[str, Any]) -> None:
        """Upsert every item of *meta* into the meta table, values stringified."""
        rows = [(k, str(v)) for k, v in meta.items()]
        self.conn.executemany("INSERT OR REPLACE INTO meta(key, value) VALUES (?, ?)", rows)
        self.conn.commit()

    def _load_cached_response(self, request_key: str) -> Optional[Any]:
        """Return the cached JSON body for *request_key*, or None when unavailable.

        None is returned when caching is disabled, when no row exists, or when
        the stored body cannot be decoded (the caller then refetches).
        """
        if not self.responses_cache:
            return None
        cur = self.conn.execute("SELECT body_json FROM responses WHERE request_key = ?", (request_key,))
        row = cur.fetchone()
        if not row:
            return None
        try:
            body = json.loads(row[0])
        except (TypeError, ValueError):
            # Unreadable cache entry: treat as a miss rather than a hit.
            return None
        self.counts.cache_hits += 1
        return body

    def _save_response(self, request_key: str, url: str, status: int, headers: Dict[str, Any], body: Any) -> None:
        """Store (or replace) one response in the responses table and commit."""
        self.conn.execute(
            """
            INSERT OR REPLACE INTO responses(request_key, url, status, headers_json, body_json, fetched_at)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            (
                request_key,
                url,
                status,
                json.dumps(dict(headers), default=str),
                json.dumps(body, default=str),
                datetime.now(timezone.utc).isoformat(),
            ),
        )
        self.conn.commit()

    def _request_json(self, path_parts: Sequence[str], label: str) -> Any:
        """GET ``base_url/<path_parts...>`` and return the decoded JSON body.

        A cached body is returned when available. Network errors, HTTP 429 and
        5xx responses are retried up to MAX_ATTEMPTS times in total; other
        non-OK statuses, non-JSON bodies and exhausted retries yield ``[]``.
        Only successfully decoded responses are written to the cache.

        Args:
            path_parts: URL path segments; each is percent-encoded.
            label: Short description used in log messages.
        """
        path_parts = [str(p) for p in path_parts]
        request_key = "/".join(path_parts)
        cached = self._load_cached_response(request_key)
        if cached is not None:
            return cached
        url = f"{self.base_url}/" + "/".join(quote(p, safe="") for p in path_parts)
        attempts = 0
        backoff = 1.0
        while attempts < MAX_ATTEMPTS:
            attempts += 1
            self.rate_limiter.acquire()
            try:
                resp = self.session.get(url, headers={"Authorization": f"Bearer {self.token}"}, timeout=30)
            except requests.RequestException as exc:
                print(f"[warn] {label}: request error {exc}; retrying...", file=sys.stderr)
                time.sleep(backoff + random.uniform(0, 0.5))
                backoff = min(backoff * 2, 30)
                continue
            if resp.status_code == 429:
                retry_after = resp.headers.get("retry-after") or resp.headers.get("Retry-After")
                try:
                    retry_seconds = float(retry_after)
                except (TypeError, ValueError):
                    # Header missing or not a plain number of seconds.
                    retry_seconds = 30.0
                sleep_for = retry_seconds + random.uniform(0, 3)
                print(f"[info] {label}: hit 429, sleeping {sleep_for:.1f}s before retry", file=sys.stderr)
                time.sleep(sleep_for)
                backoff = min(backoff * 2, 30)
                continue
            if resp.status_code >= 500:
                print(f"[warn] {label}: server {resp.status_code}, retrying...", file=sys.stderr)
                time.sleep(backoff + random.uniform(0, 0.5))
                backoff = min(backoff * 2, 30)
                continue
            if not resp.ok:
                print(f"[warn] {label}: HTTP {resp.status_code}, skipping", file=sys.stderr)
                return []
            try:
                body = resp.json()
            except ValueError:
                print(f"[warn] {label}: non-JSON response, skipping", file=sys.stderr)
                return []
            self._save_response(request_key, url, resp.status_code, resp.headers, body)
            return body
        print(f"[error] {label}: exhausted retries", file=sys.stderr)
        return []

    @staticmethod
    def _extract_values(payload: Any, keys: Sequence[str]) -> List[str]:
        """Pull a flat list of non-empty strings out of an API payload.

        A dict payload is unwrapped through its "data", "results" or "items"
        entry when one is truthy. List items may be plain strings or dicts;
        for dicts the first truthy value among *keys* is taken.
        """
        values: List[str] = []
        if isinstance(payload, dict):
            payload = payload.get("data") or payload.get("results") or payload.get("items") or payload
        if not payload:
            return values
        if isinstance(payload, list):
            for item in payload:
                if isinstance(item, str):
                    if item.strip():
                        values.append(item.strip())
                    continue
                if isinstance(item, dict):
                    for key in keys:
                        val = item.get(key)
                        if val:
                            values.append(str(val).strip())
                            break
        return values

    def _record_pair(
        self,
        year: int,
        make: str,
        model: str,
        trim: str,
        engine_display: str,
        engine_bucket: str,
        trans_display: str,
        trans_bucket: str,
    ) -> None:
        """Insert one engine/transmission pair; duplicates are ignored.

        ``pairs_inserted`` is incremented only when a new row was written.
        The insert is not committed here.
        """
        engine_canon = canonicalize(engine_display)
        trans_canon = canonicalize(trans_display)
        cur = self.conn.execute(
            """
            INSERT OR IGNORE INTO pairs(
                year, make, model, trim,
                engine_display, engine_canon, engine_bucket,
                trans_display, trans_canon, trans_bucket
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                year,
                make,
                model,
                trim,
                engine_display.strip(),
                engine_canon,
                engine_bucket,
                trans_display.strip(),
                trans_canon,
                trans_bucket,
            ),
        )
        if cur.rowcount:
            self.counts.pairs_inserted += 1

    def _fetch_engines_for_transmission(
        self, year: int, make: str, model: str, trim: str, transmission: str, trans_bucket: str
    ) -> None:
        """Record one pair per engine offered with *transmission* on this trim.

        When the API returns no engines, a single placeholder pair is recorded
        whose engine name is the inferred fuel bucket.
        """
        path = ["engines", str(year), make, model, trim, transmission]
        label = f"engines:{year}/{make}/{model}/{trim}/{transmission}"
        engines_payload = self._request_json(path, label)
        engines = self._extract_values(engines_payload, ["engine"])
        if not engines:
            engine_bucket = infer_fuel_bucket("", transmission, trim)
            fallback_engine = engine_bucket
            self._record_pair(year, make, model, trim, fallback_engine, engine_bucket, transmission, trans_bucket)
            self.counts.fallback_engines += 1
            return
        for engine in engines:
            engine_bucket = infer_fuel_bucket(engine, transmission, trim)
            self._record_pair(year, make, model, trim, engine, engine_bucket, transmission, trans_bucket)

    def _fetch_transmissions_for_trim(self, year: int, make: str, model: str, trim: str) -> None:
        """Fetch the transmissions of a trim and descend into their engines.

        When the API returns no transmissions, one placeholder pair is
        recorded per entry of FALLBACK_TRANSMISSIONS, with the inferred fuel
        bucket standing in for the engine.
        """
        path = ["transmissions", str(year), make, model, trim]
        label = f"transmissions:{year}/{make}/{model}/{trim}"
        transmissions_payload = self._request_json(path, label)
        transmissions = self._extract_values(transmissions_payload, ["transmission"])
        if not transmissions:
            for fallback in FALLBACK_TRANSMISSIONS:
                trans_bucket = infer_trans_bucket(fallback)
                engine_bucket = infer_fuel_bucket("", fallback, trim)
                self._record_pair(year, make, model, trim, engine_bucket, engine_bucket, fallback, trans_bucket)
                self.counts.fallback_transmissions += 1
                self.counts.fallback_engines += 1
            return
        for trans in transmissions:
            trans_bucket = infer_trans_bucket(trans)
            self._fetch_engines_for_transmission(year, make, model, trim, trans, trans_bucket)

    def _fetch_trims_for_model(self, year: int, make: str, model: str) -> None:
        """Fetch the trims of a model (FALLBACK_TRIMS when none) and commit after each trim."""
        path = ["trims", str(year), make, model]
        label = f"trims:{year}/{make}/{model}"
        trims_payload = self._request_json(path, label)
        trims = self._extract_values(trims_payload, ["trim"])
        if not trims:
            trims = FALLBACK_TRIMS
        for trim in trims:
            self._fetch_transmissions_for_trim(year, make, model, trim)
            self.conn.commit()

    def _fetch_models_for_make(self, year: int, make: str) -> None:
        """Fetch the models of a make for *year* and descend into each one."""
        path = ["models", str(year), make]
        label = f"models:{year}/{make}"
        models_payload = self._request_json(path, label)
        models = self._extract_values(models_payload, ["model"])
        if not models:
            print(f"[warn] {label}: no models returned", file=sys.stderr)
            return
        for model in models:
            self._fetch_trims_for_model(year, make, model)

    def _fetch_makes_for_year(self, year: int) -> List[str]:
        """Return the API's make names for *year* that are on the allow-list.

        The API's own spelling is returned so that it can be used in the
        follow-up requests.
        """
        path = ["makes", str(year)]
        label = f"makes:{year}"
        makes_payload = self._request_json(path, label)
        makes = self._extract_values(makes_payload, ["make"])
        return [make for make in makes if self._make_key(make) in self.allowed_makes]

    def run(self) -> FetchCounts:
        """Fetch every year in ``[min_year, max_year]`` and return the counters."""
        for year in range(self.min_year, self.max_year + 1):
            makes = self._fetch_makes_for_year(year)
            if not makes:
                print(f"[info] {year}: no allowed makes found, skipping", file=sys.stderr)
                continue
            print(f"[info] {year}: {len(makes)} makes", file=sys.stderr)
            for make in makes:
                print(f"[info] {year} {make}: fetching models", file=sys.stderr)
                self._fetch_models_for_make(year, make)
                self.conn.commit()
        return self.counts
def build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the snapshot fetcher.

    Year bounds, base URL and rate limit take their defaults from the
    MIN_YEAR, MAX_YEAR, VEHAPI_BASE_URL and VEHAPI_MAX_RPS environment
    variables, falling back to the module-level DEFAULT_* constants. The help
    text for the year bounds is generated from those constants so it cannot
    drift from the real defaults.
    """
    parser = argparse.ArgumentParser(description="Fetch VehAPI snapshot into SQLite.")
    parser.add_argument(
        "--min-year",
        type=int,
        default=int(read_env("MIN_YEAR", DEFAULT_MIN_YEAR)),
        help=f"Inclusive min year (default env MIN_YEAR or {DEFAULT_MIN_YEAR})",
    )
    parser.add_argument(
        "--max-year",
        type=int,
        default=int(read_env("MAX_YEAR", DEFAULT_MAX_YEAR)),
        help=f"Inclusive max year (default env MAX_YEAR or {DEFAULT_MAX_YEAR})",
    )
    parser.add_argument("--snapshot-dir", type=str, help="Target snapshot directory (default snapshots/<today>)")
    parser.add_argument(
        "--base-url",
        type=str,
        default=read_env("VEHAPI_BASE_URL", DEFAULT_BASE_URL),
        help="VehAPI base URL (e.g. https://vehapi.com/api/v1/car-lists/get/car)",
    )
    parser.add_argument(
        "--rate-per-sec",
        type=int,
        default=int(read_env("VEHAPI_MAX_RPS", DEFAULT_RATE_PER_SEC)),
        help="Max requests per second (<=60)",
    )
    parser.add_argument("--makes-file", type=str, default="source-makes.txt", help="Path to source-makes.txt")
    parser.add_argument("--api-key-file", type=str, default="vehapi.key", help="Path to VehAPI bearer token file")
    parser.add_argument("--no-response-cache", action="store_true", help="Disable request cache stored in snapshot.sqlite")
    return parser
def read_env(key: str, default: Any) -> Any:
    """Return environment variable *key* as a string, or *default* when it is unset."""
    import os

    try:
        return os.environ[key]
    except KeyError:
        return default
def main(argv: Sequence[str]) -> int:
    """Fetch a VehAPI snapshot according to *argv* and write its metadata.

    Relative ``--makes-file`` and ``--api-key-file`` paths are resolved against
    the directory containing this script. On success the snapshot database and
    a ``meta.json`` next to it are written.

    Returns:
        0 on success; 1 when the makes file or API key file is missing or the
        key file is empty.
    """
    parser = build_arg_parser()
    args = parser.parse_args(argv)
    base_dir = Path(__file__).resolve().parent
    snapshot_root = base_dir / "snapshots"
    snapshot_dir = ensure_snapshot_dir(snapshot_root, args.snapshot_dir)
    snapshot_path = snapshot_dir / "snapshot.sqlite"
    meta_path = snapshot_dir / "meta.json"
    makes_file = (base_dir / args.makes_file).resolve()
    api_key_file = (base_dir / args.api_key_file).resolve()
    if not makes_file.exists():
        print(f"[error] makes file not found: {makes_file}", file=sys.stderr)
        return 1
    if not api_key_file.exists():
        print(f"[error] api key file not found: {api_key_file}", file=sys.stderr)
        return 1
    allowed_makes = read_lines(makes_file)
    token = read_text_file(api_key_file).strip()
    if not token:
        # Name the file actually read; it may differ from the default vehapi.key.
        print(f"[error] api key file is empty: {api_key_file}", file=sys.stderr)
        return 1

    # The session and the SQLite connection are released even if the fetch
    # is interrupted or raises.
    with requests.Session() as session:
        fetcher = VehapiFetcher(
            session=session,
            base_url=args.base_url,
            token=token,
            min_year=args.min_year,
            max_year=args.max_year,
            allowed_makes=allowed_makes,
            snapshot_path=snapshot_path,
            responses_cache=not args.no_response_cache,
            rate_per_sec=args.rate_per_sec,
        )
        try:
            started_at = datetime.now(timezone.utc)
            counts = fetcher.run()
            finished_at = datetime.now(timezone.utc)
            meta = {
                "generated_at": finished_at.isoformat(),
                "started_at": started_at.isoformat(),
                "min_year": args.min_year,
                "max_year": args.max_year,
                "script_version": SCRIPT_VERSION,
                "makes_file": str(makes_file),
                "makes_hash": sha256_file(makes_file),
                "api_base_url": args.base_url,
                "snapshot_path": str(snapshot_path),
                "pairs_inserted": counts.pairs_inserted,
                "fallback_transmissions": counts.fallback_transmissions,
                "fallback_engines": counts.fallback_engines,
                "response_cache_hits": counts.cache_hits,
            }
            fetcher._store_meta(meta)
        finally:
            fetcher.conn.close()

    with meta_path.open("w", encoding="utf-8") as fh:
        json.dump(meta, fh, indent=2)
    print(
        f"[done] wrote snapshot to {snapshot_path} with {counts.pairs_inserted} pairs "
        f"(fallback trans={counts.fallback_transmissions}, fallback engines={counts.fallback_engines}, cache hits={counts.cache_hits})",
        file=sys.stderr,
    )
    return 0
# Script entry point: pass the CLI arguments (without the program name) to
# main() and use its return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

File diff suppressed because it is too large Load Diff