Possible working ETL
This commit is contained in:
BIN
data/vehicle-etl/__pycache__/etl_generate_sql.cpython-314.pyc
Normal file
BIN
data/vehicle-etl/__pycache__/etl_generate_sql.cpython-314.pyc
Normal file
Binary file not shown.
BIN
data/vehicle-etl/__pycache__/nhtsa_fetch.cpython-314.pyc
Normal file
BIN
data/vehicle-etl/__pycache__/nhtsa_fetch.cpython-314.pyc
Normal file
Binary file not shown.
Binary file not shown.
238
data/vehicle-etl/etl_generate_sql.py
Normal file
238
data/vehicle-etl/etl_generate_sql.py
Normal file
@@ -0,0 +1,238 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate SQL import files from a VehAPI snapshot SQLite database.
|
||||
|
||||
Reads observed compatibility pairs from the snapshot (trim-filtered engine<->transmission pairs)
|
||||
and produces:
|
||||
- output/01_engines.sql
|
||||
- output/02_transmissions.sql
|
||||
- output/03_vehicle_options.sql
|
||||
|
||||
No legacy JSON or network calls are used. The snapshot path is provided via CLI flag.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Sequence
|
||||
|
||||
# Maximum number of rows emitted per generated INSERT statement (see write_insert_file).
BATCH_SIZE = 1000
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the SQL generator.

    --snapshot-path defaults to the SNAPSHOT_PATH environment variable when
    set; the value is converted to a Path explicitly here rather than relying
    on argparse's string-default conversion rule.
    """
    env_snapshot = os.environ.get("SNAPSHOT_PATH")
    default_snapshot = Path(env_snapshot) if env_snapshot is not None else None

    parser = argparse.ArgumentParser(
        description="Generate SQL files from a VehAPI snapshot (SQLite).",
    )
    parser.add_argument(
        "--snapshot-path",
        type=Path,
        default=default_snapshot,
        help="Path to snapshots/<date>/snapshot.sqlite produced by vehapi_fetch_snapshot.py (or env SNAPSHOT_PATH)",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("output"),
        help="Directory to write SQL output files (default: output)",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_pairs(snapshot_path: Path) -> List[sqlite3.Row]:
    """Read all observed engine<->transmission pair rows from the snapshot.

    Returns the full, deterministically ordered contents of the `pairs`
    table as sqlite3.Row objects.

    Raises:
        FileNotFoundError: when the snapshot file does not exist.
        RuntimeError: when SQLite fails while reading (wraps sqlite3.Error).
        ValueError: when the pairs table is empty.
    """
    if not snapshot_path.exists():
        raise FileNotFoundError(f"Snapshot not found: {snapshot_path}")

    query = """
        SELECT
            year,
            make,
            model,
            trim,
            engine_display,
            engine_canon,
            engine_bucket,
            trans_display,
            trans_canon,
            trans_bucket
        FROM pairs
        ORDER BY year, make, model, trim, engine_canon, trans_canon
    """

    connection = sqlite3.connect(snapshot_path)
    connection.row_factory = sqlite3.Row
    try:
        result = connection.execute(query).fetchall()
    except sqlite3.Error as exc:
        raise RuntimeError(f"Failed to read pairs from snapshot: {exc}") from exc
    finally:
        connection.close()

    if not result:
        raise ValueError("Snapshot contains no rows in pairs table.")
    return result
|
||||
|
||||
|
||||
def choose_engine_label(engine_display: str, engine_bucket: str, engine_canon: str) -> str:
    """Pick the best human-readable engine name.

    Preference order: VehAPI display string, then the bucket label, and
    finally the canonical key so the result is never empty.
    """
    for candidate in (engine_display, engine_bucket):
        if candidate:
            return candidate
    return engine_canon
|
||||
|
||||
|
||||
def choose_trans_label(trans_display: str, trans_bucket: str, trans_canon: str) -> str:
    """Pick the best transmission label: display string, then bucket, then canonical key."""
    return trans_display or trans_bucket or trans_canon
|
||||
|
||||
|
||||
def build_engine_dimension(rows: Sequence[sqlite3.Row]) -> Dict[str, Dict]:
    """Assign a 1-based surrogate id to each distinct canonical engine key.

    The first row seen for a key wins: its display string (falling back to
    the bucket label, then the canonical key) becomes the name, and its
    bucket (or None when blank) becomes the fuel_type.

    Raises ValueError when a row has no engine_canon value.
    """
    engines: Dict[str, Dict] = {}
    next_id = 1
    for record in rows:
        key = record["engine_canon"]
        if key is None or key == "":
            raise ValueError(f"Missing engine_canon for row: {dict(record)}")
        if key not in engines:
            # Inline fallback chain (display -> bucket -> canon) keeps the
            # label non-empty.
            engines[key] = {
                "id": next_id,
                "name": record["engine_display"] or record["engine_bucket"] or key,
                "fuel_type": record["engine_bucket"] or None,
            }
            next_id += 1
    return engines
|
||||
|
||||
|
||||
def build_transmission_dimension(rows: Sequence[sqlite3.Row]) -> Dict[str, Dict]:
    """Assign a 1-based surrogate id to each distinct canonical transmission key.

    The first row seen for a key wins; the label falls back from display
    string to bucket to the canonical key so it is never empty.

    Raises ValueError when a row has no trans_canon value.
    """
    transmissions: Dict[str, Dict] = {}
    next_id = 1
    for record in rows:
        key = record["trans_canon"]
        if key is None or key == "":
            raise ValueError(f"Missing trans_canon for row: {dict(record)}")
        if key not in transmissions:
            transmissions[key] = {
                "id": next_id,
                "type": record["trans_display"] or record["trans_bucket"] or key,
            }
            next_id += 1
    return transmissions
|
||||
|
||||
|
||||
def build_vehicle_options(
    rows: Sequence[sqlite3.Row],
    engine_map: Dict[str, Dict],
    trans_map: Dict[str, Dict],
) -> List[Dict]:
    """Denormalize each observed pair row into a vehicle_options record.

    Looks up the surrogate ids produced by the dimension builders; a KeyError
    here means a canon key in the rows never reached the dimension maps.
    """
    return [
        {
            "year": int(record["year"]),
            "make": record["make"],
            "model": record["model"],
            "trim": record["trim"],
            "engine_id": engine_map[record["engine_canon"]]["id"],
            "transmission_id": trans_map[record["trans_canon"]]["id"],
        }
        for record in rows
    ]
|
||||
|
||||
|
||||
def sql_value(value):
    """Render a Python value as a SQL literal.

    None becomes NULL, strings are single-quoted with embedded quotes
    doubled, and everything else uses its str() form verbatim.
    """
    if value is None:
        return "NULL"
    if isinstance(value, str):
        escaped = value.replace("'", "''")
        return f"'{escaped}'"
    return str(value)
|
||||
|
||||
|
||||
def chunked(seq: Iterable[Dict], size: int) -> Iterable[List[Dict]]:
    """Yield consecutive lists of at most *size* items from *seq*.

    The final batch may be shorter; nothing is yielded for an empty input.
    """
    batch: List[Dict] = []
    for element in seq:
        batch.append(element)
        if len(batch) >= size:
            yield batch
            batch = []
    if batch:
        yield batch
|
||||
|
||||
|
||||
def write_insert_file(
    path: Path,
    table: str,
    columns: Sequence[str],
    rows: Iterable[Dict],
):
    """Write batched INSERT statements for *rows* into *path*.

    Fixes vs. previous version:
    - *rows* is annotated/handled as Iterable: callers pass dict.values(),
      which is not a Sequence, and `if not rows:` was only correct for
      len()-aware containers. Rows are materialized once so the emptiness
      check is reliable for any iterable.
    - Header comment uses a plain string (the f-string had no placeholders).
    - The column list is joined once instead of per batch.

    Args:
        path: Output file; parent directories are created as needed.
        table: Target table name interpolated into the INSERT.
        columns: Column names; each row dict must contain these keys.
        rows: Iterable of row dicts. When empty, a comment line is written
            instead of any INSERT.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        f.write("-- Auto-generated by etl_generate_sql.py\n")
        materialized = list(rows)
        if not materialized:
            f.write(f"-- No rows for {table}\n")
            return

        column_list = ", ".join(columns)
        for batch in chunked(materialized, BATCH_SIZE):
            values_sql = ",\n".join(
                "(" + ",".join(sql_value(row[col]) for col in columns) + ")"
                for row in batch
            )
            f.write(f"INSERT INTO {table} ({column_list}) VALUES\n{values_sql};\n\n")
|
||||
|
||||
|
||||
def main():
    """CLI entry point.

    Reads the snapshot SQLite file, builds the engine/transmission dimension
    maps and denormalized vehicle_options rows, then writes the three SQL
    import files into the output directory.
    """
    args = parse_args()
    snapshot_path: Path = args.snapshot_path
    output_dir: Path = args.output_dir
    # Default is None when neither --snapshot-path nor SNAPSHOT_PATH is set.
    if snapshot_path is None:
        raise SystemExit("Snapshot path is required. Pass --snapshot-path or set SNAPSHOT_PATH.")

    print(f"Reading snapshot: {snapshot_path}")
    rows = load_pairs(snapshot_path)
    years = sorted({int(row["year"]) for row in rows})
    print(f" Loaded {len(rows):,} observed engine<->transmission pairs across {len(years)} years")

    # Dimension builders assign 1-based ids in first-seen order; the
    # vehicle_options rows reference those ids.
    engines = build_engine_dimension(rows)
    transmissions = build_transmission_dimension(rows)
    vehicle_options = build_vehicle_options(rows, engines, transmissions)

    print(f"Engines: {len(engines):,}")
    print(f"Transmissions: {len(transmissions):,}")
    print(f"Vehicle options (observed pairs): {len(vehicle_options):,}")

    # File numbering encodes the FK-safe import order used by import_data.sh:
    # dimensions first, then the referencing fact table.
    write_insert_file(
        output_dir / "01_engines.sql",
        "engines",
        ["id", "name", "fuel_type"],
        engines.values(),
    )
    write_insert_file(
        output_dir / "02_transmissions.sql",
        "transmissions",
        ["id", "type"],
        transmissions.values(),
    )
    write_insert_file(
        output_dir / "03_vehicle_options.sql",
        "vehicle_options",
        ["year", "make", "model", "trim", "engine_id", "transmission_id"],
        vehicle_options,
    )

    print("\nSQL files generated:")
    print(f" - {output_dir / '01_engines.sql'}")
    print(f" - {output_dir / '02_transmissions.sql'}")
    print(f" - {output_dir / '03_vehicle_options.sql'}")
    # years is non-empty here: load_pairs raises on an empty pairs table.
    print(f"\nYear coverage: {years[0]}-{years[-1]}")


if __name__ == "__main__":
    main()
|
||||
71
data/vehicle-etl/import_data.sh
Executable file
71
data/vehicle-etl/import_data.sh
Executable file
@@ -0,0 +1,71 @@
|
||||
#!/bin/bash
# Offline import of generated SQL files into PostgreSQL (no network).
#
# Prerequisites:
#   - a running docker container named "mvp-postgres" with database "motovaultpro"
#   - output/*.sql files produced by etl_generate_sql.py
#   - migrations/001_create_vehicle_database.sql

set -euo pipefail

# Run from the script's own directory so the relative migration/output paths resolve.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

echo "=========================================="
echo "📥 Automotive Database Import (offline)"
echo "=========================================="
echo ""

# Abort with a message when a required input file is absent.
require_file() {
    if [ ! -f "$1" ]; then
        echo "❌ Missing required file: $1"
        exit 1
    fi
}

# Fail fast when the target container is not up — every later step shells into it.
if ! docker ps --filter "name=mvp-postgres" --format "{{.Names}}" | grep -q "mvp-postgres"; then
    echo "❌ Error: mvp-postgres container is not running"
    exit 1
fi

require_file "output/01_engines.sql"
require_file "output/02_transmissions.sql"
require_file "output/03_vehicle_options.sql"

echo "📋 Step 1: Running database schema migration..."
docker exec -i mvp-postgres psql -U postgres -d motovaultpro < migrations/001_create_vehicle_database.sql
echo "✓ Schema migration completed"
echo ""

# vehicle_options references engines/transmissions; CASCADE keeps the
# truncation safe regardless of order.
echo "🧹 Step 2: Truncating existing data..."
docker exec -i mvp-postgres psql -U postgres -d motovaultpro <<'EOF'
TRUNCATE TABLE vehicle_options RESTART IDENTITY CASCADE;
TRUNCATE TABLE engines RESTART IDENTITY CASCADE;
TRUNCATE TABLE transmissions RESTART IDENTITY CASCADE;
EOF
echo "✓ Tables truncated"
echo ""

# Import order matters: dimension tables first so vehicle_options FK ids resolve.
echo "📥 Step 3: Importing engines..."
docker exec -i mvp-postgres psql -U postgres -d motovaultpro < output/01_engines.sql
echo "✓ Engines imported"
echo ""

echo "📥 Step 4: Importing transmissions..."
docker exec -i mvp-postgres psql -U postgres -d motovaultpro < output/02_transmissions.sql
echo "✓ Transmissions imported"
echo ""

echo "📥 Step 5: Importing vehicle options (observed pairs only)..."
docker exec -i mvp-postgres psql -U postgres -d motovaultpro < output/03_vehicle_options.sql
echo "✓ Vehicle options imported"
echo ""

echo "=========================================="
echo "✅ Import completed"
echo "=========================================="
echo ""
echo "🔍 Database verification:"
docker exec mvp-postgres psql -U postgres -d motovaultpro -c "SELECT COUNT(*) as engines FROM engines;"
docker exec mvp-postgres psql -U postgres -d motovaultpro -c "SELECT COUNT(*) as transmissions FROM transmissions;"
docker exec mvp-postgres psql -U postgres -d motovaultpro -c "SELECT COUNT(*) as vehicle_options FROM vehicle_options;"
docker exec mvp-postgres psql -U postgres -d motovaultpro -c "SELECT MIN(year) as min_year, MAX(year) as max_year FROM vehicle_options;"
docker exec mvp-postgres psql -U postgres -d motovaultpro -c "SELECT DISTINCT year FROM vehicle_options ORDER BY year LIMIT 5;"
docker exec mvp-postgres psql -U postgres -d motovaultpro -c "SELECT DISTINCT year FROM vehicle_options ORDER BY year DESC LIMIT 5;"
echo ""
echo "✓ Database ready for dropdown use."
|
||||
286
data/vehicle-etl/migrations/001_create_vehicle_database.sql
Normal file
286
data/vehicle-etl/migrations/001_create_vehicle_database.sql
Normal file
@@ -0,0 +1,286 @@
|
||||
-- Migration: Create Automotive Vehicle Selection Database
-- Optimized for dropdown cascade queries
-- Date: 2025-11-10

-- Drop existing tables if they exist
-- NOTE(review): DROP TABLE ... CASCADE already removes each table's indexes,
-- so the DROP INDEX IF EXISTS lines below are redundant — harmless, but
-- candidates for removal.
DROP TABLE IF EXISTS vehicle_options CASCADE;
DROP TABLE IF EXISTS engines CASCADE;
DROP TABLE IF EXISTS transmissions CASCADE;
DROP INDEX IF EXISTS idx_vehicle_year;
DROP INDEX IF EXISTS idx_vehicle_make;
DROP INDEX IF EXISTS idx_vehicle_model;
DROP INDEX IF EXISTS idx_vehicle_trim;
DROP INDEX IF EXISTS idx_vehicle_composite;

-- Create engines table with detailed specifications
-- NOTE(review): the current ETL populates only id, name, fuel_type; the
-- remaining spec columns appear reserved for later enrichment — confirm
-- before relying on them.
CREATE TABLE engines (
    id SERIAL PRIMARY KEY,
    name VARCHAR(255) NOT NULL,
    displacement VARCHAR(50),
    configuration VARCHAR(50),
    horsepower VARCHAR(100),
    torque VARCHAR(100),
    fuel_type VARCHAR(100),
    fuel_system VARCHAR(255),
    aspiration VARCHAR(100),
    specs_json JSONB,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Prevent duplicate engine display names (case-insensitive)
CREATE UNIQUE INDEX IF NOT EXISTS uq_engines_name_lower ON engines (LOWER(name));

CREATE INDEX idx_engines_displacement ON engines(displacement);
CREATE INDEX idx_engines_config ON engines(configuration);

-- Create transmissions table
CREATE TABLE transmissions (
    id SERIAL PRIMARY KEY,
    type VARCHAR(100) NOT NULL,
    speeds VARCHAR(50),
    drive_type VARCHAR(100),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Prevent duplicate transmission display names (case-insensitive)
CREATE UNIQUE INDEX IF NOT EXISTS uq_transmissions_type_lower ON transmissions (LOWER(type));

-- NOTE(review): partially overlaps uq_transmissions_type_lower (that one is
-- on LOWER(type)); kept for case-sensitive equality/sort lookups.
CREATE INDEX idx_transmissions_type ON transmissions(type);

-- Create denormalized vehicle_options table optimized for dropdown queries
CREATE TABLE vehicle_options (
    id SERIAL PRIMARY KEY,
    year INTEGER NOT NULL,
    make VARCHAR(100) NOT NULL,
    model VARCHAR(255) NOT NULL,
    trim VARCHAR(255) NOT NULL,
    engine_id INTEGER REFERENCES engines(id) ON DELETE SET NULL,
    transmission_id INTEGER REFERENCES transmissions(id) ON DELETE SET NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Prevent duplicate vehicle option rows
-- NOTE(review): NULLs are not equal under a unique index in PostgreSQL, so
-- rows whose engine_id/transmission_id were nulled by ON DELETE SET NULL can
-- still duplicate — verify this is acceptable.
CREATE UNIQUE INDEX IF NOT EXISTS uq_vehicle_options_full ON vehicle_options (
    year, make, model, trim, engine_id, transmission_id
);

-- Indexes for cascading dropdown performance
-- NOTE(review): the single-column year/make/model/trim indexes are
-- left-prefixes of the composite indexes below and may be redundant —
-- measure before removing.
CREATE INDEX idx_vehicle_year ON vehicle_options(year);
CREATE INDEX idx_vehicle_make ON vehicle_options(make);
CREATE INDEX idx_vehicle_model ON vehicle_options(model);
CREATE INDEX idx_vehicle_trim ON vehicle_options(trim);
CREATE INDEX idx_vehicle_year_make ON vehicle_options(year, make);
CREATE INDEX idx_vehicle_year_make_model ON vehicle_options(year, make, model);
CREATE INDEX idx_vehicle_year_make_model_trim ON vehicle_options(year, make, model, trim);
CREATE INDEX idx_vehicle_year_make_model_trim_engine ON vehicle_options(year, make, model, trim, engine_id);
CREATE INDEX idx_vehicle_year_make_model_trim_trans ON vehicle_options(year, make, model, trim, transmission_id);

-- Views for dropdown queries

-- View: Get all available years
CREATE OR REPLACE VIEW available_years AS
SELECT DISTINCT year
FROM vehicle_options
ORDER BY year DESC;

-- View: Get makes by year
CREATE OR REPLACE VIEW makes_by_year AS
SELECT DISTINCT year, make
FROM vehicle_options
ORDER BY year DESC, make ASC;

-- View: Get models by year and make
CREATE OR REPLACE VIEW models_by_year_make AS
SELECT DISTINCT year, make, model
FROM vehicle_options
ORDER BY year DESC, make ASC, model ASC;

-- View: Get trims by year, make, and model
CREATE OR REPLACE VIEW trims_by_year_make_model AS
SELECT DISTINCT year, make, model, trim
FROM vehicle_options
ORDER BY year DESC, make ASC, model ASC, trim ASC;

-- View: Get complete vehicle configurations with engine and transmission details
CREATE OR REPLACE VIEW complete_vehicle_configs AS
SELECT
    vo.id,
    vo.year,
    vo.make,
    vo.model,
    vo.trim,
    e.name AS engine_name,
    e.displacement,
    e.configuration,
    e.horsepower,
    e.torque,
    e.fuel_type,
    t.type AS transmission_type,
    t.speeds AS transmission_speeds,
    t.drive_type
FROM vehicle_options vo
LEFT JOIN engines e ON vo.engine_id = e.id
LEFT JOIN transmissions t ON vo.transmission_id = t.id
ORDER BY vo.year DESC, vo.make ASC, vo.model ASC, vo.trim ASC;

-- Function to get makes for a specific year
CREATE OR REPLACE FUNCTION get_makes_for_year(p_year INTEGER)
RETURNS TABLE(make VARCHAR) AS $$
BEGIN
    RETURN QUERY
    SELECT DISTINCT vehicle_options.make
    FROM vehicle_options
    WHERE vehicle_options.year = p_year
    ORDER BY vehicle_options.make ASC;
END;
$$ LANGUAGE plpgsql;

-- Function to get models for a specific year and make
CREATE OR REPLACE FUNCTION get_models_for_year_make(p_year INTEGER, p_make VARCHAR)
RETURNS TABLE(model VARCHAR) AS $$
BEGIN
    RETURN QUERY
    SELECT DISTINCT vehicle_options.model
    FROM vehicle_options
    WHERE vehicle_options.year = p_year
      AND vehicle_options.make = p_make
    ORDER BY vehicle_options.model ASC;
END;
$$ LANGUAGE plpgsql;

-- Function to get trims for a specific year, make, and model
CREATE OR REPLACE FUNCTION get_trims_for_year_make_model(p_year INTEGER, p_make VARCHAR, p_model VARCHAR)
RETURNS TABLE(trim_name VARCHAR) AS $$
BEGIN
    RETURN QUERY
    SELECT DISTINCT vehicle_options.trim
    FROM vehicle_options
    WHERE vehicle_options.year = p_year
      AND vehicle_options.make = p_make
      AND vehicle_options.model = p_model
    ORDER BY vehicle_options.trim ASC;
END;
$$ LANGUAGE plpgsql;

-- Function to get engine and transmission options for a specific vehicle
CREATE OR REPLACE FUNCTION get_options_for_vehicle(p_year INTEGER, p_make VARCHAR, p_model VARCHAR, p_trim VARCHAR)
RETURNS TABLE(
    engine_name VARCHAR,
    engine_displacement VARCHAR,
    engine_horsepower VARCHAR,
    transmission_type VARCHAR,
    transmission_speeds VARCHAR,
    drive_type VARCHAR
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        e.name,
        e.displacement,
        e.horsepower,
        t.type,
        t.speeds,
        t.drive_type
    FROM vehicle_options vo
    LEFT JOIN engines e ON vo.engine_id = e.id
    LEFT JOIN transmissions t ON vo.transmission_id = t.id
    WHERE vo.year = p_year
      AND vo.make = p_make
      AND vo.model = p_model
      AND vo.trim = p_trim;
END;
$$ LANGUAGE plpgsql;

-- Helper functions for trim-level options and pair-safe filtering
CREATE OR REPLACE FUNCTION get_transmissions_for_vehicle(p_year INTEGER, p_make VARCHAR, p_model VARCHAR, p_trim VARCHAR)
RETURNS TABLE(
    transmission_id INTEGER,
    transmission_type VARCHAR
) AS $$
BEGIN
    RETURN QUERY
    SELECT DISTINCT
        t.id,
        t.type
    FROM vehicle_options vo
    JOIN transmissions t ON vo.transmission_id = t.id
    WHERE vo.year = p_year
      AND vo.make = p_make
      AND vo.model = p_model
      AND vo.trim = p_trim
    ORDER BY t.type ASC;
END;
$$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION get_engines_for_vehicle(p_year INTEGER, p_make VARCHAR, p_model VARCHAR, p_trim VARCHAR)
RETURNS TABLE(
    engine_id INTEGER,
    engine_name VARCHAR
) AS $$
BEGIN
    RETURN QUERY
    SELECT DISTINCT
        e.id,
        e.name
    FROM vehicle_options vo
    JOIN engines e ON vo.engine_id = e.id
    WHERE vo.year = p_year
      AND vo.make = p_make
      AND vo.model = p_model
      AND vo.trim = p_trim
    ORDER BY e.name ASC;
END;
$$ LANGUAGE plpgsql;

-- Pair-safe filter: only transmissions actually observed with the given engine.
CREATE OR REPLACE FUNCTION get_transmissions_for_vehicle_engine(p_year INTEGER, p_make VARCHAR, p_model VARCHAR, p_trim VARCHAR, p_engine_name VARCHAR)
RETURNS TABLE(
    transmission_id INTEGER,
    transmission_type VARCHAR
) AS $$
BEGIN
    RETURN QUERY
    SELECT DISTINCT
        t.id,
        t.type
    FROM vehicle_options vo
    JOIN engines e ON vo.engine_id = e.id
    JOIN transmissions t ON vo.transmission_id = t.id
    WHERE vo.year = p_year
      AND vo.make = p_make
      AND vo.model = p_model
      AND vo.trim = p_trim
      AND e.name = p_engine_name
    ORDER BY t.type ASC;
END;
$$ LANGUAGE plpgsql;

-- Pair-safe filter: only engines actually observed with the given transmission.
CREATE OR REPLACE FUNCTION get_engines_for_vehicle_trans(p_year INTEGER, p_make VARCHAR, p_model VARCHAR, p_trim VARCHAR, p_trans_type VARCHAR)
RETURNS TABLE(
    engine_id INTEGER,
    engine_name VARCHAR
) AS $$
BEGIN
    RETURN QUERY
    SELECT DISTINCT
        e.id,
        e.name
    FROM vehicle_options vo
    JOIN engines e ON vo.engine_id = e.id
    JOIN transmissions t ON vo.transmission_id = t.id
    WHERE vo.year = p_year
      AND vo.make = p_make
      AND vo.model = p_model
      AND vo.trim = p_trim
      AND t.type = p_trans_type
    ORDER BY e.name ASC;
END;
$$ LANGUAGE plpgsql;

COMMENT ON TABLE vehicle_options IS 'Denormalized table optimized for cascading dropdown queries';
COMMENT ON TABLE engines IS 'Engine specifications with detailed technical data';
COMMENT ON TABLE transmissions IS 'Transmission specifications';
COMMENT ON VIEW available_years IS 'Returns all distinct years available in the database';
COMMENT ON VIEW makes_by_year IS 'Returns makes grouped by year for dropdown population';
COMMENT ON VIEW models_by_year_make IS 'Returns models grouped by year and make';
COMMENT ON VIEW trims_by_year_make_model IS 'Returns trims grouped by year, make, and model';
COMMENT ON VIEW complete_vehicle_configs IS 'Complete vehicle configurations with all details';
|
||||
22
data/vehicle-etl/output/01_engines.sql
Normal file
22
data/vehicle-etl/output/01_engines.sql
Normal file
@@ -0,0 +1,22 @@
|
||||
-- Auto-generated by etl_generate_sql.py
-- NOTE(review): ids are explicit, so the engines.id SERIAL sequence is not
-- advanced by this import; run setval() before any sequence-based inserts.
-- NOTE(review): generic names 'Gas' (id 1) and 'Diesel' (id 10) are fallback
-- bucket labels emitted when the snapshot had no engine display string.
-- NOTE(review): id 15 ('4.0L 435 hp V8' with fuel 'Diesel') looks like a
-- bucket mismatch in the source data -- verify upstream.
INSERT INTO engines (id, name, fuel_type) VALUES
(1,'Gas','Gas'),
(2,'2.0L 150 hp I4','Gas'),
(3,'2.4L 201 hp I4','Gas'),
(4,'3.5L 290 hp V6','Gas'),
(5,'3.5L 273 hp V6','Gas'),
(6,'3.5L 310 hp V6','Gas'),
(7,'2.4L 206 hp I4','Gas'),
(8,'2.0L 220 hp I4','Gas'),
(9,'1.8L 170 hp I4','Gas'),
(10,'Diesel','Diesel'),
(11,'2.0L 150 hp I4 Diesel','Diesel'),
(12,'2.0L 220 hp I4 Flex Fuel Vehicle','Gas'),
(13,'3.0L 310 hp V6','Gas'),
(14,'3.0L 240 hp V6 Diesel','Diesel'),
(15,'4.0L 435 hp V8','Diesel'),
(16,'3.0L 333 hp V6','Gas'),
(17,'6.3L 500 hp W12','Gas'),
(18,'2.0L 200 hp I4','Gas'),
(19,'3.0L 272 hp V6','Gas');
||||
|
||||
13
data/vehicle-etl/output/02_transmissions.sql
Normal file
13
data/vehicle-etl/output/02_transmissions.sql
Normal file
@@ -0,0 +1,13 @@
|
||||
-- Auto-generated by etl_generate_sql.py
-- NOTE(review): ids are explicit, so the transmissions.id SERIAL sequence is
-- not advanced by this import; run setval() before any sequence-based inserts.
INSERT INTO transmissions (id, type) VALUES
(1,'Automatic'),
(2,'Manual'),
(3,'5-Speed Automatic'),
(4,'6-Speed Manual'),
(5,'6-Speed Automatic'),
(6,'8-Speed Dual Clutch'),
(7,'9-Speed Automatic'),
(8,'6-Speed Dual Clutch'),
(9,'8-Speed Automatic'),
(10,'Continuously Variable Transmission');
|
||||
|
||||
281
data/vehicle-etl/output/03_vehicle_options.sql
Normal file
281
data/vehicle-etl/output/03_vehicle_options.sql
Normal file
@@ -0,0 +1,281 @@
|
||||
-- Auto-generated by etl_generate_sql.py
|
||||
INSERT INTO vehicle_options (year, make, model, trim, engine_id, transmission_id) VALUES
|
||||
(2015,'Acura','ILX','2.0L',1,1),
|
||||
(2015,'Acura','ILX','2.0L',1,2),
|
||||
(2015,'Acura','ILX','2.0L FWD',2,3),
|
||||
(2015,'Acura','ILX','2.0L FWD with Premium Package',2,3),
|
||||
(2015,'Acura','ILX','2.0L FWD with Technology Package',2,3),
|
||||
(2015,'Acura','ILX','2.0L Technology',1,1),
|
||||
(2015,'Acura','ILX','2.0L Technology',1,2),
|
||||
(2015,'Acura','ILX','2.0L w/Premium Package',1,1),
|
||||
(2015,'Acura','ILX','2.0L w/Premium Package',1,2),
|
||||
(2015,'Acura','ILX','2.4L FWD with Premium Package',2,3),
|
||||
(2015,'Acura','ILX','2.4L FWD with Premium Package',3,4),
|
||||
(2015,'Acura','ILX','2.4L w/Premium Package',1,1),
|
||||
(2015,'Acura','ILX','2.4L w/Premium Package',1,2),
|
||||
(2015,'Acura','ILX','FWD with Dynamic Package',2,3),
|
||||
(2015,'Acura','MDX','3.5L',1,1),
|
||||
(2015,'Acura','MDX','3.5L',1,2),
|
||||
(2015,'Acura','MDX','3.5L Advance Pkg w/Entertainment Pkg',1,1),
|
||||
(2015,'Acura','MDX','3.5L Advance Pkg w/Entertainment Pkg',1,2),
|
||||
(2015,'Acura','MDX','3.5L Technology Package',1,1),
|
||||
(2015,'Acura','MDX','3.5L Technology Package',1,2),
|
||||
(2015,'Acura','MDX','3.5L Technology Pkg/w Entertainment Pkg',1,1),
|
||||
(2015,'Acura','MDX','3.5L Technology Pkg/w Entertainment Pkg',1,2),
|
||||
(2015,'Acura','MDX','3.5L w/Technology & Entertainment Pkgs',1,1),
|
||||
(2015,'Acura','MDX','3.5L w/Technology & Entertainment Pkgs',1,2),
|
||||
(2015,'Acura','MDX','FWD',4,5),
|
||||
(2015,'Acura','MDX','FWD with Advance and Entertainment Package',4,5),
|
||||
(2015,'Acura','MDX','FWD with Technology Package',4,5),
|
||||
(2015,'Acura','MDX','FWD with Technology and Entertainment Package',4,5),
|
||||
(2015,'Acura','MDX','SH-AWD',4,5),
|
||||
(2015,'Acura','MDX','SH-AWD with Advance and Entertainment Package',4,5),
|
||||
(2015,'Acura','MDX','SH-AWD with Elite Package',4,5),
|
||||
(2015,'Acura','MDX','SH-AWD with Navigation',4,5),
|
||||
(2015,'Acura','MDX','SH-AWD with Technology Package',4,5),
|
||||
(2015,'Acura','MDX','SH-AWD with Technology and Entertainment Package',4,5),
|
||||
(2015,'Acura','RDX','AWD',5,5),
|
||||
(2015,'Acura','RDX','AWD with Technology Package',5,5),
|
||||
(2015,'Acura','RDX','Base',1,1),
|
||||
(2015,'Acura','RDX','Base',1,2),
|
||||
(2015,'Acura','RDX','FWD',5,5),
|
||||
(2015,'Acura','RDX','FWD with Technology Package',5,5),
|
||||
(2015,'Acura','RDX','Technology Package',1,1),
|
||||
(2015,'Acura','RDX','Technology Package',1,2),
|
||||
(2015,'Acura','RLX','Advance Package',1,1),
|
||||
(2015,'Acura','RLX','Advance Package',1,2),
|
||||
(2015,'Acura','RLX','Base',1,1),
|
||||
(2015,'Acura','RLX','Base',1,2),
|
||||
(2015,'Acura','RLX','FWD',6,5),
|
||||
(2015,'Acura','RLX','FWD',1,1),
|
||||
(2015,'Acura','RLX','FWD',1,2),
|
||||
(2015,'Acura','RLX','FWD with Advance Package',6,5),
|
||||
(2015,'Acura','RLX','FWD with Elite Package',6,5),
|
||||
(2015,'Acura','RLX','FWD with Krell Audio Package',6,5),
|
||||
(2015,'Acura','RLX','FWD with Navigation',6,5),
|
||||
(2015,'Acura','RLX','FWD with Technology Package',6,5),
|
||||
(2015,'Acura','RLX','Navigation',1,1),
|
||||
(2015,'Acura','RLX','Navigation',1,2),
|
||||
(2015,'Acura','RLX','Technology Package',1,1),
|
||||
(2015,'Acura','RLX','Technology Package',1,2),
|
||||
(2015,'Acura','RLX Hybrid Sport','SH-AWD',1,1),
|
||||
(2015,'Acura','RLX Hybrid Sport','SH-AWD',1,2),
|
||||
(2015,'Acura','TLX','Base',1,1),
|
||||
(2015,'Acura','TLX','Base',1,2),
|
||||
(2015,'Acura','TLX','FWD',7,6),
|
||||
(2015,'Acura','TLX','FWD with Technology Package',7,6),
|
||||
(2015,'Acura','TLX','SH-AWD with Elite Package',4,7),
|
||||
(2015,'Acura','TLX','Tech',1,1),
|
||||
(2015,'Acura','TLX','Tech',1,2),
|
||||
(2015,'Acura','TLX','V6',1,1),
|
||||
(2015,'Acura','TLX','V6',1,2),
|
||||
(2015,'Acura','TLX','V6 Advance',1,1),
|
||||
(2015,'Acura','TLX','V6 Advance',1,2),
|
||||
(2015,'Acura','TLX','V6 FWD',4,7),
|
||||
(2015,'Acura','TLX','V6 FWD with Advance Package',4,7),
|
||||
(2015,'Acura','TLX','V6 FWD with Technology Package',4,7),
|
||||
(2015,'Acura','TLX','V6 SH-AWD',4,7),
|
||||
(2015,'Acura','TLX','V6 SH-AWD with Advance Package',4,7),
|
||||
(2015,'Acura','TLX','V6 SH-AWD with Technology Package',4,7),
|
||||
(2015,'Acura','TLX','V6 Tech',1,1),
|
||||
(2015,'Acura','TLX','V6 Tech',1,2),
|
||||
(2015,'Acura','TLX','V6 with Elite Package',4,7),
|
||||
(2015,'Audi','A3','1.8T Komfort Sedan FWD',8,8),
|
||||
(2015,'Audi','A3','1.8T Premium',1,1),
|
||||
(2015,'Audi','A3','1.8T Premium',1,2),
|
||||
(2015,'Audi','A3','1.8T Premium Cabriolet FWD',9,8),
|
||||
(2015,'Audi','A3','1.8T Premium Plus',1,1),
|
||||
(2015,'Audi','A3','1.8T Premium Plus',1,2),
|
||||
(2015,'Audi','A3','1.8T Premium Plus Cabriolet FWD',9,8),
|
||||
(2015,'Audi','A3','1.8T Premium Plus Sedan FWD',9,8),
|
||||
(2015,'Audi','A3','1.8T Premium Sedan FWD',9,8),
|
||||
(2015,'Audi','A3','1.8T Prestige',1,1),
|
||||
(2015,'Audi','A3','1.8T Prestige',1,2),
|
||||
(2015,'Audi','A3','1.8T Prestige Cabriolet FWD',9,8),
|
||||
(2015,'Audi','A3','1.8T Prestige Sedan FWD',9,8),
|
||||
(2015,'Audi','A3','1.8T Prestige Sedan FWD',8,8),
|
||||
(2015,'Audi','A3','1.8T Progressiv Sedan FWD',8,8),
|
||||
(2015,'Audi','A3','2.0 TDI Premium',10,1),
|
||||
(2015,'Audi','A3','2.0 TDI Premium',10,2),
|
||||
(2015,'Audi','A3','2.0 TDI Premium Plus',10,1),
|
||||
(2015,'Audi','A3','2.0 TDI Premium Plus',10,2),
|
||||
(2015,'Audi','A3','2.0 TDI Premium Plus Sedan FWD',11,8),
|
||||
(2015,'Audi','A3','2.0 TDI Premium Sedan FWD',11,8),
|
||||
(2015,'Audi','A3','2.0 TDI Prestige',10,1),
|
||||
(2015,'Audi','A3','2.0 TDI Prestige',10,2),
|
||||
(2015,'Audi','A3','2.0 TDI Prestige Sedan FWD',11,8),
|
||||
(2015,'Audi','A3','2.0T Premium',1,1),
|
||||
(2015,'Audi','A3','2.0T Premium',1,2),
|
||||
(2015,'Audi','A3','2.0T Premium Plus',1,1),
|
||||
(2015,'Audi','A3','2.0T Premium Plus',1,2),
|
||||
(2015,'Audi','A3','2.0T Prestige',1,1),
|
||||
(2015,'Audi','A3','2.0T Prestige',1,2),
|
||||
(2015,'Audi','A3','2.0T quattro Komfort Cabriolet AWD',8,8),
|
||||
(2015,'Audi','A3','2.0T quattro Komfort Sedan AWD',8,8),
|
||||
(2015,'Audi','A3','2.0T quattro Premium Cabriolet AWD',8,8),
|
||||
(2015,'Audi','A3','2.0T quattro Premium Plus Cabriolet AWD',8,8),
|
||||
(2015,'Audi','A3','2.0T quattro Premium Plus Sedan AWD',8,8),
|
||||
(2015,'Audi','A3','2.0T quattro Premium Sedan AWD',8,8),
|
||||
(2015,'Audi','A3','2.0T quattro Prestige Cabriolet AWD',8,8),
|
||||
(2015,'Audi','A3','2.0T quattro Prestige Sedan AWD',8,8),
|
||||
(2015,'Audi','A3','2.0T quattro Progressiv Cabriolet AWD',8,8),
|
||||
(2015,'Audi','A3','2.0T quattro Progressiv Sedan AWD',8,8),
|
||||
(2015,'Audi','A3','2.0T quattro Technik Cabriolet AWD',8,8),
|
||||
(2015,'Audi','A3','2.0T quattro Technik FWD',1,1),
|
||||
(2015,'Audi','A3','2.0T quattro Technik FWD',1,2),
|
||||
(2015,'Audi','A3','2.0T quattro Technik Sedan AWD',8,8),
|
||||
(2015,'Audi','A3','TDI Komfort Sedan FWD',8,8),
|
||||
(2015,'Audi','A3','TDI Progressiv Sedan FWD',8,8),
|
||||
(2015,'Audi','A3','TDI Technik Sedan FWD',8,8),
|
||||
(2015,'Audi','A4','2.0T FrontTrak Komfort FWD',8,4),
|
||||
(2015,'Audi','A4','2.0T FrontTrak Komfort FWD',8,9),
|
||||
(2015,'Audi','A4','2.0T Premium',1,1),
|
||||
(2015,'Audi','A4','2.0T Premium',1,2),
|
||||
(2015,'Audi','A4','2.0T Premium FWD',8,10),
|
||||
(2015,'Audi','A4','2.0T Premium Plus',1,1),
|
||||
(2015,'Audi','A4','2.0T Premium Plus',1,2),
|
||||
(2015,'Audi','A4','2.0T Premium Plus FWD',8,10),
|
||||
(2015,'Audi','A4','2.0T Premium Plus Sedan FWD',8,10),
|
||||
(2015,'Audi','A4','2.0T Premium Sedan FWD',8,10),
|
||||
(2015,'Audi','A4','2.0T Prestige',1,1),
|
||||
(2015,'Audi','A4','2.0T Prestige',1,2),
|
||||
(2015,'Audi','A4','2.0T Prestige FWD',8,4),
|
||||
(2015,'Audi','A4','2.0T Prestige FWD',8,9),
|
||||
(2015,'Audi','A4','2.0T Prestige Sedan FWD',8,10),
|
||||
(2015,'Audi','A4','2.0T quattro Komfort AWD',8,4),
|
||||
(2015,'Audi','A4','2.0T quattro Komfort AWD',8,9),
|
||||
(2015,'Audi','A4','2.0T quattro Premium AWD',8,4),
|
||||
(2015,'Audi','A4','2.0T quattro Premium AWD',8,9),
|
||||
(2015,'Audi','A4','2.0T quattro Premium Plus AWD',8,4),
|
||||
(2015,'Audi','A4','2.0T quattro Premium Plus AWD',8,9),
|
||||
(2015,'Audi','A4','2.0T quattro Premium Plus Sedan AWD',8,4),
|
||||
(2015,'Audi','A4','2.0T quattro Premium Plus Sedan AWD',8,9),
|
||||
(2015,'Audi','A4','2.0T quattro Premium Sedan AWD',8,4),
|
||||
(2015,'Audi','A4','2.0T quattro Premium Sedan AWD',8,9),
|
||||
(2015,'Audi','A4','2.0T quattro Prestige AWD',8,4),
|
||||
(2015,'Audi','A4','2.0T quattro Prestige AWD',8,9),
|
||||
(2015,'Audi','A4','2.0T quattro Prestige Sedan AWD',8,4),
|
||||
(2015,'Audi','A4','2.0T quattro Prestige Sedan AWD',8,9),
|
||||
(2015,'Audi','A4','2.0T quattro Progressiv AWD',8,4),
|
||||
(2015,'Audi','A4','2.0T quattro Progressiv AWD',8,9),
|
||||
(2015,'Audi','A4','2.0T quattro Technik AWD',8,4),
|
||||
(2015,'Audi','A4','2.0T quattro Technik AWD',8,9),
|
||||
(2015,'Audi','A4 Allroad','2.0T quattro Komfort AWD',12,9),
|
||||
(2015,'Audi','A4 Allroad','2.0T quattro Premium AWD',8,9),
|
||||
(2015,'Audi','A4 Allroad','2.0T quattro Premium AWD',12,9),
|
||||
(2015,'Audi','A4 Allroad','2.0T quattro Premium Plus AWD',8,9),
|
||||
(2015,'Audi','A4 Allroad','2.0T quattro Premium Plus AWD',12,9),
|
||||
(2015,'Audi','A4 Allroad','2.0T quattro Prestige AWD',8,9),
|
||||
(2015,'Audi','A4 Allroad','2.0T quattro Prestige AWD',12,9),
|
||||
(2015,'Audi','A4 Allroad','2.0T quattro Progressiv AWD',12,9),
|
||||
(2015,'Audi','A4 Allroad','2.0T quattro Technik AWD',12,9),
|
||||
(2015,'Audi','A5','2.0T Premium',1,1),
|
||||
(2015,'Audi','A5','2.0T Premium',1,2),
|
||||
(2015,'Audi','A5','2.0T Premium Plus',1,1),
|
||||
(2015,'Audi','A5','2.0T Premium Plus',1,2),
|
||||
(2015,'Audi','A5','2.0T quattro Komfort Coupe AWD',8,4),
|
||||
(2015,'Audi','A5','2.0T quattro Komfort Coupe AWD',8,9),
|
||||
(2015,'Audi','A5','2.0T quattro Premium Cabriolet AWD',8,9),
|
||||
(2015,'Audi','A5','2.0T quattro Premium Cabriolet AWD',12,9),
|
||||
(2015,'Audi','A5','2.0T quattro Premium Coupe AWD',8,4),
|
||||
(2015,'Audi','A5','2.0T quattro Premium Coupe AWD',8,9),
|
||||
(2015,'Audi','A5','2.0T quattro Premium Plus Cabriolet AWD',8,9),
|
||||
(2015,'Audi','A5','2.0T quattro Premium Plus Cabriolet AWD',12,9),
|
||||
(2015,'Audi','A5','2.0T quattro Premium Plus Coupe AWD',8,4),
|
||||
(2015,'Audi','A5','2.0T quattro Premium Plus Coupe AWD',8,9),
|
||||
(2015,'Audi','A5','2.0T quattro Prestige Cabriolet AWD',8,4),
|
||||
(2015,'Audi','A5','2.0T quattro Prestige Cabriolet AWD',8,9),
|
||||
(2015,'Audi','A5','2.0T quattro Prestige Coupe AWD',8,4),
|
||||
(2015,'Audi','A5','2.0T quattro Prestige Coupe AWD',8,9),
|
||||
(2015,'Audi','A5','2.0T quattro Progressiv Cabriolet AWD',8,4),
|
||||
(2015,'Audi','A5','2.0T quattro Progressiv Cabriolet AWD',8,9),
|
||||
(2015,'Audi','A5','2.0T quattro Progressiv Coupe AWD',8,4),
|
||||
(2015,'Audi','A5','2.0T quattro Progressiv Coupe AWD',8,9),
|
||||
(2015,'Audi','A5','2.0T quattro Progressiv Coupe AWD',1,1),
|
||||
(2015,'Audi','A5','2.0T quattro Progressiv Coupe AWD',1,2),
|
||||
(2015,'Audi','A5','2.0T quattro Technik Cabriolet AWD',8,4),
|
||||
(2015,'Audi','A5','2.0T quattro Technik Cabriolet AWD',8,9),
|
||||
(2015,'Audi','A5','2.0T quattro Technik Coupe AWD',8,4),
|
||||
(2015,'Audi','A5','2.0T quattro Technik Coupe AWD',8,9),
|
||||
(2015,'Audi','A6','2.0T Premium',1,1),
|
||||
(2015,'Audi','A6','2.0T Premium',1,2),
|
||||
(2015,'Audi','A6','2.0T Premium Plus Sedan FWD',8,10),
|
||||
(2015,'Audi','A6','2.0T Premium Sedan FWD',8,10),
|
||||
(2015,'Audi','A6','2.0T Premium Sedan FWD',13,9),
|
||||
(2015,'Audi','A6','2.0T quattro Premium Plus Sedan AWD',8,9),
|
||||
(2015,'Audi','A6','2.0T quattro Premium Sedan AWD',8,9),
|
||||
(2015,'Audi','A6','2.0T quattro Progressiv Sedan AWD',13,9),
|
||||
(2015,'Audi','A6','2.0T quattro Technik Sedan AWD',13,9),
|
||||
(2015,'Audi','A6','3.0 TDI Premium Plus',10,1),
|
||||
(2015,'Audi','A6','3.0 TDI Premium Plus',10,2),
|
||||
(2015,'Audi','A6','3.0 TDI quattro Premium Plus Sedan AWD',14,9),
|
||||
(2015,'Audi','A6','3.0 TDI quattro Prestige Sedan AWD',14,9),
|
||||
(2015,'Audi','A6','3.0 TDI quattro Progressiv Sedan AWD',13,9),
|
||||
(2015,'Audi','A6','3.0 TDI quattro Technik Sedan AWD',13,9),
|
||||
(2015,'Audi','A6','3.0T Premium Plus',1,1),
|
||||
(2015,'Audi','A6','3.0T Premium Plus',1,2),
|
||||
(2015,'Audi','A6','3.0T Prestige',1,1),
|
||||
(2015,'Audi','A6','3.0T Prestige',1,2),
|
||||
(2015,'Audi','A6','3.0T quattro Premium Plus Sedan AWD',13,9),
|
||||
(2015,'Audi','A6','3.0T quattro Prestige Sedan AWD',13,9),
|
||||
(2015,'Audi','A6','3.0T quattro Progressiv Sedan AWD',13,9),
|
||||
(2015,'Audi','A6','3.0T quattro Technik Sedan AWD',13,9),
|
||||
(2015,'Audi','A7','3.0 TDI Premium Plus',10,1),
|
||||
(2015,'Audi','A7','3.0 TDI Premium Plus',10,2),
|
||||
(2015,'Audi','A7','3.0 TDI quattro Premium Plus AWD',14,9),
|
||||
(2015,'Audi','A7','3.0 TDI quattro Premium Plus AWD',13,9),
|
||||
(2015,'Audi','A7','3.0 TDI quattro Prestige AWD',14,9),
|
||||
(2015,'Audi','A7','3.0 TDI quattro Progressiv AWD',13,9),
|
||||
(2015,'Audi','A7','3.0 TDI quattro Technik AWD',13,9),
|
||||
(2015,'Audi','A7','3.0T Premium Plus',1,1),
|
||||
(2015,'Audi','A7','3.0T Premium Plus',1,2),
|
||||
(2015,'Audi','A7','3.0T Prestige',1,1),
|
||||
(2015,'Audi','A7','3.0T Prestige',1,2),
|
||||
(2015,'Audi','A7','3.0T quattro Premium Plus AWD',13,9),
|
||||
(2015,'Audi','A7','3.0T quattro Prestige AWD',13,9),
|
||||
(2015,'Audi','A7','3.0T quattro Progressiv AWD',13,9),
|
||||
(2015,'Audi','A7','3.0T quattro Technik AWD',13,9),
|
||||
(2015,'Audi','A8','3.0 TDI quattro AWD',15,9),
|
||||
(2015,'Audi','A8','3.0T',1,1),
|
||||
(2015,'Audi','A8','3.0T',1,2),
|
||||
(2015,'Audi','A8','3.0T quattro AWD',16,9),
|
||||
(2015,'Audi','A8','4.0T',1,1),
|
||||
(2015,'Audi','A8','4.0T',1,2),
|
||||
(2015,'Audi','A8','4.0T quattro AWD',15,9),
|
||||
(2015,'Audi','A8','L 3.0 TDI',10,1),
|
||||
(2015,'Audi','A8','L 3.0 TDI',10,2),
|
||||
(2015,'Audi','A8','L 3.0 TDI quattro AWD',14,9),
|
||||
(2015,'Audi','A8','L 3.0T',1,1),
|
||||
(2015,'Audi','A8','L 3.0T',1,2),
|
||||
(2015,'Audi','A8','L 3.0T quattro AWD',16,9),
|
||||
(2015,'Audi','A8','L 4.0T',1,1),
|
||||
(2015,'Audi','A8','L 4.0T',1,2),
|
||||
(2015,'Audi','A8','L 4.0T quattro AWD',15,9),
|
||||
(2015,'Audi','A8','L W12 6.3',1,1),
|
||||
(2015,'Audi','A8','L W12 6.3',1,2),
|
||||
(2015,'Audi','A8','L W12 quattro AWD',15,9),
|
||||
(2015,'Audi','A8','L W12 quattro AWD',17,9),
|
||||
(2015,'Audi','Q3','2.0T Premium Plus',1,1),
|
||||
(2015,'Audi','Q3','2.0T Premium Plus',1,2),
|
||||
(2015,'Audi','Q3','2.0T Premium Plus FWD',18,5),
|
||||
(2015,'Audi','Q3','2.0T Prestige',1,1),
|
||||
(2015,'Audi','Q3','2.0T Prestige',1,2),
|
||||
(2015,'Audi','Q3','2.0T Prestige FWD',18,5),
|
||||
(2015,'Audi','Q3','2.0T Progressiv FWD',18,5),
|
||||
(2015,'Audi','Q3','2.0T Technik FWD',18,5),
|
||||
(2015,'Audi','Q3','2.0T quattro Premium Plus AWD',18,5),
|
||||
(2015,'Audi','Q3','2.0T quattro Prestige AWD',18,5),
|
||||
(2015,'Audi','Q3','3.0T quattro Progressiv AWD',18,5),
|
||||
(2015,'Audi','Q3','3.0T quattro Technik AWD',18,5),
|
||||
(2015,'Audi','Q5','2.0T Premium',1,1),
|
||||
(2015,'Audi','Q5','2.0T Premium',1,2),
|
||||
(2015,'Audi','Q5','2.0T Premium Plus',1,1),
|
||||
(2015,'Audi','Q5','2.0T Premium Plus',1,2),
|
||||
(2015,'Audi','Q5','2.0T quattro Komfort AWD',19,9),
|
||||
(2015,'Audi','allroad','2.0T Premium',1,1),
|
||||
(2015,'Audi','allroad','2.0T Premium',1,2),
|
||||
(2015,'Audi','allroad','2.0T Premium Plus',1,1),
|
||||
(2015,'Audi','allroad','2.0T Premium Plus',1,2),
|
||||
(2015,'Audi','allroad','2.0T Prestige',1,1),
|
||||
(2015,'Audi','allroad','2.0T Prestige',1,2);
|
||||
|
||||
190
data/vehicle-etl/qa_validate.py
Executable file
190
data/vehicle-etl/qa_validate.py
Executable file
@@ -0,0 +1,190 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Post-import QA validation for vehicle dropdown data.
|
||||
Runs basic duplicate and range checks against the motovaultpro Postgres container.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
def run_psql(query: str) -> str:
    """Execute *query* with psql inside the mvp-postgres container.

    Uses ``-At`` (unaligned, tuples-only) so results come back as bare
    values, one row per line. Returns raw stdout; raises
    ``subprocess.CalledProcessError`` on a non-zero exit.
    """
    docker_invocation = [
        "docker", "exec", "mvp-postgres",
        "psql", "-U", "postgres", "-d", "motovaultpro",
        "-At", "-c", query,
    ]
    return subprocess.check_output(docker_invocation, text=True)
|
||||
|
||||
|
||||
def check_container():
    """Exit with status 1 unless Docker is reachable and mvp-postgres is running."""
    # First make sure the docker CLI itself works at all.
    try:
        subprocess.check_output(["docker", "ps"], text=True)
    except Exception:
        print("❌ Docker not available.")
        sys.exit(1)

    # Then look specifically for the database container by name.
    try:
        running = subprocess.check_output(
            ["docker", "ps", "--filter", "name=mvp-postgres", "--format", "{{.Names}}"],
            text=True,
        ).strip()
    except Exception as exc:
        print(f"❌ Failed to check containers: {exc}")
        sys.exit(1)
    # SystemExit is not an Exception subclass, so exiting here (outside the
    # try) is equivalent to the original exit from inside it.
    if not running:
        print("❌ mvp-postgres container not running.")
        sys.exit(1)
|
||||
|
||||
def check_invalid_combinations():
    """Verify known-impossible year/make/model/trim combinations do not exist.

    Returns a list of human-readable issue strings; empty when the data is
    clean. The combos below are hand-curated historical facts used as canary
    checks on import quality.
    """
    invalid_combos = [
        (1992, "Chevrolet", "Corvette", "Z06"),  # Z06 started 2001
        (2000, "Chevrolet", "Corvette", "35th Anniversary Edition"),  # Was 1988
        (2000, "Chevrolet", "Corvette", "Stingray"),  # Stingray started 2014
        (1995, "Ford", "Mustang", "Mach-E"),  # Mach-E is 2021+
        (2020, "Tesla", "Cybertruck", "Base"),  # Not in production until later
    ]

    issues = []
    for year, make, model, trim in invalid_combos:
        # Double any single quotes so the values embed safely in the SQL
        # string literals (e.g. a trim like "King's Ranch" would otherwise
        # break the statement).
        make_sql, model_sql, trim_sql = (
            value.replace("'", "''") for value in (make, model, trim)
        )
        query = f"""
        SELECT COUNT(*) FROM vehicle_options
        WHERE year = {year}
          AND make = '{make_sql}'
          AND model = '{model_sql}'
          AND trim = '{trim_sql}'
        """
        count = int(run_psql(query).strip())
        if count > 0:
            issues.append(f"Invalid combo found: {year} {make} {model} {trim}")

    return issues
|
||||
|
||||
def check_trim_coverage():
    """Print a one-line summary of trim coverage per (year, make, model)."""
    coverage_query = """
    SELECT
        COUNT(DISTINCT (year, make, model)) as total_models,
        COUNT(DISTINCT (year, make, model)) FILTER (WHERE trim = 'Base') as base_only,
        COUNT(DISTINCT (year, make, model)) FILTER (WHERE trim != 'Base') as has_specific_trims
    FROM vehicle_options
    """
    stats = run_psql(coverage_query).strip()
    print(f"Trim coverage (total/base_only/has_specific_trims): {stats}")
|
||||
|
||||
|
||||
def main():
    """Run all QA checks against the imported vehicle data and print a summary.

    Exits with status 1 only on infrastructure failures (container missing,
    query error); data-quality issues are reported but do not change the exit
    code.
    """
    check_container()

    print("🔍 Running QA checks...\n")

    # Each entry is a scalar-returning SQL check. All of the duplicate/gap
    # style checks are expected to return "0" on healthy data.
    queries = {
        "engine_duplicate_names": """
            SELECT COUNT(*) FROM (
                SELECT LOWER(name) as n, COUNT(*) c
                FROM engines
                GROUP BY 1 HAVING COUNT(*) > 1
            ) t;
        """,
        "transmission_duplicate_types": """
            SELECT COUNT(*) FROM (
                SELECT LOWER(type) as t, COUNT(*) c
                FROM transmissions
                GROUP BY 1 HAVING COUNT(*) > 1
            ) t;
        """,
        "vehicle_option_duplicates": """
            SELECT COUNT(*) FROM (
                SELECT year, make, model, trim, engine_id, transmission_id, COUNT(*) c
                FROM vehicle_options
                GROUP BY 1,2,3,4,5,6 HAVING COUNT(*) > 1
            ) t;
        """,
        # Informational only: prints the observed min/max year.
        "year_range": """
            SELECT MIN(year) || ' - ' || MAX(year) FROM vehicle_options;
        """,
        "year_range_valid": """
            SELECT COUNT(*) FROM (
                SELECT 1 FROM vehicle_options WHERE year < 2015 OR year > 2022 LIMIT 1
            ) t;
        """,
        # Informational only: row counts per table.
        "counts": """
            SELECT
                (SELECT COUNT(*) FROM engines) AS engines,
                (SELECT COUNT(*) FROM transmissions) AS transmissions,
                (SELECT COUNT(*) FROM vehicle_options) AS vehicle_options;
        """,
        # Finds (engine, transmission) combos implied by the independent
        # per-trim engine and transmission lists but absent from
        # vehicle_options -- i.e. pairs a naive cross-join UI could offer
        # even though they were never observed together.
        "cross_join_gaps": """
            SELECT COUNT(*) FROM (
                SELECT base.year, base.make, base.model, base.trim, e.engine_id, t.transmission_id
                FROM (
                    SELECT DISTINCT year, make, model, trim FROM vehicle_options
                ) base
                JOIN (
                    SELECT DISTINCT year, make, model, trim, engine_id FROM vehicle_options
                ) e ON base.year = e.year AND base.make = e.make AND base.model = e.model AND base.trim = e.trim
                JOIN (
                    SELECT DISTINCT year, make, model, trim, transmission_id FROM vehicle_options
                ) t ON base.year = t.year AND base.make = t.make AND base.model = t.model AND base.trim = t.trim
                EXCEPT
                SELECT year, make, model, trim, engine_id, transmission_id FROM vehicle_options
            ) gap;
        """,
    }

    # Run every query up front; a failing query is an infrastructure error.
    results = {}
    for key, query in queries.items():
        try:
            results[key] = run_psql(query).strip()
        except subprocess.CalledProcessError as exc:
            print(f"❌ Query failed ({key}): {exc}")
            sys.exit(1)

    issues_found = False

    print(f"Engine duplicate names: {results['engine_duplicate_names']}")
    print(f"Transmission duplicate types: {results['transmission_duplicate_types']}")
    print(f"Vehicle option duplicates: {results['vehicle_option_duplicates']}")
    print(f"Year range: {results['year_range']}")
    print(f"Out-of-range years (should be 0): {results['year_range_valid']}")
    print(f"Counts (engines, transmissions, vehicle_options): {results['counts']}")
    print(f"Cross-join gaps (should be 0 to avoid impossible pairs): {results['cross_join_gaps']}")

    # Any non-zero duplicate/gap count marks the run as failed (year_range
    # and counts are informational and intentionally excluded).
    if (
        results["engine_duplicate_names"] != "0"
        or results["transmission_duplicate_types"] != "0"
        or results["vehicle_option_duplicates"] != "0"
        or results["year_range_valid"] != "0"
        or results["cross_join_gaps"] != "0"
    ):
        issues_found = True

    invalids = check_invalid_combinations()
    if invalids:
        issues_found = True
        print("\n❌ Invalid combinations detected:")
        for issue in invalids:
            print(f"  - {issue}")
    else:
        print("\n✅ No known invalid year/make/model/trim combos found.")

    check_trim_coverage()

    if not issues_found:
        print("\n✅ QA checks passed.")
    else:
        print("\n❌ QA checks found issues.")


if __name__ == "__main__":
    main()
|
||||
0
data/vehicle-etl/snapshots/.gitkeep
Normal file
0
data/vehicle-etl/snapshots/.gitkeep
Normal file
BIN
data/vehicle-etl/snapshots/2025-12-15/snapshot.sqlite
Normal file
BIN
data/vehicle-etl/snapshots/2025-12-15/snapshot.sqlite
Normal file
Binary file not shown.
BIN
data/vehicle-etl/snapshots/2025-12-15/snapshot.sqlite-shm
Normal file
BIN
data/vehicle-etl/snapshots/2025-12-15/snapshot.sqlite-shm
Normal file
Binary file not shown.
BIN
data/vehicle-etl/snapshots/2025-12-15/snapshot.sqlite-wal
Normal file
BIN
data/vehicle-etl/snapshots/2025-12-15/snapshot.sqlite-wal
Normal file
Binary file not shown.
53
data/vehicle-etl/source-makes.txt
Normal file
53
data/vehicle-etl/source-makes.txt
Normal file
@@ -0,0 +1,53 @@
|
||||
acura
|
||||
alfa_romeo
|
||||
aston_martin
|
||||
audi
|
||||
bentley
|
||||
bmw
|
||||
buick
|
||||
cadillac
|
||||
chevrolet
|
||||
chrysler
|
||||
dodge
|
||||
ferrari
|
||||
fiat
|
||||
ford
|
||||
genesis
|
||||
gmc
|
||||
honda
|
||||
hummer
|
||||
hyundai
|
||||
infiniti
|
||||
isuzu
|
||||
jaguar
|
||||
jeep
|
||||
kia
|
||||
lamborghini
|
||||
land_rover
|
||||
lexus
|
||||
lincoln
|
||||
lotus
|
||||
lucid
|
||||
maserati
|
||||
mazda
|
||||
mclaren
|
||||
mercury
|
||||
mini
|
||||
mitsubishi
|
||||
nissan
|
||||
oldsmobile
|
||||
plymouth
|
||||
polestar
|
||||
pontiac
|
||||
porsche
|
||||
ram
|
||||
rivian
|
||||
rolls_royce
|
||||
saab
|
||||
scion
|
||||
smart
|
||||
subaru
|
||||
tesla
|
||||
toyota
|
||||
volkswagen
|
||||
volvo
|
||||
1
data/vehicle-etl/vehapi.key
Normal file
1
data/vehicle-etl/vehapi.key
Normal file
@@ -0,0 +1 @@
|
||||
N9ZTsICa0gprFoXxgYRK6UApGCIVLeJlu0XR0leN
|
||||
515
data/vehicle-etl/vehapi_fetch_snapshot.py
Normal file
515
data/vehicle-etl/vehapi_fetch_snapshot.py
Normal file
@@ -0,0 +1,515 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fetches VehAPI data into an offline snapshot (SQLite + meta.json).
|
||||
|
||||
Workflow:
|
||||
1. Walks Year -> Make -> Model -> Trim -> Transmission -> Engine using VehAPI.
|
||||
2. Persists observed compatibility pairs to snapshot.sqlite (no Cartesian products).
|
||||
3. Stores request/response cache for resume; obeys rate limits and 429 retry-after.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import random
|
||||
import sqlite3
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Sequence
|
||||
from urllib.parse import quote
|
||||
|
||||
try:
|
||||
import requests
|
||||
except ImportError: # pragma: no cover - env guard
|
||||
print("[error] Missing dependency 'requests'. Install with `pip install requests`.", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
SCRIPT_VERSION = "vehapi_fetch_snapshot.py@1.1.0"
# Inclusive model-year window fetched by default (overridable via CLI/env).
DEFAULT_MIN_YEAR = 2015
DEFAULT_MAX_YEAR = 2022
DEFAULT_RATE_PER_SEC = 55  # stays under the 60 req/sec ceiling
MAX_ATTEMPTS = 5  # per-request retry budget (network errors, 429, 5xx)
# Fallbacks used when the API returns no trims / no transmissions for a vehicle.
FALLBACK_TRIMS = ["Base"]
FALLBACK_TRANSMISSIONS = ["Manual", "Automatic"]
DEFAULT_BASE_URL = "https://vehapi.com/api/v1/car-lists/get/car"
|
||||
|
||||
|
||||
def canonicalize(value: str) -> str:
    """Build a dedupe key from *value*: trimmed, lowercased, with runs of
    whitespace collapsed to a single space and all hyphen variants folded
    to a plain '-'. None/empty input yields ''."""
    import re

    raw = (value or "").strip()
    collapsed = re.sub(r"[\s\u00A0]+", " ", raw)
    return re.sub(r"[-\u2010-\u2015]+", "-", collapsed).lower()
|
||||
|
||||
|
||||
def infer_trans_bucket(trans_str: str) -> str:
    """Bucket a free-form transmission string into 'Manual' or 'Automatic'.

    Anything without a recognizable manual marker (including empty input)
    defaults to 'Automatic'.
    """
    haystack = (trans_str or "").lower()
    manual_markers = ("manual", "mt", "m/t")
    return "Manual" if any(m in haystack for m in manual_markers) else "Automatic"
|
||||
|
||||
|
||||
def infer_fuel_bucket(engine_str: str, trans_str: str, trim_str: str) -> str:
    """Classify fuel type from free-form engine/transmission/trim text.

    Returns one of "Electric", "Hybrid", "Diesel", or "Gas" (the default).

    Short, ambiguous markers ("ev", "hev", "phev", "motor") are matched as
    whole words rather than raw substrings so trims like "Everest" or names
    containing "motorsport" are not misclassified; distinctive multi-letter
    markers ("electric", "tdi", "plug-in", "power stroke", ...) remain
    substring matches as before.
    """
    import re

    target = " ".join([engine_str or "", trans_str or "", trim_str or ""]).lower()
    # Whole-word token set for the ambiguous short markers.
    words = set(re.findall(r"[a-z0-9]+", target))

    if ("electric" in target or "battery" in target or "kwh" in target
            or words & {"ev", "motor"}):
        return "Electric"
    if "hybrid" in target or "plug-in" in target or words & {"phev", "hev"}:
        return "Hybrid"
    if any(marker in target for marker in
           ("diesel", "tdi", "dci", "duramax", "power stroke", "cummins")):
        return "Diesel"
    return "Gas"
|
||||
|
||||
|
||||
def read_text_file(path: Path) -> str:
    """Return the entire contents of *path* decoded as UTF-8."""
    return path.read_text(encoding="utf-8")
|
||||
|
||||
|
||||
def read_lines(path: Path) -> List[str]:
    """Read *path* and return its non-blank lines, each stripped of whitespace."""
    stripped = (line.strip() for line in read_text_file(path).splitlines())
    return [line for line in stripped if line]
|
||||
|
||||
|
||||
def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    Reads in 8 KiB chunks so arbitrarily large files never need to fit
    in memory.
    """
    digest = hashlib.sha256()
    with path.open("rb") as fh:
        while chunk := fh.read(8192):
            digest.update(chunk)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def ensure_snapshot_dir(root: Path, custom_dir: Optional[str]) -> Path:
    """Resolve and create the snapshot directory.

    An explicit *custom_dir* wins; otherwise a directory named after
    today's UTC date is created under *root*. Parents are created as
    needed and an existing directory is reused.
    """
    if custom_dir:
        target = Path(custom_dir)
    else:
        target = root / datetime.now(timezone.utc).date().isoformat()
    target.mkdir(parents=True, exist_ok=True)
    return target
|
||||
|
||||
|
||||
class RateLimiter:
    """Sliding one-second-window limiter used to stay below the VehAPI threshold."""

    def __init__(self, max_per_sec: int) -> None:
        self.max_per_sec = max_per_sec
        # Monotonic timestamps of requests issued in (roughly) the last second.
        self._history: List[float] = []

    def acquire(self) -> None:
        """Block until another request may be issued, then record its timestamp."""
        while True:
            now = time.monotonic()
            cutoff = now - 1
            # Discard timestamps that have aged out of the 1-second window.
            self._history = [stamp for stamp in self._history if stamp >= cutoff]
            if len(self._history) < self.max_per_sec:
                break
            # Sleep just long enough for the oldest in-window entry to expire.
            time.sleep(max(self._history[0] - cutoff, 0.001))
        self._history.append(time.monotonic())
|
||||
|
||||
|
||||
@dataclass
class FetchCounts:
    """Running totals reported at the end of a fetch run."""

    pairs_inserted: int = 0  # new rows added to the pairs table
    cache_hits: int = 0  # requests served from the responses cache
    fallback_transmissions: int = 0  # trims where FALLBACK_TRANSMISSIONS was substituted
    fallback_engines: int = 0  # pairs where a fuel bucket stood in for a missing engine
|
||||
|
||||
|
||||
class VehapiFetcher:
    """Walks VehAPI Year -> Make -> Model -> Trim -> Transmission -> Engine and
    persists each observed (engine, transmission) compatibility pair to SQLite.

    Only combinations actually returned by the API (or explicit fallbacks) are
    stored -- no Cartesian products. Raw responses are cached in the same
    database (`responses` table), so an interrupted run can resume without
    re-fetching, and a RateLimiter keeps requests under the service ceiling.
    """

    def __init__(
        self,
        session: requests.Session,
        base_url: str,
        token: str,
        min_year: int,
        max_year: int,
        allowed_makes: Sequence[str],
        snapshot_path: Path,
        responses_cache: bool = True,
        rate_per_sec: int = DEFAULT_RATE_PER_SEC,
    ) -> None:
        self.session = session
        self.base_url = base_url.rstrip("/")
        self.token = token
        self.min_year = min_year
        self.max_year = max_year
        # Canonical name -> original spelling; used to filter API makes.
        self.allowed_makes = {canonicalize(m): m for m in allowed_makes}
        self.snapshot_path = snapshot_path
        self.conn = sqlite3.connect(self.snapshot_path)
        # WAL + synchronous=NORMAL trades a little durability for write speed.
        self.conn.execute("PRAGMA journal_mode=WAL;")
        self.conn.execute("PRAGMA synchronous=NORMAL;")
        self._init_schema()
        self.responses_cache = responses_cache
        self.rate_limiter = RateLimiter(rate_per_sec)
        self.counts = FetchCounts()

    def _init_schema(self) -> None:
        """Create the pairs/meta/responses tables if they do not already exist."""
        # Observed compatibility pairs; the canonical columns form the dedupe key.
        self.conn.execute(
            """
            CREATE TABLE IF NOT EXISTS pairs(
                year INT,
                make TEXT,
                model TEXT,
                trim TEXT,
                engine_display TEXT,
                engine_canon TEXT,
                engine_bucket TEXT,
                trans_display TEXT,
                trans_canon TEXT,
                trans_bucket TEXT,
                PRIMARY KEY(year, make, model, trim, engine_canon, trans_canon)
            )
            """
        )
        # Free-form run metadata (key/value strings).
        self.conn.execute(
            """
            CREATE TABLE IF NOT EXISTS meta(
                key TEXT PRIMARY KEY,
                value TEXT
            )
            """
        )
        # Raw response cache keyed by the request path, enabling resume.
        self.conn.execute(
            """
            CREATE TABLE IF NOT EXISTS responses(
                request_key TEXT PRIMARY KEY,
                url TEXT,
                status INT,
                headers_json TEXT,
                body_json TEXT,
                fetched_at TEXT
            )
            """
        )
        self.conn.commit()

    def _store_meta(self, meta: Dict[str, Any]) -> None:
        """Upsert run metadata; all values are stringified before storage."""
        rows = [(k, str(v)) for k, v in meta.items()]
        self.conn.executemany("INSERT OR REPLACE INTO meta(key, value) VALUES (?, ?)", rows)
        self.conn.commit()

    def _load_cached_response(self, request_key: str) -> Optional[Any]:
        """Return the cached JSON body for *request_key*, or None on a miss.

        A cache row that fails to parse as JSON also returns None (forcing a
        live re-fetch), but still counts as a cache hit in the stats.
        """
        if not self.responses_cache:
            return None
        cur = self.conn.execute("SELECT body_json FROM responses WHERE request_key = ?", (request_key,))
        row = cur.fetchone()
        if not row:
            return None
        self.counts.cache_hits += 1
        try:
            return json.loads(row[0])
        except Exception:
            # Corrupt cache entry: fall through to a live fetch.
            return None

    def _save_response(self, request_key: str, url: str, status: int, headers: Dict[str, Any], body: Any) -> None:
        """Persist a successful response into the cache (replacing any old row)."""
        self.conn.execute(
            """
            INSERT OR REPLACE INTO responses(request_key, url, status, headers_json, body_json, fetched_at)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            (
                request_key,
                url,
                status,
                # default=str lets non-JSON-native header/body values serialize.
                json.dumps(dict(headers), default=str),
                json.dumps(body, default=str),
                datetime.now(timezone.utc).isoformat(),
            ),
        )
        self.conn.commit()

    def _request_json(self, path_parts: Sequence[str], label: str) -> Any:
        """GET ``base_url/<path_parts>`` as JSON with caching, rate limiting and retries.

        Retries (up to MAX_ATTEMPTS, with jittered exponential backoff) on
        network errors, 429 (honoring Retry-After when parseable) and 5xx.
        Returns the parsed body on success; [] on any terminal failure
        (other non-OK status, non-JSON body, or retries exhausted).
        """
        path_parts = [str(p) for p in path_parts]
        request_key = "/".join(path_parts)
        cached = self._load_cached_response(request_key)
        if cached is not None:
            return cached

        # Each path segment is percent-encoded (safe="") so slashes/spaces in
        # names cannot change the URL structure.
        url = f"{self.base_url}/" + "/".join(quote(p, safe="") for p in path_parts)
        attempts = 0
        backoff = 1.0
        while attempts < MAX_ATTEMPTS:
            attempts += 1
            self.rate_limiter.acquire()
            try:
                resp = self.session.get(url, headers={"Authorization": f"Bearer {self.token}"}, timeout=30)
            except requests.RequestException as exc:
                print(f"[warn] {label}: request error {exc}; retrying...", file=sys.stderr)
                time.sleep(backoff + random.uniform(0, 0.5))
                backoff = min(backoff * 2, 30)
                continue

            if resp.status_code == 429:
                # Honor Retry-After when present/parseable; otherwise wait 30s.
                retry_after = resp.headers.get("retry-after") or resp.headers.get("Retry-After")
                try:
                    retry_seconds = float(retry_after)
                except (TypeError, ValueError):
                    retry_seconds = 30.0
                sleep_for = retry_seconds + random.uniform(0, 3)
                print(f"[info] {label}: hit 429, sleeping {sleep_for:.1f}s before retry", file=sys.stderr)
                time.sleep(sleep_for)
                backoff = min(backoff * 2, 30)
                continue

            if resp.status_code >= 500:
                print(f"[warn] {label}: server {resp.status_code}, retrying...", file=sys.stderr)
                time.sleep(backoff + random.uniform(0, 0.5))
                backoff = min(backoff * 2, 30)
                continue

            if not resp.ok:
                # 4xx other than 429 is treated as "no data here" -- not retried.
                print(f"[warn] {label}: HTTP {resp.status_code}, skipping", file=sys.stderr)
                return []

            try:
                body = resp.json()
            except ValueError:
                print(f"[warn] {label}: non-JSON response, skipping", file=sys.stderr)
                return []

            self._save_response(request_key, url, resp.status_code, resp.headers, body)
            return body

        print(f"[error] {label}: exhausted retries", file=sys.stderr)
        return []

    @staticmethod
    def _extract_values(payload: Any, keys: Sequence[str]) -> List[str]:
        """Pull a flat list of strings from a loosely-shaped API payload.

        Accepts either a bare list or a dict wrapping one under data/results/
        items. List items may be plain strings or dicts, in which case the
        first of *keys* with a truthy value wins. Blank entries are dropped.
        """
        values: List[str] = []
        if isinstance(payload, dict):
            payload = payload.get("data") or payload.get("results") or payload.get("items") or payload
        if not payload:
            return values
        if isinstance(payload, list):
            for item in payload:
                if isinstance(item, str):
                    if item.strip():
                        values.append(item.strip())
                    continue
                if isinstance(item, dict):
                    for key in keys:
                        val = item.get(key)
                        if val:
                            values.append(str(val).strip())
                            break
        return values

    def _record_pair(
        self,
        year: int,
        make: str,
        model: str,
        trim: str,
        engine_display: str,
        engine_bucket: str,
        trans_display: str,
        trans_bucket: str,
    ) -> None:
        """Insert one observed pair; duplicates (by canonical key) are ignored."""
        engine_canon = canonicalize(engine_display)
        trans_canon = canonicalize(trans_display)
        cur = self.conn.execute(
            """
            INSERT OR IGNORE INTO pairs(
                year, make, model, trim,
                engine_display, engine_canon, engine_bucket,
                trans_display, trans_canon, trans_bucket
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                year,
                make,
                model,
                trim,
                engine_display.strip(),
                engine_canon,
                engine_bucket,
                trans_display.strip(),
                trans_canon,
                trans_bucket,
            ),
        )
        # rowcount is 0 when the INSERT was ignored as a duplicate.
        if cur.rowcount:
            self.counts.pairs_inserted += 1

    def _fetch_engines_for_transmission(
        self, year: int, make: str, model: str, trim: str, transmission: str, trans_bucket: str
    ) -> None:
        """Fetch engines for one transmission and record each resulting pair.

        When the API returns no engines, the inferred fuel bucket itself
        (e.g. "Gas") is recorded as the engine display, counted as a fallback.
        """
        path = ["engines", str(year), make, model, trim, transmission]
        label = f"engines:{year}/{make}/{model}/{trim}/{transmission}"
        engines_payload = self._request_json(path, label)
        engines = self._extract_values(engines_payload, ["engine"])
        if not engines:
            engine_bucket = infer_fuel_bucket("", transmission, trim)
            fallback_engine = engine_bucket
            self._record_pair(year, make, model, trim, fallback_engine, engine_bucket, transmission, trans_bucket)
            self.counts.fallback_engines += 1
            return

        for engine in engines:
            engine_bucket = infer_fuel_bucket(engine, transmission, trim)
            self._record_pair(year, make, model, trim, engine, engine_bucket, transmission, trans_bucket)

    def _fetch_transmissions_for_trim(self, year: int, make: str, model: str, trim: str) -> None:
        """Fetch transmissions for one trim and descend into engines per transmission.

        When the API returns none, FALLBACK_TRANSMISSIONS are recorded directly
        (with the fuel bucket standing in for the engine), counted as fallbacks.
        """
        path = ["transmissions", str(year), make, model, trim]
        label = f"transmissions:{year}/{make}/{model}/{trim}"
        transmissions_payload = self._request_json(path, label)
        transmissions = self._extract_values(transmissions_payload, ["transmission"])
        if not transmissions:
            for fallback in FALLBACK_TRANSMISSIONS:
                trans_bucket = infer_trans_bucket(fallback)
                engine_bucket = infer_fuel_bucket("", fallback, trim)
                self._record_pair(year, make, model, trim, engine_bucket, engine_bucket, fallback, trans_bucket)
                self.counts.fallback_transmissions += 1
                self.counts.fallback_engines += 1
            return

        for trans in transmissions:
            trans_bucket = infer_trans_bucket(trans)
            self._fetch_engines_for_transmission(year, make, model, trim, trans, trans_bucket)

    def _fetch_trims_for_model(self, year: int, make: str, model: str) -> None:
        """Fetch trims for one model (FALLBACK_TRIMS when none) and recurse.

        Commits once per model so progress survives interruption.
        """
        path = ["trims", str(year), make, model]
        label = f"trims:{year}/{make}/{model}"
        trims_payload = self._request_json(path, label)
        trims = self._extract_values(trims_payload, ["trim"])

        if not trims:
            trims = FALLBACK_TRIMS

        for trim in trims:
            self._fetch_transmissions_for_trim(year, make, model, trim)
        self.conn.commit()

    def _fetch_models_for_make(self, year: int, make: str) -> None:
        """Fetch all models for one make/year and recurse into each."""
        path = ["models", str(year), make]
        label = f"models:{year}/{make}"
        models_payload = self._request_json(path, label)
        models = self._extract_values(models_payload, ["model"])
        if not models:
            print(f"[warn] {label}: no models returned", file=sys.stderr)
            return
        for model in models:
            self._fetch_trims_for_model(year, make, model)

    def _fetch_makes_for_year(self, year: int) -> List[str]:
        """Fetch makes for *year*, keeping only those in the allow-list.

        Returns the API's spelling of each retained make (filtering is done
        on canonicalized names).
        """
        path = ["makes", str(year)]
        label = f"makes:{year}"
        makes_payload = self._request_json(path, label)
        makes = self._extract_values(makes_payload, ["make"])
        filtered = []
        for make in makes:
            canon = canonicalize(make)
            if canon in self.allowed_makes:
                filtered.append(make)
        return filtered

    def run(self) -> FetchCounts:
        """Crawl every year in [min_year, max_year] and return the final counts."""
        for year in range(self.min_year, self.max_year + 1):
            makes = self._fetch_makes_for_year(year)
            if not makes:
                print(f"[info] {year}: no allowed makes found, skipping", file=sys.stderr)
                continue
            print(f"[info] {year}: {len(makes)} makes", file=sys.stderr)
            for make in makes:
                print(f"[info] {year} {make}: fetching models", file=sys.stderr)
                self._fetch_models_for_make(year, make)
            self.conn.commit()
        return self.counts
|
||||
|
||||
|
||||
def build_arg_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for the snapshot fetcher.

    Environment variables (MIN_YEAR, MAX_YEAR, VEHAPI_BASE_URL,
    VEHAPI_MAX_RPS) supply defaults where noted; CLI flags override them.

    Note: help texts previously claimed defaults of 2017/2026, contradicting
    DEFAULT_MIN_YEAR (2015) and DEFAULT_MAX_YEAR (2022); they now state the
    actual defaults.
    """
    parser = argparse.ArgumentParser(description="Fetch VehAPI snapshot into SQLite.")
    parser.add_argument(
        "--min-year",
        type=int,
        default=int(read_env("MIN_YEAR", DEFAULT_MIN_YEAR)),
        help="Inclusive min year (default env MIN_YEAR or 2015)",
    )
    parser.add_argument(
        "--max-year",
        type=int,
        default=int(read_env("MAX_YEAR", DEFAULT_MAX_YEAR)),
        help="Inclusive max year (default env MAX_YEAR or 2022)",
    )
    parser.add_argument(
        "--snapshot-dir",
        type=str,
        help="Target snapshot directory (default snapshots/<today>)",
    )
    parser.add_argument(
        "--base-url",
        type=str,
        default=read_env("VEHAPI_BASE_URL", DEFAULT_BASE_URL),
        help="VehAPI base URL (e.g. https://vehapi.com/api/v1/car-lists/get/car)",
    )
    parser.add_argument(
        "--rate-per-sec",
        type=int,
        default=int(read_env("VEHAPI_MAX_RPS", DEFAULT_RATE_PER_SEC)),
        help="Max requests per second (<=60)",
    )
    parser.add_argument(
        "--makes-file",
        type=str,
        default="source-makes.txt",
        help="Path to source-makes.txt",
    )
    parser.add_argument(
        "--api-key-file",
        type=str,
        default="vehapi.key",
        help="Path to VehAPI bearer token file",
    )
    parser.add_argument(
        "--no-response-cache",
        action="store_true",
        help="Disable request cache stored in snapshot.sqlite",
    )
    return parser
|
||||
|
||||
|
||||
def read_env(key: str, default: Any) -> Any:
    """Look up *key* in the process environment, returning *default* when unset."""
    # Imported locally to keep this helper self-contained, matching the
    # original's deferred import.
    from os import environ

    return environ.get(key, default)
|
||||
|
||||
|
||||
def _resolve_input_files(base_dir: Path, args: argparse.Namespace) -> "tuple[Path, Path]":
    """Resolve the makes allow-list and API-key file paths relative to the script dir."""
    return (base_dir / args.makes_file).resolve(), (base_dir / args.api_key_file).resolve()


def _build_meta(args, makes_file, snapshot_path, counts, started_at, finished_at):
    """Assemble the provenance metadata recorded alongside the snapshot.

    Captures run timing, year range, script version, input-file hash, API
    endpoint, and the fetch counters so a snapshot is reproducible/auditable.
    """
    return {
        "generated_at": finished_at.isoformat(),
        "started_at": started_at.isoformat(),
        "min_year": args.min_year,
        "max_year": args.max_year,
        "script_version": SCRIPT_VERSION,
        "makes_file": str(makes_file),
        "makes_hash": sha256_file(makes_file),
        "api_base_url": args.base_url,
        "snapshot_path": str(snapshot_path),
        "pairs_inserted": counts.pairs_inserted,
        "fallback_transmissions": counts.fallback_transmissions,
        "fallback_engines": counts.fallback_engines,
        "response_cache_hits": counts.cache_hits,
    }


def main(argv: Sequence[str]) -> int:
    """Fetch a VehAPI snapshot into SQLite and record run metadata.

    Phases: parse CLI args, validate input files, run the fetcher, then
    persist metadata both inside the snapshot and as meta.json next to it.

    Returns:
        0 on success; 1 when a required input file is missing or the API
        token file is empty.
    """
    args = build_arg_parser().parse_args(argv)

    base_dir = Path(__file__).resolve().parent
    snapshot_dir = ensure_snapshot_dir(base_dir / "snapshots", args.snapshot_dir)
    snapshot_path = snapshot_dir / "snapshot.sqlite"
    meta_path = snapshot_dir / "meta.json"

    # Guard clauses: fail fast (exit code 1) before any network traffic.
    makes_file, api_key_file = _resolve_input_files(base_dir, args)
    if not makes_file.exists():
        print(f"[error] makes file not found: {makes_file}", file=sys.stderr)
        return 1
    if not api_key_file.exists():
        print(f"[error] api key file not found: {api_key_file}", file=sys.stderr)
        return 1

    allowed_makes = read_lines(makes_file)
    token = read_text_file(api_key_file).strip()
    if not token:
        print("[error] vehapi.key is empty", file=sys.stderr)
        return 1

    fetcher = VehapiFetcher(
        session=requests.Session(),
        base_url=args.base_url,
        token=token,
        min_year=args.min_year,
        max_year=args.max_year,
        allowed_makes=allowed_makes,
        snapshot_path=snapshot_path,
        responses_cache=not args.no_response_cache,
        rate_per_sec=args.rate_per_sec,
    )

    started_at = datetime.now(timezone.utc)
    counts = fetcher.run()
    finished_at = datetime.now(timezone.utc)

    meta = _build_meta(args, makes_file, snapshot_path, counts, started_at, finished_at)
    # NOTE(review): _store_meta is private to VehapiFetcher; consider exposing
    # a public method for persisting run metadata.
    fetcher._store_meta(meta)
    with meta_path.open("w", encoding="utf-8") as fh:
        json.dump(meta, fh, indent=2)

    print(
        f"[done] wrote snapshot to {snapshot_path} with {counts.pairs_inserted} pairs "
        f"(fallback trans={counts.fallback_transmissions}, fallback engines={counts.fallback_engines}, cache hits={counts.cache_hits})",
        file=sys.stderr,
    )
    return 0
|
||||
|
||||
|
||||
# Entry point: forward CLI args (sans program name) and propagate the exit
# status. Raising SystemExit is equivalent to sys.exit().
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))
|
||||
2489
data/vehicle-etl/vehicle-api.postman_collection.json
Normal file
2489
data/vehicle-etl/vehicle-api.postman_collection.json
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user