Possible working ETL

This commit is contained in:
Eric Gullickson
2025-12-15 18:19:55 -06:00
parent 1fc69b7779
commit 1e599e334f
110 changed files with 4843 additions and 2078706 deletions

190
data/vehicle-etl/qa_validate.py Executable file
View File

@@ -0,0 +1,190 @@
#!/usr/bin/env python3
"""
Post-import QA validation for vehicle dropdown data.
Runs basic duplicate and range checks against the motovaultpro Postgres container.
"""
import os
import subprocess
import sys
def run_psql(query: str) -> str:
    """Run one SQL statement through psql inside the mvp-postgres container.

    The statement is handed to ``psql -c`` via ``docker exec``, connecting as
    user ``postgres`` to the ``motovaultpro`` database. The ``-At`` switches
    request unaligned, tuples-only output, i.e. no headers or column padding.

    Args:
        query: SQL text to execute.

    Returns:
        psql's stdout as text, unmodified (callers strip the trailing newline).

    Raises:
        subprocess.CalledProcessError: if the docker/psql command exits
            with a non-zero status.
    """
    container_prefix = ["docker", "exec", "mvp-postgres"]
    psql_invocation = [
        "psql",
        "-U", "postgres",
        "-d", "motovaultpro",
        "-At",
        "-c", query,
    ]
    return subprocess.check_output(container_prefix + psql_invocation, text=True)
def check_container():
    """Abort the script unless Docker works and mvp-postgres is running.

    Two probes are made with ``docker ps``:

    1. A plain ``docker ps`` to confirm the Docker CLI/daemon is usable.
    2. A filtered listing of running container names, which must contain a
       container named exactly ``mvp-postgres``.

    On any failure a message is printed and the process exits with status 1.
    Returns ``None`` when both probes succeed.
    """
    try:
        subprocess.check_output(["docker", "ps"], text=True)
    except (OSError, subprocess.SubprocessError):
        # OSError: docker binary missing / not executable.
        # SubprocessError: docker ran but reported failure (e.g. daemon down).
        print("❌ Docker not available.")
        sys.exit(1)

    try:
        listing = subprocess.check_output(
            ["docker", "ps", "--filter", "name=mvp-postgres", "--format", "{{.Names}}"],
            text=True,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        print(f"❌ Failed to check containers: {exc}")
        sys.exit(1)

    # Docker's ``name`` filter matches on part of a container name, so a
    # container such as "mvp-postgres-backup" would also be listed. Compare
    # names exactly; otherwise the later ``docker exec mvp-postgres`` fails.
    running_names = {line.strip() for line in listing.splitlines()}
    if "mvp-postgres" not in running_names:
        print("❌ mvp-postgres container not running.")
        sys.exit(1)
def check_invalid_combinations():
    """Look for year/make/model/trim rows that are known to be impossible.

    Each entry of the built-in list is checked with a separate
    ``SELECT COUNT(*)`` against ``vehicle_options`` (one ``run_psql`` call
    per entry).

    Returns:
        A list of human-readable messages, one per combination that has at
        least one matching row; empty when none are present.
    """
    # (year, make, model, trim) tuples that should never appear in the data.
    known_bad = [
        (1992, "Chevrolet", "Corvette", "Z06"),  # Z06 did not start until 2001
        (2000, "Chevrolet", "Corvette", "35th Anniversary Edition"),  # that edition was 1988
        (2000, "Chevrolet", "Corvette", "Stingray"),  # Stingray started 2014
        (1995, "Ford", "Mustang", "Mach-E"),  # Mach-E is 2021+
        (2020, "Tesla", "Cybertruck", "Base"),  # not in production until later
    ]

    # Values are substituted straight into single-quoted SQL literals, so the
    # entries above must not contain single quotes.
    count_sql = (
        "\n"
        "SELECT COUNT(*) FROM vehicle_options\n"
        "WHERE year = {year}\n"
        "AND make = '{make}'\n"
        "AND model = '{model}'\n"
        "AND trim = '{trim}'\n"
    )

    problems = []
    for year, make, model, trim in known_bad:
        sql = count_sql.format(year=year, make=make, model=model, trim=trim)
        matches = int(run_psql(sql).strip())
        if matches > 0:
            problems.append(f"Invalid combo found: {year} {make} {model} {trim}")
    return problems
def check_trim_coverage():
    """Print trim coverage statistics for ``vehicle_options``.

    A single query yields three counts of distinct (year, make, model)
    combinations: all of them, those with at least one row whose trim is
    'Base', and those with at least one row whose trim is not 'Base'.

    NOTE(review): the column labelled ``base_only`` counts combinations that
    *have* a 'Base' row; a combination with both 'Base' and other trims is
    counted in both of the last two columns.

    The raw psql output is printed as-is; nothing is returned.
    """
    coverage_sql = (
        "\n"
        "SELECT\n"
        "COUNT(DISTINCT (year, make, model)) as total_models,\n"
        "COUNT(DISTINCT (year, make, model)) FILTER (WHERE trim = 'Base') as base_only,\n"
        "COUNT(DISTINCT (year, make, model)) FILTER (WHERE trim != 'Base') as has_specific_trims\n"
        "FROM vehicle_options\n"
    )
    summary = run_psql(coverage_sql).strip()
    print(f"Trim coverage (total/base_only/has_specific_trims): {summary}")
def main():
    """Run every QA check against the imported vehicle data and report.

    Steps:
      1. Verify Docker and the mvp-postgres container are available.
      2. Run the summary queries (duplicates, year range, row counts,
         cross-join gaps) and print their results.
      3. Check for known-invalid year/make/model/trim combinations.
      4. Print trim coverage statistics (informational only).

    Exit status:
      * 1 if any query fails to execute, or if any check reports a problem.
      * 0 (normal return) when all checks pass.
    """
    check_container()
    print("🔍 Running QA checks...\n")

    queries = {
        # Engine names that collide case-insensitively.
        "engine_duplicate_names": """
            SELECT COUNT(*) FROM (
                SELECT LOWER(name) as n, COUNT(*) c
                FROM engines
                GROUP BY 1 HAVING COUNT(*) > 1
            ) t;
        """,
        # Transmission types that collide case-insensitively.
        "transmission_duplicate_types": """
            SELECT COUNT(*) FROM (
                SELECT LOWER(type) as t, COUNT(*) c
                FROM transmissions
                GROUP BY 1 HAVING COUNT(*) > 1
            ) t;
        """,
        # Fully identical option rows.
        "vehicle_option_duplicates": """
            SELECT COUNT(*) FROM (
                SELECT year, make, model, trim, engine_id, transmission_id, COUNT(*) c
                FROM vehicle_options
                GROUP BY 1,2,3,4,5,6 HAVING COUNT(*) > 1
            ) t;
        """,
        "year_range": """
            SELECT MIN(year) || ' - ' || MAX(year) FROM vehicle_options;
        """,
        # 1 if any row falls outside 2015-2022, else 0 (LIMIT 1 short-circuits).
        "year_range_valid": """
            SELECT COUNT(*) FROM (
                SELECT 1 FROM vehicle_options WHERE year < 2015 OR year > 2022 LIMIT 1
            ) t;
        """,
        "counts": """
            SELECT
                (SELECT COUNT(*) FROM engines) AS engines,
                (SELECT COUNT(*) FROM transmissions) AS transmissions,
                (SELECT COUNT(*) FROM vehicle_options) AS vehicle_options;
        """,
        # For each (year, make, model, trim): every engine x transmission
        # pairing seen for that trim, minus the pairings actually stored.
        "cross_join_gaps": """
            SELECT COUNT(*) FROM (
                SELECT base.year, base.make, base.model, base.trim, e.engine_id, t.transmission_id
                FROM (
                    SELECT DISTINCT year, make, model, trim FROM vehicle_options
                ) base
                JOIN (
                    SELECT DISTINCT year, make, model, trim, engine_id FROM vehicle_options
                ) e ON base.year = e.year AND base.make = e.make AND base.model = e.model AND base.trim = e.trim
                JOIN (
                    SELECT DISTINCT year, make, model, trim, transmission_id FROM vehicle_options
                ) t ON base.year = t.year AND base.make = t.make AND base.model = t.model AND base.trim = t.trim
                EXCEPT
                SELECT year, make, model, trim, engine_id, transmission_id FROM vehicle_options
            ) gap;
        """,
    }

    results = {}
    for key, query in queries.items():
        try:
            results[key] = run_psql(query).strip()
        except subprocess.CalledProcessError as exc:
            print(f"❌ Query failed ({key}): {exc}")
            sys.exit(1)

    print(f"Engine duplicate names: {results['engine_duplicate_names']}")
    print(f"Transmission duplicate types: {results['transmission_duplicate_types']}")
    print(f"Vehicle option duplicates: {results['vehicle_option_duplicates']}")
    print(f"Year range: {results['year_range']}")
    print(f"Out-of-range years (should be 0): {results['year_range_valid']}")
    print(f"Counts (engines, transmissions, vehicle_options): {results['counts']}")
    print(f"Cross-join gaps (should be 0 to avoid impossible pairs): {results['cross_join_gaps']}")

    # These results are counts of problems; anything other than "0" is a failure.
    must_be_zero = (
        "engine_duplicate_names",
        "transmission_duplicate_types",
        "vehicle_option_duplicates",
        "year_range_valid",
        "cross_join_gaps",
    )
    issues_found = any(results[key] != "0" for key in must_be_zero)

    # The remaining checks also shell out to psql; report their failures the
    # same way as the queries above instead of surfacing a traceback.
    try:
        invalids = check_invalid_combinations()
    except subprocess.CalledProcessError as exc:
        print(f"❌ Query failed (invalid_combinations): {exc}")
        sys.exit(1)

    if invalids:
        issues_found = True
        print("\n❌ Invalid combinations detected:")
        for issue in invalids:
            print(f" - {issue}")
    else:
        print("\n✅ No known invalid year/make/model/trim combos found.")

    try:
        check_trim_coverage()
    except subprocess.CalledProcessError as exc:
        print(f"❌ Query failed (trim_coverage): {exc}")
        sys.exit(1)

    if issues_found:
        print("\n❌ QA checks found issues.")
        # Signal failure to the caller (CI, shell scripts) via the exit code.
        sys.exit(1)
    print("\n✅ QA checks passed.")
# Run the QA checks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()