Possible working ETL
This commit is contained in:
190
data/vehicle-etl/qa_validate.py
Executable file
190
data/vehicle-etl/qa_validate.py
Executable file
@@ -0,0 +1,190 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Post-import QA validation for vehicle dropdown data.
|
||||
Runs basic duplicate and range checks against the motovaultpro Postgres container.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
def run_psql(query: str) -> str:
    """Execute one SQL statement with psql inside the mvp-postgres container.

    The statement runs through ``docker exec`` as the ``postgres`` user
    against the ``motovaultpro`` database.  psql is invoked with ``-At``
    (unaligned, tuples-only), so the output carries only the result values.

    Args:
        query: SQL text passed to ``psql -c``.

    Returns:
        The command's stdout, decoded as text (not stripped).

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    docker_prefix = ["docker", "exec", "mvp-postgres"]
    psql_args = ["psql", "-U", "postgres", "-d", "motovaultpro", "-At", "-c", query]
    return subprocess.check_output(docker_prefix + psql_args, text=True)
|
||||
|
||||
|
||||
def check_container():
    """Abort the script unless Docker works and mvp-postgres is running.

    Two probes are made with the docker CLI:

    1. ``docker ps`` must succeed, otherwise Docker is reported as
       unavailable.
    2. ``docker ps --filter name=mvp-postgres`` must list a container whose
       name is exactly ``mvp-postgres``.

    Any failure prints a message and terminates the process with exit
    status 1.  The function returns ``None`` when both probes pass.
    """
    container_name = "mvp-postgres"

    try:
        subprocess.check_output(["docker", "ps"], text=True)
    except Exception:
        print("❌ Docker not available.")
        sys.exit(1)

    try:
        listing = subprocess.check_output(
            [
                "docker",
                "ps",
                "--filter",
                f"name={container_name}",
                "--format",
                "{{.Names}}",
            ],
            text=True,
        )
    except Exception as exc:
        print(f"❌ Failed to check containers: {exc}")
        sys.exit(1)

    # Docker's name filter matches substrings, so a container such as
    # "mvp-postgres-backup" would also be listed.  Require an exact match so
    # the later `docker exec mvp-postgres ...` calls are known to have a target.
    running = {line.strip() for line in listing.splitlines()}
    if container_name not in running:
        print("❌ mvp-postgres container not running.")
        sys.exit(1)
|
||||
|
||||
def _sql_literal(value: str) -> str:
    """Return *value* as a single-quoted SQL string literal."""
    # Doubling embedded single quotes is the standard SQL escape; without it a
    # value such as "King's Ranch" would terminate the literal early.
    return "'" + str(value).replace("'", "''") + "'"


def check_invalid_combinations():
    """Verify known invalid combinations do not exist.

    Each (year, make, model, trim) tuple below is a combination that was
    never produced.  For every tuple, one COUNT(*) query is run against
    ``vehicle_options`` via ``run_psql``.

    Returns:
        A list of human-readable messages, one per combination that was
        found in the table.  The list is empty when none are present.

    Raises:
        subprocess.CalledProcessError: propagated from ``run_psql`` when a
            query fails.
        ValueError: if psql output cannot be parsed as an integer.
    """
    invalid_combos = [
        (1992, "Chevrolet", "Corvette", "Z06"),  # Z06 started 2001
        (2000, "Chevrolet", "Corvette", "35th Anniversary Edition"),  # Was 1988
        (2000, "Chevrolet", "Corvette", "Stingray"),  # Stingray started 2014
        (1995, "Ford", "Mustang", "Mach-E"),  # Mach-E is 2021+
        (2020, "Tesla", "Cybertruck", "Base"),  # Not in production until later
    ]

    issues = []
    for year, make, model, trim in invalid_combos:
        # Values are escaped rather than interpolated raw so that adding a
        # combination containing an apostrophe cannot break the statement.
        query = f"""
            SELECT COUNT(*) FROM vehicle_options
            WHERE year = {int(year)}
            AND make = {_sql_literal(make)}
            AND model = {_sql_literal(model)}
            AND trim = {_sql_literal(trim)}
        """
        count = int(run_psql(query).strip())
        if count > 0:
            issues.append(f"Invalid combo found: {year} {make} {model} {trim}")

    return issues
|
||||
|
||||
def check_trim_coverage():
    """Report on trim coverage statistics.

    Prints one line with three pipe-separated counts of distinct
    (year, make, model) groups in ``vehicle_options``:

    * total_models       - all groups
    * base_only          - groups that have a 'Base' trim and no other trim
    * has_specific_trims - groups with at least one trim other than 'Base'

    The rows are aggregated per model first.  Filtering individual rows on
    ``trim = 'Base'`` would also count models that have both a 'Base' row
    and specific trims, which is not what "base only" means.

    Raises:
        subprocess.CalledProcessError: propagated from ``run_psql`` when the
            query fails.
    """
    query = """
        SELECT
            COUNT(*) AS total_models,
            COUNT(*) FILTER (WHERE has_base AND NOT has_specific) AS base_only,
            COUNT(*) FILTER (WHERE has_specific) AS has_specific_trims
        FROM (
            SELECT
                COALESCE(BOOL_OR(trim = 'Base'), FALSE) AS has_base,
                COALESCE(BOOL_OR(trim != 'Base'), FALSE) AS has_specific
            FROM vehicle_options
            GROUP BY year, make, model
        ) per_model
    """
    result = run_psql(query).strip()
    print(f"Trim coverage (total/base_only/has_specific_trims): {result}")
|
||||
|
||||
|
||||
def main():
    """Run all QA checks, print a report, and exit non-zero on any problem.

    Steps:
      1. Confirm the Postgres container is reachable (``check_container``).
      2. Run the summary queries below and print their results.
      3. Flag an issue when any "should be 0" query returns something else.
      4. Look for known-impossible year/make/model/trim combinations.
      5. Print trim coverage statistics (informational only).

    Exit status:
      * 0 - every check passed (the function simply returns).
      * 1 - a query failed to run, or at least one check found issues, so
            callers such as shell scripts or CI can gate on the result.
    """
    check_container()

    print("🔍 Running QA checks...\n")

    queries = {
        "engine_duplicate_names": """
            SELECT COUNT(*) FROM (
                SELECT LOWER(name) as n, COUNT(*) c
                FROM engines
                GROUP BY 1 HAVING COUNT(*) > 1
            ) t;
        """,
        "transmission_duplicate_types": """
            SELECT COUNT(*) FROM (
                SELECT LOWER(type) as t, COUNT(*) c
                FROM transmissions
                GROUP BY 1 HAVING COUNT(*) > 1
            ) t;
        """,
        "vehicle_option_duplicates": """
            SELECT COUNT(*) FROM (
                SELECT year, make, model, trim, engine_id, transmission_id, COUNT(*) c
                FROM vehicle_options
                GROUP BY 1,2,3,4,5,6 HAVING COUNT(*) > 1
            ) t;
        """,
        "year_range": """
            SELECT MIN(year) || ' - ' || MAX(year) FROM vehicle_options;
        """,
        # Yields 1 when any row falls outside the hard-coded 2015-2022
        # window, otherwise 0 (LIMIT 1 stops at the first offender).
        "year_range_valid": """
            SELECT COUNT(*) FROM (
                SELECT 1 FROM vehicle_options WHERE year < 2015 OR year > 2022 LIMIT 1
            ) t;
        """,
        "counts": """
            SELECT
                (SELECT COUNT(*) FROM engines) AS engines,
                (SELECT COUNT(*) FROM transmissions) AS transmissions,
                (SELECT COUNT(*) FROM vehicle_options) AS vehicle_options;
        """,
        # Builds every engine x transmission pairing per year/make/model/trim
        # and subtracts the rows that actually exist; the remainder are gaps.
        "cross_join_gaps": """
            SELECT COUNT(*) FROM (
                SELECT base.year, base.make, base.model, base.trim, e.engine_id, t.transmission_id
                FROM (
                    SELECT DISTINCT year, make, model, trim FROM vehicle_options
                ) base
                JOIN (
                    SELECT DISTINCT year, make, model, trim, engine_id FROM vehicle_options
                ) e ON base.year = e.year AND base.make = e.make AND base.model = e.model AND base.trim = e.trim
                JOIN (
                    SELECT DISTINCT year, make, model, trim, transmission_id FROM vehicle_options
                ) t ON base.year = t.year AND base.make = t.make AND base.model = t.model AND base.trim = t.trim
                EXCEPT
                SELECT year, make, model, trim, engine_id, transmission_id FROM vehicle_options
            ) gap;
        """,
    }

    results = {}
    for key, query in queries.items():
        try:
            results[key] = run_psql(query).strip()
        except subprocess.CalledProcessError as exc:
            print(f"❌ Query failed ({key}): {exc}")
            sys.exit(1)

    print(f"Engine duplicate names: {results['engine_duplicate_names']}")
    print(f"Transmission duplicate types: {results['transmission_duplicate_types']}")
    print(f"Vehicle option duplicates: {results['vehicle_option_duplicates']}")
    print(f"Year range: {results['year_range']}")
    print(f"Out-of-range years (should be 0): {results['year_range_valid']}")
    print(f"Counts (engines, transmissions, vehicle_options): {results['counts']}")
    print(f"Cross-join gaps (should be 0 to avoid impossible pairs): {results['cross_join_gaps']}")

    # These queries must return exactly "0"; anything else is a QA failure.
    must_be_zero = (
        "engine_duplicate_names",
        "transmission_duplicate_types",
        "vehicle_option_duplicates",
        "year_range_valid",
        "cross_join_gaps",
    )
    issues_found = any(results[key] != "0" for key in must_be_zero)

    # The remaining checks also shell out to psql; report their failures the
    # same way as the query loop above instead of surfacing a raw traceback.
    try:
        invalids = check_invalid_combinations()
    except (subprocess.CalledProcessError, ValueError) as exc:
        print(f"❌ Query failed (invalid_combinations): {exc}")
        sys.exit(1)

    if invalids:
        issues_found = True
        print("\n❌ Invalid combinations detected:")
        for issue in invalids:
            print(f" - {issue}")
    else:
        print("\n✅ No known invalid year/make/model/trim combos found.")

    try:
        check_trim_coverage()
    except subprocess.CalledProcessError as exc:
        print(f"❌ Query failed (trim_coverage): {exc}")
        sys.exit(1)

    if issues_found:
        print("\n❌ QA checks found issues.")
        # Signal failure through the exit status so automation can react.
        sys.exit(1)

    print("\n✅ QA checks passed.")
|
||||
|
||||
|
||||
# Script entry point: run the QA checks only when this file is executed
# directly, not when it is imported.  main() returns None, which
# sys.exit() treats as a successful (zero) exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user