fix: ETL vehicle db import fixes

This commit is contained in:
Eric Gullickson
2025-12-26 14:54:51 -06:00
parent fb52ce398b
commit 09410c3c3f
10 changed files with 67243 additions and 61980 deletions

View File

@@ -1,40 +1,84 @@
Step 1: Fetch Data from VehAPI
# Vehicle Catalog Data Export
cd data/vehicle-etl
python3 vehapi_fetch_snapshot.py --min-year 2015 --max-year 2025
Export the current vehicle catalog database to SQL files for GitLab CI/CD deployment.
Options:
| Flag | Default | Description |
|---------------------|-------------------|------------------------|
| --min-year | 2015 | Start year |
| --max-year | 2022 | End year |
| --rate-per-min | 55 | API rate limit |
| --snapshot-dir | snapshots/<today> | Output directory |
| --no-response-cache | false | Disable resume caching |
## Export Workflow
Output: Creates snapshots/<date>/snapshot.sqlite
### Export from Running Database
---
Step 2: Generate SQL Files
```bash
cd data/vehicle-etl
python3 export_from_postgres.py
```
python3 etl_generate_sql.py --snapshot-path snapshots/<date>/snapshot.sqlite
**Output:** Creates output/01_engines.sql, output/02_transmissions.sql, output/03_vehicle_options.sql
Output: Creates output/01_engines.sql, output/02_transmissions.sql, output/03_vehicle_options.sql
**Requirements:**
- mvp-postgres container running
- Python 3.7+
---
Step 3: Import to PostgreSQL
### Commit and Deploy
./import_data.sh
```bash
git add output/*.sql
git commit -m "Update vehicle catalog data from PostgreSQL export"
git push
```
Requires: mvp-postgres container running, SQL files in output/
GitLab CI/CD will automatically import these SQL files during deployment.
---
Quick Test (single year)
---
python3 vehapi_fetch_snapshot.py --min-year 2020 --max-year 2020
## When to Export
# Full ETL workflow with cached results
./reset_database.sh # Clear old data
python3 etl_generate_sql.py --snapshot-path snapshots/*.sqlite # Generate SQL
./import_data.sh # Import to Postgres
docker compose exec mvp-redis redis-cli FLUSHALL  # flush the Redis cache used by the front end
| Scenario | Action |
|----------|--------|
| Admin uploaded CSVs to database | Export and commit |
| Manual corrections in PostgreSQL | Export and commit |
| After adding new vehicle data | Export and commit |
| Preparing for deployment | Export and commit |
---
## Local Testing
```bash
# Export current database state
python3 export_from_postgres.py
# Test import locally
./reset_database.sh
./import_data.sh
docker compose exec mvp-redis redis-cli FLUSHALL
# Verify data
docker exec mvp-postgres psql -U postgres -d motovaultpro -c "
SELECT
(SELECT COUNT(*) FROM engines) as engines,
(SELECT COUNT(*) FROM transmissions) as transmissions,
(SELECT COUNT(*) FROM vehicle_options) as vehicle_options,
(SELECT MIN(year) FROM vehicle_options) as min_year,
(SELECT MAX(year) FROM vehicle_options) as max_year;
"
```
---
## GitLab CI/CD Integration
The pipeline automatically imports SQL files from the `output/` directory during deployment (see `.gitlab-ci.yml`, lines 89–98):
- data/vehicle-etl/output/01_engines.sql
- data/vehicle-etl/output/02_transmissions.sql
- data/vehicle-etl/output/03_vehicle_options.sql
Commit updated SQL files to trigger deployment with new data.
---
## Legacy Scripts (Not Used)
The following scripts are legacy from the VehAPI integration and are no longer used:
- vehapi_fetch_snapshot.py (obsolete - VehAPI not used)
- etl_generate_sql.py (obsolete - database export used instead)
These scripts are preserved for historical reference but should not be executed.

View File

@@ -0,0 +1,322 @@
#!/usr/bin/env python3
"""
Export PostgreSQL database to SQL files.
Extracts current state from running mvp-postgres container and generates
SQL import files compatible with the GitLab CI/CD pipeline.
Usage:
python3 export_from_postgres.py
python3 export_from_postgres.py --output-dir custom/path
Output files:
- output/01_engines.sql
- output/02_transmissions.sql
- output/03_vehicle_options.sql
"""
import argparse
import csv
import io
import subprocess
import sys
from pathlib import Path
from typing import Dict, Iterable, List, Sequence
BATCH_SIZE = 1000
def check_python_version():
"""Ensure Python 3.7+ is being used."""
if sys.version_info < (3, 7):
raise RuntimeError(
f"Python 3.7 or higher required. Current version: {sys.version_info.major}.{sys.version_info.minor}"
)
def check_container_running():
"""Verify mvp-postgres container is running."""
try:
result = subprocess.run(
["docker", "ps", "--filter", "name=mvp-postgres", "--format", "{{.Names}}"],
capture_output=True,
text=True,
check=True,
)
if "mvp-postgres" not in result.stdout:
raise RuntimeError(
"mvp-postgres container is not running.\n"
"Start with: docker compose up -d mvp-postgres"
)
except subprocess.CalledProcessError as e:
raise RuntimeError(f"Failed to check Docker containers: {e}")
def sql_value(value):
"""
Convert a Python value to its SQL representation.
- None -> NULL
- str -> 'escaped string' (single quotes doubled)
- int/other -> str(value)
"""
if value is None:
return "NULL"
if isinstance(value, str):
return "'" + value.replace("'", "''") + "'"
return str(value)
def chunked(seq: Iterable[Dict], size: int) -> Iterable[List[Dict]]:
"""
Yield successive chunks of `size` from sequence.
Used to batch INSERT statements for better performance.
"""
chunk: List[Dict] = []
for item in seq:
chunk.append(item)
if len(chunk) >= size:
yield chunk
chunk = []
if chunk:
yield chunk
def write_insert_file(
path: Path,
table: str,
columns: Sequence[str],
rows: Sequence[Dict],
):
"""
Write batched INSERT statements to a SQL file.
Args:
path: Output file path
table: Table name
columns: Column names to insert
rows: List of row dictionaries
"""
path.parent.mkdir(parents=True, exist_ok=True)
with path.open("w", encoding="utf-8") as f:
f.write(f"-- Auto-generated by export_from_postgres.py\n")
if not rows:
f.write(f"-- No rows for {table}\n")
return
for batch in chunked(rows, BATCH_SIZE):
values_sql = ",\n".join(
"(" + ",".join(sql_value(row[col]) for col in columns) + ")"
for row in batch
)
f.write(f"INSERT INTO {table} ({', '.join(columns)}) VALUES\n{values_sql};\n\n")
def execute_psql_copy(query: str) -> str:
"""
Execute a PostgreSQL COPY command via docker exec.
Args:
query: SQL COPY query to execute
Returns:
CSV output as string
Raises:
RuntimeError: If command fails
"""
try:
result = subprocess.run(
[
"docker",
"exec",
"mvp-postgres",
"psql",
"-U",
"postgres",
"-d",
"motovaultpro",
"-c",
query,
],
capture_output=True,
text=True,
check=True,
)
return result.stdout
except subprocess.CalledProcessError as e:
error_msg = e.stderr if e.stderr else str(e)
raise RuntimeError(f"PostgreSQL query failed: {error_msg}")
def export_engines(output_dir: Path) -> int:
"""
Export engines table to 01_engines.sql.
Returns:
Number of records exported
"""
query = "COPY (SELECT id, name, fuel_type FROM engines ORDER BY id) TO STDOUT WITH CSV HEADER"
csv_output = execute_psql_copy(query)
rows = []
try:
reader = csv.DictReader(io.StringIO(csv_output))
for row in reader:
rows.append({
"id": int(row["id"]),
"name": row["name"],
"fuel_type": row["fuel_type"] if row["fuel_type"] else None,
})
except (csv.Error, KeyError, ValueError) as e:
raise RuntimeError(f"Failed to parse engines CSV output: {e}")
write_insert_file(
output_dir / "01_engines.sql",
"engines",
["id", "name", "fuel_type"],
rows,
)
return len(rows)
def export_transmissions(output_dir: Path) -> int:
"""
Export transmissions table to 02_transmissions.sql.
Returns:
Number of records exported
"""
query = "COPY (SELECT id, type FROM transmissions ORDER BY id) TO STDOUT WITH CSV HEADER"
csv_output = execute_psql_copy(query)
rows = []
try:
reader = csv.DictReader(io.StringIO(csv_output))
for row in reader:
rows.append({
"id": int(row["id"]),
"type": row["type"],
})
except (csv.Error, KeyError, ValueError) as e:
raise RuntimeError(f"Failed to parse transmissions CSV output: {e}")
write_insert_file(
output_dir / "02_transmissions.sql",
"transmissions",
["id", "type"],
rows,
)
return len(rows)
def export_vehicle_options(output_dir: Path) -> tuple:
"""
Export vehicle_options table to 03_vehicle_options.sql.
Returns:
Tuple of (record_count, min_year, max_year)
"""
query = """COPY (
SELECT year, make, model, trim, engine_id, transmission_id
FROM vehicle_options
ORDER BY year, make, model, trim
) TO STDOUT WITH CSV HEADER"""
csv_output = execute_psql_copy(query)
rows = []
years = []
try:
reader = csv.DictReader(io.StringIO(csv_output))
for row in reader:
year = int(row["year"])
years.append(year)
rows.append({
"year": year,
"make": row["make"],
"model": row["model"],
"trim": row["trim"],
"engine_id": int(row["engine_id"]) if row["engine_id"] else None,
"transmission_id": int(row["transmission_id"]) if row["transmission_id"] else None,
})
except (csv.Error, KeyError, ValueError) as e:
raise RuntimeError(f"Failed to parse vehicle_options CSV output: {e}")
write_insert_file(
output_dir / "03_vehicle_options.sql",
"vehicle_options",
["year", "make", "model", "trim", "engine_id", "transmission_id"],
rows,
)
min_year = min(years) if years else None
max_year = max(years) if years else None
return len(rows), min_year, max_year
def parse_args() -> argparse.Namespace:
"""Parse command-line arguments."""
parser = argparse.ArgumentParser(
description="Export PostgreSQL vehicle catalog to SQL files.",
)
parser.add_argument(
"--output-dir",
type=Path,
default=Path("output"),
help="Directory to write SQL output files (default: output)",
)
return parser.parse_args()
def main():
"""Main export workflow."""
check_python_version()
args = parse_args()
output_dir: Path = args.output_dir
print("Exporting from PostgreSQL database...")
print()
# Verify container is running
try:
check_container_running()
except RuntimeError as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)
# Export each table
try:
engines_count = export_engines(output_dir)
print(f" Engines: {engines_count:,} records")
trans_count = export_transmissions(output_dir)
print(f" Transmissions: {trans_count:,} records")
vehicles_count, min_year, max_year = export_vehicle_options(output_dir)
print(f" Vehicle options: {vehicles_count:,} records")
print()
except RuntimeError as e:
print(f"Error during export: {e}", file=sys.stderr)
sys.exit(1)
# Print summary
print("SQL files generated:")
for sql_file in sorted(output_dir.glob("*.sql")):
size_kb = sql_file.stat().st_size / 1024
print(f" - {sql_file} ({size_kb:.0f}KB)")
print()
if min_year and max_year:
print(f"Year coverage: {min_year}-{max_year}")
print()
print("Export complete! Commit these files to deploy:")
print(f" git add {output_dir}/*.sql")
print(f" git commit -m \"Update vehicle catalog from PostgreSQL export ({min_year}-{max_year})\"")
if __name__ == "__main__":
main()

View File

@@ -1,4 +1,4 @@
-- Auto-generated by etl_generate_sql.py
-- Auto-generated by export_from_postgres.py
INSERT INTO engines (id, name, fuel_type) VALUES
(1,'Gas','Gas'),
(2,'1.7L 74 hp I4','Gas'),
@@ -2479,5 +2479,100 @@ INSERT INTO engines (id, name, fuel_type) VALUES
(2473,'3.5-liter direct injection V-6','Gas'),
(2474,'6.2L 668 hp V8','Gas'),
(2475,'3.3L 368 hp V6','Gas'),
(2476,'1.5L 158 hp I4','Gas');
(2476,'1.5L 158 hp I4','Gas'),
(2484,'3.9L 612 hp V8','Gas'),
(2486,'4.0L 769 hp V12 Hybrid','Gas'),
(2500,'3.0L 612 hp V6 Hybrid','Gas'),
(2518,'6.2L 819 hp V12','Gas'),
(2544,'1.4L I4','Gas'),
(2548,'Electric Motor','Gas'),
(2549,'1.4L Turbo I4','Gas'),
(2553,'1.5L Turbo I4','Gas'),
(2557,'1.8L I4 Hybrid','Gas'),
(2560,'2.0L Turbo I4','Gas'),
(2561,'3.6L V6','Gas'),
(2562,'6.2L V8','Gas'),
(2563,'6.2L Supercharged V8','Gas'),
(2564,'1.2L Turbo I3','Gas'),
(2610,'2.8L Diesel I4','Gas'),
(2630,'2.7L Turbo I4','Gas'),
(2645,'3.0L Diesel I6','Gas'),
(2661,'6.6L Duramax Diesel V8','Gas'),
(2677,'5.5L V8','Gas'),
(3592,'2.0L I4 Turbo','Gas'),
(3596,'3.0L V6 Turbo','Gas'),
(3601,'3.5L V6 Hybrid','Gas'),
(3613,'3.5L V6 Twin-Turbo Hybrid','Gas'),
(3618,'1.5L I4 Turbo','Gas'),
(3670,'Dual Electric Motors','Gas'),
(3875,'4.0L V8 Twin-Turbo 542 hp','Gas'),
(3878,'4.0L V8 Twin-Turbo 626 hp','Gas'),
(3879,'3.0L V6 Twin-Turbo PHEV 443 hp','Gas'),
(3880,'6.0L W12 Twin-Turbo 626 hp','Gas'),
(3882,'6.0L W12 Twin-Turbo 705 hp','Gas'),
(3910,'6.0L W12 Twin-Turbo PHEV 536 hp','Gas'),
(3980,'3.0L I6 Turbo','Gas'),
(3986,'2.0L I4 Turbo Hybrid','Gas'),
(3998,'4.4L V8 Twin-Turbo','Gas'),
(4005,'3.0L I6 Turbo Hybrid','Gas'),
(4006,'6.6L V12','Gas'),
(4035,'3.0L I6 Twin-Turbo','Gas'),
(4275,'4.4L V8 Hybrid','Gas'),
(4380,'1.4L Turbo I3','Gas'),
(4491,'6.2L V8 Supercharged','Gas'),
(4590,'6.2L V8 Hybrid','Gas'),
(4875,'8-Speed Automatic','Gas'),
(5149,'1.4L I4 Turbocharged','Gas'),
(5248,'3.5L V6 EcoBoost','Gas'),
(5250,'3.3L V6 Hybrid','Gas'),
(5257,'2.3L EcoBoost','Gas'),
(5300,'2.0L EcoBoost','Gas'),
(5412,'2.0L Turbocharged I4','Gas'),
(5418,'2.5L Turbocharged I4','Gas'),
(5511,'84 kWh Battery','Gas'),
(6380,'3-Motor Electric (1000hp)','Gas'),
(6385,'3-Motor Electric (830hp)','Gas'),
(6387,'2-Motor Electric (570hp)','Gas'),
(6393,'3-Motor Electric (1160hp)','Gas'),
(6397,'3.0L V6 Twin-Turbo','Gas'),
(6403,'2.0L I4 VC-Turbo','Gas'),
(6415,'5.6L V8','Gas'),
(6464,'3.5L V6 Twin-Turbo','Gas'),
(6480,'5.2L I4 Turbocharged','Gas'),
(6504,'6.7L I6 Turbocharged Cummins','Gas'),
(6560,'2.0L 246 hp Turbo I4','Gas'),
(6561,'2.0L 296 hp Turbo I4','Gas'),
(6564,'3.0L 395 hp Turbo Supercharged I6 Mild Hybrid','Gas'),
(6566,'5.0L 542 hp Supercharged V8','Gas'),
(6570,'5.0L 444 hp Supercharged V8','Gas'),
(6572,'5.0L 575 hp Supercharged V8','Gas'),
(6578,'5.0L 550 hp Supercharged V8','Gas'),
(6600,'5.0L 576 hp Supercharged V8','Gas'),
(6608,'5.0L 567 hp Supercharged V8','Gas'),
(6619,'6.4L V8','Gas'),
(6623,'2.0L Turbo I4 Hybrid','Gas'),
(6648,'3.0L Diesel V6','Gas'),
(6666,'1.3L Turbo I4','Gas'),
(6682,'2.0L Turbo I4 PHEV','Gas'),
(6857,'3.0L Twin-Turbo I6','Gas'),
(6863,'3.0L Twin-Turbo I6 HO','Gas'),
(7027,'2.5L I4 Hybrid','Gas'),
(7056,'2.0L I4 Hybrid','Gas'),
(7064,'2.4L I4 Turbo','Gas'),
(7072,'2.5L I4 Plug-in Hybrid','Gas'),
(7164,'3.4L V6 Twin-Turbo','Gas'),
(7237,'3.4L V6 Twin-Turbo Hybrid','Gas'),
(7261,'3.5L V6 Plug-in Hybrid','Gas'),
(7465,'3.8L V8 Twin-Turbo','Gas'),
(7490,'2.0L I4 Twin-Turbo Hybrid','Gas'),
(7520,'Standard','Gas'),
(7538,'2.0L 455 hp I4 Twin-Turbo Hybrid','Gas'),
(7715,'4.0L 657 hp V8','Gas'),
(7743,'6.5L 1001 hp V12 Hybrid','Gas'),
(7784,'1.6L I4 Hybrid','Gas'),
(7798,'1.6L Turbocharged I4','Gas'),
(7841,'2.5L I4 Turbocharged','Gas'),
(8095,'2.5L Turbocharged I4 Hybrid','Gas'),
(8196,'4.0L V8 Twin-Turbo','Gas'),
(8238,'3.0L V6 Twin-Turbo Hybrid','Gas');

View File

@@ -1,4 +1,4 @@
-- Auto-generated by etl_generate_sql.py
-- Auto-generated by export_from_postgres.py
INSERT INTO transmissions (id, type) VALUES
(1,'Automatic'),
(2,'Manual'),
@@ -35,5 +35,13 @@ INSERT INTO transmissions (id, type) VALUES
(33,'10-Speed Dual Clutch'),
(34,'10-Speed CVT'),
(35,'2-Speed Automatic'),
(36,'10-Speed Automatic Transmission');
(36,'10-Speed Automatic Transmission'),
(115,'CVT'),
(119,'1-Speed Direct Drive'),
(1159,'8-Speed DCT'),
(1172,'7-Speed DCT'),
(1184,'9-Speed DCT'),
(3072,'Single-Speed Transmission'),
(5081,'Electric'),
(5304,'ISR Automatic');

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,34 @@
#!/bin/bash
# Compare database counts with exported SQL file counts
# Usage: ./validate_export.sh
set -e
echo "Validating exported SQL files against database..."
echo ""
# Get counts from database
DB_ENGINES=$(docker exec mvp-postgres psql -U postgres -d motovaultpro -t -A -c "SELECT COUNT(*) FROM engines;")
DB_TRANS=$(docker exec mvp-postgres psql -U postgres -d motovaultpro -t -A -c "SELECT COUNT(*) FROM transmissions;")
DB_VEHICLES=$(docker exec mvp-postgres psql -U postgres -d motovaultpro -t -A -c "SELECT COUNT(*) FROM vehicle_options;")
# Count records in SQL files (count lines starting with '(' which are data rows)
SQL_ENGINES=$(grep -c '^(' output/01_engines.sql)
SQL_TRANS=$(grep -c '^(' output/02_transmissions.sql)
SQL_VEHICLES=$(grep -c '^(' output/03_vehicle_options.sql)
# Display comparison
echo "Database vs SQL File Counts:"
echo " Engines: $DB_ENGINES (DB) vs $SQL_ENGINES (SQL)"
echo " Transmissions: $DB_TRANS (DB) vs $SQL_TRANS (SQL)"
echo " Vehicle Options: $DB_VEHICLES (DB) vs $SQL_VEHICLES (SQL)"
echo ""
# Validate counts match
if [ "$DB_ENGINES" -eq "$SQL_ENGINES" ] && [ "$DB_TRANS" -eq "$SQL_TRANS" ] && [ "$DB_VEHICLES" -eq "$SQL_VEHICLES" ]; then
echo "Validation PASSED - All counts match!"
exit 0
else
echo "Validation FAILED - Counts do not match!"
exit 1
fi