Vehicle ETL Process fixed. Admin settings fixed.

This commit is contained in:
Eric Gullickson
2025-12-15 20:51:52 -06:00
parent 1a9ead9d9d
commit b84d4c7fef
23 changed files with 4553 additions and 2450 deletions

View File

@@ -1,53 +0,0 @@
acura
alfa_romeo
aston_martin
audi
bentley
bmw
buick
cadillac
chevrolet
chrysler
dodge
ferrari
fiat
ford
genesis
gmc
honda
hummer
hyundai
infiniti
isuzu
jaguar
jeep
kia
lamborghini
land_rover
lexus
lincoln
lotus
lucid
maserati
mazda
mclaren
mercury
mini
mitsubishi
nissan
oldsmobile
plymouth
polestar
pontiac
porsche
ram
rivian
rolls_royce
saab
scion
smart
subaru
tesla
toyota
volkswagen
volvo

View File

@@ -0,0 +1,41 @@
Step 1: Fetch Data from VehAPI
cd data/vehicle-etl
python3 vehapi_fetch_snapshot.py --min-year 2015 --max-year 2025
Options:
| Flag | Default | Description |
|---------------------|-------------------|------------------------|
| --min-year | 2015 | Start year |
| --max-year | 2022 | End year |
| --rate-per-min | 55 | Max API requests per minute (VehAPI ceiling is 60) |
| --snapshot-dir | snapshots/<today> | Output directory |
| --no-response-cache | false | Disable resume caching |
Output: Creates snapshots/<date>/snapshot.sqlite
---
Step 2: Generate SQL Files
python3 etl_generate_sql.py --snapshot-path snapshots/<date>/snapshot.sqlite
Output: Creates output/01_engines.sql, output/02_transmissions.sql, output/03_vehicle_options.sql
---
Step 3: Import to PostgreSQL
./import_data.sh
Requires: mvp-postgres container running, SQL files in output/
---
Quick Test (single year)
python3 vehapi_fetch_snapshot.py --min-year 2020 --max-year 2020
# Full ETL workflow
./reset_database.sh # Clear old data
python3 vehapi_fetch_snapshot.py # Fetch from API
python3 etl_generate_sql.py # Generate SQL
./import_data.sh # Import to Postgres
docker compose exec mvp-redis redis-cli FLUSHALL # Flush the Redis cache so the frontend serves the freshly imported data

View File

@@ -1,22 +1,128 @@
-- Auto-generated by etl_generate_sql.py
INSERT INTO engines (id, name, fuel_type) VALUES
(1,'Gas','Gas'),
(2,'2.0L 150 hp I4','Gas'),
(3,'2.4L 201 hp I4','Gas'),
(4,'3.5L 290 hp V6','Gas'),
(5,'3.5L 273 hp V6','Gas'),
(6,'3.5L 310 hp V6','Gas'),
(7,'2.4L 206 hp I4','Gas'),
(8,'2.0L 220 hp I4','Gas'),
(9,'1.8L 170 hp I4','Gas'),
(10,'Diesel','Diesel'),
(11,'2.0L 150 hp I4 Diesel','Diesel'),
(12,'2.0L 220 hp I4 Flex Fuel Vehicle','Gas'),
(13,'3.0L 310 hp V6','Gas'),
(14,'3.0L 240 hp V6 Diesel','Diesel'),
(15,'4.0L 435 hp V8','Diesel'),
(2,'2.4L 201 hp I4','Gas'),
(3,'3.5L 290 hp V6','Gas'),
(4,'3.0L 321 hp V6 Hybrid','Hybrid'),
(5,'3.5L 573 hp V6 Hybrid','Hybrid'),
(6,'3.5L 279 hp V6','Gas'),
(7,'3.5L 310 hp V6','Gas'),
(8,'3.5L 377 hp V6 Hybrid','Hybrid'),
(9,'2.4L 206 hp I4','Gas'),
(10,'2.0L 220 hp I4','Gas'),
(11,'2.0L 186 hp I4','Gas'),
(12,'1.4L 204 hp I4','Gas'),
(13,'2.0L 190 hp I4','Gas'),
(14,'2.0L 252 hp I4','Gas'),
(15,'2.0L 220 hp I4 Flex Fuel Vehicle','Gas'),
(16,'3.0L 333 hp V6','Gas'),
(17,'6.3L 500 hp W12','Gas'),
(18,'2.0L 200 hp I4','Gas'),
(19,'3.0L 272 hp V6','Gas');
(17,'3.0L 340 hp V6','Gas'),
(18,'4.0L 450 hp V8','Gas'),
(19,'2.0L 200 hp I4','Gas'),
(20,'3.0L 272 hp V6','Gas'),
(21,'Diesel','Diesel'),
(22,'5.2L 540 hp V10','Gas'),
(23,'5.2L 610 hp V10','Gas'),
(24,'2.5L 400 hp I5','Gas'),
(25,'4.0L 560 hp V8','Gas'),
(26,'4.0L 605 hp V8','Gas'),
(27,'2.0L 292 hp I4','Gas'),
(28,'3.0L 354 hp V6','Gas'),
(29,'2.0L 248 hp I4','Gas'),
(30,'3.0L 335 hp I6','Gas'),
(31,'2.0L 180 hp I4','Gas'),
(32,'2.0L 180 hp I4 Diesel','Diesel'),
(33,'3.0L 320 hp I6','Gas'),
(34,'3.0L 300 hp I6','Gas'),
(35,'4.4L 445 hp V8','Gas'),
(36,'3.0L 315 hp I6','Gas'),
(37,'4.4L 600 hp V8','Gas'),
(38,'2.0L 322 hp I4','Gas'),
(39,'6.6L 601 hp V12','Gas'),
(40,'3.0L 365 hp I6','Gas'),
(41,'3.0L 425 hp I6','Gas'),
(42,'4.4L 552 hp V8','Gas'),
(43,'2.0L 228 hp I4','Gas'),
(44,'2.0L 240 hp I4','Gas'),
(45,'3.0L 355 hp I6','Gas'),
(46,'3.0L 255 hp I6 Diesel','Diesel'),
(47,'2.0L 308 hp I4','Gas'),
(48,'4.4L 567 hp V8','Gas'),
(49,'0.7L 168 hp I2','Gas'),
(50,'170 hp Electric','Electric'),
(51,'168 hp Electric','Electric'),
(52,'0.7L 170 hp I2','Gas'),
(53,'1.5L 357 hp I3','Gas'),
(54,'6.0L 600 hp W12','Gas'),
(55,'6.0L 633 hp W12','Gas'),
(56,'4.0L 500 hp V8','Gas'),
(57,'4.0L 521 hp V8','Gas'),
(58,'6.0L 582 hp W12','Gas'),
(59,'6.0L 700 hp W12','Gas'),
(60,'6.0L 616 hp W12','Gas'),
(61,'6.0L 626 hp W12','Gas'),
(62,'6.8L 505 hp V8','Gas'),
(63,'6.8L 530 hp V8','Gas'),
(64,'1.6L 200 hp I4','Gas'),
(65,'3.6L 288 hp V6','Gas'),
(66,'1.4L 138 hp I4','Gas'),
(67,'1.4L 153 hp I4','Gas'),
(68,'2.5L 197 hp I4','Gas'),
(69,'3.6L 310 hp V6','Gas'),
(70,'2.4L 182 hp I4','Gas'),
(71,'2.4L 182 hp I4 Flex Fuel Vehicle','Gas'),
(72,'2.0L 259 hp I4','Gas'),
(73,'2.4L 180 hp I4','Gas'),
(74,'2.4L 180 hp I4 Flex Fuel Vehicle','Gas'),
(75,'2.0L 272 hp I4','Gas'),
(76,'3.6L 335 hp V6','Gas'),
(77,'2.5L 202 hp I4','Gas'),
(78,'3.6L 464 hp V6','Gas'),
(79,'2.0L 265 hp I4','Gas'),
(80,'3.0L 404 hp V6','Gas'),
(81,'2.0L 335 hp I4','Gas'),
(82,'2.0L 268 hp I4','Gas'),
(83,'3.6L 420 hp V6','Gas'),
(84,'6.2L 640 hp V8','Gas'),
(85,'6.2L 420 hp V8','Gas'),
(86,'3.6L 304 hp V6','Gas'),
(87,'3.6L 410 hp V6','Gas'),
(88,'200 hp Electric','Electric'),
(89,'2.0L 275 hp I4','Gas'),
(90,'6.2L 455 hp V8','Gas'),
(91,'6.2L 650 hp V8','Gas'),
(92,'3.6L 301 hp V6 Flex Fuel Vehicle','Gas'),
(93,'6.0L 355 hp V8 Flex Fuel Vehicle','Gas'),
(94,'2.0L 131 hp I4','Gas'),
(95,'2.5L 200 hp I4','Gas'),
(96,'2.8L 181 hp I4 Diesel','Diesel'),
(97,'3.6L 308 hp V6','Gas'),
(98,'6.2L 460 hp V8','Gas'),
(99,'1.6L 137 hp I4 Diesel','Diesel'),
(100,'3.6L 301 hp V6','Gas'),
(101,'4.8L 285 hp V8','Gas'),
(102,'6.0L 342 hp V8 Flex Fuel Vehicle','Gas'),
(103,'3.6L 260 hp V6 Compressed Natural Gas','Gas'),
(104,'3.6L 305 hp V6 Flex Fuel Vehicle','Gas'),
(105,'1.5L 160 hp I4','Gas'),
(106,'2.0L 250 hp I4','Gas'),
(107,'1.8L 182 hp I4 Hybrid','Hybrid'),
(108,'6.2L 415 hp V8','Gas'),
(109,'4.3L 285 hp V6 Flex Fuel Vehicle','Gas'),
(110,'5.3L 355 hp V8','Gas'),
(111,'6.0L 360 hp V8 Flex Fuel Vehicle','Gas'),
(112,'6.6L 445 hp V8 Biodiesel','Diesel'),
(113,'1.8L 138 hp I4','Gas'),
(114,'1.4L 98 hp I4','Gas'),
(115,'5.3L 355 hp V8 Flex Fuel Vehicle','Gas'),
(116,'6.0L 360 hp V8','Gas'),
(117,'3.6L 281 hp V6','Gas'),
(118,'1.5L 149 hp I4','Gas'),
(119,'2.4L 184 hp I4','Gas'),
(120,'3.6L 295 hp V6','Gas'),
(121,'3.6L 292 hp V6','Gas'),
(122,'5.7L 363 hp V8','Gas'),
(123,'3.6L 300 hp V6','Gas'),
(124,'3.6L 287 hp V6','Gas'),
(125,'3.6L 260 hp V6','Gas');

View File

@@ -2,12 +2,19 @@
INSERT INTO transmissions (id, type) VALUES
(1,'Automatic'),
(2,'Manual'),
(3,'5-Speed Automatic'),
(4,'6-Speed Manual'),
(5,'6-Speed Automatic'),
(6,'8-Speed Dual Clutch'),
(7,'9-Speed Automatic'),
(3,'8-Speed Dual Clutch'),
(4,'9-Speed Automatic'),
(5,'7-Speed Dual Clutch'),
(6,'9-Speed Dual Clutch'),
(7,'6-Speed Automatic'),
(8,'6-Speed Dual Clutch'),
(9,'8-Speed Automatic'),
(10,'Continuously Variable Transmission');
(9,'6-Speed Manual'),
(10,'8-Speed Automatic'),
(11,'1-Speed Dual Clutch'),
(12,'6-Speed Automatic Overdrive'),
(13,'4-Speed Automatic'),
(14,'10-Speed Automatic'),
(15,'Continuously Variable Transmission'),
(16,'7-Speed Manual'),
(17,'5-Speed Manual');

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,56 @@
#!/bin/bash
# Reset vehicle database tables before a fresh import.
#
# Prints the current row counts, asks for an interactive y/N confirmation,
# truncates the vehicle_options, engines and transmissions tables in the
# motovaultpro database inside the mvp-postgres container, then prints the
# row counts again so the operator can verify they are all 0.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# Print the row count of each vehicle table as a single result row.
show_counts() {
    docker exec mvp-postgres psql -U postgres -d motovaultpro -c \
        "SELECT
            (SELECT COUNT(*) FROM engines) as engines,
            (SELECT COUNT(*) FROM transmissions) as transmissions,
            (SELECT COUNT(*) FROM vehicle_options) as vehicle_options;"
}

echo "=========================================="
echo "Vehicle Database Reset"
echo "=========================================="
echo ""

# Check if postgres container is running.
# The name list is captured first so that `grep -q` closing the pipe early
# cannot fail the check under `pipefail`; -x requires an exact name match
# because docker's name filter matches substrings.
running_names="$(docker ps --filter "name=mvp-postgres" --format "{{.Names}}" || true)"
if ! grep -qx "mvp-postgres" <<<"$running_names"; then
    echo "Error: mvp-postgres container is not running" >&2
    exit 1
fi

echo "Current data (before reset):"
show_counts 2>/dev/null || echo " Tables may not exist yet"
echo ""

# Confirm reset. A failed read (e.g. stdin at EOF when run non-interactively)
# leaves REPLY empty and is treated as "no" instead of aborting via `set -e`.
REPLY=""
read -p "Are you sure you want to reset all vehicle data? (y/N) " -n 1 -r || true
echo ""
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    echo "Reset cancelled."
    exit 0
fi

echo ""
echo "Truncating tables..."
# ON_ERROR_STOP makes psql exit non-zero on the first failed statement, so
# `set -e` aborts here instead of reporting a reset that did not happen.
# The explicit transaction keeps the three truncates all-or-nothing.
docker exec -i mvp-postgres psql -v ON_ERROR_STOP=1 -U postgres -d motovaultpro <<'EOF'
BEGIN;
TRUNCATE TABLE vehicle_options RESTART IDENTITY CASCADE;
TRUNCATE TABLE engines RESTART IDENTITY CASCADE;
TRUNCATE TABLE transmissions RESTART IDENTITY CASCADE;
COMMIT;
EOF

echo ""
echo "=========================================="
echo "Reset complete"
echo "=========================================="
echo ""
echo "Verification (should all be 0):"
show_counts
echo ""
echo "Ready for fresh import with: ./import_data.sh"

View File

@@ -32,7 +32,7 @@ except ImportError: # pragma: no cover - env guard
SCRIPT_VERSION = "vehapi_fetch_snapshot.py@1.1.0"
DEFAULT_MIN_YEAR = 2015
DEFAULT_MAX_YEAR = 2022
DEFAULT_RATE_PER_SEC = 55 # stays under the 60 req/sec ceiling
DEFAULT_RATE_PER_MIN = 55 # stays under the 60 req/min ceiling
MAX_ATTEMPTS = 5
FALLBACK_TRIMS = ["Base"]
FALLBACK_TRANSMISSIONS = ["Manual", "Automatic"]
@@ -95,22 +95,18 @@ def ensure_snapshot_dir(root: Path, custom_dir: Optional[str]) -> Path:
class RateLimiter:
    """Fixed delay limiter to stay below the VehAPI threshold (60 req/min).

    Consecutive ``acquire()`` calls are spaced at least ``60 / max_per_min``
    seconds apart by sleeping in the calling thread.
    """

    def __init__(self, max_per_min: int) -> None:
        """Configure the minimum spacing between requests.

        Args:
            max_per_min: Maximum number of requests allowed per minute.

        Raises:
            ValueError: If ``max_per_min`` is not positive.
        """
        if max_per_min <= 0:
            # A zero rate would divide by zero and a negative one would
            # yield a negative delay that silently disables limiting.
            raise ValueError(f"max_per_min must be positive, got {max_per_min}")
        self.delay = 60.0 / max_per_min  # ~1.09 sec for 55 rpm
        # time.monotonic() has an undefined reference point, so 0.0 is not a
        # reliable "never requested" marker; -inf guarantees the first
        # acquire() returns without sleeping.
        self._last_request = float("-inf")

    def acquire(self) -> None:
        """Block until ``self.delay`` seconds have passed since the last call."""
        now = time.monotonic()
        elapsed = now - self._last_request
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        # Stamp after sleeping so the next interval starts from the moment
        # this call was released.
        self._last_request = time.monotonic()
@dataclass
@@ -132,7 +128,7 @@ class VehapiFetcher:
allowed_makes: Sequence[str],
snapshot_path: Path,
responses_cache: bool = True,
rate_per_sec: int = DEFAULT_RATE_PER_SEC,
rate_per_min: int = DEFAULT_RATE_PER_MIN,
) -> None:
self.session = session
self.base_url = base_url.rstrip("/")
@@ -146,7 +142,7 @@ class VehapiFetcher:
self.conn.execute("PRAGMA synchronous=NORMAL;")
self._init_schema()
self.responses_cache = responses_cache
self.rate_limiter = RateLimiter(rate_per_sec)
self.rate_limiter = RateLimiter(rate_per_min)
self.counts = FetchCounts()
def _init_schema(self) -> None:
@@ -251,7 +247,7 @@ class VehapiFetcher:
retry_seconds = float(retry_after)
except (TypeError, ValueError):
retry_seconds = 30.0
sleep_for = retry_seconds + random.uniform(0, 3)
sleep_for = retry_seconds + random.uniform(0, 0.5)
print(f"[info] {label}: hit 429, sleeping {sleep_for:.1f}s before retry", file=sys.stderr)
time.sleep(sleep_for)
backoff = min(backoff * 2, 30)
@@ -374,6 +370,7 @@ class VehapiFetcher:
self._fetch_engines_for_transmission(year, make, model, trim, trans, trans_bucket)
def _fetch_trims_for_model(self, year: int, make: str, model: str) -> None:
print(f" -> {year} {make} {model}", file=sys.stderr)
path = ["trims", str(year), make, model]
label = f"trims:{year}/{make}/{model}"
trims_payload = self._request_json(path, label)
@@ -416,9 +413,10 @@ class VehapiFetcher:
print(f"[info] {year}: no allowed makes found, skipping", file=sys.stderr)
continue
print(f"[info] {year}: {len(makes)} makes", file=sys.stderr)
for make in makes:
print(f"[info] {year} {make}: fetching models", file=sys.stderr)
for idx, make in enumerate(makes, 1):
print(f"[{year}] ({idx}/{len(makes)}) {make}", file=sys.stderr)
self._fetch_models_for_make(year, make)
print(f" [{self.counts.pairs_inserted} pairs so far]", file=sys.stderr)
self.conn.commit()
return self.counts
@@ -429,7 +427,7 @@ def build_arg_parser() -> argparse.ArgumentParser:
parser.add_argument("--max-year", type=int, default=int(read_env("MAX_YEAR", DEFAULT_MAX_YEAR)), help="Inclusive max year (default env MAX_YEAR or 2026)")
parser.add_argument("--snapshot-dir", type=str, help="Target snapshot directory (default snapshots/<today>)")
parser.add_argument("--base-url", type=str, default=read_env("VEHAPI_BASE_URL", DEFAULT_BASE_URL), help="VehAPI base URL (e.g. https://vehapi.com/api/v1/car-lists/get/car)")
parser.add_argument("--rate-per-sec", type=int, default=int(read_env("VEHAPI_MAX_RPS", DEFAULT_RATE_PER_SEC)), help="Max requests per second (<=60)")
parser.add_argument("--rate-per-min", type=int, default=int(read_env("VEHAPI_MAX_RPM", DEFAULT_RATE_PER_MIN)), help="Max requests per minute (<=60)")
parser.add_argument("--makes-file", type=str, default="source-makes.txt", help="Path to source-makes.txt")
parser.add_argument("--api-key-file", type=str, default="vehapi.key", help="Path to VehAPI bearer token file")
parser.add_argument("--no-response-cache", action="store_true", help="Disable request cache stored in snapshot.sqlite")
@@ -477,7 +475,7 @@ def main(argv: Sequence[str]) -> int:
allowed_makes=allowed_makes,
snapshot_path=snapshot_path,
responses_cache=not args.no_response_cache,
rate_per_sec=args.rate_per_sec,
rate_per_min=args.rate_per_min,
)
started_at = datetime.now(timezone.utc)