Vehicle ETL Process fixed. Admin settings fixed.
This commit is contained in:
41
data/vehicle-etl/README.md
Normal file
41
data/vehicle-etl/README.md
Normal file
@@ -0,0 +1,41 @@
|
||||
Step 1: Fetch Data from VehAPI
|
||||
|
||||
cd data/vehicle-etl
|
||||
python3 vehapi_fetch_snapshot.py --min-year 2015 --max-year 2025
|
||||
|
||||
Options:
|
||||
| Flag | Default | Description |
|
||||
|---------------------|-------------------|------------------------|
|
||||
| --min-year | 2015 | Start year |
|
||||
| --max-year | 2022 | End year |
|
||||
| --rate-per-min | 55 | Max API requests per minute (VehAPI ceiling is 60) |
|
||||
| --snapshot-dir | snapshots/<today> | Output directory |
|
||||
| --no-response-cache | false | Disable resume caching |
|
||||
|
||||
Output: Creates snapshots/<date>/snapshot.sqlite
|
||||
|
||||
---
|
||||
Step 2: Generate SQL Files
|
||||
|
||||
python3 etl_generate_sql.py --snapshot-path snapshots/<date>/snapshot.sqlite
|
||||
|
||||
Output: Creates output/01_engines.sql, output/02_transmissions.sql, output/03_vehicle_options.sql
|
||||
|
||||
---
|
||||
Step 3: Import to PostgreSQL
|
||||
|
||||
./import_data.sh
|
||||
|
||||
Requires: mvp-postgres container running, SQL files in output/
|
||||
|
||||
---
|
||||
Quick Test (single year)
|
||||
|
||||
python3 vehapi_fetch_snapshot.py --min-year 2020 --max-year 2020
|
||||
|
||||
# Full ETL workflow
|
||||
./reset_database.sh # Clear old data
|
||||
python3 vehapi_fetch_snapshot.py # Fetch from API
|
||||
python3 etl_generate_sql.py # Generate SQL
|
||||
./import_data.sh # Import to Postgres
|
||||
docker compose exec mvp-redis redis-cli FLUSHALL # Flush Redis Cache for front end
|
||||
Binary file not shown.
@@ -1,22 +1,128 @@
|
||||
-- Auto-generated by etl_generate_sql.py
|
||||
INSERT INTO engines (id, name, fuel_type) VALUES
|
||||
(1,'Gas','Gas'),
|
||||
(2,'2.0L 150 hp I4','Gas'),
|
||||
(3,'2.4L 201 hp I4','Gas'),
|
||||
(4,'3.5L 290 hp V6','Gas'),
|
||||
(5,'3.5L 273 hp V6','Gas'),
|
||||
(6,'3.5L 310 hp V6','Gas'),
|
||||
(7,'2.4L 206 hp I4','Gas'),
|
||||
(8,'2.0L 220 hp I4','Gas'),
|
||||
(9,'1.8L 170 hp I4','Gas'),
|
||||
(10,'Diesel','Diesel'),
|
||||
(11,'2.0L 150 hp I4 Diesel','Diesel'),
|
||||
(12,'2.0L 220 hp I4 Flex Fuel Vehicle','Gas'),
|
||||
(13,'3.0L 310 hp V6','Gas'),
|
||||
(14,'3.0L 240 hp V6 Diesel','Diesel'),
|
||||
(15,'4.0L 435 hp V8','Diesel'),
|
||||
(2,'2.4L 201 hp I4','Gas'),
|
||||
(3,'3.5L 290 hp V6','Gas'),
|
||||
(4,'3.0L 321 hp V6 Hybrid','Hybrid'),
|
||||
(5,'3.5L 573 hp V6 Hybrid','Hybrid'),
|
||||
(6,'3.5L 279 hp V6','Gas'),
|
||||
(7,'3.5L 310 hp V6','Gas'),
|
||||
(8,'3.5L 377 hp V6 Hybrid','Hybrid'),
|
||||
(9,'2.4L 206 hp I4','Gas'),
|
||||
(10,'2.0L 220 hp I4','Gas'),
|
||||
(11,'2.0L 186 hp I4','Gas'),
|
||||
(12,'1.4L 204 hp I4','Gas'),
|
||||
(13,'2.0L 190 hp I4','Gas'),
|
||||
(14,'2.0L 252 hp I4','Gas'),
|
||||
(15,'2.0L 220 hp I4 Flex Fuel Vehicle','Gas'),
|
||||
(16,'3.0L 333 hp V6','Gas'),
|
||||
(17,'6.3L 500 hp W12','Gas'),
|
||||
(18,'2.0L 200 hp I4','Gas'),
|
||||
(19,'3.0L 272 hp V6','Gas');
|
||||
(17,'3.0L 340 hp V6','Gas'),
|
||||
(18,'4.0L 450 hp V8','Gas'),
|
||||
(19,'2.0L 200 hp I4','Gas'),
|
||||
(20,'3.0L 272 hp V6','Gas'),
|
||||
(21,'Diesel','Diesel'),
|
||||
(22,'5.2L 540 hp V10','Gas'),
|
||||
(23,'5.2L 610 hp V10','Gas'),
|
||||
(24,'2.5L 400 hp I5','Gas'),
|
||||
(25,'4.0L 560 hp V8','Gas'),
|
||||
(26,'4.0L 605 hp V8','Gas'),
|
||||
(27,'2.0L 292 hp I4','Gas'),
|
||||
(28,'3.0L 354 hp V6','Gas'),
|
||||
(29,'2.0L 248 hp I4','Gas'),
|
||||
(30,'3.0L 335 hp I6','Gas'),
|
||||
(31,'2.0L 180 hp I4','Gas'),
|
||||
(32,'2.0L 180 hp I4 Diesel','Diesel'),
|
||||
(33,'3.0L 320 hp I6','Gas'),
|
||||
(34,'3.0L 300 hp I6','Gas'),
|
||||
(35,'4.4L 445 hp V8','Gas'),
|
||||
(36,'3.0L 315 hp I6','Gas'),
|
||||
(37,'4.4L 600 hp V8','Gas'),
|
||||
(38,'2.0L 322 hp I4','Gas'),
|
||||
(39,'6.6L 601 hp V12','Gas'),
|
||||
(40,'3.0L 365 hp I6','Gas'),
|
||||
(41,'3.0L 425 hp I6','Gas'),
|
||||
(42,'4.4L 552 hp V8','Gas'),
|
||||
(43,'2.0L 228 hp I4','Gas'),
|
||||
(44,'2.0L 240 hp I4','Gas'),
|
||||
(45,'3.0L 355 hp I6','Gas'),
|
||||
(46,'3.0L 255 hp I6 Diesel','Diesel'),
|
||||
(47,'2.0L 308 hp I4','Gas'),
|
||||
(48,'4.4L 567 hp V8','Gas'),
|
||||
(49,'0.7L 168 hp I2','Gas'),
|
||||
(50,'170 hp Electric','Electric'),
|
||||
(51,'168 hp Electric','Electric'),
|
||||
(52,'0.7L 170 hp I2','Gas'),
|
||||
(53,'1.5L 357 hp I3','Gas'),
|
||||
(54,'6.0L 600 hp W12','Gas'),
|
||||
(55,'6.0L 633 hp W12','Gas'),
|
||||
(56,'4.0L 500 hp V8','Gas'),
|
||||
(57,'4.0L 521 hp V8','Gas'),
|
||||
(58,'6.0L 582 hp W12','Gas'),
|
||||
(59,'6.0L 700 hp W12','Gas'),
|
||||
(60,'6.0L 616 hp W12','Gas'),
|
||||
(61,'6.0L 626 hp W12','Gas'),
|
||||
(62,'6.8L 505 hp V8','Gas'),
|
||||
(63,'6.8L 530 hp V8','Gas'),
|
||||
(64,'1.6L 200 hp I4','Gas'),
|
||||
(65,'3.6L 288 hp V6','Gas'),
|
||||
(66,'1.4L 138 hp I4','Gas'),
|
||||
(67,'1.4L 153 hp I4','Gas'),
|
||||
(68,'2.5L 197 hp I4','Gas'),
|
||||
(69,'3.6L 310 hp V6','Gas'),
|
||||
(70,'2.4L 182 hp I4','Gas'),
|
||||
(71,'2.4L 182 hp I4 Flex Fuel Vehicle','Gas'),
|
||||
(72,'2.0L 259 hp I4','Gas'),
|
||||
(73,'2.4L 180 hp I4','Gas'),
|
||||
(74,'2.4L 180 hp I4 Flex Fuel Vehicle','Gas'),
|
||||
(75,'2.0L 272 hp I4','Gas'),
|
||||
(76,'3.6L 335 hp V6','Gas'),
|
||||
(77,'2.5L 202 hp I4','Gas'),
|
||||
(78,'3.6L 464 hp V6','Gas'),
|
||||
(79,'2.0L 265 hp I4','Gas'),
|
||||
(80,'3.0L 404 hp V6','Gas'),
|
||||
(81,'2.0L 335 hp I4','Gas'),
|
||||
(82,'2.0L 268 hp I4','Gas'),
|
||||
(83,'3.6L 420 hp V6','Gas'),
|
||||
(84,'6.2L 640 hp V8','Gas'),
|
||||
(85,'6.2L 420 hp V8','Gas'),
|
||||
(86,'3.6L 304 hp V6','Gas'),
|
||||
(87,'3.6L 410 hp V6','Gas'),
|
||||
(88,'200 hp Electric','Electric'),
|
||||
(89,'2.0L 275 hp I4','Gas'),
|
||||
(90,'6.2L 455 hp V8','Gas'),
|
||||
(91,'6.2L 650 hp V8','Gas'),
|
||||
(92,'3.6L 301 hp V6 Flex Fuel Vehicle','Gas'),
|
||||
(93,'6.0L 355 hp V8 Flex Fuel Vehicle','Gas'),
|
||||
(94,'2.0L 131 hp I4','Gas'),
|
||||
(95,'2.5L 200 hp I4','Gas'),
|
||||
(96,'2.8L 181 hp I4 Diesel','Diesel'),
|
||||
(97,'3.6L 308 hp V6','Gas'),
|
||||
(98,'6.2L 460 hp V8','Gas'),
|
||||
(99,'1.6L 137 hp I4 Diesel','Diesel'),
|
||||
(100,'3.6L 301 hp V6','Gas'),
|
||||
(101,'4.8L 285 hp V8','Gas'),
|
||||
(102,'6.0L 342 hp V8 Flex Fuel Vehicle','Gas'),
|
||||
(103,'3.6L 260 hp V6 Compressed Natural Gas','Gas'),
|
||||
(104,'3.6L 305 hp V6 Flex Fuel Vehicle','Gas'),
|
||||
(105,'1.5L 160 hp I4','Gas'),
|
||||
(106,'2.0L 250 hp I4','Gas'),
|
||||
(107,'1.8L 182 hp I4 Hybrid','Hybrid'),
|
||||
(108,'6.2L 415 hp V8','Gas'),
|
||||
(109,'4.3L 285 hp V6 Flex Fuel Vehicle','Gas'),
|
||||
(110,'5.3L 355 hp V8','Gas'),
|
||||
(111,'6.0L 360 hp V8 Flex Fuel Vehicle','Gas'),
|
||||
(112,'6.6L 445 hp V8 Biodiesel','Diesel'),
|
||||
(113,'1.8L 138 hp I4','Gas'),
|
||||
(114,'1.4L 98 hp I4','Gas'),
|
||||
(115,'5.3L 355 hp V8 Flex Fuel Vehicle','Gas'),
|
||||
(116,'6.0L 360 hp V8','Gas'),
|
||||
(117,'3.6L 281 hp V6','Gas'),
|
||||
(118,'1.5L 149 hp I4','Gas'),
|
||||
(119,'2.4L 184 hp I4','Gas'),
|
||||
(120,'3.6L 295 hp V6','Gas'),
|
||||
(121,'3.6L 292 hp V6','Gas'),
|
||||
(122,'5.7L 363 hp V8','Gas'),
|
||||
(123,'3.6L 300 hp V6','Gas'),
|
||||
(124,'3.6L 287 hp V6','Gas'),
|
||||
(125,'3.6L 260 hp V6','Gas');
|
||||
|
||||
|
||||
@@ -2,12 +2,19 @@
|
||||
INSERT INTO transmissions (id, type) VALUES
|
||||
(1,'Automatic'),
|
||||
(2,'Manual'),
|
||||
(3,'5-Speed Automatic'),
|
||||
(4,'6-Speed Manual'),
|
||||
(5,'6-Speed Automatic'),
|
||||
(6,'8-Speed Dual Clutch'),
|
||||
(7,'9-Speed Automatic'),
|
||||
(3,'8-Speed Dual Clutch'),
|
||||
(4,'9-Speed Automatic'),
|
||||
(5,'7-Speed Dual Clutch'),
|
||||
(6,'9-Speed Dual Clutch'),
|
||||
(7,'6-Speed Automatic'),
|
||||
(8,'6-Speed Dual Clutch'),
|
||||
(9,'8-Speed Automatic'),
|
||||
(10,'Continuously Variable Transmission');
|
||||
(9,'6-Speed Manual'),
|
||||
(10,'8-Speed Automatic'),
|
||||
(11,'1-Speed Dual Clutch'),
|
||||
(12,'6-Speed Automatic Overdrive'),
|
||||
(13,'4-Speed Automatic'),
|
||||
(14,'10-Speed Automatic'),
|
||||
(15,'Continuously Variable Transmission'),
|
||||
(16,'7-Speed Manual'),
|
||||
(17,'5-Speed Manual');
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
56
data/vehicle-etl/reset_database.sh
Executable file
56
data/vehicle-etl/reset_database.sh
Executable file
@@ -0,0 +1,56 @@
|
||||
#!/bin/bash
# Reset vehicle database tables before a fresh import.
#
# Flow: verify the mvp-postgres container is running, show the current row
# counts, ask for confirmation, truncate vehicle_options / engines /
# transmissions in the motovaultpro database, then show the counts again.
# Exits 1 if the container is not running, 0 if the user declines, and
# non-zero if the truncate itself fails.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

CONTAINER="mvp-postgres"
DB_USER="postgres"
DB_NAME="motovaultpro"

COUNT_QUERY="SELECT
(SELECT COUNT(*) FROM engines) as engines,
(SELECT COUNT(*) FROM transmissions) as transmissions,
(SELECT COUNT(*) FROM vehicle_options) as vehicle_options;"

echo "=========================================="
echo "Vehicle Database Reset"
echo "=========================================="
echo ""

# Check if postgres container is running.
# The name list is captured first instead of piping straight into `grep -q`:
# under `pipefail`, grep exiting early can make `docker ps` die on SIGPIPE and
# turn a successful match into a false "not running". `-x` requires the whole
# line to match so e.g. "mvp-postgres-backup" does not count.
running_containers="$(docker ps --filter "name=${CONTAINER}" --format "{{.Names}}" || true)"
if ! grep -qx "${CONTAINER}" <<<"${running_containers}"; then
    echo "Error: mvp-postgres container is not running"
    exit 1
fi

echo "Current data (before reset):"
# Best effort: on a brand-new database the tables may not exist yet.
docker exec "${CONTAINER}" psql -U "${DB_USER}" -d "${DB_NAME}" -c "${COUNT_QUERY}" 2>/dev/null \
    || echo " Tables may not exist yet"
echo ""

# Confirm reset.
# `|| true` keeps `set -e` from aborting silently when stdin is closed (EOF);
# an empty reply then falls through to the "cancelled" branch below.
read -p "Are you sure you want to reset all vehicle data? (y/N) " -n 1 -r || true
echo ""
if [[ ! "${REPLY:-}" =~ ^[Yy]$ ]]; then
    echo "Reset cancelled."
    exit 0
fi

echo ""
echo "Truncating tables..."
# ON_ERROR_STOP makes psql exit non-zero on an SQL error; without it psql
# returns 0 for a failed statement read from stdin and this script would go on
# to print "Reset complete". A single TRUNCATE statement is atomic, so the
# three tables are either all reset or all left untouched.
docker exec -i "${CONTAINER}" psql -v ON_ERROR_STOP=1 -U "${DB_USER}" -d "${DB_NAME}" <<'EOF'
TRUNCATE TABLE vehicle_options, engines, transmissions RESTART IDENTITY CASCADE;
EOF

echo ""
echo "=========================================="
echo "Reset complete"
echo "=========================================="
echo ""
echo "Verification (should all be 0):"
docker exec "${CONTAINER}" psql -U "${DB_USER}" -d "${DB_NAME}" -c "${COUNT_QUERY}"
echo ""
echo "Ready for fresh import with: ./import_data.sh"
|
||||
@@ -32,7 +32,7 @@ except ImportError: # pragma: no cover - env guard
|
||||
SCRIPT_VERSION = "vehapi_fetch_snapshot.py@1.1.0"
|
||||
DEFAULT_MIN_YEAR = 2015
|
||||
DEFAULT_MAX_YEAR = 2022
|
||||
DEFAULT_RATE_PER_SEC = 55 # stays under the 60 req/sec ceiling
|
||||
DEFAULT_RATE_PER_MIN = 55 # stays under the 60 req/min ceiling
|
||||
MAX_ATTEMPTS = 5
|
||||
FALLBACK_TRIMS = ["Base"]
|
||||
FALLBACK_TRANSMISSIONS = ["Manual", "Automatic"]
|
||||
@@ -95,22 +95,18 @@ def ensure_snapshot_dir(root: Path, custom_dir: Optional[str]) -> Path:
|
||||
|
||||
|
||||
class RateLimiter:
    """Fixed delay limiter to stay below the VehAPI threshold (60 req/min).

    Requests are spaced evenly: each ``acquire()`` call blocks until at least
    ``60 / max_per_min`` seconds have passed since the previous call returned.

    Args:
        max_per_min: Maximum number of requests allowed per minute; must be
            a positive number.

    Raises:
        ValueError: If ``max_per_min`` is zero or negative.
    """

    def __init__(self, max_per_min: int) -> None:
        if max_per_min <= 0:
            raise ValueError(f"max_per_min must be positive, got {max_per_min!r}")
        self.delay = 60.0 / max_per_min  # ~1.09 sec for 55 rpm
        # The reference point of time.monotonic() is undefined, so use -inf
        # (rather than 0.0) to guarantee the first acquire() never sleeps.
        self._last_request = float("-inf")

    def acquire(self) -> None:
        """Block until the next request may be sent, then record its time."""
        elapsed = time.monotonic() - self._last_request
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        # Re-read the clock after sleeping so spacing is measured from the
        # moment the caller is actually released.
        self._last_request = time.monotonic()
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -132,7 +128,7 @@ class VehapiFetcher:
|
||||
allowed_makes: Sequence[str],
|
||||
snapshot_path: Path,
|
||||
responses_cache: bool = True,
|
||||
rate_per_sec: int = DEFAULT_RATE_PER_SEC,
|
||||
rate_per_min: int = DEFAULT_RATE_PER_MIN,
|
||||
) -> None:
|
||||
self.session = session
|
||||
self.base_url = base_url.rstrip("/")
|
||||
@@ -146,7 +142,7 @@ class VehapiFetcher:
|
||||
self.conn.execute("PRAGMA synchronous=NORMAL;")
|
||||
self._init_schema()
|
||||
self.responses_cache = responses_cache
|
||||
self.rate_limiter = RateLimiter(rate_per_sec)
|
||||
self.rate_limiter = RateLimiter(rate_per_min)
|
||||
self.counts = FetchCounts()
|
||||
|
||||
def _init_schema(self) -> None:
|
||||
@@ -251,7 +247,7 @@ class VehapiFetcher:
|
||||
retry_seconds = float(retry_after)
|
||||
except (TypeError, ValueError):
|
||||
retry_seconds = 30.0
|
||||
sleep_for = retry_seconds + random.uniform(0, 3)
|
||||
sleep_for = retry_seconds + random.uniform(0, 0.5)
|
||||
print(f"[info] {label}: hit 429, sleeping {sleep_for:.1f}s before retry", file=sys.stderr)
|
||||
time.sleep(sleep_for)
|
||||
backoff = min(backoff * 2, 30)
|
||||
@@ -374,6 +370,7 @@ class VehapiFetcher:
|
||||
self._fetch_engines_for_transmission(year, make, model, trim, trans, trans_bucket)
|
||||
|
||||
def _fetch_trims_for_model(self, year: int, make: str, model: str) -> None:
|
||||
print(f" -> {year} {make} {model}", file=sys.stderr)
|
||||
path = ["trims", str(year), make, model]
|
||||
label = f"trims:{year}/{make}/{model}"
|
||||
trims_payload = self._request_json(path, label)
|
||||
@@ -416,9 +413,10 @@ class VehapiFetcher:
|
||||
print(f"[info] {year}: no allowed makes found, skipping", file=sys.stderr)
|
||||
continue
|
||||
print(f"[info] {year}: {len(makes)} makes", file=sys.stderr)
|
||||
for make in makes:
|
||||
print(f"[info] {year} {make}: fetching models", file=sys.stderr)
|
||||
for idx, make in enumerate(makes, 1):
|
||||
print(f"[{year}] ({idx}/{len(makes)}) {make}", file=sys.stderr)
|
||||
self._fetch_models_for_make(year, make)
|
||||
print(f" [{self.counts.pairs_inserted} pairs so far]", file=sys.stderr)
|
||||
self.conn.commit()
|
||||
return self.counts
|
||||
|
||||
@@ -429,7 +427,7 @@ def build_arg_parser() -> argparse.ArgumentParser:
|
||||
parser.add_argument("--max-year", type=int, default=int(read_env("MAX_YEAR", DEFAULT_MAX_YEAR)), help="Inclusive max year (default env MAX_YEAR or 2026)")
|
||||
parser.add_argument("--snapshot-dir", type=str, help="Target snapshot directory (default snapshots/<today>)")
|
||||
parser.add_argument("--base-url", type=str, default=read_env("VEHAPI_BASE_URL", DEFAULT_BASE_URL), help="VehAPI base URL (e.g. https://vehapi.com/api/v1/car-lists/get/car)")
|
||||
parser.add_argument("--rate-per-sec", type=int, default=int(read_env("VEHAPI_MAX_RPS", DEFAULT_RATE_PER_SEC)), help="Max requests per second (<=60)")
|
||||
parser.add_argument("--rate-per-min", type=int, default=int(read_env("VEHAPI_MAX_RPM", DEFAULT_RATE_PER_MIN)), help="Max requests per minute (<=60)")
|
||||
parser.add_argument("--makes-file", type=str, default="source-makes.txt", help="Path to source-makes.txt")
|
||||
parser.add_argument("--api-key-file", type=str, default="vehapi.key", help="Path to VehAPI bearer token file")
|
||||
parser.add_argument("--no-response-cache", action="store_true", help="Disable request cache stored in snapshot.sqlite")
|
||||
@@ -477,7 +475,7 @@ def main(argv: Sequence[str]) -> int:
|
||||
allowed_makes=allowed_makes,
|
||||
snapshot_path=snapshot_path,
|
||||
responses_cache=not args.no_response_cache,
|
||||
rate_per_sec=args.rate_per_sec,
|
||||
rate_per_min=args.rate_per_min,
|
||||
)
|
||||
|
||||
started_at = datetime.now(timezone.utc)
|
||||
|
||||
Reference in New Issue
Block a user