Initial Commit

This commit is contained in:
Eric Gullickson
2025-09-17 16:09:15 -05:00
parent 0cdb9803de
commit a052040e3a
373 changed files with 437090 additions and 6773 deletions

View File

@@ -0,0 +1,3 @@
from .make_filter import MakeFilter
__all__ = ['MakeFilter']

View File

@@ -0,0 +1,392 @@
"""
Engine Specification Parser
Parses engine specifications from JSON vehicle data into structured components.
Handles displacement, configuration, cylinders, fuel type, and aspiration.
CRITICAL REQUIREMENT: L-configuration normalization
- L3 → I3 (L-configuration treated as Inline)
- L4 → I4 (L-configuration treated as Inline)
Standard format: {displacement}L {config}{cylinders} {modifiers}
Examples:
- "2.0L I4" → 2.0L, Inline, 4-cylinder
- "1.5L L3 PLUG-IN HYBRID EV- (PHEV)" → 1.5L, Inline (normalized), 3-cyl, Plug-in Hybrid
- "2.4L H4" → 2.4L, Horizontal (Subaru Boxer), 4-cylinder
Usage:
parser = EngineSpecParser()
spec = parser.parse_engine_string("1.5L L3 PLUG-IN HYBRID EV- (PHEV)")
# spec.configuration == "I" (normalized from L)
"""
import re
import logging
from typing import Optional, List, Pattern
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class EngineSpec:
    """Structured result of parsing one engine specification string."""
    displacement_l: Optional[float]  # Engine displacement in liters (None for electric/unparsed)
    configuration: str               # I, V, H, W, Electric, or Unknown
    cylinders: Optional[int]         # Cylinder count (None for electric/unparsed)
    fuel_type: str                   # Gasoline, Hybrid variants, Electric, Flex Fuel
    aspiration: str                  # Natural, Turbocharged, Supercharged
    raw_string: str                  # Original engine string, preserved verbatim

    def __str__(self) -> str:
        # Compact human-readable summary, e.g. "EngineSpec(2.0L I4, Gasoline, Natural)"
        engine = "{}L {}{}".format(self.displacement_l, self.configuration, self.cylinders)
        return "EngineSpec({}, {}, {})".format(engine, self.fuel_type, self.aspiration)
class EngineSpecParser:
    """Parse engine specifications with L→I normalization.

    Parses strings of the form "{displacement}L {config}{cylinders} {modifiers}"
    (e.g. "2.0L I4", "1.5L L3 PLUG-IN HYBRID EV- (PHEV)") into EngineSpec
    objects. L-configurations are normalized to I (Inline); unparseable
    strings fall back to an EngineSpec with Unknown fields.
    """

    def __init__(self):
        """Compile all regex patterns used during parsing."""
        # Primary engine pattern: {displacement}L {config}{cylinders}
        # Supports I, V, H, L, W configurations
        self.engine_pattern = re.compile(r'(\d+\.?\d*)L\s+([IVHLW])(\d+)', re.IGNORECASE)
        # Hybrid detection patterns (most specific first, so PHEV/FHEV are
        # recognized before the generic HYBRID match)
        self.hybrid_patterns = [
            (re.compile(r'PLUG-IN HYBRID EV-?\s*\(PHEV\)', re.IGNORECASE), 'Plug-in Hybrid'),
            (re.compile(r'FULL HYBRID EV-?\s*\(FHEV\)', re.IGNORECASE), 'Full Hybrid'),
            (re.compile(r'HYBRID', re.IGNORECASE), 'Hybrid'),
        ]
        # Other fuel type patterns
        self.fuel_patterns = [
            (re.compile(r'FLEX', re.IGNORECASE), 'Flex Fuel'),
            (re.compile(r'ELECTRIC', re.IGNORECASE), 'Electric'),
        ]
        # Aspiration patterns.
        # FIX: the bare 'SC' alternative previously matched any 'sc'
        # substring inside longer words; word boundaries restrict it to the
        # standalone 'SC' abbreviation.
        self.aspiration_patterns = [
            (re.compile(r'TURBO', re.IGNORECASE), 'Turbocharged'),
            (re.compile(r'SUPERCHARGED|\bSC\b', re.IGNORECASE), 'Supercharged'),
        ]
        logger.debug("EngineSpecParser initialized with regex patterns")

    def normalize_configuration(self, config: str) -> str:
        """
        CRITICAL: Convert L-configuration to I (Inline)
        L-configurations are alternate notation for Inline engines.
        W-configurations are W-type engines (VW Group, Bentley, etc.)
        Args:
            config: Configuration character (I, V, H, L, W)
        Returns:
            Normalized configuration (L becomes I, others unchanged, upper-cased)
        """
        config_upper = config.upper()
        if config_upper == 'L':
            logger.debug("Normalizing L-configuration to I (Inline)")
            return 'I'
        return config_upper

    def extract_fuel_type(self, engine_str: str) -> str:
        """
        Extract fuel type from engine string
        Priority order:
        1. Hybrid patterns (PHEV, FHEV, HYBRID)
        2. Other fuel types (FLEX, ELECTRIC)
        3. Default to Gasoline
        Args:
            engine_str: Original engine string
        Returns:
            Detected fuel type
        """
        # Check hybrid patterns first (most specific)
        for pattern, fuel_type in self.hybrid_patterns:
            if pattern.search(engine_str):
                logger.debug(f"Detected fuel type '{fuel_type}' from '{engine_str}'")
                return fuel_type
        # Check other fuel types
        for pattern, fuel_type in self.fuel_patterns:
            if pattern.search(engine_str):
                logger.debug(f"Detected fuel type '{fuel_type}' from '{engine_str}'")
                return fuel_type
        # Default to gasoline
        return 'Gasoline'

    def extract_aspiration(self, engine_str: str) -> str:
        """
        Extract aspiration type from engine string
        Args:
            engine_str: Original engine string
        Returns:
            Detected aspiration type ('Natural' when no pattern matches)
        """
        for pattern, aspiration in self.aspiration_patterns:
            if pattern.search(engine_str):
                logger.debug(f"Detected aspiration '{aspiration}' from '{engine_str}'")
                return aspiration
        return 'Natural'  # Default to naturally aspirated

    def parse_engine_string(self, engine_str: str) -> EngineSpec:
        """
        Parse complete engine specification
        Args:
            engine_str: Engine specification string
        Returns:
            EngineSpec with parsed components (fallback spec if unparseable)
        """
        if not engine_str or not engine_str.strip():
            logger.warning("Empty engine string provided")
            return self.create_fallback_engine("Empty Engine String")
        engine_str = engine_str.strip()
        # Try to match standard engine pattern (anchored at string start)
        match = self.engine_pattern.match(engine_str)
        if not match:
            logger.warning(f"Could not parse engine string: '{engine_str}'")
            return self.create_fallback_engine(engine_str)
        try:
            # Extract basic components
            displacement = float(match.group(1))
            raw_config = match.group(2)
            cylinders = int(match.group(3))
            # CRITICAL: Apply L→I normalization
            config = self.normalize_configuration(raw_config)
            # Extract fuel type and aspiration from modifiers
            fuel_type = self.extract_fuel_type(engine_str)
            aspiration = self.extract_aspiration(engine_str)
            # Log L→I normalization when it occurs
            # FIX: restored the " → " separator that was missing between the
            # interpolations, which made the message unreadable.
            if raw_config.upper() == 'L' and config == 'I':
                logger.info(f"L→I normalization applied: '{engine_str}' → {displacement}L I{cylinders}")
            spec = EngineSpec(
                displacement_l=displacement,
                configuration=config,
                cylinders=cylinders,
                fuel_type=fuel_type,
                aspiration=aspiration,
                raw_string=engine_str
            )
            logger.debug(f"Parsed '{engine_str}' → {spec}")
            return spec
        except (ValueError, IndexError) as e:
            logger.error(f"Failed to parse matched components from '{engine_str}': {e}")
            return self.create_fallback_engine(engine_str)

    def create_fallback_engine(self, raw_string: str) -> EngineSpec:
        """
        Create fallback engine spec for unparseable strings
        Args:
            raw_string: Original engine string that couldn't be parsed
        Returns:
            EngineSpec with unknown values but preserved raw string
        """
        logger.debug(f"Creating fallback engine for '{raw_string}'")
        return EngineSpec(
            displacement_l=None,
            configuration="Unknown",
            cylinders=None,
            fuel_type="Unknown",
            aspiration="Natural",
            raw_string=raw_string
        )

    def create_electric_motor(self) -> EngineSpec:
        """
        Create default electric motor spec for empty engines arrays
        Common for Tesla, Lucid, and other electric vehicles that have
        empty engines arrays in their JSON data.
        Returns:
            EngineSpec configured for electric motor
        """
        logger.debug("Creating default electric motor spec")
        # NOTE(review): EngineSpec declares aspiration as str but None is
        # used here for N/A — callers may rely on None; confirm before
        # tightening the type.
        return EngineSpec(
            displacement_l=None,       # N/A for electric
            configuration="Electric",  # Special designation
            cylinders=None,            # N/A for electric
            fuel_type="Electric",
            aspiration=None,           # N/A for electric
            raw_string="Electric Motor"
        )

    def parse_multiple_engines(self, engine_strings: List[str]) -> List[EngineSpec]:
        """
        Parse multiple engine specifications
        Args:
            engine_strings: List of engine specification strings
        Returns:
            List of parsed EngineSpec objects (a single synthetic electric
            motor spec when the input list is empty)
        """
        if not engine_strings:
            # Handle empty engines array (common for electric vehicles)
            logger.info("Empty engines array detected - creating electric motor")
            return [self.create_electric_motor()]
        specs = [self.parse_engine_string(engine_str) for engine_str in engine_strings]
        logger.debug(f"Parsed {len(specs)} engines from {len(engine_strings)} strings")
        return specs

    def get_unique_engines(self, engine_specs: List[EngineSpec]) -> List[EngineSpec]:
        """
        Get unique engines based on key attributes
        Args:
            engine_specs: List of engine specifications
        Returns:
            List of unique engine specifications (first occurrence wins,
            input order preserved)
        """
        seen = set()
        unique_specs = []
        for spec in engine_specs:
            # Key deliberately excludes raw_string so cosmetic string
            # differences don't produce duplicates.
            key = (
                spec.displacement_l,
                spec.configuration,
                spec.cylinders,
                spec.fuel_type,
                spec.aspiration
            )
            if key not in seen:
                seen.add(key)
                unique_specs.append(spec)
            else:
                logger.debug(f"Skipping duplicate engine: {spec}")
        logger.info(f"Reduced {len(engine_specs)} engines to {len(unique_specs)} unique engines")
        return unique_specs

    def validate_engine_spec(self, spec: EngineSpec) -> List[str]:
        """
        Validate engine specification for data quality issues
        Args:
            spec: Engine specification to validate
        Returns:
            List of validation warnings (empty if no issues)
        """
        warnings = []
        # Check displacement
        if spec.displacement_l is not None:
            if spec.displacement_l <= 0:
                warnings.append(f"Invalid displacement: {spec.displacement_l}")
            elif spec.displacement_l > 20:  # Unrealistic for production cars
                warnings.append(f"Unusually large displacement: {spec.displacement_l}L")
        # Check cylinders
        if spec.cylinders is not None:
            if spec.cylinders <= 0:
                warnings.append(f"Invalid cylinder count: {spec.cylinders}")
            elif spec.cylinders > 16:  # Very rare in production
                warnings.append(f"Unusually high cylinder count: {spec.cylinders}")
        # Check configuration consistency
        if spec.configuration == "Electric" and spec.displacement_l is not None:
            warnings.append("Electric motor should not have displacement")
        if spec.configuration not in ["I", "V", "H", "W", "Electric", "Unknown"]:
            warnings.append(f"Unexpected configuration: {spec.configuration}")
        # Check fuel type consistency
        if spec.fuel_type == "Electric" and spec.configuration != "Electric":
            warnings.append("Electric fuel type should have Electric configuration")
        return warnings
# Example usage and testing functions
def example_usage():
    """Demonstrate EngineSpecParser usage"""
    print("🔧 EngineSpecParser Example Usage")
    print("=" * 40)

    parser = EngineSpecParser()

    # Representative inputs drawn from the actual JSON vehicle data,
    # grouped by the parsing behavior they exercise.
    demo_engines = [
        # Standard engines
        "2.0L I4",
        "3.5L V6",
        # L→I normalization examples (CRITICAL)
        "1.5L L3",
        "1.2L L3 FULL HYBRID EV- (FHEV)",
        # Subaru Boxer engines
        "2.4L H4",
        # W-configuration engines (VW Group, Bentley)
        "6.0L W12",
        "4.0L W8",
        # Hybrid examples
        "2.5L I4 FULL HYBRID EV- (FHEV)",
        "1.5L L3 PLUG-IN HYBRID EV- (PHEV)",
        # Flex fuel
        "5.6L V8 FLEX",
        # Electric
        "1.8L I4 ELECTRIC",
    ]

    for engine_str in demo_engines:
        spec = parser.parse_engine_string(engine_str)
        print(f"\nInput: \"{engine_str}\"")
        print(f"{spec.displacement_l}L {spec.configuration}{spec.cylinders}")
        print(f" → Fuel: {spec.fuel_type}, Aspiration: {spec.aspiration}")
        # Call out the cases where the L→I normalization rule actually fired.
        if 'L' in engine_str and spec.configuration == 'I' and 'ELECTRIC' not in engine_str.upper():
            print(f" 🎯 L→I NORMALIZED")

    # Empty engines arrays (Tesla, Lucid, ...) get a synthetic motor spec.
    print(f"\n⚡ Electric Vehicle Handling:")
    electric_spec = parser.create_electric_motor()
    print(f" Default: {electric_spec.raw_string}")
    print(f" → Config: {electric_spec.configuration}, Fuel: {electric_spec.fuel_type}")


if __name__ == "__main__":
    example_usage()

View File

@@ -0,0 +1,28 @@
import logging
import sys
from pathlib import Path
from datetime import datetime
def setup_logging(log_level: str = "INFO"):
    """Setup logging configuration"""
    # Make sure the log directory exists before attaching the file handler.
    log_dir = Path("logs")
    log_dir.mkdir(exist_ok=True)

    # One daily log file, named by date, plus console output on stdout.
    stamp = datetime.now().strftime('%Y%m%d')
    handlers = [
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(log_dir / f"etl_{stamp}.log"),
    ]
    logging.basicConfig(
        level=getattr(logging, log_level.upper()),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=handlers,
    )

    # Quiet down chatty third-party database/cache client loggers.
    for noisy in ("pymssql", "psycopg2", "redis"):
        logging.getLogger(noisy).setLevel(logging.WARNING)

View File

@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""
Make filtering service for ETL pipeline optimization.
Filters processing to only allowed mainstream makes/brands.
"""
import json
import logging
from typing import List, Set
from pathlib import Path
logger = logging.getLogger(__name__)
class MakeFilter:
    """Service to filter ETL processing by allowed makes/brands"""

    def __init__(self, config_path: str = '/app/etl/sources/makes.json'):
        """
        Initialize make filter
        Args:
            config_path: Path to makes.json configuration file
        """
        self.config_path = Path(config_path)
        self.allowed_makes: List[str] = []        # ordered list from config
        self.allowed_makes_set: Set[str] = set()  # O(1) membership checks
        self.allowed_makes_sql: str = ""          # precomputed SQL IN clause
        self._load_makes()

    def _load_makes(self):
        """Load allowed makes from JSON configuration.

        Tries several well-known locations so the same code works inside the
        container and in local development. Raises on total failure so the
        pipeline does not silently run unfiltered.
        """
        try:
            # Try multiple paths for container and local development
            config_paths = [
                self.config_path,
                Path('/app/etl/sources/makes.json'),  # Primary container path - administrator controlled
                Path(__file__).parent.parent / 'sources' / 'makes.json',  # Local development
                Path(__file__).parent.parent.parent / 'makes.json'  # Fallback to main makes.json if needed
            ]
            config_data = None
            used_path = None
            for path in config_paths:
                if path.exists():
                    with open(path, 'r') as f:
                        config_data = json.load(f)
                    used_path = path
                    break
            # FIX: compare against None — a valid-but-falsy JSON document
            # (e.g. {}) previously looked like "no file found".
            if config_data is None:
                raise FileNotFoundError(f"Could not find makes.json in any of: {config_paths}")
            self.allowed_makes = config_data.get('manufacturers', [])
            self.allowed_makes_set = set(self.allowed_makes)
            self.allowed_makes_sql = self._build_sql_in_clause()
            logger.info(f"Loaded {len(self.allowed_makes)} allowed makes from {used_path}")
            logger.debug(f"Allowed makes: {', '.join(sorted(self.allowed_makes[:10]))}{'...' if len(self.allowed_makes) > 10 else ''}")
        except Exception as e:
            logger.error(f"Failed to load make configuration: {e}")
            raise

    def _build_sql_in_clause(self) -> str:
        """Build the parenthesized SQL IN list for make filtering."""
        # FIX: an empty list previously produced "()", which is a SQL syntax
        # error in most dialects. "(NULL)" is valid SQL and matches no rows.
        if not self.allowed_makes:
            return "(NULL)"
        # Escape single quotes and build IN clause
        escaped_names = [name.replace("'", "''") for name in self.allowed_makes]
        return "(" + ",".join(f"'{name}'" for name in escaped_names) + ")"

    def is_make_allowed(self, make_name: str) -> bool:
        """
        Check if a make is in the allowed list
        Args:
            make_name: Name to check (exact, case-sensitive match)
        Returns:
            True if make is allowed, False otherwise
        """
        return make_name in self.allowed_makes_set

    def get_allowed_makes(self) -> List[str]:
        """Get list of allowed makes (a copy, so callers can't mutate state)"""
        return self.allowed_makes.copy()

    def get_sql_filter(self, column_name: str = 'Name') -> str:
        """
        Get SQL WHERE clause for make filtering
        Args:
            column_name: Name of the make column
        Returns:
            SQL WHERE clause fragment
        """
        return f"{column_name} IN {self.allowed_makes_sql}"

    def reload_configuration(self):
        """Reload makes configuration from file"""
        logger.info("Reloading make configuration...")
        self._load_makes()

    def get_filter_stats(self) -> dict:
        """Get filtering statistics for diagnostics/monitoring"""
        return {
            'total_allowed_makes': len(self.allowed_makes),
            'config_path': str(self.config_path),
            'sql_clause_length': len(self.allowed_makes_sql)
        }

View File

@@ -0,0 +1,317 @@
"""
Make Name Mapper Utility
Converts JSON filenames to proper display names for database storage.
Handles underscore-to-space conversion, title casing, and special capitalization cases.
Critical for converting:
- alfa_romeo.json → "Alfa Romeo"
- bmw.json → "BMW"
- land_rover.json → "Land Rover"
Usage:
mapper = MakeNameMapper()
display_name = mapper.normalize_make_name('alfa_romeo.json') # Returns "Alfa Romeo"
"""
import json
import glob
import os
import logging
from typing import Set, Dict, List, Optional
from dataclasses import dataclass
from pathlib import Path
logger = logging.getLogger(__name__)
@dataclass
class ValidationReport:
    """Summary of filename-to-display-name validation results."""
    total_files: int                  # number of JSON files examined
    valid_mappings: int               # mappings found in the authoritative list
    mismatches: List[Dict[str, str]]  # details for each failed mapping

    @property
    def success_rate(self) -> float:
        """Fraction of valid mappings (0.0 when no files were examined)."""
        if self.total_files == 0:
            return 0.0
        return self.valid_mappings / self.total_files
class MakeNameMapper:
    """Convert JSON filenames to proper make display names"""

    def __init__(self, sources_dir: Optional[str] = None):
        """
        Initialize make name mapper
        Args:
            sources_dir: Directory containing sources/makes.json for validation
        """
        self.sources_dir = sources_dir or "sources"
        # Special capitalization cases that don't follow standard title case
        self.special_cases = {
            'Bmw': 'BMW',          # Bayerische Motoren Werke
            'Gmc': 'GMC',          # General Motors Company
            'Mini': 'MINI',        # Brand styling requirement
            'Mclaren': 'McLaren',  # Scottish naming convention
        }
        # Load authoritative makes list for validation
        self.authoritative_makes = self._load_authoritative_makes()
        logger.debug(f"MakeNameMapper initialized with {len(self.authoritative_makes)} authoritative makes")

    def _load_authoritative_makes(self) -> Set[str]:
        """Load authoritative makes list from sources/makes.json.

        Falls back to a hard-coded list when the file is missing or
        unreadable, so the mapper stays usable in degraded environments.
        """
        makes_file = os.path.join(self.sources_dir, 'makes.json')
        try:
            if os.path.exists(makes_file):
                with open(makes_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                makes_set = set(data.get('manufacturers', []))
                logger.info(f"Loaded {len(makes_set)} authoritative makes from {makes_file}")
                return makes_set
            else:
                logger.warning(f"Authoritative makes file not found: {makes_file}")
                return self._get_fallback_makes()
        except Exception as e:
            logger.error(f"Failed to load authoritative makes from {makes_file}: {e}")
            return self._get_fallback_makes()

    def _get_fallback_makes(self) -> Set[str]:
        """Fallback authoritative makes list if file is not available"""
        return {
            'Acura', 'Alfa Romeo', 'Aston Martin', 'Audi', 'BMW', 'Bentley',
            'Buick', 'Cadillac', 'Chevrolet', 'Chrysler', 'Dodge', 'Ferrari',
            'Fiat', 'Ford', 'Genesis', 'Geo', 'GMC', 'Honda', 'Hummer',
            'Hyundai', 'Infiniti', 'Isuzu', 'Jaguar', 'Jeep', 'Kia',
            'Lamborghini', 'Land Rover', 'Lexus', 'Lincoln', 'Lotus', 'Lucid',
            'MINI', 'Maserati', 'Mazda', 'McLaren', 'Mercury', 'Mitsubishi',
            'Nissan', 'Oldsmobile', 'Plymouth', 'Polestar', 'Pontiac',
            'Porsche', 'Ram', 'Rivian', 'Rolls Royce', 'Saab', 'Saturn',
            'Scion', 'Smart', 'Subaru', 'Tesla', 'Toyota', 'Volkswagen',
            'Volvo'
        }

    def normalize_make_name(self, filename: str) -> str:
        """
        Convert filename to proper display name
        Args:
            filename: JSON filename (e.g., 'alfa_romeo.json')
        Returns:
            Normalized display name (e.g., 'Alfa Romeo'), or 'Unknown'
            on empty input or failure
        """
        try:
            # FIX: strip '.json' only as a suffix; replace() removed the
            # substring anywhere in the name (e.g. 'my.jsonmake.json').
            if filename.endswith('.json'):
                base_name = filename[:-len('.json')]
            else:
                base_name = filename
            # Handle edge case of empty string
            if not base_name:
                # FIX: restored the '{filename}' interpolation that had been
                # corrupted to a literal '(unknown)'.
                logger.warning(f"Empty base name after removing .json from '{filename}'")
                return "Unknown"
            # Replace underscores with spaces
            spaced_name = base_name.replace('_', ' ')
            # Apply title case
            title_cased = spaced_name.title()
            # Apply special capitalization cases (BMW, GMC, MINI, McLaren)
            normalized = self.special_cases.get(title_cased, title_cased)
            logger.debug(f"Normalized '{filename}' → '{normalized}'")
            return normalized
        except Exception as e:
            logger.error(f"Failed to normalize make name '{filename}': {e}")
            return "Unknown"

    def validate_mapping(self, filename: str, display_name: str) -> bool:
        """
        Validate mapped name against authoritative list
        Args:
            filename: Original JSON filename
            display_name: Normalized display name
        Returns:
            True if display name is in authoritative list
        """
        is_valid = display_name in self.authoritative_makes
        if not is_valid:
            logger.warning(f"Make '{display_name}' from '{filename}' not found in authoritative list")
        return is_valid

    def get_all_mappings(self, json_files_dir: str) -> Dict[str, str]:
        """
        Get complete filename → display name mapping for all JSON files
        Args:
            json_files_dir: Directory containing make JSON files
        Returns:
            Dictionary mapping filenames to display names (empty on failure)
        """
        mappings = {}
        try:
            pattern = os.path.join(json_files_dir, '*.json')
            json_files = glob.glob(pattern)
            logger.info(f"Found {len(json_files)} JSON files in {json_files_dir}")
            for file_path in json_files:
                filename = os.path.basename(file_path)
                mappings[filename] = self.normalize_make_name(filename)
            return mappings
        except Exception as e:
            logger.error(f"Failed to get all mappings from {json_files_dir}: {e}")
            return {}

    def validate_all_mappings(self, json_files_dir: str) -> ValidationReport:
        """
        Validate all mappings against authoritative list
        Args:
            json_files_dir: Directory containing make JSON files
        Returns:
            ValidationReport with results
        """
        mappings = self.get_all_mappings(json_files_dir)
        mismatches = []
        for filename, display_name in mappings.items():
            if not self.validate_mapping(filename, display_name):
                mismatches.append({
                    'filename': filename,
                    'mapped_name': display_name,
                    'status': 'NOT_FOUND_IN_AUTHORITATIVE'
                })
        report = ValidationReport(
            total_files=len(mappings),
            valid_mappings=len(mappings) - len(mismatches),
            mismatches=mismatches
        )
        logger.info(f"Validation complete: {report.valid_mappings}/{report.total_files} valid ({report.success_rate:.1%})")
        return report

    def get_filename_for_display_name(self, display_name: str) -> Optional[str]:
        """
        Reverse lookup: get JSON filename for a display name
        Args:
            display_name: Make display name (e.g., 'Alfa Romeo')
        Returns:
            JSON filename (e.g., 'alfa_romeo.json')
        """
        # Convert display name back to filename format.
        # Handle special cases in reverse (BMW → Bmw, etc.)
        reverse_special_cases = {v: k for k, v in self.special_cases.items()}
        if display_name in reverse_special_cases:
            base_name = reverse_special_cases[display_name].lower()
        else:
            # Standard case: convert to lowercase, spaces to underscores
            base_name = display_name.lower().replace(' ', '_')
        filename = f"{base_name}.json"
        logger.debug(f"Reverse lookup: '{display_name}' → '{filename}'")
        return filename

    def print_validation_report(self, report: ValidationReport) -> None:
        """
        Print formatted validation report
        Args:
            report: ValidationReport to display
        """
        print(f"📋 Make Name Validation Report")
        print(f"=" * 35)
        print(f"Total files: {report.total_files}")
        print(f"Valid mappings: {report.valid_mappings}")
        print(f"Success rate: {report.success_rate:.1%}")
        if report.mismatches:
            print(f"\n⚠️ Mismatches ({len(report.mismatches)}):")
            for mismatch in report.mismatches:
                # FIX: restored the ' → ' separator missing between the
                # filename and its mapped name.
                print(f" {mismatch['filename']} → {mismatch['mapped_name']}")
                print(f" Status: {mismatch['status']}")
        else:
            print(f"\n🎉 All mappings are valid!")

    def get_make_statistics(self, json_files_dir: str) -> Dict[str, int]:
        """
        Get statistics about make name transformations
        Args:
            json_files_dir: Directory containing make JSON files
        Returns:
            Dictionary with transformation statistics
        """
        mappings = self.get_all_mappings(json_files_dir)
        single_words = 0
        multi_words = 0
        special_cases = 0
        for filename, display_name in mappings.items():
            # Special cases are counted first so e.g. 'McLaren' isn't
            # double-counted as a single word.
            if display_name in self.special_cases.values():
                special_cases += 1
            elif ' ' in display_name:
                multi_words += 1
            else:
                single_words += 1
        return {
            'total': len(mappings),
            'single_words': single_words,
            'multi_words': multi_words,
            'special_cases': special_cases
        }
# Example usage and testing functions
def example_usage():
    """Demonstrate MakeNameMapper usage"""
    print("🏷️ MakeNameMapper Example Usage")
    print("=" * 35)

    mapper = MakeNameMapper()

    # Filenames covering plain, underscored, and special-case makes.
    sample_files = [
        'toyota.json',
        'alfa_romeo.json',
        'bmw.json',
        'land_rover.json',
        'mclaren.json'
    ]

    for filename in sample_files:
        display_name = mapper.normalize_make_name(filename)
        # Flag any mapping that is missing from the authoritative list.
        is_valid = mapper.validate_mapping(filename, display_name)
        status = "" if is_valid else "⚠️"
        print(f"{status} {filename:20}{display_name}")


if __name__ == "__main__":
    example_usage()