Initial Commit
This commit is contained in:
317
mvp-platform-services/vehicles/etl/utils/make_name_mapper.py
Normal file
317
mvp-platform-services/vehicles/etl/utils/make_name_mapper.py
Normal file
@@ -0,0 +1,317 @@
|
||||
"""
|
||||
Make Name Mapper Utility
|
||||
|
||||
Converts JSON filenames to proper display names for database storage.
|
||||
Handles underscore-to-space conversion, title casing, and special capitalization cases.
|
||||
|
||||
Critical for converting:
|
||||
- alfa_romeo.json → "Alfa Romeo"
|
||||
- bmw.json → "BMW"
|
||||
- land_rover.json → "Land Rover"
|
||||
|
||||
Usage:
|
||||
mapper = MakeNameMapper()
|
||||
display_name = mapper.normalize_make_name('alfa_romeo.json') # Returns "Alfa Romeo"
|
||||
"""
|
||||
|
||||
import json
|
||||
import glob
|
||||
import os
|
||||
import logging
|
||||
from typing import Set, Dict, List, Optional
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ValidationReport:
    """Result of checking mapped make names against the authoritative list.

    Attributes:
        total_files: Number of filename → display-name mappings checked.
        valid_mappings: Number of mappings whose display name was found in
            the authoritative list.
        mismatches: One dict per failed mapping. MakeNameMapper fills each
            with the keys 'filename', 'mapped_name' and 'status'.
    """

    total_files: int
    valid_mappings: int
    mismatches: List[Dict[str, str]]

    @property
    def success_rate(self) -> float:
        """Return the share of valid mappings as a ratio, not a percentage.

        The value is ``valid_mappings / total_files`` (normally between 0.0
        and 1.0), which is the form the ``:.1%`` format spec expects.
        Returns 0.0 when ``total_files`` is zero or negative so that an
        empty report never divides by zero.
        """
        if self.total_files <= 0:
            return 0.0
        return self.valid_mappings / self.total_files
|
||||
|
||||
|
||||
class MakeNameMapper:
    """Convert make JSON filenames to display names and validate them.

    A filename such as ``alfa_romeo.json`` is turned into ``Alfa Romeo`` by
    dropping the ``.json`` extension, replacing underscores with spaces and
    title-casing the result. Names whose correct spelling is not plain title
    case (``BMW``, ``McLaren``, ...) are fixed up through ``special_cases``.

    Attributes:
        sources_dir: Directory expected to contain ``makes.json``.
        special_cases: Maps a title-cased name to its correct display form.
        authoritative_makes: Set of display names accepted by validation.
    """

    # Extension carried by make data files; only a trailing occurrence is removed.
    _JSON_SUFFIX = '.json'

    def __init__(self, sources_dir: Optional[str] = None):
        """
        Initialize make name mapper

        Args:
            sources_dir: Directory containing makes.json for validation.
                Defaults to "sources" when omitted or empty.
        """
        self.sources_dir = sources_dir or "sources"

        # Special capitalization cases that don't follow standard title case
        self.special_cases = {
            'Bmw': 'BMW',          # Bayerische Motoren Werke
            'Gmc': 'GMC',          # General Motors Company
            'Mini': 'MINI',        # Brand styling requirement
            'Mclaren': 'McLaren',  # Scottish naming convention
        }

        # Load authoritative makes list for validation
        self.authoritative_makes = self._load_authoritative_makes()

        logger.debug(
            "MakeNameMapper initialized with %d authoritative makes",
            len(self.authoritative_makes),
        )

    def _load_authoritative_makes(self) -> Set[str]:
        """Load the authoritative makes from ``<sources_dir>/makes.json``.

        The file is expected to be a JSON object whose 'manufacturers' key
        holds a list of display names. If the file is missing, unreadable or
        has an unexpected shape, the built-in fallback list is returned
        instead, so construction never fails on a bad data file.

        Returns:
            Set of authoritative make display names.
        """
        makes_file = os.path.join(self.sources_dir, 'makes.json')

        # EAFP: open directly instead of exists()+open(), which is race-prone.
        try:
            with open(makes_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            makes_set = set(data.get('manufacturers', []))
        except (FileNotFoundError, NotADirectoryError):
            logger.warning("Authoritative makes file not found: %s", makes_file)
            return self._get_fallback_makes()
        except (OSError, ValueError, TypeError, AttributeError) as e:
            # OSError: unreadable file; ValueError: invalid JSON or encoding;
            # TypeError/AttributeError: JSON root or 'manufacturers' has the wrong shape.
            logger.error("Failed to load authoritative makes from %s: %s", makes_file, e)
            return self._get_fallback_makes()

        logger.info("Loaded %d authoritative makes from %s", len(makes_set), makes_file)
        return makes_set

    def _get_fallback_makes(self) -> Set[str]:
        """Fallback authoritative makes list if file is not available"""
        return {
            'Acura', 'Alfa Romeo', 'Aston Martin', 'Audi', 'BMW', 'Bentley',
            'Buick', 'Cadillac', 'Chevrolet', 'Chrysler', 'Dodge', 'Ferrari',
            'Fiat', 'Ford', 'Genesis', 'Geo', 'GMC', 'Honda', 'Hummer',
            'Hyundai', 'Infiniti', 'Isuzu', 'Jaguar', 'Jeep', 'Kia',
            'Lamborghini', 'Land Rover', 'Lexus', 'Lincoln', 'Lotus', 'Lucid',
            'MINI', 'Maserati', 'Mazda', 'McLaren', 'Mercury', 'Mitsubishi',
            'Nissan', 'Oldsmobile', 'Plymouth', 'Polestar', 'Pontiac',
            'Porsche', 'Ram', 'Rivian', 'Rolls Royce', 'Saab', 'Saturn',
            'Scion', 'Smart', 'Subaru', 'Tesla', 'Toyota', 'Volkswagen',
            'Volvo',
        }

    def normalize_make_name(self, filename: str) -> str:
        """
        Convert filename to proper display name

        Only a trailing ``.json`` extension (any letter case) is removed; the
        same text elsewhere in the name is left alone. Underscores become
        spaces, surrounding and repeated whitespace is collapsed, the result
        is title-cased and finally passed through ``special_cases``.

        Args:
            filename: JSON filename (e.g., 'alfa_romeo.json')

        Returns:
            Normalized display name (e.g., 'Alfa Romeo'), or "Unknown" when
            the input is not a string or contains no name.
        """
        if not isinstance(filename, str):
            logger.error(
                "Failed to normalize make name '%s': expected str, got %s",
                filename, type(filename).__name__,
            )
            return "Unknown"

        suffix = self._JSON_SUFFIX
        # Compare only the tail so an embedded ".json" is never stripped.
        if filename[-len(suffix):].lower() == suffix:
            base_name = filename[:-len(suffix)]
        else:
            base_name = filename

        # Underscores → spaces; split/join also trims and collapses whitespace.
        spaced_name = ' '.join(base_name.replace('_', ' ').split())
        if not spaced_name:
            logger.warning("Empty base name after removing .json from '%s'", filename)
            return "Unknown"

        title_cased = spaced_name.title()

        # Apply special capitalization cases
        normalized = self.special_cases.get(title_cased, title_cased)

        logger.debug("Normalized '%s' → '%s'", filename, normalized)
        return normalized

    def validate_mapping(self, filename: str, display_name: str) -> bool:
        """
        Validate mapped name against authoritative list

        Args:
            filename: Original JSON filename (used only in the log message)
            display_name: Normalized display name

        Returns:
            True if display name is in authoritative list
        """
        is_valid = display_name in self.authoritative_makes

        if not is_valid:
            logger.warning(
                "Make '%s' from '%s' not found in authoritative list",
                display_name, filename,
            )

        return is_valid

    def get_all_mappings(self, json_files_dir: str) -> Dict[str, str]:
        """
        Get complete filename → display name mapping for all JSON files

        Args:
            json_files_dir: Directory containing make JSON files

        Returns:
            Dictionary mapping filenames to display names, in sorted filename
            order. Empty if the directory cannot be searched.
        """
        try:
            # Escape the directory so characters like '[' are matched literally.
            pattern = os.path.join(glob.escape(json_files_dir), '*.json')
            # glob order is arbitrary; sort for a deterministic result.
            json_files = sorted(glob.glob(pattern))
        except (OSError, TypeError, ValueError) as e:
            logger.error("Failed to get all mappings from %s: %s", json_files_dir, e)
            return {}

        logger.info("Found %d JSON files in %s", len(json_files), json_files_dir)

        return {
            filename: self.normalize_make_name(filename)
            for filename in map(os.path.basename, json_files)
        }

    def validate_all_mappings(self, json_files_dir: str) -> "ValidationReport":
        """
        Validate all mappings against authoritative list

        Args:
            json_files_dir: Directory containing make JSON files

        Returns:
            ValidationReport with results
        """
        mappings = self.get_all_mappings(json_files_dir)

        mismatches = [
            {
                'filename': filename,
                'mapped_name': display_name,
                'status': 'NOT_FOUND_IN_AUTHORITATIVE',
            }
            for filename, display_name in mappings.items()
            if not self.validate_mapping(filename, display_name)
        ]

        report = ValidationReport(
            total_files=len(mappings),
            valid_mappings=len(mappings) - len(mismatches),
            mismatches=mismatches,
        )

        logger.info(
            "Validation complete: %d/%d valid (%s)",
            report.valid_mappings, report.total_files,
            format(report.success_rate, '.1%'),
        )

        return report

    def get_filename_for_display_name(self, display_name: str) -> Optional[str]:
        """
        Reverse lookup: get JSON filename for a display name

        The filename is derived purely by naming convention (lowercase,
        spaces to underscores, '.json' appended); this method does not check
        that such a file exists.

        Args:
            display_name: Make display name (e.g., 'Alfa Romeo')

        Returns:
            JSON filename (e.g., 'alfa_romeo.json'), or None when
            display_name is empty or blank.
        """
        if not display_name or not display_name.strip():
            logger.debug("Reverse lookup: empty display name, no filename")
            return None

        # Undo special capitalization first (BMW → Bmw, etc.) ...
        reverse_special_cases = {v: k for k, v in self.special_cases.items()}
        canonical = reverse_special_cases.get(display_name, display_name)

        # ... then apply the same lowercase/underscore rule to every name, so
        # multi-word special cases also get underscores.
        base_name = canonical.lower().replace(' ', '_')
        filename = f"{base_name}.json"

        logger.debug("Reverse lookup: '%s' → '%s'", display_name, filename)
        return filename

    def print_validation_report(self, report: "ValidationReport") -> None:
        """
        Print formatted validation report

        Args:
            report: ValidationReport to display
        """
        print("📋 Make Name Validation Report")
        print("=" * 35)
        print(f"Total files: {report.total_files}")
        print(f"Valid mappings: {report.valid_mappings}")
        print(f"Success rate: {report.success_rate:.1%}")

        if report.mismatches:
            print(f"\n⚠️ Mismatches ({len(report.mismatches)}):")
            for mismatch in report.mismatches:
                print(f" {mismatch['filename']} → {mismatch['mapped_name']}")
                print(f" Status: {mismatch['status']}")
        else:
            print("\n🎉 All mappings are valid!")

    def get_make_statistics(self, json_files_dir: str) -> Dict[str, int]:
        """
        Get statistics about make name transformations

        Each display name is counted in exactly one bucket: a special
        capitalization case, otherwise multi-word if it contains a space,
        otherwise single-word.

        Args:
            json_files_dir: Directory containing make JSON files

        Returns:
            Dictionary with the keys 'total', 'single_words', 'multi_words'
            and 'special_cases'.
        """
        mappings = self.get_all_mappings(json_files_dir)

        # Build the lookup set once instead of scanning .values() per name.
        special_names = set(self.special_cases.values())

        single_words = 0
        multi_words = 0
        special_cases = 0

        for display_name in mappings.values():
            if display_name in special_names:
                special_cases += 1
            elif ' ' in display_name:
                multi_words += 1
            else:
                single_words += 1

        return {
            'total': len(mappings),
            'single_words': single_words,
            'multi_words': multi_words,
            'special_cases': special_cases,
        }
|
||||
|
||||
|
||||
# Example usage and testing functions
|
||||
def example_usage():
    """Print a short console demo of filename normalization and validation."""
    print("🏷️ MakeNameMapper Example Usage")
    print("=" * 35)

    name_mapper = MakeNameMapper()

    # Sample filenames covering single-word, multi-word and special-case makes
    sample_filenames = (
        'toyota.json',
        'alfa_romeo.json',
        'bmw.json',
        'land_rover.json',
        'mclaren.json',
    )

    for sample in sample_filenames:
        mapped = name_mapper.normalize_make_name(sample)
        if name_mapper.validate_mapping(sample, mapped):
            marker = "✅"
        else:
            marker = "⚠️"

        print(f"{marker} {sample:20} → {mapped}")


# Run the demo only when the module is executed as a script.
if __name__ == "__main__":
    example_usage()
|
||||
Reference in New Issue
Block a user