Files
motovaultpro/mvp-platform-services/vehicles/etl/main.py
Eric Gullickson a052040e3a Initial Commit
2025-09-17 16:09:15 -05:00

348 lines
14 KiB
Python
Executable File

#!/usr/bin/env python3
import logging
import sys
import os
from datetime import datetime
from pathlib import Path
import click
from .config import config
from .utils.logging import setup_logging
from .scheduler import start_etl_scheduler
from .pipeline import run_etl_pipeline
from .connections import test_connections
# Import manual JSON processing components.
# These live in optional submodules; the rest of the CLI must keep working
# even when they are absent, so failures are downgraded to a warning.
try:
    from .pipelines.manual_json_pipeline import ManualJsonPipeline, PipelineConfig, default_progress_callback
    from .loaders.json_manual_loader import LoadMode
    from .utils.make_name_mapper import MakeNameMapper
    from .utils.engine_spec_parser import EngineSpecParser
    from .extractors.json_extractor import JsonExtractor
except ImportError as e:
    # Handle import errors gracefully for existing functionality.
    # Set EVERY optional name to None — previously only ManualJsonPipeline
    # was cleared, so `if JsonExtractor is None` in validate_json raised
    # NameError instead of printing the friendly "not available" message.
    ManualJsonPipeline = None
    PipelineConfig = None
    default_progress_callback = None
    LoadMode = None
    MakeNameMapper = None
    EngineSpecParser = None
    JsonExtractor = None
    logger = logging.getLogger(__name__)
    logger.warning(f"Manual JSON processing components not available: {e}")

logger = logging.getLogger(__name__)
@click.group()
def cli():
    """MVP Platform Vehicles ETL Tool"""
    # Group-level setup: runs once before any subcommand, configuring
    # logging from the module-level `config` (imported from .config).
    setup_logging(config.LOG_LEVEL)
@cli.command()
def build_catalog():
    """Build vehicle catalog from source database"""
    # Propagate failure as a non-zero exit status so callers
    # (cron jobs, CI, shell scripts) can detect a failed build.
    if not run_etl_pipeline():
        sys.exit(1)
@cli.command()
def schedule():
    """Start ETL scheduler (default mode)"""
    # Hands control to the scheduler loop; this is also what runs when the
    # module is invoked with no arguments (see the __main__ guard below).
    start_etl_scheduler()
@cli.command()
@click.option('--full', is_flag=True, help='Full reload instead of incremental')
def update(full):
    """Run ETL update"""
    # NOTE(review): `full` is logged but never forwarded to
    # run_etl_pipeline(), so --full currently behaves the same as an
    # incremental run — confirm whether the flag should be passed through.
    logger.info(f"Starting ETL update (full={full})")
    success = run_etl_pipeline()
    if not success:
        sys.exit(1)
@cli.command()
def test():
    """Test database connections"""
    # Probe every configured database connection; exit non-zero on failure
    # so scripts can use this command as a health check.
    if test_connections():
        logger.info("All connections tested successfully")
    else:
        logger.error("Connection test failed")
        sys.exit(1)
@cli.command()
@click.option('--sources-dir', '-s', default='sources/makes',
              help='Directory containing JSON make files (default: sources/makes)')
@click.option('--mode', '-m', type=click.Choice(['clear', 'append']), default='append',
              help='Loading mode: clear (destructive) or append (safe, default)')
@click.option('--progress/--no-progress', default=True,
              help='Show progress tracking (default: enabled)')
@click.option('--validate/--no-validate', default=True,
              help='Validate referential integrity after loading (default: enabled)')
@click.option('--batch-size', '-b', type=int, default=1000,
              help='Database batch size for inserts (default: 1000)')
@click.option('--dry-run', is_flag=True,
              help='Extract and validate data without loading to database')
@click.option('--verbose', '-v', is_flag=True,
              help='Enable verbose output')
def load_manual(sources_dir, mode, progress, validate, batch_size, dry_run, verbose):
    """Load vehicle data from JSON files manually

    This command processes JSON files in the specified directory and loads
    vehicle data into the PostgreSQL database. It supports two modes:

    • APPEND mode (default): Safely add new data with duplicate detection
    • CLEAR mode: Remove all existing data and reload (destructive)

    Examples:
        python -m etl load-manual
        python -m etl load-manual --mode clear --sources-dir custom/path
        python -m etl load-manual --dry-run --verbose
    """
    # Bail out early if the optional pipeline components failed to import.
    if ManualJsonPipeline is None:
        click.echo("❌ Manual JSON processing components are not available", err=True)
        click.echo(" Please check your installation and dependencies", err=True)
        sys.exit(1)

    # Validate sources directory
    sources_path = Path(sources_dir)
    if not sources_path.exists():
        click.echo(f"❌ Sources directory not found: {sources_dir}", err=True)
        click.echo(" Please specify a valid directory with --sources-dir", err=True)
        sys.exit(1)

    # Count JSON files — refuse to run against an empty directory.
    json_files = list(sources_path.glob("*.json"))
    if not json_files:
        click.echo(f"❌ No JSON files found in: {sources_dir}", err=True)
        click.echo(" Please ensure the directory contains *.json files", err=True)
        sys.exit(1)

    # Set log level if verbose
    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Create configuration. Named `pipeline_config` (not `config`) so it does
    # not shadow the module-level `config` imported from .config.
    load_mode_enum = LoadMode.CLEAR if mode == 'clear' else LoadMode.APPEND
    pipeline_config = PipelineConfig(
        sources_directory=str(sources_path),
        load_mode=load_mode_enum,
        enable_progress_tracking=progress,
        validate_integrity=validate,
        batch_size=batch_size
    )

    click.echo(f"🚀 Manual JSON Processing Pipeline")
    click.echo(f" Sources: {sources_dir}")
    click.echo(f" Files: {len(json_files)} JSON files")
    click.echo(f" Mode: {mode.upper()}")
    if dry_run:
        click.echo(f" Dry run: Validation only (no database changes)")

    try:
        # Create pipeline
        pipeline = ManualJsonPipeline(str(sources_path), pipeline_config)

        # Progress callback for CLI: prints percentage/phase, adding rate
        # and ETA once a files-per-second estimate is available.
        def cli_progress_callback(progress_info):
            if progress:
                percentage = progress_info['percentage']
                phase = progress_info['phase']
                files = f"{progress_info['files_completed']}/{progress_info['total_files']}"
                if progress_info['files_per_second'] > 0:
                    rate = f"({progress_info['files_per_second']:.1f} files/sec)"
                    eta_min = progress_info['eta_seconds'] / 60
                    eta = f"ETA: {eta_min:.1f}min" if eta_min > 0 else ""
                    click.echo(f"[{percentage:5.1f}%] {phase}: {files} {rate} {eta}")
                else:
                    click.echo(f"[{percentage:5.1f}%] {phase}: {files}")

        if dry_run:
            # Extraction only for validation — no database writes happen.
            click.echo("\n📋 Running extraction validation...")
            extraction_result = pipeline.run_extraction_only()

            # Report extraction results
            click.echo(f"\n✅ Extraction Validation Complete")
            click.echo(f" Files processed: {extraction_result.total_files_processed}")
            click.echo(f" Success rate: {extraction_result.success_rate:.1%}")
            click.echo(f" Models extracted: {extraction_result.total_models:,}")
            click.echo(f" Engines extracted: {extraction_result.total_engines:,}")
            click.echo(f" Electric models: {extraction_result.total_electric_models:,}")
            if extraction_result.failed_extractions > 0:
                click.echo(f" ⚠️ Failed extractions: {extraction_result.failed_extractions}")
                sys.exit(1)
        else:
            # Full pipeline execution. CLEAR mode is destructive, so require
            # an explicit interactive confirmation before proceeding.
            if mode == 'clear':
                click.echo("\n⚠️ WARNING: CLEAR mode will delete all existing vehicle data!")
                if not click.confirm("Are you sure you want to continue?", default=False):
                    click.echo("Operation cancelled")
                    return

            click.echo(f"\n🔄 Running pipeline...")
            result = pipeline.run(progress_callback=cli_progress_callback)

            # Print comprehensive report
            click.echo(f"\n" + "="*60)
            click.echo(f"📊 PIPELINE EXECUTION REPORT")
            click.echo(f"="*60)

            # Performance
            click.echo(f"\n⏱️ PERFORMANCE")
            click.echo(f" Duration: {result.duration_seconds:.1f} seconds ({result.duration_minutes:.1f} minutes)")
            click.echo(f" Processing rate: {result.files_per_second:.1f} files/sec")
            click.echo(f" Loading rate: {result.records_per_second:,.0f} records/sec")

            # Success rates
            click.echo(f"\n📈 SUCCESS RATES")
            click.echo(f" Extraction: {result.extraction_success_rate:.1%}")
            click.echo(f" Loading: {result.loading_success_rate:.1%}")
            click.echo(f" Overall: {result.overall_success_rate:.1%}")

            # Data loaded
            click.echo(f"\n💾 DATA LOADED")
            click.echo(f" Makes: {result.load_result.total_makes}")
            click.echo(f" Models: {result.load_result.total_models}")
            click.echo(f" Engines: {result.load_result.total_engines}")
            click.echo(f" Trims: {result.load_result.total_trims}")
            click.echo(f" Total records: {result.total_records_loaded:,}")

            # Issues
            if result.load_result.failed_makes:
                click.echo(f"\n⚠️ FAILED MAKES ({len(result.load_result.failed_makes)}):")
                for make in result.load_result.failed_makes:
                    click.echo(f"{make}")
            if result.integrity_issues:
                click.echo(f"\n❌ INTEGRITY ISSUES ({len(result.integrity_issues)}):")
                for issue in result.integrity_issues:
                    click.echo(f"{issue}")
            else:
                click.echo(f"\n✅ REFERENTIAL INTEGRITY: PASSED")

            # Final status
            if result.was_successful:
                click.echo(f"\n🎉 PIPELINE COMPLETED SUCCESSFULLY")
                if verbose:
                    # Show database statistics
                    db_stats = pipeline.loader.get_database_statistics()
                    click.echo(f"\n📋 DATABASE STATISTICS:")
                    for table, count in db_stats.items():
                        click.echo(f" {table}: {count:,} records")
            else:
                click.echo(f"\n⚠️ PIPELINE COMPLETED WITH ISSUES")
                sys.exit(1)
    except KeyboardInterrupt:
        click.echo(f"\n⏸️ Pipeline interrupted by user")
        sys.exit(1)
    except Exception as e:
        click.echo(f"\n❌ Pipeline failed: {str(e)}", err=True)
        if verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
@cli.command()
@click.option('--sources-dir', '-s', default='sources/makes',
              help='Directory containing JSON make files (default: sources/makes)')
@click.option('--verbose', '-v', is_flag=True,
              help='Enable verbose output with detailed statistics')
def validate_json(sources_dir, verbose):
    """Validate JSON files and show extraction statistics

    This command validates the structure and content of JSON files
    without loading data into the database. Useful for:

    • Checking data quality before loading
    • Debugging extraction issues
    • Getting statistics about available data

    Examples:
        python -m etl validate-json
        python -m etl validate-json --sources-dir custom/path --verbose
    """
    # Bail out early if the optional extraction components failed to import.
    if JsonExtractor is None:
        click.echo("❌ JSON validation components are not available", err=True)
        sys.exit(1)

    # Validate sources directory
    sources_path = Path(sources_dir)
    if not sources_path.exists():
        click.echo(f"❌ Sources directory not found: {sources_dir}", err=True)
        sys.exit(1)

    # Count JSON files — nothing to validate in an empty directory.
    json_files = list(sources_path.glob("*.json"))
    if not json_files:
        click.echo(f"❌ No JSON files found in: {sources_dir}", err=True)
        sys.exit(1)

    click.echo(f"🔍 JSON File Validation")
    click.echo(f" Directory: {sources_dir}")
    click.echo(f" Files: {len(json_files)} JSON files")

    try:
        # Initialize components
        make_mapper = MakeNameMapper()
        engine_parser = EngineSpecParser()
        extractor = JsonExtractor(make_mapper, engine_parser)

        # Run extraction validation
        click.echo(f"\n📋 Validating JSON structure and content...")
        result = extractor.extract_all_makes(str(sources_path))

        # Basic results
        click.echo(f"\n✅ Validation Complete")
        click.echo(f" Files processed: {result.total_files_processed}")
        click.echo(f" Success rate: {result.success_rate:.1%}")
        click.echo(f" Models found: {result.total_models:,}")
        click.echo(f" Engines found: {result.total_engines:,}")
        click.echo(f" Electric models: {result.total_electric_models:,}")
        if result.failed_extractions > 0:
            click.echo(f" ⚠️ Failed extractions: {result.failed_extractions}")

        # Detailed statistics only when --verbose is set and extraction
        # produced at least one make.
        if verbose and result.makes:
            # Show top makes by model count
            click.echo(f"\n🏆 Top Makes by Model Count:")
            top_makes = sorted(result.makes, key=lambda m: m.total_models, reverse=True)[:10]
            for i, make in enumerate(top_makes, 1):
                click.echo(f" {i:2d}. {make.name}: {make.total_models} models, {make.total_engines} engines")

            # Show makes with issues (capped at five to keep output readable)
            error_makes = [make for make in result.makes if make.processing_errors]
            if error_makes:
                click.echo(f"\n⚠️ Makes with Processing Errors ({len(error_makes)}):")
                for make in error_makes[:5]:
                    click.echo(f"{make.name}: {len(make.processing_errors)} errors")
                if len(error_makes) > 5:
                    click.echo(f" ... and {len(error_makes) - 5} more")

            # Show data quality insights
            click.echo(f"\n📊 Data Quality Insights:")
            # Engine configuration distribution. Loop variable renamed to
            # `cfg_name` so it does not shadow the module-level `config`
            # imported from .config.
            config_counts = {}
            for make in result.makes:
                for model in make.models:
                    for engine in model.engines:
                        config_counts[engine.configuration] = config_counts.get(engine.configuration, 0) + 1
            if config_counts:
                click.echo(f" Engine configurations:")
                for cfg_name, count in sorted(config_counts.items(), key=lambda x: x[1], reverse=True):
                    percentage = count / result.total_engines * 100
                    click.echo(f" {cfg_name}: {count:,} ({percentage:.1f}%)")

        # Non-zero exit so scripts can treat any failed extraction as an error.
        if result.failed_extractions > 0:
            sys.exit(1)
    except Exception as e:
        click.echo(f"❌ Validation failed: {str(e)}", err=True)
        if verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
if __name__ == "__main__":
    # Default to scheduler mode if no command provided
    if len(sys.argv) == 1:
        # NOTE(review): this path bypasses cli(), so setup_logging() never
        # runs before the scheduler starts — confirm that is intended.
        start_etl_scheduler()
    else:
        cli()