# -*- coding: utf-8 -*-
"""
Process Analytics Data Management Command
========================================

Django management command for processing raw analytics data.
Transforms, validates, and aggregates analytics data from various sources
into structured formats for reporting and analysis.

Command Features:
- Batch processing of raw analytics data
- Data validation and cleansing
- Error handling and recovery
- Progress tracking and reporting
- Parallel processing support
- Incremental processing
- Data quality checks
- Performance monitoring
- Notification support
- Dry-run mode for testing

Data Processing Pipeline:
1. Data Extraction: Extract raw data from sources
2. Data Validation: Validate data integrity and format
3. Data Transformation: Transform data to standard format
4. Data Enrichment: Add derived fields and calculations
5. Data Aggregation: Create summary statistics
6. Data Storage: Store processed data in database
7. Cache Update: Update analytics cache
8. Notification: Send completion notifications

Supported Data Sources:
- SFR analytics data
- Bouygues analytics data
- Impression tracking data
- VAST response data
- Performance metrics
- User behavior data
- Campaign data
- Channel data

Processing Options:
- Date range processing
- Incremental processing
- Full reprocessing
- Provider-specific processing
- Batch size configuration
- Parallel processing
- Error recovery
- Data validation levels

Usage Examples:
# Process data for specific date
python manage.py process_analytics_data --date=2024-01-01

# Process date range
python manage.py process_analytics_data --start-date=2024-01-01 --end-date=2024-01-31

# Incremental processing
python manage.py process_analytics_data --incremental

# Process specific provider data
python manage.py process_analytics_data --provider=sfr --date=2024-01-01

# Parallel processing with custom batch size
python manage.py process_analytics_data --parallel --batch-size=500

# Dry run for testing
python manage.py process_analytics_data --dry-run --verbosity=2
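
# Strict validation with completion notification
python manage.py process_analytics_data --date=2024-01-01 --validation-level=strict --notify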

Author: Adtlas Development Team
Version: 1.0.0
Last Updated: 2024
"""

import logging
import threading
import time
from datetime import datetime, timedelta, date
from typing import Dict, List, Any, Optional, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field

from django.core.management.base import BaseCommand, CommandError
from django.db import transaction, connection
from django.utils import timezone
from django.conf import settings
from django.core.cache import cache
from django.db.models import Q, Count, Sum, Avg

from apps.analytics.models import (
    SfrAnalytics, BouyguesAnalytics, Impression, VastResponse,
    PerformanceMetric, AnalyticsReport
)
from apps.analytics.constants import (
    ANALYTICS_PROVIDERS, PROCESSING_STATUS, DATA_QUALITY_LEVELS,
    CACHE_KEYS, CACHE_TIMEOUTS
)
from apps.analytics.exceptions import (
    AnalyticsDataException, ProcessingFailedException,
    DataValidationException
)
from apps.analytics.utils import (
    validate_analytics_data, calculate_data_quality_score,
    send_notification, get_processing_stats
)
from apps.analytics.tasks import (
    process_analytics_batch, update_analytics_cache,
    generate_data_quality_report
)

# Configure logging
logger = logging.getLogger(__name__)


@dataclass
class ProcessingStats:
    """
    Data class for tracking processing statistics.
    """
    total_records: int = 0
    processed_records: int = 0
    failed_records: int = 0
    skipped_records: int = 0
    processing_time: float = 0.0
    start_time: Optional[datetime] = None
    end_time: Optional[datetime] = None
    errors: List[str] = field(default_factory=list)

    @property
    def success_rate(self) -> float:
        """Calculate success rate percentage."""
        if self.total_records == 0:
            return 0.0
        return (self.processed_records / self.total_records) * 100
    
    @property
    def records_per_second(self) -> float:
        """Calculate processing rate."""
        if self.processing_time == 0:
            return 0.0
        return self.processed_records / self.processing_time


class Command(BaseCommand):
    """
    Management command for processing analytics data.
    
    Processes raw analytics data from various sources,
    validates, transforms, and stores it in structured format.
    """
    
    help = 'Process raw analytics data from various sources'
    
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.stats = ProcessingStats()
        self.dry_run = False
        self.verbose = False
        self.parallel = False
        self.batch_size = 1000
        self.max_workers = 4
        self.notification_enabled = False
        # Lock guarding shared ProcessingStats counters when --parallel is used
        self._stats_lock = threading.Lock()
    
    def add_arguments(self, parser):
        """
        Add command line arguments.
        
        Args:
            parser: Argument parser instance
        """
        # Date arguments
        parser.add_argument(
            '--date',
            type=str,
            help='Process data for specific date (YYYY-MM-DD)'
        )
        
        parser.add_argument(
            '--start-date',
            type=str,
            help='Start date for date range processing (YYYY-MM-DD)'
        )
        
        parser.add_argument(
            '--end-date',
            type=str,
            help='End date for date range processing (YYYY-MM-DD)'
        )
        
        parser.add_argument(
            '--incremental',
            action='store_true',
            help='Process only new/updated data since last run'
        )
        
        # Provider arguments
        parser.add_argument(
            '--provider',
            choices=['sfr', 'bouygues', 'all'],
            default='all',
            help='Process data for specific provider'
        )
        
        # Processing arguments
        parser.add_argument(
            '--batch-size',
            type=int,
            default=1000,
            help='Number of records to process in each batch'
        )
        
        parser.add_argument(
            '--parallel',
            action='store_true',
            help='Enable parallel processing'
        )
        
        parser.add_argument(
            '--max-workers',
            type=int,
            default=4,
            help='Maximum number of parallel workers'
        )
        
        # Control arguments
        parser.add_argument(
            '--dry-run',
            action='store_true',
            help='Run without making any changes'
        )
        
        parser.add_argument(
            '--force',
            action='store_true',
            help='Force processing even if data already exists'
        )
        
        parser.add_argument(
            '--notify',
            action='store_true',
            help='Send notification when processing completes'
        )
        
        # Validation arguments
        parser.add_argument(
            '--skip-validation',
            action='store_true',
            help='Skip data validation (faster but less safe)'
        )
        
        parser.add_argument(
            '--validation-level',
            choices=['basic', 'standard', 'strict'],
            default='standard',
            help='Data validation level'
        )
    
    def handle(self, *args, **options):
        """
        Main command handler.
        
        Args:
            *args: Positional arguments
            **options: Command options
        """
        try:
            # Initialize command
            self._initialize_command(options)
            
            # Validate arguments
            self._validate_arguments(options)
            
            # Determine date range
            start_date, end_date = self._get_date_range(options)
            
            # Log command start
            self._log_command_start(start_date, end_date, options)
            
            # Process data
            self._process_analytics_data(start_date, end_date, options)
            
            # Generate summary
            self._generate_summary()
            
            # Send notifications
            if self.notification_enabled:
                self._send_completion_notification()
            
            # Log command completion
            self._log_command_completion()
            
        except CommandError:
            # Argument and validation errors are already user-facing; re-raise as-is
            raise
        except Exception as e:
            self._handle_command_error(e)
            raise CommandError(f"Command failed: {e}") from e
    
    def _initialize_command(self, options: Dict[str, Any]) -> None:
        """
        Initialize command with options.
        
        Args:
            options: Command options dictionary
        """
        self.dry_run = options.get('dry_run', False)
        self.verbose = options.get('verbosity', 1) > 1
        self.parallel = options.get('parallel', False)
        self.batch_size = options.get('batch_size', 1000)
        self.max_workers = options.get('max_workers', 4)
        self.notification_enabled = options.get('notify', False)
        
        # Initialize stats
        self.stats = ProcessingStats()
        self.stats.start_time = timezone.now()
        
        # Configure logging level
        if self.verbose:
            logging.getLogger('apps.analytics').setLevel(logging.DEBUG)
        
        self.stdout.write(
            self.style.SUCCESS(
                f"Analytics data processing command initialized"
            )
        )
        
        if self.dry_run:
            self.stdout.write(
                self.style.WARNING("DRY RUN MODE: No changes will be made")
            )
    
    def _validate_arguments(self, options: Dict[str, Any]) -> None:
        """
        Validate command arguments.
        
        Args:
            options: Command options dictionary
            
        Raises:
            CommandError: If arguments are invalid
        """
        # Validate date arguments
        if options.get('date') and (options.get('start_date') or options.get('end_date')):
            raise CommandError("Cannot specify both --date and --start-date/--end-date")
        
        if bool(options.get('start_date')) != bool(options.get('end_date')):
            raise CommandError("--start-date and --end-date must be provided together")
        
        # Validate batch size
        if self.batch_size <= 0 or self.batch_size > 10000:
            raise CommandError("Batch size must be between 1 and 10000")
        
        # Validate max workers
        if self.max_workers <= 0 or self.max_workers > 20:
            raise CommandError("Max workers must be between 1 and 20")
        
        # Validate provider
        provider = options.get('provider', 'all')
        if provider not in ['sfr', 'bouygues', 'all']:
            raise CommandError(f"Invalid provider: {provider}")
    
    def _get_date_range(self, options: Dict[str, Any]) -> Tuple[date, date]:
        """
        Determine date range for processing.
        
        Args:
            options: Command options dictionary
            
        Returns:
            Tuple of (start_date, end_date)
        """
        if options.get('incremental'):
            # Get last processing date from cache or database
            last_processed = self._get_last_processed_date()
            start_date = last_processed + timedelta(days=1)
            end_date = timezone.now().date()
            
        elif options.get('date'):
            # Single date processing
            try:
                target_date = datetime.strptime(options['date'], '%Y-%m-%d').date()
                start_date = end_date = target_date
            except ValueError:
                raise CommandError("Invalid date format. Use YYYY-MM-DD")
            
        elif options.get('start_date') and options.get('end_date'):
            # Date range processing
            try:
                start_date = datetime.strptime(options['start_date'], '%Y-%m-%d').date()
                end_date = datetime.strptime(options['end_date'], '%Y-%m-%d').date()
            except ValueError:
                raise CommandError("Invalid date format. Use YYYY-MM-DD")
            
            if start_date > end_date:
                raise CommandError("Start date must be before end date")
            
        else:
            # Default to yesterday
            yesterday = timezone.now().date() - timedelta(days=1)
            start_date = end_date = yesterday
        
        return start_date, end_date
    
    def _get_last_processed_date(self) -> date:
        """
        Get the last processed date from cache or database.
        
        Returns:
            Last processed date
        """
        # Try cache first
        cache_key = CACHE_KEYS.LAST_PROCESSED_DATE
        last_date = cache.get(cache_key)
        
        if last_date:
            return last_date
        
        # Fallback to database: use the most recent record across providers,
        # tolerating providers that have no data yet
        latest_dates = []
        for model_class in (SfrAnalytics, BouyguesAnalytics):
            try:
                latest_dates.append(
                    model_class.objects.latest('created_at').created_at.date()
                )
            except model_class.DoesNotExist:
                continue
        
        if latest_dates:
            last_date = max(latest_dates)
            
            # Cache the result
            cache.set(cache_key, last_date, timeout=CACHE_TIMEOUTS.MEDIUM)
            
            return last_date
        
        # No data exists, start from a week ago
        return timezone.now().date() - timedelta(days=7)
    
    def _log_command_start(self, start_date: date, end_date: date, options: Dict[str, Any]) -> None:
        """
        Log command start information.
        
        Args:
            start_date: Processing start date
            end_date: Processing end date
            options: Command options
        """
        provider = options.get('provider', 'all')
        
        self.stdout.write(
            self.style.SUCCESS(
                f"Starting analytics data processing:\n"
                f"  Date range: {start_date} to {end_date}\n"
                f"  Provider: {provider}\n"
                f"  Batch size: {self.batch_size}\n"
                f"  Parallel: {self.parallel}\n"
                f"  Dry run: {self.dry_run}"
            )
        )
        
        logger.info(
            f"Analytics data processing started",
            extra={
                'start_date': start_date.isoformat(),
                'end_date': end_date.isoformat(),
                'provider': provider,
                'batch_size': self.batch_size,
                'parallel': self.parallel,
                'dry_run': self.dry_run
            }
        )
    
    def _process_analytics_data(self, start_date: date, end_date: date, options: Dict[str, Any]) -> None:
        """
        Process analytics data for the specified date range.
        
        Args:
            start_date: Processing start date
            end_date: Processing end date
            options: Command options
        """
        provider = options.get('provider', 'all')
        
        # Process each date in the range
        current_date = start_date
        while current_date <= end_date:
            try:
                self._process_date(current_date, provider, options)
                current_date += timedelta(days=1)
                
            except Exception as e:
                error_msg = f"Failed to process date {current_date}: {e}"
                self.stats.errors.append(error_msg)
                logger.error(error_msg, exc_info=True)
                
                if not options.get('force', False):
                    raise
                
                # Continue with next date if force is enabled
                current_date += timedelta(days=1)
        
        # Update last processed date
        if not self.dry_run:
            self._update_last_processed_date(end_date)
    
    def _process_date(self, target_date: date, provider: str, options: Dict[str, Any]) -> None:
        """
        Process analytics data for a specific date.
        
        Args:
            target_date: Date to process
            provider: Provider to process ('sfr', 'bouygues', or 'all')
            options: Command options
        """
        self.stdout.write(f"Processing data for {target_date}...")
        
        if provider in ['sfr', 'all']:
            self._process_provider_data('sfr', target_date, options)
        
        if provider in ['bouygues', 'all']:
            self._process_provider_data('bouygues', target_date, options)
        
        # Process impressions and VAST responses
        self._process_impression_data(target_date, options)
        self._process_vast_data(target_date, options)
        
        # Update performance metrics
        self._update_performance_metrics(target_date, options)
        
        self.stdout.write(
            self.style.SUCCESS(f"Completed processing for {target_date}")
        )
    
    def _process_provider_data(self, provider: str, target_date: date, options: Dict[str, Any]) -> None:
        """
        Process data for a specific provider.
        
        Args:
            provider: Provider name ('sfr' or 'bouygues')
            target_date: Date to process
            options: Command options
        """
        try:
            # Get model class
            model_class = SfrAnalytics if provider == 'sfr' else BouyguesAnalytics
            
            # Get raw data to process
            raw_data = self._get_raw_data(provider, target_date)
            
            if not raw_data:
                if self.verbose:
                    self.stdout.write(f"No raw data found for {provider} on {target_date}")
                return
            
            self.stats.total_records += len(raw_data)
            
            # Process in batches
            if self.parallel:
                self._process_batches_parallel(model_class, raw_data, options)
            else:
                self._process_batches_sequential(model_class, raw_data, options)
            
        except Exception as e:
            error_msg = f"Failed to process {provider} data for {target_date}: {e}"
            self.stats.errors.append(error_msg)
            logger.error(error_msg, exc_info=True)
            raise
    
    def _get_raw_data(self, provider: str, target_date: date) -> List[Dict[str, Any]]:
        """
        Get raw data for processing.
        
        Args:
            provider: Provider name
            target_date: Target date
            
        Returns:
            List of raw data records
        """
        # This would typically fetch from external APIs or data files
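        # A minimal sketch of one possible implementation, assuming a hypothetical
        # per-provider importer registry (RAW_DATA_IMPORTERS and its fetch_records()
        # helper are illustrative names, not part of this codebase):
        #
        #     importer = RAW_DATA_IMPORTERS[provider]
        #     return importer.fetch_records(target_date)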
        # For now, return empty list as placeholder
        return []
    
    def _process_batches_sequential(self, model_class, raw_data: List[Dict[str, Any]], options: Dict[str, Any]) -> None:
        """
        Process data batches sequentially.
        
        Args:
            model_class: Model class to use
            raw_data: Raw data to process
            options: Command options
        """
        batch_count = 0
        total_batches = (len(raw_data) + self.batch_size - 1) // self.batch_size
        
        for i in range(0, len(raw_data), self.batch_size):
            batch = raw_data[i:i + self.batch_size]
            batch_count += 1
            
            try:
                self._process_batch(model_class, batch, options)
                
                if self.verbose:
                    self.stdout.write(
                        f"Processed batch {batch_count}/{total_batches} "
                        f"({len(batch)} records)"
                    )
                    
            except Exception as e:
                error_msg = f"Failed to process batch {batch_count}: {e}"
                self.stats.errors.append(error_msg)
                logger.error(error_msg, exc_info=True)
                
                if not options.get('force', False):
                    raise
    
    def _process_batches_parallel(self, model_class, raw_data: List[Dict[str, Any]], options: Dict[str, Any]) -> None:
        """
        Process data batches in parallel.
        
        Args:
            model_class: Model class to use
            raw_data: Raw data to process
            options: Command options
        """
        # Split data into batches
        batches = [
            raw_data[i:i + self.batch_size]
            for i in range(0, len(raw_data), self.batch_size)
        ]
        
        # Process batches in parallel
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all batches
            future_to_batch = {
                executor.submit(self._process_batch, model_class, batch, options): i
                for i, batch in enumerate(batches)
            }
            
            # Process completed batches
            for future in as_completed(future_to_batch):
                batch_index = future_to_batch[future]
                
                try:
                    future.result()
                    
                    if self.verbose:
                        self.stdout.write(
                            f"Completed batch {batch_index + 1}/{len(batches)}"
                        )
                        
                except Exception as e:
                    error_msg = f"Failed to process batch {batch_index + 1}: {e}"
                    self.stats.errors.append(error_msg)
                    logger.error(error_msg, exc_info=True)
                    
                    if not options.get('force', False):
                        raise
    
    def _process_batch(self, model_class, batch: List[Dict[str, Any]], options: Dict[str, Any]) -> None:
        """
        Process a single batch of data.
        
        Args:
            model_class: Model class to use
            batch: Batch of raw data
            options: Command options
        """
        if self.dry_run:
            # Simulate processing without touching the database
            time.sleep(0.1)
            with self._stats_lock:
                self.stats.processed_records += len(batch)
            return
        
        try:
            with transaction.atomic():
                processed_objects = []
                
                for record in batch:
                    try:
                        # Validate data
                        if not options.get('skip_validation', False):
                            validation_level = options.get('validation_level', 'standard')
                            self._validate_record(record, validation_level)
                        
                        # Transform data
                        transformed_record = self._transform_record(record)
                        
                        # Create model instance
                        obj = model_class(**transformed_record)
                        processed_objects.append(obj)
                        
                        # Counters are shared across worker threads in --parallel mode
                        with self._stats_lock:
                            self.stats.processed_records += 1
                        
                    except Exception as e:
                        error_msg = f"Failed to process record: {e}"
                        logger.warning(error_msg)
                        
                        with self._stats_lock:
                            self.stats.failed_records += 1
                            self.stats.errors.append(error_msg)
                        
                        if not options.get('force', False):
                            raise
                
                # Bulk create objects
                if processed_objects:
                    model_class.objects.bulk_create(
                        processed_objects,
                        ignore_conflicts=True
                    )
                    
        except Exception as e:
            logger.error(f"Batch processing failed: {e}", exc_info=True)
            raise
    
    def _validate_record(self, record: Dict[str, Any], validation_level: str) -> None:
        """
        Validate a single data record.
        
        Args:
            record: Data record to validate
            validation_level: Validation level ('basic', 'standard', 'strict')
            
        Raises:
            DataValidationException: If validation fails
        """
        try:
            validate_analytics_data(record, level=validation_level)
        except Exception as e:
            raise DataValidationException(f"Record validation failed: {e}")
    
    def _transform_record(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """
        Transform raw record to model format.
        
        Args:
            record: Raw data record
            
        Returns:
            Transformed record
        """
        # This would contain actual transformation logic
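        # A hedged sketch of what a transformation might look like; the raw field
        # names ('ts', 'chan', 'views') are assumptions for illustration only:
        #
        #     return {
        #         'timestamp': datetime.fromisoformat(record['ts']),
        #         'channel_id': record['chan'],
        #         'view_count': int(record.get('views', 0)),
        #     }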
        # For now, return as-is
        return record
    
    def _process_impression_data(self, target_date: date, options: Dict[str, Any]) -> None:
        """
        Process impression data for the target date.
        
        Args:
            target_date: Date to process
            options: Command options
        """
        # Placeholder for impression data processing
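        # One possible approach, sketched as a comment; Impression field names
        # (created_at, campaign_id) are assumptions, not verified against the model:
        #
        #     daily = Impression.objects.filter(created_at__date=target_date)
        #     per_campaign = daily.values('campaign_id').annotate(total=Count('id'))
        #     ...aggregate and persist the per-campaign totals...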
        pass
    
    def _process_vast_data(self, target_date: date, options: Dict[str, Any]) -> None:
        """
        Process VAST response data for the target date.
        
        Args:
            target_date: Date to process
            options: Command options
        """
        # Placeholder for VAST data processing
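        # A hedged sketch (VastResponse field names such as created_at and status
        # are assumptions for illustration):
        #
        #     responses = VastResponse.objects.filter(created_at__date=target_date)
        #     error_rate = responses.filter(status='error').count() / max(responses.count(), 1)
        #     ...store the daily error rate alongside other quality metrics...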
        pass
    
    def _update_performance_metrics(self, target_date: date, options: Dict[str, Any]) -> None:
        """
        Update performance metrics for the target date.
        
        Args:
            target_date: Date to process
            options: Command options
        """
        if self.dry_run:
            return
        
        try:
            # Calculate and store performance metrics
            # This would contain actual metric calculation logic
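            # A hedged sketch, assuming PerformanceMetric exposes date/metric_name/value
            # fields (these names are illustrative, not confirmed by the model):
            #
            #     totals = SfrAnalytics.objects.filter(
            #         created_at__date=target_date
            #     ).aggregate(total_records=Count('id'))
            #     PerformanceMetric.objects.update_or_create(
            #         date=target_date,
            #         metric_name='daily_record_count',
            #         defaults={'value': totals['total_records'] or 0},
            #     )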
            pass
            
        except Exception as e:
            logger.error(f"Failed to update performance metrics: {e}", exc_info=True)
    
    def _update_last_processed_date(self, last_date: date) -> None:
        """
        Update the last processed date in cache.
        
        Args:
            last_date: Last processed date
        """
        cache_key = CACHE_KEYS.LAST_PROCESSED_DATE
        cache.set(cache_key, last_date, timeout=CACHE_TIMEOUTS.LONG)
    
    def _generate_summary(self) -> None:
        """
        Generate and display processing summary.
        """
        self.stats.end_time = timezone.now()
        self.stats.processing_time = (
            self.stats.end_time - self.stats.start_time
        ).total_seconds()
        
        # Display summary
        self.stdout.write("\n" + "=" * 50)
        self.stdout.write(self.style.SUCCESS("PROCESSING SUMMARY"))
        self.stdout.write("=" * 50)
        
        self.stdout.write(f"Total records: {self.stats.total_records}")
        self.stdout.write(f"Processed: {self.stats.processed_records}")
        self.stdout.write(f"Failed: {self.stats.failed_records}")
        self.stdout.write(f"Success rate: {self.stats.success_rate:.2f}%")
        self.stdout.write(f"Processing time: {self.stats.processing_time:.2f} seconds")
        self.stdout.write(f"Records/second: {self.stats.records_per_second:.2f}")
        
        if self.stats.errors:
            self.stdout.write(f"\nErrors ({len(self.stats.errors)}):")
            for error in self.stats.errors[-5:]:  # Show last 5 errors
                self.stdout.write(f"  - {error}")
            
            if len(self.stats.errors) > 5:
                self.stdout.write(f"  ... and {len(self.stats.errors) - 5} more")
        
        self.stdout.write("=" * 50)
    
    def _send_completion_notification(self) -> None:
        """
        Send completion notification.
        """
        try:
            notification_data = {
                'command': 'process_analytics_data',
                'status': 'completed' if self.stats.failed_records == 0 else 'completed_with_errors',
                'stats': {
                    'total_records': self.stats.total_records,
                    'processed_records': self.stats.processed_records,
                    'failed_records': self.stats.failed_records,
                    'success_rate': self.stats.success_rate,
                    'processing_time': self.stats.processing_time,
                    'records_per_second': self.stats.records_per_second
                },
                'errors': self.stats.errors[-10:] if self.stats.errors else [],
                'timestamp': timezone.now().isoformat()
            }
            
            send_notification(
                'analytics_processing_completed',
                notification_data
            )
            
        except Exception as e:
            logger.error(f"Failed to send notification: {e}")
    
    def _log_command_completion(self) -> None:
        """
        Log command completion.
        """
        logger.info(
            f"Analytics data processing completed",
            extra={
                'total_records': self.stats.total_records,
                'processed_records': self.stats.processed_records,
                'failed_records': self.stats.failed_records,
                'success_rate': self.stats.success_rate,
                'processing_time': self.stats.processing_time,
                'records_per_second': self.stats.records_per_second,
                'error_count': len(self.stats.errors)
            }
        )
    
    def _handle_command_error(self, error: Exception) -> None:
        """
        Handle command errors.
        
        Args:
            error: Exception that occurred
        """
        self.stats.end_time = timezone.now()
        
        error_msg = f"Command failed: {error}"
        self.stdout.write(self.style.ERROR(error_msg))
        
        logger.error(
            error_msg,
            extra={
                'total_records': self.stats.total_records,
                'processed_records': self.stats.processed_records,
                'failed_records': self.stats.failed_records,
                'error_count': len(self.stats.errors)
            },
            exc_info=True
        )
        
        # Send error notification
        if self.notification_enabled:
            try:
                send_notification(
                    'analytics_processing_failed',
                    {
                        'command': 'process_analytics_data',
                        'error': str(error),
                        'stats': {
                            'total_records': self.stats.total_records,
                            'processed_records': self.stats.processed_records,
                            'failed_records': self.stats.failed_records
                        },
                        'timestamp': timezone.now().isoformat()
                    }
                )
            except Exception as e:
                logger.error(f"Failed to send error notification: {e}")