#!/usr/bin/env python
"""Evaluate audio-separator models against the MUSDB18-HQ dataset.

Separates each track's mixture with every supported model, scores standard
vocals/instrumental outputs with museval, records per-model processing speed,
and maintains a combined museval results file plus per-model median scores.
"""
import os
import time
import json
import logging
import tempfile
import argparse
from decimal import Decimal
from json import JSONEncoder

import museval
import musdb
import numpy as np
import soundfile as sf
from audio_separator.separator import Separator

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


# Custom JSON Encoder to handle Decimal types
class DecimalEncoder(JSONEncoder):
    def default(self, obj):
        if isinstance(obj, Decimal):
            return float(obj)
        return super().default(obj)


MUSDB_PATH = "/Volumes/Nomad4TBOne/python-audio-separator/tests/model-metrics/datasets/musdb18hq"
RESULTS_PATH = "/Volumes/Nomad4TBOne/python-audio-separator/tests/model-metrics/results"
COMBINED_RESULTS_PATH = "/Users/andrew/Projects/python-audio-separator/audio_separator/models-scores.json"
COMBINED_MUSEVAL_RESULTS_PATH = "/Volumes/Nomad4TBOne/python-audio-separator/tests/model-metrics/results/combined-museval-results.json"
STOP_SIGNAL_PATH = "/Volumes/Nomad4TBOne/python-audio-separator/tests/model-metrics/stop-signal"


def load_combined_results():
    """Load the combined museval results file"""
    if os.path.exists(COMBINED_MUSEVAL_RESULTS_PATH):
        logger.info("Loading combined museval results...")

        # Use a custom parser to handle Decimal values
        # (defined before the try block so the backup-loading path can also use it)
        def decimal_parser(dct):
            for k, v in dct.items():
                if isinstance(v, str) and v.replace(".", "").isdigit():
                    try:
                        dct[k] = float(v)
                    except (ValueError, TypeError):
                        pass
            return dct

        try:
            with open(COMBINED_MUSEVAL_RESULTS_PATH, "r") as f:
                return json.load(f, object_hook=decimal_parser)
        except Exception as e:
            logger.error(f"Error loading combined results: {str(e)}")

            # Try to load a backup file if it exists
            backup_path = COMBINED_MUSEVAL_RESULTS_PATH + ".backup"
            if os.path.exists(backup_path):
                logger.info("Attempting to load backup file...")
                try:
                    with open(backup_path, "r") as f:
                        return json.load(f, object_hook=decimal_parser)
                except Exception as backup_e:
                    logger.error(f"Error loading backup file: {str(backup_e)}")
            return {}
    else:
        logger.info("No combined results file found, creating new one")
        return {}


def save_combined_results(combined_results):
    """Save the combined museval results file"""
    logger.info("Saving combined museval results...")
    try:
        # Create a backup of the existing file if it exists
        if os.path.exists(COMBINED_MUSEVAL_RESULTS_PATH):
            backup_path = COMBINED_MUSEVAL_RESULTS_PATH + ".backup"
            try:
                with open(COMBINED_MUSEVAL_RESULTS_PATH, "r") as src, open(backup_path, "w") as dst:
                    dst.write(src.read())
            except Exception as e:
                logger.error(f"Error creating backup file: {str(e)}")

        # Save the new results using the custom encoder
        with open(COMBINED_MUSEVAL_RESULTS_PATH, "w") as f:
            json.dump(combined_results, f, cls=DecimalEncoder, indent=2)

        logger.info("Combined results saved successfully")
        return True
    except Exception as e:
        logger.error(f"Error saving combined results: {str(e)}")
        return False


def update_combined_results(model_name, track_name, track_data):
    """Update the combined results file with new track data"""
    try:
        # Load existing combined results
        combined_results = load_combined_results()

        # Initialize model entry if it doesn't exist
        if model_name not in combined_results:
            combined_results[model_name] = {}

        # Add or update track data
        combined_results[model_name][track_name] = track_data

        # Write updated results back to file
        save_combined_results(combined_results)
        return True
    except Exception as e:
        logger.error(f"Error updating combined results: {str(e)}")
        return False
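
# Rough shape of the combined museval results file handled by the helpers above
# (a sketch inferred from how this script reads and writes it, not the full
# museval schema; real museval output also carries per-frame data):
#
#   {
#       "<model_filename>": {
#           "<track_name>": {
#               "targets": [
#                   {
#                       "name": "vocals",  # "speed_metrics_only" when only timing was recorded
#                       "metrics": {"seconds_per_minute_m3": 12.3},
#                   },
#               ],
#           },
#       },
#   }
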

def check_track_evaluated(model_name, track_name):
    """Check if a track has already been evaluated for a specific model"""
    combined_results = load_combined_results()
    return model_name in combined_results and track_name in combined_results[model_name]


def get_track_results(model_name, track_name):
    """Get the evaluation results for a specific track and model"""
    combined_results = load_combined_results()
    if model_name in combined_results and track_name in combined_results[model_name]:
        return combined_results[model_name][track_name]
    return None


def get_track_duration(track_path):
    """Get the duration of a track in minutes"""
    try:
        mixture_path = os.path.join(track_path, "mixture.wav")
        info = sf.info(mixture_path)
        return info.duration / 60.0  # Convert seconds to minutes
    except Exception as e:
        logger.error(f"Error getting track duration: {str(e)}")
        return 0.0


def evaluate_track(track_name, track_path, test_model, mus_db):
    """Evaluate a single track using a specific model"""
    logger.info(f"Evaluating track: {track_name} with model: {test_model}")

    # Get track duration in minutes
    track_duration_minutes = get_track_duration(track_path)
    logger.info(f"Track duration: {track_duration_minutes:.2f} minutes")

    # Initialize variables to track processing time
    processing_time = 0
    seconds_per_minute = 0

    # Create a basic result structure that will be returned even if evaluation fails
    basic_model_results = {"track_name": track_name, "scores": {}}

    # Check if evaluation results already exist in combined file
    museval_results = load_combined_results()
    if test_model in museval_results and track_name in museval_results[test_model]:
        logger.info("Found existing evaluation results in combined file...")
        track_data = museval_results[test_model][track_name]
        scores = museval.TrackStore(track_name)
        scores.scores = track_data

        # Try to extract existing speed metrics if available
        try:
            if isinstance(track_data, dict) and "targets" in track_data:
                for target in track_data["targets"]:
                    if "metrics" in target and "seconds_per_minute_m3" in target["metrics"]:
                        basic_model_results["scores"]["seconds_per_minute_m3"] = target["metrics"]["seconds_per_minute_m3"]
                        break
        except Exception:
            pass  # Ignore errors in extracting existing speed metrics
    else:
        # Expanded stem mapping to include "no-stem" outputs and custom stem formats
        stem_mapping = {
            # Standard stems
            "Vocals": "vocals",
            "Instrumental": "instrumental",
            "Drums": "drums",
            "Bass": "bass",
            "Other": "other",
            # No-stem variants
            "No Drums": "nodrums",
            "No Bass": "nobass",
            "No Other": "noother",
            # Custom stem formats (with hyphens)
            "Drum-Bass": "drumbass",
            "No Drum-Bass": "nodrumbass",
            "Vocals-Other": "vocalsother",
            "No Vocals-Other": "novocalsother",
        }

        # Create a temporary directory for separation files
        with tempfile.TemporaryDirectory() as temp_dir:
            logger.info(f"Using temporary directory: {temp_dir}")

            # Measure separation time
            start_time = time.time()

            # Perform separation
            logger.info("Performing separation...")
            separator = Separator(output_dir=temp_dir)
            separator.load_model(model_filename=test_model)
            separator.separate(os.path.join(track_path, "mixture.wav"), custom_output_names=stem_mapping)

            # Calculate processing time
            processing_time = time.time() - start_time
            seconds_per_minute = processing_time / track_duration_minutes if track_duration_minutes > 0 else 0

            logger.info(f"Separation completed in {processing_time:.2f} seconds")
            logger.info(f"Processing speed: {seconds_per_minute:.2f} seconds per minute of audio")

            # Always add the speed metric to our basic results
            basic_model_results["scores"]["seconds_per_minute_m3"] = round(seconds_per_minute, 1)

            # Check which stems were actually created
            wav_files = [f for f in os.listdir(temp_dir) if f.endswith(".wav")]
            logger.info(f"Found WAV files: {wav_files}")

            # Determine if this is a standard vocal/instrumental model that can be evaluated with museval
            standard_model = False
            if len(wav_files) == 2:
                # Check if one of the files is named vocals.wav or instrumental.wav
                if "vocals.wav" in wav_files and "instrumental.wav" in wav_files:
                    standard_model = True
                    logger.info("Detected standard vocals/instrumental model, will run museval evaluation")

            # If not a standard model, skip museval evaluation and just return speed metrics
            if not standard_model:
                logger.info(f"Non-standard stem configuration detected for model {test_model}, skipping museval evaluation")

                # Store the speed metric in the combined results
                if test_model not in museval_results:
                    museval_results[test_model] = {}

                # Create a minimal structure for the speed metric
                minimal_results = {"targets": [{"name": "speed_metrics_only", "metrics": {"seconds_per_minute_m3": round(seconds_per_minute, 1)}}]}
                museval_results[test_model][track_name] = minimal_results
                save_combined_results(museval_results)

                return None, basic_model_results

            # For standard models, proceed with museval evaluation
            available_stems = {}
            available_stems["vocals"] = os.path.join(temp_dir, "vocals.wav")
            available_stems["accompaniment"] = os.path.join(temp_dir, "instrumental.wav")

            # Get track from MUSDB
            track = next((t for t in mus_db if t.name == track_name), None)
            if track is None:
                raise ValueError(f"Track {track_name} not found in MUSDB18")

            # Load available stems
            estimates = {}
            for stem_name, stem_path in available_stems.items():
                audio, _ = sf.read(stem_path)
                if len(audio.shape) == 1:
                    audio = np.expand_dims(audio, axis=1)
                estimates[stem_name] = audio

            # Evaluate using museval
            logger.info(f"Evaluating stems: {list(estimates.keys())}")
            try:
                scores = museval.eval_mus_track(track, estimates, output_dir=temp_dir, mode="v4")

                # Add the speed metric to the scores
                if not hasattr(scores, "speed_metric_added"):
                    for target in scores.scores["targets"]:
                        if "metrics" not in target:
                            target["metrics"] = {}
                        target["metrics"]["seconds_per_minute_m3"] = round(seconds_per_minute, 1)
                    scores.speed_metric_added = True

                # Update the combined results file with the new evaluation
                if test_model not in museval_results:
                    museval_results[test_model] = {}
                museval_results[test_model][track_name] = scores.scores
                save_combined_results(museval_results)
            except Exception as e:
                logger.error(f"Error during museval evaluation: {str(e)}")
                logger.exception("Evaluation exception details:")
                # Return basic results with just the speed metric
                return None, basic_model_results

    try:
        # Only process museval results if we have them
        if "scores" in locals() and scores is not None:
            # Calculate aggregate scores for available stems
            results_store = museval.EvalStore()
            results_store.add_track(scores.df)
            methods = museval.MethodStore()
            methods.add_evalstore(results_store, name=test_model)
            agg_scores = methods.agg_frames_tracks_scores()

            # Return the aggregate scores in a structured format with 6 significant figures
            model_results = {"track_name": track_name, "scores": {}}

            for stem in ["vocals", "drums", "bass", "other", "accompaniment"]:
                try:
                    stem_scores = {metric: float(f"{agg_scores.loc[(test_model, stem, metric)]:.6g}") for metric in ["SDR", "SIR", "SAR", "ISR"]}
                    # Rename 'accompaniment' to 'instrumental' in the output
                    output_stem = "instrumental" if stem == "accompaniment" else stem
                    model_results["scores"][output_stem] = stem_scores
                except KeyError:
                    continue

            # Add the seconds_per_minute_m3 metric if it was calculated
            if processing_time > 0 and track_duration_minutes > 0:
                model_results["scores"]["seconds_per_minute_m3"] = round(seconds_per_minute, 1)

            return scores, model_results if model_results["scores"] else basic_model_results
        else:
            # If we don't have scores, just return the basic results with speed metrics
            return None, basic_model_results
    except Exception as e:
        logger.error(f"Error processing evaluation results: {str(e)}")
        logger.exception("Results processing exception details:")
        # Return basic results with just the speed metric
        return None, basic_model_results


def convert_decimal_to_float(obj):
    """Recursively converts Decimal objects to floats in a nested structure."""
    if isinstance(obj, Decimal):
        return float(obj)
    elif isinstance(obj, dict):
        return {k: convert_decimal_to_float(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_decimal_to_float(x) for x in obj]
    return obj


def calculate_median_scores(track_scores):
    """Calculate median scores across all tracks for each stem and metric"""
    # Initialize containers for each stem's metrics
    stem_metrics = {
        "vocals": {"SDR": [], "SIR": [], "SAR": [], "ISR": []},
        "drums": {"SDR": [], "SIR": [], "SAR": [], "ISR": []},
        "bass": {"SDR": [], "SIR": [], "SAR": [], "ISR": []},
        "instrumental": {"SDR": [], "SIR": [], "SAR": [], "ISR": []},
        "seconds_per_minute_m3": [],
    }

    # Collect all scores for each stem and metric
    for track_score in track_scores:
        if track_score is not None and "scores" in track_score:
            # Process audio quality metrics
            for stem, metrics in track_score["scores"].items():
                if stem in stem_metrics and stem != "seconds_per_minute_m3":
                    for metric, value in metrics.items():
                        stem_metrics[stem][metric].append(value)

            # Process speed metric separately
            if "seconds_per_minute_m3" in track_score["scores"]:
                stem_metrics["seconds_per_minute_m3"].append(track_score["scores"]["seconds_per_minute_m3"])

    # Calculate medians for each stem and metric
    median_scores = {}
    for stem, metrics in stem_metrics.items():
        if stem != "seconds_per_minute_m3" and any(metrics.values()):  # Only include stems that have scores
            median_scores[stem] = {metric: float(f"{np.median(values):.6g}") for metric, values in metrics.items() if values}  # Only include metrics that have values

    # Add median speed metric if available
    if stem_metrics["seconds_per_minute_m3"]:
        median_scores["seconds_per_minute_m3"] = round(np.median(stem_metrics["seconds_per_minute_m3"]), 1)

    return median_scores


def check_disk_usage(path):
    """Check inode usage and disk space on the filesystem containing path"""
    import subprocess
    import sys

    # Check disk space first
    result = subprocess.run(["df", "-h", path], capture_output=True, text=True)
    output = result.stdout
    logger.info(f"Current disk usage:\n{output}")

    # Parse the output to get disk usage percentage
    lines = output.strip().split("\n")
    if len(lines) >= 2:
        parts = lines[1].split()
        if len(parts) >= 5:
            try:
                # Extract disk usage percentage
                disk_usage_str = parts[4].rstrip("%")
                disk_usage_pct = int(disk_usage_str)

                logger.info(f"Disk usage: {disk_usage_pct}%")

                if disk_usage_pct >= 99:
                    logger.critical("CRITICAL: Disk is almost full (>99%)! Cannot continue processing.")
                    logger.critical("Please free up disk space before continuing.")
                    sys.exit(1)
                elif disk_usage_pct > 95:
                    logger.warning(f"WARNING: High disk usage ({disk_usage_pct}%)!")
            except (ValueError, IndexError) as e:
                logger.error(f"Error parsing disk usage: {str(e)}")

    # Now check inode usage
    result = subprocess.run(["df", "-i", path], capture_output=True, text=True)
    output = result.stdout
    logger.info(f"Current inode usage:\n{output}")

    # Parse the output to get inode usage percentage
    lines = output.strip().split("\n")
    if len(lines) >= 2:
        # The second line contains the actual data
        parts = lines[1].split()
        if len(parts) >= 8:  # macOS df -i format has 8 columns
            try:
                # On macOS, inode usage is in the 8th column as a percentage
                inode_usage_str = parts[7].rstrip("%")
                inode_usage_pct = int(inode_usage_str)

                # Also extract the actual inode numbers for better reporting
                iused = int(parts[5])
                ifree = int(parts[6])
                total_inodes = iused + ifree

                # Skip inode check for exFAT or similar filesystems
                if total_inodes <= 1:
                    logger.info("Filesystem appears to be exFAT or similar (no real inode tracking). Skipping inode check.")
                    return None

                logger.info(f"Inode usage: {iused:,}/{total_inodes:,} ({inode_usage_pct}%)")

                if inode_usage_pct >= 100:
                    logger.critical("CRITICAL: Inode usage is at 100%! Cannot continue processing.")
                    logger.critical("Please free up inodes before continuing.")
                    sys.exit(1)
                elif inode_usage_pct > 90:
                    logger.warning(f"WARNING: High inode usage ({inode_usage_pct}%)!")

                return inode_usage_pct
            except (ValueError, IndexError) as e:
                logger.error(f"Error parsing inode usage: {str(e)}")

    return None


def get_evaluated_track_count(model_name, museval_results):
    """Get the number of tracks evaluated for a specific model"""
    if model_name in museval_results:
        return len(museval_results[model_name])
    return 0


def get_most_evaluated_tracks(museval_results, min_count=10):
    """Get tracks that have been evaluated for the most models"""
    track_counts = {}

    # Count how many models have evaluated each track
    for model_name, tracks in museval_results.items():
        for track_name in tracks:
            if track_name not in track_counts:
                track_counts[track_name] = 0
            track_counts[track_name] += 1

    # Sort tracks by evaluation count (descending)
    sorted_tracks = sorted(track_counts.items(), key=lambda x: x[1], reverse=True)

    # Return tracks that have been evaluated at least min_count times
    return [track for track, count in sorted_tracks if count >= min_count]


def generate_summary_statistics(
    start_time,
    models_processed,
    tracks_processed,
    models_with_new_data,
    tracks_evaluated,
    total_processing_time,
    fastest_model=None,
    slowest_model=None,
    combined_results_path=None,
    is_dry_run=False,
):
    """Generate a summary of the script's execution"""
    end_time = time.time()
    total_runtime = end_time - start_time

    # Format the runtime
    hours, remainder = divmod(total_runtime, 3600)
    minutes, seconds = divmod(remainder, 60)
    runtime_str = f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"

    # Build the summary
    summary = [
        "=" * 80,
        "DRY RUN SUMMARY - PREVIEW ONLY" if is_dry_run else "EXECUTION SUMMARY",
        "=" * 80,
        f"Total runtime: {runtime_str}",
        f"Models {'that would be' if is_dry_run else ''} processed: {models_processed}",
        f"Models {'that would receive' if is_dry_run else 'with'} new data: {len(models_with_new_data)}",
        f"Total tracks {'that would be' if is_dry_run else ''} evaluated: {tracks_evaluated}",
        f"Average tracks per model: {tracks_evaluated / len(models_with_new_data) if models_with_new_data else 0:.2f}",
    ]

    if fastest_model:
        summary.append(f"Fastest model: {fastest_model['name']} ({fastest_model['speed']:.2f} seconds per minute)")

    if slowest_model:
        summary.append(f"Slowest model: {slowest_model['name']} ({slowest_model['speed']:.2f} seconds per minute)")

    if total_processing_time > 0:
        summary.append(f"Total audio processing time: {total_processing_time:.2f} seconds")

    if combined_results_path and os.path.exists(combined_results_path):
        file_size = os.path.getsize(combined_results_path) / (1024 * 1024)  # Size in MB
        summary.append(f"Results file size: {file_size:.2f} MB")

    # Add models with new data
    if models_with_new_data:
        summary.append(f"\nModels {'that would receive' if is_dry_run else 'with'} new evaluation data:")
        for model_name in models_with_new_data:
            summary.append(f"- {model_name}")

    # Add dry run disclaimer if needed
    if is_dry_run:
        summary.append("\nNOTE: This is a dry run summary. No actual changes were made.")
        summary.append("Run without --dry-run to perform actual evaluations.")

    summary.append("=" * 80)
    return "\n".join(summary)


def check_stop_signal():
    """Check if the stop signal file exists"""
    if os.path.exists(STOP_SIGNAL_PATH):
        logger.info("Stop signal detected at: " + STOP_SIGNAL_PATH)
        return True
    return False


def main():
    # Add command line argument parsing for dry run mode
    parser = argparse.ArgumentParser(description="Run model evaluation on MUSDB18 dataset")
    parser.add_argument("--dry-run", action="store_true", help="Run in dry-run mode (no writes)")
    parser.add_argument("--max-tracks", type=int, default=10, help="Maximum number of tracks to evaluate per model")
    parser.add_argument("--max-models", type=int, default=None, help="Maximum number of models to evaluate")
    args = parser.parse_args()

    # Remove any existing stop signal file at start
    if os.path.exists(STOP_SIGNAL_PATH):
        os.remove(STOP_SIGNAL_PATH)
        logger.info("Removed existing stop signal file")

    # Track start time for progress reporting
    start_time = time.time()

    # Statistics tracking
    models_processed = 0
    tracks_processed = 0
    models_with_new_data = set()
    total_processing_time = 0
    fastest_model = {"name": "", "speed": float("inf")}  # Initialize with infinity for comparison
    slowest_model = {"name": "", "speed": 0}  # Initialize with zero for comparison

    # Create a results cache manager
    class ResultsCache:
        def __init__(self):
            self.results = load_combined_results()
            self.last_update_time = time.time()

        def get_results(self, force=False):
            current_time = time.time()
            # Only reload from disk every 5 minutes unless forced
            if force or (current_time - self.last_update_time) > 300:
                self.results = load_combined_results()
                self.last_update_time = current_time
            return self.results

    results_cache = ResultsCache()

    # Helper function for logging with elapsed time
    def log_with_time(message, level=logging.INFO):
        elapsed = time.time() - start_time
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)
        time_str = f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"
        logger.log(level, f"[{time_str}] {message}")

    if args.dry_run:
        log_with_time("*** RUNNING IN DRY-RUN MODE - NO DATA WILL BE MODIFIED ***")

    log_with_time("Starting model evaluation script...")
    os.makedirs(RESULTS_PATH, exist_ok=True)

    # Check disk space and inode usage at start
    check_disk_usage(RESULTS_PATH)

    # Load existing results if available
    combined_results = {}
    if os.path.exists(COMBINED_RESULTS_PATH):
        log_with_time("Loading existing combined results...")
        with open(COMBINED_RESULTS_PATH) as f:
            combined_results = json.load(f)

    # Get initial museval results
    museval_results = results_cache.get_results()
    log_with_time(f"Loaded combined museval results with {len(museval_results)} models")

    # Get the most commonly evaluated tracks
    common_tracks = get_most_evaluated_tracks(museval_results)
    log_with_time(f"Found {len(common_tracks)} commonly evaluated tracks")

    # Initialize MUSDB
    log_with_time("Initializing MUSDB database...")
    mus = musdb.DB(root=MUSDB_PATH, is_wav=True)

    # Create a prioritized list of tracks
    all_tracks = []
    for track in mus.tracks:
        # Check if this is a commonly evaluated track
        is_common = track.name in common_tracks
        all_tracks.append({"name": track.name, "path": os.path.dirname(track.path), "is_common": is_common})

    # Sort tracks by whether they're commonly evaluated
    all_tracks.sort(key=lambda t: 0 if t["is_common"] else 1)

    # Get list of all available models
    log_with_time("Getting list of available models...")
    separator = Separator()
    models_by_type = separator.list_supported_model_files()

    # Flatten the models list and prioritize them
    all_models = []
    for model_type, models in models_by_type.items():
        for model_name, model_info in models.items():
            filename = model_info.get("filename")
            if filename:
                # Count how many tracks have been evaluated for this model
                evaluated_count = get_evaluated_track_count(filename, museval_results)

                # Determine if this is a roformer model
                is_roformer = "roformer" in model_name.lower()

                # Add to the list with priority information
                all_models.append({"name": model_name, "filename": filename, "type": model_type, "info": model_info, "evaluated_count": evaluated_count, "is_roformer": is_roformer})

    # Sort models by priority:
    # 1. Roformer models with fewer than max_tracks evaluations
    # 2. Other models with fewer than max_tracks evaluations
    # 3. Roformer models with more evaluations
    # 4. Other models with more evaluations
    all_models.sort(
        key=lambda m: (
            0 if m["is_roformer"] and m["evaluated_count"] < args.max_tracks
            else 1 if not m["is_roformer"] and m["evaluated_count"] < args.max_tracks
            else 2 if m["is_roformer"]
            else 3,
            m["evaluated_count"],  # Secondary sort by number of evaluations (ascending)
        )
    )

    # Log the prioritized models
    log_with_time(f"Prioritized {len(all_models)} models for evaluation:")
    for i, model in enumerate(all_models[:10]):  # Show top 10
        log_with_time(f"{i+1}. {model['name']} ({model['filename']}) - {model['evaluated_count']} tracks evaluated, roformer: {model['is_roformer']}")
    if len(all_models) > 10:
        log_with_time(f"... and {len(all_models) - 10} more models")

    # Limit the number of models if specified
    if args.max_models:
        all_models = all_models[: args.max_models]
        log_with_time(f"Limited to {args.max_models} models for this run")

    # Process models according to priority
    model_idx = 0
    stop_requested = False

    while model_idx < len(all_models):
        # Check for stop signal before processing each model
        if check_stop_signal():
            log_with_time("Stop signal detected. Will finish current model's tracks and then exit.")
            stop_requested = True

        model = all_models[model_idx]
        model_name = model["name"]
        model_filename = model["filename"]
        model_type = model["type"]

        progress_pct = (model_idx + 1) / len(all_models) * 100
        log_with_time(f"\n=== Processing model {model_idx+1}/{len(all_models)} ({progress_pct:.1f}%): {model_name} ({model_filename}) ===")

        # Initialize model entry if it doesn't exist
        if model_filename not in combined_results:
            log_with_time(f"Initializing new entry for {model_filename}")
            combined_results[model_filename] = {"model_name": model_name, "track_scores": [], "median_scores": {}, "stems": [], "target_stem": None}

        # Try to load the model to get stem information
        try:
            separator.load_model(model_filename=model_filename)
            model_data = separator.model_instance.model_data
            # Extract stem information (similar to your existing code)
            # ... (keep your existing stem extraction logic here)
        except Exception as e:
            log_with_time(f"Error loading model {model_filename}: {str(e)}", logging.ERROR)
            logger.exception("Full exception details:")
            model_idx += 1
            continue

        # Count how many tracks have been evaluated for this model
        # Use the cached results
        evaluated_count = get_evaluated_track_count(model_filename, results_cache.get_results())

        # Determine how many more tracks to evaluate
        tracks_to_evaluate = max(0, args.max_tracks - evaluated_count)

        if tracks_to_evaluate == 0:
            log_with_time(f"Model {model_name} already has {evaluated_count} tracks evaluated (>= {args.max_tracks}). Skipping.")
            model_idx += 1
            continue

        log_with_time(f"Will evaluate up to {tracks_to_evaluate} tracks for model {model_name}")

        # Process tracks for this model
        # (per-model counter; tracks_processed above keeps the run-wide total for the summary)
        model_tracks_processed = 0
        for track in all_tracks:
            # Check for stop signal before each track if we haven't already detected it
            if not stop_requested and check_stop_signal():
                log_with_time("Stop signal detected. Will finish current track and then exit.")
                stop_requested = True

            # Skip if we've processed enough tracks for this model
            if model_tracks_processed >= tracks_to_evaluate:
                break

            track_name = track["name"]
            track_path = track["path"]

            # Skip if track already evaluated for this model
            # Use the cached results
            if model_filename in results_cache.get_results() and track_name in results_cache.get_results()[model_filename]:
                log_with_time(f"Skipping already evaluated track {track_name} for model: {model_filename}")
                continue

            log_with_time(f"Processing track: {track_name} for model: {model_filename}")

            if args.dry_run:
                log_with_time(f"[DRY RUN] Would evaluate track {track_name} with model {model_filename}")
                model_tracks_processed += 1
                tracks_processed += 1
                models_with_new_data.add(model_filename)

                # Estimate processing time based on model type for dry run
                # This is a rough estimate - roformer models are typically slower
                estimated_speed = 30.0  # Default estimate: 30 seconds per minute
                if "roformer" in model_name.lower():
                    estimated_speed = 45.0  # Roformer models are typically slower
                elif "umx" in model_name.lower():
                    estimated_speed = 20.0  # UMX models are typically faster

                # Update statistics with estimated values
                total_processing_time += estimated_speed

                # Track fastest and slowest models based on estimates
                if estimated_speed < fastest_model["speed"]:
                    fastest_model = {"name": model_name, "speed": estimated_speed}
                if estimated_speed > slowest_model["speed"]:
                    slowest_model = {"name": model_name, "speed": estimated_speed}

                continue

            try:
                result = evaluate_track(track_name, track_path, model_filename, mus)

                # Unpack the result safely
                if result and isinstance(result, tuple) and len(result) == 2:
                    _, model_results = result
                else:
                    model_results = None

                # Process the results if they exist and are valid
                if model_results is not None and isinstance(model_results, dict):
                    combined_results[model_filename]["track_scores"].append(model_results)
                    model_tracks_processed += 1
                    tracks_processed += 1
                    models_with_new_data.add(model_filename)

                    # Track processing time statistics - safely access nested dictionaries
                    scores = model_results.get("scores", {})
                    if isinstance(scores, dict):
                        speed = scores.get("seconds_per_minute_m3")
                        if speed is not None:
                            total_processing_time += speed  # Accumulate total processing time

                            # Track fastest and slowest models
                            if speed < fastest_model["speed"]:
                                fastest_model = {"name": model_name, "speed": speed}
                            if speed > slowest_model["speed"]:
                                slowest_model = {"name": model_name, "speed": speed}
                else:
                    log_with_time(f"Skipping model {model_filename} for track {track_name} due to no evaluatable stems or invalid results")
            except Exception as e:
                log_with_time(f"Error evaluating model {model_filename} with track {track_name}: {str(e)}", logging.ERROR)
                logger.exception("Exception details:", exc_info=e)
                continue

            # Update and save results
            if combined_results[model_filename]["track_scores"]:
                median_scores = calculate_median_scores(combined_results[model_filename]["track_scores"])
                combined_results[model_filename]["median_scores"] = median_scores

            # Save results after each track
            if not args.dry_run:
                os.makedirs(os.path.dirname(COMBINED_RESULTS_PATH), exist_ok=True)
                with open(COMBINED_RESULTS_PATH, "w", encoding="utf-8") as f:
                    json.dump(combined_results, f, indent=2)
                log_with_time(f"Updated combined results file with {model_filename} - {track_name}")

                # Force update the cache after saving
                results_cache.get_results(force=True)
            else:
                log_with_time(f"[DRY RUN] Would have updated combined results for {model_filename} - {track_name}")

            # Check disk space periodically
            check_disk_usage(RESULTS_PATH)

        log_with_time(f"Completed processing {model_tracks_processed} tracks for model {model_name}")

        # If stop was requested, exit after completing the current model
        if stop_requested:
            log_with_time("Stop signal processed. Generating final summary before exit.")
            break

        # If we're processing a non-roformer model, check if there are roformer models that need evaluation
        if not model["is_roformer"]:
            # Find roformer models that still need more evaluations
            # Use the cached results
            roformer_models_needing_eval = []
            for i, m in enumerate(all_models[model_idx + 1 :], start=model_idx + 1):
                if m["is_roformer"]:
                    eval_count = get_evaluated_track_count(m["filename"], results_cache.get_results())
                    if eval_count < args.max_tracks:
                        roformer_models_needing_eval.append((i, m))

            if roformer_models_needing_eval:
                log_with_time(f"Found {len(roformer_models_needing_eval)} roformer models that still need evaluation. Reprioritizing...")

                # Move these models to the front of the remaining queue.
                # Popping an entry from further down the list and re-inserting it at
                # model_idx + 1 leaves the indices of the remaining (later) candidates
                # unchanged, so no extra index adjustment is needed here.
                for i, m in roformer_models_needing_eval:
                    # Move this model right after the current one
                    all_models.insert(model_idx + 1, all_models.pop(i))

                log_with_time("Reprioritization complete. Continuing with highest priority model.")

        # Move to the next model
        model_idx += 1
        models_processed += 1

    log_with_time("Evaluation complete")

    # Final disk space check
    check_disk_usage(RESULTS_PATH)

    # Generate and display summary statistics
    # Reset fastest/slowest models if they weren't updated
    if fastest_model["speed"] == float("inf"):
        fastest_model = None
    if slowest_model["speed"] == 0:
        slowest_model = None

    summary = generate_summary_statistics(
        start_time=start_time,
        models_processed=models_processed,
        tracks_processed=tracks_processed,
        models_with_new_data=models_with_new_data,
        tracks_evaluated=tracks_processed,
        total_processing_time=total_processing_time,
        fastest_model=fastest_model,
        slowest_model=slowest_model,
        combined_results_path=COMBINED_RESULTS_PATH,
        is_dry_run=args.dry_run,
    )

    log_with_time("\n" + summary)

    # Also write summary to a log file
    summary_filename = "dry_run_summary.log" if args.dry_run else "evaluation_summary.log"
    if stop_requested:
        summary_filename = "stopped_" + summary_filename

    summary_log_path = os.path.join(os.path.dirname(COMBINED_RESULTS_PATH), summary_filename)
    with open(summary_log_path, "w") as f:
        f.write(f"{'Dry run' if args.dry_run else 'Evaluation'} {'(stopped early)' if stop_requested else ''} completed at: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(summary)

    log_with_time(f"Summary written to {summary_log_path}")

    # Clean up stop signal file if it exists
    if os.path.exists(STOP_SIGNAL_PATH):
        os.remove(STOP_SIGNAL_PATH)
        log_with_time("Removed stop signal file")

    return 0 if not stop_requested else 2  # Return different exit code if stopped early


if __name__ == "__main__":
    exit(main())
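
# Usage sketch (illustrative; the script filename below is a placeholder, and the
# MUSDB_PATH / RESULTS_PATH / *_RESULTS_PATH constants near the top of this file
# are machine-specific and would need adjusting before running):
#
#   python evaluate_model_metrics.py --dry-run --max-models 5
#   python evaluate_model_metrics.py --max-tracks 10
#
# To request a graceful early stop, create an empty file at STOP_SIGNAL_PATH;
# the main loop checks for it between models and between tracks, wraps up the
# work in progress, writes a summary log, and exits with code 2.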