"""Configuration dataclasses for run_experiment().
This module provides typed configuration classes for running experiments
that compare multiple model configurations against a single dataset.
"""
from __future__ import annotations
from dataclasses import dataclass, field
import hashlib
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable
if TYPE_CHECKING:
import yipit_databricks_client as ydbc
from ml_toolkit.functions.eval_utils.constants import DEFAULT_MAX_OUTPUT_TOKENS, Metric
from ml_toolkit.functions.eval_utils.helpers.types import LLMJudgeConfig, YAMLMixin
from ml_toolkit.ops.helpers.logger import get_logger
def _convert_template_syntax(template: str) -> str:
"""Convert template syntax from {{variable}} (MLflow registry) to <<variable>> (eval_utils).
Args:
template: Template string with {{variable}} syntax.
Returns:
Template string with <<variable>> syntax.
"""
import re
# Replace {{variable}} with <<variable>>
# Handle both {{variable}} and {{ variable }} (with spaces)
pattern = r"\{\{\s*(\w+)\s*\}\}"
return re.sub(pattern, r"<<\1>>", template)
def _normalize_template_whitespace(template: str) -> str:
"""Normalize whitespace in a template: collapse newlines to single space, trim.
Prompts loaded from the registry often contain literal newlines (e.g. from
multi-line strings). This normalizes them so the template behaves like a
single-line direct prompt unless the user explicitly keeps newlines.
Args:
template: Template string (may contain \\n, \\r\\n).
Returns:
Template with newlines replaced by space and multiple spaces collapsed.
"""
import re
# Replace any newline (CR, LF, CRLF) with a space
out = template.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
# Collapse multiple spaces into one
out = re.sub(r" +", " ", out)
return out.strip()
def _load_prompt_from_registry(
    prompt_registry_name: str,
    prompt_version: str | int | None = None,
    prompt_alias: str | None = None,
) -> tuple[str | None, str | None]:
    """Load prompt template and system prompt from MLflow Prompt Registry.

    Args:
        prompt_registry_name: Fully qualified prompt name (catalog.schema.prompt_name)
            or URI (prompts:/catalog.schema.prompt_name@alias).
        prompt_version: Specific version to load. Ignored if prompt_registry_name
            includes version/alias.
        prompt_alias: Alias to load (e.g., 'production'). Ignored if
            prompt_registry_name includes alias.

    Returns:
        Tuple of (prompt_template, system_prompt). Both may be None if not found.

    Raises:
        ImportError: If prompt registry module is not available.
        ValueError: If prompt cannot be loaded.
    """
    logger = get_logger()
    try:
        from ml_toolkit.functions.llm.prompt.function import load_prompt_from_mlflow
    except ImportError as import_err:
        raise ImportError(
            "Prompt registry integration requires ml_toolkit.functions.llm.prompt module. "
            "Ensure it is available in your environment."
        ) from import_err

    # Resolve the URI to load. Version/alias is always encoded in the URI
    # itself (never passed as a separate parameter) so MLflow does not fall
    # back to a non-existent "latest" alias.
    if prompt_registry_name.startswith("prompts:/"):
        # Already a full URI — use as-is; MLflow handles any embedded
        # version/alias.
        prompt_uri = prompt_registry_name
    elif prompt_alias:
        # Alias in the URI takes precedence over any prompt_version.
        prompt_uri = f"prompts:/{prompt_registry_name}@{prompt_alias}"
    elif prompt_version is not None:
        # URI format with version: prompts:/catalog.schema.name/version.
        # Normalize numeric strings (e.g. "2") to a canonical int form; fall
        # back to str() for anything non-numeric.
        try:
            version_str = (
                str(int(prompt_version))
                if isinstance(prompt_version, (str, int))
                else str(prompt_version)
            )
        except (ValueError, TypeError):
            version_str = str(prompt_version)
        prompt_uri = f"prompts:/{prompt_registry_name}/{version_str}"
    else:
        # No version or alias specified — load latest version by name only.
        prompt_uri = prompt_registry_name

    logger.info(f"Loading prompt from registry: {prompt_uri}")
    try:
        # Load without a version parameter — version/alias is in the URI if
        # needed. This prevents MLflow from trying a non-existent "latest"
        # alias.
        prompt_version_obj = load_prompt_from_mlflow(
            name_or_uri=prompt_uri,
            allow_missing=False,
        )
        if prompt_version_obj is None:
            raise ValueError(f"Prompt '{prompt_registry_name}' not found in registry")
        template = prompt_version_obj.template
        # Convert template syntax from {{variable}} to <<variable>>
        converted_template = _convert_template_syntax(template)
        # Normalize newlines/whitespace so loaded prompts match direct-prompt style
        converted_template = _normalize_template_whitespace(converted_template)
        # MLflow Prompt Registry doesn't store system prompts separately,
        # so system_prompt will be None
        system_prompt = None
        logger.info(
            f"Successfully loaded prompt '{prompt_registry_name}' "
            f"(version {prompt_version_obj.version})"
        )
        return converted_template, system_prompt
    except Exception as e:
        error_msg = str(e)
        # Check if error is about "latest" alias not found
        if "latest" in error_msg.lower() and "not found" in error_msg.lower():
            raise ValueError(
                f"Failed to load prompt '{prompt_registry_name}' from registry: {error_msg}\n"
                f"Hint: When no version or alias is specified, MLflow tries to use a 'latest' alias.\n"
                f"Either:\n"
                f"  1. Specify a version: prompt_version=1\n"
                f"  2. Specify an alias: prompt_alias='production'\n"
                f"  3. Use URI format with alias: prompt_registry_name='prompts:/{prompt_registry_name}@production'\n"
                f"  4. Create a 'latest' alias for your prompt in the registry"
            ) from e
        else:
            raise ValueError(
                f"Failed to load prompt '{prompt_registry_name}' from registry: {error_msg}"
            ) from e
@dataclass
class DatasetConfig(YAMLMixin):
    """Reference to an evaluation dataset.

    Encapsulates all information needed to locate and use an evaluation
    dataset for running evaluations.

    Args:
        table: Fully qualified table name (catalog.schema.table).
        primary_key: Name of the primary key column.
        input_column: Name of the input text column. Defaults to 'input'.
        expected_output_column: Name of the expected output column for
            comparison metrics. Set to None if not available.
        additional_context_columns: Additional columns to include in prompt
            template context.
        description: Optional description of the dataset.
        tags: Optional tags for the dataset.
        dataset_yaml: Optional path to YAML file to load config from.
            If provided, all other parameters are ignored.

    Example:
        >>> dataset = DatasetConfig(
        ...     table="catalog.schema.vendor_eval_20260126",
        ...     primary_key="row_id",
        ...     input_column="vendor_name",
        ...     expected_output_column="canonical_name",
        ... )
        # Or load from YAML:
        >>> dataset = DatasetConfig(dataset_yaml="path/to/dataset.yaml")
    """

    # Required (have defaults to allow dataset_yaml-only construction)
    table: str = ""
    primary_key: str = ""
    # Column configuration
    input_column: str = "input"
    expected_output_column: str | None = "expected_output"
    additional_context_columns: list[str] | None = None
    # Optional metadata
    description: str | None = None
    tags: dict[str, str] | None = None
    # YAML loading
    dataset_yaml: str | Path | None = field(default=None, repr=False)
    # Cached hash value (compare=False so two configs with identical fields
    # compare equal whether or not the hash has been computed)
    _cached_hash: str | None = field(default=None, repr=False, compare=False)

    def __post_init__(self) -> None:
        """Load from YAML if dataset_yaml is provided, then validate."""
        if self.dataset_yaml is not None:
            # Copy every field from the YAML-loaded instance onto self, then
            # clear dataset_yaml so the loaded config is self-contained.
            loaded = self.from_yaml(self.dataset_yaml)
            object.__setattr__(self, "table", loaded.table)
            object.__setattr__(self, "primary_key", loaded.primary_key)
            object.__setattr__(self, "input_column", loaded.input_column)
            object.__setattr__(
                self, "expected_output_column", loaded.expected_output_column
            )
            object.__setattr__(
                self, "additional_context_columns", loaded.additional_context_columns
            )
            object.__setattr__(self, "description", loaded.description)
            object.__setattr__(self, "tags", loaded.tags)
            object.__setattr__(self, "dataset_yaml", None)
        # Validate required fields
        if not self.table:
            raise ValueError("DatasetConfig 'table' is required")
        if not self.primary_key:
            raise ValueError("DatasetConfig 'primary_key' is required")

    def compute_dataset_hash(self, spark: "ydbc.SparkSession | None" = None) -> str:
        """Compute MD5 hash of dataset for reproducibility tracking.

        The hash is computed from:
        - Table name
        - Row count
        - Sample of primary key values

        This provides a reproducible fingerprint of the dataset that changes
        if the dataset content changes. (MD5 is used purely as a fingerprint,
        not for security.)

        Args:
            spark: Optional SparkSession. If not provided, will get from ydbc.

        Returns:
            MD5 hash string (32 characters).
        """
        if self._cached_hash is not None:
            return self._cached_hash
        import yipit_databricks_client as ydbc

        if spark is None:
            spark = ydbc.get_spark_session()
        # Get table info for hashing
        df = spark.table(self.table)
        row_count = df.count()
        # Get sample of primary keys for content fingerprinting.
        # Use first 100 PKs sorted to ensure consistency across runs.
        pk_sample = (
            df.select(self.primary_key).orderBy(self.primary_key).limit(100).collect()
        )
        pk_values = [str(row[self.primary_key]) for row in pk_sample]
        # Build hash input
        hash_input = f"{self.table}:{row_count}:{','.join(pk_values)}"
        hash_value = hashlib.md5(hash_input.encode()).hexdigest()
        # Cache the result
        object.__setattr__(self, "_cached_hash", hash_value)
        return hash_value

    @property
    def dataset_hash(self) -> str:
        """Get MD5 hash of dataset for reproducibility tracking.

        Note: This property computes the hash on first access if not cached.
        For better control over SparkSession, use compute_dataset_hash() directly.

        Returns:
            MD5 hash string (32 characters).
        """
        return self.compute_dataset_hash()

    def to_dict(self) -> dict[str, Any]:
        """Convert dataset reference to dictionary for serialization.

        Returns:
            Dict representation of the dataset reference.
        """
        return {
            "table": self.table,
            "primary_key": self.primary_key,
            "input_column": self.input_column,
            "expected_output_column": self.expected_output_column,
            "additional_context_columns": self.additional_context_columns,
            "description": self.description,
            "tags": self.tags,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "DatasetConfig":
        """Create DatasetConfig from a dictionary.

        Args:
            data: Dictionary with config values.

        Returns:
            DatasetConfig instance.

        Raises:
            KeyError: If required fields (table, primary_key) are missing.
        """
        return cls(
            table=data["table"],
            primary_key=data["primary_key"],
            input_column=data.get("input_column", "input"),
            expected_output_column=data.get(
                "expected_output_column", "expected_output"
            ),
            additional_context_columns=data.get("additional_context_columns"),
            description=data.get("description"),
            tags=data.get("tags"),
        )
@dataclass
class ModelConfig(YAMLMixin):
    """Configuration for a single model/prompt to evaluate.

    Represents one model/prompt configuration to evaluate against a dataset.
    Use with `Experiment` to compare multiple configurations.
    Validation runs automatically on construction via __post_init__.

    Args:
        name: Unique name for this config (used in MLflow run name and results).
        model_name: Unity Catalog model name (e.g., 'catalog.schema.model_name').
            If provided without endpoint, the endpoint name will be automatically
            inferred from the model name. Either model_name/endpoint or litellm_model
            must be specified.
        endpoint: Databricks Model Serving endpoint name. Optional if model_name
            is provided (will be inferred). Either model_name/endpoint or litellm_model
            must be specified (not both).
        litellm_model: LiteLLM model identifier (e.g., 'gpt-4', 'claude-3').
            Either model_name/endpoint or litellm_model must be specified (not both).
        version: Optional integer version for routing to a specific
            model version in multi-version serving endpoints. Only used when
            endpoint is specified. Example: 1, 2, 3. Defaults to None (latest version).
        prompt_template: Jinja2 template for constructing the user message.
            Available variables: <<input>>, <<candidates>>, and context columns.
            Use <<variable>> syntax. If prompt_registry_name is provided,
            this will be overridden by the loaded template.
        system_prompt: Optional system prompt for chat completion.
            If prompt_registry_name is provided, this will be overridden.
        prompt_registry_name: Fully qualified prompt name from MLflow Prompt Registry
            (e.g., 'catalog.schema.prompt_name') or URI (e.g., 'prompts:/catalog.schema.prompt_name@alias').
            If provided, loads prompt_template from registry (converts {{variable}} to <<variable>>).
        prompt_version: Specific version to load from registry. Ignored if prompt_registry_name
            includes version/alias or if prompt_alias is provided.
        prompt_alias: Alias to load from registry (e.g., 'production'). Takes precedence over prompt_version.
        max_output_tokens: Maximum output tokens for model response.
        temperature: Temperature for model sampling (None uses model default).
        tags: Additional tags to apply to the MLflow run for this config.
        litellm_model_kwargs: Optional extra keyword arguments forwarded for
            LiteLLM models.
        model_yaml: Optional path to YAML file to load config from.
            If provided, all other parameters are ignored.

    Example:
        >>> from ml_toolkit.functions.eval_utils import ModelConfig
        >>> # Using model_name (endpoint automatically inferred)
        >>> config = ModelConfig(
        ...     name="llama-8b-prompt-v1",
        ...     model_name="catalog.schema.meta-llama-3-1-8b-instruct",
        ...     prompt_template="Tag the vendor: <<input>>",
        ... )
        >>> # Using endpoint directly
        >>> config = ModelConfig(
        ...     name="llama-8b-prompt-v1",
        ...     endpoint="databricks-meta-llama-3-1-8b-instruct",
        ...     prompt_template="Tag the vendor: <<input>>",
        ... )
        >>> # Load prompt from MLflow Prompt Registry
        >>> config = ModelConfig(
        ...     name="llama-8b-registry",
        ...     model_name="catalog.schema.meta-llama-3-1-8b-instruct",
        ...     prompt_registry_name="mycatalog.myschema.vendor_tagging_prompt",
        ...     prompt_alias="production",  # or prompt_version=2
        ... )
        # Or load from YAML:
        >>> config = ModelConfig(model_yaml="path/to/model.yaml")

    Raises:
        ValueError: If validation fails (missing endpoint/model or both specified).
    """

    # Required (has default to allow model_yaml-only construction)
    name: str = ""
    # Model/Endpoint (one required)
    # Unity Catalog model name - used to infer endpoint if not provided
    model_name: str | None = None
    # Optional - inferred from model_name if not provided
    endpoint: str | None = None
    litellm_model: str | None = None
    version: int | None = None
    # Prompt configuration
    prompt_template: str | None = None
    system_prompt: str | None = None
    # Prompt registry integration
    prompt_registry_name: str | None = None
    prompt_version: str | int | None = None
    prompt_alias: str | None = None
    # Inference parameters
    max_output_tokens: int = DEFAULT_MAX_OUTPUT_TOKENS
    temperature: float | None = None
    # Optional metadata
    tags: dict[str, str] | None = None
    litellm_model_kwargs: dict[str, Any] | None = None
    # YAML loading
    model_yaml: str | Path | None = field(default=None, repr=False)

    def __post_init__(self) -> None:
        """Load from YAML if model_yaml is provided, then load prompt from registry if needed, then validate."""
        if self.model_yaml is not None:
            loaded = self.from_yaml(self.model_yaml)
            object.__setattr__(self, "name", loaded.name)
            object.__setattr__(self, "model_name", loaded.model_name)
            object.__setattr__(self, "endpoint", loaded.endpoint)
            object.__setattr__(self, "litellm_model", loaded.litellm_model)
            object.__setattr__(self, "version", loaded.version)
            object.__setattr__(self, "prompt_template", loaded.prompt_template)
            object.__setattr__(self, "system_prompt", loaded.system_prompt)
            object.__setattr__(
                self, "prompt_registry_name", loaded.prompt_registry_name
            )
            object.__setattr__(self, "prompt_version", loaded.prompt_version)
            object.__setattr__(self, "prompt_alias", loaded.prompt_alias)
            object.__setattr__(self, "max_output_tokens", loaded.max_output_tokens)
            object.__setattr__(self, "temperature", loaded.temperature)
            object.__setattr__(self, "tags", loaded.tags)
            # BUGFIX: previously this field was not copied from the YAML-loaded
            # instance, silently dropping any litellm_model_kwargs in the YAML.
            object.__setattr__(
                self, "litellm_model_kwargs", loaded.litellm_model_kwargs
            )
            object.__setattr__(self, "model_yaml", None)
        # Infer endpoint from model_name if endpoint is not provided
        if self.model_name is not None and self.endpoint is None:
            from ml_toolkit.functions.llm.serving_endpoints.helpers.validation import (
                infer_endpoint_name_from_model,
            )

            inferred_endpoint = infer_endpoint_name_from_model(self.model_name)
            object.__setattr__(self, "endpoint", inferred_endpoint)
            logger = get_logger()
            logger.info(
                f"Inferred endpoint '{inferred_endpoint}' from model_name '{self.model_name}' for config '{self.name}'"
            )
        # Resolve version to latest if not specified and endpoint is provided
        if self.endpoint is not None and self.version is None:
            from ml_toolkit.functions.llm.serving_endpoints import (
                get_latest_version_from_endpoint,
            )

            try:
                latest_version = get_latest_version_from_endpoint(self.endpoint)
                object.__setattr__(self, "version", latest_version)
            except Exception:
                # If we can't resolve the version (e.g., endpoint doesn't exist yet),
                # leave it as None and let the evaluation function handle it
                pass
        # Store original prompt_template value before loading from registry.
        # This is needed for validation to check if user provided both initially.
        original_prompt_template = self.prompt_template
        # Load prompt from registry if prompt_registry_name is provided
        if self.prompt_registry_name:
            logger = get_logger()
            logger.info(
                f"Loading prompt from registry for config '{self.name}': {self.prompt_registry_name}"
            )
            # Ensure prompt_version is properly formatted (convert to int if string)
            prompt_version_to_use = self.prompt_version
            if prompt_version_to_use is not None:
                # Convert string version to int if needed (MLflow accepts both)
                try:
                    if isinstance(prompt_version_to_use, str):
                        prompt_version_to_use = int(prompt_version_to_use)
                except (ValueError, TypeError):
                    # If conversion fails, keep original value
                    pass
            template, system_prompt = _load_prompt_from_registry(
                prompt_registry_name=self.prompt_registry_name,
                prompt_version=prompt_version_to_use,
                prompt_alias=self.prompt_alias,
            )
            object.__setattr__(self, "prompt_template", template)
            if system_prompt is not None:
                object.__setattr__(self, "system_prompt", system_prompt)
        # Validate with original prompt_template to check if user provided both initially
        self._validate(original_prompt_template=original_prompt_template)

    def _validate(self, original_prompt_template: str | None = None) -> None:
        """Internal validation logic.

        Args:
            original_prompt_template: The prompt_template value before loading from registry.
                Used to check if user provided both direct prompt and registry initially.
        """
        if not self.name:
            raise ValueError("Config 'name' is required")
        # At this point, endpoint should have been inferred from model_name if provided.
        # Check that we have either endpoint (with or without model_name) or litellm_model.
        if not self.endpoint and not self.litellm_model:
            raise ValueError(
                f"Config '{self.name}': either 'model_name', 'endpoint', or 'litellm_model' is required"
            )
        if self.endpoint and self.litellm_model:
            raise ValueError(
                f"Config '{self.name}': only one of 'endpoint' or 'litellm_model' "
                "should be specified, not both"
            )
        # Validate prompt configuration: either direct prompts OR registry, not both.
        # Use original_prompt_template to check if user provided both initially
        # (after loading from registry, prompt_template will be set, so we need original value).
        # Only check original_prompt_template - don't fall back to self.prompt_template
        # because that will be set after loading from registry.
        has_direct_prompt = original_prompt_template is not None
        has_registry_prompt = self.prompt_registry_name is not None
        if has_direct_prompt and has_registry_prompt:
            raise ValueError(
                f"Config '{self.name}': cannot specify both 'prompt_template' and "
                "'prompt_registry_name'. Use either direct prompt or registry, not both."
            )
        # Validate that prompt_registry_name is provided if prompt_alias or prompt_version is used
        if (
            self.prompt_alias is not None or self.prompt_version is not None
        ) and not self.prompt_registry_name:
            raise ValueError(
                f"Config '{self.name}': 'prompt_registry_name' is required when "
                "'prompt_alias' or 'prompt_version' is specified."
            )
        if self.prompt_registry_name and self.prompt_version and self.prompt_alias:
            raise ValueError(
                f"Config '{self.name}': cannot specify both 'prompt_version' and "
                "'prompt_alias'. Use one or the other."
            )

    def to_dict(self) -> dict[str, Any]:
        """Convert config to dictionary for serialization.

        Returns:
            Dict representation of the config.
        """
        return {
            "name": self.name,
            "model_name": self.model_name,
            "endpoint": self.endpoint,
            "litellm_model": self.litellm_model,
            "version": self.version,
            "prompt_template": self.prompt_template,
            "system_prompt": self.system_prompt,
            "prompt_registry_name": self.prompt_registry_name,
            "prompt_version": self.prompt_version,
            "prompt_alias": self.prompt_alias,
            "max_output_tokens": self.max_output_tokens,
            "temperature": self.temperature,
            "tags": self.tags,
            # Included so to_dict/from_dict round-trips preserve the field
            # (previously dropped).
            "litellm_model_kwargs": self.litellm_model_kwargs,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "ModelConfig":
        """Create ModelConfig from a dictionary.

        Args:
            data: Dictionary with config values.

        Returns:
            ModelConfig instance.

        Raises:
            KeyError: If required field (name) is missing.
            ValueError: If validation fails.
        """
        return cls(
            name=data["name"],
            model_name=data.get("model_name"),
            endpoint=data.get("endpoint"),
            litellm_model=data.get("litellm_model"),
            version=data.get("version"),
            prompt_template=data.get("prompt_template"),
            system_prompt=data.get("system_prompt"),
            prompt_registry_name=data.get("prompt_registry_name"),
            prompt_version=data.get("prompt_version"),
            prompt_alias=data.get("prompt_alias"),
            max_output_tokens=data.get("max_output_tokens", DEFAULT_MAX_OUTPUT_TOKENS),
            temperature=data.get("temperature"),
            tags=data.get("tags"),
            litellm_model_kwargs=data.get("litellm_model_kwargs"),
        )
@dataclass
class Experiment(YAMLMixin):
    """Bundles a dataset, model configurations, and metrics together.

    An Experiment defines a complete evaluation scenario: what dataset to use,
    which models to compare, and which metrics to compute. Metrics are defined
    once and applied to ALL models, eliminating duplication.

    Args:
        name: Unique name for this experiment.
        dataset: DatasetConfig reference to the evaluation dataset.
        models: List of ModelConfig objects to evaluate.
        metrics: List of metrics to compute for all models. Can include Metric
            enum values, string names, or custom scorer callables.
        llm_judge_config: Configuration for LLM-as-judge metric. Required
            if 'llm_judge' is in metrics list.
        description: Optional description of the experiment.
        tags: Optional tags for the experiment.
        experiment_yaml: Optional path to YAML file to load config from.
            If provided, all other parameters are ignored.

    Example:
        >>> from ml_toolkit.functions.eval_utils import (
        ...     DatasetConfig, ModelConfig, Experiment, Metric, LLMJudgeConfig
        ... )
        >>> dataset = DatasetConfig(
        ...     table="catalog.schema.vendor_eval",
        ...     primary_key="row_id",
        ...     input_column="vendor_name",
        ...     expected_output_column="canonical_name",
        ... )
        >>> experiment = Experiment(
        ...     name="vendor-tagger-comparison",
        ...     dataset=dataset,
        ...     models=[
        ...         ModelConfig(name="llama-8b", endpoint="llama-endpoint"),
        ...         ModelConfig(name="gpt-4o", litellm_model="gpt-4o"),
        ...     ],
        ...     metrics=[Metric.LATENCY, Metric.EXACT_MATCH, Metric.LLM_JUDGE],
        ...     llm_judge_config=LLMJudgeConfig(criteria="correctness"),
        ... )
        # Or load from YAML:
        >>> experiment = Experiment(experiment_yaml="path/to/config.yaml")

    Raises:
        ValueError: If validation fails (no models, duplicate names, or
            llm_judge metric without config).
    """

    # Required (have defaults to allow experiment_yaml-only construction)
    name: str = ""
    # Annotated as Optional because the default is None; _validate() rejects
    # a missing dataset at construction time.
    dataset: DatasetConfig | None = None
    models: list[ModelConfig] = field(default_factory=list)
    # Metrics applied to ALL models
    metrics: list[str | Metric | Callable] = field(
        default_factory=lambda: [Metric.LATENCY, Metric.EXACT_MATCH]
    )
    llm_judge_config: LLMJudgeConfig | None = None
    # Metadata
    description: str | None = None
    tags: dict[str, str] | None = None
    # YAML loading
    experiment_yaml: str | Path | None = field(default=None, repr=False)

    def __post_init__(self) -> None:
        """Load from YAML if experiment_yaml is provided, then validate."""
        if self.experiment_yaml is not None:
            loaded = self.from_yaml(self.experiment_yaml)
            object.__setattr__(self, "name", loaded.name)
            object.__setattr__(self, "dataset", loaded.dataset)
            object.__setattr__(self, "models", loaded.models)
            object.__setattr__(self, "metrics", loaded.metrics)
            object.__setattr__(self, "llm_judge_config", loaded.llm_judge_config)
            object.__setattr__(self, "description", loaded.description)
            object.__setattr__(self, "tags", loaded.tags)
            object.__setattr__(self, "experiment_yaml", None)
        self._validate()

    def _validate(self) -> None:
        """Internal validation logic."""
        if not self.name:
            raise ValueError("Experiment 'name' is required")
        if self.dataset is None:
            raise ValueError(f"Experiment '{self.name}': 'dataset' is required")
        if not self.models:
            raise ValueError(
                f"Experiment '{self.name}': at least one ModelConfig must be provided"
            )
        # Check for duplicate model names
        names = [m.name for m in self.models]
        if len(names) != len(set(names)):
            duplicates = [n for n in names if names.count(n) > 1]
            raise ValueError(
                f"Experiment '{self.name}': model names must be unique. "
                f"Duplicates found: {set(duplicates)}"
            )
        # Check for llm_judge metric (handle both string and Metric enum)
        has_llm_judge = any(
            str(m) == Metric.LLM_JUDGE.value for m in self.metrics if not callable(m)
        )
        if has_llm_judge and self.llm_judge_config is None:
            raise ValueError(
                f"Experiment '{self.name}': 'llm_judge_config' is required when "
                "'llm_judge' metric is specified"
            )

    def to_dict(self) -> dict[str, Any]:
        """Convert experiment to dictionary for serialization.

        Returns:
            Dict representation of the experiment (excludes callable metrics).
        """
        # Convert metrics to strings (handles both str and Metric enum).
        # Filter out callable metrics which can't be serialized.
        serializable_metrics = [str(m) for m in self.metrics if not callable(m)]
        result = {
            "name": self.name,
            "dataset": self.dataset.to_dict(),
            "models": [m.to_dict() for m in self.models],
            "metrics": serializable_metrics,
            "description": self.description,
            "tags": self.tags,
        }
        if self.llm_judge_config:
            result["llm_judge_config"] = self.llm_judge_config.to_dict()
        return result

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "Experiment":
        """Create Experiment from a dictionary.

        Parses nested dataset, models, and llm_judge_config from dicts.

        Args:
            data: Dictionary with config values.

        Returns:
            Experiment instance.

        Raises:
            KeyError: If required fields are missing.
            ValueError: If validation fails.
        """
        # Parse nested dataset config
        dataset = DatasetConfig.from_dict(data["dataset"])
        # Parse nested model configs
        models = [ModelConfig.from_dict(m) for m in data["models"]]
        # Parse optional llm_judge_config
        llm_judge_config = None
        if data.get("llm_judge_config"):
            llm_judge_config = LLMJudgeConfig.from_dict(data["llm_judge_config"])
        # Metrics work as strings since Metric inherits from str
        metrics = data.get("metrics", [Metric.LATENCY, Metric.EXACT_MATCH])
        return cls(
            name=data["name"],
            dataset=dataset,
            models=models,
            metrics=metrics,
            llm_judge_config=llm_judge_config,
            description=data.get("description"),
            tags=data.get("tags"),
        )