# Source code for ml_toolkit.functions.eval_utils.constants
# [docs]
"""Constants for eval_utils module."""
from enum import Enum
# [docs]
class Metric(str, Enum):
    """Built-in evaluation metrics.

    The class mixes in ``str`` for backward compatibility with code that
    passes metrics around as plain strings: every member compares equal to
    (and hashes like) its string value, e.g. ``Metric.LATENCY == "latency"``.

    ``__str__`` is overridden to return the raw value, so ``str(member)``
    yields ``"latency"`` rather than ``"Metric.LATENCY"``.

    Example:
        >>> from ml_toolkit.functions.eval_utils import Metric
        >>> metrics = [Metric.LATENCY, Metric.EXACT_MATCH]
        >>> # Also works with strings for backward compatibility
        >>> "latency" in [m.value for m in Metric]
        True
    """

    # NOTE: declaration order is the iteration order of the enum; keep it
    # stable unless callers are known not to depend on it.
    LATENCY = "latency"
    TOKEN_COUNT = "token_count"
    EXACT_MATCH = "exact_match"
    FUZZY_MATCH = "fuzzy_match"
    LLM_JUDGE = "llm_judge"
    HIT_RATE = "hit_rate"
    MRR = "mrr"
    RECALL_AT_3 = "recall_at_3"
    RECALL_AT_5 = "recall_at_5"
    RECALL_AT_10 = "recall_at_10"
    PRECISION = "precision"

    def __str__(self) -> str:
        """Return the member's string value (not ``ClassName.MEMBER``)."""
        return self.value
# ---------------------------------------------------------------------------
# Catalog, schema and table naming
# ---------------------------------------------------------------------------
EVAL_CATALOG = "yd_tagging_platform_evals"
SCHEMA_SUFFIX = "_dataset_bronze"
EVAL_RUNS_SCHEMA = "_system"
EVAL_RUNS_TABLE = "eval_runs"
# Fully qualified name in the form <catalog>.<schema>.<table>.
EVAL_RUNS_FULL_TABLE = ".".join(
    (EVAL_CATALOG, EVAL_RUNS_SCHEMA, EVAL_RUNS_TABLE)
)

# Experiment registry table; shares the catalog and schema of the runs table.
EVAL_EXPERIMENTS_TABLE = "eval_experiments"
EVAL_EXPERIMENTS_FULL_TABLE = ".".join(
    (EVAL_CATALOG, EVAL_RUNS_SCHEMA, EVAL_EXPERIMENTS_TABLE)
)

# Name of the tag used for locking.
EVAL_LOCK_TAG = "eval_locked"

# ---------------------------------------------------------------------------
# Model defaults
# ---------------------------------------------------------------------------
DEFAULT_MODEL = "databricks-meta-llama-3-1-8b-instruct"
DEFAULT_JUDGE_MODEL = "databricks-claude-3-7-sonnet"
DEFAULT_MAX_OUTPUT_TOKENS = 512
DEFAULT_JUDGE_MAX_TOKENS = 256

# ---------------------------------------------------------------------------
# Execution limits
# ---------------------------------------------------------------------------
MAX_ROWS_PER_EVAL = 100_000
DEFAULT_BATCH_SIZE = 100
DEFAULT_MAX_CONCURRENT = 10
DEFAULT_TIMEOUT_SECONDS = 120.0

# Error-handling threshold: fail if more than 50% of rows error.
MAX_ERROR_RATE_THRESHOLD = 0.5

# Prefix for MLflow experiment paths.
MLFLOW_EXPERIMENT_PREFIX = "/Workspace/ml_experiments"
# Immutable set of the built-in metric names. It is computed from the Metric
# enum rather than listed by hand, so the two cannot drift apart.
BUILTIN_METRICS = frozenset(metric.value for metric in Metric)
# Run status values
class RunStatus:
    """String constants for run status values.

    This is a plain namespace class rather than an ``Enum``: each attribute
    is an ordinary ``str``.
    """

    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
# Experiment status values
class ExperimentStatus:
    """String constants for experiment status values.

    This is a plain namespace class rather than an ``Enum``: each attribute
    is an ordinary ``str``.
    """

    RUNNING = "running"
    COMPLETED = "completed"
    PARTIAL = "partial"  # Some models succeeded, some failed
    FAILED = "failed"