Source code for ml_toolkit.functions.eval_utils.constants

"""Constants for eval_utils module."""

from enum import Enum
class Metric(str, Enum):
    """Built-in evaluation metrics.

    Inherits from ``str`` for backward compatibility with plain string
    metric names: every member compares equal to its string value.

    Example:
        >>> from ml_toolkit.functions.eval_utils import Metric
        >>> metrics = [Metric.LATENCY, Metric.EXACT_MATCH]
        >>> # Also works with strings for backward compatibility
        >>> "latency" in [m.value for m in Metric]
        True
    """

    LATENCY = "latency"
    TOKEN_COUNT = "token_count"
    EXACT_MATCH = "exact_match"
    FUZZY_MATCH = "fuzzy_match"
    LLM_JUDGE = "llm_judge"
    HIT_RATE = "hit_rate"
    MRR = "mrr"
    RECALL_AT_3 = "recall_at_3"
    RECALL_AT_5 = "recall_at_5"
    RECALL_AT_10 = "recall_at_10"
    PRECISION = "precision"

    def __str__(self) -> str:
        """Return the member's value rather than ``Metric.NAME``."""
        return self.value
# Catalog and schema_name conventions
EVAL_CATALOG = "yd_tagging_platform_evals"
SCHEMA_SUFFIX = "_dataset_bronze"
EVAL_RUNS_SCHEMA = "_system"
EVAL_RUNS_TABLE = "eval_runs"
# Fully qualified name: <catalog>.<schema>.<table>
EVAL_RUNS_FULL_TABLE = f"{EVAL_CATALOG}.{EVAL_RUNS_SCHEMA}.{EVAL_RUNS_TABLE}"

# Experiment registry table (same catalog and schema as the runs table)
EVAL_EXPERIMENTS_TABLE = "eval_experiments"
EVAL_EXPERIMENTS_FULL_TABLE = (
    f"{EVAL_CATALOG}.{EVAL_RUNS_SCHEMA}.{EVAL_EXPERIMENTS_TABLE}"
)

# Lock tag name
EVAL_LOCK_TAG = "eval_locked"

# Default model configurations
DEFAULT_MODEL = "databricks-meta-llama-3-1-8b-instruct"
DEFAULT_JUDGE_MODEL = "databricks-claude-3-7-sonnet"
DEFAULT_MAX_OUTPUT_TOKENS = 512
DEFAULT_JUDGE_MAX_TOKENS = 256

# Execution limits
MAX_ROWS_PER_EVAL = 100_000
DEFAULT_BATCH_SIZE = 100
DEFAULT_MAX_CONCURRENT = 10
DEFAULT_TIMEOUT_SECONDS = 120.0

# Error handling thresholds
MAX_ERROR_RATE_THRESHOLD = 0.5  # Fail if more than 50% of rows error

# MLflow experiment prefix
MLFLOW_EXPERIMENT_PREFIX = "/Workspace/ml_experiments"

# Built-in metric names (derived from Metric enum for consistency)
BUILTIN_METRICS = frozenset(m.value for m in Metric)


# Run status values
class RunStatus:
    """String constants naming the possible states of a run."""

    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"


# Experiment status values
class ExperimentStatus:
    """String constants naming the possible states of an experiment."""

    RUNNING = "running"
    COMPLETED = "completed"
    PARTIAL = "partial"  # Some models succeeded, some failed
    FAILED = "failed"