Split out cron and non-cron unit evals (#42472)

Release Notes:

- N/A

---------

Co-authored-by: Bennet Bo Fenner <bennetbo@gmx.de>
This commit is contained in:
Richard Feldman
2025-11-11 13:45:48 -05:00
committed by GitHub
parent 5f4d0dbaab
commit 908ef03502
6 changed files with 178 additions and 25 deletions

View File

@@ -33,6 +33,10 @@ pub fn run_workflows(_: GenerateWorkflowArgs) -> Result<()> {
("cherry_pick.yml", cherry_pick::cherry_pick()),
("compare_perf.yml", compare_perf::compare_perf()),
("run_unit_evals.yml", run_agent_evals::run_unit_evals()),
(
"run_cron_unit_evals.yml",
run_agent_evals::run_cron_unit_evals(),
),
("run_agent_evals.yml", run_agent_evals::run_agent_evals()),
("after_release.yml", after_release::after_release()),
];

View File

@@ -28,6 +28,36 @@ pub(crate) fn run_agent_evals() -> Workflow {
.add_job(agent_evals.name, agent_evals.job)
}
/// Builds the manually dispatched `run_unit_evals` workflow.
///
/// The workflow is triggered through `workflow_dispatch` and declares two string inputs,
/// `model_name` and `commit_sha`. `commit_sha` is handed to `unit_evals` when the job is built,
/// and `model_name` is added to the workflow's environment as `MODEL_NAME`.
pub(crate) fn run_unit_evals() -> Workflow {
// NOTE(review): the second argument is presumably the input's default value — confirm
// against `Input::string`.
let model_name = Input::string("model_name", None);
let commit_sha = Input::string("commit_sha", None);
// The job name is not passed explicitly; it is read back below as `unit_evals.name`.
// NOTE(review): `named::job` presumably derives it from the calling function — confirm
// before moving this call into a helper.
let unit_evals = named::job(unit_evals(Some(&commit_sha)));
named::workflow()
.name("run_unit_evals")
// Manual trigger only: both inputs are registered on the dispatch event.
.on(Event::default().workflow_dispatch(
WorkflowDispatch::default()
.add_input(model_name.name, model_name.input())
.add_input(commit_sha.name, commit_sha.input()),
))
.concurrency(vars::one_workflow_per_non_main_branch())
.add_env(("CARGO_TERM_COLOR", "always"))
.add_env(("CARGO_INCREMENTAL", 0))
.add_env(("RUST_BACKTRACE", 1))
.add_env(("ZED_CLIENT_CHECKSUM_SEED", vars::ZED_CLIENT_CHECKSUM_SEED))
.add_env(("ZED_EVAL_TELEMETRY", 1))
// NOTE(review): relies on the string form of `Input` being the expression that
// references the dispatch input — confirm against `Input`'s `Display`/`to_string`.
.add_env(("MODEL_NAME", model_name.to_string()))
.add_job(unit_evals.name, unit_evals.job)
}
/// Returns `step` with the model-provider credentials added to its environment.
///
/// The variables are added in this order: `ANTHROPIC_API_KEY`, `OPENAI_API_KEY`,
/// `GOOGLE_AI_API_KEY`, `GOOGLE_CLOUD_PROJECT`, each taken from the matching `vars` constant.
fn add_api_keys(step: Step<Run>) -> Step<Run> {
    let with_anthropic = step.add_env(("ANTHROPIC_API_KEY", vars::ANTHROPIC_API_KEY));
    let with_openai = with_anthropic.add_env(("OPENAI_API_KEY", vars::OPENAI_API_KEY));
    let with_google_ai = with_openai.add_env(("GOOGLE_AI_API_KEY", vars::GOOGLE_AI_API_KEY));
    with_google_ai.add_env(("GOOGLE_CLOUD_PROJECT", vars::GOOGLE_CLOUD_PROJECT))
}
fn agent_evals() -> NamedJob {
fn run_eval() -> Step<Run> {
named::bash(
@@ -44,16 +74,16 @@ fn agent_evals() -> NamedJob {
.map(steps::install_linux_dependencies)
.add_step(setup_cargo_config(Platform::Linux))
.add_step(steps::script("cargo build --package=eval"))
.add_step(run_eval())
.add_step(add_api_keys(run_eval()))
.add_step(steps::cleanup_cargo_config(Platform::Linux)),
)
}
pub(crate) fn run_unit_evals() -> Workflow {
let unit_evals = unit_evals();
pub(crate) fn run_cron_unit_evals() -> Workflow {
let unit_evals = cron_unit_evals();
named::workflow()
.name("run_unit_evals")
.name("run_cron_unit_evals")
.on(Event::default()
.schedule([
// GitHub might drop jobs at busy times, so we choose a random time in the middle of the night.
@@ -68,7 +98,7 @@ pub(crate) fn run_unit_evals() -> Workflow {
.add_job(unit_evals.name, unit_evals.job)
}
fn unit_evals() -> NamedJob {
fn cron_unit_evals() -> NamedJob {
fn send_failure_to_slack() -> Step<Use> {
named::uses(
"slackapi",
@@ -84,20 +114,39 @@ fn unit_evals() -> NamedJob {
"#}))
}
named::job(
Job::default()
.runs_on(runners::LINUX_DEFAULT)
.add_step(steps::checkout_repo())
.add_step(steps::setup_cargo_config(Platform::Linux))
.add_step(steps::cache_rust_dependencies_namespace())
.map(steps::install_linux_dependencies)
.add_step(steps::cargo_install_nextest(Platform::Linux))
.add_step(steps::clear_target_dir_if_large(Platform::Linux))
.add_step(
steps::script("./script/run-unit-evals")
.add_env(("ANTHROPIC_API_KEY", vars::ANTHROPIC_API_KEY)),
)
.add_step(send_failure_to_slack())
.add_step(steps::cleanup_cargo_config(Platform::Linux)),
)
named::job(unit_evals(None).add_step(send_failure_to_slack()))
}
/// Builds the job that runs `./script/run-unit-evals` on the default Linux runner.
///
/// The job checks out the repository, sets up the cargo config and dependency cache, installs
/// the Linux dependencies and `nextest`, clears an oversized target directory, runs the eval
/// script with the provider API keys from `add_api_keys`, and finally cleans up the cargo
/// config.
///
/// When `commit` is `Some`, that input is exposed to the script step as the
/// `UNIT_EVAL_COMMIT` environment variable; when it is `None` the variable is not set.
///
/// This job deliberately contains no failure notification. `cron_unit_evals` appends its own
/// `send_failure_to_slack` step to the job returned from here, so adding one in this function
/// as well would post the same Slack message twice for every failed scheduled run, and would
/// also post to the channel for manually dispatched runs.
fn unit_evals(commit: Option<&Input>) -> Job {
    let script_step = add_api_keys(steps::script("./script/run-unit-evals"));

    Job::default()
        .runs_on(runners::LINUX_DEFAULT)
        .add_step(steps::checkout_repo())
        .add_step(steps::setup_cargo_config(Platform::Linux))
        .add_step(steps::cache_rust_dependencies_namespace())
        .map(steps::install_linux_dependencies)
        .add_step(steps::cargo_install_nextest(Platform::Linux))
        .add_step(steps::clear_target_dir_if_large(Platform::Linux))
        // Only the dispatch workflow supplies a commit input; scheduled runs pass `None`.
        .add_step(match commit {
            Some(commit) => script_step.add_env(("UNIT_EVAL_COMMIT", commit)),
            None => script_step,
        })
        .add_step(steps::cleanup_cargo_config(Platform::Linux))
}