Run the unit evals cron in a matrix (#43907)

For now, just using Sonnet 4.5 and Opus 4.5 - I'll make a separate PR
for non-Anthropic models, in case they introduce new failures.

Release Notes:

- N/A
This commit is contained in:
Richard Feldman
2025-12-01 11:03:00 -05:00
committed by GitHub
parent 26ef93ffeb
commit 7aa610e24f
4 changed files with 38 additions and 2 deletions

View File

@@ -18,5 +18,6 @@ toml.workspace = true
indoc.workspace = true
indexmap.workspace = true
serde.workspace = true
serde_json.workspace = true
toml_edit.workspace = true
gh-workflow.workspace = true

View File

@@ -1,4 +1,7 @@
use gh_workflow::{Event, Expression, Job, Run, Schedule, Step, Use, Workflow, WorkflowDispatch};
use gh_workflow::{
Event, Expression, Job, Run, Schedule, Step, Strategy, Use, Workflow, WorkflowDispatch,
};
use serde_json::json;
use crate::tasks::workflows::{
runners::{self, Platform},
@@ -114,7 +117,31 @@ fn cron_unit_evals() -> NamedJob {
"#}))
}
named::job(unit_evals(None).add_step(send_failure_to_slack()))
named::job(cron_unit_evals_job().add_step(send_failure_to_slack()))
}
const UNIT_EVAL_MODELS: &[&str] = &[
"anthropic/claude-sonnet-4-5-latest",
"anthropic/claude-opus-4-5-latest",
];
fn cron_unit_evals_job() -> Job {
let script_step = add_api_keys(steps::script("./script/run-unit-evals"))
.add_env(("ZED_AGENT_MODEL", "${{ matrix.model }}"));
Job::default()
.runs_on(runners::LINUX_DEFAULT)
.strategy(Strategy::default().fail_fast(false).matrix(json!({
"model": UNIT_EVAL_MODELS
})))
.add_step(steps::checkout_repo())
.add_step(steps::setup_cargo_config(Platform::Linux))
.add_step(steps::cache_rust_dependencies_namespace())
.map(steps::install_linux_dependencies)
.add_step(steps::cargo_install_nextest())
.add_step(steps::clear_target_dir_if_large(Platform::Linux))
.add_step(script_step)
.add_step(steps::cleanup_cargo_config(Platform::Linux))
}
fn unit_evals(commit: Option<&WorkflowInput>) -> Job {