Run the unit evals cron in a matrix (#43907)
For now, the matrix only uses Sonnet 4.5 and Opus 4.5. I'll make a separate PR for non-Anthropic models, in case they introduce new failures. Release Notes: - N/A
This commit is contained in:
@@ -18,5 +18,6 @@ toml.workspace = true
|
||||
indoc.workspace = true
|
||||
indexmap.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
toml_edit.workspace = true
|
||||
gh-workflow.workspace = true
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
use gh_workflow::{Event, Expression, Job, Run, Schedule, Step, Use, Workflow, WorkflowDispatch};
|
||||
use gh_workflow::{
|
||||
Event, Expression, Job, Run, Schedule, Step, Strategy, Use, Workflow, WorkflowDispatch,
|
||||
};
|
||||
use serde_json::json;
|
||||
|
||||
use crate::tasks::workflows::{
|
||||
runners::{self, Platform},
|
||||
@@ -114,7 +117,31 @@ fn cron_unit_evals() -> NamedJob {
|
||||
"#}))
|
||||
}
|
||||
|
||||
named::job(unit_evals(None).add_step(send_failure_to_slack()))
|
||||
named::job(cron_unit_evals_job().add_step(send_failure_to_slack()))
|
||||
}
|
||||
|
||||
/// Model identifiers the cron unit-evals job is run against.
///
/// Each entry becomes one value of the `model` key in the job's strategy matrix, and is
/// handed to `./script/run-unit-evals` through the `ZED_AGENT_MODEL` environment variable.
const UNIT_EVAL_MODELS: &[&str] = &[
    "anthropic/claude-sonnet-4-5-latest",
    "anthropic/claude-opus-4-5-latest",
];
|
||||
|
||||
fn cron_unit_evals_job() -> Job {
|
||||
let script_step = add_api_keys(steps::script("./script/run-unit-evals"))
|
||||
.add_env(("ZED_AGENT_MODEL", "${{ matrix.model }}"));
|
||||
|
||||
Job::default()
|
||||
.runs_on(runners::LINUX_DEFAULT)
|
||||
.strategy(Strategy::default().fail_fast(false).matrix(json!({
|
||||
"model": UNIT_EVAL_MODELS
|
||||
})))
|
||||
.add_step(steps::checkout_repo())
|
||||
.add_step(steps::setup_cargo_config(Platform::Linux))
|
||||
.add_step(steps::cache_rust_dependencies_namespace())
|
||||
.map(steps::install_linux_dependencies)
|
||||
.add_step(steps::cargo_install_nextest())
|
||||
.add_step(steps::clear_target_dir_if_large(Platform::Linux))
|
||||
.add_step(script_step)
|
||||
.add_step(steps::cleanup_cargo_config(Platform::Linux))
|
||||
}
|
||||
|
||||
fn unit_evals(commit: Option<&WorkflowInput>) -> Job {
|
||||
|
||||
Reference in New Issue
Block a user