Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4c4519c870 |
2
Cargo.lock
generated
2
Cargo.lock
generated
@@ -4922,6 +4922,7 @@ dependencies = [
|
|||||||
"regex",
|
"regex",
|
||||||
"release_channel",
|
"release_channel",
|
||||||
"reqwest_client",
|
"reqwest_client",
|
||||||
|
"rust-embed",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"settings",
|
"settings",
|
||||||
@@ -6281,6 +6282,7 @@ dependencies = [
|
|||||||
"log",
|
"log",
|
||||||
"pest",
|
"pest",
|
||||||
"pest_derive",
|
"pest_derive",
|
||||||
|
"rust-embed",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"thiserror 1.0.69",
|
"thiserror 1.0.69",
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ fs.workspace = true
|
|||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
gpui.workspace = true
|
gpui.workspace = true
|
||||||
gpui_tokio.workspace = true
|
gpui_tokio.workspace = true
|
||||||
handlebars.workspace = true
|
handlebars = { workspace = true, features = ["rust-embed"] }
|
||||||
language.workspace = true
|
language.workspace = true
|
||||||
language_extension.workspace = true
|
language_extension.workspace = true
|
||||||
language_model.workspace = true
|
language_model.workspace = true
|
||||||
@@ -36,6 +36,7 @@ prompt_store.workspace = true
|
|||||||
regex.workspace = true
|
regex.workspace = true
|
||||||
release_channel.workspace = true
|
release_channel.workspace = true
|
||||||
reqwest_client.workspace = true
|
reqwest_client.workspace = true
|
||||||
|
rust-embed.workspace = true
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_json.workspace = true
|
serde_json.workspace = true
|
||||||
settings.workspace = true
|
settings.workspace = true
|
||||||
@@ -47,6 +48,7 @@ unindent.workspace = true
|
|||||||
util.workspace = true
|
util.workspace = true
|
||||||
uuid = { version = "1.6", features = ["v4"] }
|
uuid = { version = "1.6", features = ["v4"] }
|
||||||
workspace-hack.workspace = true
|
workspace-hack.workspace = true
|
||||||
|
|
||||||
[[bin]]
|
[[bin]]
|
||||||
name = "eval"
|
name = "eval"
|
||||||
path = "src/eval.rs"
|
path = "src/eval.rs"
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
mod assertions;
|
mod assertions;
|
||||||
mod example;
|
mod example;
|
||||||
|
mod example2;
|
||||||
mod examples;
|
mod examples;
|
||||||
mod ids;
|
mod ids;
|
||||||
mod instance;
|
mod instance;
|
||||||
|
mod templates;
|
||||||
mod tool_metrics;
|
mod tool_metrics;
|
||||||
|
|
||||||
use assertions::display_error_row;
|
use assertions::display_error_row;
|
||||||
|
|||||||
251
crates/eval/src/example2.rs
Normal file
251
crates/eval/src/example2.rs
Normal file
@@ -0,0 +1,251 @@
|
|||||||
|
use crate::{example::LanguageServer, examples::ExampleToml};
|
||||||
|
use collections::BTreeMap;
|
||||||
|
use futures::future::LocalBoxFuture;
|
||||||
|
use gpui::SharedString;
|
||||||
|
use smol::future::FutureExt;
|
||||||
|
use std::{fs, path::Path, sync::Arc};
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
||||||
|
pub struct ExampleId(SharedString);
|
||||||
|
|
||||||
|
impl<T: Into<SharedString>> From<T> for ExampleId {
|
||||||
|
fn from(value: T) -> Self {
|
||||||
|
Self(value.into())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct ExampleMetadata {
|
||||||
|
pub id: ExampleId,
|
||||||
|
pub url: String,
|
||||||
|
pub revision: String,
|
||||||
|
pub language_server: Option<LanguageServer>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
||||||
|
pub struct AssertionId(SharedString);
|
||||||
|
|
||||||
|
impl<T: Into<SharedString>> From<T> for AssertionId {
|
||||||
|
fn from(value: T) -> Self {
|
||||||
|
Self(value.into())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Default)]
|
||||||
|
pub struct ExampleSetup {
|
||||||
|
assertions: BTreeMap<AssertionId, Vec<Assertion>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ExampleSetup {
|
||||||
|
fn assertion(&mut self, id: impl Into<SharedString>) -> AssertionId {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Assertion {
|
||||||
|
condition: bool,
|
||||||
|
message: SharedString,
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ExampleRun {
|
||||||
|
metadata: ExampleMetadata,
|
||||||
|
example: Option<Arc<dyn ErasedExample>>,
|
||||||
|
assertions: BTreeMap<AssertionId, Vec<Assertion>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ExampleRun {
|
||||||
|
pub fn assert(
|
||||||
|
&mut self,
|
||||||
|
assertion_id: &AssertionId,
|
||||||
|
condition: bool,
|
||||||
|
message: impl Into<SharedString>,
|
||||||
|
) {
|
||||||
|
self.assertions
|
||||||
|
.get_mut(&assertion_id)
|
||||||
|
.expect("assertion not found")
|
||||||
|
.push(Assertion {
|
||||||
|
condition,
|
||||||
|
message: message.into(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn evaluate(&mut self) {
|
||||||
|
let example = self.example.take().unwrap();
|
||||||
|
example.evaluate(self);
|
||||||
|
self.example = Some(example);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
trait Example: 'static {
|
||||||
|
fn metadata() -> ExampleMetadata
|
||||||
|
where
|
||||||
|
Self: Sized;
|
||||||
|
fn new(cx: &mut ExampleSetup) -> Self
|
||||||
|
where
|
||||||
|
Self: Sized;
|
||||||
|
async fn evaluate(&self, cx: &mut ExampleRun) -> anyhow::Result<()>;
|
||||||
|
}
|
||||||
|
|
||||||
|
trait ErasedExample {
|
||||||
|
fn evaluate<'a>(&'a self, cx: &'a mut ExampleRun) -> LocalBoxFuture<'a, anyhow::Result<()>>;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Erased<T>(T);
|
||||||
|
|
||||||
|
impl<T: Example> ErasedExample for Erased<T> {
|
||||||
|
fn evaluate<'a>(&'a self, cx: &'a mut ExampleRun) -> LocalBoxFuture<'a, anyhow::Result<()>> {
|
||||||
|
self.0.evaluate(cx).boxed_local()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct MyExample {
|
||||||
|
my_assertion: AssertionId,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Example for MyExample {
|
||||||
|
fn metadata() -> ExampleMetadata
|
||||||
|
where
|
||||||
|
Self: Sized,
|
||||||
|
{
|
||||||
|
ExampleMetadata {
|
||||||
|
id: "my_example".into(),
|
||||||
|
url: "".into(),
|
||||||
|
revision: "".into(),
|
||||||
|
language_server: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn new(cx: &mut ExampleSetup) -> Self
|
||||||
|
where
|
||||||
|
Self: Sized,
|
||||||
|
{
|
||||||
|
Self {
|
||||||
|
my_assertion: cx.assertion("doesnt_do_stupid_thing"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn evaluate(&self, cx: &mut ExampleRun) -> anyhow::Result<()> {
|
||||||
|
cx.assert(&self.my_assertion, true, "true was false");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct TomlExample {
|
||||||
|
diff_assertions: BTreeMap<AssertionId, SharedString>,
|
||||||
|
thread_assertions: BTreeMap<AssertionId, SharedString>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ErasedExample for TomlExample {
|
||||||
|
fn evaluate<'a>(&'a self, cx: &'a mut ExampleRun) -> LocalBoxFuture<'a, anyhow::Result<()>> {
|
||||||
|
// let sample = cx.sample().await?;
|
||||||
|
|
||||||
|
// for (assertion_id, condition) in self.diff_assertions.iter() {
|
||||||
|
// let prompt = diff_prompt.render(diff, condition);
|
||||||
|
// cx.judge(assertion_id, prompt).await?;
|
||||||
|
// }
|
||||||
|
|
||||||
|
// for (assertion_id, condition) in self.diff_assertions.iter() {
|
||||||
|
// cx.judge(assertion_id, condition).await?;
|
||||||
|
// }
|
||||||
|
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Default)]
|
||||||
|
struct EvalSuite {
|
||||||
|
example_builders: BTreeMap<ExampleId, Box<dyn Fn() -> ExampleRun>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl EvalSuite {
|
||||||
|
pub fn examples_to_run(&self, trials: usize) -> BTreeMap<ExampleId, Vec<ExampleRun>> {
|
||||||
|
let mut examples = BTreeMap::new();
|
||||||
|
for (example_id, builder) in &self.example_builders {
|
||||||
|
examples.insert(
|
||||||
|
example_id.clone(),
|
||||||
|
(0..trials).map(|_| (builder)()).collect(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
examples
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn register<T: Example>(&mut self) {
|
||||||
|
let metadata = T::metadata();
|
||||||
|
self.example_builders.insert(
|
||||||
|
metadata.id.clone(),
|
||||||
|
Box::new(move || {
|
||||||
|
let mut cx = ExampleSetup::default();
|
||||||
|
let example = T::new(&mut cx);
|
||||||
|
ExampleRun {
|
||||||
|
metadata: metadata.clone(),
|
||||||
|
example: Some(Arc::new(Erased(example))),
|
||||||
|
assertions: cx.assertions,
|
||||||
|
}
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn register_toml(&mut self, toml_abs_path: &Path) {
|
||||||
|
let name = toml_abs_path
|
||||||
|
.file_name()
|
||||||
|
.unwrap()
|
||||||
|
.to_string_lossy()
|
||||||
|
.to_string();
|
||||||
|
let id = ExampleId(name.into());
|
||||||
|
let base: ExampleToml =
|
||||||
|
toml::from_str(&fs::read_to_string(&toml_abs_path).unwrap()).unwrap();
|
||||||
|
|
||||||
|
let language_server = if base.require_lsp {
|
||||||
|
Some(crate::example::LanguageServer {
|
||||||
|
file_extension: base
|
||||||
|
.language_extension
|
||||||
|
.expect("Language extension is required when require_lsp = true"),
|
||||||
|
allow_preexisting_diagnostics: base.allow_preexisting_diagnostics,
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
let metadata = ExampleMetadata {
|
||||||
|
id: id.clone(),
|
||||||
|
url: base.url,
|
||||||
|
revision: base.revision,
|
||||||
|
language_server,
|
||||||
|
};
|
||||||
|
|
||||||
|
self.example_builders.insert(
|
||||||
|
id,
|
||||||
|
Box::new(move || {
|
||||||
|
let mut cx = ExampleSetup::default();
|
||||||
|
let mut diff_assertions = BTreeMap::new();
|
||||||
|
let mut thread_assertions = BTreeMap::new();
|
||||||
|
for (assertion_key, condition) in base.diff_assertions.iter() {
|
||||||
|
let assertion_id = cx.assertion(assertion_key);
|
||||||
|
diff_assertions.insert(assertion_id, condition.into());
|
||||||
|
}
|
||||||
|
for (assertion_key, condition) in base.thread_assertions.iter() {
|
||||||
|
let assertion_id = cx.assertion(assertion_key);
|
||||||
|
thread_assertions.insert(assertion_id.into(), condition.into());
|
||||||
|
}
|
||||||
|
ExampleRun {
|
||||||
|
metadata: metadata.clone(),
|
||||||
|
example: Some(Arc::new(TomlExample {
|
||||||
|
diff_assertions,
|
||||||
|
thread_assertions,
|
||||||
|
})),
|
||||||
|
assertions: cx.assertions,
|
||||||
|
}
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
// let mut suite = EvalSuite::default();
|
||||||
|
// suite.register::<MyExample>();
|
||||||
|
// for path in list_files(..) {
|
||||||
|
// suite.register_static()
|
||||||
|
// }
|
||||||
|
|
||||||
|
// for (example_id, runs) in suite.examples_to_run(2) {}
|
||||||
|
}
|
||||||
@@ -480,8 +480,8 @@ impl ExampleInstance {
|
|||||||
|
|
||||||
println!("{}Running diff judge", self.log_prefix);
|
println!("{}Running diff judge", self.log_prefix);
|
||||||
|
|
||||||
let judge_diff_prompt = include_str!("judge_diff_prompt.hbs");
|
let judge_diff_prompt = include_str!("templates/judge_diff.hbs");
|
||||||
let judge_diff_prompt_name = "judge_diff_prompt";
|
let judge_diff_prompt_name = "judge_diff";
|
||||||
let mut hbs = Handlebars::new();
|
let mut hbs = Handlebars::new();
|
||||||
hbs.register_template_string(judge_diff_prompt_name, judge_diff_prompt)
|
hbs.register_template_string(judge_diff_prompt_name, judge_diff_prompt)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
@@ -525,7 +525,7 @@ impl ExampleInstance {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
let judge_thread_prompt = include_str!("judge_thread_prompt.hbs");
|
let judge_thread_prompt = include_str!("templates/judge_thread.hbs");
|
||||||
let judge_diff_prompt_name = "judge_thread_prompt";
|
let judge_diff_prompt_name = "judge_thread_prompt";
|
||||||
let mut hbs = Handlebars::new();
|
let mut hbs = Handlebars::new();
|
||||||
hbs.register_template_string(judge_diff_prompt_name, judge_thread_prompt)
|
hbs.register_template_string(judge_diff_prompt_name, judge_thread_prompt)
|
||||||
|
|||||||
43
crates/eval/src/templates.rs
Normal file
43
crates/eval/src/templates.rs
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use handlebars::Handlebars;
|
||||||
|
use rust_embed::RustEmbed;
|
||||||
|
use serde::Serialize;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
#[derive(RustEmbed)]
|
||||||
|
#[folder = "src/templates"]
|
||||||
|
#[include = "*.hbs"]
|
||||||
|
struct Assets;
|
||||||
|
|
||||||
|
pub struct Templates(Handlebars<'static>);
|
||||||
|
|
||||||
|
impl Templates {
|
||||||
|
pub fn new() -> Arc<Self> {
|
||||||
|
let mut handlebars = Handlebars::new();
|
||||||
|
handlebars.register_embed_templates::<Assets>().unwrap();
|
||||||
|
// Disable HTML escaping so you don't .
|
||||||
|
handlebars.register_escape_fn(|text| text.into());
|
||||||
|
Arc::new(Self(handlebars))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub trait Template: Sized {
|
||||||
|
const TEMPLATE_NAME: &'static str;
|
||||||
|
|
||||||
|
fn render(&self, templates: &Templates) -> Result<String>
|
||||||
|
where
|
||||||
|
Self: Serialize + Sized,
|
||||||
|
{
|
||||||
|
Ok(templates.0.render(Self::TEMPLATE_NAME, self)?)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
pub struct JudgeDiffTemplate {
|
||||||
|
repository_diff: String,
|
||||||
|
assertion: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Template for JudgeDiffTemplate {
|
||||||
|
const TEMPLATE_NAME: &'static str = "judge_diff.hbs";
|
||||||
|
}
|
||||||
@@ -1,12 +1,7 @@
|
|||||||
You are an expert software developer. Your task is to evaluate a diff produced by an AI agent
|
You are an expert software developer. Your task is to evaluate a diff produced by an AI agent. Here is the prompt and the diff:
|
||||||
in response to a prompt. Here is the prompt and the diff:
|
|
||||||
|
|
||||||
<prompt>
|
|
||||||
{{{prompt}}}
|
|
||||||
</prompt>
|
|
||||||
|
|
||||||
<diff>
|
<diff>
|
||||||
{{{repository_diff}}}
|
{{repository_diff}}
|
||||||
</diff>
|
</diff>
|
||||||
|
|
||||||
Evaluate whether or not the diff passes the following assertion:
|
Evaluate whether or not the diff passes the following assertion:
|
||||||
Reference in New Issue
Block a user