Compare commits
10 Commits
make-langu
...
ai-worlds-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
abeb64e4f1 | ||
|
|
855ab86c68 | ||
|
|
e11aeb7b5d | ||
|
|
744b764b31 | ||
|
|
48f3eeca81 | ||
|
|
d60c72f146 | ||
|
|
f7fab92b9b | ||
|
|
426cfe434e | ||
|
|
bf9def0fbe | ||
|
|
53fb5e6ac8 |
@@ -238,7 +238,7 @@ impl EditAgent {
|
||||
let (output, edit_events) = Self::parse_edit_chunks(edit_chunks, cx);
|
||||
let mut edit_events = edit_events.peekable();
|
||||
while let Some(edit_event) = Pin::new(&mut edit_events).peek().await {
|
||||
// Skip events until we're at the start of a new edit.
|
||||
// Salta gli eventi finché non siamo all'inizio di una nuova modifica.
|
||||
let Ok(EditParserEvent::OldTextChunk { .. }) = edit_event else {
|
||||
edit_events.next().await.unwrap()?;
|
||||
continue;
|
||||
@@ -246,8 +246,8 @@ impl EditAgent {
|
||||
|
||||
let snapshot = buffer.read_with(cx, |buffer, _| buffer.snapshot())?;
|
||||
|
||||
// Resolve the old text in the background, updating the agent
|
||||
// location as we keep refining which range it corresponds to.
|
||||
// Risolvi il vecchio testo in background, aggiornando la posizione
|
||||
// dell'agente mentre continuiamo a perfezionare a quale intervallo corrisponde.
|
||||
let (resolve_old_text, mut old_range) =
|
||||
Self::resolve_old_text(snapshot.text.clone(), edit_events, cx);
|
||||
while let Ok(old_range) = old_range.recv().await {
|
||||
@@ -726,6 +726,9 @@ mod tests {
|
||||
);
|
||||
cx.run_until_parked();
|
||||
|
||||
// !talk: Questo è un test unitario più tradizionale.
|
||||
// !talk: È randomizzato, ma ancora fondamentalmente deterministico.
|
||||
// !talk: Ma comunque rilevante per lavorare con un LLM
|
||||
simulate_llm_output(
|
||||
&agent,
|
||||
indoc! {"
|
||||
@@ -749,6 +752,7 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
// !talk: Really interesting unit test - Again about purely algorithmic code but critical to performance on the task.
|
||||
#[gpui::test(iterations = 100)]
|
||||
async fn test_indentation(cx: &mut TestAppContext, mut rng: StdRng) {
|
||||
let agent = init_test(cx).await;
|
||||
|
||||
@@ -75,6 +75,8 @@ impl EditParser {
|
||||
chunk.pop();
|
||||
}
|
||||
|
||||
// !talk: We're tolerant of mismatched tags because we couldn't get this to zero
|
||||
// !talk: Seems like things are more likely on distribution if the model gets this right, but we don't really know.
|
||||
self.metrics.tags += 1;
|
||||
if &self.buffer[tag_range.clone()] != OLD_TEXT_END_TAG {
|
||||
self.metrics.mismatched_tags += 1;
|
||||
@@ -333,6 +335,7 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
// !talk: This is the traditional randomized test on the parser covering the last N%.
|
||||
#[gpui::test(iterations = 1000)]
|
||||
fn test_mismatched_tags(mut rng: StdRng) {
|
||||
let mut parser = EditParser::new();
|
||||
|
||||
@@ -160,6 +160,7 @@ fn eval_delete_run_git_blame() {
|
||||
);
|
||||
}
|
||||
|
||||
// !talk: Go here after zoomed out eval.
|
||||
#[test]
|
||||
#[cfg_attr(not(feature = "eval"), ignore)]
|
||||
fn eval_translate_doc_comments() {
|
||||
@@ -176,7 +177,7 @@ fn eval_translate_doc_comments() {
|
||||
let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
|
||||
let edit_description = "Translate all doc comments to Italian";
|
||||
eval(
|
||||
200,
|
||||
100,
|
||||
1.,
|
||||
EvalInput::from_conversation(
|
||||
vec![
|
||||
@@ -1349,6 +1350,8 @@ fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
|
||||
);
|
||||
}
|
||||
|
||||
// !talk: Here's a blanket assertion we added to the eval tracking the presence of mismatched tags
|
||||
// !talk: It's run on every eval because it's a cross cutting concern.
|
||||
let mismatched_tag_ratio =
|
||||
cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
|
||||
if mismatched_tag_ratio > 0.05 {
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
# Slide 2: Evals. This is our equivalent of SWE-bench, but on our own codebase.
|
||||
|
||||
url = "https://github.com/zed-industries/zed.git"
|
||||
revision = "38fcadf9481d018543c65f36ac3bafeba190179b"
|
||||
language_extension = "rs"
|
||||
|
||||
9
talk.md
Normal file
9
talk.md
Normal file
@@ -0,0 +1,9 @@
|
||||
- Start with the find and replace diff card eval
|
||||
- /Users/nathan/src/zed/crates/eval/src/examples/find_and_replace_diff_card.toml
|
||||
|
||||
- Zoom in on streaming edits
|
||||
- /Users/nathan/src/zed/crates/assistant_tools/src/edit_agent/evals.rs
|
||||
- fn eval_translate_doc_comments() {
|
||||
|
||||
- Unit tests on streaming edits
|
||||
- Show the mismatched tags test
|
||||
75
worlds_fair_talk/README.md
Normal file
75
worlds_fair_talk/README.md
Normal file
@@ -0,0 +1,75 @@
|
||||
# World's Fair Talk: CI in the Era of AI
|
||||
|
||||
This crate contains the materials for Nathan Sobo's talk "CI in the Era of AI: From Unit Tests to Stochastic Evals" presented at the AI Engineer World's Fair.
|
||||
|
||||
## Overview
|
||||
|
||||
The talk explores how Zed's testing philosophy evolved when integrating language models, using the streaming edits feature as a case study. It demonstrates the shift from purely deterministic testing to embracing statistical methods when working with inherently stochastic systems.
|
||||
|
||||
## Structure
|
||||
|
||||
The talk is organized as numbered source files with accompanying speaker notes:
|
||||
|
||||
### Slides (in `slides/`)
|
||||
- `00_intro.md` - Title slide and introduction
|
||||
- `01_deterministic_testing_at_zed.rs` - Zed's traditional deterministic testing approach
|
||||
- `02_stochastic_unit_tests.rs` - Introduction to statistical testing for LLMs
|
||||
- `03_streaming_edits_overview.md` - Overview of the streaming edits challenge
|
||||
- `04_deterministic_streaming_tests.rs` - Traditional tests for algorithmic components
|
||||
- `05_empty_old_text_problem.rs` - First eval failure: empty old_text bug
|
||||
- `06_tag_mismatch_discovery.rs` - XML tag mismatch issues (5% failure rate)
|
||||
- `07_the_indentation_discovery.rs` - Indentation problem and algorithmic solution
|
||||
- `08_escaping_chaos.rs` - Character escaping issues (especially for Gemini)
|
||||
- `09_lessons_learned.md` - Key takeaways about testing with LLMs
|
||||
|
||||
### Speaker Notes (in `notes/`)
|
||||
Each slide has a corresponding `.md` file with speaker notes in the `notes/` directory.
|
||||
|
||||
## Key Concepts
|
||||
|
||||
### Streaming Edits Feature
|
||||
- Allows users to see AI code edits character-by-character as they're generated
|
||||
- Works around API limitations where tool calling can't stream edit content
|
||||
- Uses a two-phase approach: tool call for intent, then raw text streaming
|
||||
|
||||
### Testing Evolution
|
||||
1. **Deterministic Tests**: For parsing, algorithms, indentation adjustment
|
||||
2. **Statistical Tests (Evals)**: For LLM behavior, requiring threshold pass rates
|
||||
3. **Property-Based Tests**: For comprehensive algorithmic validation
|
||||
|
||||
### Major Discoveries
|
||||
- **Empty old_text**: 0% → 99% pass rate with one prompt line
|
||||
- **Tag mismatches**: Models mess up XML closing tags, so we made the parser tolerant
|
||||
- **Indentation**: Built automatic adjustment algorithm
|
||||
- **Character escaping**: Gemini went from 35% → 86% with one instruction
|
||||
|
||||
## Historical Context
|
||||
|
||||
The prompt evolution was driven by specific eval failures:
|
||||
- Commit `ab017129d8` (May 22, 2025) by Oleksiy Syvokon made major improvements:
|
||||
- Gemini: 35% → 86%
|
||||
- Claude: 96% → 98%
|
||||
- GPT-4: 81% → 100%
|
||||
|
||||
## Talk Duration
|
||||
|
||||
Approximately 15 minutes, designed to move quickly through concrete examples.
|
||||
|
||||
## Building the Talk
|
||||
|
||||
This crate is not meant to be compiled - the code examples are illustrative and may use simplified types for clarity. The actual implementation lives in `crates/assistant_tools/`.
|
||||
|
||||
## Future Work
|
||||
|
||||
If continuing this talk:
|
||||
- Consider adding `test_edit_events` showing real-time event streaming
|
||||
- The `eval_add_overwrite_test` has surprisingly low pass rates (16-35%) and might reveal interesting failure modes
|
||||
- More examples of property-based testing could strengthen the deterministic testing section
|
||||
|
||||
## Key Message
|
||||
|
||||
The core thesis: When building on LLMs, you must embrace empirical methods. You can't reason about their behavior - you can only measure it. This requires:
|
||||
1. Statistical thresholds instead of binary pass/fail
|
||||
2. Learning from failure patterns
|
||||
3. Accepting imperfection and building resilient systems
|
||||
4. Layering deterministic and statistical tests appropriately
|
||||
5
worlds_fair_talk/slides/00_intro.md
Normal file
5
worlds_fair_talk/slides/00_intro.md
Normal file
@@ -0,0 +1,5 @@
|
||||
# CI in the Era of AI: From Unit Tests to Stochastic Evals
|
||||
|
||||
## Evolving Zed's testing philosophy to embrace LLMs
|
||||
|
||||
### Nathan Sobo – Co-founder of Zed
|
||||
38
worlds_fair_talk/slides/01_deterministic_testing_at_zed.rs
Normal file
38
worlds_fair_talk/slides/01_deterministic_testing_at_zed.rs
Normal file
@@ -0,0 +1,38 @@
|
||||
// Our foundation: deterministic tests with controlled randomness
|
||||
#[gpui::test(iterations = 50)]
|
||||
async fn test_collaborative_editing(executor: BackgroundExecutor) {
|
||||
let mut server = TestServer::start(executor.clone()).await;
|
||||
let client_a = server.create_client("user_a").await;
|
||||
let client_b = server.create_client("user_b").await;
|
||||
|
||||
// Create shared project
|
||||
let project_a = client_a.build_local_project("/code").await;
|
||||
let project_id = project_a.borrow_mut().share().await.unwrap();
|
||||
|
||||
// Client B joins
|
||||
let project_b = client_b.join_remote_project(project_id).await;
|
||||
|
||||
// Open same buffer
|
||||
let buffer_a = project_a.borrow_mut()
|
||||
.open_local_buffer("/code/main.rs").await.unwrap();
|
||||
let buffer_b = project_b.borrow_mut()
|
||||
.open_buffer("main.rs").await.unwrap();
|
||||
|
||||
// Concurrent edits
|
||||
buffer_a.borrow_mut().edit([(0..0, "// A's edit\n")]);
|
||||
buffer_b.borrow_mut().edit([(0..0, "// B's edit\n")]);
|
||||
|
||||
// Controlled network failures
|
||||
server.disconnect_client(client_a.peer_id().unwrap());
|
||||
executor.advance_clock(RECEIVE_TIMEOUT);
|
||||
|
||||
// B continues editing while A is disconnected
|
||||
buffer_b.borrow_mut().edit([(24..24, "// B alone\n")]);
|
||||
|
||||
// A reconnects
|
||||
executor.advance_clock(RECONNECT_TIMEOUT);
|
||||
executor.run_until_parked();
|
||||
|
||||
// Clear pass/fail - reproducible every time
|
||||
assert_eq!(buffer_a.borrow().text(), buffer_b.borrow().text());
|
||||
}
|
||||
45
worlds_fair_talk/slides/02_traditional_eval.toml
Normal file
45
worlds_fair_talk/slides/02_traditional_eval.toml
Normal file
@@ -0,0 +1,45 @@
|
||||
# Slide 2: Evals. This is our equivalent of SWE-bench, but on our own codebase.
|
||||
|
||||
url = "https://github.com/zed-industries/zed.git"
|
||||
revision = "38fcadf9481d018543c65f36ac3bafeba190179b"
|
||||
language_extension = "rs"
|
||||
|
||||
prompt = """
|
||||
Look at the `find_replace_file_tool.rs`. I want to implement a card for it.
|
||||
The card should implement the `Render` trait.
|
||||
|
||||
The card should show a diff. It should be a beautifully presented diff.
|
||||
The card "box" should look like what we show for markdown codeblocks (look at `MarkdownElement`).
|
||||
I want to see a red background for lines that were deleted and a green background for lines
|
||||
that were added. We should have a div per diff line.
|
||||
"""
|
||||
|
||||
[diff_assertions]
|
||||
|
||||
modify_find_and_replace_tool = """
|
||||
The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct.
|
||||
The struct should contain an `output` field that is the same as the task we were returning before,
|
||||
and a new `card` field that contains a view for the card.
|
||||
"""
|
||||
|
||||
card_implementation = """
|
||||
The card should be a view that displays a diff.
|
||||
Each line in the diff should be colored according to whether it was added, removed or unchanged.
|
||||
"""
|
||||
|
||||
[thread_assertions]
|
||||
|
||||
path_search = """
|
||||
The first tool call should be to path search including "find_replace_file_tool.rs" in the string.
|
||||
(*Not* grep, for example, or reading the file based on a guess at the path.)
|
||||
This is because we gave the model a filename and it needs to turn that into a real path.
|
||||
"""
|
||||
|
||||
read_file_from_path_search = """
|
||||
After obtaining the correct path of "zed/crates/assistant_tools/src/find_replace_file_tool.rs", it should read the contents of that path.
|
||||
"""
|
||||
|
||||
symbol_search = """
|
||||
When trying to find information about the Render trait, it should *not* begin with a path search, because it doesn't yet have any information
|
||||
on what path the Render trait might be in.
|
||||
"""
|
||||
153
worlds_fair_talk/slides/03_programmatic_eval_example.rs
Normal file
153
worlds_fair_talk/slides/03_programmatic_eval_example.rs
Normal file
@@ -0,0 +1,153 @@
|
||||
use std::path::Path;
|
||||
|
||||
use agent_settings::AgentProfileId;
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
|
||||
use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion, LanguageServer};
|
||||
|
||||
pub struct AddArgToTraitMethod;
|
||||
|
||||
#[async_trait(?Send)]
|
||||
impl Example for AddArgToTraitMethod {
|
||||
fn meta(&self) -> ExampleMetadata {
|
||||
ExampleMetadata {
|
||||
name: "add_arg_to_trait_method".to_string(),
|
||||
url: "https://github.com/zed-industries/zed.git".to_string(),
|
||||
revision: "f69aeb6311dde3c0b8979c293d019d66498d54f2".to_string(),
|
||||
language_server: Some(LanguageServer {
|
||||
file_extension: "rs".to_string(),
|
||||
allow_preexisting_diagnostics: false,
|
||||
}),
|
||||
max_assertions: None,
|
||||
profile_id: AgentProfileId::default(),
|
||||
existing_thread_json: None,
|
||||
max_turns: None,
|
||||
}
|
||||
}
|
||||
|
||||
async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
|
||||
const FILENAME: &str = "assistant_tool.rs";
|
||||
cx.push_user_message(format!(
|
||||
r#"
|
||||
Add a `window: Option<gpui::AnyWindowHandle>` argument to the `Tool::run` trait method in {FILENAME},
|
||||
and update all the implementations of the trait and call sites accordingly.
|
||||
"#
|
||||
));
|
||||
|
||||
let _ = cx.run_to_end().await?;
|
||||
|
||||
// Adds ignored argument to all but `batch_tool`
|
||||
|
||||
let add_ignored_window_paths = &[
|
||||
"code_action_tool",
|
||||
"code_symbols_tool",
|
||||
"contents_tool",
|
||||
"copy_path_tool",
|
||||
"create_directory_tool",
|
||||
"create_file_tool",
|
||||
"delete_path_tool",
|
||||
"diagnostics_tool",
|
||||
"edit_file_tool",
|
||||
"fetch_tool",
|
||||
"grep_tool",
|
||||
"list_directory_tool",
|
||||
"move_path_tool",
|
||||
"now_tool",
|
||||
"open_tool",
|
||||
"path_search_tool",
|
||||
"read_file_tool",
|
||||
"rename_tool",
|
||||
"symbol_info_tool",
|
||||
"terminal_tool",
|
||||
"thinking_tool",
|
||||
"web_search_tool",
|
||||
];
|
||||
|
||||
let edits = cx.edits();
|
||||
|
||||
for tool_name in add_ignored_window_paths {
|
||||
let path_str = format!("crates/assistant_tools/src/{}.rs", tool_name);
|
||||
let edits = edits.get(Path::new(&path_str));
|
||||
|
||||
let ignored = edits.map_or(false, |edits| {
|
||||
edits.has_added_line(" _window: Option<gpui::AnyWindowHandle>,\n")
|
||||
});
|
||||
let uningored = edits.map_or(false, |edits| {
|
||||
edits.has_added_line(" window: Option<gpui::AnyWindowHandle>,\n")
|
||||
});
|
||||
|
||||
cx.assert(ignored || uningored, format!("Argument: {}", tool_name))
|
||||
.ok();
|
||||
|
||||
cx.assert(ignored, format!("`_` prefix: {}", tool_name))
|
||||
.ok();
|
||||
}
|
||||
|
||||
// Adds unignored argument to `batch_tool`
|
||||
|
||||
let batch_tool_edits = edits.get(Path::new("crates/assistant_tools/src/batch_tool.rs"));
|
||||
|
||||
cx.assert(
|
||||
batch_tool_edits.map_or(false, |edits| {
|
||||
edits.has_added_line(" window: Option<gpui::AnyWindowHandle>,\n")
|
||||
}),
|
||||
"Argument: batch_tool",
|
||||
)
|
||||
.ok();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn diff_assertions(&self) -> Vec<JudgeAssertion> {
|
||||
vec![
|
||||
JudgeAssertion {
|
||||
id: "batch tool passes window to each".to_string(),
|
||||
description:
|
||||
"batch_tool is modified to pass a clone of the window to each tool it calls."
|
||||
.to_string(),
|
||||
},
|
||||
JudgeAssertion {
|
||||
id: "tool tests updated".to_string(),
|
||||
description:
|
||||
"tool tests are updated to pass the new `window` argument (`None` is ok)."
|
||||
.to_string(),
|
||||
},
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
#[gpui::test]
|
||||
async fn test_grep_function_args_and_body(cx: &mut TestAppContext) {
|
||||
let project = setup_syntax_test(cx).await;
|
||||
|
||||
// Test: Line with a function argument
|
||||
let input = serde_json::to_value(GrepToolInput {
|
||||
regex: "second_arg".to_string(),
|
||||
include_pattern: Some("**/*.rs".to_string()),
|
||||
offset: 0,
|
||||
case_sensitive: false,
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let result = run_grep_tool(input, project.clone(), cx).await;
|
||||
let expected = r#"
|
||||
Found 1 matches:
|
||||
|
||||
## Matches in root/test_syntax.rs
|
||||
|
||||
### mod feature_module › pub mod nested_module › pub fn nested_function › L7-14
|
||||
```
|
||||
pub fn nested_function(
|
||||
first_arg: String,
|
||||
second_arg: i32,
|
||||
) {
|
||||
println!("Function in nested module");
|
||||
println!("{first_arg}");
|
||||
println!("{second_arg}");
|
||||
}
|
||||
```
|
||||
"#
|
||||
.unindent();
|
||||
assert_eq!(result, expected);
|
||||
}
|
||||
15
worlds_fair_talk/slides/04_grep_tool_output_old.md
Normal file
15
worlds_fair_talk/slides/04_grep_tool_output_old.md
Normal file
@@ -0,0 +1,15 @@
|
||||
Found 1 matches:
|
||||
|
||||
## Matches in crates/assistant_tool/src/assistant_tool.rs
|
||||
|
||||
### Lines 238-241
|
||||
|
||||
```rs
|
||||
}
|
||||
|
||||
/// Runs the tool with the provided input.
|
||||
fn run(
|
||||
self: Arc<Self>,
|
||||
input: serde_json::Value,
|
||||
request: Arc<LanguageModelRequest>,
|
||||
```
|
||||
38
worlds_fair_talk/slides/05_grep_tool_output_new.rs
Normal file
38
worlds_fair_talk/slides/05_grep_tool_output_new.rs
Normal file
@@ -0,0 +1,38 @@
|
||||
#[gpui::test]
|
||||
async fn test_grep_function_args_and_body(cx: &mut TestAppContext) {
|
||||
let project = setup_syntax_test(cx).await;
|
||||
|
||||
// Test: Line with a function argument
|
||||
let input = serde_json::to_value(GrepToolInput {
|
||||
regex: "fn run(".to_string(),
|
||||
include_pattern: Some("**/*.rs".to_string()),
|
||||
offset: 0,
|
||||
case_sensitive: false,
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let result = run_grep_tool(input, project.clone(), cx).await;
|
||||
let expected = r#"
|
||||
Found 1 matches:
|
||||
|
||||
## Matches in crates/assistant_tool/src/assistant_tool.rs
|
||||
|
||||
### trait AssistantTool › fn run › L238-241
|
||||
|
||||
```rs
|
||||
/// Runs the tool with the provided input.
|
||||
fn run(
|
||||
self: Arc<Self>,
|
||||
input: serde_json::Value,
|
||||
request: Arc<LanguageModelRequest>,
|
||||
project: Entity<Project>,
|
||||
action_log: Entity<ActionLog>,
|
||||
model: Arc<dyn LanguageModel>,
|
||||
window: Option<AnyWindowHandle>,
|
||||
cx: &mut App,
|
||||
) -> ToolResult;
|
||||
```
|
||||
"#
|
||||
.unindent();
|
||||
assert_eq!(result, expected);
|
||||
}
|
||||
19
worlds_fair_talk/slides/06_streaming_edits.md
Normal file
19
worlds_fair_talk/slides/06_streaming_edits.md
Normal file
@@ -0,0 +1,19 @@
|
||||
# Streaming Edits
|
||||
|
||||
Show the model edits as they happen, token by token.
|
||||
|
||||
## Challenges
|
||||
|
||||
1. Tool calling doesn't stream
|
||||
- JSON values must be complete before they are streamed
|
||||
- We can't use tool calling alone if we want to see streaming text
|
||||
- We ask it to stream `<old_text>` and `<new_text>` blocks
|
||||
|
||||
2. Parsing Complexity: XML tags arrive in random chunks
|
||||
- `</old_te` + `xt>` (split across network packets)
|
||||
- Must buffer and parse incrementally
|
||||
|
||||
3. Imperfect Model Behavior: Models don't follow instructions perfectly
|
||||
- Wrong closing tags: `<old_text>...</new_text>`
|
||||
- Inconsistent indentation and whitespace
|
||||
- Escaping
|
||||
54
worlds_fair_talk/slides/07_stochastic_unit_tests.rs
Normal file
54
worlds_fair_talk/slides/07_stochastic_unit_tests.rs
Normal file
@@ -0,0 +1,54 @@
|
||||
// When AI enters the equation, we need a new approach
|
||||
// We test AI features by sampling their behavior:
|
||||
|
||||
#[test]
|
||||
fn eval_translate_doc_comments() {
|
||||
let input_file_path = "root/canvas.rs";
|
||||
let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
|
||||
let edit_description = "Translate all doc comments to Italian";
|
||||
eval(
|
||||
200,
|
||||
1.,
|
||||
EvalInput::from_conversation(
|
||||
vec![
|
||||
message(
|
||||
User,
|
||||
[text(formatdoc! {"
|
||||
Read the {input_file_path} file and edit it (without overwriting it),
|
||||
translating all the doc comments to italian.
|
||||
"})],
|
||||
),
|
||||
message(
|
||||
Assistant,
|
||||
[tool_use(
|
||||
"tool_1",
|
||||
"read_file",
|
||||
ReadFileToolInput {
|
||||
path: input_file_path.into(),
|
||||
start_line: None,
|
||||
end_line: None,
|
||||
},
|
||||
)],
|
||||
),
|
||||
message(
|
||||
User,
|
||||
[tool_result("tool_1", "read_file", input_file_content)],
|
||||
),
|
||||
message(
|
||||
Assistant,
|
||||
[tool_use(
|
||||
"tool_2",
|
||||
"edit_file",
|
||||
EditFileToolInput {
|
||||
display_description: edit_description.into(),
|
||||
path: input_file_path.into(),
|
||||
mode: EditFileMode::Edit,
|
||||
},
|
||||
)],
|
||||
),
|
||||
],
|
||||
Some(input_file_content.into()),
|
||||
EvalAssertion::judge_diff("Doc comments were translated to Italian"),
|
||||
),
|
||||
);
|
||||
}
|
||||
@@ -0,0 +1,79 @@
|
||||
// Some streaming edit problems can be tested the old-fashioned way!
|
||||
|
||||
// 1. Parser must handle chunks split ANYWHERE
|
||||
#[gpui::test(iterations = 100)]
|
||||
fn test_parser_random_chunks(mut rng: StdRng) {
|
||||
let input = "<old_text>hello world</old_text><new_text>goodbye</new_text>";
|
||||
|
||||
// Generate random chunk boundaries
|
||||
let mut chunks = vec![];
|
||||
let mut pos = 0;
|
||||
while pos < input.len() {
|
||||
let chunk_size = rng.gen_range(1..=10);
|
||||
let end = (pos + chunk_size).min(input.len());
|
||||
chunks.push(&input[pos..end]);
|
||||
pos = end;
|
||||
}
|
||||
|
||||
// Parser MUST handle any chunking
|
||||
let mut parser = EditParser::new();
|
||||
let events: Vec<_> = chunks.iter().flat_map(|chunk| parser.push(chunk)).collect();
|
||||
|
||||
assert_eq!(
|
||||
events,
|
||||
vec![Event::OldText("hello world"), Event::NewText("goodbye")]
|
||||
);
|
||||
}
|
||||
|
||||
// 2. Fuzzy matcher algorithm (without LLM input)
|
||||
#[test]
|
||||
fn test_fuzzy_match_algorithm() {
|
||||
let buffer = "fn calculate_price() {\n // TODO\n}";
|
||||
let query = "fn calculate_price() {"; // Extra spaces
|
||||
|
||||
let matcher = FuzzyMatcher::new(buffer);
|
||||
let range = matcher.find(query);
|
||||
|
||||
assert_eq!(range, Some(0..24)); // Found despite whitespace
|
||||
}
|
||||
|
||||
// 3. Streaming diff computes edits incrementally
|
||||
#[test]
|
||||
fn test_streaming_diff() {
|
||||
let old_text = "fn calculate() {\n todo!()\n}";
|
||||
let mut diff = StreamingDiff::new(old_text);
|
||||
|
||||
// Simulate new text arriving in chunks
|
||||
let ops1 = diff.push_new("fn calc");
|
||||
assert_eq!(
|
||||
ops1,
|
||||
vec![
|
||||
CharOp::Keep(7), // "fn calc"
|
||||
]
|
||||
);
|
||||
|
||||
let ops2 = diff.push_new("ulate_total(");
|
||||
assert_eq!(
|
||||
ops2,
|
||||
vec![
|
||||
CharOp::Insert("_total"), // Insert "_total"
|
||||
CharOp::Keep(5), // "ulate"
|
||||
CharOp::Delete(2), // Remove "()"
|
||||
CharOp::Keep(1), // "("
|
||||
]
|
||||
);
|
||||
|
||||
let ops3 = diff.push_new("items: &[Item]) {\n items.iter().sum()\n}");
|
||||
assert_eq!(
|
||||
ops3,
|
||||
vec![
|
||||
CharOp::Insert("items: &[Item]"),
|
||||
CharOp::Keep(4), // ") {\n"
|
||||
CharOp::Delete(10), // Remove " todo!()"
|
||||
CharOp::Insert(" items.iter().sum()"),
|
||||
CharOp::Keep(2), // "\n}"
|
||||
]
|
||||
);
|
||||
|
||||
// The magic: we computed a valid diff while text was still arriving!
|
||||
}
|
||||
52
worlds_fair_talk/slides/09_empty_old_text_problem.rs
Normal file
52
worlds_fair_talk/slides/09_empty_old_text_problem.rs
Normal file
@@ -0,0 +1,52 @@
|
||||
// As we run the unit eval, we discover problems
|
||||
// Some of which can be solved algorithmically and tested deterministically
|
||||
|
||||
// This prompt change helped:
|
||||
//
|
||||
// - `<old_text>` cannot be empty
|
||||
|
||||
// But the model still wasn't perfect, so we then wrote a deterministic test to
|
||||
// gracefully handle the edge case:
|
||||
#[gpui::test(iterations = 100)]
|
||||
async fn test_empty_old_text(cx: &mut TestAppContext, mut rng: StdRng) {
|
||||
let agent = init_test(cx).await;
|
||||
let buffer = cx.new(|cx| {
|
||||
Buffer::local(
|
||||
indoc! {"
|
||||
abc
|
||||
def
|
||||
ghi
|
||||
"},
|
||||
cx,
|
||||
)
|
||||
});
|
||||
let (apply, _events) = agent.edit(
|
||||
buffer.clone(),
|
||||
String::new(),
|
||||
&LanguageModelRequest::default(),
|
||||
&mut cx.to_async(),
|
||||
);
|
||||
cx.run_until_parked();
|
||||
|
||||
simulate_llm_output(
|
||||
&agent,
|
||||
indoc! {"
|
||||
<old_text></old_text>
|
||||
<new_text>jkl</new_text>
|
||||
<old_text>def</old_text>
|
||||
<new_text>DEF</new_text>
|
||||
"},
|
||||
&mut rng,
|
||||
cx,
|
||||
);
|
||||
apply.await.unwrap();
|
||||
|
||||
pretty_assertions::assert_eq!(
|
||||
buffer.read_with(cx, |buffer, _| buffer.snapshot().text()),
|
||||
indoc! {"
|
||||
abc
|
||||
DEF
|
||||
ghi
|
||||
"}
|
||||
);
|
||||
}
|
||||
101
worlds_fair_talk/slides/10_mismatched_tags.rs
Normal file
101
worlds_fair_talk/slides/10_mismatched_tags.rs
Normal file
@@ -0,0 +1,101 @@
|
||||
// Another tricky case with XML: Tag mismatches
|
||||
|
||||
// Initial pass rate: 60%
|
||||
|
||||
// PROMPT FIX:
|
||||
// Added: "Always close all tags properly"
|
||||
|
||||
// After prompt fix: 95% pass rate
|
||||
// That last 5% wouldn't budge, so we made the parser forgiving:
|
||||
|
||||
#[gpui::test(iterations = 1000)]
|
||||
fn test_mismatched_tags(mut rng: StdRng) {
|
||||
let mut parser = EditParser::new();
|
||||
assert_eq!(
|
||||
parse_random_chunks(
|
||||
// Reduced from an actual Sonnet 3.7 output
|
||||
indoc! {"
|
||||
<old_text>
|
||||
a
|
||||
b
|
||||
c
|
||||
</new_text>
|
||||
<new_text>
|
||||
a
|
||||
B
|
||||
c
|
||||
</old_text>
|
||||
<old_text>
|
||||
d
|
||||
e
|
||||
f
|
||||
</new_text>
|
||||
<new_text>
|
||||
D
|
||||
e
|
||||
F
|
||||
</old_text>
|
||||
"},
|
||||
&mut parser,
|
||||
&mut rng
|
||||
),
|
||||
vec![
|
||||
Edit {
|
||||
old_text: "a\nb\nc".to_string(),
|
||||
new_text: "a\nB\nc".to_string(),
|
||||
},
|
||||
Edit {
|
||||
old_text: "d\ne\nf".to_string(),
|
||||
new_text: "D\ne\nF".to_string(),
|
||||
}
|
||||
]
|
||||
);
|
||||
assert_eq!(
|
||||
parser.finish(),
|
||||
EditParserMetrics {
|
||||
tags: 4,
|
||||
mismatched_tags: 4
|
||||
}
|
||||
);
|
||||
|
||||
let mut parser = EditParser::new();
|
||||
assert_eq!(
|
||||
parse_random_chunks(
|
||||
// Reduced from an actual Opus 4 output
|
||||
indoc! {"
|
||||
<edits>
|
||||
<old_text>
|
||||
Lorem
|
||||
</old_text>
|
||||
<new_text>
|
||||
LOREM
|
||||
</edits>
|
||||
"},
|
||||
&mut parser,
|
||||
&mut rng
|
||||
),
|
||||
vec![Edit {
|
||||
old_text: "Lorem".to_string(),
|
||||
new_text: "LOREM".to_string(),
|
||||
},]
|
||||
);
|
||||
assert_eq!(
|
||||
parser.finish(),
|
||||
EditParserMetrics {
|
||||
tags: 2,
|
||||
mismatched_tags: 1
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
if &self.buffer[tag_range.clone()] != OLD_TEXT_END_TAG {
|
||||
self.metrics.mismatched_tags += 1;
|
||||
// Keep parsing anyway - don't let bad XML stop us
|
||||
}
|
||||
|
||||
// We track mismatched tags across all evals and fail if > 5%:
|
||||
let mismatched_tag_ratio =
|
||||
cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
|
||||
if mismatched_tag_ratio > 0.05 {
|
||||
panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
|
||||
}
|
||||
101
worlds_fair_talk/slides/11_indentation.rs
Normal file
101
worlds_fair_talk/slides/11_indentation.rs
Normal file
@@ -0,0 +1,101 @@
|
||||
// EVAL 3: Models stripped indentation, breaking everything
|
||||
|
||||
#[test]
|
||||
fn eval_indented_code_editing() {
|
||||
eval(100, 0.95, "Edit nested function", || async {
|
||||
let buffer = Buffer::local(
|
||||
indoc! {"
|
||||
fn outer() {
|
||||
fn inner() {
|
||||
todo!()
|
||||
}
|
||||
}"
|
||||
}
|
||||
);
|
||||
|
||||
// Ask to modify the inner function
|
||||
agent.edit(
|
||||
buffer.clone(),
|
||||
"Replace todo with return 42",
|
||||
&conversation,
|
||||
).0.await;
|
||||
|
||||
// Models sent:
|
||||
// <old_text>
|
||||
// fn inner() {
|
||||
// todo!()
|
||||
// }
|
||||
// </old_text>
|
||||
// <new_text>
|
||||
// fn inner() {
|
||||
// return 42
|
||||
// }
|
||||
// </new_text>
|
||||
|
||||
// Code has 8-space indent, model used 4!
|
||||
});
|
||||
}
|
||||
|
||||
// This eval failure drove us to build an algorithmic solution:
|
||||
|
||||
fn calculate_indent_delta(buffer_text: &str, llm_text: &str) -> IndentDelta {
|
||||
let buffer_indent = detect_indent(buffer_text); // 8 spaces
|
||||
let llm_indent = detect_indent(llm_text); // 0 spaces
|
||||
|
||||
IndentDelta::Spaces(buffer_indent - llm_indent) // +8
|
||||
}
|
||||
|
||||
// Which we could then test deterministically:
|
||||
|
||||
#[gpui::test(iterations = 100)]
|
||||
async fn test_indentation(cx: &mut TestAppContext, mut rng: StdRng) {
|
||||
let agent = init_test(cx).await;
|
||||
let buffer = cx.new(|cx| {
|
||||
Buffer::local(
|
||||
indoc! {"
|
||||
lorem
|
||||
ipsum
|
||||
dolor
|
||||
sit
|
||||
"},
|
||||
cx,
|
||||
)
|
||||
});
|
||||
let (apply, _events) = agent.edit(
|
||||
buffer.clone(),
|
||||
String::new(),
|
||||
&LanguageModelRequest::default(),
|
||||
&mut cx.to_async(),
|
||||
);
|
||||
|
||||
simulate_llm_output(
|
||||
&agent,
|
||||
indoc! {"
|
||||
<old_text>
|
||||
ipsum
|
||||
dolor
|
||||
sit
|
||||
</old_text>
|
||||
<new_text>
|
||||
ipsum
|
||||
dolor
|
||||
sit
|
||||
amet
|
||||
</new_text>
|
||||
"},
|
||||
&mut rng,
|
||||
cx,
|
||||
);
|
||||
apply.await.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
buffer.read_with(cx, |buffer, _| buffer.snapshot().text()),
|
||||
indoc! {"
|
||||
lorem
|
||||
ipsum
|
||||
dolor
|
||||
sit
|
||||
amet
|
||||
"}
|
||||
);
|
||||
}
|
||||
44
worlds_fair_talk/slides/12_escaping.rs
Normal file
44
worlds_fair_talk/slides/12_escaping.rs
Normal file
@@ -0,0 +1,44 @@
|
||||
// EVAL 4: Models kept escaping characters inside XML tags
|
||||
|
||||
#[test]
|
||||
fn eval_string_escaping() {
|
||||
eval(100, 0.95, "Edit string with quotes", || async {
|
||||
let buffer = Buffer::local(
|
||||
r#"let msg = "Hello, world!";"#
|
||||
);
|
||||
|
||||
// Ask to change the message
|
||||
let (task, _) = agent.edit(
|
||||
buffer.clone(),
|
||||
"Change message to say goodbye",
|
||||
&conversation,
|
||||
);
|
||||
|
||||
let edited = task.await.unwrap();
|
||||
assert_eq!(edited.text(), r#"let msg = "Goodbye, world!";"#);
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
// What models would generate:
|
||||
// <old_text>let msg = "Hello, world!";</old_text>
|
||||
// Or...
|
||||
// <old_text>let msg = \"Hello, world!\";</old_text>
|
||||
|
||||
// Fuzzy matcher can't find escaped version in buffer!
|
||||
|
||||
// Even more chaos with newlines:
|
||||
// Buffer: "fn test() {\n println!(\"hi\");\n}"
|
||||
// Model: "<old_text>fn test() {\\n println!(\\"hi\\");\\n}</old_text>"
|
||||
|
||||
// This was particularly bad for Gemini (only 35% pass rate!)
|
||||
|
||||
// PROMPT FIX
|
||||
// Added: "Do not escape quotes, newlines, or other characters within tags"
|
||||
|
||||
// Impact across models:
|
||||
// Gemini-2.5-pro: 35% → 86% (massive!)
|
||||
// Claude-3.7: 96% → 98%
|
||||
// GPT-4.1: 81% → 100%
|
||||
|
||||
// One line fixed Gemini's worst failure mode
|
||||
9
worlds_fair_talk/slides/13_lessons_learned.md
Normal file
9
worlds_fair_talk/slides/13_lessons_learned.md
Normal file
@@ -0,0 +1,9 @@
|
||||
## Takeaways
|
||||
|
||||
1. Rigorous automated testing is fundamental to software reliability.
|
||||
2. Language models require an empirical, statistical approach.
|
||||
3. Useful AI-enabled software is an interplay of stochastic and deterministic components.
|
||||
4. Reliable AI-enabled software requires multiple layers of testing:
|
||||
- Stochastic integration tests: a.k.a. "Evals"
|
||||
- Stochastic unit tests
|
||||
- Deterministic unit tests
|
||||
Reference in New Issue
Block a user