Compare commits

...

10 Commits

Author SHA1 Message Date
Nathan Sobo
abeb64e4f1 WIP 2025-06-05 14:23:59 -07:00
Nathan Sobo
855ab86c68 Add some slides about the grep tool 2025-06-04 17:21:32 -07:00
Nathan Sobo
e11aeb7b5d Delete notes, refine slides 2025-06-04 13:26:58 -07:00
Nathan Sobo
744b764b31 Don't worry about actually running the talk
Nice idea, out of time.
2025-06-04 10:33:34 -07:00
Nathan Sobo
48f3eeca81 Move talk 2025-06-04 09:39:57 -07:00
Nathan Sobo
d60c72f146 Update slides 2025-06-04 09:38:44 -07:00
Nathan Sobo
f7fab92b9b Add worlds_fair_talk crate with 'CI in the Era of AI' presentation
This talk explores how Zed's testing philosophy evolved when integrating
language models, using the streaming edits feature as a case study.

Key topics covered:
- Deterministic testing foundations at Zed
- Introduction of statistical testing (evals) for LLM behavior
- Specific eval failures that drove prompt evolution
- Property-based testing for algorithmic components
- Lessons learned about testing stochastic systems

The talk demonstrates how empirical methods become essential when working
with LLMs, showing concrete examples of how eval failures led to both
prompt improvements and algorithmic solutions.
2025-06-04 08:54:30 -07:00
Nathan Sobo
426cfe434e Add worlds_fair_talk: CI in the Era of AI presentation
This talk explores how we evolved our testing philosophy when integrating
language models into Zed, specifically for the streaming edits feature.

Key topics covered:
- Deterministic testing foundations at Zed
- Introduction of stochastic unit tests for LLM behavior
- Streaming edits implementation challenges
- Eval-driven prompt engineering discoveries
- Property-based testing for algorithmic components
- Lessons learned about empirical testing with AI

The talk is structured as executable code examples with accompanying
speaker notes, demonstrating real failures and solutions from our
development process.
2025-06-03 15:58:43 -07:00
Nathan Sobo
bf9def0fbe Merge origin/main into ai-worlds-fair, preserving important comments 2025-06-03 10:17:42 -07:00
Nathan Sobo
53fb5e6ac8 Comment interesting fodder for the talk 2025-05-22 11:09:23 -06:00
20 changed files with 853 additions and 4 deletions

View File

@@ -238,7 +238,7 @@ impl EditAgent {
let (output, edit_events) = Self::parse_edit_chunks(edit_chunks, cx);
let mut edit_events = edit_events.peekable();
while let Some(edit_event) = Pin::new(&mut edit_events).peek().await {
// Skip events until we're at the start of a new edit.
// Salta gli eventi finché non siamo all'inizio di una nuova modifica.
let Ok(EditParserEvent::OldTextChunk { .. }) = edit_event else {
edit_events.next().await.unwrap()?;
continue;
@@ -246,8 +246,8 @@ impl EditAgent {
let snapshot = buffer.read_with(cx, |buffer, _| buffer.snapshot())?;
// Resolve the old text in the background, updating the agent
// location as we keep refining which range it corresponds to.
// Risolvi il vecchio testo in background, aggiornando la posizione
// dell'agente mentre continuiamo a perfezionare a quale intervallo corrisponde.
let (resolve_old_text, mut old_range) =
Self::resolve_old_text(snapshot.text.clone(), edit_events, cx);
while let Ok(old_range) = old_range.recv().await {
@@ -726,6 +726,9 @@ mod tests {
);
cx.run_until_parked();
// !talk: Questo è un test unitario più tradizionale.
// !talk: È randomizzato, ma ancora fondamentalmente deterministico.
// !talk: Ma comunque rilevante per lavorare con un LLM
simulate_llm_output(
&agent,
indoc! {"
@@ -749,6 +752,7 @@ mod tests {
);
}
// !talk: Really interesting unit test - Again about purely algorithmic code but critical to performance on the task.
#[gpui::test(iterations = 100)]
async fn test_indentation(cx: &mut TestAppContext, mut rng: StdRng) {
let agent = init_test(cx).await;

View File

@@ -75,6 +75,8 @@ impl EditParser {
chunk.pop();
}
// !talk: We're tolerant of mismatched tags because we couldn't get this to zero
// !talk: Seems like things are more likely on distribution if the model gets this right, but we don't really know.
self.metrics.tags += 1;
if &self.buffer[tag_range.clone()] != OLD_TEXT_END_TAG {
self.metrics.mismatched_tags += 1;
@@ -333,6 +335,7 @@ mod tests {
);
}
// !talk: This is the traditional randomized test on the parser covering the last N%.
#[gpui::test(iterations = 1000)]
fn test_mismatched_tags(mut rng: StdRng) {
let mut parser = EditParser::new();

View File

@@ -160,6 +160,7 @@ fn eval_delete_run_git_blame() {
);
}
// !talk: Go here after zoomed out eval.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
@@ -176,7 +177,7 @@ fn eval_translate_doc_comments() {
let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
let edit_description = "Translate all doc comments to Italian";
eval(
200,
100,
1.,
EvalInput::from_conversation(
vec![
@@ -1349,6 +1350,8 @@ fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
);
}
// !talk: Here's a blanket assertion we added to the eval tracking the presence of mismatched tags
// !talk: It's run on every eval because it's a cross cutting concern.
let mismatched_tag_ratio =
cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
if mismatched_tag_ratio > 0.05 {

View File

@@ -1,3 +1,5 @@
# Slide 2: Evals. This is our equivalent of swebench, but on our own codebase
url = "https://github.com/zed-industries/zed.git"
revision = "38fcadf9481d018543c65f36ac3bafeba190179b"
language_extension = "rs"

9
talk.md Normal file
View File

@@ -0,0 +1,9 @@
- Start with the find and replace diff card eval
- /Users/nathan/src/zed/crates/eval/src/examples/find_and_replace_diff_card.toml
- Zoom in on streaming edits
- /Users/nathan/src/zed/crates/assistant_tools/src/edit_agent/evals.rs
- fn eval_translate_doc_comments() {
- Unit tests on streaming edits
- Show the mismatched

View File

@@ -0,0 +1,75 @@
# Worlds Fair Talk: CI in the Era of AI
This crate contains the materials for Nathan Sobo's talk "CI in the Era of AI: From Unit Tests to Stochastic Evals" presented at the AI Engineer World's Fair.
## Overview
The talk explores how Zed's testing philosophy evolved when integrating language models, using the streaming edits feature as a case study. It demonstrates the shift from purely deterministic testing to embracing statistical methods when working with inherently stochastic systems.
## Structure
The talk is organized as numbered source files with accompanying speaker notes:
### Slides (in `src/`)
- `00_intro.md` - Title slide and introduction
- `01_deterministic_testing_at_zed.rs` - Zed's traditional deterministic testing approach
- `02_stochastic_unit_tests.rs` - Introduction to statistical testing for LLMs
- `03_streaming_edits_overview.md` - Overview of the streaming edits challenge
- `04_deterministic_streaming_tests.rs` - Traditional tests for algorithmic components
- `05_empty_old_text_problem.rs` - First eval failure: empty old_text bug
- `06_tag_mismatch_discovery.rs` - XML tag mismatch issues (5% failure rate)
- `07_the_indentation_discovery.rs` - Indentation problem and algorithmic solution
- `08_escaping_chaos.rs` - Character escaping issues (especially for Gemini)
- `09_lessons_learned.md` - Key takeaways about testing with LLMs
### Speaker Notes (in `notes/`)
Each slide has a corresponding `.md` file with speaker notes in the `notes/` directory.
## Key Concepts
### Streaming Edits Feature
- Allows users to see AI code edits character-by-character as they're generated
- Works around API limitations where tool calling can't stream edit content
- Uses a two-phase approach: tool call for intent, then raw text streaming
### Testing Evolution
1. **Deterministic Tests**: For parsing, algorithms, indentation adjustment
2. **Statistical Tests (Evals)**: For LLM behavior, requiring threshold pass rates
3. **Property-Based Tests**: For comprehensive algorithmic validation
### Major Discoveries
- **Empty old_text**: 0% → 99% pass rate with one prompt line
- **Tag mismatches**: Models mess up XML closing tags, made parser tolerant
- **Indentation**: Built automatic adjustment algorithm
- **Character escaping**: Gemini went from 35% → 86% with one instruction
## Historical Context
The prompt evolution was driven by specific eval failures:
- Commit `ab017129d8` (May 22, 2025) by Oleksiy Syvokon made major improvements:
- Gemini: 35% → 86%
- Claude: 96% → 98%
- GPT-4: 81% → 100%
## Talk Duration
Approximately 15 minutes, designed to move quickly through concrete examples.
## Building the Talk
This crate is not meant to be compiled - the code examples are illustrative and may use simplified types for clarity. The actual implementation lives in `crates/assistant_tools/`.
## Future Work
If continuing this talk:
- Consider adding `test_edit_events` showing real-time event streaming
- The `eval_add_overwrite_test` has surprisingly low pass rates (16-35%) and might reveal interesting failure modes
- More examples of property-based testing could strengthen the deterministic testing section
## Key Message
The core thesis: When building on LLMs, you must embrace empirical methods. You can't reason about their behavior - you can only measure it. This requires:
1. Statistical thresholds instead of binary pass/fail
2. Learning from failure patterns
3. Accepting imperfection and building resilient systems
4. Layering deterministic and statistical tests appropriately

View File

@@ -0,0 +1,5 @@
# CI in the Era of AI: From Unit Tests to Stochastic Evals
## Evolving Zed's testing philosophy to embrace LLMs
### Nathan Sobo Co-founder of Zed

View File

@@ -0,0 +1,38 @@
// Our foundation: deterministic tests with controlled randomness
//
// The test-owned `executor` controls scheduling and the clock, so the
// disconnect/reconnect sequence below replays deterministically on every
// iteration. Asserts that two collaborating clients converge to the same
// buffer text after concurrent edits and a network partition.
#[gpui::test(iterations = 50)]
async fn test_collaborative_editing(executor: BackgroundExecutor) {
    // Start a test server and two clients that will collaborate.
    let mut server = TestServer::start(executor.clone()).await;
    let client_a = server.create_client("user_a").await;
    let client_b = server.create_client("user_b").await;
    // Create shared project
    let project_a = client_a.build_local_project("/code").await;
    let project_id = project_a.borrow_mut().share().await.unwrap();
    // Client B joins
    let project_b = client_b.join_remote_project(project_id).await;
    // Open same buffer
    let buffer_a = project_a.borrow_mut()
        .open_local_buffer("/code/main.rs").await.unwrap();
    let buffer_b = project_b.borrow_mut()
        .open_buffer("main.rs").await.unwrap();
    // Concurrent edits: both insert at the top of the buffer.
    buffer_a.borrow_mut().edit([(0..0, "// A's edit\n")]);
    buffer_b.borrow_mut().edit([(0..0, "// B's edit\n")]);
    // Controlled network failures: cut A's connection, then let the
    // receive timeout elapse on the simulated clock.
    server.disconnect_client(client_a.peer_id().unwrap());
    executor.advance_clock(RECEIVE_TIMEOUT);
    // B continues editing while A is disconnected
    buffer_b.borrow_mut().edit([(24..24, "// B alone\n")]);
    // A reconnects
    executor.advance_clock(RECONNECT_TIMEOUT);
    executor.run_until_parked();
    // Clear pass/fail - reproducible every time
    assert_eq!(buffer_a.borrow().text(), buffer_b.borrow().text());
}

View File

@@ -0,0 +1,45 @@
# Slide 2: Evals. This is our equivalent of swebench, but on our own codebase
# Repository and pinned revision the eval checks out before running.
url = "https://github.com/zed-industries/zed.git"
revision = "38fcadf9481d018543c65f36ac3bafeba190179b"
language_extension = "rs"
# The user prompt handed to the agent at the start of the run.
prompt = """
Look at the `find_replace_file_tool.rs`. I want to implement a card for it.
The card should implement the `Render` trait.
The card should show a diff. It should be a beautifully presented diff.
The card "box" should look like what we show for markdown codeblocks (look at `MarkdownElement`).
I want to see a red background for lines that were deleted and a green background for lines
that were added. We should have a div per diff line.
"""
# Judged assertions about the final diff the agent produces.
[diff_assertions]
modify_find_and_replace_tool = """
The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct.
The struct should contain an `output` field that is the same as the task we were returning before,
and a new `card` field that contains a view for the card.
"""
card_implementation = """
The card should be a view that displays a diff.
Each line in the diff should be colored according to whether it was added, removed or unchanged.
"""
# Judged assertions about the agent's tool-call sequence (the "thread").
[thread_assertions]
path_search = """
The first tool call should be to path search including "find_replace_file_tool.rs" in the string.
(*Not* grep, for example, or reading the file based on a guess at the path.)
This is because we gave the model a filename and it needs to turn that into a real path.
"""
read_file_from_path_search = """
After obtaining the correct path of "zed/crates/assistant_tools/src/find_replace_file_tool.rs", it should read the contents of that path.
"""
symbol_search = """
When trying to find information about the Render trait, it should *not* begin with a path search, because it doesn't yet have any information
on what path the Render trait might be in.
"""

View File

@@ -0,0 +1,153 @@
use std::path::Path;
use agent_settings::AgentProfileId;
use anyhow::Result;
use async_trait::async_trait;
use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion, LanguageServer};
/// Eval example: ask the agent to add a `window: Option<gpui::AnyWindowHandle>`
/// argument to the `Tool::run` trait method and propagate it through every
/// implementation and call site, then grade the resulting diff.
pub struct AddArgToTraitMethod;

#[async_trait(?Send)]
impl Example for AddArgToTraitMethod {
    /// Static metadata: which repo/revision to run against and how the run
    /// is configured (language server on, default profile, no turn limit).
    fn meta(&self) -> ExampleMetadata {
        ExampleMetadata {
            name: "add_arg_to_trait_method".to_string(),
            url: "https://github.com/zed-industries/zed.git".to_string(),
            revision: "f69aeb6311dde3c0b8979c293d019d66498d54f2".to_string(),
            language_server: Some(LanguageServer {
                file_extension: "rs".to_string(),
                allow_preexisting_diagnostics: false,
            }),
            max_assertions: None,
            profile_id: AgentProfileId::default(),
            existing_thread_json: None,
            max_turns: None,
        }
    }

    /// Drives the conversation, then makes programmatic assertions about the
    /// edits the agent produced.
    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
        const FILENAME: &str = "assistant_tool.rs";
        cx.push_user_message(format!(
            r#"
Add a `window: Option<gpui::AnyWindowHandle>` argument to the `Tool::run` trait method in {FILENAME},
and update all the implementations of the trait and call sites accordingly.
"#
        ));
        let _ = cx.run_to_end().await?;
        // Adds ignored argument to all but `batch_tool`
        let add_ignored_window_paths = &[
            "code_action_tool",
            "code_symbols_tool",
            "contents_tool",
            "copy_path_tool",
            "create_directory_tool",
            "create_file_tool",
            "delete_path_tool",
            "diagnostics_tool",
            "edit_file_tool",
            "fetch_tool",
            "grep_tool",
            "list_directory_tool",
            "move_path_tool",
            "now_tool",
            "open_tool",
            "path_search_tool",
            "read_file_tool",
            "rename_tool",
            "symbol_info_tool",
            "terminal_tool",
            "thinking_tool",
            "web_search_tool",
        ];
        let edits = cx.edits();
        for tool_name in add_ignored_window_paths {
            let path_str = format!("crates/assistant_tools/src/{}.rs", tool_name);
            let edits = edits.get(Path::new(&path_str));
            // These tools don't use the new argument, so the ideal edit uses
            // the `_`-prefixed form; the weaker assertion accepts either.
            let ignored = edits.map_or(false, |edits| {
                edits.has_added_line(" _window: Option<gpui::AnyWindowHandle>,\n")
            });
            let unignored = edits.map_or(false, |edits| {
                edits.has_added_line(" window: Option<gpui::AnyWindowHandle>,\n")
            });
            cx.assert(ignored || unignored, format!("Argument: {}", tool_name))
                .ok();
            cx.assert(ignored, format!("`_` prefix: {}", tool_name))
                .ok();
        }
        // Adds unignored argument to `batch_tool`
        let batch_tool_edits = edits.get(Path::new("crates/assistant_tools/src/batch_tool.rs"));
        cx.assert(
            batch_tool_edits.map_or(false, |edits| {
                edits.has_added_line(" window: Option<gpui::AnyWindowHandle>,\n")
            }),
            "Argument: batch_tool",
        )
        .ok();
        Ok(())
    }

    /// Assertions handed to the LLM judge about the diff as a whole.
    fn diff_assertions(&self) -> Vec<JudgeAssertion> {
        vec![
            JudgeAssertion {
                id: "batch tool passes window to each".to_string(),
                description:
                    "batch_tool is modified to pass a clone of the window to each tool it calls."
                        .to_string(),
            },
            JudgeAssertion {
                id: "tool tests updated".to_string(),
                description:
                    "tool tests are updated to pass the new `window` argument (`None` is ok)."
                        .to_string(),
            },
        ]
    }
}
// Grep results are reported with their enclosing syntax path
// (mod > mod > fn) and line range, plus the full matched item body.
#[gpui::test]
async fn test_grep_function_args_and_body(cx: &mut TestAppContext) {
    let project = setup_syntax_test(cx).await;
    // Test: Line with a function argument
    let input = serde_json::to_value(GrepToolInput {
        regex: "second_arg".to_string(),
        include_pattern: Some("**/*.rs".to_string()),
        offset: 0,
        case_sensitive: false,
    })
    .unwrap();
    let result = run_grep_tool(input, project.clone(), cx).await;
    // NOTE(review): indentation inside this expected block appears stripped
    // in this view — confirm against the original fixture.
    let expected = r#"
Found 1 matches:
## Matches in root/test_syntax.rs
### mod feature_module pub mod nested_module pub fn nested_function L7-14
```
pub fn nested_function(
first_arg: String,
second_arg: i32,
) {
println!("Function in nested module");
println!("{first_arg}");
println!("{second_arg}");
}
```
"#
    .unindent();
    assert_eq!(result, expected);
}

View File

@@ -0,0 +1,15 @@
Found 1 matches:
## Matches in crates/assistant_tool/src/assistant_tool.rs
### Lines 238-241
```rs
}
/// Runs the tool with the provided input.
fn run(
self: Arc<Self>,
input: serde_json::Value,
request: Arc<LanguageModelRequest>,
```

View File

@@ -0,0 +1,38 @@
// Grep for a function definition and check the syntax-aware result layout.
#[gpui::test]
async fn test_grep_function_args_and_body(cx: &mut TestAppContext) {
    let project = setup_syntax_test(cx).await;
    // Test: Line with a function argument
    let input = serde_json::to_value(GrepToolInput {
        // `(` is a regex metacharacter: an unescaped "fn run(" is an invalid
        // pattern (unclosed group), so escape it to match the literal text.
        regex: "fn run\\(".to_string(),
        include_pattern: Some("**/*.rs".to_string()),
        offset: 0,
        case_sensitive: false,
    })
    .unwrap();
    let result = run_grep_tool(input, project.clone(), cx).await;
    // NOTE(review): indentation inside this expected block appears stripped
    // in this view — confirm against the original fixture.
    let expected = r#"
Found 1 matches:
## Matches in crates/assistant_tool/src/assistant_tool.rs
### trait AssistantTool fn run L238-241
```rs
/// Runs the tool with the provided input.
fn run(
self: Arc<Self>,
input: serde_json::Value,
request: Arc<LanguageModelRequest>,
project: Entity<Project>,
action_log: Entity<ActionLog>,
model: Arc<dyn LanguageModel>,
window: Option<AnyWindowHandle>,
cx: &mut App,
) -> ToolResult;
```
"#
    .unindent();
    assert_eq!(result, expected);
}

View File

@@ -0,0 +1,19 @@
# Streaming Edits
Show the model edits as they happen, token by token.
## Challenges
1. Tool calling doesn't stream
- JSON values must be complete before they are streamed
- We can't use tool calling alone if we want to see streaming text
- We ask it to stream `<old_text>` and `<new_text>` blocks
2. Parsing Complexity: XML tags arrive in random chunks
- `</old_te` + `xt>` (split across network packets)
- Must buffer and parse incrementally
3. Imperfect Model Behavior: Models don't follow instructions perfectly
- Wrong closing tags: `<old_text>...</new_text>`
- Inconsistent indentation and whitespace
- Escaping

View File

@@ -0,0 +1,54 @@
// When AI enters the equation, we need a new approach
// We test AI features by sampling their behavior:
#[test]
fn eval_translate_doc_comments() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    // Run the scenario 200 times and require every run to pass (ratio 1.0).
    eval(
        200,
        1.,
        EvalInput::from_conversation(
            vec![
                // Turn 1: the user asks for the translation edit.
                message(
                    User,
                    [text(formatdoc! {"
Read the {input_file_path} file and edit it (without overwriting it),
translating all the doc comments to italian.
"})],
                ),
                // Turn 2: the model reads the file...
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                // ...and receives its contents as the tool result.
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // Turn 3: the model issues the edit_file call under test.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            // An LLM judge grades the resulting diff against this assertion.
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}

View File

@@ -0,0 +1,79 @@
// Some streaming edit problems can be tested the old-fashioned way!
// 1. Parser must handle chunks split ANYWHERE
#[gpui::test(iterations = 100)]
fn test_parser_random_chunks(mut rng: StdRng) {
    let input = "<old_text>hello world</old_text><new_text>goodbye</new_text>";
    // Split the input at random boundaries, 1 to 10 bytes per chunk.
    let mut pieces = vec![];
    let mut start = 0;
    loop {
        if start >= input.len() {
            break;
        }
        let len = rng.gen_range(1..=10);
        let stop = usize::min(start + len, input.len());
        pieces.push(&input[start..stop]);
        start = stop;
    }
    // Feed every chunk through the parser: no matter how the input was
    // split, the parsed events must come out the same.
    let mut parser = EditParser::new();
    let mut events = Vec::new();
    for piece in &pieces {
        events.extend(parser.push(piece));
    }
    assert_eq!(
        events,
        vec![Event::OldText("hello world"), Event::NewText("goodbye")]
    );
}
// 2. Fuzzy matcher algorithm (without LLM input)
#[test]
fn test_fuzzy_match_algorithm() {
    // The matcher should locate the query even when whitespace differs.
    let haystack = "fn calculate_price() {\n // TODO\n}";
    let needle = "fn calculate_price() {"; // Extra spaces
    let found = FuzzyMatcher::new(haystack).find(needle);
    assert_eq!(found, Some(0..24)); // Found despite whitespace
}
// 3. Streaming diff computes edits incrementally
//
// Each `push_new` call returns the character-level ops for the text that
// has arrived so far, without waiting for the full replacement.
#[test]
fn test_streaming_diff() {
    let old_text = "fn calculate() {\n todo!()\n}";
    let mut diff = StreamingDiff::new(old_text);
    // Simulate new text arriving in chunks
    let ops1 = diff.push_new("fn calc");
    assert_eq!(
        ops1,
        vec![
            CharOp::Keep(7), // "fn calc"
        ]
    );
    let ops2 = diff.push_new("ulate_total(");
    assert_eq!(
        ops2,
        vec![
            CharOp::Insert("_total"), // Insert "_total"
            CharOp::Keep(5), // "ulate"
            CharOp::Delete(2), // Remove "()"
            CharOp::Keep(1), // "("
        ]
    );
    let ops3 = diff.push_new("items: &[Item]) {\n items.iter().sum()\n}");
    assert_eq!(
        ops3,
        vec![
            CharOp::Insert("items: &[Item]"),
            CharOp::Keep(4), // ") {\n"
            CharOp::Delete(10), // Remove " todo!()"
            CharOp::Insert(" items.iter().sum()"),
            CharOp::Keep(2), // "\n}"
        ]
    );
    // The magic: we computed a valid diff while text was still arriving!
}

View File

@@ -0,0 +1,52 @@
// As we run the unit eval, we discover problems
// Some of which can be solved algorithmically and tested deterministically
// This prompt change helped:
//
// - `<old_text>` cannot be empty
// But the model still wasn't perfect: So we then wrote a deterministic test to
// gracefully handle the edge case:
#[gpui::test(iterations = 100)]
async fn test_empty_old_text(cx: &mut TestAppContext, mut rng: StdRng) {
    let agent = init_test(cx).await;
    // Three-line fixture buffer.
    let buffer = cx.new(|cx| {
        Buffer::local(
            indoc! {"
abc
def
ghi
"},
            cx,
        )
    });
    let (apply, _events) = agent.edit(
        buffer.clone(),
        String::new(),
        &LanguageModelRequest::default(),
        &mut cx.to_async(),
    );
    cx.run_until_parked();
    // The first simulated edit has an empty <old_text>; the second is valid.
    simulate_llm_output(
        &agent,
        indoc! {"
<old_text></old_text>
<new_text>jkl</new_text>
<old_text>def</old_text>
<new_text>DEF</new_text>
"},
        &mut rng,
        cx,
    );
    apply.await.unwrap();
    // Only the valid edit lands: `def` -> `DEF`, and no `jkl` is inserted.
    pretty_assertions::assert_eq!(
        buffer.read_with(cx, |buffer, _| buffer.snapshot().text()),
        indoc! {"
abc
DEF
ghi
"}
    );
}

View File

@@ -0,0 +1,101 @@
// Another tricky case with XML: Tag mismatches
// Initial pass rate: 60%
// PROMPT FIX:
// Added: "Always close all tags properly"
// After prompt fix: 95% pass rate
// That last 5% wouldn't budge, so we made the parser forgiving:
#[gpui::test(iterations = 1000)]
fn test_mismatched_tags(mut rng: StdRng) {
let mut parser = EditParser::new();
assert_eq!(
parse_random_chunks(
// Reduced from an actual Sonnet 3.7 output
indoc! {"
<old_text>
a
b
c
</new_text>
<new_text>
a
B
c
</old_text>
<old_text>
d
e
f
</new_text>
<new_text>
D
e
F
</old_text>
"},
&mut parser,
&mut rng
),
vec![
Edit {
old_text: "a\nb\nc".to_string(),
new_text: "a\nB\nc".to_string(),
},
Edit {
old_text: "d\ne\nf".to_string(),
new_text: "D\ne\nF".to_string(),
}
]
);
assert_eq!(
parser.finish(),
EditParserMetrics {
tags: 4,
mismatched_tags: 4
}
);
let mut parser = EditParser::new();
assert_eq!(
parse_random_chunks(
// Reduced from an actual Opus 4 output
indoc! {"
<edits>
<old_text>
Lorem
</old_text>
<new_text>
LOREM
</edits>
"},
&mut parser,
&mut rng
),
vec![Edit {
old_text: "Lorem".to_string(),
new_text: "LOREM".to_string(),
},]
);
assert_eq!(
parser.finish(),
EditParserMetrics {
tags: 2,
mismatched_tags: 1
}
);
}
if &self.buffer[tag_range.clone()] != OLD_TEXT_END_TAG {
self.metrics.mismatched_tags += 1;
// Keep parsing anyway - don't let bad XML stop us
}
// We track mismatched tags across all evals and fail if > 5%:
let mismatched_tag_ratio =
cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
if mismatched_tag_ratio > 0.05 {
panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
}

View File

@@ -0,0 +1,101 @@
// EVAL 3: Models stripped indentation, breaking everything
#[test]
fn eval_indented_code_editing() {
eval(100, 0.95, "Edit nested function", || async {
let buffer = Buffer::local(
indoc! {"
fn outer() {
fn inner() {
todo!()
}
}"
}
);
// Ask to modify the inner function
agent.edit(
buffer.clone(),
"Replace todo with return 42",
&conversation,
).0.await;
// Models sent:
// <old_text>
// fn inner() {
// todo!()
// }
// </old_text>
// <new_text>
// fn inner() {
// return 42
// }
// </new_text>
// Code has 8-space indent, model used 4!
});
}
// This eval failure drove us to build an algorithmic solution:
fn calculate_indent_delta(buffer_text: &str, llm_text: &str) -> IndentDelta {
let buffer_indent = detect_indent(buffer_text); // 8 spaces
let llm_indent = detect_indent(llm_text); // 0 spaces
IndentDelta::Spaces(buffer_indent - llm_indent) // +8
}
// Which we could then test deterministically:
#[gpui::test(iterations = 100)]
async fn test_indentation(cx: &mut TestAppContext, mut rng: StdRng) {
let agent = init_test(cx).await;
let buffer = cx.new(|cx| {
Buffer::local(
indoc! {"
lorem
ipsum
dolor
sit
"},
cx,
)
});
let (apply, _events) = agent.edit(
buffer.clone(),
String::new(),
&LanguageModelRequest::default(),
&mut cx.to_async(),
);
simulate_llm_output(
&agent,
indoc! {"
<old_text>
ipsum
dolor
sit
</old_text>
<new_text>
ipsum
dolor
sit
amet
</new_text>
"},
&mut rng,
cx,
);
apply.await.unwrap();
assert_eq!(
buffer.read_with(cx, |buffer, _| buffer.snapshot().text()),
indoc! {"
lorem
ipsum
dolor
sit
amet
"}
);
}

View File

@@ -0,0 +1,44 @@
// EVAL 4: Models kept escaping characters inside XML tags
#[test]
fn eval_string_escaping() {
eval(100, 0.95, "Edit string with quotes", || async {
let buffer = Buffer::local(
r#"let msg = "Hello, world!";"#
);
// Ask to change the message
let (task, _) = agent.edit(
buffer.clone(),
"Change message to say goodbye",
&conversation,
);
let edited = task.await.unwrap();
assert_eq!(edited.text(), r#"let msg = "Goodbye, world!";"#);
});
}
// What models would generate:
// <old_text>let msg = &quot;Hello, world!&quot;;</old_text>
// Or...
// <old_text>let msg = \"Hello, world!\";</old_text>
// Fuzzy matcher can't find escaped version in buffer!
// Even more chaos with newlines:
// Buffer: "fn test() {\n println!(\"hi\");\n}"
// Model: "<old_text>fn test() {\\n println!(\\"hi\\");\\n}</old_text>"
// This was particularly bad for Gemini (only 35% pass rate!)
// PROMPT FIX
// Added: "Do not escape quotes, newlines, or other characters within tags"
// Impact across models:
// Gemini-2.5-pro: 35% → 86% (massive!)
// Claude-3.7: 96% → 98%
// GPT-4.1: 81% → 100%
// One line fixed Gemini's worst failure mode

View File

@@ -0,0 +1,9 @@
## Take Aways
1. Rigorous automated testing is fundamental to software reliability
2. Language models require an empirical, statistical approach.
3. Useful AI-enabled software is an interplay of stochastic and deterministic components.
4. Reliable AI-enabled software requires multiple layers of testing
- Stochastic integration tests: a.k.a. "Evals"
- Stochastic unit tests
- Deterministic unit tests