Compare commits

...

10 Commits

Author SHA1 Message Date
Nathan Sobo
abeb64e4f1 WIP 2025-06-05 14:23:59 -07:00
Nathan Sobo
855ab86c68 Add some slides about the grep tool 2025-06-04 17:21:32 -07:00
Nathan Sobo
e11aeb7b5d Delete notes, refine slides 2025-06-04 13:26:58 -07:00
Nathan Sobo
744b764b31 Don't worry about actually running the talk
Nice idea, out of time.
2025-06-04 10:33:34 -07:00
Nathan Sobo
48f3eeca81 Move talk 2025-06-04 09:39:57 -07:00
Nathan Sobo
d60c72f146 Update slides 2025-06-04 09:38:44 -07:00
Nathan Sobo
f7fab92b9b Add worlds_fair_talk crate with 'CI in the Era of AI' presentation
This talk explores how Zed's testing philosophy evolved when integrating
language models, using the streaming edits feature as a case study.

Key topics covered:
- Deterministic testing foundations at Zed
- Introduction of statistical testing (evals) for LLM behavior
- Specific eval failures that drove prompt evolution
- Property-based testing for algorithmic components
- Lessons learned about testing stochastic systems

The talk demonstrates how empirical methods become essential when working
with LLMs, showing concrete examples of how eval failures led to both
prompt improvements and algorithmic solutions.
2025-06-04 08:54:30 -07:00
Nathan Sobo
426cfe434e Add worlds_fair_talk: CI in the Era of AI presentation
This talk explores how we evolved our testing philosophy when integrating
language models into Zed, specifically for the streaming edits feature.

Key topics covered:
- Deterministic testing foundations at Zed
- Introduction of stochastic unit tests for LLM behavior
- Streaming edits implementation challenges
- Eval-driven prompt engineering discoveries
- Property-based testing for algorithmic components
- Lessons learned about empirical testing with AI

The talk is structured as executable code examples with accompanying
speaker notes, demonstrating real failures and solutions from our
development process.
2025-06-03 15:58:43 -07:00
Nathan Sobo
bf9def0fbe Merge origin/main into ai-worlds-fair, preserving important comments 2025-06-03 10:17:42 -07:00
Nathan Sobo
53fb5e6ac8 Comment interesting fodder for the talk 2025-05-22 11:09:23 -06:00
20 changed files with 853 additions and 4 deletions

View File

@@ -238,7 +238,7 @@ impl EditAgent {
let (output, edit_events) = Self::parse_edit_chunks(edit_chunks, cx);
let mut edit_events = edit_events.peekable();
while let Some(edit_event) = Pin::new(&mut edit_events).peek().await {
// Skip events until we're at the start of a new edit.
// Salta gli eventi finché non siamo all'inizio di una nuova modifica.
let Ok(EditParserEvent::OldTextChunk { .. }) = edit_event else {
edit_events.next().await.unwrap()?;
continue;
@@ -246,8 +246,8 @@ impl EditAgent {
let snapshot = buffer.read_with(cx, |buffer, _| buffer.snapshot())?;
// Resolve the old text in the background, updating the agent
// location as we keep refining which range it corresponds to.
// Risolvi il vecchio testo in background, aggiornando la posizione
// dell'agente mentre continuiamo a perfezionare a quale intervallo corrisponde.
let (resolve_old_text, mut old_range) =
Self::resolve_old_text(snapshot.text.clone(), edit_events, cx);
while let Ok(old_range) = old_range.recv().await {
@@ -726,6 +726,9 @@ mod tests {
);
cx.run_until_parked();
// !talk: Questo è un test unitario più tradizionale.
// !talk: È randomizzato, ma ancora fondamentalmente deterministico.
// !talk: Ma comunque rilevante per lavorare con un LLM
simulate_llm_output(
&agent,
indoc! {"
@@ -749,6 +752,7 @@ mod tests {
);
}
// !talk: Really interesting unit test - Again about purely algorithmic code but critical to performance on the task.
#[gpui::test(iterations = 100)]
async fn test_indentation(cx: &mut TestAppContext, mut rng: StdRng) {
let agent = init_test(cx).await;

View File

@@ -75,6 +75,8 @@ impl EditParser {
chunk.pop();
}
// !talk: We're tolerant of mismatched tags because we couldn't get this to zero
// !talk: Seems like things are more likely on distribution if the model gets this right, but we don't really know.
self.metrics.tags += 1;
if &self.buffer[tag_range.clone()] != OLD_TEXT_END_TAG {
self.metrics.mismatched_tags += 1;
@@ -333,6 +335,7 @@ mod tests {
);
}
// !talk: This is the traditional randomized test on the parser covering the last N%.
#[gpui::test(iterations = 1000)]
fn test_mismatched_tags(mut rng: StdRng) {
let mut parser = EditParser::new();

View File

@@ -160,6 +160,7 @@ fn eval_delete_run_git_blame() {
);
}
// !talk: Go here after zoomed out eval.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
@@ -176,7 +177,7 @@ fn eval_translate_doc_comments() {
let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
let edit_description = "Translate all doc comments to Italian";
eval(
200,
100,
1.,
EvalInput::from_conversation(
vec![
@@ -1349,6 +1350,8 @@ fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
);
}
// !talk: Here's a blanket assertion we added to the eval tracking the presence of mismatched tags
// !talk: It's run on every eval because it's a cross cutting concern.
let mismatched_tag_ratio =
cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
if mismatched_tag_ratio > 0.05 {

View File

@@ -1,3 +1,5 @@
# Slide 2: Evals. This is our equivalent of swebench, but on our own codebase
url = "https://github.com/zed-industries/zed.git"
revision = "38fcadf9481d018543c65f36ac3bafeba190179b"
language_extension = "rs"

9
talk.md Normal file
View File

@@ -0,0 +1,9 @@
- Start with the find and replace diff card eval
- /Users/nathan/src/zed/crates/eval/src/examples/find_and_replace_diff_card.toml
- Zoom in on streaming edits
- /Users/nathan/src/zed/crates/assistant_tools/src/edit_agent/evals.rs
- fn eval_translate_doc_comments() {
- Unit tests on streaming edits
- Show the mismatched

View File

@@ -0,0 +1,75 @@
# Worlds Fair Talk: CI in the Era of AI
This crate contains the materials for Nathan Sobo's talk "CI in the Era of AI: From Unit Tests to Stochastic Evals" presented at the AI Engineer World's Fair.
## Overview
The talk explores how Zed's testing philosophy evolved when integrating language models, using the streaming edits feature as a case study. It demonstrates the shift from purely deterministic testing to embracing statistical methods when working with inherently stochastic systems.
## Structure
The talk is organized as numbered source files with accompanying speaker notes:
### Slides (in `src/`)
- `00_intro.md` - Title slide and introduction
- `01_deterministic_testing_at_zed.rs` - Zed's traditional deterministic testing approach
- `02_stochastic_unit_tests.rs` - Introduction to statistical testing for LLMs
- `03_streaming_edits_overview.md` - Overview of the streaming edits challenge
- `04_deterministic_streaming_tests.rs` - Traditional tests for algorithmic components
- `05_empty_old_text_problem.rs` - First eval failure: empty old_text bug
- `06_tag_mismatch_discovery.rs` - XML tag mismatch issues (5% failure rate)
- `07_the_indentation_discovery.rs` - Indentation problem and algorithmic solution
- `08_escaping_chaos.rs` - Character escaping issues (especially for Gemini)
- `09_lessons_learned.md` - Key takeaways about testing with LLMs
### Speaker Notes (in `notes/`)
Each slide has a corresponding `.md` file with speaker notes in the `notes/` directory.
## Key Concepts
### Streaming Edits Feature
- Allows users to see AI code edits character-by-character as they're generated
- Works around API limitations where tool calling can't stream edit content
- Uses a two-phase approach: tool call for intent, then raw text streaming
### Testing Evolution
1. **Deterministic Tests**: For parsing, algorithms, indentation adjustment
2. **Statistical Tests (Evals)**: For LLM behavior, requiring threshold pass rates
3. **Property-Based Tests**: For comprehensive algorithmic validation
### Major Discoveries
- **Empty old_text**: 0% → 99% pass rate with one prompt line
- **Tag mismatches**: Models mess up XML closing tags, made parser tolerant
- **Indentation**: Built automatic adjustment algorithm
- **Character escaping**: Gemini went from 35% → 86% with one instruction
## Historical Context
The prompt evolution was driven by specific eval failures:
- Commit `ab017129d8` (May 22, 2025) by Oleksiy Syvokon made major improvements:
- Gemini: 35% → 86%
- Claude: 96% → 98%
- GPT-4: 81% → 100%
## Talk Duration
Approximately 15 minutes, designed to move quickly through concrete examples.
## Building the Talk
This crate is not meant to be compiled - the code examples are illustrative and may use simplified types for clarity. The actual implementation lives in `crates/assistant_tools/`.
## Future Work
If continuing this talk:
- Consider adding `test_edit_events` showing real-time event streaming
- The `eval_add_overwrite_test` has surprisingly low pass rates (16-35%) and might reveal interesting failure modes
- More examples of property-based testing could strengthen the deterministic testing section
## Key Message
The core thesis: When building on LLMs, you must embrace empirical methods. You can't reason about their behavior - you can only measure it. This requires:
1. Statistical thresholds instead of binary pass/fail
2. Learning from failure patterns
3. Accepting imperfection and building resilient systems
4. Layering deterministic and statistical tests appropriately

View File

@@ -0,0 +1,5 @@
# CI in the Era of AI: From Unit Tests to Stochastic Evals
## Evolving Zed's testing philosophy to embrace LLMs
### Nathan Sobo Co-founder of Zed

View File

@@ -0,0 +1,38 @@
// Our foundation: deterministic tests with controlled randomness
//
// The test-owned `executor` controls scheduling and the clock, so the
// disconnect/reconnect sequence below replays deterministically on every
// iteration. Asserts that two collaborating clients converge to the same
// buffer text after concurrent edits and a network partition.
#[gpui::test(iterations = 50)]
async fn test_collaborative_editing(executor: BackgroundExecutor) {
    // Start a test server and two clients that will collaborate.
    let mut server = TestServer::start(executor.clone()).await;
    let client_a = server.create_client("user_a").await;
    let client_b = server.create_client("user_b").await;
    // Create shared project
    let project_a = client_a.build_local_project("/code").await;
    let project_id = project_a.borrow_mut().share().await.unwrap();
    // Client B joins
    let project_b = client_b.join_remote_project(project_id).await;
    // Open same buffer
    let buffer_a = project_a.borrow_mut()
        .open_local_buffer("/code/main.rs").await.unwrap();
    let buffer_b = project_b.borrow_mut()
        .open_buffer("main.rs").await.unwrap();
    // Concurrent edits: both insert at the top of the buffer.
    buffer_a.borrow_mut().edit([(0..0, "// A's edit\n")]);
    buffer_b.borrow_mut().edit([(0..0, "// B's edit\n")]);
    // Controlled network failures: cut A's connection, then let the
    // receive timeout elapse on the simulated clock.
    server.disconnect_client(client_a.peer_id().unwrap());
    executor.advance_clock(RECEIVE_TIMEOUT);
    // B continues editing while A is disconnected
    buffer_b.borrow_mut().edit([(24..24, "// B alone\n")]);
    // A reconnects
    executor.advance_clock(RECONNECT_TIMEOUT);
    executor.run_until_parked();
    // Clear pass/fail - reproducible every time
    assert_eq!(buffer_a.borrow().text(), buffer_b.borrow().text());
}

View File

@@ -0,0 +1,45 @@
# Slide 2: Evals. This is our equivalent of swebench, but on our own codebase
# Repository and pinned revision the eval checks out before running.
url = "https://github.com/zed-industries/zed.git"
revision = "38fcadf9481d018543c65f36ac3bafeba190179b"
language_extension = "rs"
# The user prompt handed to the agent at the start of the run.
prompt = """
Look at the `find_replace_file_tool.rs`. I want to implement a card for it.
The card should implement the `Render` trait.
The card should show a diff. It should be a beautifully presented diff.
The card "box" should look like what we show for markdown codeblocks (look at `MarkdownElement`).
I want to see a red background for lines that were deleted and a green background for lines
that were added. We should have a div per diff line.
"""
# Judged assertions about the final diff the agent produces.
[diff_assertions]
modify_find_and_replace_tool = """
The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct.
The struct should contain an `output` field that is the same as the task we were returning before,
and a new `card` field that contains a view for the card.
"""
card_implementation = """
The card should be a view that displays a diff.
Each line in the diff should be colored according to whether it was added, removed or unchanged.
"""
# Judged assertions about the agent's tool-call sequence (the "thread").
[thread_assertions]
path_search = """
The first tool call should be to path search including "find_replace_file_tool.rs" in the string.
(*Not* grep, for example, or reading the file based on a guess at the path.)
This is because we gave the model a filename and it needs to turn that into a real path.
"""
read_file_from_path_search = """
After obtaining the correct path of "zed/crates/assistant_tools/src/find_replace_file_tool.rs", it should read the contents of that path.
"""
symbol_search = """
When trying to find information about the Render trait, it should *not* begin with a path search, because it doesn't yet have any information
on what path the Render trait might be in.
"""

View File

@@ -0,0 +1,153 @@
use std::path::Path;
use agent_settings::AgentProfileId;
use anyhow::Result;
use async_trait::async_trait;
use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion, LanguageServer};
/// Eval example: ask the agent to add a `window: Option<gpui::AnyWindowHandle>`
/// argument to the `Tool::run` trait method and propagate it through every
/// implementation and call site, then grade the resulting diff.
pub struct AddArgToTraitMethod;

#[async_trait(?Send)]
impl Example for AddArgToTraitMethod {
    /// Static metadata: which repo/revision to run against and how the run
    /// is configured (language server on, default profile, no turn limit).
    fn meta(&self) -> ExampleMetadata {
        ExampleMetadata {
            name: "add_arg_to_trait_method".to_string(),
            url: "https://github.com/zed-industries/zed.git".to_string(),
            revision: "f69aeb6311dde3c0b8979c293d019d66498d54f2".to_string(),
            language_server: Some(LanguageServer {
                file_extension: "rs".to_string(),
                allow_preexisting_diagnostics: false,
            }),
            max_assertions: None,
            profile_id: AgentProfileId::default(),
            existing_thread_json: None,
            max_turns: None,
        }
    }

    /// Drives the conversation, then makes programmatic assertions about the
    /// edits the agent produced.
    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
        const FILENAME: &str = "assistant_tool.rs";
        cx.push_user_message(format!(
            r#"
Add a `window: Option<gpui::AnyWindowHandle>` argument to the `Tool::run` trait method in {FILENAME},
and update all the implementations of the trait and call sites accordingly.
"#
        ));
        let _ = cx.run_to_end().await?;
        // Adds ignored argument to all but `batch_tool`
        let add_ignored_window_paths = &[
            "code_action_tool",
            "code_symbols_tool",
            "contents_tool",
            "copy_path_tool",
            "create_directory_tool",
            "create_file_tool",
            "delete_path_tool",
            "diagnostics_tool",
            "edit_file_tool",
            "fetch_tool",
            "grep_tool",
            "list_directory_tool",
            "move_path_tool",
            "now_tool",
            "open_tool",
            "path_search_tool",
            "read_file_tool",
            "rename_tool",
            "symbol_info_tool",
            "terminal_tool",
            "thinking_tool",
            "web_search_tool",
        ];
        let edits = cx.edits();
        for tool_name in add_ignored_window_paths {
            let path_str = format!("crates/assistant_tools/src/{}.rs", tool_name);
            let edits = edits.get(Path::new(&path_str));
            // These tools don't use the new argument, so the ideal edit uses
            // the `_`-prefixed form; the weaker assertion accepts either.
            let ignored = edits.map_or(false, |edits| {
                edits.has_added_line(" _window: Option<gpui::AnyWindowHandle>,\n")
            });
            let unignored = edits.map_or(false, |edits| {
                edits.has_added_line(" window: Option<gpui::AnyWindowHandle>,\n")
            });
            cx.assert(ignored || unignored, format!("Argument: {}", tool_name))
                .ok();
            cx.assert(ignored, format!("`_` prefix: {}", tool_name))
                .ok();
        }
        // Adds unignored argument to `batch_tool`
        let batch_tool_edits = edits.get(Path::new("crates/assistant_tools/src/batch_tool.rs"));
        cx.assert(
            batch_tool_edits.map_or(false, |edits| {
                edits.has_added_line(" window: Option<gpui::AnyWindowHandle>,\n")
            }),
            "Argument: batch_tool",
        )
        .ok();
        Ok(())
    }

    /// Assertions handed to the LLM judge about the diff as a whole.
    fn diff_assertions(&self) -> Vec<JudgeAssertion> {
        vec![
            JudgeAssertion {
                id: "batch tool passes window to each".to_string(),
                description:
                    "batch_tool is modified to pass a clone of the window to each tool it calls."
                        .to_string(),
            },
            JudgeAssertion {
                id: "tool tests updated".to_string(),
                description:
                    "tool tests are updated to pass the new `window` argument (`None` is ok)."
                        .to_string(),
            },
        ]
    }
}
// Grep results are reported with their enclosing syntax path
// (mod > mod > fn) and line range, plus the full matched item body.
#[gpui::test]
async fn test_grep_function_args_and_body(cx: &mut TestAppContext) {
    let project = setup_syntax_test(cx).await;
    // Test: Line with a function argument
    let input = serde_json::to_value(GrepToolInput {
        regex: "second_arg".to_string(),
        include_pattern: Some("**/*.rs".to_string()),
        offset: 0,
        case_sensitive: false,
    })
    .unwrap();
    let result = run_grep_tool(input, project.clone(), cx).await;
    // NOTE(review): indentation inside this expected block appears stripped
    // in this view — confirm against the original fixture.
    let expected = r#"
Found 1 matches:
## Matches in root/test_syntax.rs
### mod feature_module pub mod nested_module pub fn nested_function L7-14
```
pub fn nested_function(
first_arg: String,
second_arg: i32,
) {
println!("Function in nested module");
println!("{first_arg}");
println!("{second_arg}");
}
```
"#
    .unindent();
    assert_eq!(result, expected);
}

View File

@@ -0,0 +1,15 @@
Found 1 matches:
## Matches in crates/assistant_tool/src/assistant_tool.rs
### Lines 238-241
```rs
}
/// Runs the tool with the provided input.
fn run(
self: Arc<Self>,
input: serde_json::Value,
request: Arc<LanguageModelRequest>,
```

View File

@@ -0,0 +1,38 @@
// Grep for a function definition and check the syntax-aware result layout.
#[gpui::test]
async fn test_grep_function_args_and_body(cx: &mut TestAppContext) {
    let project = setup_syntax_test(cx).await;
    // Test: Line with a function argument
    let input = serde_json::to_value(GrepToolInput {
        // `(` is a regex metacharacter: an unescaped "fn run(" is an invalid
        // pattern (unclosed group), so escape it to match the literal text.
        regex: "fn run\\(".to_string(),
        include_pattern: Some("**/*.rs".to_string()),
        offset: 0,
        case_sensitive: false,
    })
    .unwrap();
    let result = run_grep_tool(input, project.clone(), cx).await;
    // NOTE(review): indentation inside this expected block appears stripped
    // in this view — confirm against the original fixture.
    let expected = r#"
Found 1 matches:
## Matches in crates/assistant_tool/src/assistant_tool.rs
### trait AssistantTool fn run L238-241
```rs
/// Runs the tool with the provided input.
fn run(
self: Arc<Self>,
input: serde_json::Value,
request: Arc<LanguageModelRequest>,
project: Entity<Project>,
action_log: Entity<ActionLog>,
model: Arc<dyn LanguageModel>,
window: Option<AnyWindowHandle>,
cx: &mut App,
) -> ToolResult;
```
"#
    .unindent();
    assert_eq!(result, expected);
}

View File

@@ -0,0 +1,19 @@
# Streaming Edits
Show the model edits as they happen, token by token.
## Challenges
1. Tool calling doesn't stream
- JSON values must be complete before they are streamed
- We can't use tool calling alone if we want to see streaming text
- We ask it to stream `<old_text>` and `<new_text>` blocks
2. Parsing Complexity: XML tags arrive in random chunks
- `</old_te` + `xt>` (split across network packets)
- Must buffer and parse incrementally
3. Imperfect Model Behavior: Models don't follow instructions perfectly
- Wrong closing tags: `<old_text>...</new_text>`
- Inconsistent indentation and whitespace
- Escaping

View File

@@ -0,0 +1,54 @@
// When AI enters the equation, we need a new approach
// We test AI features by sampling their behavior:
#[test]
fn eval_translate_doc_comments() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    // Run the scenario 200 times and require every run to pass (ratio 1.0).
    eval(
        200,
        1.,
        EvalInput::from_conversation(
            vec![
                // Turn 1: the user asks for the translation edit.
                message(
                    User,
                    [text(formatdoc! {"
Read the {input_file_path} file and edit it (without overwriting it),
translating all the doc comments to italian.
"})],
                ),
                // Turn 2: the model reads the file...
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                // ...and receives its contents as the tool result.
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // Turn 3: the model issues the edit_file call under test.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            // An LLM judge grades the resulting diff against this assertion.
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}

View File

@@ -0,0 +1,79 @@
// Some streaming edit problems can be tested the old-fashioned way!
// 1. Parser must handle chunks split ANYWHERE
#[gpui::test(iterations = 100)]
fn test_parser_random_chunks(mut rng: StdRng) {
    let input = "<old_text>hello world</old_text><new_text>goodbye</new_text>";
    // Split the input at random boundaries, 1 to 10 bytes per chunk.
    let mut pieces = vec![];
    let mut start = 0;
    loop {
        if start >= input.len() {
            break;
        }
        let len = rng.gen_range(1..=10);
        let stop = usize::min(start + len, input.len());
        pieces.push(&input[start..stop]);
        start = stop;
    }
    // Feed every chunk through the parser: no matter how the input was
    // split, the parsed events must come out the same.
    let mut parser = EditParser::new();
    let mut events = Vec::new();
    for piece in &pieces {
        events.extend(parser.push(piece));
    }
    assert_eq!(
        events,
        vec![Event::OldText("hello world"), Event::NewText("goodbye")]
    );
}
// 2. Fuzzy matcher algorithm (without LLM input)
#[test]
fn test_fuzzy_match_algorithm() {
    // The matcher should locate the query even when whitespace differs.
    let haystack = "fn calculate_price() {\n // TODO\n}";
    let needle = "fn calculate_price() {"; // Extra spaces
    let found = FuzzyMatcher::new(haystack).find(needle);
    assert_eq!(found, Some(0..24)); // Found despite whitespace
}
// 3. Streaming diff computes edits incrementally
//
// Each `push_new` call returns the character-level ops for the text that
// has arrived so far, without waiting for the full replacement.
#[test]
fn test_streaming_diff() {
    let old_text = "fn calculate() {\n todo!()\n}";
    let mut diff = StreamingDiff::new(old_text);
    // Simulate new text arriving in chunks
    let ops1 = diff.push_new("fn calc");
    assert_eq!(
        ops1,
        vec![
            CharOp::Keep(7), // "fn calc"
        ]
    );
    let ops2 = diff.push_new("ulate_total(");
    assert_eq!(
        ops2,
        vec![
            CharOp::Insert("_total"), // Insert "_total"
            CharOp::Keep(5), // "ulate"
            CharOp::Delete(2), // Remove "()"
            CharOp::Keep(1), // "("
        ]
    );
    let ops3 = diff.push_new("items: &[Item]) {\n items.iter().sum()\n}");
    assert_eq!(
        ops3,
        vec![
            CharOp::Insert("items: &[Item]"),
            CharOp::Keep(4), // ") {\n"
            CharOp::Delete(10), // Remove " todo!()"
            CharOp::Insert(" items.iter().sum()"),
            CharOp::Keep(2), // "\n}"
        ]
    );
    // The magic: we computed a valid diff while text was still arriving!
}

View File

@@ -0,0 +1,52 @@
// As we run the unit eval, we discover problems
// Some of which can be solved algorithmically and tested deterministically
// This prompt change helped:
//
// - `<old_text>` cannot be empty
// But the model still wasn't perfect: So we then wrote a deterministic test to
// gracefully handle the edge case:
#[gpui::test(iterations = 100)]
async fn test_empty_old_text(cx: &mut TestAppContext, mut rng: StdRng) {
    let agent = init_test(cx).await;
    // Three-line fixture buffer.
    let buffer = cx.new(|cx| {
        Buffer::local(
            indoc! {"
abc
def
ghi
"},
            cx,
        )
    });
    let (apply, _events) = agent.edit(
        buffer.clone(),
        String::new(),
        &LanguageModelRequest::default(),
        &mut cx.to_async(),
    );
    cx.run_until_parked();
    // The first simulated edit has an empty <old_text>; the second is valid.
    simulate_llm_output(
        &agent,
        indoc! {"
<old_text></old_text>
<new_text>jkl</new_text>
<old_text>def</old_text>
<new_text>DEF</new_text>
"},
        &mut rng,
        cx,
    );
    apply.await.unwrap();
    // Only the valid edit lands: `def` -> `DEF`, and no `jkl` is inserted.
    pretty_assertions::assert_eq!(
        buffer.read_with(cx, |buffer, _| buffer.snapshot().text()),
        indoc! {"
abc
DEF
ghi
"}
    );
}

View File

@@ -0,0 +1,101 @@
// Another tricky case with XML: Tag mismatches
// Initial pass rate: 60%
// PROMPT FIX:
// Added: "Always close all tags properly"
// After prompt fix: 95% pass rate
// That last 5% wouldn't budge, so we made the parser forgiving:
#[gpui::test(iterations = 1000)]
fn test_mismatched_tags(mut rng: StdRng) {
let mut parser = EditParser::new();
assert_eq!(
parse_random_chunks(
// Reduced from an actual Sonnet 3.7 output
indoc! {"
<old_text>
a
b
c
</new_text>
<new_text>
a
B
c
</old_text>
<old_text>
d
e
f
</new_text>
<new_text>
D
e
F
</old_text>
"},
&mut parser,
&mut rng
),
vec![
Edit {
old_text: "a\nb\nc".to_string(),
new_text: "a\nB\nc".to_string(),
},
Edit {
old_text: "d\ne\nf".to_string(),
new_text: "D\ne\nF".to_string(),
}
]
);
assert_eq!(
parser.finish(),
EditParserMetrics {
tags: 4,
mismatched_tags: 4
}
);
let mut parser = EditParser::new();
assert_eq!(
parse_random_chunks(
// Reduced from an actual Opus 4 output
indoc! {"
<edits>
<old_text>
Lorem
</old_text>
<new_text>
LOREM
</edits>
"},
&mut parser,
&mut rng
),
vec![Edit {
old_text: "Lorem".to_string(),
new_text: "LOREM".to_string(),
},]
);
assert_eq!(
parser.finish(),
EditParserMetrics {
tags: 2,
mismatched_tags: 1
}
);
}
if &self.buffer[tag_range.clone()] != OLD_TEXT_END_TAG {
self.metrics.mismatched_tags += 1;
// Keep parsing anyway - don't let bad XML stop us
}
// We track mismatched tags across all evals and fail if > 5%:
let mismatched_tag_ratio =
cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
if mismatched_tag_ratio > 0.05 {
panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
}

View File

@@ -0,0 +1,101 @@
// EVAL 3: Models stripped indentation, breaking everything
#[test]
fn eval_indented_code_editing() {
eval(100, 0.95, "Edit nested function", || async {
let buffer = Buffer::local(
indoc! {"
fn outer() {
fn inner() {
todo!()
}
}"
}
);
// Ask to modify the inner function
agent.edit(
buffer.clone(),
"Replace todo with return 42",
&conversation,
).0.await;
// Models sent:
// <old_text>
// fn inner() {
// todo!()
// }
// </old_text>
// <new_text>
// fn inner() {
// return 42
// }
// </new_text>
// Code has 8-space indent, model used 4!
});
}
// This eval failure drove us to build an algorithmic solution:
fn calculate_indent_delta(buffer_text: &str, llm_text: &str) -> IndentDelta {
let buffer_indent = detect_indent(buffer_text); // 8 spaces
let llm_indent = detect_indent(llm_text); // 0 spaces
IndentDelta::Spaces(buffer_indent - llm_indent) // +8
}
// Which we could then test deterministically:
#[gpui::test(iterations = 100)]
async fn test_indentation(cx: &mut TestAppContext, mut rng: StdRng) {
let agent = init_test(cx).await;
let buffer = cx.new(|cx| {
Buffer::local(
indoc! {"
lorem
ipsum
dolor
sit
"},
cx,
)
});
let (apply, _events) = agent.edit(
buffer.clone(),
String::new(),
&LanguageModelRequest::default(),
&mut cx.to_async(),
);
simulate_llm_output(
&agent,
indoc! {"
<old_text>
ipsum
dolor
sit
</old_text>
<new_text>
ipsum
dolor
sit
amet
</new_text>
"},
&mut rng,
cx,
);
apply.await.unwrap();
assert_eq!(
buffer.read_with(cx, |buffer, _| buffer.snapshot().text()),
indoc! {"
lorem
ipsum
dolor
sit
amet
"}
);
}

View File

@@ -0,0 +1,44 @@
// EVAL 4: Models kept escaping characters inside XML tags
#[test]
fn eval_string_escaping() {
eval(100, 0.95, "Edit string with quotes", || async {
let buffer = Buffer::local(
r#"let msg = "Hello, world!";"#
);
// Ask to change the message
let (task, _) = agent.edit(
buffer.clone(),
"Change message to say goodbye",
&conversation,
);
let edited = task.await.unwrap();
assert_eq!(edited.text(), r#"let msg = "Goodbye, world!";"#);
});
}
// What models would generate:
// <old_text>let msg = &quot;Hello, world!&quot;;</old_text>
// Or...
// <old_text>let msg = \"Hello, world!\";</old_text>
// Fuzzy matcher can't find escaped version in buffer!
// Even more chaos with newlines:
// Buffer: "fn test() {\n println!(\"hi\");\n}"
// Model: "<old_text>fn test() {\\n println!(\\"hi\\");\\n}</old_text>"
// This was particularly bad for Gemini (only 35% pass rate!)
// PROMPT FIX
// Added: "Do not escape quotes, newlines, or other characters within tags"
// Impact across models:
// Gemini-2.5-pro: 35% → 86% (massive!)
// Claude-3.7: 96% → 98%
// GPT-4.1: 81% → 100%
// One line fixed Gemini's worst failure mode

View File

@@ -0,0 +1,9 @@
## Take Aways
1. Rigorous automated testing is fundamental to software reliability
2. Language models require an empirical, statistical approach.
3. Useful AI-enabled software is an interplay of stochastic and deterministic components.
4. Reliable AI-enabled software requires multiple layers of testing
- Stochastic integration tests: a.k.a. "Evals"
- Stochastic unit tests
- Deterministic unit tests