Compare commits

...

22 Commits

Author SHA1 Message Date
Thomas Mickley-Doyle
8632817ed8 fix formats 2025-04-21 14:40:32 -05:00
Thomas Mickley-Doyle
aa44e6a06f add allow_preexisting_diagnostics to failures 2025-04-21 14:30:34 -05:00
Thomas Mickley-Doyle
7a72b6eb3a Resolve REPOS_DIR variable error 2025-04-21 14:01:28 -05:00
Thomas Mickley-Doyle
a274d4c689 Merge branch 'run-eval-on-ci' into thomas/run-eval-on-ci 2025-04-21 13:58:21 -05:00
Thomas Mickley-Doyle
911a01c86c Remove non-rust examples from evals 2025-04-21 13:58:00 -05:00
Max Brunsfeld
a7efb1cb68 Get eval compiling 2025-04-21 11:51:11 -07:00
Thomas Mickley-Doyle
5c2bffcf99 Update cohort test 2025-04-21 12:36:30 -05:00
Thomas Mickley-Doyle
068c5a7e82 Update cohort test 2025-04-21 12:34:50 -05:00
Thomas Mickley-Doyle
7897ecb081 Merge branch 'run-eval-on-ci' into thomas/run-eval-on-ci 2025-04-21 12:15:07 -05:00
Thomas Mickley-Doyle
a105d9bfe9 Add thread criteria examples, cohort, and daily runs 2025-04-21 12:12:25 -05:00
Antonio Scandurra
e18efcb8f6 WIP 2025-04-21 19:01:16 +02:00
Antonio Scandurra
0252db20df WIP 2025-04-21 18:54:50 +02:00
Thomas Mickley-Doyle
4a8f2f49f9 Add 12 more rust tests 2025-04-21 11:12:41 -05:00
Antonio Scandurra
9074c27a2d Enable telemetry only on CI 2025-04-21 18:06:41 +02:00
Thomas Mickley-Doyle
90bd6cefc3 Add matrix run to evals 2025-04-21 10:59:53 -05:00
Antonio Scandurra
b934660809 Simplify reporting of telemetry for evals 2025-04-21 17:56:33 +02:00
Antonio Scandurra
d1245c873c Wait for telemetry events to be flushed before quitting eval 2025-04-21 17:48:38 +02:00
Nathan Sobo
c92e1ecca0 Just run the find_and_replace_diff_card example for now on CI
Co-authored-by: Antonio Scandurra <me@as-cii.com>
Co-authored-by: Agus Zubiaga <hi@aguz.me>
2025-04-21 09:25:19 -06:00
Nathan Sobo
c8b299d8e6 Update eval.yml
Co-authored-by: Antonio Scandurra <me@as-cii.com>
2025-04-21 09:18:52 -06:00
Nathan Sobo
15931ed1d5 Try shit
Co-authored-by: Antonio Scandurra <me@as-cii.com>
2025-04-21 09:09:03 -06:00
Nathan Sobo
ef1af4f7f9 Allow eval workflow to be triggered manually
Co-authored-by: Antonio Scandurra <me@as-cii.com>
2025-04-21 09:03:56 -06:00
Nathan Sobo
18251ce15f Add new action to run agent eval
The old one wasn't linking.

Co-authored-by: Antonio Scandurra <me@as-cii.com>
2025-04-21 09:00:01 -06:00
93 changed files with 422 additions and 520 deletions

108
.github/workflows/eval.yml vendored Normal file
View File

@@ -0,0 +1,108 @@
name: Run Agent Eval
on:
schedule:
# Hourly
- cron: "0 * * * *"
# Daily at 8 AM UTC
- cron: "0 8 * * *"
push:
branches:
- main
- "v[0-9]+.[0-9]+.x"
tags:
- "v*"
pull_request:
branches:
- "**"
workflow_dispatch:
concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
cancel-in-progress: true
env:
CARGO_TERM_COLOR: always
CARGO_INCREMENTAL: 0
RUST_BACKTRACE: 1
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
ZED_EVAL_TELEMETRY: 1
jobs:
run_eval:
timeout-minutes: 60
name: Run Agent Eval - ${{ matrix.exercise }}
if: github.repository_owner == 'zed-industries'
runs-on:
- buildjet-16vcpu-ubuntu-2204
strategy:
fail-fast: false
matrix:
exercise:
- add_arp_protocol_support
- buffer_string_input_support
- exif_rotation_support
- find_and_replace_diff_card
- libdevice_symbol_reexport
- lhs_join_update_callbacks
- metal_i64_support
- metrics_data_size_updates
- replace_hold_with_drain_on_exit
- restore_version_api_support
- time_detail_merge_update
- tool_response_handling
- virtio_block_request_refactor
steps:
- name: Add Rust to the PATH
run: echo "$HOME/.cargo/bin" >> $GITHUB_PATH
- name: Checkout repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
with:
clean: false
- name: Cache dependencies
uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6 # v2
with:
save-if: ${{ github.ref == 'refs/heads/main' }}
cache-provider: "buildjet"
- name: Install Linux dependencies
run: ./script/linux
- name: Configure CI
run: |
mkdir -p ./../.cargo
cp ./.cargo/ci-config.toml ./../.cargo/config.toml
- name: Run eval
run: |
# Set cohort ID based on trigger type
if [[ "${{ github.event_name }}" == "schedule" ]]; then
# Check if it's 8 AM UTC
CURRENT_HOUR=$(date -u +%H)
if [[ "$CURRENT_HOUR" == "08" ]]; then
# Daily run at 8 AM UTC
COHORT_ID="daily${{ github.run_id }}"
else
# Hourly run
COHORT_ID="hourly${{ github.run_id }}"
fi
else
# CI run from push, pull request, or manual workflow dispatch
COHORT_ID="ci${{ github.run_id }}"
fi
echo "Using cohort ID: $COHORT_ID"
cargo run --package=eval -- --cohort-id "$COHORT_ID" ${{ matrix.exercise }}
# Since the Linux runner is not stateful, in theory there is no need to do this cleanup.
# But, to avoid potential issues in the future if we choose to use a stateful Linux runner and forget to add code
# to clean up the config file, I've included the cleanup code here as a precaution.
# While it's not strictly necessary at this moment, I believe it's better to err on the side of caution.
- name: Clean CI config file
if: always()
run: rm -rf ./../.cargo

View File

@@ -1,28 +0,0 @@
name: Run Eval Daily
on:
schedule:
- cron: "0 2 * * *"
workflow_dispatch:
env:
CARGO_TERM_COLOR: always
CARGO_INCREMENTAL: 0
RUST_BACKTRACE: 1
jobs:
run_eval:
name: Run Eval
if: github.repository_owner == 'zed-industries'
runs-on: ubuntu-latest
steps:
- name: Checkout repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
with:
clean: false
- name: Setup Rust
uses: dtolnay/rust-toolchain@stable
- name: Run cargo eval
run: cargo run -p eval

1
Cargo.lock generated
View File

@@ -4921,6 +4921,7 @@ dependencies = [
"settings",
"shellexpand 2.1.2",
"telemetry",
"tempfile",
"toml 0.8.20",
"unindent",
"util",

View File

@@ -1759,7 +1759,7 @@ impl Thread {
thread_data,
final_project_snapshot
);
client.telemetry().flush_events();
client.telemetry().flush_events().await;
Ok(())
})
@@ -1804,7 +1804,7 @@ impl Thread {
thread_data,
final_project_snapshot
);
client.telemetry().flush_events();
client.telemetry().flush_events().await;
Ok(())
})
@@ -2060,7 +2060,7 @@ impl Thread {
github_login = github_login
);
client.telemetry().flush_events();
client.telemetry().flush_events().await;
}
}
})

View File

@@ -4,7 +4,7 @@ use crate::TelemetrySettings;
use anyhow::Result;
use clock::SystemClock;
use futures::channel::mpsc;
use futures::{Future, StreamExt};
use futures::{Future, FutureExt, StreamExt};
use gpui::{App, AppContext as _, BackgroundExecutor, Task};
use http_client::{self, AsyncBody, HttpClient, HttpClientWithUrl, Method, Request};
use parking_lot::Mutex;
@@ -430,7 +430,7 @@ impl Telemetry {
let executor = self.executor.clone();
state.flush_events_task = Some(self.executor.spawn(async move {
executor.timer(FLUSH_INTERVAL).await;
this.flush_events();
this.flush_events().detach();
}));
}
@@ -456,7 +456,7 @@ impl Telemetry {
if state.installation_id.is_some() && state.events_queue.len() >= state.max_queue_size {
drop(state);
self.flush_events();
self.flush_events().detach();
}
}
@@ -499,60 +499,59 @@ impl Telemetry {
.body(json_bytes.into())?)
}
pub fn flush_events(self: &Arc<Self>) {
pub fn flush_events(self: &Arc<Self>) -> Task<()> {
let mut state = self.state.lock();
state.first_event_date_time = None;
let mut events = mem::take(&mut state.events_queue);
state.flush_events_task.take();
drop(state);
if events.is_empty() {
return;
return Task::ready(());
}
let this = self.clone();
self.executor
.spawn(
async move {
let mut json_bytes = Vec::new();
self.executor.spawn(
async move {
let mut json_bytes = Vec::new();
if let Some(file) = &mut this.state.lock().log_file {
for event in &mut events {
json_bytes.clear();
serde_json::to_writer(&mut json_bytes, event)?;
file.write_all(&json_bytes)?;
file.write_all(b"\n")?;
}
if let Some(file) = &mut this.state.lock().log_file {
for event in &mut events {
json_bytes.clear();
serde_json::to_writer(&mut json_bytes, event)?;
file.write_all(&json_bytes)?;
file.write_all(b"\n")?;
}
let request_body = {
let state = this.state.lock();
EventRequestBody {
system_id: state.system_id.as_deref().map(Into::into),
installation_id: state.installation_id.as_deref().map(Into::into),
session_id: state.session_id.clone(),
metrics_id: state.metrics_id.as_deref().map(Into::into),
is_staff: state.is_staff,
app_version: state.app_version.clone(),
os_name: state.os_name.clone(),
os_version: state.os_version.clone(),
architecture: state.architecture.to_string(),
release_channel: state.release_channel.map(Into::into),
events,
}
};
let request = this.build_request(json_bytes, request_body)?;
let response = this.http_client.send(request).await?;
if response.status() != 200 {
log::error!("Failed to send events: HTTP {:?}", response.status());
}
anyhow::Ok(())
}
.log_err(),
)
.detach();
let request_body = {
let state = this.state.lock();
EventRequestBody {
system_id: state.system_id.as_deref().map(Into::into),
installation_id: state.installation_id.as_deref().map(Into::into),
session_id: state.session_id.clone(),
metrics_id: state.metrics_id.as_deref().map(Into::into),
is_staff: state.is_staff,
app_version: state.app_version.clone(),
os_name: state.os_name.clone(),
os_version: state.os_version.clone(),
architecture: state.architecture.to_string(),
release_channel: state.release_channel.map(Into::into),
events,
}
};
let request = this.build_request(json_bytes, request_body)?;
let response = this.http_client.send(request).await?;
if response.status() != 200 {
log::error!("Failed to send events: HTTP {:?}", response.status());
}
anyhow::Ok(())
}
.log_err()
.map(|_| ()),
)
}
}

View File

@@ -39,6 +39,7 @@ serde.workspace = true
settings.workspace = true
shellexpand.workspace = true
telemetry.workspace = true
tempfile = "3.8"
toml.workspace = true
unindent.workspace = true
util.workspace = true

View File

@@ -0,0 +1,9 @@
1. The model's first tool call should identify the file(s) responsible for the `Protocol` enum—ideally using a path search or grep for `enum Protocol`, rather than guessing the path.
2. Once the path to `Protocol` is known (likely in a `protocol.rs` or similar file), the model should read the file before attempting modifications.
3. When updating `Protocol::ALL`, the model should add `ARP` and validate that all downstream uses (like filtering and UI display) are aware of this addition.
4. In implementing ARP analysis logic, the model should investigate where `analyze_headers` and `analyze_network_header` are defined, and insert ARP parsing logic there. This should be done after reading and understanding those functions.
5. When displaying ARP packets, the model should locate `connection_details_page.rs`, likely through a path search, and avoid assuming the file location.
6. Any updates to the `InfoAddressPortPair` or similar struct must follow a read of the file and an understanding of its role in tracking connection data.
7. The model should not spin on service detection for ARP—after reading the service detection logic (such as `get_service`), it should short-circuit or skip it for ARP just like ICMP.
8. Testing updates should follow an identification of existing ARP-related tests or relevant test locations (e.g., using grep or path search for `test` functions that use `Protocol::ALL` or `get_service`).
9. For filtering integration, the model should ensure that the GUI and `PacketFilterFields` are updated such that ARP is a selectable protocol and behaves similarly to other protocols in the filter UI.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/workos/authkit-js.git"
revision = "949345d85782a93e8f1738ec31823948ffc26301"
language_extension = "ts"

View File

@@ -1,10 +0,0 @@
1. Add a new test case in `create-client.test.ts` for when the `returnTo` option is provided during sign-out. It verifies that the sign-out URL includes the correct `return_to` query parameter with the provided URL. The test sets up a mock client, calls signOut with a returnTo value, and asserts that the resulting URL contains the expected session_id and return_to parameters while maintaining the correct API endpoint structure.
2. Modifies the `signOut` method in `create-client.ts` to accept an optional options parameter containing a returnTo string. Instead of directly passing the sessionId to getLogoutUrl, it now passes an object containing both the sessionId and the returnTo value from the options. The method maintains its existing behavior of checking for an access token and clearing session data when a URL is available.
3. Updates the HTTP client tests in `http-client.test.ts` to reflect the new getLogoutUrl signature. It adds a test case for the basic logout URL and a new describe block for when returnTo is provided, verifying that the URL includes the properly encoded return_to parameter. The test ensures the URL construction handles both cases correctly.
4. Modifies the `getLogoutUrl` method in `http-client.ts` to accept an object parameter with sessionId and returnTo properties instead of just a sessionId string. It maintains the base URL construction but now conditionally adds the return_to query parameter only when a returnTo value is provided, while always including the session_id parameter. The method handles URL construction and parameter encoding internally.
5. Updates the session initialization logic in `create-client.ts` to check for either a `workos-has-session` cookie or a refresh token (retrieved via `getRefreshToken`). This allows the client to refresh sessions even if no `code` is present in the URL, especially in development environments.
6. Adds corresponding test coverage in `create-client.test.ts`:
- When no code is in the URL but the `workos-has-session` cookie exists, the session should be refreshed.
- When devMode is enabled and a refresh token is present in localStorage, the session should be refreshed.
- When devMode is enabled but no refresh token exists, the client should be created without making any network requests.
- When neither a code, cookie, nor refresh token is present, the client should initialize without refreshing.

View File

@@ -1,3 +0,0 @@
I need to improve our logout feature. When users sign out, they should be able to specify a return URL to redirect to afterward. Right now, signing out just takes them to a default page, but we want to support custom redirects (like back to the homepage or a login screen). The URL should be safely included in the logout request. Make sure existing logouts still work normally when no redirect is specified.
Also, note that we updated how the client initializes its session. It should now check for either a `workos-has-session` cookie or a valid refresh token (even in devMode). This ensures that sessions are refreshed appropriately even without a code in the URL. Be sure this logic is covered by the minimum tests.

View File

@@ -0,0 +1,8 @@
1. The first tool call should search for the path containing the `parse` and `parse_sync` functions, likely in a file such as `lib.rs`, `parser.rs`, or similar core binding source. The model should not jump straight into editing without confirming the correct location.
2. Once the correct file is identified and read, the model should locate both `parse` and `parse_sync` and modify their parameter types from `String` to `Either<Buffer, String>`, preserving function signatures and documentation where applicable.
3. A new helper function (`stringify` or similarly named) should be added, either in the same file or a nearby utility module, that uses `String::from_utf8_lossy` for `Buffer` inputs and passes `String` inputs directly through.
4. The model should edit `binding.d.ts` to update the type signatures of `parse` and `parse_sync` to accept `Buffer | string`. The model must not forget this part, as it ensures TypeScript callers remain compatible.
5. When creating or editing test cases in `api_test.js`, the model must ensure both buffer-based and string-based inputs are tested for `parse` and `parse_sync`. The tests must verify functional equivalence in output.
6. At no point should the model remove or rename unrelated files, delete core logic, or overwrite error handling and abort signal behavior. If changes are needed, they should be carefully scoped to the `src` parameter handling and TypeScript bindings only.
7. The `filename` parameter logic must not be regressed. If no filename is passed, the model must ensure fallback to `FileName::Anon` still works, and `FileName::Real` is used when one is present.
8. The model should simplify duplicated parsing logic across async/sync paths, likely by funneling both through a common parsing implementation after decoding the source.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/cline/cline.git"
revision = "a26494e5cc453f9c7e148d35895fda3f74d03284"
language_extension = "ts"

View File

@@ -1,5 +0,0 @@
1. A new changeset file is created to document a patch that improves diff editing animations and enhances prompts for large file edits. An indicator showing the number of diff edits is also added next to each file path.
2. In `diff.ts`, the error message thrown when a `SEARCH` block doesn't match content has been updated to clarify that the mismatch could be due to out-of-order blocks.
3. In `responses.ts`, the assistant response for diff mismatches now recommends limiting to 1–3 `SEARCH/REPLACE` blocks at a time for large files. It also simplifies fallback instructions for using the `write_to_file` tool.
4. The `DiffViewProvider.ts` file has been updated to replace line-by-line animations with chunk-based updates for better performance. For large diffs, a smooth scrolling animation is introduced to maintain visual context. Small diffs still scroll directly.
5. In `CodeAccordian.tsx`, a new visual indicator displays the number of `REPLACE` blocks in the code diff using a diff icon and count, providing quick insight into the volume of changes.

View File

@@ -1,7 +0,0 @@
We're trying to improve both performance and usability when working with large diffs in the editor. A few areas need attention:
First, the current diff animation applies updates line-by-line, which can feel slow and visually jarring for large edits. Could you revise the logic so that we update the editor in larger chunks instead? For smaller diffs, direct scrolling to the edited line is fine, but for larger changes, it would be great to implement a smooth scrolling animation that steps through the affected region before settling at the final line.
Second, the current error message when a SEARCH block doesn't match is a bit too vague. Let's make it clearer that the issue could be due to out-of-order or imprecise SEARCH/REPLACE blocks, especially when working with multiple blocks. It might also help to add a suggestion that users try only 1–3 changes at a time for large files before retrying.
Finally, in the file accordion UI, it would be useful to show how many edits a file contains. Could you parse the diff content and display a count of REPLACE blocks next to the file path, maybe with a small icon for clarity?

View File

@@ -1,3 +0,0 @@
url = "https://github.com/punkpeye/awesome-mcp-servers.git"
revision = "5480a9849b01ae8a5c1433d75ad0415975609571"
language_extension = "md"

View File

@@ -1,5 +0,0 @@
1. The diff shows changes to `README.md`, specifically adding a new entry to the "Tools and integrations" list. The new entry is for `@iaptic/mcp-server-iaptic`, which provides access to customer purchase and revenue data.
2. The added line includes:
- The GitHub repository URL
- Three emojis: 🎖️ (possibly representing awards or achievements), 📇 (profiles or contacts), and ☁️ (cloud)
- A description of the tool's functionality: "Connect with [iaptic](https://www.iaptic.com) to ask about your Customer Purchases, Transaction data and App Revenue statistics"

View File

@@ -1,3 +0,0 @@
Please add a new tool entry to the README.md file's integration list: "@iaptic/mcp-server-iaptic" with GitHub link, described as "Connect with [iaptic](https://www.iaptic.com) to ask about your Customer Purchases, Transaction data and App Revenue statistics", tagged with the following emojis: 🎖️ 📇 ☁️. Place it appropriately in the existing tools section, following the current alphabetical or category-based order.
Edit the README file with the above, new resource

View File

@@ -1,3 +0,0 @@
url = "https://github.com/avkcode/container-tools.git"
revision = "34137bb453b4d2dd28b08bd80e26bc3105a50ada"
language_extension = "sh"

View File

@@ -1,4 +0,0 @@
1. Changes to the Makefile where the parameter "--keyrign" was corrected to "--keyring" in multiple build targets including debian11, debian11-java, debian11-java-slim, debian11-graal, debian11-graal-slim, debian11-corretto, debian11-java-slim-maven, debian11-java-slim-gradle, debian11-graal-slim-maven, and debian11-graal-slim-gradle. This appears to be a typo fix across all Java-related build configurations in the Makefile.
2. Introduces significant enhancements to the debian/mkimage.sh script, including adding a usage function with detailed documentation, improving error handling for command-line arguments, and fixing the "--keyrign" parameter to "--keyring" to match the Makefile changes. It also adds better validation for required arguments and more descriptive error messages when values are missing. The script now includes comprehensive documentation about its purpose and usage examples.
3. Shows extensive improvements to the script's functionality and robustness, including adding tracing capabilities, better error handling, and more informative logging. It introduces new helper functions like usage(), die(), warn(), and info() for better user feedback. The script now properly checks for required commands (debootstrap, unzip, trivy) and provides installation instructions if they're missing. It also includes better system checks (Linux OS verification, root privileges check, SELinux status) and implements a more reliable way to handle GPG keys by setting up the correct directory structure and permissions before key import.
4. Continues the script improvements with better package management, repository configuration, and container setup. It adds proper apt repository configuration in the target system, implements package installation with retries, and includes Docker-specific optimizations. The script now provides clearer output about installed packages and their sizes. It also includes better cleanup procedures and more informative completion messages with clear instructions on how to load and run the resulting Docker image. The output now includes example commands and proper formatting for better readability.

View File

@@ -1 +0,0 @@
I need to make several improvements to our Debian image-building scripts. First, fix the typo in the `Makefile` where `--keyrign` is incorrectly used instead of `--keyring` across all build targets, including the standard Debian image and Java variants like `debian11-java`, `debian11-graal`, and `debian11-corretto`. Second, enhance the `debian/mkimage.sh` script to include proper error handling, usage documentation, and command-line argument validation. The script should check for required tools like `debootstrap`, `unzip`, and `trivy`, and provide installation instructions if they're missing. Improve the GPG key setup by ensuring the `/root/.gnupg` directory is properly configured before importing keys. Add structured logging with timestamps, warnings, and informational messages. Implement better package installation with retries and proper cleanup. Finally, include clear instructions at the end on how to load and run the generated Docker image, with example commands for verification. The script should be robust, well-documented, and fail early with meaningful error messages if system requirements aren't met.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/YuhangSong/Arena-Baselines.git"
revision = "801ed8566110ddc4a6ada0cc70171c636d78dbb8"
language_extension = "py"

View File

@@ -1,12 +0,0 @@
1. README.md Features Section Reorganization
The features section has been reorganized into two subsections ("Baselines" and "Games") with markdown tables added. The previous bullet points were replaced with more structured content including supported/benchmarked status indicators. A new "Visualization" section was added with TensorBoard and port forwarding instructions.
2. Content Relocation and File Restructuring
The Tennis game documentation and action space details were moved from README.md to a new games.md file. The README was cleaned up by removing commented-out content and consolidating documentation sections. YAML config files (Benchmark-2T1P-Discrete.yaml and Test-Pong.yaml) were modified to replace `selfplay_recent_prob` with `playing_policy_load_recent_prob` and adjust population size options.
3. train.py Refactoring
Significant changes to train.py including:
- Renamed `selfplay_recent_prob` parameter to `playing_policy_load_recent_prob`
- Simplified the nested grid search structure by removing unnecessary loops
- Improved policy loading logic with better checkpoint path handling
- Enhanced error handling and logging for policy saving/reloading
- Removed redundant code and improved code organization
- Added more descriptive console output during policy operations

View File

@@ -1,13 +0,0 @@
I need to refactor the multi-agent configuration system in our Arena-Baselines repository. The current policy_assignment parameter (self_play, independent) is too coarse. I want to replace it with a more flexible set of parameters to better support advanced training schemes like population-based training (PBT) and sophisticated self-play with historical opponents.
Specifically, I will introduce four new configuration parameters:
iterations_per_reload: Controls the frequency (in training iterations) at which policies are saved and potentially reloaded.
num_learning_policies: Explicitly defines how many agents use policies that are actively being trained (can be an integer or 'all').
selfplay_recent_prob: For non-learning agents (players), this determines the probability of loading the latest version of a learning policy versus loading a uniformly random historical version during reloads.
size_population: Specifies the number of distinct policy versions maintained for each learning agent, enabling PBT-style experiments.
To implement this, I will significantly modify train.py. This includes updating the argument parser, changing how experiment configurations are expanded (especially with grid_search), and implementing a new callback function (on_train_result). This callback will handle the periodic saving (using pickle) of learning policies to structured directories and the reloading of all policies (learning and playing) according to the new parameters (iterations_per_reload, selfplay_recent_prob, size_population). Playing policies will use deterministic actions.
I'll also reorganize the codebase by renaming arena/rllib_env.py to arena/arena.py and creating a new arena/utils.py file to house utility functions (like configuration helpers, ID generators, DeterministicCategorical) and constants.
Finally, I will update the example configuration files (Benchmark-2T1P-Discrete.yaml, Test-Pong.yaml) to remove policy_assignment and demonstrate the usage of the new parameters, including within grid_search.

View File

@@ -1,4 +0,0 @@
url = "https://github.com/dani-garcia/vaultwarden.git"
revision = "3a1f1bae002bebf26ce3a38b879c1ba26529af1e"
language_extension = "rs"
allow_preexisting_diagnostics = true

View File

@@ -1,6 +0,0 @@
1. Refactors the `register_verification_email` logic to generate the JWT verification token earlier in the control flow, reducing duplication and improving readability.
2. Improves conditional logic for sending verification emails by only querying the database when mail should be sent, reducing unnecessary operations.
3. Refines the user existence check to specifically filter for users that have a `private_key`, adding stricter criteria before skipping email sending.
4. Preserves existing timing attack mitigation by retaining randomized sleep behavior when user exists but an email is not sent.
5. Ensures the email is sent only if appropriate, preserving previous behavior while streamlining logic and improving maintainability.
6. Removes redundant code paths and unnecessary reassignments, improving clarity without affecting functionality.

View File

@@ -1 +0,0 @@
I want to refactor the `register_verification_email` function to streamline how verification emails are handled. Currently, the code checks if a user exists and then sends an email or returns early. I'd like to move the JWT token generation to the top of the function to avoid duplication. Then, if mail sending is enabled, the code should check for the user, but skip sending the verification email if the user exists and has a `private_key` (otherwise it should send the email). Keep the random sleep logic for timing mitigation in the branch where no email is sent. Remove the old duplicated token generation logic and any redundant conditionals, while ensuring the core behavior and response flow stays the same.

View File

@@ -0,0 +1,10 @@
1. The first tool call should locate the definition of the `get_dynamic_image_from_path` function via a **path search** that includes the filename `common_image.rs`. It should not begin by searching for function definitions using grep or guessing the file name.
2. After resolving the correct path for `common_image.rs`, the model should **read** the file to examine the implementation of `get_dynamic_image_from_path` and determine how to inject EXIF-based rotation logic.
3. The tool should then search for EXIF-related crates or documentation for `nom-exif`, `MediaParser`, and `ExifIter` **only after** confirming these are not already imported. This avoids speculative searching.
4. The model should implement a custom `ExifOrientation` enum matching EXIF tag codes 1–8, and map each to the correct image transformation (e.g., `Rotate90`, `FlipH`). This logic must be encapsulated cleanly in a helper function like `get_rotation_from_exif`.
5. The EXIF orientation must be applied **directly** to the returned `DynamicImage` object within `get_dynamic_image_from_path` using transformations like rotate or flip as required.
6. Once image transformation logic is integrated, the `CACHE_IMAGE_VERSION` constant should be bumped to ensure cache invalidation. The model must **not** remove unrelated constants or variables.
7. It must update the Rust version to `1.80.0` in `Cargo.toml` and CI files, but **should not** downgrade or alter unrelated configuration fields.
8. The correct place to modify the GUI tab default (from `SimilarImages` to `DuplicateFiles`) should be located by reading the appropriate GUI state or initialization logic file—this should not be guessed or edited blindly.
9. New dependencies (`nom-exif`, `iso6709parse`, etc.) should be added to the appropriate `[dependencies]` sections in `Cargo.toml` and respected in `Cargo.lock` without removing or altering unrelated dependencies.
10. The model must **not** remove or delete any existing file unless explicitly instructed in the user prompt. It should avoid unnecessary file writes or edits unrelated to EXIF support.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/calebporzio/sushi.git"
revision = "01dd34fe3374f5fb7ce63756c0419385e31cd532"
language_extension = "php"

View File

@@ -1,3 +0,0 @@
1. The GitHub workflow file has been significantly updated to expand testing coverage and improve the CI process. The changes introduce a new `fail-fast: false` setting to allow all matrix combinations to complete even if some fail. The testing matrix now includes PHP 8.4 and Laravel 12.* alongside the existing versions. The configuration includes specific testbench version mappings for Laravel 12.* and removes the DBAL requirement for Laravel 11.* tests. Numerous new test combinations have been added across all Laravel versions to include PHP 8.4 testing. The dependency installation process has been restructured into separate steps - one specifically for DBAL when needed, and another for general dependencies using updated composer commands with precise version constraints.
2. The composer.json file has been updated to support the newly added Laravel 12.* version in both the main requirements and development dependencies. The testbench package now explicitly includes versions 5.* and 10.* in its supported range. For testing tools, PHPUnit 11.* has been added to the list of supported versions while maintaining backward compatibility with older versions. These changes ensure the package can be used with the latest Laravel ecosystem components while preserving compatibility with existing installations.
3. The test file modifications primarily focus on adapting to changes in Laravel 11+ where column type handling was updated. The changes introduce version-aware assertions that check whether to expect 'string' or 'varchar' as column types based on the Laravel version being tested. A new import for the version comparison function was added to support these conditional checks. Additional safeguards were implemented, including a check for the HandlesAnnotations trait before running database migration tests, making the test suite more robust when running in different environments. The column type assertions in multiple test methods were updated to use these version-aware checks to maintain compatibility across Laravel versions.

View File

@@ -1,11 +0,0 @@
I'd like to update our Laravel package's CI workflow and dependencies to ensure compatibility with the upcoming Laravel 12 release and PHP 8.4. Currently, our package supports Laravel versions 5.8 through 11 and PHP versions 7.1 through 8.3, and we'll need to extend this support while maintaining backward compatibility.
**Key Changes Needed:**
First, we'll need to update composer.json to explicitly support Laravel 12. The CI test matrix should also be expanded to include PHP 8.4 testing across all supported Laravel versions. The workflow configuration will require adjustments to properly handle these new version combinations.
There are some test compatibility issues we'll need to address - particularly around how we check string column types in Laravel 11+ (where 'string' was changed to 'varchar'), and we should add conditional skipping for tests that depend on traits that might not be available in all test environments.
While making these changes, we could also implement some workflow improvements: enabling the fail-fast: false option to get complete test results even with individual failures, modernizing our dependency installation approach using the newer composer update syntax, and making the DBAL dependency installation conditional since it's not needed for all test cases.
Would you be able to help review these changes or suggest any additional considerations we should keep in mind for this compatibility update? I want to make sure we maintain stability while expanding our support coverage.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/sdras/array-explorer.git"
revision = "8ff1a72f7ba24d44946bf591c3586b0dcccc2381"
language_extension = "js"

View File

@@ -1,12 +0,0 @@
1. **EditorConfig Change**
Added a new setting `quote_type = single` to the `.editorconfig` file. This specifies that single quotes should be used for quoting in the codebase.
2. **New Finnish Locale Files**
Added two new Finnish language files:
- `src/locale/fi/index.js`: Contains Finnish translations for UI strings and method descriptions
- `store/fi/index.js`: Contains Finnish translations for all array method documentation (298 lines)
- `store/fi/meta.json`: Metadata about the Finnish translation (language code "fi", full name "Finnish", created by "sjarva")
3. **Store Integration Updates**
Modified `store/index.js` to:
- Import the new Finnish locale files (`import fi from './fi/index'` and `import translationsFi from '../src/locale/fi/index'`)
- Add Finnish to the Vuex store state (`fi`)
- Register Finnish translations with Vue I18n (`Vue.i18n.add('fi', translationsFi)`)

View File

@@ -1,5 +0,0 @@
I'm working on adding Finnish (fi) language support to our array method reference application, which helps users determine the right JavaScript array methods based on their needs. To achieve this, I'll need to:
First, create the Finnish locale file containing translations for method selection options, method types (such as add, remove, find, and iterate), and primary action choices. Next, I'll add Finnish translations to the store, covering all array methods (like splice, push, and unshift), including detailed descriptions of their behaviors, parameters, return values, and example code with outputs.
Additionally, I'll generate a Finnish meta file with language metadata (language code, full name, and contributor info). Finally, I'll update the main store index to integrate Finnish alongside existing languages like English, Spanish, and German.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/vercel/ai.git"
revision = "1766edec300deb05c84bb7fefc034af4c2bc1165"
language_extension = "ts"

View File

@@ -1,3 +0,0 @@
1. Introduces a new changeset file that documents a patch for the '@ai-sdk/provider' package. The changeset indicates a chore task where 'LanguageModelV2File' is being extracted, suggesting a refactoring effort to modularize the codebase by separating file-related types into their own module.
2. Modifications to the language model v2 index file where a new export statement for 'language-model-v2-file' has been added. This change reflects the extraction mentioned in the changeset and makes the new file type available to other parts of the application. Additionally, there are significant changes to the language model v2 implementation file where the inline file type definition has been replaced with the newly extracted 'LanguageModelV2File' type, both in the main model interface and in the stream part union type, demonstrating the consolidation of file-related types into a single, reusable definition.
3. Present the newly created 'language-model-v2-file.ts' file which defines the 'LanguageModelV2File' type with comprehensive documentation. The type includes two properties: 'mediaType' which specifies the IANA media type of the file with a reference to the official media types registry, and 'data' which can be either a base64 encoded string or binary data, with clear documentation about maintaining the original format from the API without unnecessary conversion. This new file represents the extracted type that is now being used throughout the codebase.

View File

@@ -1 +0,0 @@
We need to improve how our language model handles file attachments by making the file type definitions more modular and reusable. Currently, file-related properties are defined inline within the model's response and stream types, which makes maintenance harder and duplicates documentation. The goal is to extract these definitions into a dedicated type that can be shared consistently across both static responses and streaming payloads. The new type should include clear documentation about media types (referencing IANA standards) and support both base64 and binary data formats without unnecessary conversions. This change should maintain backward compatibility while centralizing the file structure definition for better type safety and readability. Focus on clean separation of concerns, and ensure the extracted type is properly exported and imported where needed.

View File

@@ -0,0 +1,8 @@
1. The first tool call should be a **path search for the test file or test suite** where join subscriptions and callbacks like `on_insert` or `on_update` are defined. It should not guess or attempt a read without identifying the correct path.
2. After locating the test file, the model should read its content to understand the testing structure, especially how subscriptions and callbacks are validated.
3. The test should be added in a way that mirrors or extends existing join subscription test patterns, particularly those involving `pk_u32` and `unique_u32`.
4. When verifying callback behavior, the model should avoid using hardcoded assumptions—look for existing helpers or patterns (e.g., assertions on call count or state transitions).
5. The test must include a case with a logically equivalent WHERE clause written differently (e.g., `0 < x AND x < 5`) to ensure consistent behavior and coverage.
6. If the model wants to confirm normalization behavior of expressions like `3 < x`, it should either reference the relevant part of the execution planner or reuse prior normalization test helpers—not implement a new planner from scratch.
7. The model should not remove or replace unrelated tests, helper functions, or files. All modifications should be additive and scoped to the new test logic.
8. There should be no unnecessary tool calls—once the path is found and read, edits should directly reflect the users request without exploratory file listings or excessive back-and-forth.

View File

@@ -1,3 +1,4 @@
url = "https://github.com/Rust-GPU/Rust-CUDA.git"
revision = "728013419b6c4c80e099a42413574c36a9aff9c7"
language_extension = "rs"
allow_preexisting_diagnostics = true

View File

@@ -0,0 +1,8 @@
1. The first tool call should be a path search to locate the file or module that defines or contains `LIBDEVICE_BITCODE`. This is necessary to confirm the current source of the symbol (`cust_raw::nvvm_sys`) before refactoring.
2. Once the relevant path (e.g., inside `cust_raw::nvvm_sys`) is confirmed, the model should read the contents of that file to identify how `LIBDEVICE_BITCODE` is currently defined or reexported.
3. The model should then path search or navigate to the `nvvm` crate root (e.g., `crates/nvvm/src/lib.rs`) and modify it to include a `pub use` statement that reexports `LIBDEVICE_BITCODE` publicly from its original location.
4. The model should locate all usages of `cust_raw::nvvm_sys::LIBDEVICE_BITCODE` in the `rustc_codegen_nvvm` crate and replace them with the new `nvvm::LIBDEVICE_BITCODE` path.
5. The `rustc_codegen_nvvm/Cargo.toml` file should be read and modified to remove the dependency on `cust_raw` if it is no longer used directly.
6. After updating imports, the model should clean up any `use` statements in `rustc_codegen_nvvm` files that reference `cust_raw` and are now redundant.
7. At no point should the model attempt to remove or spin up irrelevant files or dependencies—tool use should be precise, focused on `nvvm`, `cust_raw`, and `rustc_codegen_nvvm` only.
8. The model must not attempt to guess paths or rely on heuristics when looking for crate files—it should perform path search or directory listing when uncertain.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/SAP-samples/abap-cheat-sheets.git"
revision = "262c0472eeb03e05ff8235767356a328d97850e6"
language_extension = "rs"

View File

@@ -1,3 +0,0 @@
1. The file `.reuse/dep5` has been deleted. This file previously contained copyright and licensing information in Debian's copyright format, including details about API usage with SAP products, copyright notice (2022 SAP SE or affiliates), and Apache-2.0 license information.
2. A new file `REUSE.toml` has been created with similar copyright and licensing information but in a different format. It includes the package name, supplier information, download location, and the same detailed disclaimer about API usage with SAP products that was in the deleted file.
3. The new `REUSE.toml` file also contains annotations specifying that the copyright text and Apache-2.0 license apply to all files (`path = "**"`) with aggregate precedence, effectively maintaining the same licensing terms but in a different configuration format.

View File

@@ -1,17 +0,0 @@
I need to switch our license stuff from the old .reuse/dep5 file to the new REUSE.toml format. basically same info, just different format. here's what's in the old file:
project name: abap-cheat-sheets
contact: daniel reger's email
repo link
that long SAP API disclaimer
copyright: SAP + contributors, 2022
license: Apache-2.0
need to:
delete the old .reuse/dep5 file
make a new REUSE.toml with:
same project info (name, contact, repo)
same exact API disclaimer text
SPDX-style copyright & license fields
apply to all files (** glob) with aggregate precedence
not changing any actual license terms, just updating the format. can you give me the exact REUSE.toml file we need?

View File

@@ -0,0 +1,8 @@
1. The first tool call should search for the file containing the text generation entry point—most likely something like `main.rs`, `cli.rs`, or another file in the binary or CLI layer—rather than guessing and reading files blindly.
2. When modifying argument parsing, the model should not remove fields like `temperature` or `top_p`—instead, it should set reasonable default values and ensure they are non-optional in downstream code.
3. When changing `verbose_prompt` to `verbose`, the model must update all places in the codebase where `verbose_prompt` was previously referenced, not just the CLI argument itself.
4. When updating how optional paths (`cache_path`, `weight_path`) are handled, the logic should gracefully fall back to defaults rather than panic or unwrap without checks.
5. Deserialization from a JSON config should be added using Serde. The tool should avoid hardcoding configuration values and should prefer loading from a file with a fallback to sensible defaults using helper functions.
6. The `Config` struct should be extended with a `rope_ratio` field, which includes a `Default` implementation or similar mechanism (e.g. `fn default_rope_ratio() -> f32`) to allow for clean deserialization.
7. Any reordering of imports or cleanup should not introduce functional regressions or changes to logic; these changes should only enhance code clarity and consistency.
8. The model should avoid spinning—repeated unnecessary tool calls such as rereading the same files or re-requesting already loaded information—and should move forward once relevant context has been gathered.

View File

@@ -0,0 +1,7 @@
1. The first tool call should involve searching for the file containing `report_data_size` within the relevant module to locate its definition and understand its current visibility level. The correct path should be used, not an inferred guess.
2. Once the path is found, the visibility of the `report_data_size` function should be updated from `pub(super)` to `pub`, ensuring it's publicly accessible.
3. Next, search for the `record_tx_metrics` function in the `datastore.rs` file. The tool should remove the previously commented-out code invoking `report_data_size`, as it's no longer needed and is being refactored.
4. After this, a new function `update_data_size_metrics` should be added to the `RelationalDB` struct. This function should encapsulate the logic for invoking `report_data_size`, ensuring better clarity in how the database layer handles data size metrics.
5. A rename and refactor of the `storage_monitor` function should follow. It should be renamed to `metric_reporter`, and the function's responsibilities should be updated to reflect its new role in periodically reporting disk usage and invoking `update_data_size_metrics`.
6. Asynchronous operations and time intervals related to disk usage reporting should be carefully evaluated to ensure there is no unnecessary repetition or redundant processes, ensuring more efficient metric collection.
7. The code should include a `TODO` comment for potential future improvements, such as adding functionality for heap usage metrics. This serves as a reminder for future enhancement without breaking current functionality.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/AsyncBanana/microdiff"
revision = "ce2055948483d01fb1e96def4ab98d6339d3b2f9"
language_extension = "js"

View File

@@ -1,6 +0,0 @@
1. **NaN Comparison Logic Update**:
The diff modifies the comparison function to explicitly handle NaN values as equivalent. Previously, the function relied on string conversion for NaN comparison, but now it first checks if both values are NaN using Number.isNaN() before proceeding with other comparison logic. This change ensures consistent behavior when comparing NaN values in objects.
2. **New NaN Test Suite - Object Operations**:
A comprehensive test suite is added to verify NaN handling in object operations. The tests cover: creating new objects with NaN values, changing NaN values to other numbers, verifying no changes when NaN values remain the same, and removing properties with NaN values. Each test case validates the diff output structure and type of operation.
3. **New NaN Test Suite - Array Operations**:
The test suite extends to array operations with similar test cases as objects but adapted for array contexts. It tests: adding NaN to arrays, replacing NaN with other numbers, maintaining arrays with unchanged NaN values, and removing NaN elements from arrays. The tests ensure consistent behavior between object and array operations involving NaN values.

View File

@@ -1 +0,0 @@
The goal of this update is to fix NaN value handling in our JavaScript object diffing functionality. Currently, the diff function fails to properly recognize that two NaN values should be treated as equal due to JavaScript's native behavior where `NaN !== NaN`. This causes incorrect change detection when comparing objects or arrays containing NaN values. The solution involves modifying the diff function to explicitly check for NaN values using `Number.isNaN()` during comparisons of object keys and values, ensuring NaN values are treated as equivalent. The implementation requires adding specific NaN equivalence checks while maintaining existing comparison logic. Additionally, comprehensive unit tests are being added to verify correct handling across various scenarios: creating objects/arrays with NaN values, changing NaN values to other values, ensuring no false positives when NaN values remain unchanged, and properly tracking removal of NaN values from both objects and arrays. This change will bring the diff behavior in line with mathematical expectations for NaN comparisons while maintaining all other existing functionality.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/bevyengine/bevy.git"
revision = "ac52cca033b351cc966cd3d40eb99ffbefbdb104"
language_extension = "rs"

View File

@@ -1,5 +0,0 @@
1. Introduces a stable-Rust-compatible workaround for the unstable `!` (never) type by implementing a custom `Never` alias based on a trait (`FnRet`) and function signature (`fn() -> !`), mimicking the behavior of the `never_say_never` crate without an external dependency.
2. Adds trait impls that enable Bevy systems and commands to accept `Never` as an output type, ensuring compatibility with panicking closures or intentionally non-returning functions like `todo!()` or `panic!()`.
3. Updates internal wrappers (`InfallibleSystemWrapper`, `InfallibleObserverWrapper`) and trait bounds across observer and schedule systems to support this workaround by allowing `Never` as a valid output type while maintaining existing fallible/infallible behavior.
4. Adds robust regression test coverage to ensure these `Never`-based trait implementations compile and function as expected, specifically targeting closures and functions that use `todo!()` or diverge without returning.
5. Ensures this workaround does not compromise stability guarantees by isolating `Never` usage to internal APIs and clearly documenting the risks and rationale in the new `never.rs` module.

View File

@@ -1 +0,0 @@
I'd like to add stable Rust support for handling the `!` (never) type in Bevy's ECS systems, in light of changes introduced in the Rust 2024 edition around never type fallback inference. Please create a new internal module (e.g., `never.rs`) that provides a type alias `Never` using a workaround based on a trait and `fn() -> !` to simulate the behavior of the unstable `!` type. Update the necessary traits and system wrappers (such as `HandleError`, `IntoScheduleConfigs`, and `IntoObserverSystem`) to accept `Never` as a valid output type, ensuring that closures or systems using `todo!()` or panics can still compile and behave correctly. Add a set of regression tests that exercise this compatibility by queuing and scheduling systems and commands with `todo!()` as their body, ensuring trait impls are resolved properly. Make sure to document this hack in the new module with a clear explanation of why it's being used and the risks involved.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/redis/redis-vl-python.git"
revision = "494e5e2f8cf800b90c7383385095c2e503404bc5"
language_extension = "py"

View File

@@ -1,3 +0,0 @@
1. The changes involve renaming the `TestData` class to `LabeledData` across multiple files. This includes updating the import statements in `__init__.py`, `cache.py`, `router.py`, `schema.py`, and `utils.py` to reflect this new class name. The `__all__` list in `__init__.py` is also updated to export `LabeledData` instead of `TestData`. This appears to be a conceptual renaming to better reflect the purpose of the data structure.
2. The modifications update all function signatures and type hints that previously used `TestData` to now use `LabeledData`. This affects several functions in `cache.py` including `_generate_run_cache`, `_eval_cache`, and `_grid_search_opt_cache`, as well as functions in `router.py` like `_generate_run_router` and `_eval_router`. The utility functions in `utils.py` are also updated to work with `LabeledData` instead of `TestData`.
3. The changes introduce a new `search_step` parameter in the router optimization logic within `router.py`, with a default value of 0.10. This parameter is passed through to the `_router_random_search` function and is used in the optimization process. The test file `test_threshold_optimizer.py` is updated to explicitly set this parameter to 0.5 when calling the optimize method, demonstrating how it can be configured for different search granularities during threshold optimization.

View File

@@ -1 +0,0 @@
I need to refactor our codebase to improve the clarity and consistency of our data model, particularly around how we handle labeled evaluation data for our threshold optimization system. Currently, the naming and structure might imply that this data is only used for testing, when in reality it represents labeled examples that power both training and evaluation. The changes should better reflect that these are curated data points with known outcomes, not just test cases. Focus on updating the core data model and ensuring all dependent components—like the cache optimizer, router, and evaluation utilities—properly reference this updated concept. The implementation should maintain all existing functionality while making the naming more semantically accurate. Where relevant, consider adding parameters to fine-tune optimization behavior, like allowing control over the granularity of threshold searches.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/matryer/goblueprints.git"
revision = "68041a598865cc3f4fa2acd4119081a2ea0826bf"
language_extension = "go"

View File

@@ -1,12 +0,0 @@
1. The main.go changes introduce rate-limited endpoints by creating them via `MakeEndpoints` and passing them to both HTTP and gRPC servers instead of directly using the service. This includes:
- Adding endpoint creation before server startup
- Modifying HTTP server to use endpoints
- Modifying gRPC server to use endpoints
2. The server_grpc.go changes update the gRPC server implementation to use the provided endpoints instead of creating them internally. This affects both hash and validate endpoints which are now taken from the Endpoints struct rather than being created via makeHashEndpoint/makeValidateEndpoint.
3. The server_http.go changes mirror the gRPC server changes, modifying the HTTP server to use endpoints from the Endpoints struct rather than creating them internally for both hash and validate routes.
4. The service.go changes include:
- Renaming makeHashEndpoint to MakeHashEndpoint and making it public
- Renaming makeValidateEndpoint to MakeValidateEndpoint and making it public
- Adding new MakeEndpoints function that creates rate-limited endpoints using a token bucket (5 requests per second)
- Adding new dependencies for rate limiting (kitrl and ratelimit packages)
- The Endpoints struct remains the same but is now populated with rate-limited versions of the endpoints

View File

@@ -1,18 +0,0 @@
Here's a more abstract, goal-oriented version of your request without diving into implementation specifics:
---
### **Request: Add Rate Limiting to Vault Service**
We need to introduce rate limiting to our vault service to protect it from excessive traffic and ensure fair usage. The service currently handles password hashing and validation through both HTTP and gRPC, and we want to enforce a controlled request rate across all endpoints.
#### **Key Requirements:**
- Apply a global rate limit (e.g., 5 requests per second) to prevent abuse.
- Ensure the rate limiting works consistently across both HTTP and gRPC interfaces.
- Refactor the service to cleanly support rate limiting without breaking existing functionality.
- Maintain flexibility so that limits can be adjusted if needed.
#### **Implementation Approach (High-Level):**
- Use a token bucket or similar algorithm for smooth rate limiting.
- Integrate with our existing middleware/request pipeline.
- Keep the changes minimal but scalable for future adjustments.

View File

@@ -0,0 +1,6 @@
1. **Initial Tool Call**: The first tool call should involve a path search for the files containing the `hold` field, specifically looking for `Options`, `TerminalOptions`, and `PtyOptions`. This should not start with a guess at the file paths but should resolve the paths from the actual locations in the codebase.
2. **Renaming `hold`**: Once the correct files are identified, the tool should rename `hold` to `drain_on_exit` in the specified structs (`Options`, `TerminalOptions`, `PtyOptions`) and update the code to use the new field name. This should ensure that all the relevant code reflects the new semantic distinction.
3. **Behavioral Update**: Modify any logic that previously relied on `hold` to now use `drain_on_exit`. This includes updating `event_loop.rs` to ensure that the output is drained properly and the window behavior is adjusted accordingly.
4. **New `Window` Field**: Add the new `hold` field to the `Window` struct, separate from the terminal process logic, and ensure it is initialized and integrated properly into the codebase.
5. **Manual Window Close Behavior**: Update the exit logic to explicitly set the new `hold` field to `false` when a user manually closes the window via `WindowEvent::CloseRequested`. This ensures that shutdown occurs as expected.
6. **Changelog Entry**: Ensure that a changelog entry is created for version `0.25.0-dev` documenting the change from `hold` to `drain_on_exit` and the introduction of the new `hold` field in the `Window` struct. The entry should explain the reasoning behind the change to improve clarity and separate terminal process handling from window behavior.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/localtunnel/localtunnel.git"
revision = "4c136a265c2005bcb81bf47709c8ca9b634f2fc1"
language_extension = "js"

View File

@@ -1,3 +0,0 @@
1. The first change replaces the `request` module import with `axios` in Tunnel.js. This is accompanied by modifications to the request parameters where `path` and `json` fields are removed and replaced with `responseType: 'json'`. The request URI construction is also slightly modified to separate the base URI from the parameters.
2. The second chunk shows significant changes to the request handling logic in Tunnel.js. The callback-based `request` implementation is replaced with a promise-based `axios.get` approach. The error handling is restructured to use `.catch()` instead of checking for errors in the callback. The success case now extracts data from `res.data` instead of directly from the response body, and the status code check looks at `res.status` instead of `res.statusCode`.
3. The third chunk shows changes to package.json where the `request` dependency is removed and replaced with `axios` at version 0.17.1. The dependencies are also reordered, with `debug` and `openurl` moved up and `yargs` moved to the end of the list, though their versions remain unchanged. The devDependencies section remains untouched.

View File

@@ -1 +0,0 @@
I need help modernizing the HTTP client in my Node.js tunneling service. The current implementation uses the older `request` library, which is now deprecated, and I'd like to switch to a more modern, promise-based alternative like `axios`. The changes should maintain all existing functionality—including error handling, retry logic, and response parsing—but improve readability and maintainability by using async/await or proper promise chaining where possible. The request parameters and response handling should be updated to match the new library's conventions while preserving the same behavior for downstream consumers. Additionally, ensure the package.json dependencies are updated accordingly, removing deprecated packages and cleaning up the dependency list. The core tunneling logic should remain unchanged; this is purely about updating the HTTP client layer to be more future-proof.

View File

@@ -0,0 +1,5 @@
1. The first tool call should be to locate the relevant Python files (`_lancedb.pyi`, `table.py`, `remote/table.py`) and Rust files (`python/src/table.rs`, `rust/lancedb/src/remote/table.rs`). The search should be based on file names and their specific locations in the project. (*Not* a guess based on partial file paths.)
2. After locating the Python files, modify the async bindings (`_lancedb.pyi`, `table.py`, and `remote/table.py`) to update the method signature of `restore` to `restore(version: Optional[int] = None)`, and align their implementations to perform a checkout if a version is specified.
3. When making changes to the Rust FFI layer (`python/src/table.rs`), the tool should be used to ensure that the Rust code properly handles the optional `version` argument and performs the checkout before proceeding with restore, ensuring compatibility with the new Python interface.
4. The `RemoteTable` implementation in `rust/lancedb/src/remote/table.rs` should be modified to include logic for sending versioned restore requests over HTTP, and the tool should be used to verify the accuracy of the version being included in the request body.
5. Throughout all changes, the tool should also ensure that docstrings and comments are updated to clarify the behavior of the `restore` method, especially in cases where the `version` argument is provided or omitted. The tool should aim to prevent any omissions or outdated explanations in the code.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/thalissonvs/pydoll.git"
revision = "9ea9e91c716b60a7cc8f11ecd865093d460f31aa"
language_extension = "py"

View File

@@ -1,6 +0,0 @@
1. **Added RuntimeCommands import and WebElement to page.py**
The changes add an import for `RuntimeCommands` and `WebElement` to `page.py`. The `execute_js_script` method is renamed to `execute_script` and enhanced to support execution in the context of a WebElement. The method now uses `RuntimeCommands` for script evaluation.
2. **Refactored Runtime-related commands from DomCommands to new RuntimeCommands class**
The changes move all Runtime-related command templates and methods from `DomCommands` in `dom.py` to a new `runtime.py` file. This includes `EVALUATE_TEMPLATE`, `CALL_FUNCTION_ON_TEMPLATE`, `GET_PROPERTIES`, and their associated methods. The DomCommands class now uses RuntimeCommands for JavaScript evaluation.
3. **Added Scripts constants and enhanced WebElement functionality**
The changes add a new `Scripts` class to `constants.py` containing JavaScript snippets for common operations. The `element.py` file is significantly enhanced with new methods for script execution, visibility checking, and improved click handling. New exceptions are added to `exceptions.py` for better error handling.

View File

@@ -1,7 +0,0 @@
I'm looking to improve our Python web automation library (pydoll) to make it more robust and maintainable, particularly around JavaScript execution and element interactions. Currently, we need to better organize our Runtime-related commands and enhance how scripts are executed in the browser context.
The main focus areas include creating a dedicated RuntimeCommands class to centralize all JavaScript-related operations, moving these functions out of DomCommands for cleaner separation of concerns. This new class would handle script evaluation, function calling, and property lookups. We should also enhance the existing page.execute_js_script method—renaming it to execute_script for clarity—and expand its functionality to support execution within specific WebElement contexts, including passing elements as arguments.
For element interactions, we need more reliable mechanisms, particularly around clicking elements. The improvements would include visibility checks, verifying elements aren't obscured, and implementing proper error handling with descriptive exceptions when interactions fail. The current click implementation should be moved to realistic_click, while the new click method would incorporate these safety checks. Additionally, we should consolidate commonly used JavaScript snippets into a centralized Scripts class for better maintainability.
The overall goal is to strengthen the library's reliability for automation tasks while making the codebase more organized and easier to maintain. These changes will provide better error handling, clearer structure, and more intuitive APIs for working with page elements and JavaScript execution. Would you be able to help break this down into actionable steps or suggest any improvements to this approach?

View File

@@ -1,3 +0,0 @@
url = "https://github.com/basecamp/kamal.git"
revision = "0174b872bfc34b66852cffb58514ae079f21d299"
language_extension = "rb"

View File

@@ -1,7 +0,0 @@
1. The changes introduce a new `DependencyError` class in `kamal/cli.rb` alongside other error classes like `BootError` and `HookError`. This new error class will be used to handle dependency-related failures.
2. In `kamal/cli/base.rb`, a new method `ensure_docker_installed` is added which checks for Docker and buildx plugin installation locally. It raises the new `DependencyError` with appropriate messages if either Docker or buildx plugin are not found, replacing similar functionality that was previously scattered elsewhere.
3. The `kamal/cli/build.rb` file is modified to use the new `ensure_docker_installed` method instead of the removed `verify_local_dependencies` method. The error handling is now consistent, using `DependencyError` instead of `BuildError` for dependency-related failures.
4. The `kamal/cli/registry.rb` file now includes a call to `ensure_docker_installed` at the start of the login method, ensuring Docker is available before attempting registry operations.
5. The `kamal/commands/base.rb` file adds a new public method `ensure_docker_installed` that combines checks for both Docker and buildx plugin installation, moving this functionality from the Builder class.
6. The `kamal/commands/builder.rb` file is simplified by removing the `ensure_local_dependencies_installed` method and related private methods, as this functionality has been moved to the base commands class.
7. Test files are updated to reflect these changes, with `build_test.rb` now expecting `DependencyError` instead of `BuildError` for dependency failures, and `registry_test.rb` adding a new test case for Docker dependency checking during login.

View File

@@ -1 +0,0 @@
I need to improve how our codebase handles Docker dependency checks and error reporting. Right now, the logic for verifying Docker and buildx installations is scattered across different classes, and the error messages aren't consistent. I'd like a more unified approach where we centralize these checks in a single place, making it easier to maintain and reuse. Additionally, we should introduce a dedicated error type for dependency-related failures instead of repurposing existing errors like BuildError. The changes should ensure that any command requiring Docker (like builds or registry logins) properly validates dependencies first, with clear error messages if something is missing. The solution should be clean, follow existing patterns in the codebase, and include any necessary test updates to reflect the new behavior.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/duyet/clickhouse-monitoring.git"
revision = "b8ab1a957115f41c916e7061b432ae00b1bbe7db"
language_extension = "ts"

View File

@@ -1,5 +0,0 @@
1. The SQL query in tables-overview.ts has been enhanced to include additional metrics for part sizes, both average and maximum. New fields have been added for compressed and uncompressed average part sizes with their readable formats and percentage calculations. Similarly, maximum part size metrics have been added with the same set of calculations. These additions provide more granular visibility into table partition characteristics while maintaining the existing percentage calculations relative to the maximum values across all tables.
2. The column ordering and formatting in tables-overview.ts has been updated to accommodate the new part size metrics. The new readable_avg_part_size and readable_max_part_size columns have been added to the columns array and configured with BackgroundBar formatting. The engine column has been moved to the end of the list for better grouping of related metrics. The sortingFns configuration has been added to specify custom sorting behavior for various compressed and uncompressed size columns.
3. The column definitions system has been enhanced to support custom sorting functions. A new sorting-fns.ts file has been created containing a sort_column_using_actual_value function that enables sorting based on underlying numeric values rather than formatted strings. The getColumnDefs function now checks for both custom and built-in sorting functions in the config and applies them appropriately to column definitions.
4. The data table component has been updated to include custom sorting functions in its configuration. The getCustomSortingFns function is now passed to the table's sortingFns option, making these functions available for all columns. The ValueOf utility type has been added to generic.ts to support proper typing of the sorting functions.
5. The query config type has been extended to include a new optional sortingFns property. This property allows specifying custom sorting functions for specific columns in the table configuration. The type imports have been reorganized, and CustomSortingFnNames is now properly imported and used in the QueryConfig interface.

View File

@@ -1 +0,0 @@
I need to enhance our data table functionality to support more advanced sorting capabilities, particularly for columns that display formatted values (like readable sizes or percentages) but should sort based on their underlying raw numeric values. The table should also include additional metrics for average and maximum part sizes (both compressed and uncompressed) to give better insights into table storage characteristics. These new metrics should follow the same pattern as existing columns, with formatted readable versions, percentage calculations relative to the dataset maximum, and proper sorting behavior. The sorting system should be flexible enough to support both custom sorting logic (like comparing raw numbers behind formatted strings) and built-in sorting methods, with a clean way to configure which columns use which sorting approach. The implementation should maintain consistency with our existing column formatting system and integrate smoothly with the React Table setup we already have in place.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/go-playground/validator.git"
revision = "4676b8e43bb907ef07f3bcc4ae2a218b05d60397"
language_extension = "go"

View File

@@ -1,3 +0,0 @@
1. Documentation updates in README.md, where a new validation type for Employer Identification Numbers (EIN) was added to the supported validators table. This addition was carefully positioned between the existing "e164" phone number format and "email" validators to maintain alphabetical ordering. The entry follows the established table format with pipe-separated columns and includes a clear description indicating its purpose for validating U.S. Employer Identification Numbers. Notably, this change was made without modifying any of the existing documentation entries, preserving all current validator descriptions while expanding the supported validation types.
2. Core implementation of the EIN validation across multiple files. In baked_in.go, this involved adding an "ein" entry to the validator map that points to a newly created isEIN function, following the same pattern as other validator registrations. The isEIN() function itself implements the validation logic, checking for both length requirements (exactly 10 characters) and pattern matching using a new regular expression. The regexes.go file was updated with a new einRegexString constant defining the EIN pattern (##-#######) and corresponding regex variable initialization, utilizing the existing lazyRegexCompile helper function for consistency. Documentation was added in doc.go following the established format for validator descriptions, complete with a simple usage example. Throughout these changes, careful attention was paid to maintain consistent error handling patterns and code organization while removing unnecessary newlines in several functions to improve readability.
3. Testing improvements and code quality enhancements, primarily in validator_test.go. A comprehensive TestEINStringValidation test case was added, covering various valid and invalid EIN formats, including tests for length requirements and hyphen positioning. This new test follows the same structure and assertion patterns as existing validation tests. Numerous code quality improvements were made throughout the test file, including grouping interface declarations, fixing comment formatting, removing unnecessary newlines in struct declarations, correcting indentation in test cases, and adding missing newlines between tests. These changes significantly improved code readability while maintaining all existing test logic and ensuring backward compatibility. The improvements demonstrate careful attention to maintaining consistent patterns throughout the test suite while adding thorough test coverage for the new EIN validation functionality.

View File

@@ -1,10 +0,0 @@
Add validation support for Employer Identification Numbers (EIN) to the Go validator library
I need to implement a new validator function for US Employer Identification Numbers (EIN) in this Go validation library. The EIN validator should:
1. Create a new tag called "ein" that validates if a string is a valid US Employer Identification Number
2. Follow the pattern of ##-#######, where # is a digit (regex pattern would be ^(\d{2}-\d{7})$)
3. Ensure the field contains exactly 10 characters (including the hyphen)
4. Document the new validator in the README.md and doc.go files
5. Add proper unit tests to verify validation works correctly for valid and invalid EINs

View File

@@ -1,3 +0,0 @@
url = "https://github.com/dagster-io/dagster.git"
revision = "c9ed914a76baa6fb761a97f3236f96cd7d5361e6"
language_extension = "py"

View File

@@ -1,3 +0,0 @@
1. Introduce a new docker-compose.yml file in the integration tests directory for the monitoring daemon test suite. This file defines two services: a PostgreSQL database with test credentials exposed on port 5432, and a localstack S3 service exposed on port 4566. These services provide the necessary infrastructure for running the monitoring tests.
2. Shows significant modifications to the test_monitoring.py file, including new imports (boto3, Path, and docker_compose_cm), removal of the dagster_aws tests import, and the addition of new fixtures. The new fixtures handle docker-compose setup, provide hostnames for services, configure AWS environment variables with test credentials, and initialize an S3 bucket for testing purposes. The changes reflect a shift from using external AWS credentials to using localstack for S3 testing.
3. Reveals structural changes to the test file, where the aws_env fixture has been moved from the bottom of the file to be grouped with other fixtures. The original implementation that relied on get_aws_creds() has been replaced with a new implementation that uses localstack with hardcoded test credentials, and the test_docker_monitoring_run_out_of_attempts function remains at the end of the file but now uses the new aws_env fixture implementation.

View File

@@ -1 +0,0 @@
Refactor the monitoring daemon integration tests to use local Docker-managed dependencies instead of direct AWS dependencies. First, create a docker-compose.yml file with two services: a PostgreSQL container with test credentials exposed on port 5432, and a LocalStack S3 container exposed on port 4566. Next, modify the test file to remove reliance on external AWS credentials and replace them with fixtures that configure a LocalStack S3 mock. The fixtures should include session-scoped setup for hostnames, PostgreSQL connections, and AWS environment variables with hardcoded test credentials (e.g., fake access keys). Ensure the S3 fixture initializes a test bucket. Move the AWS environment fixture to align with other fixtures and update the test logic to use the new LocalStack endpoint URL, handling both local and Buildkite environments. Keep the core test cases (like monitoring run attempts) intact but adapt them to use the new Docker-based dependencies.

View File

@@ -1,3 +1,4 @@
url = "https://github.com/tikv/tikv.git"
revision = "be74cadcdd6608e5788d0c2a6784c456b4ce84e6"
language_extension = "rs"
allow_preexisting_diagnostics = true

View File

@@ -0,0 +1,6 @@
1. The first tool call should be to search for the `write_time_detail` function in the codebase. This will help identify all locations where the old function is being used and needs to be updated.
2. Once the occurrences of `write_time_detail` are found, the tool should replace those references with `merge_time_detail`. This change should occur in files such as `src/coprocessor/endpoint.rs`, `src/server/service/kv.rs`, and `src/storage/txn/tracker.rs`.
3. After updating the code to use `merge_time_detail`, the tool should ensure that the function signature and usage align with existing code practices, ensuring consistency across the codebase.
4. A new test case should be added to the file `tests/integrations/coprocessor/test_select.rs`. The test should validate that `process_wall_time_ns` is not zero and confirm that time details are being correctly merged without regressions.
5. The changes must be backward-compatible, ensuring that no functionality is broken, and that the merging process preserves previously existing values while adding new time details.
6. After implementing the changes, verify that no additional files (other than those related to `write_time_detail` or `merge_time_detail`) are modified, ensuring that the change is limited to the relevant scope.

View File

@@ -0,0 +1,5 @@
1. The first tool call should focus on updating the version numbers of all Goose-related packages (`goose`, `goose-bench`, `goose-cli`, `goose-mcp`, `goose-server`) in the `Cargo.lock` file. The tool must verify and update all relevant instances of the version, ensuring consistency across all package references. This should happen before updating any code in the source files.
2. After updating the `Cargo.lock`, the next tool call should update the `ui/desktop/package-lock.json` file to reflect the updated version `1.0.18` for `goose-app`. The tool should ensure that the version number is updated accurately and no other dependencies are unintentionally altered.
3. The tool should then address the changes in `App.tsx`, ensuring that `addExtensionToConfig` references are replaced with `addExtension`. This involves scanning the file for all instances of the old function and updating them to use the new method. The tool should also ensure that any corresponding hooks or async calls are properly updated to reflect the new usage.
4. Ensure that no functional behavior is altered during the update process. The tool should only make the necessary updates and not introduce any additional changes that would affect the app's operation.
5. Avoid unnecessary file removals or modifications outside of the specified tasks. The tool should focus strictly on the version updates and function changes as specified, without making extraneous edits.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/django-cms/django-cms.git"
revision = "0b775f27300c4347be18a5bb7b1b172d6a943ccf"
language_extension = "py"

View File

@@ -1,3 +0,0 @@
1. The changes add two new URL patterns ('cms_placeholder_add_plugin' and 'cms_placeholder_edit_plugin') to the list of endpoints in the toolbar middleware configuration. These endpoints will now be recognized by the toolbar system.
2. The changes add test cases for the new toolbar endpoints in the test file. The first test case verifies that the toolbar is properly attached to requests for the 'cms_placeholder_add_plugin' admin endpoint. The test creates a mock request and checks that the toolbar attribute is present after middleware processing.
3. The changes include a second test case that verifies toolbar functionality for the 'cms_placeholder_edit_plugin' admin endpoint. Similar to the first test, it creates a mock request with plugin ID (1) and checks for the presence of the toolbar attribute after middleware processing. This maintains consistency with the existing test for 'cms_placeholder_clear_placeholder'.

View File

@@ -1,3 +0,0 @@
I'm working on improving the Django CMS toolbar middleware to better support plugin management functionality. Currently, the toolbar is only enabled for specific views defined in the `TOOLBAR_URL_PREFIXES` within toolbar.py, but I've noticed we're missing support for two critical plugin-related operations: adding and editing plugins through the `cms_placeholder_add_plugin` and `cms_placeholder_edit_plugin` views. These views should have access to the toolbar object just like our other administrative actions, as they're fundamental to the content editing experience.
To implement this enhancement, we'll need to make two key changes. First, we should add both 'cms_placeholder_add_plugin' and 'cms_placeholder_edit_plugin' to the allowed URL prefixes list in cms/middleware/toolbar.py. Second, we should expand our test coverage in cms/tests/test_toolbar.py to verify that the toolbar object is properly attached to requests hitting these endpoints, maintaining consistency with how we test other toolbar-enabled views. This change will ensure a more complete and reliable toolbar experience throughout the entire plugin management workflow.

View File

@@ -0,0 +1,7 @@
1. The first tool call should search for references to `FileEngine`, `AsyncFileEngine`, and `PendingRequest` within the project directory, specifically focusing on the files where these types are declared and used. This ensures that the correct locations are targeted for refactoring.
2. After locating the relevant files, the model should begin by directly replacing the generic type parameters of `FileEngine`, `AsyncFileEngine`, and related structures with `PendingRequest`. This change should be implemented throughout the project, ensuring consistency.
3. When modifying error handling, the model should focus on replacing `UserDataError` with `RequestError`, ensuring that `RequestError` carries `PendingRequest` information. This should be done throughout the codebase, including refactoring any associated handling of error contexts.
4. The model should rename `WrappedUserData` to `WrappedRequest` and ensure that `WrappedRequest` directly embeds `PendingRequest`. Any usages of `WrappedUserData` should be updated to reflect the new naming and structural changes.
5. For test code updates, the model should search for any test cases or mocks that use placeholder types (`()`) and replace them with `PendingRequest::default()` to maintain type consistency and proper request initialization.
6. The tool should assist in consolidating imports by searching for duplicate import statements (such as `IO_URING_NUM_ENTRIES` and `PendingRequest`) and simplifying them to avoid redundancy.
7. The model should verify that the existing async/sync I/O functionality is preserved, including ensuring that the dirty memory tracking and request completion logic still work correctly after the refactor.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/jetty/jetty.project.git"
revision = "dc685b6f84e94ad2eb6a3930769e6eab0cab3fa6"
language_extension = "java"

View File

@@ -1,7 +0,0 @@
1. The changes add an import for `URIUtil` and modify the URL creation in `OSGiApp.java` to use `URIUtil.correctURI()` for proper URI handling. The modification ensures correct URI formatting before converting to URL.
2. The changes add an import for `URIUtil` and modify the URI creation in `Util.java` to use `URIUtil.correctURI()` when handling file paths. This ensures proper URI formatting for paths starting with "file:/".
3. The changes in both `WebInfConfiguration.java` files (EE10 and EE9 versions) refactor the war file handling logic. The modifications:
- Add explanatory comments about looking for sibling directories
- Change how the war path is obtained (using webApp.getPath() instead of creating new resources)
- Restructure the conditional logic for better clarity
- Maintain the same functionality but with improved safety checks and documentation

View File

@@ -1,7 +0,0 @@
I'm working on improvements to a Jetty OSGi application's file path handling and deployment logic. The changes focus on two main areas: URI normalization and WAR file extraction.
First, the URI handling logic needs updates to ensure consistent formatting, particularly when dealing with file paths. Currently, there are cases where paths aren't properly normalized, especially when converting between file URIs and URLs. This affects both core OSGi resource resolution and utility methods that process path strings. The goal is to apply systematic corrections so that paths are reliably formatted across different scenarios.
Second, the WAR file extraction process requires refinement to make it more robust. The current implementation checks for pre-extracted sibling directories, but the logic could be strengthened by using the resolved webApp path directly rather than reconstructing it from strings. Additionally, the code would benefit from clearer documentation and added safeguards to handle edge cases gracefully. These changes will apply to both the EE9 and EE10 WebApp configurations, ensuring consistent behavior across versions.
The overarching aim is to reduce deployment failures and improve maintainability while keeping the changes backward-compatible.

View File

@@ -1,3 +0,0 @@
url = "https://github.com/charmbracelet/bubbletea.git"
revision = "bc1c475eb0263aba13ef430f191677e153dc0320"
language_extension = "go"

View File

@@ -1,4 +0,0 @@
1. Adds a new `setWindowTitle` method to the `standardRenderer` struct that sets the terminal window title using the OSC 0 escape sequence. It includes thread safety with mutex locking and uses fmt.Fprintf to send the escape sequence with the provided title.
2. Modifies the `handleMessages` method in `standardRenderer` to handle a new `setWindowTitleMsg` message type by calling the new `setWindowTitle` method. This completes the rendering-side implementation for window title updates.
3. Updates the event loop in the Program struct to properly handle `setWindowTitleMsg` messages by passing them through to the renderer without additional processing, similar to other renderer-specific messages.
4. Adds documentation to the commands tutorial README explaining how to set window titles in Bubble Tea applications. It shows examples of using `tea.SetWindowTitle()` in both Init and Update methods, and explains its usefulness for reflecting application state in the window title.

View File

@@ -1,11 +0,0 @@
I'd like to add the ability to set terminal window titles in our Bubble Tea framework. This would let applications dynamically update the title bar (e.g., to show status or app names).
Requirements:
Expose a user-friendly way to set titles (e.g., a SetWindowTitle command).
Ensure it works cross-platform with standard terminal escape codes.
Include a minimal example and docs showing usage.
Constraints:
Follow existing patterns for commands/messages.
Thread-safe rendering.

View File

@@ -24,13 +24,11 @@ use prompt_store::PromptBuilder;
use release_channel::AppVersion;
use reqwest_client::ReqwestClient;
use settings::{Settings, SettingsStore};
use std::env;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::usize;
use util::ResultExt as _;
pub const RUNS_DIR: &str = "./crates/eval/runs";
#[derive(Parser, Debug)]
#[command(name = "eval", disable_version_flag = true)]
struct Args {
@@ -52,13 +50,42 @@ struct Args {
/// Maximum number of examples to run concurrently.
#[arg(long, default_value = "10")]
concurrency: usize,
/// Custom identifier for the cohort (default: timestamp-based)
#[arg(long)]
cohort_id: Option<String>,
}
fn main() {
env_logger::init();
let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
let session_id = uuid::Uuid::new_v4().to_string();
let run_timestamp = chrono::Local::now().format("%Y-%m-%d_%H-%M-%S");
let run_id = match env::var("GITHUB_RUN_ID") {
Ok(run_id) => format!("github/{}", run_id),
Err(_) => format!("local/{}", run_timestamp),
};
let root_dir = Path::new(std::env!("CARGO_MANIFEST_DIR"))
.parent()
.unwrap()
.parent()
.unwrap();
let eval_crate_dir = root_dir.join("crates/eval");
let repos_dir = eval_crate_dir.join("repos");
let worktrees_dir = eval_crate_dir.join("worktrees");
let examples_dir = eval_crate_dir.join("examples");
let runs_dir = eval_crate_dir.join("runs");
let run_dir = runs_dir.join(format!("{}", run_timestamp));
std::fs::create_dir_all(&run_dir).unwrap();
std::fs::create_dir_all(&repos_dir).unwrap();
std::fs::create_dir_all(&worktrees_dir).unwrap();
std::fs::create_dir_all(&examples_dir).unwrap();
let zed_commit_sha = commit_sha_for_path(root_dir);
let args = Args::parse();
let all_available_examples = list_all_examples().unwrap();
let all_available_examples = list_all_examples(&examples_dir).unwrap();
let languages = args.languages.unwrap_or_else(|| vec!["rs".to_string()]);
let example_paths = all_available_examples
@@ -84,10 +111,6 @@ fn main() {
app.run(move |cx| {
let app_state = init(cx);
let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
let session_id = uuid::Uuid::new_v4().to_string();
app_state
.client
.telemetry()
@@ -115,13 +138,12 @@ fn main() {
cx.spawn(async move |cx| {
authenticate_task.await.unwrap();
std::fs::create_dir_all(REPOS_DIR)?;
std::fs::create_dir_all(WORKTREES_DIR)?;
std::fs::create_dir_all(&repos_dir)?;
std::fs::create_dir_all(&worktrees_dir)?;
let run_dir = Path::new(RUNS_DIR).join(format!(
"{}",
chrono::Local::now().format("%Y-%m-%d_%H-%M-%S")
));
let run_dir = runs_dir.join(args.cohort_id.clone().unwrap_or_else(|| {
format!("{}", chrono::Local::now().format("%Y-%m-%d_%H-%M-%S"))
}));
std::fs::create_dir_all(&run_dir)?;
let mut examples = Vec::new();
@@ -145,7 +167,12 @@ fn main() {
let mut skipped = Vec::new();
for example_path in &example_paths {
let example = Example::load_from_directory(example_path, &run_dir)?;
let example = Example::load_from_directory(
example_path,
&run_dir,
&worktrees_dir,
&repos_dir,
)?;
if !example
.base
@@ -195,7 +222,7 @@ fn main() {
let repo_url = example.base.url.clone();
if repo_urls.insert(repo_url.clone()) {
let repo_path = repo_path_for_url(&repo_url);
let repo_path = example.repo_path.clone();
if !repo_path.join(".git").is_dir() {
println!(
@@ -246,6 +273,8 @@ fn main() {
let app_state = app_state.clone();
let model = model.clone();
let example = example.clone();
let zed_commit_sha = zed_commit_sha.clone();
let run_id = run_id.clone();
cx.spawn(async move |cx| {
let result = async {
let run_output = cx
@@ -255,6 +284,8 @@ fn main() {
run_judge_repetition(
example.clone(),
model.clone(),
&zed_commit_sha,
&run_id,
&run_output,
round,
cx,
@@ -368,9 +399,7 @@ fn main() {
print_header("CUMULATIVE TOOL METRICS");
println!("{}", cumulative_tool_metrics);
std::thread::sleep(std::time::Duration::from_secs(2));
app_state.client.telemetry().flush_events();
app_state.client.telemetry().flush_events().await;
cx.update(|cx| cx.quit())
})
@@ -378,8 +407,8 @@ fn main() {
});
}
fn list_all_examples() -> Result<Vec<PathBuf>> {
let path = std::fs::canonicalize(EXAMPLES_DIR).unwrap();
fn list_all_examples(examples_dir: &Path) -> Result<Vec<PathBuf>> {
let path = std::fs::canonicalize(examples_dir).unwrap();
let entries = std::fs::read_dir(path).unwrap();
let mut result_paths = Vec::new();
for entry in entries {
@@ -533,79 +562,54 @@ pub fn find_model(
Ok(model)
}
pub async fn get_current_commit_id(repo_path: &Path) -> Option<String> {
(run_git(repo_path, &["rev-parse", "HEAD"]).await).ok()
}
pub fn get_current_commit_id_sync(repo_path: &Path) -> String {
futures::executor::block_on(async {
get_current_commit_id(repo_path).await.unwrap_or_default()
})
pub fn commit_sha_for_path(repo_path: &Path) -> String {
futures::executor::block_on(run_git(repo_path, &["rev-parse", "HEAD"])).unwrap()
}
async fn run_judge_repetition(
example: Example,
model: Arc<dyn LanguageModel>,
zed_commit_sha: &str,
run_id: &str,
run_output: &RunOutput,
round: u32,
cx: &AsyncApp,
) -> Result<JudgeOutput> {
let judge_result = example.judge(model.clone(), &run_output, round, cx).await;
let judge_output = example.judge(model.clone(), &run_output, round, cx).await;
if let Ok(judge_output) = &judge_result {
let cohort_id = example
.run_directory_path
.file_name()
.map(|name| name.to_string_lossy().to_string())
.unwrap_or(chrono::Local::now().format("%Y-%m-%d_%H-%M-%S").to_string());
let path = std::path::Path::new(".");
let commit_id = get_current_commit_id(path).await.unwrap_or_default();
if let Some(thread) = &judge_output.thread {
telemetry::event!(
"Agent Eval Completed",
cohort_id = cohort_id,
example_name = example.name.clone(),
round = round,
diff_score = judge_output.diff.score,
diff_analysis = judge_output.diff.analysis,
thread_score = thread.score,
thread_analysis = thread.analysis,
tool_metrics = run_output.tool_metrics,
response_count = run_output.response_count,
token_usage = run_output.token_usage,
model = model.telemetry_id(),
model_provider = model.provider_id().to_string(),
repository_url = example.base.url.clone(),
repository_revision = example.base.revision.clone(),
diagnostics_before = run_output.diagnostics_before,
diagnostics_after = run_output.diagnostics_after,
commit_id = commit_id
);
} else {
telemetry::event!(
"Agent Eval Completed",
cohort_id = cohort_id,
example_name = example.name.clone(),
round = round,
diff_score = judge_output.diff.score,
diff_analysis = judge_output.diff.analysis,
tool_metrics = run_output.tool_metrics,
response_count = run_output.response_count,
token_usage = run_output.token_usage,
model = model.telemetry_id(),
model_provider = model.provider_id().to_string(),
repository_url = example.base.url.clone(),
repository_revision = example.base.revision.clone(),
diagnostics_before = run_output.diagnostics_before,
diagnostics_after = run_output.diagnostics_after,
commit_id = commit_id
);
}
let diff_evaluation;
let thread_diff_evaluation;
if let Ok(output) = judge_output.as_ref() {
diff_evaluation = Some(output.diff.clone());
thread_diff_evaluation = output.thread.clone();
} else {
diff_evaluation = None;
thread_diff_evaluation = None;
}
judge_result
let enable_telemetry = env::var("ZED_EVAL_TELEMETRY").map_or(false, |value| value == "1");
if enable_telemetry {
telemetry::event!(
"Agent Example Evaluated",
zed_commit_sha = zed_commit_sha,
run_id = run_id,
example_name = example.name.clone(),
round = round,
diff_evaluation = diff_evaluation,
thread_evaluation = thread_diff_evaluation,
tool_metrics = run_output.tool_metrics,
response_count = run_output.response_count,
token_usage = run_output.token_usage,
model = model.telemetry_id(),
model_provider = model.provider_id().to_string(),
repository_url = example.base.url.clone(),
repository_revision = example.base.revision.clone(),
diagnostics_before = run_output.diagnostics_before,
diagnostics_after = run_output.diagnostics_after,
);
}
judge_output
}
fn print_header(header: &str) {
@@ -613,3 +617,56 @@ fn print_header(header: &str) {
println!("{:^40}", header);
println!("========================================\n");
}
#[cfg(test)]
mod tests {
use super::*;
use std::path::PathBuf;
#[test]
fn test_cohort_id_parameter() {
let custom_cohort_id = "testcohort123";
let args = Args {
examples: vec![],
model: "claude-3-7-sonnet-latest".to_string(),
languages: None,
repetitions: 1,
judge_repetitions: 3,
concurrency: 10,
cohort_id: Some(custom_cohort_id.to_string()),
};
let runs_dir = PathBuf::from(std::env!("CARGO_MANIFEST_DIR"))
.parent()
.unwrap()
.parent()
.unwrap()
.join("crates/eval/runs");
let expected_run_dir = runs_dir.join(custom_cohort_id);
let actual_run_dir =
runs_dir.join(args.cohort_id.clone().unwrap_or_else(|| {
format!("{}", chrono::Local::now().format("%Y-%m-%d_%H-%M-%S"))
}));
assert_eq!(expected_run_dir, actual_run_dir);
let args_without_cohort_id = Args {
examples: vec![],
model: "claude-3-7-sonnet-latest".to_string(),
languages: None,
repetitions: 1,
judge_repetitions: 3,
concurrency: 10,
cohort_id: None,
};
let default_run_dir =
runs_dir.join(args_without_cohort_id.cohort_id.clone().unwrap_or_else(|| {
format!("{}", chrono::Local::now().format("%Y-%m-%d_%H-%M-%S"))
}));
assert_ne!(runs_dir, default_run_dir);
assert!(default_run_dir.to_string_lossy().len() > runs_dir.to_string_lossy().len());
}
}

View File

@@ -32,10 +32,6 @@ use util::command::new_smol_command;
use util::markdown::MarkdownString;
use util::serde::default_true;
pub const EXAMPLES_DIR: &str = "./crates/eval/examples";
pub const REPOS_DIR: &str = "./crates/eval/repos";
pub const WORKTREES_DIR: &str = "./crates/eval/worktrees";
const THREAD_EVENT_TIMEOUT: Duration = Duration::from_secs(60 * 2);
#[derive(Clone, Debug, Deserialize)]
@@ -76,6 +72,8 @@ pub struct Example {
pub run_directory_path: PathBuf,
/// Prefix used for logging that identifies this example
pub log_prefix: String,
pub worktree_path: PathBuf,
pub repo_path: PathBuf,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
@@ -121,7 +119,12 @@ pub struct JudgeOutput {
impl Example {
/// Load an example from a directory containing base.toml, prompt.md, and criteria.md
pub fn load_from_directory(dir_path: &Path, run_dir: &Path) -> Result<Self> {
pub fn load_from_directory(
dir_path: &Path,
run_dir: &Path,
worktrees_dir: &Path,
repos_dir: &Path,
) -> Result<Self> {
let name = Self::name_from_path(dir_path);
let base_path = dir_path.join("base.toml");
let prompt_path = dir_path.join("prompt.md");
@@ -133,13 +136,25 @@ impl Example {
None
};
let base: ExampleBase = toml::from_str(&fs::read_to_string(&base_path)?)?;
let repo_path = repo_path_for_url(repos_dir, &base.url);
let worktree_path = worktrees_dir
.canonicalize()
.unwrap()
.join(&name)
.join(&base.repo_name());
Ok(Example {
name: name.clone(),
base: toml::from_str(&fs::read_to_string(&base_path)?)?,
base,
prompt: fs::read_to_string(prompt_path.clone())?,
thread_criteria,
diff_criteria: fs::read_to_string(diff_criteria_path.clone())?,
run_directory_path: run_dir.to_path_buf(),
worktree_path,
repo_path,
log_prefix: name,
})
}
@@ -167,21 +182,10 @@ impl Example {
path.file_name().unwrap().to_string_lossy().to_string()
}
pub fn worktree_path(&self) -> PathBuf {
Path::new(WORKTREES_DIR)
.canonicalize()
.context(format!("No such directory {WORKTREES_DIR}"))
.unwrap()
.join(&self.name)
.join(self.base.repo_name())
}
/// Set up the example by checking out the specified Git revision
pub async fn setup(&mut self) -> Result<()> {
let repo_path = repo_path_for_url(&self.base.url);
let revision_exists = run_git(
&repo_path,
&self.repo_path,
&["rev-parse", &format!("{}^{{commit}}", self.base.revision)],
)
.await
@@ -193,29 +197,27 @@ impl Example {
self.log_prefix, &self.base.revision
);
run_git(
&repo_path,
&self.repo_path,
&["fetch", "--depth", "1", "origin", &self.base.revision],
)
.await?;
}
let worktree_path = self.worktree_path();
if worktree_path.is_dir() {
if self.worktree_path.is_dir() {
println!("{}Resetting existing worktree", self.log_prefix);
// TODO: consider including "-x" to remove ignored files. The downside of this is that
// it will also remove build artifacts, and so prevent incremental reuse there.
run_git(&worktree_path, &["clean", "--force", "-d"]).await?;
run_git(&worktree_path, &["reset", "--hard", "HEAD"]).await?;
run_git(&worktree_path, &["checkout", &self.base.revision]).await?;
run_git(&self.worktree_path, &["clean", "--force", "-d"]).await?;
run_git(&self.worktree_path, &["reset", "--hard", "HEAD"]).await?;
run_git(&self.worktree_path, &["checkout", &self.base.revision]).await?;
} else {
println!("{}Creating worktree", self.log_prefix);
let worktree_path_string = worktree_path.to_string_lossy().to_string();
let worktree_path_string = self.worktree_path.to_string_lossy().to_string();
run_git(
&repo_path,
&self.repo_path,
&[
"worktree",
"add",
@@ -249,9 +251,8 @@ impl Example {
cx,
);
let worktree_path = self.worktree_path();
let worktree = project.update(cx, |project, cx| {
project.create_worktree(&worktree_path, true, cx)
project.create_worktree(&self.worktree_path, true, cx)
});
let tools = cx.new(|_| ToolWorkingSet::default());
@@ -637,9 +638,8 @@ impl Example {
}
async fn repository_diff(&self) -> Result<String> {
let worktree_path = self.worktree_path();
run_git(&worktree_path, &["add", "."]).await?;
run_git(&worktree_path, &["diff", "--staged"]).await
run_git(&self.worktree_path, &["add", "."]).await?;
run_git(&self.worktree_path, &["diff", "--staged"]).await
}
}
@@ -800,13 +800,13 @@ fn get_tag(name: &'static str, response: &str) -> Result<String> {
anyhow::Ok(content)
}
pub fn repo_path_for_url(repo_url: &str) -> PathBuf {
pub fn repo_path_for_url(repos_dir: &Path, repo_url: &str) -> PathBuf {
let repo_name = repo_url
.trim_start_matches("https://")
.replace(|c: char| !c.is_alphanumeric(), "-");
Path::new(REPOS_DIR)
Path::new(repos_dir)
.canonicalize()
.context(format!("No such directory {REPOS_DIR}"))
.context(format!("No such directory {}", repos_dir.display()))
.unwrap()
.join(repo_name)
}

View File

@@ -606,7 +606,7 @@ fn main() {
setting = "keymap",
value = BaseKeymap::get_global(cx).to_string()
);
telemetry.flush_events();
telemetry.flush_events().detach();
let fs = app_state.fs.clone();
load_user_themes_in_background(fs.clone(), cx);

View File

@@ -982,7 +982,7 @@ and then another
output_excerpt = completion.output_excerpt,
feedback
);
self.client.telemetry().flush_events();
self.client.telemetry().flush_events().detach();
cx.notify();
}