wip address long lines in regex search

wip
Move find_matches down
2025-03-19 10:44:44 -04:00 · 2025-03-18 12:15:52 -04:00 · 2025-03-18 11:58:33 -04:00 · 2025-03-18 11:56:21 -04:00 · 2025-03-18 11:51:23 -04:00
1 changed files with 295 additions and 35 deletions
--- a/crates/assistant_tools/src/regex_search.rs
+++ b/crates/assistant_tools/src/regex_search.rs
@@ -2,7 +2,7 @@ use anyhow::{anyhow, Result};
 use assistant_tool::{ActionLog, Tool};
 use futures::StreamExt;
 use gpui::{App, Entity, Task};
-use language::OffsetRangeExt;
+use language::{OffsetRangeExt, Point};
 use language_model::LanguageModelRequestMessage;
 use project::{
    search::{SearchQuery, SearchResult},
@@ -10,7 +10,7 @@ use project::{
 };
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
-use std::{cmp, fmt::Write, sync::Arc};
+use std::{cmp, fmt::Write, ops::Range, sync::Arc};
 use util::paths::PathMatcher;

 #[derive(Debug, Serialize, Deserialize, JsonSchema)]
@@ -26,6 +26,8 @@ pub struct RegexSearchToolInput {
 }

 const RESULTS_PER_PAGE: usize = 20;
+const MAX_LINE_LENGTH: u32 = 240;
+const LONG_LINE_CONTEXT: usize = 120;

 pub struct RegexSearchTool;

@@ -58,6 +60,10 @@ impl Tool for RegexSearchTool {
            Err(err) => return Task::ready(Err(anyhow!(err))),
        };

+        if regex.is_empty() {
+            return Task::ready(Err(anyhow!("Empty regex pattern is not allowed")));
+        };
+
        let query = match SearchQuery::regex(
            &regex,
            false,
@@ -73,6 +79,11 @@ impl Tool for RegexSearchTool {

        let results = project.update(cx, |project, cx| project.search(query, cx));

+        enum MatchRange {
+            Lines(Range<Point>),
+            LongLine(Range<Point>),
+        }
+
        cx.spawn(|cx| async move {
            futures::pin_mut!(results);

@@ -91,58 +102,80 @@ impl Tool for RegexSearchTool {
                        let mut file_header_written = false;
                        let mut ranges = ranges
                            .into_iter()
-                            .map(|range| {
+                            .map(|range| -> MatchRange {
                                let mut point_range = range.to_point(buffer);
-                                point_range.start.row =
-                                    point_range.start.row.saturating_sub(CONTEXT_LINES);
-                                point_range.start.column = 0;
-                                point_range.end.row = cmp::min(
-                                    buffer.max_point().row,
-                                    point_range.end.row + CONTEXT_LINES,
-                                );
-                                point_range.end.column = buffer.line_len(point_range.end.row);
-                                point_range
+                                let is_long_line = buffer.line_len(point_range.start.row) > MAX_LINE_LENGTH;
+
+                                if is_long_line {
+                                    let line_range = Point::new(point_range.start.row, 0)..Point::new(point_range.start.row, buffer.line_len(point_range.start.row));
+                                    let line_text = buffer.text_for_range(line_range).collect::<String>();
+
+                                    for (match_start, match_end) in find_matches(line_text.clone(), &regex) {
+                                        let start_char = match_start.saturating_sub(LONG_LINE_CONTEXT);
+                                        let end_char = (match_end + LONG_LINE_CONTEXT).min(buffer.line_len(point_range.start.row) as usize);
+                                        writeln!(output, "\n### Line {}, chars {}-{}\n```", point_range.start.row + 1, start_char, end_char)?;
+                                        output.push_str(&line_text[start_char..end_char]);
+                                        output.push_str("\n```\n");
+                                    }
+                                    matches_found += 1;
+
+                                    if matches_found >= RESULTS_PER_PAGE {
+                                        has_more_matches = true;
+                                    }
+
+                                    Ok((Point::new(point_range.start.row + 1, 0)..Point::new(point_range.start.row + 1, 0), true))
+                                } else {
+                                    point_range.start.row = point_range.start.row.saturating_sub(CONTEXT_LINES);
+                                    point_range.start.column = 0;
+                                    point_range.end.row = cmp::min(
+                                        buffer.max_point().row,
+                                        point_range.end.row + CONTEXT_LINES,
+                                    );
+                                    point_range.end.column = buffer.line_len(point_range.end.row);
+                                    Ok((point_range, is_long_line))
+                                }
                            })
                            .peekable();

-                        while let Some(mut range) = ranges.next() {
+                        while let Some(range_result) = ranges.next() {
+                            let (range, is_long) = range_result?;
+                            // Skip long lines as they were already handled
+                            if is_long {
+                                continue;
+                            }
+
                            if skips_remaining > 0 {
                                skips_remaining -= 1;
                                continue;
                            }

-                            // We'd already found a full page of matches, and we just found one more.
+                            // We've found a full page of matches
                            if matches_found >= RESULTS_PER_PAGE {
                                has_more_matches = true;
-                                return Ok(());
-                            }
-
-                            while let Some(next_range) = ranges.peek() {
-                                if range.end.row >= next_range.start.row {
-                                    range.end = next_range.end;
-                                    ranges.next();
-                                } else {
-                                    break;
-                                }
+                                break;
                            }

+                            // Write file header if needed
                            if !file_header_written {
-                                writeln!(output, "\n## Matches in {}", path.display())?;
+                                let _ = writeln!(output, "\n## Matches in {}", path.display());
                                file_header_written = true;
                            }

-                            let start_line = range.start.row + 1;
-                            let end_line = range.end.row + 1;
-                            writeln!(output, "\n### Lines {start_line}-{end_line}\n```")?;
-                            output.extend(buffer.text_for_range(range));
-                            output.push_str("\n```\n");
+                            // Show the match with context lines
+                            let context_start = range.start.row;
+                            let context_end = range.end.row;
+                            let context_range = Point::new(context_start, 0)..Point::new(context_end, buffer.line_len(context_end));
+                            let context_text = buffer.text_for_range(context_range).collect::<String>();

-                            matches_found += 1;
+                            if context_text.contains(&regex) {
+                                let _ = writeln!(output, "\n### Lines {}-{}\n```", context_start + 1, context_end + 1);
+                                output.push_str(&context_text);
+                                output.push_str("\n```\n");
+                                matches_found += 1;
+                            }
                        }
                    }
-
-                    Ok(())
-                })??;
+                })?;
            }

            if matches_found == 0 {
@@ -154,9 +187,236 @@ impl Tool for RegexSearchTool {
                    offset + matches_found,
                    offset + RESULTS_PER_PAGE,
                ))
-          } else {
+            } else {
                Ok(format!("Found {matches_found} matches:\n{output}"))
            }
        })
    }
 }
+
+fn find_matches(text: String, pattern: &str) -> Vec<(usize, usize)> {
+    let mut matches = Vec::new();
+    if pattern.is_empty() {
+        return matches;
+    }
+
+    let mut start = 0;
+    while start < text.len() {
+        match text[start..].find(pattern) {
+            Some(pos) => {
+                let match_start = start + pos;
+                let match_end = match_start + pattern.len();
+                if match_end <= text.len() {
+                    matches.push((match_start, match_end));
+                }
+                start = match_start + 1;
+            }
+            None => break,
+        }
+    }
+    matches
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_find_matches() {
+        // Test basic match finding
+        let text = "hello world hello".to_string();
+        let matches = find_matches(text, "hello");
+        assert_eq!(matches.len(), 2);
+        assert_eq!(matches[0], (0, 5));
+        assert_eq!(matches[1], (12, 17));
+
+        // Test overlapping matches
+        let overlaps = find_matches("abababa".to_string(), "aba");
+        assert_eq!(overlaps.len(), 3);
+        assert_eq!(overlaps[0], (0, 3));
+        assert_eq!(overlaps[1], (2, 5));
+        assert_eq!(overlaps[2], (4, 7));
+
+        // Test edge cases
+        assert_eq!(find_matches("".to_string(), "pattern"), vec![]);
+        assert_eq!(find_matches("text".to_string(), ""), vec![]);
+    }
+
+    #[test]
+    fn test_long_line_context_calculation() {
+        // Create a line that exceeds MAX_LINE_LENGTH
+        let prefix = "x".repeat(100);
+        let target = "TARGET";
+        let suffix = "y".repeat(300);
+        let long_line = format!("{}{}{}", prefix, target, suffix);
+
+        // Find the target in the long line
+        let matches = find_matches(long_line.clone(), target);
+        assert_eq!(matches.len(), 1);
+
+        let (match_start, match_end) = matches[0];
+
+        // Verify context calculation
+        let start_char = match_start.saturating_sub(LONG_LINE_CONTEXT);
+        let end_char = (match_end + LONG_LINE_CONTEXT).min(long_line.len());
+
+        // Context should start no more than LONG_LINE_CONTEXT chars before match
+        assert!(match_start - start_char <= LONG_LINE_CONTEXT);
+
+        // Context should end no more than LONG_LINE_CONTEXT chars after match
+        assert!(end_char - match_end <= LONG_LINE_CONTEXT);
+
+        // Context should contain the target
+        let context = &long_line[start_char..end_char];
+        assert!(context.contains(target));
+    }
+
+    #[test]
+    fn test_line_length_classification() {
+        // Test if lines are correctly classified as long or regular
+        let max_len = MAX_LINE_LENGTH as usize;
+
+        // Line shorter than MAX_LINE_LENGTH
+        let regular_line = "x".repeat(max_len - 1);
+        assert!(regular_line.len() < max_len);
+
+        // Line exactly at MAX_LINE_LENGTH
+        let boundary_line = "x".repeat(max_len);
+        assert_eq!(boundary_line.len(), max_len);
+
+        // Line longer than MAX_LINE_LENGTH
+        let long_line = "x".repeat(max_len + 1);
+        assert!(long_line.len() > max_len);
+    }
+
+    #[test]
+    fn test_heading_format() {
+        // For long lines: "### Line X, chars Y-Z"
+        // For regular lines: "### Lines X-Y"
+
+        let long_line_row = 42_usize;
+        let start_char = 100_usize;
+        let end_char = 340_usize;
+
+        // In the implementation, the heading format for long lines is "### Line"
+        let long_line_heading = format!(
+            "### Line {}, chars {}-{}",
+            long_line_row + 1,
+            start_char,
+            end_char
+        );
+        assert_eq!(long_line_heading, "### Line 43, chars 100-340");
+
+        let context_start = 40_usize;
+        let context_end = 44_usize;
+        let regular_heading = format!("### Lines {}-{}", context_start + 1, context_end + 1);
+        assert_eq!(regular_heading, "### Lines 41-45");
+    }
+
+    #[test]
+    fn test_pagination() {
+        // Initialize variables to simulate pagination
+        let mut skips_remaining = 2;
+        let mut matches_found = 0;
+
+        // Simulate processing 5 matches with offset 2
+        for _ in 0..5 {
+            if skips_remaining > 0 {
+                skips_remaining -= 1;
+                continue;
+            }
+            matches_found += 1;
+        }
+
+        // Should have processed 3 matches after skipping 2
+        assert_eq!(matches_found, 3);
+        assert_eq!(skips_remaining, 0);
+
+        // Test page limits
+        let mut matches_found = 0;
+        let mut has_more_matches = false;
+
+        // Simulate processing more matches than fit in a page
+        for _ in 0..(RESULTS_PER_PAGE + 5) {
+            // We'd already found a full page of matches, and we just found one more.
+            if matches_found >= RESULTS_PER_PAGE {
+                has_more_matches = true;
+                break;
+            }
+            matches_found += 1;
+        }
+
+        // Should have stopped at RESULTS_PER_PAGE
+        assert_eq!(matches_found, RESULTS_PER_PAGE);
+        assert!(has_more_matches);
+    }
+
+    #[test]
+    fn test_handling_of_very_long_lines() {
+        // Test very long lines with matches at different positions
+
+        // Create a very long line (3 * MAX_LINE_LENGTH)
+        let max_len = MAX_LINE_LENGTH as usize;
+        let long_prefix = "prefix_".repeat(max_len / 7);
+        let middle = "middle_".repeat(max_len / 7);
+        let long_suffix = "suffix_".repeat(max_len / 7);
+        let very_long_line = format!("{}{}{}", long_prefix, middle, long_suffix);
+
+        assert!(very_long_line.len() > max_len);
+
+        // Find matches for "middle" in the very long line
+        let matches = find_matches(very_long_line.clone(), "middle_");
+        assert!(!matches.is_empty());
+
+        // First match should be after the prefix
+        let (first_match_start, _) = matches[0];
+        assert!(first_match_start >= long_prefix.len());
+
+        // With 120 chars of context, we should not see the start of the string
+        let context_start = first_match_start.saturating_sub(LONG_LINE_CONTEXT);
+        assert!(context_start > 0); // Context should not include the very beginning
+
+        // But context should include the match
+        let context_end = first_match_start + 7 + LONG_LINE_CONTEXT; // "middle_" is 7 chars
+        let context = &very_long_line[context_start..context_end.min(very_long_line.len())];
+        assert!(context.contains("middle_"));
+    }
+
+    #[test]
+    fn test_output_format() {
+        // Test format consistency
+
+        // For a long line match (> 240 chars), we show:
+        // - Only the matched line (no context lines)
+        // - Format: "### Line X, chars Y-Z"
+        // - Context: 120 chars before and after the match
+
+        // For regular lines (< 240 chars), we show:
+        // - The matched line plus context (2 lines before, 2 lines after)
+        // - Format: "### Lines X-Y"
+
+        // Test that multiple matches in a long line are shown separately
+        let row = 42_usize;
+        let match1_start = 100_usize;
+        let match1_end = 110_usize;
+        let match2_start = 300_usize;
+        let match2_end = 310_usize;
+
+        let heading1 = format!(
+            "### Line {}, chars {}-{}",
+            row + 1,
+            match1_start.saturating_sub(LONG_LINE_CONTEXT),
+            (match1_end + LONG_LINE_CONTEXT)
+        );
+
+        let heading2 = format!(
+            "### Line {}, chars {}-{}",
+            row + 1,
+            match2_start.saturating_sub(LONG_LINE_CONTEXT),
+            (match2_end + LONG_LINE_CONTEXT)
+        );
+
+        // Each match should have its own heading
+        assert_ne!(heading1, heading2);
+    }
+}
Author	SHA1	Message	Date
Richard Feldman	4d74b23ce8	wip address long lines in regex search	2025-03-19 10:44:44 -04:00
Richard Feldman	c248b2f26f	wip	2025-03-18 12:15:52 -04:00
Richard Feldman	3b2258b9e3	Move find_matches down	2025-03-18 11:58:33 -04:00
Richard Feldman	d771c275f5	Get rid of unnecessary matches_regex function	2025-03-18 11:56:21 -04:00
Richard Feldman	618d0cbf59	Add long line handling	2025-03-18 11:51:23 -04:00