terminal: Sanitize URLs with characters that cannot be last (#43559)

Closes #43345

The list of characters comes from the linkify crate, which is already
used for URL detection in the editor:


linkify `src/url.rs`, line 228 (at commit 5239e12e26)

Release Notes:

- Improved URL link detection in terminals.

---------

Signed-off-by: Marco Mihai Condrache <52580954+marcocondrache@users.noreply.github.com>
This commit is contained in:
Marco Mihai Condrache
2025-12-16 03:03:16 +01:00
committed by GitHub
parent dfdad947e1
commit b17b097204

View File

@@ -160,8 +160,8 @@ fn sanitize_url_punctuation<T: EventListener>(
let mut sanitized_url = url;
let mut chars_trimmed = 0;
// First, handle parentheses balancing using single traversal
let (open_parens, close_parens) =
// Count parentheses in the URL
let (open_parens, mut close_parens) =
sanitized_url
.chars()
.fold((0, 0), |(opens, closes), c| match c {
@@ -170,33 +170,27 @@ fn sanitize_url_punctuation<T: EventListener>(
_ => (opens, closes),
});
// Trim unbalanced closing parentheses
if close_parens > open_parens {
let mut remaining_close = close_parens;
while sanitized_url.ends_with(')') && remaining_close > open_parens {
sanitized_url.pop();
chars_trimmed += 1;
remaining_close -= 1;
}
}
// Handle trailing periods
if sanitized_url.ends_with('.') {
let trailing_periods = sanitized_url
.chars()
.rev()
.take_while(|&c| c == '.')
.count();
if trailing_periods > 1 {
sanitized_url.truncate(sanitized_url.len() - trailing_periods);
chars_trimmed += trailing_periods;
} else if trailing_periods == 1
&& let Some(second_last_char) = sanitized_url.chars().rev().nth(1)
&& (second_last_char.is_alphanumeric() || second_last_char == '/')
{
// Remove trailing characters that shouldn't be at the end of URLs
while let Some(last_char) = sanitized_url.chars().last() {
let should_remove = match last_char {
// These may be part of a URL but not at the end. It's not that the spec
// doesn't allow them, but they are frequently used in plain text as delimiters
// where they're not meant to be part of the URL.
'.' | ',' | ':' | ';' => true,
'(' => true,
')' if close_parens > open_parens => {
close_parens -= 1;
true
}
_ => false,
};
if should_remove {
sanitized_url.pop();
chars_trimmed += 1;
} else {
break;
}
}
@@ -413,6 +407,8 @@ mod tests {
("https://www.google.com/)", "https://www.google.com/"),
("https://example.com/path)", "https://example.com/path"),
("https://test.com/))", "https://test.com/"),
("https://test.com/(((", "https://test.com/"),
("https://test.com/(test)(", "https://test.com/(test)"),
// Cases that should NOT be sanitized (balanced parentheses)
(
"https://en.wikipedia.org/wiki/Example_(disambiguation)",
@@ -443,10 +439,10 @@ mod tests {
}
#[test]
fn test_url_periods_sanitization() {
// Test URLs with trailing periods (sentence punctuation)
fn test_url_punctuation_sanitization() {
// Test URLs with trailing punctuation (sentence/text punctuation)
// The sanitize_url_punctuation function removes ., ,, :, ;, from the end
let test_cases = vec![
// Cases that should be sanitized (trailing periods likely punctuation)
("https://example.com.", "https://example.com"),
(
"https://github.com/zed-industries/zed.",
@@ -466,13 +462,36 @@ mod tests {
"https://en.wikipedia.org/wiki/C.E.O.",
"https://en.wikipedia.org/wiki/C.E.O",
),
// Cases that should NOT be sanitized (periods are part of URL structure)
("https://example.com,", "https://example.com"),
("https://example.com/path,", "https://example.com/path"),
("https://example.com,,", "https://example.com"),
("https://example.com:", "https://example.com"),
("https://example.com/path:", "https://example.com/path"),
("https://example.com::", "https://example.com"),
("https://example.com;", "https://example.com"),
("https://example.com/path;", "https://example.com/path"),
("https://example.com;;", "https://example.com"),
("https://example.com.,", "https://example.com"),
("https://example.com.:;", "https://example.com"),
("https://example.com!.", "https://example.com!"),
("https://example.com/).", "https://example.com/"),
("https://example.com/);", "https://example.com/"),
("https://example.com/;)", "https://example.com/"),
(
"https://example.com/v1.0/api",
"https://example.com/v1.0/api",
),
("https://192.168.1.1", "https://192.168.1.1"),
("https://sub.domain.com", "https://sub.domain.com"),
(
"https://example.com?query=value",
"https://example.com?query=value",
),
("https://example.com?a=1&b=2", "https://example.com?a=1&b=2"),
(
"https://example.com/path:8080",
"https://example.com/path:8080",
),
];
for (input, expected) in test_cases {
@@ -484,7 +503,6 @@ mod tests {
let end_point = AlacPoint::new(Line(0), Column(input.len()));
let dummy_match = Match::new(start_point, end_point);
// This test should initially fail since we haven't implemented period sanitization yet
let (result, _) = sanitize_url_punctuation(input.to_string(), dummy_match, &term);
assert_eq!(result, expected, "Failed for input: {}", input);
}