Correct UTF-16 saving and add heuristic encoding detection (#45243)

This commit fixes an issue where saving UTF-16 files resulted in UTF-8
bytes due to `encoding_rs` default behavior. It also introduces a
heuristic to detect BOM-less UTF-16 and binary files.

Changes:
- Manually implement UTF-16LE/BE encoding during file save to avoid
implicit UTF-8 conversion.
- Add `analyze_byte_content` to guess UTF-16LE/BE or Binary based on
null byte distribution.
- Prevent loading binary files as text by returning an error when binary
content is detected.

Special thanks to @CrazyboyQCD for pointing out the `encoding_rs`
behavior and providing the fix, and to @ConradIrwin for the suggestion
on the detection heuristic.

Closes #14654

Release Notes:

- (nightly only) Fixed an issue where saving files with UTF-16 encoding
incorrectly wrote them as UTF-8. Also improved detection for binary
files and BOM-less UTF-16.
This commit is contained in:
Ichimura Tomoo
2025-12-20 03:18:20 +09:00
committed by GitHub
parent 22916311cd
commit 1bc3fa8154
3 changed files with 334 additions and 136 deletions

View File

@@ -1490,19 +1490,23 @@ impl Buffer {
let (tx, rx) = futures::channel::oneshot::channel();
let prev_version = self.text.version();
self.reload_task = Some(cx.spawn(async move |this, cx| {
let Some((new_mtime, new_text)) = this.update(cx, |this, cx| {
let Some((new_mtime, load_bytes_task, encoding)) = this.update(cx, |this, cx| {
let file = this.file.as_ref()?.as_local()?;
Some((file.disk_state().mtime(), file.load(cx)))
Some((
file.disk_state().mtime(),
file.load_bytes(cx),
this.encoding,
))
})?
else {
return Ok(());
};
let new_text = new_text.await?;
let diff = this
.update(cx, |this, cx| this.diff(new_text.clone(), cx))?
.await;
let bytes = load_bytes_task.await?;
let (cow, _encoding_used, _has_errors) = encoding.decode(&bytes);
let new_text = cow.into_owned();
let diff = this.update(cx, |this, cx| this.diff(new_text, cx))?.await;
this.update(cx, |this, cx| {
if this.version() == diff.base_version {
this.finalize_last_transaction();

View File

@@ -1361,7 +1361,7 @@ impl LocalWorktree {
}
let content = fs.load_bytes(&abs_path).await?;
let (text, encoding, has_bom) = decode_byte(content);
let (text, encoding, has_bom) = decode_byte(content)?;
let worktree = this.upgrade().context("worktree was dropped")?;
let file = match entry.await? {
@@ -1489,25 +1489,12 @@ impl LocalWorktree {
let fs = fs.clone();
let abs_path = abs_path.clone();
async move {
let bom_bytes = if has_bom {
if encoding == encoding_rs::UTF_16LE {
vec![0xFF, 0xFE]
} else if encoding == encoding_rs::UTF_16BE {
vec![0xFE, 0xFF]
} else if encoding == encoding_rs::UTF_8 {
vec![0xEF, 0xBB, 0xBF]
} else {
vec![]
}
} else {
vec![]
};
// For UTF-8, use the optimized `fs.save` which writes Rope chunks directly to disk
// without allocating a contiguous string.
if encoding == encoding_rs::UTF_8 && !has_bom {
return fs.save(&abs_path, &text, line_ending).await;
}
// For legacy encodings (e.g. Shift-JIS), we fall back to converting the entire Rope
// to a String/Bytes in memory before writing.
//
@@ -1520,13 +1507,45 @@ impl LocalWorktree {
LineEnding::Windows => text_string.replace('\n', "\r\n"),
};
let (cow, _, _) = encoding.encode(&normalized_text);
let bytes = if !bom_bytes.is_empty() {
let mut bytes = bom_bytes;
bytes.extend_from_slice(&cow);
bytes.into()
// Create the byte vector manually for UTF-16 encodings because encoding_rs encodes to UTF-8 by default (per WHATWG standards),
// which is not what we want for saving files.
let bytes = if encoding == encoding_rs::UTF_16BE {
let mut data = Vec::with_capacity(normalized_text.len() * 2 + 2);
if has_bom {
data.extend_from_slice(&[0xFE, 0xFF]); // BOM
}
let utf16be_bytes =
normalized_text.encode_utf16().flat_map(|u| u.to_be_bytes());
data.extend(utf16be_bytes);
data.into()
} else if encoding == encoding_rs::UTF_16LE {
let mut data = Vec::with_capacity(normalized_text.len() * 2 + 2);
if has_bom {
data.extend_from_slice(&[0xFF, 0xFE]); // BOM
}
let utf16le_bytes =
normalized_text.encode_utf16().flat_map(|u| u.to_le_bytes());
data.extend(utf16le_bytes);
data.into()
} else {
cow
// For other encodings (Shift-JIS, UTF-8 with BOM, etc.), delegate to encoding_rs.
let bom_bytes = if has_bom {
if encoding == encoding_rs::UTF_8 {
vec![0xEF, 0xBB, 0xBF]
} else {
vec![]
}
} else {
vec![]
};
let (cow, _, _) = encoding.encode(&normalized_text);
if !bom_bytes.is_empty() {
let mut bytes = bom_bytes;
bytes.extend_from_slice(&cow);
bytes.into()
} else {
cow
}
};
fs.write(&abs_path, &bytes).await
@@ -5842,11 +5861,28 @@ impl fs::Watcher for NullWatcher {
}
}
fn decode_byte(bytes: Vec<u8>) -> (String, &'static Encoding, bool) {
fn decode_byte(bytes: Vec<u8>) -> anyhow::Result<(String, &'static Encoding, bool)> {
// check BOM
if let Some((encoding, _bom_len)) = Encoding::for_bom(&bytes) {
let (cow, _) = encoding.decode_with_bom_removal(&bytes);
return (cow.into_owned(), encoding, true);
return Ok((cow.into_owned(), encoding, true));
}
match analyze_byte_content(&bytes) {
ByteContent::Utf16Le => {
let encoding = encoding_rs::UTF_16LE;
let (cow, _, _) = encoding.decode(&bytes);
return Ok((cow.into_owned(), encoding, false));
}
ByteContent::Utf16Be => {
let encoding = encoding_rs::UTF_16BE;
let (cow, _, _) = encoding.decode(&bytes);
return Ok((cow.into_owned(), encoding, false));
}
ByteContent::Binary => {
anyhow::bail!("Binary files are not supported");
}
ByteContent::Unknown => {}
}
fn detect_encoding(bytes: Vec<u8>) -> (String, &'static Encoding) {
@@ -5867,14 +5903,66 @@ fn decode_byte(bytes: Vec<u8>) -> (String, &'static Encoding, bool) {
// displaying raw escape sequences instead of the correct characters.
if text.contains('\x1b') {
let (s, enc) = detect_encoding(text.into_bytes());
(s, enc, false)
Ok((s, enc, false))
} else {
(text, encoding_rs::UTF_8, false)
Ok((text, encoding_rs::UTF_8, false))
}
}
Err(e) => {
let (s, enc) = detect_encoding(e.into_bytes());
(s, enc, false)
Ok((s, enc, false))
}
}
}
/// Classification of raw file bytes produced by [`analyze_byte_content`].
#[derive(PartialEq)]
enum ByteContent {
    Utf16Le,
    Utf16Be,
    Binary,
    Unknown,
}

/// Heuristic classification of raw bytes based on where null bytes fall.
///
/// ASCII characters encoded as UTF-16 produce a `0x00` byte in the high half
/// of each code unit: at odd offsets for little-endian, at even offsets for
/// big-endian. Many nulls with no dominant parity suggest binary data.
///
/// NOTE: content made up purely of non-ASCII characters (e.g. Japanese text)
/// produces no null bytes at all and therefore falls through to `Unknown`.
fn analyze_byte_content(bytes: &[u8]) -> ByteContent {
    // Too short to say anything meaningful.
    if bytes.len() < 2 {
        return ByteContent::Unknown;
    }

    // Only inspect a bounded prefix so large files stay cheap to classify.
    let sample = &bytes[..bytes.len().min(1024)];

    // Tally null bytes by offset parity in a single pass.
    let (even_nulls, odd_nulls) = sample
        .iter()
        .enumerate()
        .filter(|&(_, &byte)| byte == 0)
        .fold((0usize, 0usize), |(even, odd), (index, _)| {
            if index % 2 == 0 {
                (even + 1, odd)
            } else {
                (even, odd + 1)
            }
        });

    let total_nulls = even_nulls + odd_nulls;
    // No nulls at all: plain text in some byte-oriented encoding.
    if total_nulls == 0 {
        return ByteContent::Unknown;
    }
    // Too few nulls (under 10% of the sample) to draw a conclusion.
    if total_nulls < sample.len() / 10 {
        return ByteContent::Unknown;
    }

    // A strong parity skew (more than 4x) points at one UTF-16 byte order.
    if even_nulls > odd_nulls * 4 {
        ByteContent::Utf16Be
    } else if odd_nulls > even_nulls * 4 {
        ByteContent::Utf16Le
    } else {
        // Plenty of nulls but no clear pattern: almost certainly binary.
        ByteContent::Binary
    }
}

View File

@@ -1,5 +1,5 @@
use crate::{Entry, EntryKind, Event, PathChange, Worktree, WorktreeModelHandle};
use anyhow::{Context as _, Result};
use anyhow::Result;
use encoding_rs;
use fs::{FakeFs, Fs, RealFs, RemoveOptions};
use git::{DOT_GIT, GITIGNORE, REPO_EXCLUDE};
@@ -2568,71 +2568,87 @@ fn init_test(cx: &mut gpui::TestAppContext) {
#[gpui::test]
async fn test_load_file_encoding(cx: &mut TestAppContext) {
init_test(cx);
let test_cases: Vec<(&str, &[u8], &str)> = vec![
("utf8.txt", "こんにちは".as_bytes(), "こんにちは"), // "こんにちは" is Japanese "Hello"
(
"sjis.txt",
&[0x82, 0xb1, 0x82, 0xf1, 0x82, 0xc9, 0x82, 0xbf, 0x82, 0xcd],
"こんにちは",
),
(
"eucjp.txt",
&[0xa4, 0xb3, 0xa4, 0xf3, 0xa4, 0xcb, 0xa4, 0xc1, 0xa4, 0xcf],
"こんにちは",
),
(
"iso2022jp.txt",
&[
struct TestCase {
name: &'static str,
bytes: Vec<u8>,
expected_text: &'static str,
}
// --- Success Cases ---
let success_cases = vec![
TestCase {
name: "utf8.txt",
bytes: "こんにちは".as_bytes().to_vec(),
expected_text: "こんにちは",
},
TestCase {
name: "sjis.txt",
bytes: vec![0x82, 0xb1, 0x82, 0xf1, 0x82, 0xc9, 0x82, 0xbf, 0x82, 0xcd],
expected_text: "こんにちは",
},
TestCase {
name: "eucjp.txt",
bytes: vec![0xa4, 0xb3, 0xa4, 0xf3, 0xa4, 0xcb, 0xa4, 0xc1, 0xa4, 0xcf],
expected_text: "こんにちは",
},
TestCase {
name: "iso2022jp.txt",
bytes: vec![
0x1b, 0x24, 0x42, 0x24, 0x33, 0x24, 0x73, 0x24, 0x4b, 0x24, 0x41, 0x24, 0x4f, 0x1b,
0x28, 0x42,
],
"こんにちは",
),
// Western Europe (Windows-1252)
// "Café" -> 0xE9 is 'é' in Windows-1252 (it is typically 0xC3 0xA9 in UTF-8)
("win1252.txt", &[0x43, 0x61, 0x66, 0xe9], "Café"),
// Chinese Simplified (GBK)
// Note: We use a slightly longer string here because short byte sequences can be ambiguous
// in multi-byte encodings. Providing more context helps the heuristic detector guess correctly.
// Text: "今天天气不错" (Today's weather is not bad / nice)
// Bytes:
// 今: BD F1
// 天: CC EC
// 天: CC EC
// 气: C6 F8
// 不: B2 BB
// 错: B4 ED
(
"gbk.txt",
&[
expected_text: "こんにちは",
},
TestCase {
name: "win1252.txt",
bytes: vec![0x43, 0x61, 0x66, 0xe9],
expected_text: "Café",
},
TestCase {
name: "gbk.txt",
bytes: vec![
0xbd, 0xf1, 0xcc, 0xec, 0xcc, 0xec, 0xc6, 0xf8, 0xb2, 0xbb, 0xb4, 0xed,
],
"今天天气不错",
),
(
"utf16le_bom.txt",
&[
expected_text: "今天天气不错",
},
// UTF-16LE with BOM
TestCase {
name: "utf16le_bom.txt",
bytes: vec![
0xFF, 0xFE, // BOM
0x53, 0x30, // こ
0x93, 0x30, // ん
0x6B, 0x30, // に
0x61, 0x30, // ち
0x6F, 0x30, // は
0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0x30,
],
"こんにちは",
),
(
"utf8_bom.txt",
&[
0xEF, 0xBB, 0xBF, // UTF-8 BOM
0xE3, 0x81, 0x93, // こ
0xE3, 0x82, 0x93, // ん
0xE3, 0x81, 0xAB, // に
0xE3, 0x81, 0xA1, // ち
0xE3, 0x81, 0xAF, // は
expected_text: "こんにちは",
},
// UTF-16BE with BOM
TestCase {
name: "utf16be_bom.txt",
bytes: vec![
0xFE, 0xFF, // BOM
0x30, 0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F,
],
"こんにちは",
),
expected_text: "こんにちは",
},
// UTF-16LE without BOM (ASCII only)
// This relies on the "null byte heuristic" we implemented.
// "ABC" -> 41 00 42 00 43 00
TestCase {
name: "utf16le_ascii_no_bom.txt",
bytes: vec![0x41, 0x00, 0x42, 0x00, 0x43, 0x00],
expected_text: "ABC",
},
];
// --- Failure Cases ---
let failure_cases = vec![
// Binary File (Should be detected by heuristic and return Error)
// Contains random bytes and mixed nulls that don't match UTF-16 patterns
TestCase {
name: "binary.bin",
bytes: vec![0x00, 0xFF, 0x12, 0x00, 0x99, 0x88, 0x77, 0x66, 0x00],
expected_text: "", // Not used
},
];
let root_path = if cfg!(windows) {
@@ -2642,15 +2658,11 @@ async fn test_load_file_encoding(cx: &mut TestAppContext) {
};
let fs = FakeFs::new(cx.background_executor.clone());
fs.create_dir(root_path).await.unwrap();
let mut files_json = serde_json::Map::new();
for (name, _, _) in &test_cases {
files_json.insert(name.to_string(), serde_json::Value::String("".to_string()));
}
for (name, bytes, _) in &test_cases {
let path = root_path.join(name);
fs.write(&path, bytes).await.unwrap();
for case in success_cases.iter().chain(failure_cases.iter()) {
let path = root_path.join(case.name);
fs.write(&path, &case.bytes).await.unwrap();
}
let tree = Worktree::local(
@@ -2667,34 +2679,54 @@ async fn test_load_file_encoding(cx: &mut TestAppContext) {
cx.read(|cx| tree.read(cx).as_local().unwrap().scan_complete())
.await;
for (name, _, expected) in test_cases {
let loaded = tree
.update(cx, |tree, cx| tree.load_file(rel_path(name), cx))
.await
.with_context(|| format!("Failed to load {}", name))
.unwrap();
let rel_path = |name: &str| {
RelPath::new(&Path::new(name), PathStyle::local())
.unwrap()
.into_arc()
};
// Run Success Tests
for case in success_cases {
let loaded = tree
.update(cx, |tree, cx| tree.load_file(&rel_path(case.name), cx))
.await;
if let Err(e) = &loaded {
panic!("Failed to load success case '{}': {:?}", case.name, e);
}
let loaded = loaded.unwrap();
assert_eq!(
loaded.text, expected,
loaded.text, case.expected_text,
"Encoding mismatch for file: {}",
name
case.name
);
}
// Run Failure Tests
for case in failure_cases {
let loaded = tree
.update(cx, |tree, cx| tree.load_file(&rel_path(case.name), cx))
.await;
assert!(
loaded.is_err(),
"Failure case '{}' unexpectedly succeeded! It should have been detected as binary.",
case.name
);
let err_msg = loaded.unwrap_err().to_string();
println!("Got expected error for {}: {}", case.name, err_msg);
}
}
#[gpui::test]
async fn test_write_file_encoding(cx: &mut gpui::TestAppContext) {
init_test(cx);
let fs = FakeFs::new(cx.executor());
let root_path = if cfg!(windows) {
Path::new("C:\\root")
} else {
Path::new("/root")
};
fs.create_dir(root_path).await.unwrap();
let file_path = root_path.join("test.txt");
fs.insert_file(&file_path, "initial".into()).await;
let worktree = Worktree::local(
root_path,
@@ -2707,33 +2739,107 @@ async fn test_write_file_encoding(cx: &mut gpui::TestAppContext) {
.await
.unwrap();
let path: Arc<Path> = Path::new("test.txt").into();
let rel_path = RelPath::new(&path, PathStyle::local()).unwrap().into_arc();
// Define test case structure
struct TestCase {
name: &'static str,
text: &'static str,
encoding: &'static encoding_rs::Encoding,
has_bom: bool,
expected_bytes: Vec<u8>,
}
let text = text::Rope::from("こんにちは");
let task = worktree.update(cx, |wt, cx| {
wt.write_file(
rel_path,
text,
text::LineEnding::Unix,
encoding_rs::SHIFT_JIS,
false,
cx,
)
});
task.await.unwrap();
let bytes = fs.load_bytes(&file_path).await.unwrap();
let expected_bytes = vec![
0x82, 0xb1, // こ
0x82, 0xf1, // ん
0x82, 0xc9, // に
0x82, 0xbf, // ち
0x82, 0xcd, // は
let cases = vec![
// Shift_JIS with Japanese
TestCase {
name: "Shift_JIS with Japanese",
text: "こんにちは",
encoding: encoding_rs::SHIFT_JIS,
has_bom: false,
expected_bytes: vec![0x82, 0xb1, 0x82, 0xf1, 0x82, 0xc9, 0x82, 0xbf, 0x82, 0xcd],
},
// UTF-8 No BOM
TestCase {
name: "UTF-8 No BOM",
text: "AB",
encoding: encoding_rs::UTF_8,
has_bom: false,
expected_bytes: vec![0x41, 0x42],
},
// UTF-8 with BOM
TestCase {
name: "UTF-8 with BOM",
text: "AB",
encoding: encoding_rs::UTF_8,
has_bom: true,
expected_bytes: vec![0xEF, 0xBB, 0xBF, 0x41, 0x42],
},
// UTF-16LE No BOM with Japanese
// NOTE: This passes thanks to the manual encoding fix implemented in `write_file`.
TestCase {
name: "UTF-16LE No BOM with Japanese",
text: "こんにちは",
encoding: encoding_rs::UTF_16LE,
has_bom: false,
expected_bytes: vec![0x53, 0x30, 0x93, 0x30, 0x6b, 0x30, 0x61, 0x30, 0x6f, 0x30],
},
// UTF-16LE with BOM
TestCase {
name: "UTF-16LE with BOM",
text: "A",
encoding: encoding_rs::UTF_16LE,
has_bom: true,
expected_bytes: vec![0xFF, 0xFE, 0x41, 0x00],
},
// UTF-16BE No BOM with Japanese
// NOTE: This passes thanks to the manual encoding fix.
TestCase {
name: "UTF-16BE No BOM with Japanese",
text: "こんにちは",
encoding: encoding_rs::UTF_16BE,
has_bom: false,
expected_bytes: vec![0x30, 0x53, 0x30, 0x93, 0x30, 0x6b, 0x30, 0x61, 0x30, 0x6f],
},
// UTF-16BE with BOM
TestCase {
name: "UTF-16BE with BOM",
text: "A",
encoding: encoding_rs::UTF_16BE,
has_bom: true,
expected_bytes: vec![0xFE, 0xFF, 0x00, 0x41],
},
];
assert_eq!(bytes, expected_bytes, "Should be saved as Shift-JIS");
for (i, case) in cases.into_iter().enumerate() {
let file_name = format!("test_{}.txt", i);
let path: Arc<Path> = Path::new(&file_name).into();
let file_path = root_path.join(&file_name);
fs.insert_file(&file_path, "".into()).await;
let rel_path = RelPath::new(&path, PathStyle::local()).unwrap().into_arc();
let text = text::Rope::from(case.text);
let task = worktree.update(cx, |wt, cx| {
wt.write_file(
rel_path,
text,
text::LineEnding::Unix,
case.encoding,
case.has_bom,
cx,
)
});
if let Err(e) = task.await {
panic!("Unexpected error in case '{}': {:?}", case.name, e);
}
let bytes = fs.load_bytes(&file_path).await.unwrap();
assert_eq!(
bytes, case.expected_bytes,
"case '{}' mismatch. Expected {:?}, but got {:?}",
case.name, case.expected_bytes, bytes
);
}
}