Correct UTF-16 saving and add heuristic encoding detection (#45243)
This commit fixes an issue where saving UTF-16 files resulted in UTF-8 bytes due to `encoding_rs` default behavior. It also introduces a heuristic to detect BOM-less UTF-16 and binary files. Changes: - Manually implement UTF-16LE/BE encoding during file save to avoid implicit UTF-8 conversion. - Add `analyze_byte_content` to guess UTF-16LE/BE or Binary based on null byte distribution. - Prevent loading binary files as text by returning an error when binary content is detected. Special thanks to @CrazyboyQCD for pointing out the `encoding_rs` behavior and providing the fix, and to @ConradIrwin for the suggestion on the detection heuristic. Closes #14654 Release Notes: - (nightly only) Fixed an issue where saving files with UTF-16 encoding incorrectly wrote them as UTF-8. Also improved detection for binary files and BOM-less UTF-16.
This commit is contained in:
@@ -1490,19 +1490,23 @@ impl Buffer {
|
||||
let (tx, rx) = futures::channel::oneshot::channel();
|
||||
let prev_version = self.text.version();
|
||||
self.reload_task = Some(cx.spawn(async move |this, cx| {
|
||||
let Some((new_mtime, new_text)) = this.update(cx, |this, cx| {
|
||||
let Some((new_mtime, load_bytes_task, encoding)) = this.update(cx, |this, cx| {
|
||||
let file = this.file.as_ref()?.as_local()?;
|
||||
|
||||
Some((file.disk_state().mtime(), file.load(cx)))
|
||||
Some((
|
||||
file.disk_state().mtime(),
|
||||
file.load_bytes(cx),
|
||||
this.encoding,
|
||||
))
|
||||
})?
|
||||
else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let new_text = new_text.await?;
|
||||
let diff = this
|
||||
.update(cx, |this, cx| this.diff(new_text.clone(), cx))?
|
||||
.await;
|
||||
let bytes = load_bytes_task.await?;
|
||||
let (cow, _encoding_used, _has_errors) = encoding.decode(&bytes);
|
||||
let new_text = cow.into_owned();
|
||||
|
||||
let diff = this.update(cx, |this, cx| this.diff(new_text, cx))?.await;
|
||||
this.update(cx, |this, cx| {
|
||||
if this.version() == diff.base_version {
|
||||
this.finalize_last_transaction();
|
||||
|
||||
@@ -1361,7 +1361,7 @@ impl LocalWorktree {
|
||||
}
|
||||
|
||||
let content = fs.load_bytes(&abs_path).await?;
|
||||
let (text, encoding, has_bom) = decode_byte(content);
|
||||
let (text, encoding, has_bom) = decode_byte(content)?;
|
||||
|
||||
let worktree = this.upgrade().context("worktree was dropped")?;
|
||||
let file = match entry.await? {
|
||||
@@ -1489,25 +1489,12 @@ impl LocalWorktree {
|
||||
let fs = fs.clone();
|
||||
let abs_path = abs_path.clone();
|
||||
async move {
|
||||
let bom_bytes = if has_bom {
|
||||
if encoding == encoding_rs::UTF_16LE {
|
||||
vec![0xFF, 0xFE]
|
||||
} else if encoding == encoding_rs::UTF_16BE {
|
||||
vec![0xFE, 0xFF]
|
||||
} else if encoding == encoding_rs::UTF_8 {
|
||||
vec![0xEF, 0xBB, 0xBF]
|
||||
} else {
|
||||
vec![]
|
||||
}
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
|
||||
// For UTF-8, use the optimized `fs.save` which writes Rope chunks directly to disk
|
||||
// without allocating a contiguous string.
|
||||
if encoding == encoding_rs::UTF_8 && !has_bom {
|
||||
return fs.save(&abs_path, &text, line_ending).await;
|
||||
}
|
||||
|
||||
// For legacy encodings (e.g. Shift-JIS), we fall back to converting the entire Rope
|
||||
// to a String/Bytes in memory before writing.
|
||||
//
|
||||
@@ -1520,13 +1507,45 @@ impl LocalWorktree {
|
||||
LineEnding::Windows => text_string.replace('\n', "\r\n"),
|
||||
};
|
||||
|
||||
let (cow, _, _) = encoding.encode(&normalized_text);
|
||||
let bytes = if !bom_bytes.is_empty() {
|
||||
let mut bytes = bom_bytes;
|
||||
bytes.extend_from_slice(&cow);
|
||||
bytes.into()
|
||||
// Create the byte vector manually for UTF-16 encodings because encoding_rs encodes to UTF-8 by default (per WHATWG standards),
|
||||
// which is not what we want for saving files.
|
||||
let bytes = if encoding == encoding_rs::UTF_16BE {
|
||||
let mut data = Vec::with_capacity(normalized_text.len() * 2 + 2);
|
||||
if has_bom {
|
||||
data.extend_from_slice(&[0xFE, 0xFF]); // BOM
|
||||
}
|
||||
let utf16be_bytes =
|
||||
normalized_text.encode_utf16().flat_map(|u| u.to_be_bytes());
|
||||
data.extend(utf16be_bytes);
|
||||
data.into()
|
||||
} else if encoding == encoding_rs::UTF_16LE {
|
||||
let mut data = Vec::with_capacity(normalized_text.len() * 2 + 2);
|
||||
if has_bom {
|
||||
data.extend_from_slice(&[0xFF, 0xFE]); // BOM
|
||||
}
|
||||
let utf16le_bytes =
|
||||
normalized_text.encode_utf16().flat_map(|u| u.to_le_bytes());
|
||||
data.extend(utf16le_bytes);
|
||||
data.into()
|
||||
} else {
|
||||
cow
|
||||
// For other encodings (Shift-JIS, UTF-8 with BOM, etc.), delegate to encoding_rs.
|
||||
let bom_bytes = if has_bom {
|
||||
if encoding == encoding_rs::UTF_8 {
|
||||
vec![0xEF, 0xBB, 0xBF]
|
||||
} else {
|
||||
vec![]
|
||||
}
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
let (cow, _, _) = encoding.encode(&normalized_text);
|
||||
if !bom_bytes.is_empty() {
|
||||
let mut bytes = bom_bytes;
|
||||
bytes.extend_from_slice(&cow);
|
||||
bytes.into()
|
||||
} else {
|
||||
cow
|
||||
}
|
||||
};
|
||||
|
||||
fs.write(&abs_path, &bytes).await
|
||||
@@ -5842,11 +5861,28 @@ impl fs::Watcher for NullWatcher {
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_byte(bytes: Vec<u8>) -> (String, &'static Encoding, bool) {
|
||||
fn decode_byte(bytes: Vec<u8>) -> anyhow::Result<(String, &'static Encoding, bool)> {
|
||||
// check BOM
|
||||
if let Some((encoding, _bom_len)) = Encoding::for_bom(&bytes) {
|
||||
let (cow, _) = encoding.decode_with_bom_removal(&bytes);
|
||||
return (cow.into_owned(), encoding, true);
|
||||
return Ok((cow.into_owned(), encoding, true));
|
||||
}
|
||||
|
||||
match analyze_byte_content(&bytes) {
|
||||
ByteContent::Utf16Le => {
|
||||
let encoding = encoding_rs::UTF_16LE;
|
||||
let (cow, _, _) = encoding.decode(&bytes);
|
||||
return Ok((cow.into_owned(), encoding, false));
|
||||
}
|
||||
ByteContent::Utf16Be => {
|
||||
let encoding = encoding_rs::UTF_16BE;
|
||||
let (cow, _, _) = encoding.decode(&bytes);
|
||||
return Ok((cow.into_owned(), encoding, false));
|
||||
}
|
||||
ByteContent::Binary => {
|
||||
anyhow::bail!("Binary files are not supported");
|
||||
}
|
||||
ByteContent::Unknown => {}
|
||||
}
|
||||
|
||||
fn detect_encoding(bytes: Vec<u8>) -> (String, &'static Encoding) {
|
||||
@@ -5867,14 +5903,66 @@ fn decode_byte(bytes: Vec<u8>) -> (String, &'static Encoding, bool) {
|
||||
// displaying raw escape sequences instead of the correct characters.
|
||||
if text.contains('\x1b') {
|
||||
let (s, enc) = detect_encoding(text.into_bytes());
|
||||
(s, enc, false)
|
||||
Ok((s, enc, false))
|
||||
} else {
|
||||
(text, encoding_rs::UTF_8, false)
|
||||
Ok((text, encoding_rs::UTF_8, false))
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
let (s, enc) = detect_encoding(e.into_bytes());
|
||||
(s, enc, false)
|
||||
Ok((s, enc, false))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Classification produced by the null-byte distribution heuristic.
#[derive(PartialEq)]
enum ByteContent {
    Utf16Le,
    Utf16Be,
    Binary,
    Unknown,
}

// Heuristic check using null byte distribution.
// NOTE: This relies on the presence of ASCII characters (which become `0x00` in UTF-16).
// Files consisting purely of non-ASCII characters (like Japanese) may not be detected here
// and will result in `Unknown`.
fn analyze_byte_content(bytes: &[u8]) -> ByteContent {
    // A single byte (or nothing) cannot form a UTF-16 code unit.
    if bytes.len() < 2 {
        return ByteContent::Unknown;
    }

    // Only inspect a bounded prefix so classifying large files stays cheap.
    let sample = &bytes[..bytes.len().min(1024)];

    // Tally null bytes by index parity: for ASCII text encoded as UTF-16, the
    // zero byte of each 2-byte unit lands consistently on one side.
    let (even_nulls, odd_nulls) = sample
        .iter()
        .enumerate()
        .filter(|&(_, &byte)| byte == 0)
        .fold((0usize, 0usize), |(even, odd), (index, _)| {
            if index % 2 == 0 {
                (even + 1, odd)
            } else {
                (even, odd + 1)
            }
        });

    let total_nulls = even_nulls + odd_nulls;

    // No nulls at all, or too few relative to the sample size: treat as
    // ordinary text and let the caller fall back to other detection.
    if total_nulls == 0 || total_nulls < sample.len() / 10 {
        return ByteContent::Unknown;
    }

    // Nulls strongly skewed toward even offsets mean the high byte comes
    // first (big-endian); skew toward odd offsets means little-endian.
    if even_nulls > odd_nulls * 4 {
        ByteContent::Utf16Be
    } else if odd_nulls > even_nulls * 4 {
        ByteContent::Utf16Le
    } else {
        // Plenty of nulls but no clear parity pattern: likely a binary file.
        ByteContent::Binary
    }
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::{Entry, EntryKind, Event, PathChange, Worktree, WorktreeModelHandle};
|
||||
use anyhow::{Context as _, Result};
|
||||
use anyhow::Result;
|
||||
use encoding_rs;
|
||||
use fs::{FakeFs, Fs, RealFs, RemoveOptions};
|
||||
use git::{DOT_GIT, GITIGNORE, REPO_EXCLUDE};
|
||||
@@ -2568,71 +2568,87 @@ fn init_test(cx: &mut gpui::TestAppContext) {
|
||||
#[gpui::test]
|
||||
async fn test_load_file_encoding(cx: &mut TestAppContext) {
|
||||
init_test(cx);
|
||||
let test_cases: Vec<(&str, &[u8], &str)> = vec![
|
||||
("utf8.txt", "こんにちは".as_bytes(), "こんにちは"), // "こんにちは" is Japanese "Hello"
|
||||
(
|
||||
"sjis.txt",
|
||||
&[0x82, 0xb1, 0x82, 0xf1, 0x82, 0xc9, 0x82, 0xbf, 0x82, 0xcd],
|
||||
"こんにちは",
|
||||
),
|
||||
(
|
||||
"eucjp.txt",
|
||||
&[0xa4, 0xb3, 0xa4, 0xf3, 0xa4, 0xcb, 0xa4, 0xc1, 0xa4, 0xcf],
|
||||
"こんにちは",
|
||||
),
|
||||
(
|
||||
"iso2022jp.txt",
|
||||
&[
|
||||
|
||||
struct TestCase {
|
||||
name: &'static str,
|
||||
bytes: Vec<u8>,
|
||||
expected_text: &'static str,
|
||||
}
|
||||
|
||||
// --- Success Cases ---
|
||||
let success_cases = vec![
|
||||
TestCase {
|
||||
name: "utf8.txt",
|
||||
bytes: "こんにちは".as_bytes().to_vec(),
|
||||
expected_text: "こんにちは",
|
||||
},
|
||||
TestCase {
|
||||
name: "sjis.txt",
|
||||
bytes: vec![0x82, 0xb1, 0x82, 0xf1, 0x82, 0xc9, 0x82, 0xbf, 0x82, 0xcd],
|
||||
expected_text: "こんにちは",
|
||||
},
|
||||
TestCase {
|
||||
name: "eucjp.txt",
|
||||
bytes: vec![0xa4, 0xb3, 0xa4, 0xf3, 0xa4, 0xcb, 0xa4, 0xc1, 0xa4, 0xcf],
|
||||
expected_text: "こんにちは",
|
||||
},
|
||||
TestCase {
|
||||
name: "iso2022jp.txt",
|
||||
bytes: vec![
|
||||
0x1b, 0x24, 0x42, 0x24, 0x33, 0x24, 0x73, 0x24, 0x4b, 0x24, 0x41, 0x24, 0x4f, 0x1b,
|
||||
0x28, 0x42,
|
||||
],
|
||||
"こんにちは",
|
||||
),
|
||||
// Western Europe (Windows-1252)
|
||||
// "Café" -> 0xE9 is 'é' in Windows-1252 (it is typically 0xC3 0xA9 in UTF-8)
|
||||
("win1252.txt", &[0x43, 0x61, 0x66, 0xe9], "Café"),
|
||||
// Chinese Simplified (GBK)
|
||||
// Note: We use a slightly longer string here because short byte sequences can be ambiguous
|
||||
// in multi-byte encodings. Providing more context helps the heuristic detector guess correctly.
|
||||
// Text: "今天天气不错" (Today's weather is not bad / nice)
|
||||
// Bytes:
|
||||
// 今: BD F1
|
||||
// 天: CC EC
|
||||
// 天: CC EC
|
||||
// 气: C6 F8
|
||||
// 不: B2 BB
|
||||
// 错: B4 ED
|
||||
(
|
||||
"gbk.txt",
|
||||
&[
|
||||
expected_text: "こんにちは",
|
||||
},
|
||||
TestCase {
|
||||
name: "win1252.txt",
|
||||
bytes: vec![0x43, 0x61, 0x66, 0xe9],
|
||||
expected_text: "Café",
|
||||
},
|
||||
TestCase {
|
||||
name: "gbk.txt",
|
||||
bytes: vec![
|
||||
0xbd, 0xf1, 0xcc, 0xec, 0xcc, 0xec, 0xc6, 0xf8, 0xb2, 0xbb, 0xb4, 0xed,
|
||||
],
|
||||
"今天天气不错",
|
||||
),
|
||||
(
|
||||
"utf16le_bom.txt",
|
||||
&[
|
||||
expected_text: "今天天气不错",
|
||||
},
|
||||
// UTF-16LE with BOM
|
||||
TestCase {
|
||||
name: "utf16le_bom.txt",
|
||||
bytes: vec![
|
||||
0xFF, 0xFE, // BOM
|
||||
0x53, 0x30, // こ
|
||||
0x93, 0x30, // ん
|
||||
0x6B, 0x30, // に
|
||||
0x61, 0x30, // ち
|
||||
0x6F, 0x30, // は
|
||||
0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0x30,
|
||||
],
|
||||
"こんにちは",
|
||||
),
|
||||
(
|
||||
"utf8_bom.txt",
|
||||
&[
|
||||
0xEF, 0xBB, 0xBF, // UTF-8 BOM
|
||||
0xE3, 0x81, 0x93, // こ
|
||||
0xE3, 0x82, 0x93, // ん
|
||||
0xE3, 0x81, 0xAB, // に
|
||||
0xE3, 0x81, 0xA1, // ち
|
||||
0xE3, 0x81, 0xAF, // は
|
||||
expected_text: "こんにちは",
|
||||
},
|
||||
// UTF-16BE with BOM
|
||||
TestCase {
|
||||
name: "utf16be_bom.txt",
|
||||
bytes: vec![
|
||||
0xFE, 0xFF, // BOM
|
||||
0x30, 0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F,
|
||||
],
|
||||
"こんにちは",
|
||||
),
|
||||
expected_text: "こんにちは",
|
||||
},
|
||||
// UTF-16LE without BOM (ASCII only)
|
||||
// This relies on the "null byte heuristic" we implemented.
|
||||
// "ABC" -> 41 00 42 00 43 00
|
||||
TestCase {
|
||||
name: "utf16le_ascii_no_bom.txt",
|
||||
bytes: vec![0x41, 0x00, 0x42, 0x00, 0x43, 0x00],
|
||||
expected_text: "ABC",
|
||||
},
|
||||
];
|
||||
|
||||
// --- Failure Cases ---
|
||||
let failure_cases = vec![
|
||||
// Binary File (Should be detected by heuristic and return Error)
|
||||
// Contains random bytes and mixed nulls that don't match UTF-16 patterns
|
||||
TestCase {
|
||||
name: "binary.bin",
|
||||
bytes: vec![0x00, 0xFF, 0x12, 0x00, 0x99, 0x88, 0x77, 0x66, 0x00],
|
||||
expected_text: "", // Not used
|
||||
},
|
||||
];
|
||||
|
||||
let root_path = if cfg!(windows) {
|
||||
@@ -2642,15 +2658,11 @@ async fn test_load_file_encoding(cx: &mut TestAppContext) {
|
||||
};
|
||||
|
||||
let fs = FakeFs::new(cx.background_executor.clone());
|
||||
fs.create_dir(root_path).await.unwrap();
|
||||
|
||||
let mut files_json = serde_json::Map::new();
|
||||
for (name, _, _) in &test_cases {
|
||||
files_json.insert(name.to_string(), serde_json::Value::String("".to_string()));
|
||||
}
|
||||
|
||||
for (name, bytes, _) in &test_cases {
|
||||
let path = root_path.join(name);
|
||||
fs.write(&path, bytes).await.unwrap();
|
||||
for case in success_cases.iter().chain(failure_cases.iter()) {
|
||||
let path = root_path.join(case.name);
|
||||
fs.write(&path, &case.bytes).await.unwrap();
|
||||
}
|
||||
|
||||
let tree = Worktree::local(
|
||||
@@ -2667,34 +2679,54 @@ async fn test_load_file_encoding(cx: &mut TestAppContext) {
|
||||
cx.read(|cx| tree.read(cx).as_local().unwrap().scan_complete())
|
||||
.await;
|
||||
|
||||
for (name, _, expected) in test_cases {
|
||||
let loaded = tree
|
||||
.update(cx, |tree, cx| tree.load_file(rel_path(name), cx))
|
||||
.await
|
||||
.with_context(|| format!("Failed to load {}", name))
|
||||
.unwrap();
|
||||
let rel_path = |name: &str| {
|
||||
RelPath::new(&Path::new(name), PathStyle::local())
|
||||
.unwrap()
|
||||
.into_arc()
|
||||
};
|
||||
|
||||
// Run Success Tests
|
||||
for case in success_cases {
|
||||
let loaded = tree
|
||||
.update(cx, |tree, cx| tree.load_file(&rel_path(case.name), cx))
|
||||
.await;
|
||||
if let Err(e) = &loaded {
|
||||
panic!("Failed to load success case '{}': {:?}", case.name, e);
|
||||
}
|
||||
let loaded = loaded.unwrap();
|
||||
assert_eq!(
|
||||
loaded.text, expected,
|
||||
loaded.text, case.expected_text,
|
||||
"Encoding mismatch for file: {}",
|
||||
name
|
||||
case.name
|
||||
);
|
||||
}
|
||||
|
||||
// Run Failure Tests
|
||||
for case in failure_cases {
|
||||
let loaded = tree
|
||||
.update(cx, |tree, cx| tree.load_file(&rel_path(case.name), cx))
|
||||
.await;
|
||||
assert!(
|
||||
loaded.is_err(),
|
||||
"Failure case '{}' unexpectedly succeeded! It should have been detected as binary.",
|
||||
case.name
|
||||
);
|
||||
let err_msg = loaded.unwrap_err().to_string();
|
||||
println!("Got expected error for {}: {}", case.name, err_msg);
|
||||
}
|
||||
}
|
||||
|
||||
#[gpui::test]
|
||||
async fn test_write_file_encoding(cx: &mut gpui::TestAppContext) {
|
||||
init_test(cx);
|
||||
let fs = FakeFs::new(cx.executor());
|
||||
|
||||
let root_path = if cfg!(windows) {
|
||||
Path::new("C:\\root")
|
||||
} else {
|
||||
Path::new("/root")
|
||||
};
|
||||
fs.create_dir(root_path).await.unwrap();
|
||||
let file_path = root_path.join("test.txt");
|
||||
|
||||
fs.insert_file(&file_path, "initial".into()).await;
|
||||
|
||||
let worktree = Worktree::local(
|
||||
root_path,
|
||||
@@ -2707,33 +2739,107 @@ async fn test_write_file_encoding(cx: &mut gpui::TestAppContext) {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let path: Arc<Path> = Path::new("test.txt").into();
|
||||
let rel_path = RelPath::new(&path, PathStyle::local()).unwrap().into_arc();
|
||||
// Define test case structure
|
||||
struct TestCase {
|
||||
name: &'static str,
|
||||
text: &'static str,
|
||||
encoding: &'static encoding_rs::Encoding,
|
||||
has_bom: bool,
|
||||
expected_bytes: Vec<u8>,
|
||||
}
|
||||
|
||||
let text = text::Rope::from("こんにちは");
|
||||
|
||||
let task = worktree.update(cx, |wt, cx| {
|
||||
wt.write_file(
|
||||
rel_path,
|
||||
text,
|
||||
text::LineEnding::Unix,
|
||||
encoding_rs::SHIFT_JIS,
|
||||
false,
|
||||
cx,
|
||||
)
|
||||
});
|
||||
|
||||
task.await.unwrap();
|
||||
|
||||
let bytes = fs.load_bytes(&file_path).await.unwrap();
|
||||
|
||||
let expected_bytes = vec![
|
||||
0x82, 0xb1, // こ
|
||||
0x82, 0xf1, // ん
|
||||
0x82, 0xc9, // に
|
||||
0x82, 0xbf, // ち
|
||||
0x82, 0xcd, // は
|
||||
let cases = vec![
|
||||
// Shift_JIS with Japanese
|
||||
TestCase {
|
||||
name: "Shift_JIS with Japanese",
|
||||
text: "こんにちは",
|
||||
encoding: encoding_rs::SHIFT_JIS,
|
||||
has_bom: false,
|
||||
expected_bytes: vec![0x82, 0xb1, 0x82, 0xf1, 0x82, 0xc9, 0x82, 0xbf, 0x82, 0xcd],
|
||||
},
|
||||
// UTF-8 No BOM
|
||||
TestCase {
|
||||
name: "UTF-8 No BOM",
|
||||
text: "AB",
|
||||
encoding: encoding_rs::UTF_8,
|
||||
has_bom: false,
|
||||
expected_bytes: vec![0x41, 0x42],
|
||||
},
|
||||
// UTF-8 with BOM
|
||||
TestCase {
|
||||
name: "UTF-8 with BOM",
|
||||
text: "AB",
|
||||
encoding: encoding_rs::UTF_8,
|
||||
has_bom: true,
|
||||
expected_bytes: vec![0xEF, 0xBB, 0xBF, 0x41, 0x42],
|
||||
},
|
||||
// UTF-16LE No BOM with Japanese
|
||||
// NOTE: This passes thanks to the manual encoding fix implemented in `write_file`.
|
||||
TestCase {
|
||||
name: "UTF-16LE No BOM with Japanese",
|
||||
text: "こんにちは",
|
||||
encoding: encoding_rs::UTF_16LE,
|
||||
has_bom: false,
|
||||
expected_bytes: vec![0x53, 0x30, 0x93, 0x30, 0x6b, 0x30, 0x61, 0x30, 0x6f, 0x30],
|
||||
},
|
||||
// UTF-16LE with BOM
|
||||
TestCase {
|
||||
name: "UTF-16LE with BOM",
|
||||
text: "A",
|
||||
encoding: encoding_rs::UTF_16LE,
|
||||
has_bom: true,
|
||||
expected_bytes: vec![0xFF, 0xFE, 0x41, 0x00],
|
||||
},
|
||||
// UTF-16BE No BOM with Japanese
|
||||
// NOTE: This passes thanks to the manual encoding fix.
|
||||
TestCase {
|
||||
name: "UTF-16BE No BOM with Japanese",
|
||||
text: "こんにちは",
|
||||
encoding: encoding_rs::UTF_16BE,
|
||||
has_bom: false,
|
||||
expected_bytes: vec![0x30, 0x53, 0x30, 0x93, 0x30, 0x6b, 0x30, 0x61, 0x30, 0x6f],
|
||||
},
|
||||
// UTF-16BE with BOM
|
||||
TestCase {
|
||||
name: "UTF-16BE with BOM",
|
||||
text: "A",
|
||||
encoding: encoding_rs::UTF_16BE,
|
||||
has_bom: true,
|
||||
expected_bytes: vec![0xFE, 0xFF, 0x00, 0x41],
|
||||
},
|
||||
];
|
||||
|
||||
assert_eq!(bytes, expected_bytes, "Should be saved as Shift-JIS");
|
||||
for (i, case) in cases.into_iter().enumerate() {
|
||||
let file_name = format!("test_{}.txt", i);
|
||||
let path: Arc<Path> = Path::new(&file_name).into();
|
||||
let file_path = root_path.join(&file_name);
|
||||
|
||||
fs.insert_file(&file_path, "".into()).await;
|
||||
|
||||
let rel_path = RelPath::new(&path, PathStyle::local()).unwrap().into_arc();
|
||||
let text = text::Rope::from(case.text);
|
||||
|
||||
let task = worktree.update(cx, |wt, cx| {
|
||||
wt.write_file(
|
||||
rel_path,
|
||||
text,
|
||||
text::LineEnding::Unix,
|
||||
case.encoding,
|
||||
case.has_bom,
|
||||
cx,
|
||||
)
|
||||
});
|
||||
|
||||
if let Err(e) = task.await {
|
||||
panic!("Unexpected error in case '{}': {:?}", case.name, e);
|
||||
}
|
||||
|
||||
let bytes = fs.load_bytes(&file_path).await.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
bytes, case.expected_bytes,
|
||||
"case '{}' mismatch. Expected {:?}, but got {:?}",
|
||||
case.name, case.expected_bytes, bytes
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user