From a3540000909bdc6a59ba07c85d21afeb3a7e54e2 Mon Sep 17 00:00:00 2001 From: Ersikan Date: Sun, 14 Mar 2021 07:03:22 +0100 Subject: [PATCH 1/3] zig fmt: fix non-UTF-8 encoding #2820 Fixes #2820 After reading the source code, the first two bytes are inspected, and if they correspond to a UTF-16 BOM in little-endian order, the source code is converted to UTF-8. --- src/main.zig | 62 +++++++++++++++++++++++++++++++++++++++------------- test/cli.zig | 9 ++++++++ 2 files changed, 56 insertions(+), 15 deletions(-) diff --git a/src/main.zig b/src/main.zig index bc4f209b45..1625b7213f 100644 --- a/src/main.zig +++ b/src/main.zig @@ -2708,9 +2708,22 @@ pub fn cmdFmt(gpa: *Allocator, args: []const []const u8) !void { fatal("cannot use --stdin with positional arguments", .{}); } - const stdin = io.getStdIn().reader(); + const stdin = io.getStdIn(); - const source_code = try stdin.readAllAlloc(gpa, max_src_size); + const source_code = blk: { + const source_code = try stdin.readToEndAllocOptions(gpa, max_src_size, null, @alignOf(u16), null); + errdefer gpa.free(source_code); + + // If the file starts with a UTF-16 BOM, translate it to UTF-8 + if (mem.startsWith(u8, source_code, "\xff\xfe")) { + const source_code_utf16_le = mem.bytesAsSlice(u16, source_code); + const source_code_utf8 = try std.unicode.utf16leToUtf8Alloc(gpa, source_code_utf16_le); + gpa.free(source_code); + break :blk source_code_utf8; + } else { + break :blk source_code; + } + }; defer gpa.free(source_code); var tree = std.zig.parse(gpa, source_code) catch |err| { @@ -2785,6 +2798,7 @@ const FmtError = error{ EndOfStream, Unseekable, NotOpenForWriting, + UnknownTextFormat, } || fs.File.OpenError; fn fmtPath(fmt: *Fmt, file_path: []const u8, check_mode: bool, dir: fs.Dir, sub_path: []const u8) FmtError!void { @@ -2850,20 +2864,38 @@ fn fmtPathFile( if (stat.kind == .Directory) return error.IsDir; - const source_code = source_file.readToEndAllocOptions( - fmt.gpa, - max_src_size, - std.math.cast(usize, stat.size) catch return error.FileTooBig, - @alignOf(u8), - null, - ) catch |err| switch (err) { - error.ConnectionResetByPeer => unreachable, - error.ConnectionTimedOut => unreachable, - error.NotOpenForReading => unreachable, - else => |e| return e, + const source_code = blk: { + const source_code = source_file.readToEndAllocOptions( + fmt.gpa, + max_src_size, + std.math.cast(usize, stat.size) catch return error.FileTooBig, + @alignOf(u16), + null, + ) catch |err| switch (err) { + error.ConnectionResetByPeer => unreachable, + error.ConnectionTimedOut => unreachable, + error.NotOpenForReading => unreachable, + else => |e| return e, + }; + source_file.close(); + file_closed = true; + errdefer fmt.gpa.free(source_code); + + // If the file starts with a UTF-16 BOM, translate it to UTF-8 + if (mem.eql(u8, source_code[0..2], "\xff\xfe")) { + const source_code_utf16_le = mem.bytesAsSlice(u16, source_code); + const source_code_utf8 = std.unicode.utf16leToUtf8Alloc(fmt.gpa, source_code_utf16_le) catch |err| return switch (err) { + error.DanglingSurrogateHalf => FmtError.UnknownTextFormat, + error.ExpectedSecondSurrogateHalf => FmtError.UnknownTextFormat, + error.UnexpectedSecondSurrogateHalf => FmtError.UnknownTextFormat, + else => |e| e, + }; + fmt.gpa.free(source_code); + break :blk source_code_utf8; + } else { + break :blk source_code; + } }; - source_file.close(); - file_closed = true; defer fmt.gpa.free(source_code); // Add to set after no longer possible to get error.IsDir. diff --git a/test/cli.zig b/test/cli.zig index c0702fa54c..db58511d14 100644 --- a/test/cli.zig +++ b/test/cli.zig @@ -174,4 +174,13 @@ fn testZigFmt(zig_exe: []const u8, dir_path: []const u8) !void { const run_result3 = try exec(dir_path, true, &[_][]const u8{ zig_exe, "fmt", dir_path }); // both files have been formatted, nothing should change now testing.expect(run_result3.stdout.len == 0); + + // Check UTF-16 decoding + const fmt4_zig_path = try fs.path.join(a, &[_][]const u8{ dir_path, "fmt4.zig" }); + var unformatted_code_utf16 = "\xff\xfe \x00 \x00 \x00 \x00/\x00/\x00 \x00n\x00o\x00 \x00r\x00e\x00a\x00s\x00o\x00n\x00"; + try fs.cwd().writeFile(fmt4_zig_path, unformatted_code_utf16); + + const run_result4 = try exec(dir_path, true, &[_][]const u8{ zig_exe, "fmt", dir_path }); + testing.expect(std.mem.startsWith(u8, run_result4.stdout, fmt4_zig_path)); + testing.expect(run_result4.stdout.len == fmt4_zig_path.len + 1 and run_result4.stdout[run_result4.stdout.len - 1] == '\n'); } From 8942243f7a825e42c16c8d210f5f9dc3baa76b2f Mon Sep 17 00:00:00 2001 From: Ersikan Date: Sun, 14 Mar 2021 18:07:09 +0100 Subject: [PATCH 2/3] zig fmt: factorize source file reading and decoding Now reading a source file and decoding it from UTF-16LE to UTF-8 is done in a single function. Error messages are improved, and an error is emitted when the source file has a BOM not supported (UTF-16BE, UTF-32). Please note that the BOM of UTF-32 is composed of the same bytes as the BOM of UTF-16 followed by a null character. Therefore a source file in UTF-16LE starting with a null byte will be interpreted as an UTF-32, and rejeted because of an invalid format. In pratice this is not a problem, as the code would have been rejected later anyway because of the null character. --- src/main.zig | 102 +++++++++++++++++++++++++++------------------------ 1 file changed, 55 insertions(+), 47 deletions(-) diff --git a/src/main.zig b/src/main.zig index 1625b7213f..be227a2895 100644 --- a/src/main.zig +++ b/src/main.zig @@ -2637,6 +2637,50 @@ fn argvCmd(allocator: *Allocator, argv: []const []const u8) ![]u8 { return cmd.toOwnedSlice(); } +fn readSourceFileToEndAlloc(allocator: *mem.Allocator, input: *const fs.File, size_hint: ?usize) ![]const u8 { + const source_code = input.readToEndAllocOptions( + allocator, + max_src_size, + size_hint, + @alignOf(u16), + null, + ) catch |err| switch (err) { + error.ConnectionResetByPeer => unreachable, + error.ConnectionTimedOut => unreachable, + error.NotOpenForReading => unreachable, + else => |e| return e, + }; + errdefer allocator.free(source_code); + + // Detect unsupported file types with their Byte Order Mark + const unsupported_boms = [_][]const u8{ + "\xff\xfe\x00\x00", // UTF-32 little endian + "\xfe\xff\x00\x00", // UTF-32 big endian + "\xfe\xff", // UTF-16 big endian + }; + for (unsupported_boms) |bom| { + if (mem.startsWith(u8, source_code, bom)) { + return error.UnsupportedEncoding; + } + } + + // If the file starts with a UTF-16 little endian BOM, translate it to UTF-8 + if (mem.startsWith(u8, source_code, "\xff\xfe")) { + const source_code_utf16_le = mem.bytesAsSlice(u16, source_code); + const source_code_utf8 = std.unicode.utf16leToUtf8Alloc(allocator, source_code_utf16_le) catch |err| switch (err) { + error.DanglingSurrogateHalf => error.UnsupportedEncoding, + error.ExpectedSecondSurrogateHalf => error.UnsupportedEncoding, + error.UnexpectedSecondSurrogateHalf => error.UnsupportedEncoding, + else => |e| return e, + }; + + allocator.free(source_code); + return source_code_utf8; + } + + return source_code; +} + pub const usage_fmt = \\Usage: zig fmt [file]... \\ @@ -2709,20 +2753,8 @@ pub fn cmdFmt(gpa: *Allocator, args: []const []const u8) !void { } const stdin = io.getStdIn(); - - const source_code = blk: { - const source_code = try stdin.readToEndAllocOptions(gpa, max_src_size, null, @alignOf(u16), null); - errdefer gpa.free(source_code); - - // If the file starts with a UTF-16 BOM, translate it to UTF-8 - if (mem.startsWith(u8, source_code, "\xff\xfe")) { - const source_code_utf16_le = mem.bytesAsSlice(u16, source_code); - const source_code_utf8 = try std.unicode.utf16leToUtf8Alloc(gpa, source_code_utf16_le); - gpa.free(source_code); - break :blk source_code_utf8; - } else { - break :blk source_code; - } + const source_code = readSourceFileToEndAlloc(gpa, &stdin, null) catch |err| { + fatal("unable to read stdin: {s}", .{err}); }; defer gpa.free(source_code); @@ -2798,7 +2830,7 @@ const FmtError = error{ EndOfStream, Unseekable, NotOpenForWriting, - UnknownTextFormat, + UnsupportedEncoding, } || fs.File.OpenError; fn fmtPath(fmt: *Fmt, file_path: []const u8, check_mode: bool, dir: fs.Dir, sub_path: []const u8) FmtError!void { @@ -2864,40 +2896,16 @@ fn fmtPathFile( if (stat.kind == .Directory) return error.IsDir; - const source_code = blk: { - const source_code = source_file.readToEndAllocOptions( - fmt.gpa, - max_src_size, - std.math.cast(usize, stat.size) catch return error.FileTooBig, - @alignOf(u16), - null, - ) catch |err| switch (err) { - error.ConnectionResetByPeer => unreachable, - error.ConnectionTimedOut => unreachable, - error.NotOpenForReading => unreachable, - else => |e| return e, - }; - source_file.close(); - file_closed = true; - errdefer fmt.gpa.free(source_code); - - // If the file starts with a UTF-16 BOM, translate it to UTF-8 - if (mem.eql(u8, source_code[0..2], "\xff\xfe")) { - const source_code_utf16_le = mem.bytesAsSlice(u16, source_code); - const source_code_utf8 = std.unicode.utf16leToUtf8Alloc(fmt.gpa, source_code_utf16_le) catch |err| return switch (err) { - error.DanglingSurrogateHalf => FmtError.UnknownTextFormat, - error.ExpectedSecondSurrogateHalf => FmtError.UnknownTextFormat, - error.UnexpectedSecondSurrogateHalf => FmtError.UnknownTextFormat, - else => |e| e, - }; - fmt.gpa.free(source_code); - break :blk source_code_utf8; - } else { - break :blk source_code; - } - }; + const source_code = try readSourceFileToEndAlloc( + fmt.gpa, + &source_file, + std.math.cast(usize, stat.size) catch return error.FileTooBig, + ); defer fmt.gpa.free(source_code); + source_file.close(); + file_closed = true; + // Add to set after no longer possible to get error.IsDir. if (try fmt.seen.fetchPut(stat.inode, {})) |_| return; From 36db4b7cc48e9e2b1bed5a977f36ef3a0f158ee8 Mon Sep 17 00:00:00 2001 From: Ersikan Date: Sun, 14 Mar 2021 18:12:42 +0100 Subject: [PATCH 3/3] test-cli: Remove temporary directory after tests --- test/cli.zig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/cli.zig b/test/cli.zig index db58511d14..dedea67a59 100644 --- a/test/cli.zig +++ b/test/cli.zig @@ -28,6 +28,8 @@ pub fn main() !void { const zig_exe = try fs.path.resolve(a, &[_][]const u8{zig_exe_rel}); const dir_path = try fs.path.join(a, &[_][]const u8{ cache_root, "clitest" }); + defer fs.cwd().deleteTree(dir_path) catch {}; + const TestFn = fn ([]const u8, []const u8) anyerror!void; const test_fns = [_]TestFn{ testZigInitLib,