Fix fmt UTF-8 characters as fill (#18533)

Co-authored-by: Jacob Young <jacobly0@users.noreply.github.com>
This commit is contained in:
vinnichase 2024-01-14 04:47:03 +01:00 committed by GitHub
parent b723296e1f
commit 279607cae5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 88 additions and 34 deletions

View File

@ -23,7 +23,7 @@ pub const FormatOptions = struct {
precision: ?usize = null,
width: ?usize = null,
alignment: Alignment = .right,
fill: u8 = ' ',
fill: u21 = ' ',
};
/// Renders fmt string with args, calling `writer` with slices of bytes.
@ -211,14 +211,18 @@ fn cacheString(str: anytype) []const u8 {
pub const Placeholder = struct {
specifier_arg: []const u8,
fill: u8,
fill: u21,
alignment: Alignment,
arg: Specifier,
width: Specifier,
precision: Specifier,
pub fn parse(comptime str: anytype) Placeholder {
comptime var parser = Parser{ .buf = &str };
const view = std.unicode.Utf8View.initComptime(&str);
comptime var parser = Parser{
.buf = &str,
.iter = view.iterator(),
};
// Parse the positional argument number
const arg = comptime parser.specifier() catch |err|
@ -230,7 +234,7 @@ pub const Placeholder = struct {
// Skip the colon, if present
if (comptime parser.char()) |ch| {
if (ch != ':') {
@compileError("expected : or }, found '" ++ [1]u8{ch} ++ "'");
@compileError("expected : or }, found '" ++ unicode.utf8EncodeComptime(ch) ++ "'");
}
}
@ -265,7 +269,7 @@ pub const Placeholder = struct {
// Skip the dot, if present
if (comptime parser.char()) |ch| {
if (ch != '.') {
@compileError("expected . or }, found '" ++ [1]u8{ch} ++ "'");
@compileError("expected . or }, found '" ++ unicode.utf8EncodeComptime(ch) ++ "'");
}
}
@ -274,7 +278,7 @@ pub const Placeholder = struct {
@compileError(@errorName(err));
if (comptime parser.char()) |ch| {
@compileError("extraneous trailing character '" ++ [1]u8{ch} ++ "'");
@compileError("extraneous trailing character '" ++ unicode.utf8EncodeComptime(ch) ++ "'");
}
return Placeholder{
@ -297,21 +301,23 @@ pub const Specifier = union(enum) {
pub const Parser = struct {
buf: []const u8,
pos: usize = 0,
iter: std.unicode.Utf8Iterator = undefined,
// Returns a decimal number or null if the current character is not a
// digit
pub fn number(self: *@This()) ?usize {
var r: ?usize = null;
while (self.pos < self.buf.len) : (self.pos += 1) {
switch (self.buf[self.pos]) {
while (self.peek(0)) |code_point| {
switch (code_point) {
'0'...'9' => {
if (r == null) r = 0;
r.? *= 10;
r.? += self.buf[self.pos] - '0';
r.? += code_point - '0';
},
else => break,
}
_ = self.iter.nextCodepoint();
}
return r;
@ -319,31 +325,27 @@ pub const Parser = struct {
// Returns a substring of the input starting from the current position
// and ending where `ch` is found or until the end if not found
pub fn until(self: *@This(), ch: u8) []const u8 {
const start = self.pos;
if (start >= self.buf.len)
return &[_]u8{};
while (self.pos < self.buf.len) : (self.pos += 1) {
if (self.buf[self.pos] == ch) break;
pub fn until(self: *@This(), ch: u21) []const u8 {
var result: []const u8 = &[_]u8{};
while (self.peek(0)) |code_point| {
if (code_point == ch)
break;
result = result ++ (self.iter.nextCodepointSlice() orelse &[_]u8{});
}
return self.buf[start..self.pos];
return result;
}
// Returns one character, if available
pub fn char(self: *@This()) ?u8 {
if (self.pos < self.buf.len) {
const ch = self.buf[self.pos];
self.pos += 1;
return ch;
pub fn char(self: *@This()) ?u21 {
if (self.iter.nextCodepoint()) |code_point| {
return code_point;
}
return null;
}
pub fn maybe(self: *@This(), val: u8) bool {
if (self.pos < self.buf.len and self.buf[self.pos] == val) {
self.pos += 1;
pub fn maybe(self: *@This(), val: u21) bool {
if (self.peek(0) == val) {
_ = self.iter.nextCodepoint();
return true;
}
return false;
@ -367,8 +369,17 @@ pub const Parser = struct {
}
// Returns the n-th next character or null if that's past the end
pub fn peek(self: *@This(), n: usize) ?u8 {
return if (self.pos + n < self.buf.len) self.buf[self.pos + n] else null;
pub fn peek(self: *@This(), n: usize) ?u21 {
const original_i = self.iter.i;
defer self.iter.i = original_i;
var i = 0;
var code_point: ?u21 = null;
while (i <= n) : (i += 1) {
code_point = self.iter.nextCodepoint();
if (code_point == null) return null;
}
return code_point;
}
};
@ -965,8 +976,7 @@ pub fn formatUnicodeCodepoint(
var buf: [4]u8 = undefined;
const len = unicode.utf8Encode(c, &buf) catch |err| switch (err) {
error.Utf8CannotEncodeSurrogateHalf, error.CodepointTooLarge => {
const len = unicode.utf8Encode(unicode.replacement_character, &buf) catch unreachable;
return formatBuf(buf[0..len], options, writer);
return formatBuf(&unicode.utf8EncodeComptime(unicode.replacement_character), options, writer);
},
};
return formatBuf(buf[0..len], options, writer);
@ -985,20 +995,28 @@ pub fn formatBuf(
if (padding == 0)
return writer.writeAll(buf);
var fill_buffer: [4]u8 = undefined;
const fill_utf8 = if (unicode.utf8Encode(options.fill, &fill_buffer)) |len|
fill_buffer[0..len]
else |err| switch (err) {
error.Utf8CannotEncodeSurrogateHalf,
error.CodepointTooLarge,
=> &unicode.utf8EncodeComptime(unicode.replacement_character),
};
switch (options.alignment) {
.left => {
try writer.writeAll(buf);
try writer.writeByteNTimes(options.fill, padding);
try writer.writeBytesNTimes(fill_utf8, padding);
},
.center => {
const left_padding = padding / 2;
const right_padding = (padding + 1) / 2;
try writer.writeByteNTimes(options.fill, left_padding);
try writer.writeBytesNTimes(fill_utf8, left_padding);
try writer.writeAll(buf);
try writer.writeByteNTimes(options.fill, right_padding);
try writer.writeBytesNTimes(fill_utf8, right_padding);
},
.right => {
try writer.writeByteNTimes(options.fill, padding);
try writer.writeBytesNTimes(fill_utf8, padding);
try writer.writeAll(buf);
},
}
@ -2793,6 +2811,15 @@ test "padding" {
try expectFmt("a====", "{c:=<5}", .{'a'});
}
test "padding fill char utf" {
    // '─' (U+2500) is a multi-byte UTF-8 fill character; every expected
    // string below has it occupying exactly one column of the requested width.
    // Each case is (expected output, format template), checked in listed order.

    // String argument that itself contains a multi-byte character ('ê').
    inline for (.{
        .{ "──crêpe───", "{s:─^10}" },
        .{ "─────crêpe", "{s:─>10}" },
        .{ "crêpe─────", "{s:─<10}" },
    }) |case| {
        try expectFmt(case[0], case[1], .{"crêpe"});
    }

    // Single character argument.
    inline for (.{
        .{ "────a", "{c:─>5}" },
        .{ "──a──", "{c:─^5}" },
        .{ "a────", "{c:─<5}" },
    }) |case| {
        try expectFmt(case[0], case[1], .{'a'});
    }
}
test "decimal float padding" {
const number: f32 = 3.1415;
try expectFmt("left-pad: **3.141\n", "left-pad: {d:*>7.3}\n", .{number});

View File

@ -45,6 +45,13 @@ pub fn Writer(
}
}
/// Writes the whole of `bytes` to the stream `n` times in a row.
/// Nothing is written when `n` is zero. The first error returned by
/// `writeAll` stops the repetition and is returned to the caller.
pub fn writeBytesNTimes(self: Self, bytes: []const u8, n: usize) Error!void {
    for (0..n) |_| {
        try self.writeAll(bytes);
    }
}
pub inline fn writeInt(self: Self, comptime T: type, value: T, endian: std.builtin.Endian) Error!void {
var bytes: [@divExact(@typeInfo(T).Int.bits, 8)]u8 = undefined;
mem.writeInt(std.math.ByteAlignedInt(@TypeOf(value)), &bytes, value, endian);

View File

@ -69,6 +69,19 @@ pub fn utf8Encode(c: u21, out: []u8) !u3 {
return length;
}
/// Encodes the comptime-known code point `c` as UTF-8 and returns the bytes
/// by value as an array whose length is `utf8CodepointSequenceLength(c)`.
/// An error from `utf8CodepointSequenceLength` or from `utf8Encode` is
/// reported as a compile error whose message is the error's name.
pub inline fn utf8EncodeComptime(comptime c: u21) [
utf8CodepointSequenceLength(c) catch |err|
@compileError(@errorName(err))
]u8 {
// Same length computation as in the return type.
// NOTE(review): presumably `unreachable` here because the identical call in
// the return type expression has already rejected a failing `c` — confirm.
comptime var result: [
utf8CodepointSequenceLength(c) catch
unreachable
]u8 = undefined;
// The encode call inside the assert is what fills `result`; the assert
// checks that the number of bytes written equals the array length.
comptime assert((utf8Encode(c, &result) catch |err|
@compileError(@errorName(err))) == result.len);
return result;
}
const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error;
/// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
@ -525,6 +538,13 @@ fn testUtf8Encode() !void {
try testing.expect(array[3] == 0b10001000);
}
test "utf8 encode comptime" {
    // Covers one code point for each UTF-8 sequence length (1 to 4 bytes).
    // Each string literal is the UTF-8 encoding of the same character that is
    // passed to `utf8EncodeComptime`, so both sides must match byte for byte.

    // U+20AC, three bytes. The expected literal was previously empty, which
    // could never equal a three-byte encoding.
    try testing.expectEqualSlices(u8, "€", &utf8EncodeComptime('€'));
    // U+0024, one byte.
    try testing.expectEqualSlices(u8, "$", &utf8EncodeComptime('$'));
    // U+00A2, two bytes.
    try testing.expectEqualSlices(u8, "¢", &utf8EncodeComptime('¢'));
    // U+10348, four bytes.
    try testing.expectEqualSlices(u8, "𐍈", &utf8EncodeComptime('𐍈'));
}
test "utf8 encode error" {
try comptime testUtf8EncodeError();
try testUtf8EncodeError();