Fix fmt UTF-8 characters as fill (#18533)

Co-authored-by: Jacob Young <jacobly0@users.noreply.github.com>
2024-11-28 08:02:32 +00:00 · 2024-01-14 04:47:03 +01:00 · 2024-01-14 04:47:03 +01:00 · 279607cae5
commit 279607cae5
parent b723296e1f
3 changed files with 88 additions and 34 deletions
--- a/lib/std/fmt.zig
+++ b/lib/std/fmt.zig
@ -23,7 +23,7 @@ pub const FormatOptions = struct {
    precision: ?usize = null,
    width: ?usize = null,
    alignment: Alignment = .right,
-    fill: u8 = ' ',
+    fill: u21 = ' ',
 };

 /// Renders fmt string with args, calling `writer` with slices of bytes.
@ -211,14 +211,18 @@ fn cacheString(str: anytype) []const u8 {

 pub const Placeholder = struct {
    specifier_arg: []const u8,
-    fill: u8,
+    fill: u21,
    alignment: Alignment,
    arg: Specifier,
    width: Specifier,
    precision: Specifier,

    pub fn parse(comptime str: anytype) Placeholder {
-        comptime var parser = Parser{ .buf = &str };
+        const view = std.unicode.Utf8View.initComptime(&str);
+        comptime var parser = Parser{
+            .buf = &str,
+            .iter = view.iterator(),
+        };

        // Parse the positional argument number
        const arg = comptime parser.specifier() catch |err|
@ -230,7 +234,7 @@ pub const Placeholder = struct {
        // Skip the colon, if present
        if (comptime parser.char()) |ch| {
            if (ch != ':') {
-                @compileError("expected : or }, found '" ++ [1]u8{ch} ++ "'");
+                @compileError("expected : or }, found '" ++ unicode.utf8EncodeComptime(ch) ++ "'");
            }
        }

@ -265,7 +269,7 @@ pub const Placeholder = struct {
        // Skip the dot, if present
        if (comptime parser.char()) |ch| {
            if (ch != '.') {
-                @compileError("expected . or }, found '" ++ [1]u8{ch} ++ "'");
+                @compileError("expected . or }, found '" ++ unicode.utf8EncodeComptime(ch) ++ "'");
            }
        }

@ -274,7 +278,7 @@ pub const Placeholder = struct {
            @compileError(@errorName(err));

        if (comptime parser.char()) |ch| {
-            @compileError("extraneous trailing character '" ++ [1]u8{ch} ++ "'");
+            @compileError("extraneous trailing character '" ++ unicode.utf8EncodeComptime(ch) ++ "'");
        }

        return Placeholder{
@ -297,21 +301,23 @@ pub const Specifier = union(enum) {
 pub const Parser = struct {
    buf: []const u8,
    pos: usize = 0,
+    iter: std.unicode.Utf8Iterator = undefined,

    // Returns a decimal number or null if the current character is not a
    // digit
    pub fn number(self: *@This()) ?usize {
        var r: ?usize = null;

-        while (self.pos < self.buf.len) : (self.pos += 1) {
-            switch (self.buf[self.pos]) {
+        while (self.peek(0)) |code_point| {
+            switch (code_point) {
                '0'...'9' => {
                    if (r == null) r = 0;
                    r.? *= 10;
-                    r.? += self.buf[self.pos] - '0';
+                    r.? += code_point - '0';
                },
                else => break,
            }
+            _ = self.iter.nextCodepoint();
        }

        return r;
@ -319,31 +325,27 @@ pub const Parser = struct {

    // Returns a substring of the input starting from the current position
    // and ending where `ch` is found or until the end if not found
-    pub fn until(self: *@This(), ch: u8) []const u8 {
-        const start = self.pos;
-
-        if (start >= self.buf.len)
-            return &[_]u8{};
-
-        while (self.pos < self.buf.len) : (self.pos += 1) {
-            if (self.buf[self.pos] == ch) break;
+    pub fn until(self: *@This(), ch: u21) []const u8 {
+        // Remember the byte offset where the scan begins so the result can be
+        // a plain sub-slice of the input. Building the result with `++`
+        // would restrict this function to comptime-known operands.
+        const start = self.iter.i;
+        while (self.peek(0)) |code_point| {
+            if (code_point == ch)
+                break;
+            // Consume the code point that was just inspected.
+            _ = self.iter.nextCodepoint();
        }
-        return self.buf[start..self.pos];
+        return self.iter.bytes[start..self.iter.i];
    }

    // Returns one character, if available
-    pub fn char(self: *@This()) ?u8 {
-        if (self.pos < self.buf.len) {
-            const ch = self.buf[self.pos];
-            self.pos += 1;
-            return ch;
-        }
-        return null;
+    pub fn char(self: *@This()) ?u21 {
+        // Forward the iterator's optional result: a code point when one is
+        // available, otherwise null.
+        return self.iter.nextCodepoint();
    }

-    pub fn maybe(self: *@This(), val: u8) bool {
-        if (self.pos < self.buf.len and self.buf[self.pos] == val) {
-            self.pos += 1;
+    pub fn maybe(self: *@This(), val: u21) bool {
+        if (self.peek(0) == val) {
+            _ = self.iter.nextCodepoint();
            return true;
        }
        return false;
@ -367,8 +369,17 @@ pub const Parser = struct {
    }

    // Returns the n-th next character or null if that's past the end
-    pub fn peek(self: *@This(), n: usize) ?u8 {
-        return if (self.pos + n < self.buf.len) self.buf[self.pos + n] else null;
+    pub fn peek(self: *@This(), n: usize) ?u21 {
+        // Look ahead on the parser's own iterator, then rewind it so the
+        // position is unchanged when this function returns.
+        const original_i = self.iter.i;
+        defer self.iter.i = original_i;
+
+        // The counter is typed explicitly: an untyped `var i = 0` is a
+        // `comptime_int`, which only compiles when the whole call is
+        // evaluated at comptime.
+        var i: usize = 0;
+        var code_point: ?u21 = null;
+        while (i <= n) : (i += 1) {
+            code_point = self.iter.nextCodepoint();
+            // Ran out of input before reaching the n-th code point.
+            if (code_point == null) return null;
+        }
+        return code_point;
+    }
 };

@ -965,8 +976,7 @@ pub fn formatUnicodeCodepoint(
    var buf: [4]u8 = undefined;
    const len = unicode.utf8Encode(c, &buf) catch |err| switch (err) {
        error.Utf8CannotEncodeSurrogateHalf, error.CodepointTooLarge => {
-            const len = unicode.utf8Encode(unicode.replacement_character, &buf) catch unreachable;
-            return formatBuf(buf[0..len], options, writer);
+            return formatBuf(&unicode.utf8EncodeComptime(unicode.replacement_character), options, writer);
        },
    };
    return formatBuf(buf[0..len], options, writer);
@ -985,20 +995,28 @@ pub fn formatBuf(
        if (padding == 0)
            return writer.writeAll(buf);

+        var fill_buffer: [4]u8 = undefined;
+        const fill_utf8 = if (unicode.utf8Encode(options.fill, &fill_buffer)) |len|
+            fill_buffer[0..len]
+        else |err| switch (err) {
+            error.Utf8CannotEncodeSurrogateHalf,
+            error.CodepointTooLarge,
+            => &unicode.utf8EncodeComptime(unicode.replacement_character),
+        };
        switch (options.alignment) {
            .left => {
                try writer.writeAll(buf);
-                try writer.writeByteNTimes(options.fill, padding);
+                try writer.writeBytesNTimes(fill_utf8, padding);
            },
            .center => {
                const left_padding = padding / 2;
                const right_padding = (padding + 1) / 2;
-                try writer.writeByteNTimes(options.fill, left_padding);
+                try writer.writeBytesNTimes(fill_utf8, left_padding);
                try writer.writeAll(buf);
-                try writer.writeByteNTimes(options.fill, right_padding);
+                try writer.writeBytesNTimes(fill_utf8, right_padding);
            },
            .right => {
-                try writer.writeByteNTimes(options.fill, padding);
+                try writer.writeBytesNTimes(fill_utf8, padding);
                try writer.writeAll(buf);
            },
        }
@ -2793,6 +2811,15 @@ test "padding" {
    try expectFmt("a====", "{c:=<5}", .{'a'});
 }

+test "padding fill char utf" {
+    // The fill '─' is a multi-byte UTF-8 character; the expected strings
+    // below contain one fill character per padded column.
+    const word = "crêpe";
+    try expectFmt("crêpe─────", "{s:─<10}", .{word});
+    try expectFmt("──crêpe───", "{s:─^10}", .{word});
+    try expectFmt("─────crêpe", "{s:─>10}", .{word});
+
+    const letter = 'a';
+    try expectFmt("a────", "{c:─<5}", .{letter});
+    try expectFmt("──a──", "{c:─^5}", .{letter});
+    try expectFmt("────a", "{c:─>5}", .{letter});
+}
+
 test "decimal float padding" {
    const number: f32 = 3.1415;
    try expectFmt("left-pad:   **3.141\n", "left-pad:   {d:*>7.3}\n", .{number});
--- a/lib/std/io/writer.zig
+++ b/lib/std/io/writer.zig
@ -45,6 +45,13 @@ pub fn Writer(
            }
        }

+        /// Writes the whole of `bytes` to the stream `n` times in a row,
+        /// returning the first error reported by `writeAll`, if any.
+        pub fn writeBytesNTimes(self: Self, bytes: []const u8, n: usize) Error!void {
+            for (0..n) |_| {
+                try self.writeAll(bytes);
+            }
+        }
+
        pub inline fn writeInt(self: Self, comptime T: type, value: T, endian: std.builtin.Endian) Error!void {
            var bytes: [@divExact(@typeInfo(T).Int.bits, 8)]u8 = undefined;
            mem.writeInt(std.math.ByteAlignedInt(@TypeOf(value)), &bytes, value, endian);
--- a/lib/std/unicode.zig
+++ b/lib/std/unicode.zig
@ -69,6 +69,19 @@ pub fn utf8Encode(c: u21, out: []u8) !u3 {
    return length;
 }

+/// Encodes the comptime-known code point `c` as UTF-8 and returns the bytes
+/// as an array whose length is exactly the encoded sequence length.
+/// A code point that cannot be encoded is reported as a compile error.
+pub inline fn utf8EncodeComptime(comptime c: u21) [
+    utf8CodepointSequenceLength(c) catch |err|
+        @compileError(@errorName(err))
+]u8 {
+    return comptime encoded: {
+        // The return type has already evaluated this same lookup for `c`,
+        // so it cannot fail here.
+        const len = utf8CodepointSequenceLength(c) catch unreachable;
+        var bytes: [len]u8 = undefined;
+        const written = utf8Encode(c, &bytes) catch |err|
+            @compileError(@errorName(err));
+        // The encoder must fill the array exactly.
+        assert(written == bytes.len);
+        break :encoded bytes;
+    };
+}
+
 const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error;

 /// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
@ -525,6 +538,13 @@ fn testUtf8Encode() !void {
    try testing.expect(array[3] == 0b10001000);
 }

+test "utf8 encode comptime" {
+    // One sample per encoded length, from one byte up to four bytes.
+    try testing.expectEqualSlices(u8, "$", &utf8EncodeComptime('$'));
+    try testing.expectEqualSlices(u8, "¢", &utf8EncodeComptime('¢'));
+    try testing.expectEqualSlices(u8, "€", &utf8EncodeComptime('€'));
+    try testing.expectEqualSlices(u8, "𐍈", &utf8EncodeComptime('𐍈'));
+}
+
 test "utf8 encode error" {
    try comptime testUtf8EncodeError();
    try testUtf8EncodeError();