Module: fix @embedFile of files containing zero bytes

If an adapted string key with embedded nulls was put in a hash map with
`std.hash_map.StringIndexAdapter`, then an incorrect hash would be
entered for that entry such that it is possible that when looking for
the exact key that matches the prefix of the original key up to the
first null would sometimes match this entry due to hash collisions and
sometimes not if performed later after a grow + rehash, causing the same
key to exist with two different indices breaking every string equality
comparison ever, for example claiming that a container type doesn't
contain a field because the field name string in the struct and the
string representing the identifier to lookup might be equal strings but
have different string indices.  This could maybe be fixed by changing
`std.hash_map.StringIndexAdapter.hash` to only hash up to the first
null, therefore ensuring that the entry's hash is correct and that all
future lookups will be consistent, but I don't trust anything so instead
I assert that there are no embedded nulls.
This commit is contained in:
Jacob Young 2024-02-21 16:21:14 +01:00 committed by Andrew Kelley
parent 241e100827
commit e60d667111
6 changed files with 28 additions and 18 deletions

View File

@ -92,27 +92,24 @@ pub fn hashString(s: []const u8) u64 {
pub const StringIndexContext = struct {
bytes: *const std.ArrayListUnmanaged(u8),
pub fn eql(self: @This(), a: u32, b: u32) bool {
_ = self;
pub fn eql(_: @This(), a: u32, b: u32) bool {
return a == b;
}
pub fn hash(self: @This(), x: u32) u64 {
const x_slice = mem.sliceTo(@as([*:0]const u8, @ptrCast(self.bytes.items.ptr)) + x, 0);
return hashString(x_slice);
pub fn hash(ctx: @This(), key: u32) u64 {
return hashString(mem.sliceTo(ctx.bytes.items[key..], 0));
}
};
pub const StringIndexAdapter = struct {
bytes: *const std.ArrayListUnmanaged(u8),
pub fn eql(self: @This(), a_slice: []const u8, b: u32) bool {
const b_slice = mem.sliceTo(@as([*:0]const u8, @ptrCast(self.bytes.items.ptr)) + b, 0);
return mem.eql(u8, a_slice, b_slice);
pub fn eql(ctx: @This(), a: []const u8, b: u32) bool {
return mem.eql(u8, a, mem.sliceTo(ctx.bytes.items[b..], 0));
}
pub fn hash(self: @This(), adapted_key: []const u8) u64 {
_ = self;
pub fn hash(_: @This(), adapted_key: []const u8) u64 {
assert(mem.indexOfScalar(u8, adapted_key, 0) == null);
return hashString(adapted_key);
}
};

View File

@ -11461,6 +11461,10 @@ fn strLitAsString(astgen: *AstGen, str_lit_token: Ast.TokenIndex) !IndexSlice {
const token_bytes = astgen.tree.tokenSlice(str_lit_token);
try astgen.parseStrLit(str_lit_token, string_bytes, token_bytes, 0);
const key: []const u8 = string_bytes.items[str_index..];
if (std.mem.indexOfScalar(u8, key, 0)) |_| return .{
.index = @enumFromInt(str_index),
.len = @intCast(key.len),
};
const gop = try astgen.string_table.getOrPutContextAdapted(gpa, key, StringIndexAdapter{
.bytes = string_bytes,
}, StringIndexContext{
@ -11468,7 +11472,7 @@ fn strLitAsString(astgen: *AstGen, str_lit_token: Ast.TokenIndex) !IndexSlice {
});
if (gop.found_existing) {
string_bytes.shrinkRetainingCapacity(str_index);
return IndexSlice{
return .{
.index = @enumFromInt(gop.key_ptr.*),
.len = @intCast(key.len),
};
@ -11478,7 +11482,7 @@ fn strLitAsString(astgen: *AstGen, str_lit_token: Ast.TokenIndex) !IndexSlice {
// to lookup null terminated strings, so if we get a match, it has to
// be null terminated for that to work.
try string_bytes.append(gpa, 0);
return IndexSlice{
return .{
.index = @enumFromInt(str_index),
.len = @intCast(key.len),
};

View File

@ -7985,7 +7985,8 @@ pub fn getTrailingAggregate(
) Allocator.Error!Index {
try ip.items.ensureUnusedCapacity(gpa, 1);
try ip.extra.ensureUnusedCapacity(gpa, @typeInfo(Bytes).Struct.fields.len);
const str: String = @enumFromInt(@intFromEnum(try getOrPutTrailingString(ip, gpa, len)));
const str: String = @enumFromInt(ip.string_bytes.items.len - len);
const adapter: KeyAdapter = .{ .intern_pool = ip };
const gop = try ip.map.getOrPutAdapted(gpa, Key{ .aggregate = .{
.ty = ty,

View File

@ -4400,6 +4400,7 @@ fn newEmbedFile(
src_loc: SrcLoc,
) !InternPool.Index {
const gpa = mod.gpa;
const ip = &mod.intern_pool;
const new_file = try gpa.create(EmbedFile);
errdefer gpa.destroy(new_file);
@ -4414,11 +4415,11 @@ fn newEmbedFile(
.mtime = actual_stat.mtime,
};
const size = std.math.cast(usize, actual_stat.size) orelse return error.Overflow;
const ip = &mod.intern_pool;
const ptr = try ip.string_bytes.addManyAsSlice(gpa, size);
const actual_read = try file.readAll(ptr);
const bytes = try ip.string_bytes.addManyAsSlice(gpa, try std.math.add(usize, size, 1));
const actual_read = try file.readAll(bytes[0..size]);
if (actual_read != size) return error.UnexpectedEndOfFile;
bytes[size] = 0;
const comp = mod.comp;
switch (comp.cache_use) {
@ -4427,7 +4428,7 @@ fn newEmbedFile(
errdefer gpa.free(copied_resolved_path);
whole.cache_manifest_mutex.lock();
defer whole.cache_manifest_mutex.unlock();
try man.addFilePostContents(copied_resolved_path, ptr, stat);
try man.addFilePostContents(copied_resolved_path, bytes[0..size], stat);
},
.incremental => {},
}
@ -4437,7 +4438,7 @@ fn newEmbedFile(
.sentinel = .zero_u8,
.child = .u8_type,
} });
const array_val = try ip.getTrailingAggregate(gpa, array_ty, size);
const array_val = try ip.getTrailingAggregate(gpa, array_ty, bytes.len);
const ptr_ty = (try mod.ptrType(.{
.child = array_ty,

View File

@ -127,3 +127,10 @@ test {
_ = @import("behavior/export_keyword.zig");
}
}
// This bug only repros in the root file
test "deference @embedFile() of a file full of zero bytes" {
const contents = @embedFile("behavior/zero.bin").*;
try @import("std").testing.expect(contents.len == 456);
for (contents) |byte| try @import("std").testing.expect(byte == 0);
}

BIN
test/behavior/zero.bin Normal file

Binary file not shown.