mirror of
https://github.com/ziglang/zig.git
synced 2024-11-26 23:22:44 +00:00
zig1.c: decompress zig1.wasm.zst with zstd
This commit is contained in:
parent
d1b3409df1
commit
fb9a6084e2
@ -181,6 +181,16 @@ set(ZIG_CONFIG_ZIG_OUT "${CMAKE_BINARY_DIR}/config.zig")
|
||||
|
||||
set(STAGE1_SOURCES
|
||||
"${CMAKE_SOURCE_DIR}/stage1/zig1.c"
|
||||
"${CMAKE_SOURCE_DIR}/stage1/zstd/lib/decompress/huf_decompress.c"
|
||||
"${CMAKE_SOURCE_DIR}/stage1/zstd/lib/decompress/zstd_ddict.c"
|
||||
"${CMAKE_SOURCE_DIR}/stage1/zstd/lib/decompress/zstd_decompress.c"
|
||||
"${CMAKE_SOURCE_DIR}/stage1/zstd/lib/decompress/zstd_decompress_block.c"
|
||||
"${CMAKE_SOURCE_DIR}/stage1/zstd/lib/common/entropy_common.c"
|
||||
"${CMAKE_SOURCE_DIR}/stage1/zstd/lib/common/error_private.c"
|
||||
"${CMAKE_SOURCE_DIR}/stage1/zstd/lib/common/fse_decompress.c"
|
||||
"${CMAKE_SOURCE_DIR}/stage1/zstd/lib/common/pool.c"
|
||||
"${CMAKE_SOURCE_DIR}/stage1/zstd/lib/common/xxhash.c"
|
||||
"${CMAKE_SOURCE_DIR}/stage1/zstd/lib/common/zstd_common.c"
|
||||
)
|
||||
set(ZIG_CPP_SOURCES
|
||||
# These are planned to stay even when we are self-hosted.
|
||||
@ -710,14 +720,15 @@ endif()
|
||||
add_executable(zig1 ${STAGE1_SOURCES})
|
||||
set_target_properties(zig1 PROPERTIES COMPILE_FLAGS ${ZIG1_COMPILE_FLAGS})
|
||||
target_link_libraries(zig1 LINK_PUBLIC m)
|
||||
|
||||
target_include_directories(zig1 PUBLIC "${CMAKE_SOURCE_DIR}/stage1/zstd/lib")
|
||||
target_compile_definitions(zig1 PRIVATE ZSTD_DISABLE_ASM)
|
||||
|
||||
set(ZIG2_C_SOURCE "${CMAKE_BINARY_DIR}/zig2.c")
|
||||
set(BUILD_ZIG2_ARGS
|
||||
"${CMAKE_SOURCE_DIR}/lib"
|
||||
"${CMAKE_BINARY_DIR}"
|
||||
zig2
|
||||
"${CMAKE_SOURCE_DIR}/stage1/zig1.wasm"
|
||||
"${CMAKE_SOURCE_DIR}/stage1/zig1.wasm.zst"
|
||||
build-exe src/main.zig -ofmt=c -lc
|
||||
-target x86_64-linux-musl # TODO: autodetect in zig1.c
|
||||
-OReleaseFast
|
||||
@ -736,7 +747,7 @@ set(BUILD_COMPILER_RT_ARGS
|
||||
"${CMAKE_SOURCE_DIR}/lib"
|
||||
"${CMAKE_BINARY_DIR}"
|
||||
compiler_rt
|
||||
"${CMAKE_SOURCE_DIR}/stage1/zig1.wasm"
|
||||
"${CMAKE_SOURCE_DIR}/stage1/zig1.wasm.zst"
|
||||
build-obj lib/compiler_rt.zig -ofmt=c
|
||||
-target x86_64-linux-musl # TODO: autodetect in zig1.c
|
||||
-OReleaseFast
|
||||
|
119
stage1/zig1.c
119
stage1/zig1.c
@ -20,6 +20,8 @@
|
||||
#include <sys/random.h>
|
||||
#endif
|
||||
|
||||
#include <zstd.h>
|
||||
|
||||
enum wasi_errno_t {
|
||||
WASI_ESUCCESS = 0,
|
||||
WASI_E2BIG = 1,
|
||||
@ -4122,7 +4124,12 @@ int main(int argc, char **argv) {
|
||||
|
||||
new_argv[new_argv_i] = NULL;
|
||||
|
||||
const struct ByteSlice mod = read_file_alloc(wasm_file);
|
||||
const struct ByteSlice compressed_bytes = read_file_alloc(wasm_file);
|
||||
|
||||
const size_t max_uncompressed_size = 2500000;
|
||||
char *mod_ptr = arena_alloc(max_uncompressed_size);
|
||||
size_t mod_len = ZSTD_decompress(mod_ptr, max_uncompressed_size,
|
||||
compressed_bytes.ptr, compressed_bytes.len);
|
||||
|
||||
int cwd = err_wrap("opening cwd", open(".", O_DIRECTORY|O_RDONLY|O_CLOEXEC|O_PATH));
|
||||
int zig_lib_dir = err_wrap("opening zig lib dir", open(zig_lib_dir_path, O_DIRECTORY|O_RDONLY|O_CLOEXEC|O_PATH));
|
||||
@ -4136,22 +4143,22 @@ int main(int argc, char **argv) {
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
if (mod.ptr[0] != 0 || mod.ptr[1] != 'a' || mod.ptr[2] != 's' || mod.ptr[3] != 'm') {
|
||||
if (mod_ptr[0] != 0 || mod_ptr[1] != 'a' || mod_ptr[2] != 's' || mod_ptr[3] != 'm') {
|
||||
panic("bad magic");
|
||||
}
|
||||
i += 4;
|
||||
|
||||
uint32_t version = read_u32_le(mod.ptr + i);
|
||||
uint32_t version = read_u32_le(mod_ptr + i);
|
||||
i += 4;
|
||||
if (version != 1) panic("bad wasm version");
|
||||
|
||||
uint32_t section_starts[13];
|
||||
memset(&section_starts, 0, sizeof(uint32_t) * 13);
|
||||
|
||||
while (i < mod.len) {
|
||||
uint8_t section_id = mod.ptr[i];
|
||||
while (i < mod_len) {
|
||||
uint8_t section_id = mod_ptr[i];
|
||||
i += 1;
|
||||
uint32_t section_len = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t section_len = read32_uleb128(mod_ptr, &i);
|
||||
section_starts[section_id] = i;
|
||||
i += section_len;
|
||||
}
|
||||
@ -4160,18 +4167,18 @@ int main(int argc, char **argv) {
|
||||
struct TypeInfo *types;
|
||||
{
|
||||
i = section_starts[Section_type];
|
||||
uint32_t types_len = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t types_len = read32_uleb128(mod_ptr, &i);
|
||||
types = arena_alloc(sizeof(struct TypeInfo) * types_len);
|
||||
for (size_t type_i = 0; type_i < types_len; type_i += 1) {
|
||||
struct TypeInfo *info = &types[type_i];
|
||||
if (mod.ptr[i] != 0x60) panic("bad type byte");
|
||||
if (mod_ptr[i] != 0x60) panic("bad type byte");
|
||||
i += 1;
|
||||
|
||||
info->param_count = read32_uleb128(mod.ptr, &i);
|
||||
info->param_count = read32_uleb128(mod_ptr, &i);
|
||||
if (info->param_count > 32) panic("found a type with over 32 parameters");
|
||||
info->param_types = 0;
|
||||
for (uint32_t param_i = 0; param_i < info->param_count; param_i += 1) {
|
||||
int64_t param_type = read64_ileb128(mod.ptr, &i);
|
||||
int64_t param_type = read64_ileb128(mod_ptr, &i);
|
||||
switch (param_type) {
|
||||
case -1: case -3: bs_unset(&info->param_types, param_i); break;
|
||||
case -2: case -4: bs_set(&info->param_types, param_i); break;
|
||||
@ -4179,10 +4186,10 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
}
|
||||
|
||||
info->result_count = read32_uleb128(mod.ptr, &i);
|
||||
info->result_count = read32_uleb128(mod_ptr, &i);
|
||||
info->result_types = 0;
|
||||
for (uint32_t result_i = 0; result_i < info->result_count; result_i += 1) {
|
||||
int64_t result_type = read64_ileb128(mod.ptr, &i);
|
||||
int64_t result_type = read64_ileb128(mod_ptr, &i);
|
||||
switch (result_type) {
|
||||
case -1: case -3: bs_unset(&info->result_types, result_i); break;
|
||||
case -2: case -4: bs_set(&info->result_types, result_i); break;
|
||||
@ -4197,18 +4204,18 @@ int main(int argc, char **argv) {
|
||||
uint32_t imports_len;
|
||||
{
|
||||
i = section_starts[Section_import];
|
||||
imports_len = read32_uleb128(mod.ptr, &i);
|
||||
imports_len = read32_uleb128(mod_ptr, &i);
|
||||
imports = arena_alloc(sizeof(struct Import) * imports_len);
|
||||
for (size_t imp_i = 0; imp_i < imports_len; imp_i += 1) {
|
||||
struct Import *imp = &imports[imp_i];
|
||||
|
||||
struct ByteSlice mod_name = read_name(mod.ptr, &i);
|
||||
struct ByteSlice mod_name = read_name(mod_ptr, &i);
|
||||
if (mod_name.len == strlen("wasi_snapshot_preview1") &&
|
||||
memcmp(mod_name.ptr, "wasi_snapshot_preview1", mod_name.len) == 0) {
|
||||
imp->mod = ImpMod_wasi_snapshot_preview1;
|
||||
} else panic("unknown import module");
|
||||
|
||||
struct ByteSlice sym_name = read_name(mod.ptr, &i);
|
||||
struct ByteSlice sym_name = read_name(mod_ptr, &i);
|
||||
if (sym_name.len == strlen("args_get") &&
|
||||
memcmp(sym_name.ptr, "args_get", sym_name.len) == 0) {
|
||||
imp->name = ImpName_args_get;
|
||||
@ -4292,9 +4299,9 @@ int main(int argc, char **argv) {
|
||||
imp->name = ImpName_random_get;
|
||||
} else panic("unknown import name");
|
||||
|
||||
uint32_t desc = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t desc = read32_uleb128(mod_ptr, &i);
|
||||
if (desc != 0) panic("external kind not function");
|
||||
imp->type_idx = read32_uleb128(mod.ptr, &i);
|
||||
imp->type_idx = read32_uleb128(mod_ptr, &i);
|
||||
}
|
||||
}
|
||||
|
||||
@ -4302,11 +4309,11 @@ int main(int argc, char **argv) {
|
||||
uint32_t start_fn_idx;
|
||||
{
|
||||
i = section_starts[Section_export];
|
||||
uint32_t count = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t count = read32_uleb128(mod_ptr, &i);
|
||||
for (; count > 0; count -= 1) {
|
||||
struct ByteSlice name = read_name(mod.ptr, &i);
|
||||
uint32_t desc = read32_uleb128(mod.ptr, &i);
|
||||
start_fn_idx = read32_uleb128(mod.ptr, &i);
|
||||
struct ByteSlice name = read_name(mod_ptr, &i);
|
||||
uint32_t desc = read32_uleb128(mod_ptr, &i);
|
||||
start_fn_idx = read32_uleb128(mod_ptr, &i);
|
||||
if (desc == 0 && name.len == strlen("_start") &&
|
||||
memcmp(name.ptr, "_start", name.len) == 0)
|
||||
{
|
||||
@ -4321,11 +4328,11 @@ int main(int argc, char **argv) {
|
||||
uint32_t functions_len;
|
||||
{
|
||||
i = section_starts[Section_function];
|
||||
functions_len = read32_uleb128(mod.ptr, &i);
|
||||
functions_len = read32_uleb128(mod_ptr, &i);
|
||||
functions = arena_alloc(sizeof(struct Function) * functions_len);
|
||||
for (size_t func_i = 0; func_i < functions_len; func_i += 1) {
|
||||
struct Function *func = &functions[func_i];
|
||||
func->type_idx = read32_uleb128(mod.ptr, &i);
|
||||
func->type_idx = read32_uleb128(mod_ptr, &i);
|
||||
}
|
||||
}
|
||||
|
||||
@ -4333,18 +4340,18 @@ int main(int argc, char **argv) {
|
||||
uint64_t *globals;
|
||||
{
|
||||
i = section_starts[Section_global];
|
||||
uint32_t globals_len = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t globals_len = read32_uleb128(mod_ptr, &i);
|
||||
globals = arena_alloc(sizeof(uint64_t) * globals_len);
|
||||
for (size_t glob_i = 0; glob_i < globals_len; glob_i += 1) {
|
||||
uint64_t *global = &globals[glob_i];
|
||||
uint32_t content_type = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t mutability = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t content_type = read32_uleb128(mod_ptr, &i);
|
||||
uint32_t mutability = read32_uleb128(mod_ptr, &i);
|
||||
if (mutability != 1) panic("expected mutable global");
|
||||
if (content_type != 0x7f) panic("unexpected content type");
|
||||
uint8_t opcode = mod.ptr[i];
|
||||
uint8_t opcode = mod_ptr[i];
|
||||
i += 1;
|
||||
if (opcode != WasmOp_i32_const) panic("expected i32_const op");
|
||||
uint32_t init = read32_ileb128(mod.ptr, &i);
|
||||
uint32_t init = read32_ileb128(mod_ptr, &i);
|
||||
*global = (uint32_t)init;
|
||||
}
|
||||
}
|
||||
@ -4353,26 +4360,26 @@ int main(int argc, char **argv) {
|
||||
uint32_t memory_len;
|
||||
{
|
||||
i = section_starts[Section_memory];
|
||||
uint32_t memories_len = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t memories_len = read32_uleb128(mod_ptr, &i);
|
||||
if (memories_len != 1) panic("unexpected memory count");
|
||||
uint32_t flags = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t flags = read32_uleb128(mod_ptr, &i);
|
||||
(void)flags;
|
||||
memory_len = read32_uleb128(mod.ptr, &i) * wasm_page_size;
|
||||
memory_len = read32_uleb128(mod_ptr, &i) * wasm_page_size;
|
||||
|
||||
i = section_starts[Section_data];
|
||||
uint32_t datas_count = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t datas_count = read32_uleb128(mod_ptr, &i);
|
||||
for (; datas_count > 0; datas_count -= 1) {
|
||||
uint32_t mode = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t mode = read32_uleb128(mod_ptr, &i);
|
||||
if (mode != 0) panic("expected mode 0");
|
||||
enum WasmOp opcode = mod.ptr[i];
|
||||
enum WasmOp opcode = mod_ptr[i];
|
||||
i += 1;
|
||||
if (opcode != WasmOp_i32_const) panic("expected opcode i32_const");
|
||||
uint32_t offset = read32_uleb128(mod.ptr, &i);
|
||||
enum WasmOp end = mod.ptr[i];
|
||||
uint32_t offset = read32_uleb128(mod_ptr, &i);
|
||||
enum WasmOp end = mod_ptr[i];
|
||||
if (end != WasmOp_end) panic("expected end opcode");
|
||||
i += 1;
|
||||
uint32_t bytes_len = read32_uleb128(mod.ptr, &i);
|
||||
memcpy(memory + offset, mod.ptr + i, bytes_len);
|
||||
uint32_t bytes_len = read32_uleb128(mod_ptr, &i);
|
||||
memcpy(memory + offset, mod_ptr + i, bytes_len);
|
||||
i += bytes_len;
|
||||
}
|
||||
}
|
||||
@ -4380,37 +4387,37 @@ int main(int argc, char **argv) {
|
||||
uint32_t *table = NULL;
|
||||
{
|
||||
i = section_starts[Section_table];
|
||||
uint32_t table_count = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t table_count = read32_uleb128(mod_ptr, &i);
|
||||
if (table_count > 1) {
|
||||
panic("expected only one table section");
|
||||
} else if (table_count == 1) {
|
||||
uint32_t element_type = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t element_type = read32_uleb128(mod_ptr, &i);
|
||||
(void)element_type;
|
||||
uint32_t has_max = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t has_max = read32_uleb128(mod_ptr, &i);
|
||||
if (has_max != 1) panic("expected has_max==1");
|
||||
uint32_t initial = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t initial = read32_uleb128(mod_ptr, &i);
|
||||
(void)initial;
|
||||
uint32_t maximum = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t maximum = read32_uleb128(mod_ptr, &i);
|
||||
|
||||
i = section_starts[Section_element];
|
||||
uint32_t element_section_count = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t element_section_count = read32_uleb128(mod_ptr, &i);
|
||||
if (element_section_count != 1) panic("expected one element section");
|
||||
uint32_t flags = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t flags = read32_uleb128(mod_ptr, &i);
|
||||
(void)flags;
|
||||
enum WasmOp opcode = mod.ptr[i];
|
||||
enum WasmOp opcode = mod_ptr[i];
|
||||
i += 1;
|
||||
if (opcode != WasmOp_i32_const) panic("expected op i32_const");
|
||||
uint32_t offset = read32_uleb128(mod.ptr, &i);
|
||||
enum WasmOp end = mod.ptr[i];
|
||||
uint32_t offset = read32_uleb128(mod_ptr, &i);
|
||||
enum WasmOp end = mod_ptr[i];
|
||||
if (end != WasmOp_end) panic("expected op end");
|
||||
i += 1;
|
||||
uint32_t elem_count = read32_uleb128(mod.ptr, &i);
|
||||
uint32_t elem_count = read32_uleb128(mod_ptr, &i);
|
||||
|
||||
table = arena_alloc(sizeof(uint32_t) * maximum);
|
||||
memset(table, 0, sizeof(uint32_t) * maximum);
|
||||
|
||||
for (uint32_t elem_i = 0; elem_i < elem_count; elem_i += 1) {
|
||||
table[elem_i + offset] = read32_uleb128(mod.ptr, &i);
|
||||
table[elem_i + offset] = read32_uleb128(mod_ptr, &i);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -4420,7 +4427,7 @@ int main(int argc, char **argv) {
|
||||
memset(&vm, 0xaa, sizeof(struct VirtualMachine)); // to match the zig version
|
||||
#endif
|
||||
vm.stack = arena_alloc(sizeof(uint64_t) * 10000000),
|
||||
vm.mod_ptr = mod.ptr;
|
||||
vm.mod_ptr = mod_ptr;
|
||||
vm.opcodes = arena_alloc(2000000);
|
||||
vm.operands = arena_alloc(sizeof(uint32_t) * 2000000);
|
||||
vm.stack_top = 0;
|
||||
@ -4436,14 +4443,14 @@ int main(int argc, char **argv) {
|
||||
|
||||
{
|
||||
uint32_t code_i = section_starts[Section_code];
|
||||
uint32_t codes_len = read32_uleb128(mod.ptr, &code_i);
|
||||
uint32_t codes_len = read32_uleb128(mod_ptr, &code_i);
|
||||
if (codes_len != functions_len) panic("code/function length mismatch");
|
||||
struct ProgramCounter pc;
|
||||
pc.opcode = 0;
|
||||
pc.operand = 0;
|
||||
for (uint32_t func_i = 0; func_i < functions_len; func_i += 1) {
|
||||
struct Function *func = &functions[func_i];
|
||||
uint32_t size = read32_uleb128(mod.ptr, &code_i);
|
||||
uint32_t size = read32_uleb128(mod_ptr, &code_i);
|
||||
uint32_t code_begin = code_i;
|
||||
|
||||
struct TypeInfo *type_info = &vm.types[func->type_idx];
|
||||
@ -4451,11 +4458,11 @@ int main(int argc, char **argv) {
|
||||
func->local_types = malloc(sizeof(uint32_t) * ((type_info->param_count + func->locals_count + 31) / 32));
|
||||
func->local_types[0] = type_info->param_types;
|
||||
|
||||
for (uint32_t local_sets_count = read32_uleb128(mod.ptr, &code_i);
|
||||
for (uint32_t local_sets_count = read32_uleb128(mod_ptr, &code_i);
|
||||
local_sets_count > 0; local_sets_count -= 1)
|
||||
{
|
||||
uint32_t set_count = read32_uleb128(mod.ptr, &code_i);
|
||||
int64_t local_type = read64_ileb128(mod.ptr, &code_i);
|
||||
uint32_t set_count = read32_uleb128(mod_ptr, &code_i);
|
||||
int64_t local_type = read64_ileb128(mod_ptr, &code_i);
|
||||
|
||||
uint32_t i = type_info->param_count + func->locals_count;
|
||||
func->locals_count += set_count;
|
||||
|
@ -1,585 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) Facebook, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* This source code is licensed under both the BSD-style license (found in the
|
||||
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
||||
* in the COPYING file in the root directory of this source tree).
|
||||
* You may select, at your option, one of the above-listed licenses.
|
||||
*/
|
||||
|
||||
#include "../common/portability_macros.h"
|
||||
|
||||
/* Stack marking
|
||||
* ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
|
||||
*/
|
||||
#if defined(__ELF__) && defined(__GNUC__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
||||
|
||||
#if ZSTD_ENABLE_ASM_X86_64_BMI2
|
||||
|
||||
/* Calling convention:
|
||||
*
|
||||
* %rdi contains the first argument: HUF_DecompressAsmArgs*.
|
||||
* %rbp isn't maintained (no frame pointer).
|
||||
* %rsp contains the stack pointer that grows down.
|
||||
* No red-zone is assumed, only addresses >= %rsp are used.
|
||||
* All register contents are preserved.
|
||||
*
|
||||
* TODO: Support Windows calling convention.
|
||||
*/
|
||||
|
||||
ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
|
||||
ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)
|
||||
ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)
|
||||
ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
|
||||
.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
|
||||
.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
|
||||
.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
|
||||
.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
|
||||
.text
|
||||
|
||||
/* Sets up register mappings for clarity.
|
||||
* op[], bits[], dtable & ip[0] each get their own register.
|
||||
* ip[1,2,3] & olimit alias var[].
|
||||
* %rax is a scratch register.
|
||||
*/
|
||||
|
||||
#define op0 rsi
|
||||
#define op1 rbx
|
||||
#define op2 rcx
|
||||
#define op3 rdi
|
||||
|
||||
#define ip0 r8
|
||||
#define ip1 r9
|
||||
#define ip2 r10
|
||||
#define ip3 r11
|
||||
|
||||
#define bits0 rbp
|
||||
#define bits1 rdx
|
||||
#define bits2 r12
|
||||
#define bits3 r13
|
||||
#define dtable r14
|
||||
#define olimit r15
|
||||
|
||||
/* var[] aliases ip[1,2,3] & olimit
|
||||
* ip[1,2,3] are saved every iteration.
|
||||
* olimit is only used in compute_olimit.
|
||||
*/
|
||||
#define var0 r15
|
||||
#define var1 r9
|
||||
#define var2 r10
|
||||
#define var3 r11
|
||||
|
||||
/* 32-bit var registers */
|
||||
#define vard0 r15d
|
||||
#define vard1 r9d
|
||||
#define vard2 r10d
|
||||
#define vard3 r11d
|
||||
|
||||
/* Calls X(N) for each stream 0, 1, 2, 3. */
|
||||
#define FOR_EACH_STREAM(X) \
|
||||
X(0); \
|
||||
X(1); \
|
||||
X(2); \
|
||||
X(3)
|
||||
|
||||
/* Calls X(N, idx) for each stream 0, 1, 2, 3. */
|
||||
#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
|
||||
X(0, idx); \
|
||||
X(1, idx); \
|
||||
X(2, idx); \
|
||||
X(3, idx)
|
||||
|
||||
/* Define both _HUF_* & HUF_* symbols because MacOS
|
||||
* C symbols are prefixed with '_' & Linux symbols aren't.
|
||||
*/
|
||||
_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
|
||||
HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
|
||||
/* Save all registers - even if they are callee saved for simplicity. */
|
||||
push %rax
|
||||
push %rbx
|
||||
push %rcx
|
||||
push %rdx
|
||||
push %rbp
|
||||
push %rsi
|
||||
push %rdi
|
||||
push %r8
|
||||
push %r9
|
||||
push %r10
|
||||
push %r11
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
|
||||
/* Read HUF_DecompressAsmArgs* args from %rax */
|
||||
movq %rdi, %rax
|
||||
movq 0(%rax), %ip0
|
||||
movq 8(%rax), %ip1
|
||||
movq 16(%rax), %ip2
|
||||
movq 24(%rax), %ip3
|
||||
movq 32(%rax), %op0
|
||||
movq 40(%rax), %op1
|
||||
movq 48(%rax), %op2
|
||||
movq 56(%rax), %op3
|
||||
movq 64(%rax), %bits0
|
||||
movq 72(%rax), %bits1
|
||||
movq 80(%rax), %bits2
|
||||
movq 88(%rax), %bits3
|
||||
movq 96(%rax), %dtable
|
||||
push %rax /* argument */
|
||||
push 104(%rax) /* ilimit */
|
||||
push 112(%rax) /* oend */
|
||||
push %olimit /* olimit space */
|
||||
|
||||
subq $24, %rsp
|
||||
|
||||
.L_4X1_compute_olimit:
|
||||
/* Computes how many iterations we can do safely
|
||||
* %r15, %rax may be clobbered
|
||||
* rbx, rdx must be saved
|
||||
* op3 & ip0 mustn't be clobbered
|
||||
*/
|
||||
movq %rbx, 0(%rsp)
|
||||
movq %rdx, 8(%rsp)
|
||||
|
||||
movq 32(%rsp), %rax /* rax = oend */
|
||||
subq %op3, %rax /* rax = oend - op3 */
|
||||
|
||||
/* r15 = (oend - op3) / 5 */
|
||||
movabsq $-3689348814741910323, %rdx
|
||||
mulq %rdx
|
||||
movq %rdx, %r15
|
||||
shrq $2, %r15
|
||||
|
||||
movq %ip0, %rax /* rax = ip0 */
|
||||
movq 40(%rsp), %rdx /* rdx = ilimit */
|
||||
subq %rdx, %rax /* rax = ip0 - ilimit */
|
||||
movq %rax, %rbx /* rbx = ip0 - ilimit */
|
||||
|
||||
/* rdx = (ip0 - ilimit) / 7 */
|
||||
movabsq $2635249153387078803, %rdx
|
||||
mulq %rdx
|
||||
subq %rdx, %rbx
|
||||
shrq %rbx
|
||||
addq %rbx, %rdx
|
||||
shrq $2, %rdx
|
||||
|
||||
/* r15 = min(%rdx, %r15) */
|
||||
cmpq %rdx, %r15
|
||||
cmova %rdx, %r15
|
||||
|
||||
/* r15 = r15 * 5 */
|
||||
leaq (%r15, %r15, 4), %r15
|
||||
|
||||
/* olimit = op3 + r15 */
|
||||
addq %op3, %olimit
|
||||
|
||||
movq 8(%rsp), %rdx
|
||||
movq 0(%rsp), %rbx
|
||||
|
||||
/* If (op3 + 20 > olimit) */
|
||||
movq %op3, %rax /* rax = op3 */
|
||||
addq $20, %rax /* rax = op3 + 20 */
|
||||
cmpq %rax, %olimit /* op3 + 20 > olimit */
|
||||
jb .L_4X1_exit
|
||||
|
||||
/* If (ip1 < ip0) go to exit */
|
||||
cmpq %ip0, %ip1
|
||||
jb .L_4X1_exit
|
||||
|
||||
/* If (ip2 < ip1) go to exit */
|
||||
cmpq %ip1, %ip2
|
||||
jb .L_4X1_exit
|
||||
|
||||
/* If (ip3 < ip2) go to exit */
|
||||
cmpq %ip2, %ip3
|
||||
jb .L_4X1_exit
|
||||
|
||||
/* Reads top 11 bits from bits[n]
|
||||
* Loads dt[bits[n]] into var[n]
|
||||
*/
|
||||
#define GET_NEXT_DELT(n) \
|
||||
movq $53, %var##n; \
|
||||
shrxq %var##n, %bits##n, %var##n; \
|
||||
movzwl (%dtable,%var##n,2),%vard##n
|
||||
|
||||
/* var[n] must contain the DTable entry computed with GET_NEXT_DELT
|
||||
* Moves var[n] to %rax
|
||||
* bits[n] <<= var[n] & 63
|
||||
* op[n][idx] = %rax >> 8
|
||||
* %ah is a way to access bits [8, 16) of %rax
|
||||
*/
|
||||
#define DECODE_FROM_DELT(n, idx) \
|
||||
movq %var##n, %rax; \
|
||||
shlxq %var##n, %bits##n, %bits##n; \
|
||||
movb %ah, idx(%op##n)
|
||||
|
||||
/* Assumes GET_NEXT_DELT has been called.
|
||||
* Calls DECODE_FROM_DELT then GET_NEXT_DELT
|
||||
*/
|
||||
#define DECODE_AND_GET_NEXT(n, idx) \
|
||||
DECODE_FROM_DELT(n, idx); \
|
||||
GET_NEXT_DELT(n) \
|
||||
|
||||
/* // ctz & nbBytes is stored in bits[n]
|
||||
* // nbBits is stored in %rax
|
||||
* ctz = CTZ[bits[n]]
|
||||
* nbBits = ctz & 7
|
||||
* nbBytes = ctz >> 3
|
||||
* op[n] += 5
|
||||
* ip[n] -= nbBytes
|
||||
* // Note: x86-64 is little-endian ==> no bswap
|
||||
* bits[n] = MEM_readST(ip[n]) | 1
|
||||
* bits[n] <<= nbBits
|
||||
*/
|
||||
#define RELOAD_BITS(n) \
|
||||
bsfq %bits##n, %bits##n; \
|
||||
movq %bits##n, %rax; \
|
||||
andq $7, %rax; \
|
||||
shrq $3, %bits##n; \
|
||||
leaq 5(%op##n), %op##n; \
|
||||
subq %bits##n, %ip##n; \
|
||||
movq (%ip##n), %bits##n; \
|
||||
orq $1, %bits##n; \
|
||||
shlx %rax, %bits##n, %bits##n
|
||||
|
||||
/* Store clobbered variables on the stack */
|
||||
movq %olimit, 24(%rsp)
|
||||
movq %ip1, 0(%rsp)
|
||||
movq %ip2, 8(%rsp)
|
||||
movq %ip3, 16(%rsp)
|
||||
|
||||
/* Call GET_NEXT_DELT for each stream */
|
||||
FOR_EACH_STREAM(GET_NEXT_DELT)
|
||||
|
||||
.p2align 6
|
||||
|
||||
.L_4X1_loop_body:
|
||||
/* Decode 5 symbols in each of the 4 streams (20 total)
|
||||
* Must have called GET_NEXT_DELT for each stream
|
||||
*/
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
|
||||
|
||||
/* Load ip[1,2,3] from stack (var[] aliases them)
|
||||
* ip[] is needed for RELOAD_BITS
|
||||
* Each will be stored back to the stack after RELOAD
|
||||
*/
|
||||
movq 0(%rsp), %ip1
|
||||
movq 8(%rsp), %ip2
|
||||
movq 16(%rsp), %ip3
|
||||
|
||||
/* Reload each stream & fetch the next table entry
|
||||
* to prepare for the next iteration
|
||||
*/
|
||||
RELOAD_BITS(0)
|
||||
GET_NEXT_DELT(0)
|
||||
|
||||
RELOAD_BITS(1)
|
||||
movq %ip1, 0(%rsp)
|
||||
GET_NEXT_DELT(1)
|
||||
|
||||
RELOAD_BITS(2)
|
||||
movq %ip2, 8(%rsp)
|
||||
GET_NEXT_DELT(2)
|
||||
|
||||
RELOAD_BITS(3)
|
||||
movq %ip3, 16(%rsp)
|
||||
GET_NEXT_DELT(3)
|
||||
|
||||
/* If op3 < olimit: continue the loop */
|
||||
cmp %op3, 24(%rsp)
|
||||
ja .L_4X1_loop_body
|
||||
|
||||
/* Reload ip[1,2,3] from stack */
|
||||
movq 0(%rsp), %ip1
|
||||
movq 8(%rsp), %ip2
|
||||
movq 16(%rsp), %ip3
|
||||
|
||||
/* Re-compute olimit */
|
||||
jmp .L_4X1_compute_olimit
|
||||
|
||||
#undef GET_NEXT_DELT
|
||||
#undef DECODE_FROM_DELT
|
||||
#undef DECODE
|
||||
#undef RELOAD_BITS
|
||||
.L_4X1_exit:
|
||||
addq $24, %rsp
|
||||
|
||||
/* Restore stack (oend & olimit) */
|
||||
pop %rax /* olimit */
|
||||
pop %rax /* oend */
|
||||
pop %rax /* ilimit */
|
||||
pop %rax /* arg */
|
||||
|
||||
/* Save ip / op / bits */
|
||||
movq %ip0, 0(%rax)
|
||||
movq %ip1, 8(%rax)
|
||||
movq %ip2, 16(%rax)
|
||||
movq %ip3, 24(%rax)
|
||||
movq %op0, 32(%rax)
|
||||
movq %op1, 40(%rax)
|
||||
movq %op2, 48(%rax)
|
||||
movq %op3, 56(%rax)
|
||||
movq %bits0, 64(%rax)
|
||||
movq %bits1, 72(%rax)
|
||||
movq %bits2, 80(%rax)
|
||||
movq %bits3, 88(%rax)
|
||||
|
||||
/* Restore registers */
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %r11
|
||||
pop %r10
|
||||
pop %r9
|
||||
pop %r8
|
||||
pop %rdi
|
||||
pop %rsi
|
||||
pop %rbp
|
||||
pop %rdx
|
||||
pop %rcx
|
||||
pop %rbx
|
||||
pop %rax
|
||||
ret
|
||||
|
||||
_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
|
||||
HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
|
||||
/* Save all registers - even if they are callee saved for simplicity. */
|
||||
push %rax
|
||||
push %rbx
|
||||
push %rcx
|
||||
push %rdx
|
||||
push %rbp
|
||||
push %rsi
|
||||
push %rdi
|
||||
push %r8
|
||||
push %r9
|
||||
push %r10
|
||||
push %r11
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
|
||||
movq %rdi, %rax
|
||||
movq 0(%rax), %ip0
|
||||
movq 8(%rax), %ip1
|
||||
movq 16(%rax), %ip2
|
||||
movq 24(%rax), %ip3
|
||||
movq 32(%rax), %op0
|
||||
movq 40(%rax), %op1
|
||||
movq 48(%rax), %op2
|
||||
movq 56(%rax), %op3
|
||||
movq 64(%rax), %bits0
|
||||
movq 72(%rax), %bits1
|
||||
movq 80(%rax), %bits2
|
||||
movq 88(%rax), %bits3
|
||||
movq 96(%rax), %dtable
|
||||
push %rax /* argument */
|
||||
push %rax /* olimit */
|
||||
push 104(%rax) /* ilimit */
|
||||
|
||||
movq 112(%rax), %rax
|
||||
push %rax /* oend3 */
|
||||
|
||||
movq %op3, %rax
|
||||
push %rax /* oend2 */
|
||||
|
||||
movq %op2, %rax
|
||||
push %rax /* oend1 */
|
||||
|
||||
movq %op1, %rax
|
||||
push %rax /* oend0 */
|
||||
|
||||
/* Scratch space */
|
||||
subq $8, %rsp
|
||||
|
||||
.L_4X2_compute_olimit:
|
||||
/* Computes how many iterations we can do safely
|
||||
* %r15, %rax may be clobbered
|
||||
* rdx must be saved
|
||||
* op[0,1,2,3] & ip0 mustn't be clobbered
|
||||
*/
|
||||
movq %rdx, 0(%rsp)
|
||||
|
||||
/* We can consume up to 7 input bytes each iteration. */
|
||||
movq %ip0, %rax /* rax = ip0 */
|
||||
movq 40(%rsp), %rdx /* rdx = ilimit */
|
||||
subq %rdx, %rax /* rax = ip0 - ilimit */
|
||||
movq %rax, %r15 /* r15 = ip0 - ilimit */
|
||||
|
||||
/* rdx = rax / 7 */
|
||||
movabsq $2635249153387078803, %rdx
|
||||
mulq %rdx
|
||||
subq %rdx, %r15
|
||||
shrq %r15
|
||||
addq %r15, %rdx
|
||||
shrq $2, %rdx
|
||||
|
||||
/* r15 = (ip0 - ilimit) / 7 */
|
||||
movq %rdx, %r15
|
||||
|
||||
movabsq $-3689348814741910323, %rdx
|
||||
movq 8(%rsp), %rax /* rax = oend0 */
|
||||
subq %op0, %rax /* rax = oend0 - op0 */
|
||||
mulq %rdx
|
||||
shrq $3, %rdx /* rdx = rax / 10 */
|
||||
|
||||
/* r15 = min(%rdx, %r15) */
|
||||
cmpq %rdx, %r15
|
||||
cmova %rdx, %r15
|
||||
|
||||
movabsq $-3689348814741910323, %rdx
|
||||
movq 16(%rsp), %rax /* rax = oend1 */
|
||||
subq %op1, %rax /* rax = oend1 - op1 */
|
||||
mulq %rdx
|
||||
shrq $3, %rdx /* rdx = rax / 10 */
|
||||
|
||||
/* r15 = min(%rdx, %r15) */
|
||||
cmpq %rdx, %r15
|
||||
cmova %rdx, %r15
|
||||
|
||||
movabsq $-3689348814741910323, %rdx
|
||||
movq 24(%rsp), %rax /* rax = oend2 */
|
||||
subq %op2, %rax /* rax = oend2 - op2 */
|
||||
mulq %rdx
|
||||
shrq $3, %rdx /* rdx = rax / 10 */
|
||||
|
||||
/* r15 = min(%rdx, %r15) */
|
||||
cmpq %rdx, %r15
|
||||
cmova %rdx, %r15
|
||||
|
||||
movabsq $-3689348814741910323, %rdx
|
||||
movq 32(%rsp), %rax /* rax = oend3 */
|
||||
subq %op3, %rax /* rax = oend3 - op3 */
|
||||
mulq %rdx
|
||||
shrq $3, %rdx /* rdx = rax / 10 */
|
||||
|
||||
/* r15 = min(%rdx, %r15) */
|
||||
cmpq %rdx, %r15
|
||||
cmova %rdx, %r15
|
||||
|
||||
/* olimit = op3 + 5 * r15 */
|
||||
movq %r15, %rax
|
||||
leaq (%op3, %rax, 4), %olimit
|
||||
addq %rax, %olimit
|
||||
|
||||
movq 0(%rsp), %rdx
|
||||
|
||||
/* If (op3 + 10 > olimit) */
|
||||
movq %op3, %rax /* rax = op3 */
|
||||
addq $10, %rax /* rax = op3 + 10 */
|
||||
cmpq %rax, %olimit /* op3 + 10 > olimit */
|
||||
jb .L_4X2_exit
|
||||
|
||||
/* If (ip1 < ip0) go to exit */
|
||||
cmpq %ip0, %ip1
|
||||
jb .L_4X2_exit
|
||||
|
||||
/* If (ip2 < ip1) go to exit */
|
||||
cmpq %ip1, %ip2
|
||||
jb .L_4X2_exit
|
||||
|
||||
/* If (ip3 < ip2) go to exit */
|
||||
cmpq %ip2, %ip3
|
||||
jb .L_4X2_exit
|
||||
|
||||
#define DECODE(n, idx) \
|
||||
movq %bits##n, %rax; \
|
||||
shrq $53, %rax; \
|
||||
movzwl 0(%dtable,%rax,4),%r8d; \
|
||||
movzbl 2(%dtable,%rax,4),%r15d; \
|
||||
movzbl 3(%dtable,%rax,4),%eax; \
|
||||
movw %r8w, (%op##n); \
|
||||
shlxq %r15, %bits##n, %bits##n; \
|
||||
addq %rax, %op##n
|
||||
|
||||
#define RELOAD_BITS(n) \
|
||||
bsfq %bits##n, %bits##n; \
|
||||
movq %bits##n, %rax; \
|
||||
shrq $3, %bits##n; \
|
||||
andq $7, %rax; \
|
||||
subq %bits##n, %ip##n; \
|
||||
movq (%ip##n), %bits##n; \
|
||||
orq $1, %bits##n; \
|
||||
shlxq %rax, %bits##n, %bits##n
|
||||
|
||||
|
||||
movq %olimit, 48(%rsp)
|
||||
|
||||
.p2align 6
|
||||
|
||||
.L_4X2_loop_body:
|
||||
/* We clobber r8, so store it on the stack */
|
||||
movq %r8, 0(%rsp)
|
||||
|
||||
/* Decode 5 symbols from each of the 4 streams (20 symbols total). */
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
|
||||
FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
|
||||
|
||||
/* Reload r8 */
|
||||
movq 0(%rsp), %r8
|
||||
|
||||
FOR_EACH_STREAM(RELOAD_BITS)
|
||||
|
||||
cmp %op3, 48(%rsp)
|
||||
ja .L_4X2_loop_body
|
||||
jmp .L_4X2_compute_olimit
|
||||
|
||||
#undef DECODE
|
||||
#undef RELOAD_BITS
|
||||
.L_4X2_exit:
|
||||
addq $8, %rsp
|
||||
/* Restore stack (oend & olimit) */
|
||||
pop %rax /* oend0 */
|
||||
pop %rax /* oend1 */
|
||||
pop %rax /* oend2 */
|
||||
pop %rax /* oend3 */
|
||||
pop %rax /* ilimit */
|
||||
pop %rax /* olimit */
|
||||
pop %rax /* arg */
|
||||
|
||||
/* Save ip / op / bits */
|
||||
movq %ip0, 0(%rax)
|
||||
movq %ip1, 8(%rax)
|
||||
movq %ip2, 16(%rax)
|
||||
movq %ip3, 24(%rax)
|
||||
movq %op0, 32(%rax)
|
||||
movq %op1, 40(%rax)
|
||||
movq %op2, 48(%rax)
|
||||
movq %op3, 56(%rax)
|
||||
movq %bits0, 64(%rax)
|
||||
movq %bits1, 72(%rax)
|
||||
movq %bits2, 80(%rax)
|
||||
movq %bits3, 88(%rax)
|
||||
|
||||
/* Restore registers */
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %r11
|
||||
pop %r10
|
||||
pop %r9
|
||||
pop %r8
|
||||
pop %rdi
|
||||
pop %rsi
|
||||
pop %rbp
|
||||
pop %rdx
|
||||
pop %rcx
|
||||
pop %rbx
|
||||
pop %rax
|
||||
ret
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user