From fb9a6084e28f3b732dbbce85b0706a70d848c24c Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Tue, 15 Nov 2022 21:51:33 -0700 Subject: [PATCH] zig1.c: decompress zig1.wasm.zst with zstd --- CMakeLists.txt | 17 +- stage1/zig1.c | 119 ++-- .../lib/decompress/huf_decompress_amd64.S | 585 ------------------ 3 files changed, 77 insertions(+), 644 deletions(-) delete mode 100644 stage1/zstd/lib/decompress/huf_decompress_amd64.S diff --git a/CMakeLists.txt b/CMakeLists.txt index d48187a90f..3d495e15c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,6 +181,16 @@ set(ZIG_CONFIG_ZIG_OUT "${CMAKE_BINARY_DIR}/config.zig") set(STAGE1_SOURCES "${CMAKE_SOURCE_DIR}/stage1/zig1.c" + "${CMAKE_SOURCE_DIR}/stage1/zstd/lib/decompress/huf_decompress.c" + "${CMAKE_SOURCE_DIR}/stage1/zstd/lib/decompress/zstd_ddict.c" + "${CMAKE_SOURCE_DIR}/stage1/zstd/lib/decompress/zstd_decompress.c" + "${CMAKE_SOURCE_DIR}/stage1/zstd/lib/decompress/zstd_decompress_block.c" + "${CMAKE_SOURCE_DIR}/stage1/zstd/lib/common/entropy_common.c" + "${CMAKE_SOURCE_DIR}/stage1/zstd/lib/common/error_private.c" + "${CMAKE_SOURCE_DIR}/stage1/zstd/lib/common/fse_decompress.c" + "${CMAKE_SOURCE_DIR}/stage1/zstd/lib/common/pool.c" + "${CMAKE_SOURCE_DIR}/stage1/zstd/lib/common/xxhash.c" + "${CMAKE_SOURCE_DIR}/stage1/zstd/lib/common/zstd_common.c" ) set(ZIG_CPP_SOURCES # These are planned to stay even when we are self-hosted. 
@@ -710,14 +720,15 @@ endif() add_executable(zig1 ${STAGE1_SOURCES}) set_target_properties(zig1 PROPERTIES COMPILE_FLAGS ${ZIG1_COMPILE_FLAGS}) target_link_libraries(zig1 LINK_PUBLIC m) - +target_include_directories(zig1 PUBLIC "${CMAKE_SOURCE_DIR}/stage1/zstd/lib") +target_compile_definitions(zig1 PRIVATE ZSTD_DISABLE_ASM) set(ZIG2_C_SOURCE "${CMAKE_BINARY_DIR}/zig2.c") set(BUILD_ZIG2_ARGS "${CMAKE_SOURCE_DIR}/lib" "${CMAKE_BINARY_DIR}" zig2 - "${CMAKE_SOURCE_DIR}/stage1/zig1.wasm" + "${CMAKE_SOURCE_DIR}/stage1/zig1.wasm.zst" build-exe src/main.zig -ofmt=c -lc -target x86_64-linux-musl # TODO: autodetect in zig1.c -OReleaseFast @@ -736,7 +747,7 @@ set(BUILD_COMPILER_RT_ARGS "${CMAKE_SOURCE_DIR}/lib" "${CMAKE_BINARY_DIR}" compiler_rt - "${CMAKE_SOURCE_DIR}/stage1/zig1.wasm" + "${CMAKE_SOURCE_DIR}/stage1/zig1.wasm.zst" build-obj lib/compiler_rt.zig -ofmt=c -target x86_64-linux-musl # TODO: autodetect in zig1.c -OReleaseFast diff --git a/stage1/zig1.c b/stage1/zig1.c index 34df5230f5..05d4b73947 100755 --- a/stage1/zig1.c +++ b/stage1/zig1.c @@ -20,6 +20,8 @@ #include #endif +#include <zstd.h> + enum wasi_errno_t { WASI_ESUCCESS = 0, WASI_E2BIG = 1, @@ -4122,7 +4124,12 @@ int main(int argc, char **argv) { new_argv[new_argv_i] = NULL; - const struct ByteSlice mod = read_file_alloc(wasm_file); + const struct ByteSlice compressed_bytes = read_file_alloc(wasm_file); + + const size_t max_uncompressed_size = 2500000; + char *mod_ptr = arena_alloc(max_uncompressed_size); + size_t mod_len = ZSTD_decompress(mod_ptr, max_uncompressed_size, + compressed_bytes.ptr, compressed_bytes.len); int cwd = err_wrap("opening cwd", open(".", O_DIRECTORY|O_RDONLY|O_CLOEXEC|O_PATH)); int zig_lib_dir = err_wrap("opening zig lib dir", open(zig_lib_dir_path, O_DIRECTORY|O_RDONLY|O_CLOEXEC|O_PATH)); @@ -4136,22 +4143,22 @@ int main(int argc, char **argv) { uint32_t i = 0; - if (mod.ptr[0] != 0 || mod.ptr[1] != 'a' || mod.ptr[2] != 's' || mod.ptr[3] != 'm') { + if (mod_ptr[0] != 0 || mod_ptr[1] != 'a' ||
mod_ptr[2] != 's' || mod_ptr[3] != 'm') { panic("bad magic"); } i += 4; - uint32_t version = read_u32_le(mod.ptr + i); + uint32_t version = read_u32_le(mod_ptr + i); i += 4; if (version != 1) panic("bad wasm version"); uint32_t section_starts[13]; memset(&section_starts, 0, sizeof(uint32_t) * 13); - while (i < mod.len) { - uint8_t section_id = mod.ptr[i]; + while (i < mod_len) { + uint8_t section_id = mod_ptr[i]; i += 1; - uint32_t section_len = read32_uleb128(mod.ptr, &i); + uint32_t section_len = read32_uleb128(mod_ptr, &i); section_starts[section_id] = i; i += section_len; } @@ -4160,18 +4167,18 @@ int main(int argc, char **argv) { struct TypeInfo *types; { i = section_starts[Section_type]; - uint32_t types_len = read32_uleb128(mod.ptr, &i); + uint32_t types_len = read32_uleb128(mod_ptr, &i); types = arena_alloc(sizeof(struct TypeInfo) * types_len); for (size_t type_i = 0; type_i < types_len; type_i += 1) { struct TypeInfo *info = &types[type_i]; - if (mod.ptr[i] != 0x60) panic("bad type byte"); + if (mod_ptr[i] != 0x60) panic("bad type byte"); i += 1; - info->param_count = read32_uleb128(mod.ptr, &i); + info->param_count = read32_uleb128(mod_ptr, &i); if (info->param_count > 32) panic("found a type with over 32 parameters"); info->param_types = 0; for (uint32_t param_i = 0; param_i < info->param_count; param_i += 1) { - int64_t param_type = read64_ileb128(mod.ptr, &i); + int64_t param_type = read64_ileb128(mod_ptr, &i); switch (param_type) { case -1: case -3: bs_unset(&info->param_types, param_i); break; case -2: case -4: bs_set(&info->param_types, param_i); break; @@ -4179,10 +4186,10 @@ int main(int argc, char **argv) { } } - info->result_count = read32_uleb128(mod.ptr, &i); + info->result_count = read32_uleb128(mod_ptr, &i); info->result_types = 0; for (uint32_t result_i = 0; result_i < info->result_count; result_i += 1) { - int64_t result_type = read64_ileb128(mod.ptr, &i); + int64_t result_type = read64_ileb128(mod_ptr, &i); switch (result_type) { case -1:
case -3: bs_unset(&info->result_types, result_i); break; case -2: case -4: bs_set(&info->result_types, result_i); break; @@ -4197,18 +4204,18 @@ int main(int argc, char **argv) { uint32_t imports_len; { i = section_starts[Section_import]; - imports_len = read32_uleb128(mod.ptr, &i); + imports_len = read32_uleb128(mod_ptr, &i); imports = arena_alloc(sizeof(struct Import) * imports_len); for (size_t imp_i = 0; imp_i < imports_len; imp_i += 1) { struct Import *imp = &imports[imp_i]; - struct ByteSlice mod_name = read_name(mod.ptr, &i); + struct ByteSlice mod_name = read_name(mod_ptr, &i); if (mod_name.len == strlen("wasi_snapshot_preview1") && memcmp(mod_name.ptr, "wasi_snapshot_preview1", mod_name.len) == 0) { imp->mod = ImpMod_wasi_snapshot_preview1; } else panic("unknown import module"); - struct ByteSlice sym_name = read_name(mod.ptr, &i); + struct ByteSlice sym_name = read_name(mod_ptr, &i); if (sym_name.len == strlen("args_get") && memcmp(sym_name.ptr, "args_get", sym_name.len) == 0) { imp->name = ImpName_args_get; @@ -4292,9 +4299,9 @@ int main(int argc, char **argv) { imp->name = ImpName_random_get; } else panic("unknown import name"); - uint32_t desc = read32_uleb128(mod.ptr, &i); + uint32_t desc = read32_uleb128(mod_ptr, &i); if (desc != 0) panic("external kind not function"); - imp->type_idx = read32_uleb128(mod.ptr, &i); + imp->type_idx = read32_uleb128(mod_ptr, &i); } } @@ -4302,11 +4309,11 @@ int main(int argc, char **argv) { uint32_t start_fn_idx; { i = section_starts[Section_export]; - uint32_t count = read32_uleb128(mod.ptr, &i); + uint32_t count = read32_uleb128(mod_ptr, &i); for (; count > 0; count -= 1) { - struct ByteSlice name = read_name(mod.ptr, &i); - uint32_t desc = read32_uleb128(mod.ptr, &i); - start_fn_idx = read32_uleb128(mod.ptr, &i); + struct ByteSlice name = read_name(mod_ptr, &i); + uint32_t desc = read32_uleb128(mod_ptr, &i); + start_fn_idx = read32_uleb128(mod_ptr, &i); if (desc == 0 && name.len == strlen("_start") && 
memcmp(name.ptr, "_start", name.len) == 0) { @@ -4321,11 +4328,11 @@ int main(int argc, char **argv) { uint32_t functions_len; { i = section_starts[Section_function]; - functions_len = read32_uleb128(mod.ptr, &i); + functions_len = read32_uleb128(mod_ptr, &i); functions = arena_alloc(sizeof(struct Function) * functions_len); for (size_t func_i = 0; func_i < functions_len; func_i += 1) { struct Function *func = &functions[func_i]; - func->type_idx = read32_uleb128(mod.ptr, &i); + func->type_idx = read32_uleb128(mod_ptr, &i); } } @@ -4333,18 +4340,18 @@ int main(int argc, char **argv) { uint64_t *globals; { i = section_starts[Section_global]; - uint32_t globals_len = read32_uleb128(mod.ptr, &i); + uint32_t globals_len = read32_uleb128(mod_ptr, &i); globals = arena_alloc(sizeof(uint64_t) * globals_len); for (size_t glob_i = 0; glob_i < globals_len; glob_i += 1) { uint64_t *global = &globals[glob_i]; - uint32_t content_type = read32_uleb128(mod.ptr, &i); - uint32_t mutability = read32_uleb128(mod.ptr, &i); + uint32_t content_type = read32_uleb128(mod_ptr, &i); + uint32_t mutability = read32_uleb128(mod_ptr, &i); if (mutability != 1) panic("expected mutable global"); if (content_type != 0x7f) panic("unexpected content type"); - uint8_t opcode = mod.ptr[i]; + uint8_t opcode = mod_ptr[i]; i += 1; if (opcode != WasmOp_i32_const) panic("expected i32_const op"); - uint32_t init = read32_ileb128(mod.ptr, &i); + uint32_t init = read32_ileb128(mod_ptr, &i); *global = (uint32_t)init; } } @@ -4353,26 +4360,26 @@ int main(int argc, char **argv) { uint32_t memory_len; { i = section_starts[Section_memory]; - uint32_t memories_len = read32_uleb128(mod.ptr, &i); + uint32_t memories_len = read32_uleb128(mod_ptr, &i); if (memories_len != 1) panic("unexpected memory count"); - uint32_t flags = read32_uleb128(mod.ptr, &i); + uint32_t flags = read32_uleb128(mod_ptr, &i); (void)flags; - memory_len = read32_uleb128(mod.ptr, &i) * wasm_page_size; + memory_len = read32_uleb128(mod_ptr, &i) * 
wasm_page_size; i = section_starts[Section_data]; - uint32_t datas_count = read32_uleb128(mod.ptr, &i); + uint32_t datas_count = read32_uleb128(mod_ptr, &i); for (; datas_count > 0; datas_count -= 1) { - uint32_t mode = read32_uleb128(mod.ptr, &i); + uint32_t mode = read32_uleb128(mod_ptr, &i); if (mode != 0) panic("expected mode 0"); - enum WasmOp opcode = mod.ptr[i]; + enum WasmOp opcode = mod_ptr[i]; i += 1; if (opcode != WasmOp_i32_const) panic("expected opcode i32_const"); - uint32_t offset = read32_uleb128(mod.ptr, &i); - enum WasmOp end = mod.ptr[i]; + uint32_t offset = read32_uleb128(mod_ptr, &i); + enum WasmOp end = mod_ptr[i]; if (end != WasmOp_end) panic("expected end opcode"); i += 1; - uint32_t bytes_len = read32_uleb128(mod.ptr, &i); - memcpy(memory + offset, mod.ptr + i, bytes_len); + uint32_t bytes_len = read32_uleb128(mod_ptr, &i); + memcpy(memory + offset, mod_ptr + i, bytes_len); i += bytes_len; } } @@ -4380,37 +4387,37 @@ int main(int argc, char **argv) { uint32_t *table = NULL; { i = section_starts[Section_table]; - uint32_t table_count = read32_uleb128(mod.ptr, &i); + uint32_t table_count = read32_uleb128(mod_ptr, &i); if (table_count > 1) { panic("expected only one table section"); } else if (table_count == 1) { - uint32_t element_type = read32_uleb128(mod.ptr, &i); + uint32_t element_type = read32_uleb128(mod_ptr, &i); (void)element_type; - uint32_t has_max = read32_uleb128(mod.ptr, &i); + uint32_t has_max = read32_uleb128(mod_ptr, &i); if (has_max != 1) panic("expected has_max==1"); - uint32_t initial = read32_uleb128(mod.ptr, &i); + uint32_t initial = read32_uleb128(mod_ptr, &i); (void)initial; - uint32_t maximum = read32_uleb128(mod.ptr, &i); + uint32_t maximum = read32_uleb128(mod_ptr, &i); i = section_starts[Section_element]; - uint32_t element_section_count = read32_uleb128(mod.ptr, &i); + uint32_t element_section_count = read32_uleb128(mod_ptr, &i); if (element_section_count != 1) panic("expected one element section"); - uint32_t 
flags = read32_uleb128(mod.ptr, &i); + uint32_t flags = read32_uleb128(mod_ptr, &i); (void)flags; - enum WasmOp opcode = mod.ptr[i]; + enum WasmOp opcode = mod_ptr[i]; i += 1; if (opcode != WasmOp_i32_const) panic("expected op i32_const"); - uint32_t offset = read32_uleb128(mod.ptr, &i); - enum WasmOp end = mod.ptr[i]; + uint32_t offset = read32_uleb128(mod_ptr, &i); + enum WasmOp end = mod_ptr[i]; if (end != WasmOp_end) panic("expected op end"); i += 1; - uint32_t elem_count = read32_uleb128(mod.ptr, &i); + uint32_t elem_count = read32_uleb128(mod_ptr, &i); table = arena_alloc(sizeof(uint32_t) * maximum); memset(table, 0, sizeof(uint32_t) * maximum); for (uint32_t elem_i = 0; elem_i < elem_count; elem_i += 1) { - table[elem_i + offset] = read32_uleb128(mod.ptr, &i); + table[elem_i + offset] = read32_uleb128(mod_ptr, &i); } } } @@ -4420,7 +4427,7 @@ int main(int argc, char **argv) { memset(&vm, 0xaa, sizeof(struct VirtualMachine)); // to match the zig version #endif vm.stack = arena_alloc(sizeof(uint64_t) * 10000000), - vm.mod_ptr = mod.ptr; + vm.mod_ptr = mod_ptr; vm.opcodes = arena_alloc(2000000); vm.operands = arena_alloc(sizeof(uint32_t) * 2000000); vm.stack_top = 0; @@ -4436,14 +4443,14 @@ int main(int argc, char **argv) { { uint32_t code_i = section_starts[Section_code]; - uint32_t codes_len = read32_uleb128(mod.ptr, &code_i); + uint32_t codes_len = read32_uleb128(mod_ptr, &code_i); if (codes_len != functions_len) panic("code/function length mismatch"); struct ProgramCounter pc; pc.opcode = 0; pc.operand = 0; for (uint32_t func_i = 0; func_i < functions_len; func_i += 1) { struct Function *func = &functions[func_i]; - uint32_t size = read32_uleb128(mod.ptr, &code_i); + uint32_t size = read32_uleb128(mod_ptr, &code_i); uint32_t code_begin = code_i; struct TypeInfo *type_info = &vm.types[func->type_idx]; @@ -4451,11 +4458,11 @@ int main(int argc, char **argv) { func->local_types = malloc(sizeof(uint32_t) * ((type_info->param_count + func->locals_count + 31) / 
32)); func->local_types[0] = type_info->param_types; - for (uint32_t local_sets_count = read32_uleb128(mod.ptr, &code_i); + for (uint32_t local_sets_count = read32_uleb128(mod_ptr, &code_i); local_sets_count > 0; local_sets_count -= 1) { - uint32_t set_count = read32_uleb128(mod.ptr, &code_i); - int64_t local_type = read64_ileb128(mod.ptr, &code_i); + uint32_t set_count = read32_uleb128(mod_ptr, &code_i); + int64_t local_type = read64_ileb128(mod_ptr, &code_i); uint32_t i = type_info->param_count + func->locals_count; func->locals_count += set_count; diff --git a/stage1/zstd/lib/decompress/huf_decompress_amd64.S b/stage1/zstd/lib/decompress/huf_decompress_amd64.S deleted file mode 100644 index 49589cb611..0000000000 --- a/stage1/zstd/lib/decompress/huf_decompress_amd64.S +++ /dev/null @@ -1,585 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the - * LICENSE file in the root directory of this source tree) and the GPLv2 (found - * in the COPYING file in the root directory of this source tree). - * You may select, at your option, one of the above-listed licenses. - */ - -#include "../common/portability_macros.h" - -/* Stack marking - * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart - */ -#if defined(__ELF__) && defined(__GNUC__) -.section .note.GNU-stack,"",%progbits -#endif - -#if ZSTD_ENABLE_ASM_X86_64_BMI2 - -/* Calling convention: - * - * %rdi contains the first argument: HUF_DecompressAsmArgs*. - * %rbp isn't maintained (no frame pointer). - * %rsp contains the stack pointer that grows down. - * No red-zone is assumed, only addresses >= %rsp are used. - * All register contents are preserved. - * - * TODO: Support Windows calling convention. 
- */ - -ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) -ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop) -ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop) -ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) -.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop -.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop -.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop -.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop -.text - -/* Sets up register mappings for clarity. - * op[], bits[], dtable & ip[0] each get their own register. - * ip[1,2,3] & olimit alias var[]. - * %rax is a scratch register. - */ - -#define op0 rsi -#define op1 rbx -#define op2 rcx -#define op3 rdi - -#define ip0 r8 -#define ip1 r9 -#define ip2 r10 -#define ip3 r11 - -#define bits0 rbp -#define bits1 rdx -#define bits2 r12 -#define bits3 r13 -#define dtable r14 -#define olimit r15 - -/* var[] aliases ip[1,2,3] & olimit - * ip[1,2,3] are saved every iteration. - * olimit is only used in compute_olimit. - */ -#define var0 r15 -#define var1 r9 -#define var2 r10 -#define var3 r11 - -/* 32-bit var registers */ -#define vard0 r15d -#define vard1 r9d -#define vard2 r10d -#define vard3 r11d - -/* Calls X(N) for each stream 0, 1, 2, 3. */ -#define FOR_EACH_STREAM(X) \ - X(0); \ - X(1); \ - X(2); \ - X(3) - -/* Calls X(N, idx) for each stream 0, 1, 2, 3. */ -#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ - X(0, idx); \ - X(1, idx); \ - X(2, idx); \ - X(3, idx) - -/* Define both _HUF_* & HUF_* symbols because MacOS - * C symbols are prefixed with '_' & Linux symbols aren't. - */ -_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: -HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: - /* Save all registers - even if they are callee saved for simplicity. 
*/ - push %rax - push %rbx - push %rcx - push %rdx - push %rbp - push %rsi - push %rdi - push %r8 - push %r9 - push %r10 - push %r11 - push %r12 - push %r13 - push %r14 - push %r15 - - /* Read HUF_DecompressAsmArgs* args from %rax */ - movq %rdi, %rax - movq 0(%rax), %ip0 - movq 8(%rax), %ip1 - movq 16(%rax), %ip2 - movq 24(%rax), %ip3 - movq 32(%rax), %op0 - movq 40(%rax), %op1 - movq 48(%rax), %op2 - movq 56(%rax), %op3 - movq 64(%rax), %bits0 - movq 72(%rax), %bits1 - movq 80(%rax), %bits2 - movq 88(%rax), %bits3 - movq 96(%rax), %dtable - push %rax /* argument */ - push 104(%rax) /* ilimit */ - push 112(%rax) /* oend */ - push %olimit /* olimit space */ - - subq $24, %rsp - -.L_4X1_compute_olimit: - /* Computes how many iterations we can do safely - * %r15, %rax may be clobbered - * rbx, rdx must be saved - * op3 & ip0 mustn't be clobbered - */ - movq %rbx, 0(%rsp) - movq %rdx, 8(%rsp) - - movq 32(%rsp), %rax /* rax = oend */ - subq %op3, %rax /* rax = oend - op3 */ - - /* r15 = (oend - op3) / 5 */ - movabsq $-3689348814741910323, %rdx - mulq %rdx - movq %rdx, %r15 - shrq $2, %r15 - - movq %ip0, %rax /* rax = ip0 */ - movq 40(%rsp), %rdx /* rdx = ilimit */ - subq %rdx, %rax /* rax = ip0 - ilimit */ - movq %rax, %rbx /* rbx = ip0 - ilimit */ - - /* rdx = (ip0 - ilimit) / 7 */ - movabsq $2635249153387078803, %rdx - mulq %rdx - subq %rdx, %rbx - shrq %rbx - addq %rbx, %rdx - shrq $2, %rdx - - /* r15 = min(%rdx, %r15) */ - cmpq %rdx, %r15 - cmova %rdx, %r15 - - /* r15 = r15 * 5 */ - leaq (%r15, %r15, 4), %r15 - - /* olimit = op3 + r15 */ - addq %op3, %olimit - - movq 8(%rsp), %rdx - movq 0(%rsp), %rbx - - /* If (op3 + 20 > olimit) */ - movq %op3, %rax /* rax = op3 */ - addq $20, %rax /* rax = op3 + 20 */ - cmpq %rax, %olimit /* op3 + 20 > olimit */ - jb .L_4X1_exit - - /* If (ip1 < ip0) go to exit */ - cmpq %ip0, %ip1 - jb .L_4X1_exit - - /* If (ip2 < ip1) go to exit */ - cmpq %ip1, %ip2 - jb .L_4X1_exit - - /* If (ip3 < ip2) go to exit */ - cmpq %ip2, %ip3 - jb 
.L_4X1_exit - -/* Reads top 11 bits from bits[n] - * Loads dt[bits[n]] into var[n] - */ -#define GET_NEXT_DELT(n) \ - movq $53, %var##n; \ - shrxq %var##n, %bits##n, %var##n; \ - movzwl (%dtable,%var##n,2),%vard##n - -/* var[n] must contain the DTable entry computed with GET_NEXT_DELT - * Moves var[n] to %rax - * bits[n] <<= var[n] & 63 - * op[n][idx] = %rax >> 8 - * %ah is a way to access bits [8, 16) of %rax - */ -#define DECODE_FROM_DELT(n, idx) \ - movq %var##n, %rax; \ - shlxq %var##n, %bits##n, %bits##n; \ - movb %ah, idx(%op##n) - -/* Assumes GET_NEXT_DELT has been called. - * Calls DECODE_FROM_DELT then GET_NEXT_DELT - */ -#define DECODE_AND_GET_NEXT(n, idx) \ - DECODE_FROM_DELT(n, idx); \ - GET_NEXT_DELT(n) \ - -/* // ctz & nbBytes is stored in bits[n] - * // nbBits is stored in %rax - * ctz = CTZ[bits[n]] - * nbBits = ctz & 7 - * nbBytes = ctz >> 3 - * op[n] += 5 - * ip[n] -= nbBytes - * // Note: x86-64 is little-endian ==> no bswap - * bits[n] = MEM_readST(ip[n]) | 1 - * bits[n] <<= nbBits - */ -#define RELOAD_BITS(n) \ - bsfq %bits##n, %bits##n; \ - movq %bits##n, %rax; \ - andq $7, %rax; \ - shrq $3, %bits##n; \ - leaq 5(%op##n), %op##n; \ - subq %bits##n, %ip##n; \ - movq (%ip##n), %bits##n; \ - orq $1, %bits##n; \ - shlx %rax, %bits##n, %bits##n - - /* Store clobbered variables on the stack */ - movq %olimit, 24(%rsp) - movq %ip1, 0(%rsp) - movq %ip2, 8(%rsp) - movq %ip3, 16(%rsp) - - /* Call GET_NEXT_DELT for each stream */ - FOR_EACH_STREAM(GET_NEXT_DELT) - - .p2align 6 - -.L_4X1_loop_body: - /* Decode 5 symbols in each of the 4 streams (20 total) - * Must have called GET_NEXT_DELT for each stream - */ - FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0) - FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1) - FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2) - FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3) - FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4) - - /* Load ip[1,2,3] from stack (var[] aliases them) - * ip[] is needed for RELOAD_BITS - 
* Each will be stored back to the stack after RELOAD - */ - movq 0(%rsp), %ip1 - movq 8(%rsp), %ip2 - movq 16(%rsp), %ip3 - - /* Reload each stream & fetch the next table entry - * to prepare for the next iteration - */ - RELOAD_BITS(0) - GET_NEXT_DELT(0) - - RELOAD_BITS(1) - movq %ip1, 0(%rsp) - GET_NEXT_DELT(1) - - RELOAD_BITS(2) - movq %ip2, 8(%rsp) - GET_NEXT_DELT(2) - - RELOAD_BITS(3) - movq %ip3, 16(%rsp) - GET_NEXT_DELT(3) - - /* If op3 < olimit: continue the loop */ - cmp %op3, 24(%rsp) - ja .L_4X1_loop_body - - /* Reload ip[1,2,3] from stack */ - movq 0(%rsp), %ip1 - movq 8(%rsp), %ip2 - movq 16(%rsp), %ip3 - - /* Re-compute olimit */ - jmp .L_4X1_compute_olimit - -#undef GET_NEXT_DELT -#undef DECODE_FROM_DELT -#undef DECODE -#undef RELOAD_BITS -.L_4X1_exit: - addq $24, %rsp - - /* Restore stack (oend & olimit) */ - pop %rax /* olimit */ - pop %rax /* oend */ - pop %rax /* ilimit */ - pop %rax /* arg */ - - /* Save ip / op / bits */ - movq %ip0, 0(%rax) - movq %ip1, 8(%rax) - movq %ip2, 16(%rax) - movq %ip3, 24(%rax) - movq %op0, 32(%rax) - movq %op1, 40(%rax) - movq %op2, 48(%rax) - movq %op3, 56(%rax) - movq %bits0, 64(%rax) - movq %bits1, 72(%rax) - movq %bits2, 80(%rax) - movq %bits3, 88(%rax) - - /* Restore registers */ - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %r11 - pop %r10 - pop %r9 - pop %r8 - pop %rdi - pop %rsi - pop %rbp - pop %rdx - pop %rcx - pop %rbx - pop %rax - ret - -_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: -HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: - /* Save all registers - even if they are callee saved for simplicity. 
*/ - push %rax - push %rbx - push %rcx - push %rdx - push %rbp - push %rsi - push %rdi - push %r8 - push %r9 - push %r10 - push %r11 - push %r12 - push %r13 - push %r14 - push %r15 - - movq %rdi, %rax - movq 0(%rax), %ip0 - movq 8(%rax), %ip1 - movq 16(%rax), %ip2 - movq 24(%rax), %ip3 - movq 32(%rax), %op0 - movq 40(%rax), %op1 - movq 48(%rax), %op2 - movq 56(%rax), %op3 - movq 64(%rax), %bits0 - movq 72(%rax), %bits1 - movq 80(%rax), %bits2 - movq 88(%rax), %bits3 - movq 96(%rax), %dtable - push %rax /* argument */ - push %rax /* olimit */ - push 104(%rax) /* ilimit */ - - movq 112(%rax), %rax - push %rax /* oend3 */ - - movq %op3, %rax - push %rax /* oend2 */ - - movq %op2, %rax - push %rax /* oend1 */ - - movq %op1, %rax - push %rax /* oend0 */ - - /* Scratch space */ - subq $8, %rsp - -.L_4X2_compute_olimit: - /* Computes how many iterations we can do safely - * %r15, %rax may be clobbered - * rdx must be saved - * op[1,2,3,4] & ip0 mustn't be clobbered - */ - movq %rdx, 0(%rsp) - - /* We can consume up to 7 input bytes each iteration. 
*/ - movq %ip0, %rax /* rax = ip0 */ - movq 40(%rsp), %rdx /* rdx = ilimit */ - subq %rdx, %rax /* rax = ip0 - ilimit */ - movq %rax, %r15 /* r15 = ip0 - ilimit */ - - /* rdx = rax / 7 */ - movabsq $2635249153387078803, %rdx - mulq %rdx - subq %rdx, %r15 - shrq %r15 - addq %r15, %rdx - shrq $2, %rdx - - /* r15 = (ip0 - ilimit) / 7 */ - movq %rdx, %r15 - - movabsq $-3689348814741910323, %rdx - movq 8(%rsp), %rax /* rax = oend0 */ - subq %op0, %rax /* rax = oend0 - op0 */ - mulq %rdx - shrq $3, %rdx /* rdx = rax / 10 */ - - /* r15 = min(%rdx, %r15) */ - cmpq %rdx, %r15 - cmova %rdx, %r15 - - movabsq $-3689348814741910323, %rdx - movq 16(%rsp), %rax /* rax = oend1 */ - subq %op1, %rax /* rax = oend1 - op1 */ - mulq %rdx - shrq $3, %rdx /* rdx = rax / 10 */ - - /* r15 = min(%rdx, %r15) */ - cmpq %rdx, %r15 - cmova %rdx, %r15 - - movabsq $-3689348814741910323, %rdx - movq 24(%rsp), %rax /* rax = oend2 */ - subq %op2, %rax /* rax = oend2 - op2 */ - mulq %rdx - shrq $3, %rdx /* rdx = rax / 10 */ - - /* r15 = min(%rdx, %r15) */ - cmpq %rdx, %r15 - cmova %rdx, %r15 - - movabsq $-3689348814741910323, %rdx - movq 32(%rsp), %rax /* rax = oend3 */ - subq %op3, %rax /* rax = oend3 - op3 */ - mulq %rdx - shrq $3, %rdx /* rdx = rax / 10 */ - - /* r15 = min(%rdx, %r15) */ - cmpq %rdx, %r15 - cmova %rdx, %r15 - - /* olimit = op3 + 5 * r15 */ - movq %r15, %rax - leaq (%op3, %rax, 4), %olimit - addq %rax, %olimit - - movq 0(%rsp), %rdx - - /* If (op3 + 10 > olimit) */ - movq %op3, %rax /* rax = op3 */ - addq $10, %rax /* rax = op3 + 10 */ - cmpq %rax, %olimit /* op3 + 10 > olimit */ - jb .L_4X2_exit - - /* If (ip1 < ip0) go to exit */ - cmpq %ip0, %ip1 - jb .L_4X2_exit - - /* If (ip2 < ip1) go to exit */ - cmpq %ip1, %ip2 - jb .L_4X2_exit - - /* If (ip3 < ip2) go to exit */ - cmpq %ip2, %ip3 - jb .L_4X2_exit - -#define DECODE(n, idx) \ - movq %bits##n, %rax; \ - shrq $53, %rax; \ - movzwl 0(%dtable,%rax,4),%r8d; \ - movzbl 2(%dtable,%rax,4),%r15d; \ - movzbl 3(%dtable,%rax,4),%eax; \ 
- movw %r8w, (%op##n); \ - shlxq %r15, %bits##n, %bits##n; \ - addq %rax, %op##n - -#define RELOAD_BITS(n) \ - bsfq %bits##n, %bits##n; \ - movq %bits##n, %rax; \ - shrq $3, %bits##n; \ - andq $7, %rax; \ - subq %bits##n, %ip##n; \ - movq (%ip##n), %bits##n; \ - orq $1, %bits##n; \ - shlxq %rax, %bits##n, %bits##n - - - movq %olimit, 48(%rsp) - - .p2align 6 - -.L_4X2_loop_body: - /* We clobber r8, so store it on the stack */ - movq %r8, 0(%rsp) - - /* Decode 5 symbols from each of the 4 streams (20 symbols total). */ - FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) - FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) - FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) - FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) - FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) - - /* Reload r8 */ - movq 0(%rsp), %r8 - - FOR_EACH_STREAM(RELOAD_BITS) - - cmp %op3, 48(%rsp) - ja .L_4X2_loop_body - jmp .L_4X2_compute_olimit - -#undef DECODE -#undef RELOAD_BITS -.L_4X2_exit: - addq $8, %rsp - /* Restore stack (oend & olimit) */ - pop %rax /* oend0 */ - pop %rax /* oend1 */ - pop %rax /* oend2 */ - pop %rax /* oend3 */ - pop %rax /* ilimit */ - pop %rax /* olimit */ - pop %rax /* arg */ - - /* Save ip / op / bits */ - movq %ip0, 0(%rax) - movq %ip1, 8(%rax) - movq %ip2, 16(%rax) - movq %ip3, 24(%rax) - movq %op0, 32(%rax) - movq %op1, 40(%rax) - movq %op2, 48(%rax) - movq %op3, 56(%rax) - movq %bits0, 64(%rax) - movq %bits1, 72(%rax) - movq %bits2, 80(%rax) - movq %bits3, 88(%rax) - - /* Restore registers */ - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %r11 - pop %r10 - pop %r9 - pop %r8 - pop %rdi - pop %rsi - pop %rbp - pop %rdx - pop %rcx - pop %rbx - pop %rax - ret - -#endif