std.crypto.aes: introduce AES block vectors (#22023)

* std.crypto.aes: introduce AES block vectors. Modern Intel CPUs with the VAES extension can handle more than a single AES block per instruction. So can some ARM and RISC-V CPUs. Software implementations with bitslicing can also greatly benefit from this. Implement low-level operations on AES block vectors, and the parallel AEGIS variants on top of them. AMD Zen4: aegis-128x4: 73225 MiB/s, aegis-128x2: 51571 MiB/s, aegis-128l: 25806 MiB/s, aegis-256x4: 46742 MiB/s, aegis-256x2: 30227 MiB/s, aegis-256: 8436 MiB/s, aes128-gcm: 5926 MiB/s, aes256-gcm: 5085 MiB/s. AES-GCM and anything based on AES-CTR are also going to benefit from this later. * Make AEGIS-MAC twice as fast
2024-11-26 15:12:31 +00:00 · 2024-11-22 10:00:49 +01:00 · 2024-11-22 10:00:49 +01:00 · 636308a17d
commit 636308a17d
parent f845fa04a0
7 changed files with 1012 additions and 387 deletions
--- a/lib/std/crypto.zig
+++ b/lib/std/crypto.zig
@ -7,10 +7,23 @@ pub const timing_safe = @import("crypto/timing_safe.zig");
 /// Authenticated Encryption with Associated Data
 pub const aead = struct {
    pub const aegis = struct {
-        pub const Aegis128L = @import("crypto/aegis.zig").Aegis128L;
-        pub const Aegis128L_256 = @import("crypto/aegis.zig").Aegis128L_256;
-        pub const Aegis256 = @import("crypto/aegis.zig").Aegis256;
-        pub const Aegis256_256 = @import("crypto/aegis.zig").Aegis256_256;
+        const variants = @import("crypto/aegis.zig");
+
+        pub const Aegis128X4 = variants.Aegis128X4;
+        pub const Aegis128X2 = variants.Aegis128X2;
+        pub const Aegis128L = variants.Aegis128L;
+
+        pub const Aegis256X4 = variants.Aegis256X4;
+        pub const Aegis256X2 = variants.Aegis256X2;
+        pub const Aegis256 = variants.Aegis256;
+
+        pub const Aegis128X4_256 = variants.Aegis128X4_256;
+        pub const Aegis128X2_256 = variants.Aegis128X2_256;
+        pub const Aegis128L_256 = variants.Aegis128L_256;
+
+        pub const Aegis256X4_256 = variants.Aegis256X4_256;
+        pub const Aegis256X2_256 = variants.Aegis256X2_256;
+        pub const Aegis256_256 = variants.Aegis256_256;
    };

    pub const aes_gcm = struct {
@ -44,10 +57,22 @@ pub const auth = struct {
    pub const hmac = @import("crypto/hmac.zig");
    pub const siphash = @import("crypto/siphash.zig");
    pub const aegis = struct {
-        pub const Aegis128LMac = @import("crypto/aegis.zig").Aegis128LMac;
-        pub const Aegis128LMac_128 = @import("crypto/aegis.zig").Aegis128LMac_128;
-        pub const Aegis256Mac = @import("crypto/aegis.zig").Aegis256Mac;
-        pub const Aegis256Mac_128 = @import("crypto/aegis.zig").Aegis256Mac_128;
+        const variants = @import("crypto/aegis.zig");
+        pub const Aegis128X4Mac = variants.Aegis128X4Mac;
+        pub const Aegis128X2Mac = variants.Aegis128X2Mac;
+        pub const Aegis128LMac = variants.Aegis128LMac;
+
+        pub const Aegis256X4Mac = variants.Aegis256X4Mac;
+        pub const Aegis256X2Mac = variants.Aegis256X2Mac;
+        pub const Aegis256Mac = variants.Aegis256Mac;
+
+        pub const Aegis128X4Mac_128 = variants.Aegis128X4Mac_128;
+        pub const Aegis128X2Mac_128 = variants.Aegis128X2Mac_128;
+        pub const Aegis128LMac_128 = variants.Aegis128LMac_128;
+
+        pub const Aegis256X4Mac_128 = variants.Aegis256X4Mac_128;
+        pub const Aegis256X2Mac_128 = variants.Aegis256X2Mac_128;
+        pub const Aegis256Mac_128 = variants.Aegis256Mac_128;
    };
    pub const cmac = @import("crypto/cmac.zig");
 };
--- a/lib/std/crypto/aegis.zig
+++ b/lib/std/crypto/aegis.zig
--- a/lib/std/crypto/aes.zig
+++ b/lib/std/crypto/aes.zig
@ -22,6 +22,7 @@ pub const has_hardware_support =
    (builtin.cpu.arch == .aarch64 and has_armaes);

 pub const Block = impl.Block;
+pub const BlockVec = impl.BlockVec;
 pub const AesEncryptCtx = impl.AesEncryptCtx;
 pub const AesDecryptCtx = impl.AesDecryptCtx;
 pub const Aes128 = impl.Aes128;
--- a/lib/std/crypto/aes/aesni.zig
+++ b/lib/std/crypto/aes/aesni.zig
@ -2,18 +2,23 @@ const std = @import("../../std.zig");
 const builtin = @import("builtin");
 const mem = std.mem;
 const debug = std.debug;
-const BlockVec = @Vector(2, u64);
+
+const has_vaes = builtin.cpu.arch == .x86_64 and std.Target.x86.featureSetHas(builtin.cpu.features, .vaes);
+const has_avx512f = builtin.cpu.arch == .x86_64 and std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f);

 /// A single AES block.
 pub const Block = struct {
+    const Repr = @Vector(2, u64);
+
+    /// The length of an AES block in bytes.
    pub const block_length: usize = 16;

    /// Internal representation of a block.
-    repr: BlockVec,
+    repr: Repr,

    /// Convert a byte sequence into an internal representation.
    pub inline fn fromBytes(bytes: *const [16]u8) Block {
-        const repr = mem.bytesToValue(BlockVec, bytes);
+        const repr = mem.bytesToValue(Repr, bytes);
        return Block{ .repr = repr };
    }

@ -33,7 +38,7 @@ pub const Block = struct {
        return Block{
            .repr = asm (
                \\ vaesenc %[rk], %[in], %[out]
-                : [out] "=x" (-> BlockVec),
+                : [out] "=x" (-> Repr),
                : [in] "x" (block.repr),
                  [rk] "x" (round_key.repr),
            ),
@ -45,7 +50,7 @@ pub const Block = struct {
        return Block{
            .repr = asm (
                \\ vaesenclast %[rk], %[in], %[out]
-                : [out] "=x" (-> BlockVec),
+                : [out] "=x" (-> Repr),
                : [in] "x" (block.repr),
                  [rk] "x" (round_key.repr),
            ),
@ -57,7 +62,7 @@ pub const Block = struct {
        return Block{
            .repr = asm (
                \\ vaesdec %[rk], %[in], %[out]
-                : [out] "=x" (-> BlockVec),
+                : [out] "=x" (-> Repr),
                : [in] "x" (block.repr),
                  [rk] "x" (inv_round_key.repr),
            ),
@ -69,7 +74,7 @@ pub const Block = struct {
        return Block{
            .repr = asm (
                \\ vaesdeclast %[rk], %[in], %[out]
-                : [out] "=x" (-> BlockVec),
+                : [out] "=x" (-> Repr),
                : [in] "x" (block.repr),
                  [rk] "x" (inv_round_key.repr),
            ),
@ -168,17 +173,158 @@ pub const Block = struct {
    };
 };

+/// A fixed-size vector of AES blocks.
+/// All operations are performed in parallel, using SIMD instructions when available.
+///
+/// `blocks_count` is the number of 16-byte AES blocks held by the vector.
+pub fn BlockVec(comptime blocks_count: comptime_int) type {
+    return struct {
+        const Self = @This();
+
+        /// The number of AES blocks the target architecture can process with a single instruction.
+        pub const native_vector_size = w: {
+            // The 512-bit forms of the AES instructions require VAES in addition to AVX512F,
+            // so both features must be present before selecting the 4-block path.
+            if (has_vaes and has_avx512f and blocks_count % 4 == 0) break :w 4;
+            if (has_vaes and blocks_count % 2 == 0) break :w 2;
+            break :w 1;
+        };
+
+        /// The size of the AES block vector that the target architecture can process with a single instruction, in bytes.
+        pub const native_word_size = native_vector_size * 16;
+
+        /// Number of native words needed to hold `blocks_count` blocks.
+        const native_words = blocks_count / native_vector_size;
+
+        /// One native word: `native_vector_size` blocks, two 64-bit lanes per block.
+        const Repr = @Vector(native_vector_size * 2, u64);
+
+        /// Internal representation of a block vector.
+        repr: [native_words]Repr,
+
+        /// Length of the block vector in bytes.
+        pub const block_length: usize = blocks_count * 16;
+
+        /// Convert a byte sequence into an internal representation.
+        pub inline fn fromBytes(bytes: *const [blocks_count * 16]u8) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = mem.bytesToValue(Repr, bytes[i * native_word_size ..][0..native_word_size]);
+            }
+            return out;
+        }
+
+        /// Convert the internal representation of a block vector into a byte sequence.
+        pub inline fn toBytes(block_vec: Self) [blocks_count * 16]u8 {
+            var out: [blocks_count * 16]u8 = undefined;
+            inline for (0..native_words) |i| {
+                out[i * native_word_size ..][0..native_word_size].* = mem.toBytes(block_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// XOR the block vector with a byte sequence.
+        /// Returns the result as bytes, with the same length as the block vector.
+        pub inline fn xorBytes(block_vec: Self, bytes: *const [blocks_count * 16]u8) [blocks_count * 16]u8 {
+            var x: Self = undefined;
+            inline for (0..native_words) |i| {
+                x.repr[i] = block_vec.repr[i] ^ mem.bytesToValue(Repr, bytes[i * native_word_size ..][0..native_word_size]);
+            }
+            return x.toBytes();
+        }
+
+        /// Apply the forward AES operation to the block vector with a vector of round keys.
+        pub inline fn encrypt(block_vec: Self, round_key_vec: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = asm (
+                    \\ vaesenc %[rk], %[in], %[out]
+                    : [out] "=x" (-> Repr),
+                    : [in] "x" (block_vec.repr[i]),
+                      [rk] "x" (round_key_vec.repr[i]),
+                );
+            }
+            return out;
+        }
+
+        /// Apply the forward AES operation to the block vector with a vector of last round keys.
+        pub inline fn encryptLast(block_vec: Self, round_key_vec: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = asm (
+                    \\ vaesenclast %[rk], %[in], %[out]
+                    : [out] "=x" (-> Repr),
+                    : [in] "x" (block_vec.repr[i]),
+                      [rk] "x" (round_key_vec.repr[i]),
+                );
+            }
+            return out;
+        }
+
+        /// Apply the inverse AES operation to the block vector with a vector of round keys.
+        pub inline fn decrypt(block_vec: Self, inv_round_key_vec: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = asm (
+                    \\ vaesdec %[rk], %[in], %[out]
+                    : [out] "=x" (-> Repr),
+                    : [in] "x" (block_vec.repr[i]),
+                      [rk] "x" (inv_round_key_vec.repr[i]),
+                );
+            }
+            return out;
+        }
+
+        /// Apply the inverse AES operation to the block vector with a vector of last round keys.
+        pub inline fn decryptLast(block_vec: Self, inv_round_key_vec: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = asm (
+                    \\ vaesdeclast %[rk], %[in], %[out]
+                    : [out] "=x" (-> Repr),
+                    : [in] "x" (block_vec.repr[i]),
+                      [rk] "x" (inv_round_key_vec.repr[i]),
+                );
+            }
+            return out;
+        }
+
+        /// Apply the bitwise XOR operation to the content of two block vectors.
+        pub inline fn xorBlocks(block_vec1: Self, block_vec2: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i] ^ block_vec2.repr[i];
+            }
+            return out;
+        }
+
+        /// Apply the bitwise AND operation to the content of two block vectors.
+        pub inline fn andBlocks(block_vec1: Self, block_vec2: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i] & block_vec2.repr[i];
+            }
+            return out;
+        }
+
+        /// Apply the bitwise OR operation to the content of two block vectors.
+        pub inline fn orBlocks(block_vec1: Self, block_vec2: Self) Self {
+            // Both operands are block vectors: each native word of one is OR-ed with the
+            // matching native word of the other, like xorBlocks and andBlocks.
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i] | block_vec2.repr[i];
+            }
+            return out;
+        }
+    };
+}
+
 fn KeySchedule(comptime Aes: type) type {
    std.debug.assert(Aes.rounds == 10 or Aes.rounds == 14);
    const rounds = Aes.rounds;

    return struct {
        const Self = @This();
+
+        const Repr = Aes.block.Repr;
+
        round_keys: [rounds + 1]Block,

-        fn drc(comptime second: bool, comptime rc: u8, t: BlockVec, tx: BlockVec) BlockVec {
-            var s: BlockVec = undefined;
-            var ts: BlockVec = undefined;
+        fn drc(comptime second: bool, comptime rc: u8, t: Repr, tx: Repr) Repr {
+            var s: Repr = undefined;
+            var ts: Repr = undefined;
            return asm (
                \\ vaeskeygenassist %[rc], %[t], %[s]
                \\ vpslldq $4, %[tx], %[ts]
@ -187,7 +333,7 @@ fn KeySchedule(comptime Aes: type) type {
                \\ vpxor   %[ts], %[r], %[r]
                \\ vpshufd %[mask], %[s], %[ts]
                \\ vpxor   %[ts], %[r], %[r]
-                : [r] "=&x" (-> BlockVec),
+                : [r] "=&x" (-> Repr),
                  [s] "=&x" (s),
                  [ts] "=&x" (ts),
                : [rc] "n" (rc),
@ -234,7 +380,7 @@ fn KeySchedule(comptime Aes: type) type {
                inv_round_keys[i] = Block{
                    .repr = asm (
                        \\ vaesimc %[rk], %[inv_rk]
-                        : [inv_rk] "=x" (-> BlockVec),
+                        : [inv_rk] "=x" (-> Repr),
                        : [rk] "x" (round_keys[rounds - i].repr),
                    ),
                };
--- a/lib/std/crypto/aes/armcrypto.zig
+++ b/lib/std/crypto/aes/armcrypto.zig
@ -1,18 +1,19 @@
 const std = @import("../../std.zig");
 const mem = std.mem;
 const debug = std.debug;
-const BlockVec = @Vector(2, u64);

 /// A single AES block.
 pub const Block = struct {
+    const Repr = @Vector(2, u64);
+
    pub const block_length: usize = 16;

    /// Internal representation of a block.
-    repr: BlockVec,
+    repr: Repr,

    /// Convert a byte sequence into an internal representation.
    pub inline fn fromBytes(bytes: *const [16]u8) Block {
-        const repr = mem.bytesToValue(BlockVec, bytes);
+        const repr = mem.bytesToValue(Repr, bytes);
        return Block{ .repr = repr };
    }

@ -36,7 +37,7 @@ pub const Block = struct {
                \\ mov   %[out].16b, %[in].16b
                \\ aese  %[out].16b, %[zero].16b
                \\ aesmc %[out].16b, %[out].16b
-                : [out] "=&x" (-> BlockVec),
+                : [out] "=&x" (-> Repr),
                : [in] "x" (block.repr),
                  [zero] "x" (zero),
            )) ^ round_key.repr,
@ -49,7 +50,7 @@ pub const Block = struct {
            .repr = (asm (
                \\ mov   %[out].16b, %[in].16b
                \\ aese  %[out].16b, %[zero].16b
-                : [out] "=&x" (-> BlockVec),
+                : [out] "=&x" (-> Repr),
                : [in] "x" (block.repr),
                  [zero] "x" (zero),
            )) ^ round_key.repr,
@ -63,7 +64,7 @@ pub const Block = struct {
                \\ mov   %[out].16b, %[in].16b
                \\ aesd  %[out].16b, %[zero].16b
                \\ aesimc %[out].16b, %[out].16b
-                : [out] "=&x" (-> BlockVec),
+                : [out] "=&x" (-> Repr),
                : [in] "x" (block.repr),
                  [zero] "x" (zero),
            )) ^ inv_round_key.repr,
@ -76,7 +77,7 @@ pub const Block = struct {
            .repr = (asm (
                \\ mov   %[out].16b, %[in].16b
                \\ aesd  %[out].16b, %[zero].16b
-                : [out] "=&x" (-> BlockVec),
+                : [out] "=&x" (-> Repr),
                : [in] "x" (block.repr),
                  [zero] "x" (zero),
            )) ^ inv_round_key.repr,
@ -165,6 +166,118 @@ pub const Block = struct {
    };
 };

+/// A fixed-size vector of AES blocks.
+/// All operations are performed in parallel, using SIMD instructions when available.
+///
+/// `blocks_count` is the number of 16-byte AES blocks held by the vector.
+pub fn BlockVec(comptime blocks_count: comptime_int) type {
+    return struct {
+        const Self = @This();
+
+        /// The number of AES blocks the target architecture can process with a single instruction.
+        pub const native_vector_size = 1;
+
+        /// The size of the AES block vector that the target architecture can process with a single instruction, in bytes.
+        pub const native_word_size = native_vector_size * 16;
+
+        /// One native word per block, since `native_vector_size` is 1.
+        const native_words = blocks_count;
+
+        /// Internal representation of a block vector.
+        repr: [native_words]Block,
+
+        /// Length of the block vector in bytes.
+        pub const block_length: usize = blocks_count * 16;
+
+        /// Convert a byte sequence into an internal representation.
+        pub inline fn fromBytes(bytes: *const [blocks_count * 16]u8) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = Block.fromBytes(bytes[i * native_word_size ..][0..native_word_size]);
+            }
+            return out;
+        }
+
+        /// Convert the internal representation of a block vector into a byte sequence.
+        pub inline fn toBytes(block_vec: Self) [blocks_count * 16]u8 {
+            var out: [blocks_count * 16]u8 = undefined;
+            inline for (0..native_words) |i| {
+                out[i * native_word_size ..][0..native_word_size].* = block_vec.repr[i].toBytes();
+            }
+            return out;
+        }
+
+        /// XOR the block vector with a byte sequence.
+        /// Returns the result as bytes, with the same length as the block vector.
+        pub inline fn xorBytes(block_vec: Self, bytes: *const [blocks_count * 16]u8) [blocks_count * 16]u8 {
+            var out: [blocks_count * 16]u8 = undefined;
+            inline for (0..native_words) |i| {
+                // Load the matching 16-byte chunk as a block, XOR it, and store the bytes back.
+                const chunk = Block.fromBytes(bytes[i * native_word_size ..][0..native_word_size]);
+                out[i * native_word_size ..][0..native_word_size].* = block_vec.repr[i].xorBlocks(chunk).toBytes();
+            }
+            return out;
+        }
+
+        /// Apply the forward AES operation to the block vector with a vector of round keys.
+        pub inline fn encrypt(block_vec: Self, round_key_vec: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec.repr[i].encrypt(round_key_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the forward AES operation to the block vector with a vector of last round keys.
+        pub inline fn encryptLast(block_vec: Self, round_key_vec: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec.repr[i].encryptLast(round_key_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the inverse AES operation to the block vector with a vector of round keys.
+        pub inline fn decrypt(block_vec: Self, inv_round_key_vec: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec.repr[i].decrypt(inv_round_key_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the inverse AES operation to the block vector with a vector of last round keys.
+        pub inline fn decryptLast(block_vec: Self, inv_round_key_vec: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec.repr[i].decryptLast(inv_round_key_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the bitwise XOR operation to the content of two block vectors.
+        pub inline fn xorBlocks(block_vec1: Self, block_vec2: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i].xorBlocks(block_vec2.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the bitwise AND operation to the content of two block vectors.
+        pub inline fn andBlocks(block_vec1: Self, block_vec2: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i].andBlocks(block_vec2.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the bitwise OR operation to the content of two block vectors.
+        pub inline fn orBlocks(block_vec1: Self, block_vec2: Self) Self {
+            // Both operands are block vectors: each block of one is OR-ed with the
+            // matching block of the other, like xorBlocks and andBlocks.
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i].orBlocks(block_vec2.repr[i]);
+            }
+            return out;
+        }
+    };
+}
+
 fn KeySchedule(comptime Aes: type) type {
    std.debug.assert(Aes.rounds == 10 or Aes.rounds == 14);
    const rounds = Aes.rounds;
@ -172,17 +285,19 @@ fn KeySchedule(comptime Aes: type) type {
    return struct {
        const Self = @This();

+        const Repr = Aes.block.Repr;
+
        const zero = @Vector(2, u64){ 0, 0 };
        const mask1 = @Vector(16, u8){ 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12 };
        const mask2 = @Vector(16, u8){ 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 };

        round_keys: [rounds + 1]Block,

-        fn drc128(comptime rc: u8, t: BlockVec) BlockVec {
-            var v1: BlockVec = undefined;
-            var v2: BlockVec = undefined;
-            var v3: BlockVec = undefined;
-            var v4: BlockVec = undefined;
+        fn drc128(comptime rc: u8, t: Repr) Repr {
+            var v1: Repr = undefined;
+            var v2: Repr = undefined;
+            var v3: Repr = undefined;
+            var v4: Repr = undefined;

            return asm (
                \\ movi %[v2].4s, %[rc]
@ -196,7 +311,7 @@ fn KeySchedule(comptime Aes: type) type {
                \\ eor  %[v1].16b, %[v1].16b, %[r].16b
                \\ eor  %[r].16b, %[v1].16b, %[v3].16b
                \\ eor  %[r].16b, %[r].16b, %[v4].16b
-                : [r] "=&x" (-> BlockVec),
+                : [r] "=&x" (-> Repr),
                  [v1] "=&x" (v1),
                  [v2] "=&x" (v2),
                  [v3] "=&x" (v3),
@ -208,11 +323,11 @@ fn KeySchedule(comptime Aes: type) type {
            );
        }

-        fn drc256(comptime second: bool, comptime rc: u8, t: BlockVec, tx: BlockVec) BlockVec {
-            var v1: BlockVec = undefined;
-            var v2: BlockVec = undefined;
-            var v3: BlockVec = undefined;
-            var v4: BlockVec = undefined;
+        fn drc256(comptime second: bool, comptime rc: u8, t: Repr, tx: Repr) Repr {
+            var v1: Repr = undefined;
+            var v2: Repr = undefined;
+            var v3: Repr = undefined;
+            var v4: Repr = undefined;

            return asm (
                \\ movi %[v2].4s, %[rc]
@ -226,7 +341,7 @@ fn KeySchedule(comptime Aes: type) type {
                \\ eor  %[v1].16b, %[v1].16b, %[v2].16b
                \\ eor  %[v1].16b, %[v1].16b, %[v3].16b
                \\ eor  %[r].16b, %[v1].16b, %[v4].16b
-                : [r] "=&x" (-> BlockVec),
+                : [r] "=&x" (-> Repr),
                  [v1] "=&x" (v1),
                  [v2] "=&x" (v2),
                  [v3] "=&x" (v3),
@ -276,7 +391,7 @@ fn KeySchedule(comptime Aes: type) type {
                inv_round_keys[i] = Block{
                    .repr = asm (
                        \\ aesimc %[inv_rk].16b, %[rk].16b
-                        : [inv_rk] "=x" (-> BlockVec),
+                        : [inv_rk] "=x" (-> Repr),
                        : [rk] "x" (round_keys[rounds - i].repr),
                    ),
                };
--- a/lib/std/crypto/aes/soft.zig
+++ b/lib/std/crypto/aes/soft.zig
@ -2,16 +2,16 @@ const std = @import("../../std.zig");
 const math = std.math;
 const mem = std.mem;

-const BlockVec = [4]u32;
-
 const side_channels_mitigations = std.options.side_channels_mitigations;

 /// A single AES block.
 pub const Block = struct {
+    const Repr = [4]u32;
+
    pub const block_length: usize = 16;

    /// Internal representation of a block.
-    repr: BlockVec align(16),
+    repr: Repr align(16),

    /// Convert a byte sequence into an internal representation.
    pub inline fn fromBytes(bytes: *const [16]u8) Block {
@ -19,7 +19,7 @@ pub const Block = struct {
        const s1 = mem.readInt(u32, bytes[4..8], .little);
        const s2 = mem.readInt(u32, bytes[8..12], .little);
        const s3 = mem.readInt(u32, bytes[12..16], .little);
-        return Block{ .repr = BlockVec{ s0, s1, s2, s3 } };
+        return Block{ .repr = Repr{ s0, s1, s2, s3 } };
    }

    /// Convert the internal representation of a block into a byte sequence.
@ -65,7 +65,7 @@ pub const Block = struct {
        t2 ^= round_key.repr[2];
        t3 ^= round_key.repr[3];

-        return Block{ .repr = BlockVec{ t0, t1, t2, t3 } };
+        return Block{ .repr = Repr{ t0, t1, t2, t3 } };
    }

    /// Encrypt a block with a round key *WITHOUT ANY PROTECTION AGAINST SIDE CHANNELS*
@ -110,7 +110,7 @@ pub const Block = struct {
        t2 ^= round_key.repr[2];
        t3 ^= round_key.repr[3];

-        return Block{ .repr = BlockVec{ t0, t1, t2, t3 } };
+        return Block{ .repr = Repr{ t0, t1, t2, t3 } };
    }

    /// Encrypt a block with the last round key.
@ -136,7 +136,7 @@ pub const Block = struct {
        t2 ^= round_key.repr[2];
        t3 ^= round_key.repr[3];

-        return Block{ .repr = BlockVec{ t0, t1, t2, t3 } };
+        return Block{ .repr = Repr{ t0, t1, t2, t3 } };
    }

    /// Decrypt a block with a round key.
@ -161,7 +161,7 @@ pub const Block = struct {
        t2 ^= round_key.repr[2];
        t3 ^= round_key.repr[3];

-        return Block{ .repr = BlockVec{ t0, t1, t2, t3 } };
+        return Block{ .repr = Repr{ t0, t1, t2, t3 } };
    }

    /// Decrypt a block with a round key *WITHOUT ANY PROTECTION AGAINST SIDE CHANNELS*
@ -206,7 +206,7 @@ pub const Block = struct {
        t2 ^= round_key.repr[2];
        t3 ^= round_key.repr[3];

-        return Block{ .repr = BlockVec{ t0, t1, t2, t3 } };
+        return Block{ .repr = Repr{ t0, t1, t2, t3 } };
    }

    /// Decrypt a block with the last round key.
@ -232,12 +232,12 @@ pub const Block = struct {
        t2 ^= round_key.repr[2];
        t3 ^= round_key.repr[3];

-        return Block{ .repr = BlockVec{ t0, t1, t2, t3 } };
+        return Block{ .repr = Repr{ t0, t1, t2, t3 } };
    }

    /// Apply the bitwise XOR operation to the content of two blocks.
    pub inline fn xorBlocks(block1: Block, block2: Block) Block {
-        var x: BlockVec = undefined;
+        var x: Repr = undefined;
        comptime var i = 0;
        inline while (i < 4) : (i += 1) {
            x[i] = block1.repr[i] ^ block2.repr[i];
@ -247,7 +247,7 @@ pub const Block = struct {

    /// Apply the bitwise AND operation to the content of two blocks.
    pub inline fn andBlocks(block1: Block, block2: Block) Block {
-        var x: BlockVec = undefined;
+        var x: Repr = undefined;
        comptime var i = 0;
        inline while (i < 4) : (i += 1) {
            x[i] = block1.repr[i] & block2.repr[i];
@ -257,7 +257,7 @@ pub const Block = struct {

    /// Apply the bitwise OR operation to the content of two blocks.
    pub inline fn orBlocks(block1: Block, block2: Block) Block {
-        var x: BlockVec = undefined;
+        var x: Repr = undefined;
        comptime var i = 0;
        inline while (i < 4) : (i += 1) {
            x[i] = block1.repr[i] | block2.repr[i];
@ -332,6 +332,118 @@ pub const Block = struct {
    };
 };

+/// A fixed-size vector of AES blocks.
+/// All operations are performed in parallel, using SIMD instructions when available.
+///
+/// `blocks_count` is the number of 16-byte AES blocks held by the vector.
+pub fn BlockVec(comptime blocks_count: comptime_int) type {
+    return struct {
+        const Self = @This();
+
+        /// The number of AES blocks the target architecture can process with a single instruction.
+        pub const native_vector_size = 1;
+
+        /// The size of the AES block vector that the target architecture can process with a single instruction, in bytes.
+        pub const native_word_size = native_vector_size * 16;
+
+        /// One native word per block, since `native_vector_size` is 1.
+        const native_words = blocks_count;
+
+        /// Internal representation of a block vector.
+        repr: [native_words]Block,
+
+        /// Length of the block vector in bytes.
+        pub const block_length: usize = blocks_count * 16;
+
+        /// Convert a byte sequence into an internal representation.
+        pub inline fn fromBytes(bytes: *const [blocks_count * 16]u8) Self {
+            var out: Self = undefined;
+            for (0..native_words) |i| {
+                out.repr[i] = Block.fromBytes(bytes[i * native_word_size ..][0..native_word_size]);
+            }
+            return out;
+        }
+
+        /// Convert the internal representation of a block vector into a byte sequence.
+        pub inline fn toBytes(block_vec: Self) [blocks_count * 16]u8 {
+            var out: [blocks_count * 16]u8 = undefined;
+            for (0..native_words) |i| {
+                out[i * native_word_size ..][0..native_word_size].* = block_vec.repr[i].toBytes();
+            }
+            return out;
+        }
+
+        /// XOR the block vector with a byte sequence.
+        /// Returns the result as bytes, with the same length as the block vector.
+        pub inline fn xorBytes(block_vec: Self, bytes: *const [blocks_count * 16]u8) [blocks_count * 16]u8 {
+            var out: [blocks_count * 16]u8 = undefined;
+            for (0..native_words) |i| {
+                // Load the matching 16-byte chunk as a block, XOR it, and store the bytes back.
+                const chunk = Block.fromBytes(bytes[i * native_word_size ..][0..native_word_size]);
+                out[i * native_word_size ..][0..native_word_size].* = block_vec.repr[i].xorBlocks(chunk).toBytes();
+            }
+            return out;
+        }
+
+        /// Apply the forward AES operation to the block vector with a vector of round keys.
+        pub inline fn encrypt(block_vec: Self, round_key_vec: Self) Self {
+            var out: Self = undefined;
+            for (0..native_words) |i| {
+                out.repr[i] = block_vec.repr[i].encrypt(round_key_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the forward AES operation to the block vector with a vector of last round keys.
+        pub inline fn encryptLast(block_vec: Self, round_key_vec: Self) Self {
+            var out: Self = undefined;
+            for (0..native_words) |i| {
+                out.repr[i] = block_vec.repr[i].encryptLast(round_key_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the inverse AES operation to the block vector with a vector of round keys.
+        pub inline fn decrypt(block_vec: Self, inv_round_key_vec: Self) Self {
+            var out: Self = undefined;
+            for (0..native_words) |i| {
+                out.repr[i] = block_vec.repr[i].decrypt(inv_round_key_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the inverse AES operation to the block vector with a vector of last round keys.
+        pub inline fn decryptLast(block_vec: Self, inv_round_key_vec: Self) Self {
+            var out: Self = undefined;
+            for (0..native_words) |i| {
+                out.repr[i] = block_vec.repr[i].decryptLast(inv_round_key_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the bitwise XOR operation to the content of two block vectors.
+        pub inline fn xorBlocks(block_vec1: Self, block_vec2: Self) Self {
+            var out: Self = undefined;
+            for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i].xorBlocks(block_vec2.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the bitwise AND operation to the content of two block vectors.
+        pub inline fn andBlocks(block_vec1: Self, block_vec2: Self) Self {
+            var out: Self = undefined;
+            for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i].andBlocks(block_vec2.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the bitwise OR operation to the content of two block vectors.
+        pub inline fn orBlocks(block_vec1: Self, block_vec2: Self) Self {
+            // Both operands are block vectors: each block of one is OR-ed with the
+            // matching block of the other, like xorBlocks and andBlocks.
+            var out: Self = undefined;
+            for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i].orBlocks(block_vec2.repr[i]);
+            }
+            return out;
+        }
+    };
+}
+
 fn KeySchedule(comptime Aes: type) type {
    std.debug.assert(Aes.rounds == 10 or Aes.rounds == 14);
    const key_length = Aes.key_bits / 8;
@ -671,7 +783,7 @@ fn mul(a: u8, b: u8) u8 {

 const cache_line_bytes = std.atomic.cache_line;

-inline fn sbox_lookup(sbox: *align(64) const [256]u8, idx0: u8, idx1: u8, idx2: u8, idx3: u8) [4]u8 {
+fn sbox_lookup(sbox: *align(64) const [256]u8, idx0: u8, idx1: u8, idx2: u8, idx3: u8) [4]u8 {
    if (side_channels_mitigations == .none) {
        return [4]u8{
            sbox[idx0],
@ -709,7 +821,7 @@ inline fn sbox_lookup(sbox: *align(64) const [256]u8, idx0: u8, idx1: u8, idx2:
    }
 }

-inline fn table_lookup(table: *align(64) const [4][256]u32, idx0: u8, idx1: u8, idx2: u8, idx3: u8) [4]u32 {
+fn table_lookup(table: *align(64) const [4][256]u32, idx0: u8, idx1: u8, idx2: u8, idx3: u8) [4]u32 {
    if (side_channels_mitigations == .none) {
        return [4]u32{
            table[0][idx0],
@ -718,17 +830,18 @@ inline fn table_lookup(table: *align(64) const [4][256]u32, idx0: u8, idx1: u8,
            table[3][idx3],
        };
    } else {
+        const table_len: usize = 256;
        const stride = switch (side_channels_mitigations) {
            .none => unreachable,
-            .basic => table[0].len / 4,
-            .medium => @max(1, @min(table[0].len, 2 * cache_line_bytes / 4)),
-            .full => @max(1, @min(table[0].len, cache_line_bytes / 4)),
+            .basic => table_len / 4,
+            .medium => @max(1, @min(table_len, 2 * cache_line_bytes / 4)),
+            .full => @max(1, @min(table_len, cache_line_bytes / 4)),
        };
        const of0 = idx0 % stride;
        const of1 = idx1 % stride;
        const of2 = idx2 % stride;
        const of3 = idx3 % stride;
-        var t: [4][table[0].len / stride]u32 align(64) = undefined;
+        var t: [4][table_len / stride]u32 align(64) = undefined;
        var i: usize = 0;
        while (i < t[0].len) : (i += 1) {
            const tx = table[0][i * stride ..];
--- a/lib/std/crypto/benchmark.zig
+++ b/lib/std/crypto/benchmark.zig
@ -72,6 +72,10 @@ const macs = [_]Crypto{
    Crypto{ .ty = crypto.auth.siphash.SipHash64(1, 3), .name = "siphash-1-3" },
    Crypto{ .ty = crypto.auth.siphash.SipHash128(2, 4), .name = "siphash128-2-4" },
    Crypto{ .ty = crypto.auth.siphash.SipHash128(1, 3), .name = "siphash128-1-3" },
+    Crypto{ .ty = crypto.auth.aegis.Aegis128X4Mac, .name = "aegis-128x4 mac" },
+    Crypto{ .ty = crypto.auth.aegis.Aegis256X4Mac, .name = "aegis-256x4 mac" },
+    Crypto{ .ty = crypto.auth.aegis.Aegis128X2Mac, .name = "aegis-128x2 mac" },
+    Crypto{ .ty = crypto.auth.aegis.Aegis256X2Mac, .name = "aegis-256x2 mac" },
    Crypto{ .ty = crypto.auth.aegis.Aegis128LMac, .name = "aegis-128l mac" },
    Crypto{ .ty = crypto.auth.aegis.Aegis256Mac, .name = "aegis-256 mac" },
    Crypto{ .ty = crypto.auth.cmac.CmacAes128, .name = "aes-cmac" },
@ -283,7 +287,11 @@ const aeads = [_]Crypto{
    Crypto{ .ty = crypto.aead.chacha_poly.XChaCha20Poly1305, .name = "xchacha20Poly1305" },
    Crypto{ .ty = crypto.aead.chacha_poly.XChaCha8Poly1305, .name = "xchacha8Poly1305" },
    Crypto{ .ty = crypto.aead.salsa_poly.XSalsa20Poly1305, .name = "xsalsa20Poly1305" },
+    Crypto{ .ty = crypto.aead.aegis.Aegis128X4, .name = "aegis-128x4" },
+    Crypto{ .ty = crypto.aead.aegis.Aegis128X2, .name = "aegis-128x2" },
    Crypto{ .ty = crypto.aead.aegis.Aegis128L, .name = "aegis-128l" },
+    Crypto{ .ty = crypto.aead.aegis.Aegis256X4, .name = "aegis-256x4" },
+    Crypto{ .ty = crypto.aead.aegis.Aegis256X2, .name = "aegis-256x2" },
    Crypto{ .ty = crypto.aead.aegis.Aegis256, .name = "aegis-256" },
    Crypto{ .ty = crypto.aead.aes_gcm.Aes128Gcm, .name = "aes128-gcm" },
    Crypto{ .ty = crypto.aead.aes_gcm.Aes256Gcm, .name = "aes256-gcm" },