Update BLAKE3 to latest master (0816badf3a) which is after 1.5.1.

2026-04-25 16:24:56 +02:00 · 2024-04-13 15:11:46 +08:00
parent 402aaf0e93
commit b577d1205c
9 changed files with 149 additions and 89 deletions
--- a/NanaZip.Codecs/BLAKE3/blake3.c
+++ b/NanaZip.Codecs/BLAKE3/blake3.c
@@ -246,7 +246,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,

 // The wide helper function returns (writes out) an array of chaining values
 // and returns the length of that array. The number of chaining values returned
-// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
+// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
 // if the input is shorter than that many chunks. The reason for maintaining a
 // wide array of chaining values going back up the tree, is to allow the
 // implementation to hash as many parents in parallel as possible.
@@ -254,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
 // As a special case when the SIMD degree is 1, this function will still return
 // at least 2 outputs. This guarantees that this function doesn't perform the
 // root compression. (If it did, it would use the wrong flags, and also we
-// wouldn't be able to implement exendable ouput.) Note that this function is
+// wouldn't be able to implement extendable output.) Note that this function is
 // not used when the whole input is only 1 chunk long; that's a different
 // codepath.
 //
@@ -341,21 +341,24 @@ INLINE void compress_subtree_to_parent_node(
  size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
                                                chunk_counter, flags, cv_array);
  assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
-
-  // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
+  // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
+  // as we just asserted, num_cvs will always be <=2 in that case. But GCC
+  // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is
+  // set then it emits incorrect warnings here. We tried a few different
+  // hacks to silence these, but in the end our hacks just produced different
+  // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of
+  // desperation, we ifdef out this entire loop when we know it's not needed.
+#if MAX_SIMD_DEGREE_OR_2 > 2
+  // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input,
  // compress_subtree_wide() returns more than 2 chaining values. Condense
  // them into 2 by forming parent nodes repeatedly.
  uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
-  // The second half of this loop condition is always true, and we just
-  // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
-  // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
-  // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
-  // this code, test it against that version.
-  while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
+  while (num_cvs > 2) {
    num_cvs =
        compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
    memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
  }
+#endif
  memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
 }

--- a/NanaZip.Codecs/BLAKE3/blake3.h
+++ b/NanaZip.Codecs/BLAKE3/blake3.h
@@ -4,11 +4,33 @@
 #include <stddef.h>
 #include <stdint.h>

+#if !defined(BLAKE3_API)
+# if defined(_WIN32) || defined(__CYGWIN__)
+#   if defined(BLAKE3_DLL)
+#     if defined(BLAKE3_DLL_EXPORTS)
+#       define BLAKE3_API __declspec(dllexport)
+#     else
+#       define BLAKE3_API __declspec(dllimport)
+#     endif
+#     define BLAKE3_PRIVATE
+#   else
+#     define BLAKE3_API
+#     define BLAKE3_PRIVATE
+#   endif
+# elif __GNUC__ >= 4
+#   define BLAKE3_API __attribute__((visibility("default")))
+#   define BLAKE3_PRIVATE __attribute__((visibility("hidden")))
+# else
+#   define BLAKE3_API
+#   define BLAKE3_PRIVATE
+# endif
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif

-#define BLAKE3_VERSION_STRING "1.3.1"
+#define BLAKE3_VERSION_STRING "1.5.1"
 #define BLAKE3_KEY_LEN 32
 #define BLAKE3_OUT_LEN 32
 #define BLAKE3_BLOCK_LEN 64
@@ -38,20 +60,20 @@ typedef struct {
  uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
 } blake3_hasher;

-const char *blake3_version(void);
-void blake3_hasher_init(blake3_hasher *self);
-void blake3_hasher_init_keyed(blake3_hasher *self,
-                              const uint8_t key[BLAKE3_KEY_LEN]);
-void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
-void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
-                                       size_t context_len);
-void blake3_hasher_update(blake3_hasher *self, const void *input,
-                          size_t input_len);
-void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
-                            size_t out_len);
-void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
-                                 uint8_t *out, size_t out_len);
-void blake3_hasher_reset(blake3_hasher *self);
+BLAKE3_API const char *blake3_version(void);
+BLAKE3_API void blake3_hasher_init(blake3_hasher *self);
+BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self,
+                                         const uint8_t key[BLAKE3_KEY_LEN]);
+BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
+BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
+                                                  size_t context_len);
+BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input,
+                                     size_t input_len);
+BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
+                                       size_t out_len);
+BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
+                                            uint8_t *out, size_t out_len);
+BLAKE3_API void blake3_hasher_reset(blake3_hasher *self);

 #ifdef __cplusplus
 }
--- a/NanaZip.Codecs/BLAKE3/blake3_avx2.c
+++ b/NanaZip.Codecs/BLAKE3/blake3_avx2.c
@@ -1,8 +1,5 @@
 #include "blake3_impl.h"

-#if defined(IS_X86)
-#if !defined(BLAKE3_NO_AVX2)
-
 #include <immintrin.h>

 #define DEGREE 8
@@ -170,7 +167,7 @@ INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);

-  // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
+  // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
  // 11/33.
  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
@@ -327,6 +324,3 @@ void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
                            out);
 #endif
 }
-
-#endif
-#endif
--- a/NanaZip.Codecs/BLAKE3/blake3_avx512.c
+++ b/NanaZip.Codecs/BLAKE3/blake3_avx512.c
@@ -1,8 +1,5 @@
 #include "blake3_impl.h"

-#if defined(IS_X86)
-#if !defined(BLAKE3_NO_AVX512)
-
 #include <immintrin.h>

 #define _mm_shuffle_ps2(a, b, c)                                               \
@@ -432,7 +429,7 @@ INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
 }

 INLINE void transpose_vecs_128(__m128i vecs[4]) {
-  // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
  // 22/33. Note that this doesn't split the vector into two lanes, as the
  // AVX2 counterparts do.
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@@ -687,7 +684,7 @@ INLINE void transpose_vecs_256(__m256i vecs[8]) {
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);

-  // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
+  // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
  // 11/33.
  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
@@ -962,7 +959,7 @@ INLINE void transpose_vecs_512(__m512i vecs[16]) {
  __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
  __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);

-  // Interleave 64-bit lates. The _0 unpack is lanes
+  // Interleave 64-bit lanes. The _0 unpack is lanes
  // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
  // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
  // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
@@ -1050,13 +1047,26 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
 INLINE void load_counters16(uint64_t counter, bool increment_counter,
                            __m512i *out_lo, __m512i *out_hi) {
  const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
-  const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  const __m512i add1 = _mm512_and_si512(mask, add0);
-  __m512i l = _mm512_add_epi32(_mm512_set1_epi32((int32_t)counter), add1);
-  __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT);
-  __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32((int32_t)(counter >> 32)), carry, _mm512_set1_epi32((int32_t)(counter >> 32)), _mm512_set1_epi32(1));
-  *out_lo = l;
-  *out_hi = h;
+  const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
+  const __m512i low_words = _mm512_add_epi32(
+    _mm512_set1_epi32((int32_t)counter),
+    masked_deltas);
+  // The carry bit is 1 if the high bit of the word was 1 before addition and is
+  // 0 after.
+  // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
+  // compute the carry bits here, and originally we did, but that intrinsic is
+  // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
+  const __m512i carries = _mm512_srli_epi32(
+    _mm512_andnot_si512(
+        low_words, // 0 after (gets inverted by andnot)
+        _mm512_set1_epi32((int32_t)counter)), // and 1 before
+    31);
+  const __m512i high_words = _mm512_add_epi32(
+    _mm512_set1_epi32((int32_t)(counter >> 32)),
+    carries);
+  *out_lo = low_words;
+  *out_hi = high_words;
 }

 static
@@ -1208,6 +1218,3 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
    out = &out[BLAKE3_OUT_LEN];
  }
 }
-
-#endif
-#endif
--- a/NanaZip.Codecs/BLAKE3/blake3_dispatch.c
+++ b/NanaZip.Codecs/BLAKE3/blake3_dispatch.c
@@ -4,20 +4,50 @@

 #include "blake3_impl.h"

+#if defined(_MSC_VER)
+#include <Windows.h>
+#endif
+
 #if defined(IS_X86)
 #if defined(_MSC_VER)
 #include <intrin.h>
 #elif defined(__GNUC__)
 #include <immintrin.h>
 #else
-#error "Unimplemented!"
+#undef IS_X86 /* Unimplemented! */
 #endif
 #endif

+#if !defined(BLAKE3_ATOMICS)
+#if defined(__has_include)
+#if __has_include(<stdatomic.h>) && !defined(_MSC_VER)
+#define BLAKE3_ATOMICS 1
+#else
+#define BLAKE3_ATOMICS 0
+#endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */
+#else
+#define BLAKE3_ATOMICS 0
+#endif /* defined(__has_include) */
+#endif /* BLAKE3_ATOMICS */
+
+#if BLAKE3_ATOMICS
+#define ATOMIC_INT _Atomic int
+#define ATOMIC_LOAD(x) x
+#define ATOMIC_STORE(x, y) x = y
+#elif defined(_MSC_VER)
+#define ATOMIC_INT LONG
+#define ATOMIC_LOAD(x) InterlockedOr(&x, 0)
+#define ATOMIC_STORE(x, y) InterlockedExchange(&x, y)
+#else
+#define ATOMIC_INT int
+#define ATOMIC_LOAD(x) x
+#define ATOMIC_STORE(x, y) x = y
+#endif
+
 #define MAYBE_UNUSED(x) (void)((x))

 #if defined(IS_X86)
-static uint64_t xgetbv() {
+static uint64_t xgetbv(void) {
 #if defined(_MSC_VER)
  return _xgetbv(0);
 #else
@@ -76,22 +106,24 @@ enum cpu_feature {
 #if !defined(BLAKE3_TESTING)
 static /* Allow the variable to be controlled manually for testing */
 #endif
-    enum cpu_feature g_cpu_features = UNDEFINED;
+    ATOMIC_INT g_cpu_features = UNDEFINED;

 #if !defined(BLAKE3_TESTING)
 static
 #endif
    enum cpu_feature
-    get_cpu_features() {
+    get_cpu_features(void) {

-  if (g_cpu_features != UNDEFINED) {
-    return g_cpu_features;
+  /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
+  enum cpu_feature features = ATOMIC_LOAD(g_cpu_features);
+  if (features != UNDEFINED) {
+    return features;
  } else {
 #if defined(IS_X86)
    uint32_t regs[4] = {0};
    uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
    (void)edx;
-    enum cpu_feature features = 0;
+    features = 0;
    cpuid(regs, 0);
    const int max_id = *eax;
    cpuid(regs, 1);
@@ -101,7 +133,7 @@ static
    if (*edx & (1UL << 26))
      features |= SSE2;
 #endif
-    if (*ecx & (1UL << 0))
+    if (*ecx & (1UL << 9))
      features |= SSSE3;
    if (*ecx & (1UL << 19))
      features |= SSE41;
@@ -124,7 +156,7 @@ static
        }
      }
    }
-    g_cpu_features = features;
+    ATOMIC_STORE(g_cpu_features, features);
    return features;
 #else
    /* How to detect NEON? */
--- a/NanaZip.Codecs/BLAKE3/blake3_impl.h
+++ b/NanaZip.Codecs/BLAKE3/blake3_impl.h
@@ -28,7 +28,7 @@ enum blake3_flags {
 #define INLINE static inline __attribute__((always_inline))
 #endif

-#if defined(__x86_64__) || defined(_M_X64) 
+#if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC)
 #define IS_X86
 #define IS_X86_64
 #endif
@@ -38,7 +38,7 @@ enum blake3_flags {
 #define IS_X86_32
 #endif

-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
 #define IS_AARCH64
 #endif

@@ -46,13 +46,16 @@ enum blake3_flags {
 #if defined(_MSC_VER)
 #include <intrin.h>
 #endif
-#include <immintrin.h>
 #endif

 #if !defined(BLAKE3_USE_NEON) 
  // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
  #if defined(IS_AARCH64)
-    #define BLAKE3_USE_NEON 1
+    #if defined(__ARM_BIG_ENDIAN)
+      #define BLAKE3_USE_NEON 0
+    #else
+      #define BLAKE3_USE_NEON 1
+    #endif
  #else
    #define BLAKE3_USE_NEON 0
  #endif
@@ -88,7 +91,7 @@ static const uint8_t MSG_SCHEDULE[7][16] = {
 /* x is assumed to be nonzero.       */
 static unsigned int highest_one(uint64_t x) {
 #if defined(__GNUC__) || defined(__clang__)
-  return 63 ^ __builtin_clzll(x);
+  return 63 ^ (unsigned int)__builtin_clzll(x);
 #elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long index;
  _BitScanReverse64(&index, x);
@@ -118,7 +121,7 @@ static unsigned int highest_one(uint64_t x) {
 // Count the number of 1 bits.
 INLINE unsigned int popcnt(uint64_t x) {
 #if defined(__GNUC__) || defined(__clang__)
-  return __builtin_popcountll(x);
+  return (unsigned int)__builtin_popcountll(x);
 #else
  unsigned int count = 0;
  while (x != 0) {
--- a/NanaZip.Codecs/BLAKE3/blake3_neon.c
+++ b/NanaZip.Codecs/BLAKE3/blake3_neon.c
@@ -1,7 +1,5 @@
 #include "blake3_impl.h"

-#if BLAKE3_USE_NEON == 1
-
 #include <arm_neon.h>

 #ifdef __ARM_BIG_ENDIAN
@@ -12,14 +10,12 @@

 INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
  // vld1q_u32 has alignment requirements. Don't use it.
-  uint32x4_t x;
-  memcpy(&x, src, 16);
-  return x;
+  return vreinterpretq_u32_u8(vld1q_u8(src));
 }

 INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
  // vst1q_u32 has alignment requirements. Don't use it.
-  memcpy(dest, &src, 16);
+  vst1q_u8(dest, vreinterpretq_u8_u32(src));
 }

 INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
@@ -38,19 +34,36 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
 }

 INLINE uint32x4_t rot16_128(uint32x4_t x) {
-  return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
+  // The straightfoward implementation would be two shifts and an or, but that's
+  // slower on microarchitectures we've tested. See
+  // https://github.com/BLAKE3-team/BLAKE3/pull/319.
+  // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
+  return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
 }

 INLINE uint32x4_t rot12_128(uint32x4_t x) {
-  return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
+  // See comment in rot16_128.
+  // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
+  return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12);
 }

 INLINE uint32x4_t rot8_128(uint32x4_t x) {
-  return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
+  // See comment in rot16_128.
+  // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
+#if defined(__clang__)
+  return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12));
+#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700
+  static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12};
+  return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
+#else 
+  return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8);
+#endif
 }

 INLINE uint32x4_t rot7_128(uint32x4_t x) {
-  return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
+  // See comment in rot16_128.
+  // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
+  return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7);
 }

 // TODO: compress_neon
@@ -351,5 +364,3 @@ void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
    out = &out[BLAKE3_OUT_LEN];
  }
 }
-
-#endif
--- a/NanaZip.Codecs/BLAKE3/blake3_sse2.c
+++ b/NanaZip.Codecs/BLAKE3/blake3_sse2.c
@@ -1,8 +1,5 @@
 #include "blake3_impl.h"

-#if defined(IS_X86)
-#if !defined(BLAKE3_NO_SSE2)
-
 #include <immintrin.h>

 #define DEGREE 4
@@ -399,7 +396,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
 }

 INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
-  // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
  // 22/33. Note that this doesn't split the vector into two lanes, as the
  // AVX2 counterparts do.
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@@ -567,6 +564,3 @@ void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
    out = &out[BLAKE3_OUT_LEN];
  }
 }
-
-#endif
-#endif
--- a/NanaZip.Codecs/BLAKE3/blake3_sse41.c
+++ b/NanaZip.Codecs/BLAKE3/blake3_sse41.c
@@ -1,8 +1,5 @@
 #include "blake3_impl.h"

-#if defined(IS_X86)
-#if !defined(BLAKE3_NO_SSE41)
-
 #include <immintrin.h>

 #define DEGREE 4
@@ -393,7 +390,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
 }

 INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
-  // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
  // 22/33. Note that this doesn't split the vector into two lanes, as the
  // AVX2 counterparts do.
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@@ -561,6 +558,3 @@ void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
    out = &out[BLAKE3_OUT_LEN];
  }
 }
-
-#endif
-#endif