/* * Copyright (c) 2020, Ali Mohammad Pur * Copyright (c) 2023, Jelle Raaijmakers * * SPDX-License-Identifier: BSD-2-Clause */ #include #include #include #include #include #include #include #include namespace Crypto::Hash { static constexpr auto ROTATE_LEFT(u32 value, size_t bits) { return (value << bits) | (value >> (32 - bits)); } template<> void SHA1::transform_impl() { auto& data = m_data_buffer; u32 blocks[80]; for (size_t i = 0; i < 16; ++i) blocks[i] = AK::convert_between_host_and_network_endian(((u32 const*)data)[i]); // w[i] = (w[i-3] xor w[i-8] xor w[i-14] xor w[i-16]) leftrotate 1 for (size_t i = 16; i < Rounds; ++i) blocks[i] = ROTATE_LEFT(blocks[i - 3] ^ blocks[i - 8] ^ blocks[i - 14] ^ blocks[i - 16], 1); auto a = m_state[0], b = m_state[1], c = m_state[2], d = m_state[3], e = m_state[4]; u32 f, k; for (size_t i = 0; i < Rounds; ++i) { if (i <= 19) { f = (b & c) | ((~b) & d); k = SHA1Constants::RoundConstants[0]; } else if (i <= 39) { f = b ^ c ^ d; k = SHA1Constants::RoundConstants[1]; } else if (i <= 59) { f = (b & c) | (b & d) | (c & d); k = SHA1Constants::RoundConstants[2]; } else { f = b ^ c ^ d; k = SHA1Constants::RoundConstants[3]; } auto temp = ROTATE_LEFT(a, 5) + f + e + k + blocks[i]; e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = temp; } m_state[0] += a; m_state[1] += b; m_state[2] += c; m_state[3] += d; m_state[4] += e; // "security" measures, as if SHA1 is secure a = 0; b = 0; c = 0; d = 0; e = 0; secure_zero(blocks, 16 * sizeof(u32)); } // Note: The SHA extension was introduced with // Intel Goldmont (SSE4.2), Ice Lake (AVX512), Rocket Lake (AVX512), and AMD Zen (AVX2) // So it's safe to assume that if we have SHA we have at least SSE4.2 // ~https://en.wikipedia.org/wiki/Intel_SHA_extensions #if AK_CAN_CODEGEN_FOR_X86_SHA && AK_CAN_CODEGEN_FOR_X86_SSE42 template<> [[gnu::target("sha,sse4.2")]] void SHA1::transform_impl() { # define SHA_TARGET gnu::target("sha"), gnu::always_inline auto& state = m_state; auto& data = m_data_buffer; using AK::SIMD::u32x4, AK::SIMD::i32x4; // Note: These need to be unsigned, as we add to them and expect them to wrap around, // and signed overflow is UB; // BUT: GCC -for some reason- expects the input types to the SHA intrinsics to be signed // and there does not seem to be a way to temporarily enable -flax-vector-conversions // so this leads to a few bit_casts further down u32x4 abcd[2] {}; u32x4 msgs[4] {}; abcd[0] = AK::SIMD::load_unaligned(&state[0]); abcd[0] = AK::SIMD::item_reverse(abcd[0]); u32x4 e { 0, 0, 0, state[4] }; auto sha_msg1 = [] [[SHA_TARGET]] (u32x4 a, u32x4 b) { return bit_cast(__builtin_ia32_sha1msg1(bit_cast(a), bit_cast(b))); }; auto sha_msg2 = [] [[SHA_TARGET]] (u32x4 a, u32x4 b) { return bit_cast(__builtin_ia32_sha1msg2(bit_cast(a), bit_cast(b))); }; auto sha_next_e = [] [[SHA_TARGET]] (u32x4 a, u32x4 b) { return bit_cast(__builtin_ia32_sha1nexte(bit_cast(a), bit_cast(b))); }; auto sha_rnds4 = [] [[SHA_TARGET]] (u32x4 a, u32x4 b) { return bit_cast(__builtin_ia32_sha1rnds4(bit_cast(a), bit_cast(b), i)); }; auto group = [&] [[SHA_TARGET]] () { for (size_t i_pack = 0; i_pack != 5; ++i_pack) { size_t i_msg = i_group * 5 + i_pack; if (i_msg < 4) { msgs[i_msg] = AK::SIMD::load_unaligned(&data[i_msg * 16]); msgs[i_msg] = AK::SIMD::byte_reverse(msgs[i_msg]); } else { msgs[i_msg % 4] = sha_msg1(msgs[i_msg % 4], msgs[(i_msg + 1) % 4]); msgs[i_msg % 4] ^= msgs[(i_msg + 2) % 4]; msgs[i_msg % 4] = sha_msg2(msgs[i_msg % 4], msgs[(i_msg + 3) % 4]); } if (i_msg == 0) { e += msgs[0]; } else { e = sha_next_e(abcd[(i_msg + 1) % 2], msgs[(i_msg + 0) % 4]); } abcd[(i_msg + 1) % 2] = sha_rnds4.operator()(abcd[(i_msg + 0) % 2], e); } }; auto old_abcd = abcd[0]; auto old_e = e; group.operator()<0>(); group.operator()<1>(); group.operator()<2>(); group.operator()<3>(); e = sha_next_e(abcd[1], u32x4 {}); abcd[0] += old_abcd; e += old_e; abcd[0] = AK::SIMD::item_reverse(abcd[0]); AK::SIMD::store_unaligned(&state[0], abcd[0]); state[4] = e[3]; # undef SHA_TARGET } #endif decltype(SHA1::transform_dispatched) SHA1::transform_dispatched = [] { CPUFeatures features = detect_cpu_features(); if constexpr (is_valid_feature(CPUFeatures::X86_SHA | CPUFeatures::X86_SSE42)) { if (has_flag(features, CPUFeatures::X86_SHA | CPUFeatures::X86_SSE42)) return &SHA1::transform_impl; } return &SHA1::transform_impl; }(); void SHA1::update(u8 const* message, size_t length) { while (length > 0) { size_t copy_bytes = AK::min(length, BlockSize - m_data_length); __builtin_memcpy(m_data_buffer + m_data_length, message, copy_bytes); message += copy_bytes; length -= copy_bytes; m_data_length += copy_bytes; if (m_data_length == BlockSize) { transform(); m_bit_length += BlockSize * 8; m_data_length = 0; } } } SHA1::DigestType SHA1::digest() { auto digest = peek(); reset(); return digest; } SHA1::DigestType SHA1::peek() { DigestType digest; size_t i = m_data_length; // make a local copy of the data as we modify it u8 data[BlockSize]; u32 state[5]; __builtin_memcpy(data, m_data_buffer, m_data_length); __builtin_memcpy(state, m_state, 20); if (m_data_length < FinalBlockDataSize) { m_data_buffer[i++] = 0x80; while (i < FinalBlockDataSize) m_data_buffer[i++] = 0x00; } else { // First, complete a block with some padding. m_data_buffer[i++] = 0x80; while (i < BlockSize) m_data_buffer[i++] = 0x00; transform(); // Then start another block with BlockSize - 8 bytes of zeros __builtin_memset(m_data_buffer, 0, FinalBlockDataSize); } // append total message length m_bit_length += m_data_length * 8; m_data_buffer[BlockSize - 1] = m_bit_length; m_data_buffer[BlockSize - 2] = m_bit_length >> 8; m_data_buffer[BlockSize - 3] = m_bit_length >> 16; m_data_buffer[BlockSize - 4] = m_bit_length >> 24; m_data_buffer[BlockSize - 5] = m_bit_length >> 32; m_data_buffer[BlockSize - 6] = m_bit_length >> 40; m_data_buffer[BlockSize - 7] = m_bit_length >> 48; m_data_buffer[BlockSize - 8] = m_bit_length >> 56; transform(); for (i = 0; i < 4; ++i) { digest.data[i + 0] = (m_state[0] >> (24 - i * 8)) & 0x000000ff; digest.data[i + 4] = (m_state[1] >> (24 - i * 8)) & 0x000000ff; digest.data[i + 8] = (m_state[2] >> (24 - i * 8)) & 0x000000ff; digest.data[i + 12] = (m_state[3] >> (24 - i * 8)) & 0x000000ff; digest.data[i + 16] = (m_state[4] >> (24 - i * 8)) & 0x000000ff; } // restore the data __builtin_memcpy(m_data_buffer, data, m_data_length); __builtin_memcpy(m_state, state, 20); return digest; } }