.NET-Community-Toolkit/CommunityToolkit.HighPerfor.../Helpers/Internals/SpanHelper.Hash.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Numerics;
using System.Runtime.CompilerServices;

namespace CommunityToolkit.HighPerformance.Helpers.Internals;

/// <inheritdoc/>
internal static partial class SpanHelper
{
/// <summary>
/// Calculates the djb2 hash for the target sequence of items of a given type.
/// </summary>
/// <typeparam name="T">The type of items to hash.</typeparam>
/// <param name="r0">The reference to the target memory area to hash.</param>
/// <param name="length">The number of items to hash.</param>
/// <returns>The djb2 hash for the input sequence of items.</returns>
public static int GetDjb2HashCode<T>(ref T r0, nint length)
where T : notnull
{
int hash = 5381;
nint offset = 0;
while (length >= 8)
{
// Doing a left shift by 5 and adding is equivalent to multiplying by 33.
// This is preferred for performance reasons, as when working with integer
// values most CPUs have higher latency for multiplication operations
// compared to a simple shift and add. For more info on this, see the
// details for imul, shl, add: https://gmplib.org/~tege/x86-timing.pdf.
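// As a quick worked example: hash * 33 == (hash * 32) + hash == (hash << 5) + hash,
// so the shift-and-add form below performs exactly the same djb2 update step.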
hash = unchecked(((hash << 5) + hash) ^ Unsafe.Add(ref r0, offset + 0).GetHashCode());
hash = unchecked(((hash << 5) + hash) ^ Unsafe.Add(ref r0, offset + 1).GetHashCode());
hash = unchecked(((hash << 5) + hash) ^ Unsafe.Add(ref r0, offset + 2).GetHashCode());
hash = unchecked(((hash << 5) + hash) ^ Unsafe.Add(ref r0, offset + 3).GetHashCode());
hash = unchecked(((hash << 5) + hash) ^ Unsafe.Add(ref r0, offset + 4).GetHashCode());
hash = unchecked(((hash << 5) + hash) ^ Unsafe.Add(ref r0, offset + 5).GetHashCode());
hash = unchecked(((hash << 5) + hash) ^ Unsafe.Add(ref r0, offset + 6).GetHashCode());
hash = unchecked(((hash << 5) + hash) ^ Unsafe.Add(ref r0, offset + 7).GetHashCode());
length -= 8;
offset += 8;
}
if (length >= 4)
{
hash = unchecked(((hash << 5) + hash) ^ Unsafe.Add(ref r0, offset + 0).GetHashCode());
hash = unchecked(((hash << 5) + hash) ^ Unsafe.Add(ref r0, offset + 1).GetHashCode());
hash = unchecked(((hash << 5) + hash) ^ Unsafe.Add(ref r0, offset + 2).GetHashCode());
hash = unchecked(((hash << 5) + hash) ^ Unsafe.Add(ref r0, offset + 3).GetHashCode());
length -= 4;
offset += 4;
}
while (length > 0)
{
hash = unchecked(((hash << 5) + hash) ^ Unsafe.Add(ref r0, offset).GetHashCode());
length -= 1;
offset += 1;
}
return hash;
}
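
// Illustrative usage sketch (not part of the original file): a caller could obtain the
// starting reference from a span via System.Runtime.InteropServices.MemoryMarshal, e.g.:
//
//   ReadOnlySpan<int> data = stackalloc int[] { 1, 2, 3, 4 };
//   ref int r0 = ref MemoryMarshal.GetReference(data);
//   int hash = SpanHelper.GetDjb2HashCode(ref r0, (nint)data.Length);
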
/// <summary>
/// Gets a content hash from a given memory area.
/// </summary>
/// <param name="r0">A <see cref="byte"/> reference to the start of the memory area.</param>
/// <param name="length">The size in bytes of the memory area.</param>
/// <returns>The hash code for the contents of the source memory area.</returns>
/// <remarks>
/// While this method is similar to <see cref="GetDjb2HashCode{T}"/> and can in some cases
/// produce the same output for a given memory area, it is not guaranteed to always be that way.
/// This is because this method can use SIMD instructions when available, which can cause the computed
/// hash for the same data to differ across machines with different CPU features.
/// The advantage of this method is that when SIMD instructions are available, it performs much
/// faster than <see cref="GetDjb2HashCode{T}"/>, as it can parallelize much of the workload.
/// </remarks>
public static unsafe int GetDjb2LikeByteHash(ref byte r0, nint length)
{
int hash = 5381;
nint offset = 0;
// Check whether SIMD instructions are supported, and also check
// whether we have enough data to perform at least one unrolled
// iteration of the vectorized path. This heuristic balances the
// overhead of loading the constant values into the two registers
// against the final loop needed to combine the partial hash values.
// Note that even when we use the vectorized path we don't need to do
// any preprocessing to try to get memory aligned, as that would cause
// the hash codes to potentially be different for the same data.
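// As a concrete example (assuming 256-bit vectors, i.e. AVX2), Vector<byte>.Count is 32,
// so the vectorized path is only taken when there are at least 32 << 3 == 256 bytes.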
if (Vector.IsHardwareAccelerated &&
length >= (Vector<byte>.Count << 3))
{
Vector<int> vh = new(5381);
Vector<int> v33 = new(33);
// First vectorized loop, with 8 unrolled iterations.
// Assuming 256-bit registers (AVX2), a total of 256 bytes are processed
// per iteration, with the partial hashes being accumulated for later use.
while (length >= (Vector<byte>.Count << 3))
{
ref byte ri0 = ref Unsafe.Add(ref r0, offset + (Vector<byte>.Count * 0));
Vector<int> vi0 = Unsafe.ReadUnaligned<Vector<int>>(ref ri0);
Vector<int> vp0 = Vector.Multiply(vh, v33);
vh = Vector.Xor(vp0, vi0);
ref byte ri1 = ref Unsafe.Add(ref r0, offset + (Vector<byte>.Count * 1));
Vector<int> vi1 = Unsafe.ReadUnaligned<Vector<int>>(ref ri1);
Vector<int> vp1 = Vector.Multiply(vh, v33);
vh = Vector.Xor(vp1, vi1);
ref byte ri2 = ref Unsafe.Add(ref r0, offset + (Vector<byte>.Count * 2));
Vector<int> vi2 = Unsafe.ReadUnaligned<Vector<int>>(ref ri2);
Vector<int> vp2 = Vector.Multiply(vh, v33);
vh = Vector.Xor(vp2, vi2);
ref byte ri3 = ref Unsafe.Add(ref r0, offset + (Vector<byte>.Count * 3));
Vector<int> vi3 = Unsafe.ReadUnaligned<Vector<int>>(ref ri3);
Vector<int> vp3 = Vector.Multiply(vh, v33);
vh = Vector.Xor(vp3, vi3);
ref byte ri4 = ref Unsafe.Add(ref r0, offset + (Vector<byte>.Count * 4));
Vector<int> vi4 = Unsafe.ReadUnaligned<Vector<int>>(ref ri4);
Vector<int> vp4 = Vector.Multiply(vh, v33);
vh = Vector.Xor(vp4, vi4);
ref byte ri5 = ref Unsafe.Add(ref r0, offset + (Vector<byte>.Count * 5));
Vector<int> vi5 = Unsafe.ReadUnaligned<Vector<int>>(ref ri5);
Vector<int> vp5 = Vector.Multiply(vh, v33);
vh = Vector.Xor(vp5, vi5);
ref byte ri6 = ref Unsafe.Add(ref r0, offset + (Vector<byte>.Count * 6));
Vector<int> vi6 = Unsafe.ReadUnaligned<Vector<int>>(ref ri6);
Vector<int> vp6 = Vector.Multiply(vh, v33);
vh = Vector.Xor(vp6, vi6);
ref byte ri7 = ref Unsafe.Add(ref r0, offset + (Vector<byte>.Count * 7));
Vector<int> vi7 = Unsafe.ReadUnaligned<Vector<int>>(ref ri7);
Vector<int> vp7 = Vector.Multiply(vh, v33);
vh = Vector.Xor(vp7, vi7);
length -= Vector<byte>.Count << 3;
offset += Vector<byte>.Count << 3;
}
// When this loop is reached, there are up to 255 bytes left (on AVX2).
// Each iteration processes an additional 32 bytes and accumulates the results.
while (length >= Vector<byte>.Count)
{
ref byte ri = ref Unsafe.Add(ref r0, offset);
Vector<int> vi = Unsafe.ReadUnaligned<Vector<int>>(ref ri);
Vector<int> vp = Vector.Multiply(vh, v33);
vh = Vector.Xor(vp, vi);
length -= Vector<byte>.Count;
offset += Vector<byte>.Count;
}
// Combine the partial hash values in each position.
// The loop below should automatically be unrolled by the JIT.
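// Each of the Vector<int>.Count lanes holds an independent partial hash, so the number
// of lanes (and hence the final value) depends on the vector width of the running CPU.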
for (int j = 0; j < Vector<int>.Count; j++)
{
hash = unchecked(((hash << 5) + hash) ^ vh[j]);
}
}
else
{
// Only use the loop working with 64-bit values if we are on a
// 64-bit processor, otherwise those operations would be much slower.
// Each unrolled iteration processes 64 bytes.
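// Each 64-bit read is folded into the 32-bit hash by XOR-ing its
// low and high halves together with the running hash value.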
if (sizeof(nint) == sizeof(ulong))
{
while (length >= (sizeof(ulong) << 3))
{
ref byte ri0 = ref Unsafe.Add(ref r0, offset + (sizeof(ulong) * 0));
ulong value0 = Unsafe.ReadUnaligned<ulong>(ref ri0);
hash = unchecked(((hash << 5) + hash) ^ (int)value0 ^ (int)(value0 >> 32));
ref byte ri1 = ref Unsafe.Add(ref r0, offset + (sizeof(ulong) * 1));
ulong value1 = Unsafe.ReadUnaligned<ulong>(ref ri1);
hash = unchecked(((hash << 5) + hash) ^ (int)value1 ^ (int)(value1 >> 32));
ref byte ri2 = ref Unsafe.Add(ref r0, offset + (sizeof(ulong) * 2));
ulong value2 = Unsafe.ReadUnaligned<ulong>(ref ri2);
hash = unchecked(((hash << 5) + hash) ^ (int)value2 ^ (int)(value2 >> 32));
ref byte ri3 = ref Unsafe.Add(ref r0, offset + (sizeof(ulong) * 3));
ulong value3 = Unsafe.ReadUnaligned<ulong>(ref ri3);
hash = unchecked(((hash << 5) + hash) ^ (int)value3 ^ (int)(value3 >> 32));
ref byte ri4 = ref Unsafe.Add(ref r0, offset + (sizeof(ulong) * 4));
ulong value4 = Unsafe.ReadUnaligned<ulong>(ref ri4);
hash = unchecked(((hash << 5) + hash) ^ (int)value4 ^ (int)(value4 >> 32));
ref byte ri5 = ref Unsafe.Add(ref r0, offset + (sizeof(ulong) * 5));
ulong value5 = Unsafe.ReadUnaligned<ulong>(ref ri5);
hash = unchecked(((hash << 5) + hash) ^ (int)value5 ^ (int)(value5 >> 32));
ref byte ri6 = ref Unsafe.Add(ref r0, offset + (sizeof(ulong) * 6));
ulong value6 = Unsafe.ReadUnaligned<ulong>(ref ri6);
hash = unchecked(((hash << 5) + hash) ^ (int)value6 ^ (int)(value6 >> 32));
ref byte ri7 = ref Unsafe.Add(ref r0, offset + (sizeof(ulong) * 7));
ulong value7 = Unsafe.ReadUnaligned<ulong>(ref ri7);
hash = unchecked(((hash << 5) + hash) ^ (int)value7 ^ (int)(value7 >> 32));
length -= sizeof(ulong) << 3;
offset += sizeof(ulong) << 3;
}
}
// Each unrolled iteration processes 32 bytes.
while (length >= (sizeof(uint) << 3))
{
ref byte ri0 = ref Unsafe.Add(ref r0, offset + (sizeof(uint) * 0));
uint value0 = Unsafe.ReadUnaligned<uint>(ref ri0);
hash = unchecked(((hash << 5) + hash) ^ (int)value0);
ref byte ri1 = ref Unsafe.Add(ref r0, offset + (sizeof(uint) * 1));
uint value1 = Unsafe.ReadUnaligned<uint>(ref ri1);
hash = unchecked(((hash << 5) + hash) ^ (int)value1);
ref byte ri2 = ref Unsafe.Add(ref r0, offset + (sizeof(uint) * 2));
uint value2 = Unsafe.ReadUnaligned<uint>(ref ri2);
hash = unchecked(((hash << 5) + hash) ^ (int)value2);
ref byte ri3 = ref Unsafe.Add(ref r0, offset + (sizeof(uint) * 3));
uint value3 = Unsafe.ReadUnaligned<uint>(ref ri3);
hash = unchecked(((hash << 5) + hash) ^ (int)value3);
ref byte ri4 = ref Unsafe.Add(ref r0, offset + (sizeof(uint) * 4));
uint value4 = Unsafe.ReadUnaligned<uint>(ref ri4);
hash = unchecked(((hash << 5) + hash) ^ (int)value4);
ref byte ri5 = ref Unsafe.Add(ref r0, offset + (sizeof(uint) * 5));
uint value5 = Unsafe.ReadUnaligned<uint>(ref ri5);
hash = unchecked(((hash << 5) + hash) ^ (int)value5);
ref byte ri6 = ref Unsafe.Add(ref r0, offset + (sizeof(uint) * 6));
uint value6 = Unsafe.ReadUnaligned<uint>(ref ri6);
hash = unchecked(((hash << 5) + hash) ^ (int)value6);
ref byte ri7 = ref Unsafe.Add(ref r0, offset + (sizeof(uint) * 7));
uint value7 = Unsafe.ReadUnaligned<uint>(ref ri7);
hash = unchecked(((hash << 5) + hash) ^ (int)value7);
length -= sizeof(uint) << 3;
offset += sizeof(uint) << 3;
}
}
// At this point (assuming AVX2), there will be up to 31 bytes
// left, both for the vectorized and non vectorized paths.
// That number would go up to 63 on AVX512 systems, in which case it is
// still useful to perform this last loop unrolling.
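// With sizeof(ushort) being 2, this block runs when at least 2 << 3 == 16 bytes remain,
// consuming 16 bytes in a single unrolled pass before the final per-byte loop.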
if (length >= (sizeof(ushort) << 3))
{
ref byte ri0 = ref Unsafe.Add(ref r0, offset + (sizeof(ushort) * 0));
ushort value0 = Unsafe.ReadUnaligned<ushort>(ref ri0);
hash = unchecked(((hash << 5) + hash) ^ value0);
ref byte ri1 = ref Unsafe.Add(ref r0, offset + (sizeof(ushort) * 1));
ushort value1 = Unsafe.ReadUnaligned<ushort>(ref ri1);
hash = unchecked(((hash << 5) + hash) ^ value1);
ref byte ri2 = ref Unsafe.Add(ref r0, offset + (sizeof(ushort) * 2));
ushort value2 = Unsafe.ReadUnaligned<ushort>(ref ri2);
hash = unchecked(((hash << 5) + hash) ^ value2);
ref byte ri3 = ref Unsafe.Add(ref r0, offset + (sizeof(ushort) * 3));
ushort value3 = Unsafe.ReadUnaligned<ushort>(ref ri3);
hash = unchecked(((hash << 5) + hash) ^ value3);
ref byte ri4 = ref Unsafe.Add(ref r0, offset + (sizeof(ushort) * 4));
ushort value4 = Unsafe.ReadUnaligned<ushort>(ref ri4);
hash = unchecked(((hash << 5) + hash) ^ value4);
ref byte ri5 = ref Unsafe.Add(ref r0, offset + (sizeof(ushort) * 5));
ushort value5 = Unsafe.ReadUnaligned<ushort>(ref ri5);
hash = unchecked(((hash << 5) + hash) ^ value5);
ref byte ri6 = ref Unsafe.Add(ref r0, offset + (sizeof(ushort) * 6));
ushort value6 = Unsafe.ReadUnaligned<ushort>(ref ri6);
hash = unchecked(((hash << 5) + hash) ^ value6);
ref byte ri7 = ref Unsafe.Add(ref r0, offset + (sizeof(ushort) * 7));
ushort value7 = Unsafe.ReadUnaligned<ushort>(ref ri7);
hash = unchecked(((hash << 5) + hash) ^ value7);
length -= sizeof(ushort) << 3;
offset += sizeof(ushort) << 3;
}
// Handle the leftover items
while (length > 0)
{
hash = unchecked(((hash << 5) + hash) ^ Unsafe.Add(ref r0, offset));
length -= 1;
offset += 1;
}
return hash;
}
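
// Illustrative usage sketch (not part of the original file): hashing a raw byte buffer,
// assuming MemoryMarshal.GetArrayDataReference is available (.NET 5+), e.g.:
//
//   byte[] buffer = GetSomeData(); // hypothetical source of bytes
//   ref byte r0 = ref MemoryMarshal.GetArrayDataReference(buffer);
//   int hash = SpanHelper.GetDjb2LikeByteHash(ref r0, (nint)buffer.Length);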
}