Skip to content

Commit

Permalink
MergeMultiple method for merging more than two TDigests at once
Browse files Browse the repository at this point in the history
  • Loading branch information
ASolomatin committed Sep 7, 2023
1 parent 5d9a277 commit 4a060b6
Show file tree
Hide file tree
Showing 3 changed files with 188 additions and 3 deletions.
3 changes: 0 additions & 3 deletions src/TDigestNet.Tests/TestMerge.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ public class TestMerge : TestBase
private readonly List<double> _actual = new();
private readonly TDigest _digestA = new();
private readonly TDigest _digestB = new();
private readonly TDigest _digestAll = new();
private readonly TDigest _merged;

public TestMerge()
Expand All @@ -15,15 +14,13 @@ public TestMerge()
{
var n = (_rand.Next() % 50) + (_rand.Next() % 50);
_digestA.Add(n);
_digestAll.Add(n);
_actual.Add(n);
}

for (int i = 0; i < 10000; i++)
{
var n = (_rand.Next() % 100) + (_rand.Next() % 100);
_digestB.Add(n);
_digestAll.Add(n);
_actual.Add(n);
}

Expand Down
62 changes: 62 additions & 0 deletions src/TDigestNet.Tests/TestMergeMultiple.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
namespace TDigestNet.Tests;

/// <summary>
/// Tests <c>TDigest.MergeMultiple</c> by merging four digests filled from
/// overlapping distributions and comparing the merged digest against the
/// sorted list of all raw samples.
/// </summary>
public class TestMergeMultiple : TestBase
{
    private readonly Random _rand = new();
    private readonly List<double> _actual = new();   // every raw sample; sorted in the ctor
    private readonly TDigest _digestA = new();
    private readonly TDigest _digestB = new();
    private readonly TDigest _digestC = new();
    private readonly TDigest _digestD = new();
    private readonly TDigest _merged;

    public TestMergeMultiple()
    {
        // Four overlapping sample populations covering roughly [0, 300).
        Fill(_digestA, mod: 50, offset: 0);
        Fill(_digestB, mod: 100, offset: 0);
        Fill(_digestC, mod: 50, offset: 100);
        Fill(_digestD, mod: 100, offset: 100);

        _actual.Sort();

        _merged = TDigest.MergeMultiple(_digestA, _digestB, _digestC, _digestD);
    }

    // Adds 10000 samples of (U % mod) + (U % mod) + offset to both the digest
    // under test and the raw-sample reference list.
    private void Fill(TDigest digest, int mod, int offset)
    {
        for (int i = 0; i < 10000; i++)
        {
            var n = (_rand.Next() % mod) + (_rand.Next() % mod) + offset;
            digest.Add(n);
            _actual.Add(n);
        }
    }

    [Test, Order(0)]
    public void ValidateStructure() => ValidateInternalTree(_merged);

    [Test, Order(1)]
    public void TestNotNull() => Assert.IsNotNull(_merged);

    // Fixed: NUnit's Assert.That takes the ACTUAL value as its first argument;
    // the value under test is the merged digest's count, the expected value is
    // the raw sample count. The original had them swapped, which inverts the
    // "expected X but was Y" failure message.
    [Test]
    public void TestCount() => Assert.That(_merged.Count, Is.EqualTo(_actual.Count));

    // Fixed: dropped the unnecessary null-forgiving `_merged!` — _merged is a
    // non-nullable readonly field always assigned in the constructor, and the
    // sibling test below already passes it without `!`.
    [Test]
    public void TestAvgError() => Assert.That(GetAvgError(_actual, _merged), Is.LessThan(.01));

    [Test]
    public void TestAvgPercentileError() => Assert.That(GetAvgPercentileError(_actual, _merged), Is.LessThan(1));
}
126 changes: 126 additions & 0 deletions src/TDigestNet/TDigest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,132 @@ IEnumerable<Centroid> Enumerate()
}
}

/// <summary>
/// Merge any number of T-Digests using the default accuracy and compression settings
/// </summary>
/// <param name="digests">The digests to merge</param>
/// <returns>A T-Digest created by merging the specified T-Digests</returns>
public static TDigest MergeMultiple(params TDigest[] digests) =>
    MergeMultiple(DEFAULT_ACCURACY, DEFAULT_COMPRESSION, (IEnumerable<TDigest>)digests);

/// <summary>
/// Merge any number of T-Digests
/// </summary>
/// <param name="accuracy">Controls the trade-off between accuracy and memory consumption/performance.
/// Default value is .05, higher values result in worse accuracy, but better performance and decreased memory usage, while
/// lower values result in better accuracy and increased performance and memory usage</param>
/// <param name="compression">K value</param>
/// <param name="digests">The digests to merge</param>
/// <returns>A T-Digest created by merging the specified T-Digests</returns>
public static TDigest MergeMultiple(double accuracy, double compression, params TDigest[] digests) =>
    MergeMultiple(accuracy, compression, (IEnumerable<TDigest>)digests);

/// <summary>
/// Merge a sequence of T-Digests using the default accuracy and compression settings
/// </summary>
/// <param name="digests">The digests to merge</param>
/// <returns>A T-Digest created by merging the specified T-Digests</returns>
public static TDigest MergeMultiple(IEnumerable<TDigest> digests) =>
    MergeMultiple(DEFAULT_ACCURACY, DEFAULT_COMPRESSION, digests);

/// <summary>
/// Merge multiple T-Digests
/// </summary>
/// <param name="accuracy">Controls the trade-off between accuracy and memory consumption/performance.
/// Default value is .05, higher values result in worse accuracy, but better performance and decreased memory usage, while
/// lower values result in better accuracy and increased performance and memory usage</param>
/// <param name="compression">K value</param>
/// <param name="digests">T-Digests</param>
/// <returns>A T-Digest created by merging the specified T-Digests</returns>
public static TDigest MergeMultiple(double accuracy, double compression, IEnumerable<TDigest> digests)
{
    // Materialize the sequence once. The original enumerated `digests` five
    // times (two Sums, Min, Max, and the Select inside Enumerate), which
    // re-runs a lazy source on every pass and could observe inconsistent
    // snapshots if the source changed between passes.
    var digestArray = digests as TDigest[] ?? digests.ToArray();

    var count = digestArray.Sum(d => d.Count);
    var tree = CompressCentroidTree(Enumerate(), count, accuracy);

    var digest = new TDigest(tree)
    {
        // Weighted mean of the input averages. NOTE(review): when all inputs
        // are empty `count` is 0 and this yields NaN; the Min/Max calls below
        // then throw on the empty sequence — confirm empty input is unsupported.
        _average = digestArray.Sum(d => d._average * d.Count) / count,
        Accuracy = accuracy,
        CompressionConstant = compression,
        Min = digestArray.Min(d => d.Min),
        Max = digestArray.Max(d => d.Max),
    };

    // Re-compress once more if the merged tree still exceeds the configured
    // centroid budget.
    if (digest._centroids.Count > (digest.CompressionConstant / digest.Accuracy))
        digest._centroids = digest.CompressCentroidTree();

    return digest;

    // K-way merge of the per-digest centroid streams (each stream is ordered
    // by mean): repeatedly take the smallest current mean, sum the weights of
    // every stream positioned on exactly that mean, and advance those streams.
    IEnumerable<Centroid> Enumerate()
    {
        var enumerators = digestArray.Select(d => (IEnumerator<Centroid>?)d._centroids.GetEnumerator()).ToArray();
        var enumeratorsCount = enumerators.Length;
        var enumeratorsLost = enumeratorsCount;          // streams not yet exhausted
        var centroids = new Centroid?[enumeratorsCount]; // current head of each stream
        try
        {
            // Prime every stream. BUG FIX: the original wrote
            //     for (int i = 0; i < enumeratorsCount;) if (LoadValue(i)) i++;
            // so when an input digest was empty (LoadValue returns false and
            // nulls the enumerator) the loop retried the SAME index and the
            // second LoadValue call threw on the nulled enumerator. Always
            // advance to the next index instead.
            for (int i = 0; i < enumeratorsCount; i++)
                LoadValue(i);

            while (enumeratorsLost != 0)
            {
                // Find the stream head with the smallest mean.
                Centroid? minimum = null;
                for (int i = 0; i < enumeratorsCount; i++)
                {
                    var current = centroids[i];
                    if (current is not null && (minimum is null || current.mean < minimum.mean))
                        minimum = current;
                }

                // Unreachable while enumeratorsLost > 0; defensive invariant guard.
                if (minimum is null)
                    throw new ApplicationException("Centroid is null but this was not expected");

                // Combine all heads sharing exactly this mean (exact
                // floating-point equality is intentional) and advance them.
                var weight = .0;
                for (int i = 0; i < enumeratorsCount; i++)
                {
                    var current = centroids[i];
                    if (current is not null && minimum.mean == current.mean)
                    {
                        weight += current.weight;
                        LoadValue(i);
                    }
                }

                // Reuse the source centroid when only one stream contributed;
                // otherwise emit a fresh combined centroid.
                if (weight == minimum.weight)
                    yield return minimum;
                else
                    yield return new(minimum.mean, weight);
            }
        }
        finally
        {
            // Dispose any enumerators still alive (e.g. when the consumer
            // abandons iteration early).
            foreach (var enumerator in enumerators)
                enumerator?.Dispose();
        }

        // Advances stream i; returns false and retires the stream when it is
        // exhausted.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        bool LoadValue(int i)
        {
            var enumerator = enumerators[i];

            if (enumerator is null)
                throw new ApplicationException("Enumerator is null but this was not expected");

            if (enumerator.MoveNext())
                centroids[i] = enumerator.Current;
            else
            {
                enumerator.Dispose();
                enumerators[i] = null;
                centroids[i] = null;
                enumeratorsLost--;

                return false;
            }

            return true;
        }
    }
}

#region Operators

/// <summary>
Expand Down

0 comments on commit 4a060b6

Please sign in to comment.