-
Notifications
You must be signed in to change notification settings - Fork 9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Much higher performance with these changes #9
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,91 +1,117 @@ | ||
using System; | ||
using System.Runtime.InteropServices; | ||
using System.Threading; | ||
using System.Runtime.CompilerServices; | ||
|
||
namespace MPMCQueue.NET | ||
{ | ||
[StructLayout(LayoutKind.Explicit, Size = 192, CharSet = CharSet.Ansi)] | ||
[StructLayout(LayoutKind.Explicit, Size = 336)] | ||
public class MPMCQueue | ||
{ | ||
[FieldOffset(0)] | ||
private readonly Cell[] _buffer; | ||
[FieldOffset(8)] | ||
private readonly int _bufferMask; | ||
[FieldOffset(64)] | ||
private int _enqueuePos; | ||
[FieldOffset(128)] | ||
private int _dequeuePos; | ||
/// <summary> | ||
/// 128-byte cache lines already exist in some CPUs. | ||
/// </summary> | ||
/// <remarks> | ||
/// Also "the spatial prefetcher strives to keep pairs of cache lines in the L2 cache." | ||
/// https://stackoverflow.com/questions/29199779/false-sharing-and-128-byte-alignment-padding | ||
/// </remarks> | ||
internal const int SAFE_CACHE_LINE = 128; | ||
|
||
[FieldOffset(SAFE_CACHE_LINE)] | ||
private readonly Cell[] _enqueueBuffer; | ||
|
||
[FieldOffset(SAFE_CACHE_LINE + 8)] | ||
private volatile int _enqueuePos; | ||
|
||
// Separate access to buffers from enqueue and dequeue. | ||
// This removes false sharing and accessing a buffer | ||
// reference also prefetches the following Pos with [(64 - (8 + 4 + 4)) = 52]/64 probability. | ||
|
||
[FieldOffset(SAFE_CACHE_LINE * 2)] | ||
private readonly Cell[] _dequeueBuffer; | ||
|
||
[FieldOffset(SAFE_CACHE_LINE * 2 + 8)] | ||
private volatile int _dequeuePos; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hello :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
With probability 8/64 yes. .NET aligns objects to 8 bytes, not cache line. And yes, if anything is modified on a cache line all other threads will need to reload that line, so modifying enqueuePos could interfere with Dequeue reading bufferMask with 8/64 probability. |
||
|
||
public MPMCQueue(int bufferSize) | ||
{ | ||
if (bufferSize < 2) throw new ArgumentException($"{nameof(bufferSize)} should be greater than or equal to 2"); | ||
if ((bufferSize & (bufferSize - 1)) != 0) throw new ArgumentException($"{nameof(bufferSize)} should be a power of 2"); | ||
|
||
_bufferMask = bufferSize - 1; | ||
_buffer = new Cell[bufferSize]; | ||
_enqueueBuffer = new Cell[bufferSize]; | ||
|
||
for (var i = 0; i < bufferSize; i++) | ||
{ | ||
_buffer[i] = new Cell(i, null); | ||
_enqueueBuffer[i] = new Cell(i, null); | ||
} | ||
|
||
_dequeueBuffer = _enqueueBuffer; | ||
_enqueuePos = 0; | ||
_dequeuePos = 0; | ||
} | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a significant change by itself. Is the perf the same without AI (AggressiveInlining)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Without AI it is slightly slower, of course. But the main point of the changes is multi-threaded scalability: spinning and two references to the same buffer. Not sure about buffer bounds check with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The check is still there, but JIT reuses the load of Length for both the check and There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there any trick to avoid bounds check here? To leverage pow2 somehow. I searched through coreclr issues but couldn't find any. Also do you know if there is a single place with all current optimizations for BC elimination? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think it can avoid bounds check. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wouldn't put There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is my bad habit in general. But for low-level building blocks that are intended to be used later it's often better to have it: you could always wrap it into another method if you do not want inlining or prefer JIT to decide. 
Usually the caller decides this, not the callee (relevant discussion dotnet/corefxlab#2592 (comment)). Here I never compared perf to the original implementation in absolute numbers, but rather the multithreaded scalability of the same method, which doesn't depend on the attribute. |
||
public bool TryEnqueue(object item) | ||
{ | ||
var spinner = new SpinWait(); | ||
do | ||
{ | ||
var buffer = _buffer; | ||
var buffer = _enqueueBuffer; | ||
var pos = _enqueuePos; | ||
var index = pos & _bufferMask; | ||
var cell = buffer[index]; | ||
var index = pos & (buffer.Length - 1); | ||
ref var cell = ref buffer[index]; | ||
if (cell.Sequence == pos && Interlocked.CompareExchange(ref _enqueuePos, pos + 1, pos) == pos) | ||
{ | ||
buffer[index].Element = item; | ||
Volatile.Write(ref buffer[index].Sequence, pos + 1); | ||
cell.Element = item; | ||
cell.Sequence = pos + 1; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
On line 62 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am still only planning to buy a Raspberry Pi for testing :) Yes, good point. It's next to the Interlocked call anyway. I changed that mostly from an aesthetics perspective. |
||
return true; | ||
} | ||
|
||
if (cell.Sequence < pos) | ||
{ | ||
return false; | ||
} | ||
|
||
spinner.SpinOnce(); | ||
} while (true); | ||
} | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
public bool TryDequeue(out object result) | ||
{ | ||
result = null; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Left from my API changes where I return |
||
var spinner = new SpinWait(); | ||
do | ||
{ | ||
var buffer = _buffer; | ||
var bufferMask = _bufferMask; | ||
var buffer = _dequeueBuffer; | ||
var pos = _dequeuePos; | ||
var index = pos & bufferMask; | ||
var cell = buffer[index]; | ||
var index = pos & (buffer.Length - 1); | ||
ref var cell = ref buffer[index]; | ||
if (cell.Sequence == pos + 1 && Interlocked.CompareExchange(ref _dequeuePos, pos + 1, pos) == pos) | ||
{ | ||
result = cell.Element; | ||
buffer[index].Element = null; | ||
Volatile.Write(ref buffer[index].Sequence, pos + bufferMask + 1); | ||
return true; | ||
cell.Element = null; | ||
cell.Sequence = pos + buffer.Length; | ||
break; | ||
} | ||
|
||
if (cell.Sequence < pos + 1) | ||
{ | ||
result = default(object); | ||
return false; | ||
break; | ||
} | ||
|
||
spinner.SpinOnce(); | ||
} while (true); | ||
|
||
return result != null; | ||
} | ||
|
||
[StructLayout(LayoutKind.Explicit, Size = 16, CharSet = CharSet.Ansi)] | ||
[StructLayout(LayoutKind.Explicit, Size = 16)] | ||
private struct Cell | ||
{ | ||
[FieldOffset(0)] | ||
public int Sequence; | ||
public volatile int Sequence; | ||
|
||
[FieldOffset(8)] | ||
public object Element; | ||
|
||
|
@@ -96,4 +122,4 @@ public Cell(int sequence, object element) | |
} | ||
} | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
typo, should be 384