-
Notifications
You must be signed in to change notification settings - Fork 2.7k
Adding GetChunks which allow efficient scanning of a StringBuilder #17530
Changes from all commits
9b1f2b2
8ae3544
e646dd2
3159048
009a549
014c623
6eeb7b6
6198c74
1028e38
57055f1
79cbf2e
0ba7f8d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -557,6 +557,131 @@ public char this[int index] | |
} | ||
} | ||
|
||
/// <summary> | ||
/// EnumerateChunks returns ChunkEnumerator that follows the IEnumerable pattern and | ||
/// thus can be used in a C# 'foreach' statements to retreive the data in the StringBuilder | ||
/// as chunks (ReadOnlyMemory) of characters. An example use is: | ||
/// | ||
/// foreach (ReadOnlyMemory<char> chunk in sb.EnumerateChunks()) | ||
/// foreach(char c in chunk.Span) | ||
/// { /* operation on c } | ||
/// | ||
/// Note that creating a ReadOnlySpan from a ReadOnlyMemory is expensive compared to the | ||
/// fetching of the character, so create a local variable for the SPAN if you need to use | ||
/// a for statement for example | ||
/// | ||
/// foreach (ReadOnlyMemory<char> chunk in sb.EnumerateChunks()) | ||
/// { | ||
/// var span = chunk.Span; | ||
/// for(int i = 0; i < span.Length; i++) | ||
/// { /* operation on span[i] */ } | ||
/// } | ||
/// </summary> | ||
public ChunkEnumerator EnumerateChunks() => new ChunkEnumerator(this); | ||
|
||
/// <summary> | ||
/// ChunkEnumerator supports both the IEnumerable and IEnumerator pattern so foreach | ||
/// works (see EnumerateChunks). It needs to be public (so the compiler can use it | ||
/// when building a foreach statement) but users typically don't use it explicitly. | ||
/// (which is why it is a nested type). | ||
/// </summary> | ||
public struct ChunkEnumerator | ||
{ | ||
/// <summary> | ||
/// Implement IEnumerable.GetEnumerator() to return 'this' as the IEnumerator | ||
/// </summary> | ||
[ComponentModel.EditorBrowsable(ComponentModel.EditorBrowsableState.Never)] // Only here to make foreach work | ||
public ChunkEnumerator GetEnumerator() { return this; } | ||
|
||
/// <summary> | ||
/// Implements the IEnumerator pattern. | ||
/// </summary> | ||
public bool MoveNext() | ||
{ | ||
if (_currentChunk == _firstChunk) | ||
return false; | ||
|
||
if (_manyChunks != null) | ||
return _manyChunks.MoveNext(ref _currentChunk); | ||
|
||
StringBuilder next = _firstChunk; | ||
while (next.m_ChunkPrevious != _currentChunk) | ||
next = next.m_ChunkPrevious; | ||
_currentChunk = next; | ||
return true; | ||
} | ||
|
||
/// <summary> | ||
/// Implements the IEnumerator pattern. | ||
/// </summary> | ||
public ReadOnlyMemory<char> Current => new ReadOnlyMemory<char>(_currentChunk.m_ChunkChars, 0, _currentChunk.m_ChunkLength); | ||
|
||
#region private | ||
internal ChunkEnumerator(StringBuilder stringBuilder) | ||
{ | ||
Debug.Assert(stringBuilder != null); | ||
_firstChunk = stringBuilder; | ||
_currentChunk = null; // MoveNext will find the last chunk if we do this. | ||
_manyChunks = null; | ||
|
||
// There is a performance-vs-allocation tradeoff. Because the chunks | ||
// are a linked list with each chunk pointing to its PREDECESSOR, walking | ||
// the list FORWARD is not efficient. If there are few chunks (< 8) we | ||
// simply scan from the start each time, and tolerate the N*N behavior. | ||
// However above this size, we allocate an array to hold pointers to all | ||
// the chunks and we can be efficient for large N. | ||
int chunkCount = ChunkCount(stringBuilder); | ||
if (8 < chunkCount) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit, this is sort of a Yoda conditional There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't follow the change you want. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see. Quite a while back, I adopted a convention of always making relational operators go from smallest to largest. (basically I never use > or >=). I find it slightly clearer and less error prone, as it eliminates one source of arbitrary variation, and allows visual position to help in reading the code (this is especially nice if you are testing for a range). For what it is worth... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That makes sense. I may do it. |
||
_manyChunks = new ManyChunkInfo(stringBuilder, chunkCount); | ||
} | ||
|
||
private static int ChunkCount(StringBuilder stringBuilder) | ||
{ | ||
int ret = 0; | ||
while (stringBuilder != null) | ||
{ | ||
ret++; | ||
stringBuilder = stringBuilder.m_ChunkPrevious; | ||
} | ||
return ret; | ||
} | ||
|
||
/// <summary> | ||
/// Used to hold all the chunks indexes when you have many chunks. | ||
/// </summary> | ||
private class ManyChunkInfo | ||
{ | ||
public bool MoveNext(ref StringBuilder current) | ||
{ | ||
int pos = ++_chunkPos; | ||
if (_chunks.Length <= pos) | ||
return false; | ||
current = _chunks[pos]; | ||
return true; | ||
} | ||
|
||
public ManyChunkInfo(StringBuilder stringBuilder, int chunkCount) | ||
{ | ||
_chunks = new StringBuilder[chunkCount]; | ||
while (0 <= --chunkCount) | ||
{ | ||
Debug.Assert(stringBuilder != null); | ||
_chunks[chunkCount] = stringBuilder; | ||
stringBuilder = stringBuilder.m_ChunkPrevious; | ||
} | ||
_chunkPos = -1; | ||
} | ||
|
||
readonly StringBuilder[] _chunks; // These are in normal order (first chunk first) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you please put fields first, per coding guidelines, and add missing visibility specifiers (private)? in ManyChunkInfo and ChunkEnumerator |
||
int _chunkPos; | ||
} | ||
|
||
readonly StringBuilder _firstChunk; // The first Stringbuilder chunk (which is the end of the logical string) | ||
StringBuilder _currentChunk; // The chunk that this enumerator is currently returning (Current). | ||
readonly ManyChunkInfo _manyChunks; // Only used for long string builders with many chunks (see constructor) | ||
#endregion | ||
} | ||
|
||
/// <summary> | ||
/// Appends a character 0 or more times to the end of this builder. | ||
/// </summary> | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess the number 8 is just a judgement call here? I don't have good intuition for the number of chunks in the real world.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, 8 is a judgement call. Bascially you want a number before N*N gets too big. Generally if you do 'nothing' StringBuilders start with 16 bytes and double each time, so this handles strings up to about 2K-4K if you don't ever specify a starting length. Thus 8 keeps things non-allocating for the common case of strings < 256 bytes (e.g. paths). On the other hand, at 2-4K of data, the cost of copying the data is probably now large with respect to the cost of allocating the array, so the value of making the nubmer bigger is probably not a big deal (especially because people using StringBuilder on large strings should be setting a good starting length).
We can tune it as necessary.