Performance discussion on the ToSnakeCase method #19471

wapco · 2024-04-02T07:55:24Z

Description

The ToSnakeCase method in ABP can be optimized using Span for better performance. In dotnet8, a ToSnakeCase method is also provided. I conducted a performance test using Benchmark and compared the three methods. The results show that my algorithm has the best performance, followed by Microsoft's dotnet8. However, when it comes to memory usage, dotnet8 is the best.

Benchmark Result

Method	Mean	Error	StdDev	Rank	Gen0	Allocated
ToSnakeCaseBySpan	52.34 ns	0.899 ns	0.797 ns	1	0.0803	168 B
ToSnakeCaseDotnet8	75.30 ns	0.472 ns	0.442 ns	2	0.0381	80 B
ToSnakeCaseByAbp	111.98 ns	1.160 ns	1.085 ns	3	0.1529	320 B

Benchmark Code

public static string name = "BBCToSnakeCaseBySpanTest2";

[Benchmark]
public string ToSnakeCaseBySpan()
{
    int bufferSize = name.Length;
    ReadOnlySpan<char> nameBuffer = name.AsSpan();
    Span<char> buffer = new char[bufferSize + bufferSize / 2];

    int bufferPosition = 0;

    int namePosition = 0;

    while (bufferPosition < bufferSize)
    {
        if (char.IsUpper(nameBuffer[namePosition]))
        {
            if (namePosition == 0 || namePosition == nameBuffer.Length - 1 ||
                (char.IsUpper(nameBuffer[namePosition + 1]) && char.IsUpper(nameBuffer[namePosition - 1])))
            {
                buffer[bufferPosition] = char.ToLowerInvariant(nameBuffer[namePosition]);
                bufferPosition++;
            }
            else
            {
                buffer[bufferPosition] = '_';
                buffer[bufferPosition + 1] = char.ToLowerInvariant(nameBuffer[namePosition]);
                bufferPosition += 2;
                bufferSize++;
            }

            namePosition++;
            continue;
        }

        buffer[bufferPosition] = nameBuffer[namePosition];

        bufferPosition++;
        namePosition++;
    }

    return buffer.Slice(0, bufferSize).ToString();
}

[Benchmark]
public string ToSnakeCaseDotnet8()
{
    if (string.IsNullOrEmpty(name))
    {
        return name;
    }

    char separator = '_';
    bool lowercase = true;
    var chars = name.AsSpan();

    char[]? rentedBuffer = null;

    // While we can't predict the expansion factor of the resultant string,
    // start with a buffer that is at least 20% larger than the input.
    int initialBufferLength = (int)(1.2 * chars.Length);
    Span<char> destination = initialBufferLength <= 128
        ? stackalloc char[128]
        : (rentedBuffer = ArrayPool<char>.Shared.Rent(initialBufferLength));

    SeparatorState state = SeparatorState.NotStarted;
    int charsWritten = 0;

    for (int i = 0; i < chars.Length; i++)
    {
        // NB this implementation does not handle surrogate pair letters
        // cf. https://github.com/dotnet/runtime/issues/90352

        char current = chars[i];
        UnicodeCategory category = char.GetUnicodeCategory(current);

        switch (category)
        {
            case UnicodeCategory.UppercaseLetter:

                switch (state)
                {
                    case SeparatorState.NotStarted:
                        break;

                    case SeparatorState.LowercaseLetterOrDigit:
                    case SeparatorState.SpaceSeparator:
                        // An uppercase letter following a sequence of lowercase letters or spaces
                        // denotes the start of a new grouping: emit a separator character.
                        WriteChar(separator, ref destination);
                        break;

                    case SeparatorState.UppercaseLetter:
                        // We are reading through a sequence of two or more uppercase letters.
                        // Uppercase letters are grouped together with the exception of the
                        // final letter, assuming it is followed by lowercase letters.
                        // For example, the value 'XMLReader' should render as 'xml_reader',
                        // however 'SHA512Hash' should render as 'sha512-hash'.
                        if (i + 1 < chars.Length && char.IsLower(chars[i + 1]))
                        {
                            WriteChar(separator, ref destination);
                        }

                        break;

                    default:
                        Debug.Fail($"Unexpected state {state}");
                        break;
                }

                if (lowercase)
                {
                    current = char.ToLowerInvariant(current);
                }

                WriteChar(current, ref destination);
                state = SeparatorState.UppercaseLetter;
                break;

            case UnicodeCategory.LowercaseLetter:
            case UnicodeCategory.DecimalDigitNumber:

                if (state is SeparatorState.SpaceSeparator)
                {
                    // Normalize preceding spaces to one separator.
                    WriteChar(separator, ref destination);
                }

                if (!lowercase && category is UnicodeCategory.LowercaseLetter)
                {
                    current = char.ToUpperInvariant(current);
                }

                WriteChar(current, ref destination);
                state = SeparatorState.LowercaseLetterOrDigit;
                break;

            case UnicodeCategory.SpaceSeparator:
                // Space characters are trimmed from the start and end of the input string
                // but are normalized to separator characters if between letters.
                if (state != SeparatorState.NotStarted)
                {
                    state = SeparatorState.SpaceSeparator;
                }

                break;

            default:
                // Non-alphanumeric characters (including the separator character and surrogates)
                // are written as-is to the output and reset the separator state.
                // E.g. 'ABC???def' maps to 'abc???def' in snake_case.

                WriteChar(current, ref destination);
                state = SeparatorState.NotStarted;
                break;
        }
    }

    string result = destination.Slice(0, charsWritten).ToString();

    if (rentedBuffer is not null)
    {
        destination.Slice(0, charsWritten).Clear();
        ArrayPool<char>.Shared.Return(rentedBuffer);
    }

    return result;

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    void WriteChar(char value, ref Span<char> destination)
    {
        if (charsWritten == destination.Length)
        {
            ExpandBuffer(ref destination);
        }

        destination[charsWritten++] = value;
    }

    void ExpandBuffer(ref Span<char> destination)
    {
        int newSize = checked(destination.Length * 2);
        char[] newBuffer = ArrayPool<char>.Shared.Rent(newSize);
        destination.CopyTo(newBuffer);

        if (rentedBuffer != null)
        {
            destination.Slice(0, charsWritten).Clear();
            ArrayPool<char>.Shared.Return(rentedBuffer);
        }

        rentedBuffer = newBuffer;
        destination = rentedBuffer;
    }
}

[Benchmark]
public string ToSnakeCaseByAbp()
{
    if (string.IsNullOrWhiteSpace(name))
    {
        return name;
    }

    var builder = new StringBuilder(name.Length + Math.Min(2, name.Length / 5));
    var previousCategory = default(UnicodeCategory?);

    for (var currentIndex = 0; currentIndex < name.Length; currentIndex++)
    {
        var currentChar = name[currentIndex];
        if (currentChar == '_')
        {
            builder.Append('_');
            previousCategory = null;
            continue;
        }

        var currentCategory = char.GetUnicodeCategory(currentChar);
        switch (currentCategory)
        {
            case UnicodeCategory.UppercaseLetter:
            case UnicodeCategory.TitlecaseLetter:
                if (previousCategory == UnicodeCategory.SpaceSeparator ||
                    previousCategory == UnicodeCategory.LowercaseLetter ||
                    previousCategory != UnicodeCategory.DecimalDigitNumber &&
                    previousCategory != null &&
                    currentIndex > 0 &&
                    currentIndex + 1 < name.Length &&
                    char.IsLower(name[currentIndex + 1]))
                {
                    builder.Append('_');
                }

                currentChar = char.ToLower(currentChar);
                break;

            case UnicodeCategory.LowercaseLetter:
            case UnicodeCategory.DecimalDigitNumber:
                if (previousCategory == UnicodeCategory.SpaceSeparator)
                {
                    builder.Append('_');
                }

                break;

            default:
                if (previousCategory != null)
                {
                    previousCategory = UnicodeCategory.SpaceSeparator;
                }

                continue;
        }

        builder.Append(currentChar);
        previousCategory = currentCategory;
    }

    return builder.ToString();
}

internal enum SnakeCaseState
{
    Start,
    Lower,
    Upper,
    NewWord
}

internal enum SeparatorState
{
    NotStarted,
    UppercaseLetter,
    LowercaseLetterOrDigit,
    SpaceSeparator,
}

realLiangshiwei · 2024-04-02T08:22:32Z

We can consider using the method provided by .NET 8

wapco added the problem label Apr 2, 2024

realLiangshiwei added enhancement and removed problem labels Apr 2, 2024

realLiangshiwei self-assigned this Apr 2, 2024

realLiangshiwei added the effort-xs label Apr 2, 2024

realLiangshiwei added this to the 8.2-preview milestone Apr 2, 2024

realLiangshiwei mentioned this issue Apr 2, 2024

Use the ToSnakeCase method provided by .NET8 #19472

Merged

2 tasks

maliming closed this as completed in #19472 Apr 2, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Performance discussion on the ToSnakeCase method #19471

Performance discussion on the ToSnakeCase method #19471

wapco commented Apr 2, 2024 •

edited

Loading

realLiangshiwei commented Apr 2, 2024

Performance discussion on the ToSnakeCase method #19471

Performance discussion on the ToSnakeCase method #19471

Comments

wapco commented Apr 2, 2024 • edited Loading

Description

Benchmark Result

Benchmark Code

realLiangshiwei commented Apr 2, 2024

wapco commented Apr 2, 2024 •

edited

Loading