Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance discussion on the ToSnakeCase method #19471

Closed
wapco opened this issue Apr 2, 2024 · 1 comment · Fixed by #19472
Closed

Performance discussion on the ToSnakeCase method #19471

wapco opened this issue Apr 2, 2024 · 1 comment · Fixed by #19472

Comments

@wapco
Copy link

wapco commented Apr 2, 2024

Description

The ToSnakeCase method in ABP can be optimized using Span for better performance. In dotnet8, a ToSnakeCase method is also provided. I conducted a performance test using Benchmark and compared the three methods. The results show that my algorithm has the best performance, followed by Microsoft's dotnet8. However, when it comes to memory usage, dotnet8 is the best.

Benchmark Result

Method Mean Error StdDev Rank Gen0 Allocated
ToSnakeCaseBySpan 52.34 ns 0.899 ns 0.797 ns 1 0.0803 168 B
ToSnakeCaseDotnet8 75.30 ns 0.472 ns 0.442 ns 2 0.0381 80 B
ToSnakeCaseByAbp 111.98 ns 1.160 ns 1.085 ns 3 0.1529 320 B

Benchmark Code

public static string name = "BBCToSnakeCaseBySpanTest2";

[Benchmark]
public string ToSnakeCaseBySpan()
{
    int bufferSize = name.Length;
    ReadOnlySpan<char> nameBuffer = name.AsSpan();
    Span<char> buffer = new char[bufferSize + bufferSize / 2];

    int bufferPosition = 0;

    int namePosition = 0;

    while (bufferPosition < bufferSize)
    {
        if (char.IsUpper(nameBuffer[namePosition]))
        {
            if (namePosition == 0 || namePosition == nameBuffer.Length - 1 ||
                (char.IsUpper(nameBuffer[namePosition + 1]) && char.IsUpper(nameBuffer[namePosition - 1])))
            {
                buffer[bufferPosition] = char.ToLowerInvariant(nameBuffer[namePosition]);
                bufferPosition++;
            }
            else
            {
                buffer[bufferPosition] = '_';
                buffer[bufferPosition + 1] = char.ToLowerInvariant(nameBuffer[namePosition]);
                bufferPosition += 2;
                bufferSize++;
            }

            namePosition++;
            continue;
        }

        buffer[bufferPosition] = nameBuffer[namePosition];

        bufferPosition++;
        namePosition++;
    }

    return buffer.Slice(0, bufferSize).ToString();
}

[Benchmark]
public string ToSnakeCaseDotnet8()
{
    if (string.IsNullOrEmpty(name))
    {
        return name;
    }

    char separator = '_';
    bool lowercase = true;
    var chars = name.AsSpan();

    char[]? rentedBuffer = null;

    // While we can't predict the expansion factor of the resultant string,
    // start with a buffer that is at least 20% larger than the input.
    int initialBufferLength = (int)(1.2 * chars.Length);
    Span<char> destination = initialBufferLength <= 128
        ? stackalloc char[128]
        : (rentedBuffer = ArrayPool<char>.Shared.Rent(initialBufferLength));

    SeparatorState state = SeparatorState.NotStarted;
    int charsWritten = 0;

    for (int i = 0; i < chars.Length; i++)
    {
        // NB this implementation does not handle surrogate pair letters
        // cf. https://github.com/dotnet/runtime/issues/90352

        char current = chars[i];
        UnicodeCategory category = char.GetUnicodeCategory(current);

        switch (category)
        {
            case UnicodeCategory.UppercaseLetter:

                switch (state)
                {
                    case SeparatorState.NotStarted:
                        break;

                    case SeparatorState.LowercaseLetterOrDigit:
                    case SeparatorState.SpaceSeparator:
                        // An uppercase letter following a sequence of lowercase letters or spaces
                        // denotes the start of a new grouping: emit a separator character.
                        WriteChar(separator, ref destination);
                        break;

                    case SeparatorState.UppercaseLetter:
                        // We are reading through a sequence of two or more uppercase letters.
                        // Uppercase letters are grouped together with the exception of the
                        // final letter, assuming it is followed by lowercase letters.
                        // For example, the value 'XMLReader' should render as 'xml_reader',
                        // however 'SHA512Hash' should render as 'sha512-hash'.
                        if (i + 1 < chars.Length && char.IsLower(chars[i + 1]))
                        {
                            WriteChar(separator, ref destination);
                        }

                        break;

                    default:
                        Debug.Fail($"Unexpected state {state}");
                        break;
                }

                if (lowercase)
                {
                    current = char.ToLowerInvariant(current);
                }

                WriteChar(current, ref destination);
                state = SeparatorState.UppercaseLetter;
                break;

            case UnicodeCategory.LowercaseLetter:
            case UnicodeCategory.DecimalDigitNumber:

                if (state is SeparatorState.SpaceSeparator)
                {
                    // Normalize preceding spaces to one separator.
                    WriteChar(separator, ref destination);
                }

                if (!lowercase && category is UnicodeCategory.LowercaseLetter)
                {
                    current = char.ToUpperInvariant(current);
                }

                WriteChar(current, ref destination);
                state = SeparatorState.LowercaseLetterOrDigit;
                break;

            case UnicodeCategory.SpaceSeparator:
                // Space characters are trimmed from the start and end of the input string
                // but are normalized to separator characters if between letters.
                if (state != SeparatorState.NotStarted)
                {
                    state = SeparatorState.SpaceSeparator;
                }

                break;

            default:
                // Non-alphanumeric characters (including the separator character and surrogates)
                // are written as-is to the output and reset the separator state.
                // E.g. 'ABC???def' maps to 'abc???def' in snake_case.

                WriteChar(current, ref destination);
                state = SeparatorState.NotStarted;
                break;
        }
    }

    string result = destination.Slice(0, charsWritten).ToString();

    if (rentedBuffer is not null)
    {
        destination.Slice(0, charsWritten).Clear();
        ArrayPool<char>.Shared.Return(rentedBuffer);
    }

    return result;

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    void WriteChar(char value, ref Span<char> destination)
    {
        if (charsWritten == destination.Length)
        {
            ExpandBuffer(ref destination);
        }

        destination[charsWritten++] = value;
    }

    void ExpandBuffer(ref Span<char> destination)
    {
        int newSize = checked(destination.Length * 2);
        char[] newBuffer = ArrayPool<char>.Shared.Rent(newSize);
        destination.CopyTo(newBuffer);

        if (rentedBuffer != null)
        {
            destination.Slice(0, charsWritten).Clear();
            ArrayPool<char>.Shared.Return(rentedBuffer);
        }

        rentedBuffer = newBuffer;
        destination = rentedBuffer;
    }
}

[Benchmark]
public string ToSnakeCaseByAbp()
{
    if (string.IsNullOrWhiteSpace(name))
    {
        return name;
    }

    var builder = new StringBuilder(name.Length + Math.Min(2, name.Length / 5));
    var previousCategory = default(UnicodeCategory?);

    for (var currentIndex = 0; currentIndex < name.Length; currentIndex++)
    {
        var currentChar = name[currentIndex];
        if (currentChar == '_')
        {
            builder.Append('_');
            previousCategory = null;
            continue;
        }

        var currentCategory = char.GetUnicodeCategory(currentChar);
        switch (currentCategory)
        {
            case UnicodeCategory.UppercaseLetter:
            case UnicodeCategory.TitlecaseLetter:
                if (previousCategory == UnicodeCategory.SpaceSeparator ||
                    previousCategory == UnicodeCategory.LowercaseLetter ||
                    previousCategory != UnicodeCategory.DecimalDigitNumber &&
                    previousCategory != null &&
                    currentIndex > 0 &&
                    currentIndex + 1 < name.Length &&
                    char.IsLower(name[currentIndex + 1]))
                {
                    builder.Append('_');
                }

                currentChar = char.ToLower(currentChar);
                break;

            case UnicodeCategory.LowercaseLetter:
            case UnicodeCategory.DecimalDigitNumber:
                if (previousCategory == UnicodeCategory.SpaceSeparator)
                {
                    builder.Append('_');
                }

                break;

            default:
                if (previousCategory != null)
                {
                    previousCategory = UnicodeCategory.SpaceSeparator;
                }

                continue;
        }

        builder.Append(currentChar);
        previousCategory = currentCategory;
    }

    return builder.ToString();
}

internal enum SnakeCaseState
{
    Start,
    Lower,
    Upper,
    NewWord
}

internal enum SeparatorState
{
    NotStarted,
    UppercaseLetter,
    LowercaseLetterOrDigit,
    SpaceSeparator,
}
@wapco wapco added the problem label Apr 2, 2024
@realLiangshiwei
Copy link
Member

We can consider using the method provided by .NET 8

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging a pull request may close this issue.

2 participants