Skip to content

Commit

Permalink
Perf improvements (#13)
Browse files Browse the repository at this point in the history
* Fix whitespace cleaning tests

* Add test for double replace

* Create a new helper each time

* Use a dictionary for caching denied character regexes

* Cache the default config

* Whitespace and usings cleanup

* Lazily cache the cleanup regex

* Trim and lowercase string without allocating

* Collapse whitespace without allocating

* Apply string replacements without allocating

* Remove bad test

* Add more coverage for string replacements

* Fix string replacement bugs

* Replace RemoveDiacritics with a non-allocating version

* Add test for whitespace in a string replacement from @Buildstarted

* Don't use regexes for denied characters by default

* Update tests for allowed characters

* Add test for legacy behaviour

* Don't regress legacy benchmarks

* Process allowed characters

* Collapse dashes without allocating

* Cache IsWhitespace
  • Loading branch information
davidwengier authored Aug 17, 2020
1 parent 02e077b commit 01004f1
Show file tree
Hide file tree
Showing 4 changed files with 360 additions and 70 deletions.
32 changes: 29 additions & 3 deletions src/Slugify.Core/SlugHelper.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;
Expand Down Expand Up @@ -36,7 +37,13 @@ public string GenerateSlug(string inputString)
inputString = CleanWhiteSpace(inputString, _config.CollapseWhiteSpace);
inputString = ApplyReplacements(inputString, _config.StringReplacements);
inputString = RemoveDiacritics(inputString);
inputString = DeleteCharacters(inputString, _config.DeniedCharactersRegex);

string regex = _config.DeniedCharactersRegex;
if (regex == null)
{
regex = "[^" + Regex.Escape(string.Join("", _config.AllowedChars)).Replace("-", "\\-") + "]";
}
inputString = DeleteCharacters(inputString, regex);

if (_config.CollapseDashes)
{
Expand Down Expand Up @@ -91,14 +98,33 @@ protected string DeleteCharacters(string str, string regex)
/// </summary>
public class Config
{
    // TODO: Implement a source generator so this can be done at compile time :)
    // Default allow-list: ASCII letters, digits, dash, dot and underscore.
    private static readonly char[] s_allowedChars =
        ("abcdefghijklmnopqrstuvwxyz" +
         "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
         "0123456789" +
         "-._").ToCharArray();

    // Each Config gets its own copy so callers can add/remove characters per instance.
    private readonly HashSet<char> _allowedChars = new HashSet<char>(s_allowedChars);

    /// <summary>
    /// Substring replacements applied to the input. Defaults to replacing a space with a dash.
    /// </summary>
    public Dictionary<string, string> StringReplacements { get; set; } = new Dictionary<string, string>
    {
        { " ", "-" }
    };

    /// <summary>Whether the input is lower-cased. Defaults to <c>true</c>.</summary>
    public bool ForceLowerCase { get; set; } = true;

    /// <summary>Whether runs of whitespace are collapsed into one. Defaults to <c>true</c>.</summary>
    public bool CollapseWhiteSpace { get; set; } = true;

    /// <summary>
    /// Note: Setting this property will stop the AllowedChars feature from being used
    /// </summary>
    /// <remarks>
    /// Defaults to <c>null</c>. It must be declared exactly once and without an initializer:
    /// a non-null default would switch every helper onto the legacy regex path.
    /// </remarks>
    public string DeniedCharactersRegex { get; set; }

    /// <summary>
    /// The set of characters that are kept in the slug.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// Thrown when <see cref="DeniedCharactersRegex"/> has been set, because the two features are mutually exclusive.
    /// </exception>
    public HashSet<char> AllowedChars
    {
        get
        {
            if (DeniedCharactersRegex != null)
            {
                throw new InvalidOperationException("After setting DeniedCharactersRegex the AllowedChars feature cannot be used.");
            }

            return _allowedChars;
        }
    }

    /// <summary>Whether consecutive dashes are collapsed into one. Defaults to <c>true</c>.</summary>
    public bool CollapseDashes { get; set; } = true;

    /// <summary>Whether leading and trailing whitespace is trimmed. Defaults to <c>true</c>.</summary>
    public bool TrimWhitespace { get; set; } = true;
}
Expand Down
197 changes: 155 additions & 42 deletions src/Slugify.Core/SlugHelperImproved.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,97 +3,210 @@
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;

namespace Slugify
{
public class SlugHelperImproved : ISlugHelper
{
protected SlugHelper.Config _config { get; set; }
private static readonly Dictionary<string, Regex> _deleteRegexMap = new Dictionary<string, Regex>();
private static readonly Lazy<SlugHelper.Config> _defaultConfig = new Lazy<SlugHelper.Config>(() => new SlugHelper.Config());

private readonly Regex _deniedCharactersRegex;
private readonly Regex _cleanWhiteSpaceRegex;
private readonly Regex _collapseDashesRegex;
protected SlugHelper.Config _config { get; set; }

/// <summary>
/// Creates a helper that uses the shared, lazily created default configuration.
/// </summary>
public SlugHelperImproved() : this(_defaultConfig.Value) { }

/// <summary>
/// Creates a helper that uses the supplied configuration.
/// </summary>
/// <param name="config">The configuration to use; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="config"/> is null.</exception>
public SlugHelperImproved(SlugHelper.Config config)
{
    // No regex is built here: DeniedCharactersRegex may legitimately be null, and
    // new Regex(null, ...) would throw. GenerateSlug resolves the regex on demand.
    _config = config ?? throw new ArgumentNullException(nameof(config), "can't be null use default config or empty constructor.");
}

/// <summary>
/// Implements <see cref="ISlugHelper.GenerateSlug(string)"/>
/// </summary>
public string GenerateSlug(string inputString)
{
    StringBuilder sb = new StringBuilder();

    // First we trim and lowercase if necessary
    PrepareStringBuilder(inputString.Normalize(NormalizationForm.FormD), sb);
    ApplyStringReplacements(sb);
    RemoveNonSpacingMarks(sb);

    if (_config.DeniedCharactersRegex == null)
    {
        RemoveNotAllowedCharacters(sb);
    }
    else
    {
        // For backwards compatibility
        // NOTE(review): _deleteRegexMap is a plain static Dictionary mutated here; concurrent
        // GenerateSlug calls with a custom regex look unsafe - confirm whether callers are single-threaded.
        if (!_deleteRegexMap.TryGetValue(_config.DeniedCharactersRegex, out Regex deniedCharactersRegex))
        {
            deniedCharactersRegex = new Regex(_config.DeniedCharactersRegex, RegexOptions.Compiled);
            _deleteRegexMap.Add(_config.DeniedCharactersRegex, deniedCharactersRegex);
        }

        // Snapshot the builder BEFORE clearing it; clearing first would feed an empty
        // string to the regex and always produce an empty slug.
        string current = sb.ToString();
        sb.Clear();
        sb.Append(DeleteCharacters(current, deniedCharactersRegex));
    }

    if (_config.CollapseDashes)
    {
        CollapseDashes(sb);
    }

    return sb.ToString();
}

if (_config.CollapseDashes)
/// <summary>
/// Copies <paramref name="inputString"/> into <paramref name="sb"/>, applying the whitespace and
/// casing options of the configuration in a single pass.
/// </summary>
/// <remarks>
/// Every whitespace character is written as a single space; when CollapseWhiteSpace is set a whole
/// run of whitespace becomes one space. When TrimWhitespace is set, leading and trailing whitespace
/// is dropped. When ForceLowerCase is set, characters are lower-cased with <c>char.ToLower</c>.
/// </remarks>
/// <param name="inputString">The text to copy.</param>
/// <param name="sb">The builder that receives the prepared text.</param>
private void PrepareStringBuilder(string inputString, StringBuilder sb)
{
    bool seenFirstNonWhitespace = false;
    int indexOfLastNonWhitespace = 0;

    for (int i = 0; i < inputString.Length; i++)
    {
        char c = inputString[i];

        if (char.IsWhiteSpace(c))
        {
            if (!seenFirstNonWhitespace && _config.TrimWhitespace)
            {
                // Leading whitespace is trimmed by simply never appending it.
                continue;
            }

            // Normalise every kind of whitespace (tab, newline, ...) to a plain space.
            // This also applies to leading whitespace that is being kept, so it is treated
            // exactly like whitespace anywhere else in the string.
            c = ' ';

            if (_config.CollapseWhiteSpace)
            {
                // Swallow the rest of this whitespace run so only one space is emitted.
                while ((i + 1) < inputString.Length && char.IsWhiteSpace(inputString[i + 1]))
                {
                    i++;
                }
            }
        }
        else
        {
            seenFirstNonWhitespace = true;

            // The character is about to be appended at position sb.Length.
            indexOfLastNonWhitespace = sb.Length;

            if (_config.ForceLowerCase)
            {
                c = char.ToLower(c);
            }
        }

        sb.Append(c);
    }

    // Cut off trailing whitespace. Only do so when something non-whitespace was written:
    // growing Length on an empty builder would pad it with a NUL character.
    if (_config.TrimWhitespace && seenFirstNonWhitespace)
    {
        sb.Length = indexOfLastNonWhitespace + 1;
    }
}


protected string CleanWhiteSpace(string str)
/// <summary>
/// Applies every configured string replacement to <paramref name="sb"/> in place.
/// </summary>
/// <remarks>
/// Replacements are applied one after another in the dictionary's enumeration order. Text inserted
/// by a replacement is not re-scanned for the same key.
/// </remarks>
/// <param name="sb">The builder whose content is rewritten.</param>
private void ApplyStringReplacements(StringBuilder sb)
{
    foreach (var replacement in _config.StringReplacements)
    {
        string key = replacement.Key;
        if (key.Length == 0)
        {
            // An empty key matches at every position and would never make progress
            // when the value is empty as well, so it is ignored.
            continue;
        }

        // A null value is treated as "remove the key", mirroring string.Replace.
        string value = replacement.Value ?? string.Empty;

        for (int i = 0; i < sb.Length; i++)
        {
            if (SubstringEquals(sb, i, key))
            {
                sb.Remove(i, key.Length);
                sb.Insert(i, value);

                // Skip past the inserted text; the loop's i++ supplies the final step.
                i += value.Length - 1;
            }
        }
    }
}

// Thanks http://stackoverflow.com/a/249126!
protected string RemoveDiacritics(string str)
/// <summary>
/// Checks whether the characters of <paramref name="sb"/> starting at <paramref name="index"/>
/// are exactly <paramref name="toMatch"/>, without allocating a string.
/// </summary>
/// <param name="sb">The builder to inspect.</param>
/// <param name="index">Position in <paramref name="sb"/> where the comparison starts.</param>
/// <param name="toMatch">The text expected at that position.</param>
/// <returns><c>true</c> when every character of <paramref name="toMatch"/> is found at the position.</returns>
private static bool SubstringEquals(StringBuilder sb, int index, string toMatch)
{
    int remaining = sb.Length - index;
    if (remaining < toMatch.Length)
    {
        // Not enough characters left for a match.
        return false;
    }

    for (int offset = 0; offset < toMatch.Length; offset++)
    {
        if (sb[index + offset] != toMatch[offset])
        {
            return false;
        }
    }

    return true;
}

return sb.ToString().Normalize(NormalizationForm.FormC);
// Thanks http://stackoverflow.com/a/249126!
/// <summary>
/// Deletes, in place, every character of <paramref name="sb"/> whose Unicode category is
/// <see cref="UnicodeCategory.NonSpacingMark"/>.
/// </summary>
/// <param name="sb">The builder to strip.</param>
protected void RemoveNonSpacingMarks(StringBuilder sb)
{
    // Walk from the end so a removal never shifts characters that are still to be visited.
    for (int position = sb.Length - 1; position >= 0; position--)
    {
        UnicodeCategory category = CharUnicodeInfo.GetUnicodeCategory(sb[position]);
        if (category == UnicodeCategory.NonSpacingMark)
        {
            sb.Remove(position, 1);
        }
    }
}

protected string ApplyReplacements(string str)
/// <summary>
/// Deletes, in place, every character of <paramref name="sb"/> that is not contained in the
/// configuration's allowed character set.
/// </summary>
/// <param name="sb">The builder to filter.</param>
protected void RemoveNotAllowedCharacters(StringBuilder sb)
{
    // Read the property once; it is evaluated on every access otherwise.
    HashSet<char> permitted = _config.AllowedChars;

    // Walk from the end so a removal never shifts characters that are still to be visited.
    for (int position = sb.Length - 1; position >= 0; position--)
    {
        if (permitted.Contains(sb[position]))
        {
            continue;
        }

        sb.Remove(position, 1);
    }
}

return str;
/// <summary>
/// Reduces, in place, every run of consecutive '-' characters in <paramref name="sb"/> to a single '-'.
/// </summary>
/// <param name="sb">The builder to rewrite.</param>
protected void CollapseDashes(StringBuilder sb)
{
    bool previousWasDash = false;
    int position = 0;

    while (position < sb.Length)
    {
        bool isDash = sb[position] == '-';

        if (isDash && previousWasDash)
        {
            // Second or later dash of a run: drop it and look at the same position again.
            sb.Remove(position, 1);
            continue;
        }

        previousWasDash = isDash;
        position++;
    }
}

protected string DeleteCharacters(string str)
/// <summary>
/// Returns <paramref name="str"/> with every match of <paramref name="deniedCharactersRegex"/> removed.
/// </summary>
/// <param name="str">The text to clean.</param>
/// <param name="deniedCharactersRegex">Pattern describing the text to delete.</param>
/// <returns>The cleaned text.</returns>
protected string DeleteCharacters(string str, Regex deniedCharactersRegex)
    => deniedCharactersRegex.Replace(str, string.Empty);
}
}
Expand Down
14 changes: 7 additions & 7 deletions tests/Slugify.Core.Benchmarks/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,18 @@ internal class Program
{
private static void Main(string[] args)
{
    // Run the benchmark suite exactly once; the value returned by Run is not used.
    BenchmarkRunner.Run<SlugifyBenchmarks>();
}
}

[MemoryDiagnoser]
public class SlugifyBenchmarks
{
private ISlugHelper _slugHelper;
private ISlugHelper _slugHelperImproved;
private List<string> _textList;

// Prepares the state shared by the benchmark methods: one instance of each slug helper
// (both with their default configuration) and the list of input lines.
// NOTE(review): [GlobalSetup] is presumably the benchmark framework's one-time setup hook - confirm.
[GlobalSetup]
public void GlobalSetup()
{
_slugHelper = new SlugHelper();
_slugHelperImproved = new SlugHelperImproved();
// Load every line of "gistfile.txt" (a relative path) as one benchmark input each.
_textList = File.ReadAllLines("gistfile.txt").ToList();
}

Expand All @@ -34,7 +30,11 @@ public void Baseline()
{
for (var i = 0; i < _textList.Count; i++)
{
_slugHelper.GenerateSlug(_textList[i]);
new SlugHelper(new SlugHelper.Config
{
// to enable legacy behaviour, for fairness
DeniedCharactersRegex = @"[^a-zA-Z0-9\-\._]"
}).GenerateSlug(_textList[i]);
}
}

Expand All @@ -43,7 +43,7 @@ public void Improved()
{
for (var i = 0; i < _textList.Count; i++)
{
_slugHelperImproved.GenerateSlug(_textList[i]);
new SlugHelperImproved().GenerateSlug(_textList[i]);
}
}
}
Expand Down
Loading

0 comments on commit 01004f1

Please sign in to comment.