Skip to content

Commit

Permalink
reduce some string allocation in Vocabulary (#1355)
Browse files Browse the repository at this point in the history
  • Loading branch information
SimonCropp authored Feb 16, 2024
1 parent 62941ba commit 8970d74
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 156 deletions.
1 change: 1 addition & 0 deletions src/Humanizer/GlobalUsings.cs
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
global using System.Globalization;
global using System.Text.RegularExpressions;
308 changes: 152 additions & 156 deletions src/Humanizer/Inflections/Vocabulary.cs
Original file line number Diff line number Diff line change
@@ -1,202 +1,198 @@
using System.Text.RegularExpressions;

namespace Humanizer
namespace Humanizer;

/// <summary>
/// A container for exceptions to simple pluralization/singularization rules.
/// Vocabularies.Default contains an extensive list of rules for US English.
/// At this time, multiple vocabularies and removing existing rules are not supported.
/// </summary>
public class Vocabulary
{
internal Vocabulary()
{
}

readonly List<Rule> plurals = [];
readonly List<Rule> singulars = [];
readonly HashSet<string> uncountables = new(StringComparer.CurrentCultureIgnoreCase);
readonly Regex letterS = new("^([sS])[sS]*$");

/// <summary>
/// A container for exceptions to simple pluralization/singularization rules.
/// Vocabularies.Default contains an extensive list of rules for US English.
/// At this time, multiple vocabularies and removing existing rules are not supported.
/// Adds a word to the vocabulary which cannot easily be pluralized/singularized by RegEx, e.g. "person" and "people".
/// </summary>
public class Vocabulary
/// <param name="singular">The singular form of the irregular word, e.g. "person".</param>
/// <param name="plural">The plural form of the irregular word, e.g. "people".</param>
/// <param name="matchEnding">True to match these words on their own as well as at the end of longer words. False, otherwise.</param>
public void AddIrregular(string singular, string plural, bool matchEnding = true)
{
internal Vocabulary()
if (matchEnding)
{
var singularSubstring = singular.Substring(1);
var pluralSubString = plural.Substring(1);
AddPlural($"({singular[0]}){singularSubstring}$", $"$1{pluralSubString}");
AddSingular($"({plural[0]}){pluralSubString}$", $"$1{singularSubstring}");
}

private readonly List<Rule> _plurals = new List<Rule>();
private readonly List<Rule> _singulars = new List<Rule>();
private readonly HashSet<string> _uncountables = new(StringComparer.CurrentCultureIgnoreCase);
private readonly Regex _letterS = new Regex("^([sS])[sS]*$");

/// <summary>
/// Adds a word to the vocabulary which cannot easily be pluralized/singularized by RegEx, e.g. "person" and "people".
/// </summary>
/// <param name="singular">The singular form of the irregular word, e.g. "person".</param>
/// <param name="plural">The plural form of the irregular word, e.g. "people".</param>
/// <param name="matchEnding">True to match these words on their own as well as at the end of longer words. False, otherwise.</param>
public void AddIrregular(string singular, string plural, bool matchEnding = true)
else
{
if (matchEnding)
{
AddPlural("(" + singular[0] + ")" + singular.Substring(1) + "$", "$1" + plural.Substring(1));
AddSingular("(" + plural[0] + ")" + plural.Substring(1) + "$", "$1" + singular.Substring(1));
}
else
{
AddPlural($"^{singular}$", plural);
AddSingular($"^{plural}$", singular);
}
AddPlural($"^{singular}$", plural);
AddSingular($"^{plural}$", singular);
}
}

/// <summary>
/// Registers a word that has no distinct plural form, e.g. "fish".
/// Words registered here are returned unchanged when rules are applied.
/// </summary>
/// <param name="word">The uncountable word to register.</param>
public void AddUncountable(string word) =>
    _uncountables.Add(word);
/// <summary>
/// Registers a word that has no distinct plural form, e.g. "fish".
/// Words registered here are returned unchanged when rules are applied.
/// </summary>
/// <param name="word">The uncountable word to register.</param>
public void AddUncountable(string word)
{
    uncountables.Add(word);
}

/// <summary>
/// Registers a pluralization rule for words that do not follow the trivial pattern, e.g. "bus" -> "buses".
/// Rules are tried from the most recently added to the oldest, and the first match wins.
/// </summary>
/// <param name="rule">Regular expression to match against the word, case insensitive, e.g. "(bus)$".</param>
/// <param name="replacement">Regular expression replacement string, e.g. "$1es".</param>
public void AddPlural(string rule, string replacement) =>
    _plurals.Add(new Rule(rule, replacement));
/// <summary>
/// Registers a pluralization rule for words that do not follow the trivial pattern, e.g. "bus" -> "buses".
/// Rules are tried from the most recently added to the oldest, and the first match wins.
/// </summary>
/// <param name="rule">Regular expression to match against the word, case insensitive, e.g. "(bus)$".</param>
/// <param name="replacement">Regular expression replacement string, e.g. "$1es".</param>
public void AddPlural(string rule, string replacement)
{
    var pluralRule = new Rule(rule, replacement);
    plurals.Add(pluralRule);
}

/// <summary>
/// Registers a singularization rule for words that do not follow the trivial pattern,
/// e.g. "vertices"/"indices" -> "vertex"/"index".
/// Rules are tried from the most recently added to the oldest, and the first match wins.
/// </summary>
/// <param name="rule">Regular expression to match against the word, case insensitive, e.g. "(vert|ind)ices$".</param>
/// <param name="replacement">Regular expression replacement string, e.g. "$1ex".</param>
public void AddSingular(string rule, string replacement) =>
    _singulars.Add(new Rule(rule, replacement));
/// <summary>
/// Registers a singularization rule for words that do not follow the trivial pattern,
/// e.g. "vertices"/"indices" -> "vertex"/"index".
/// Rules are tried from the most recently added to the oldest, and the first match wins.
/// </summary>
/// <param name="rule">Regular expression to match against the word, case insensitive, e.g. "(vert|ind)ices$".</param>
/// <param name="replacement">Regular expression replacement string, e.g. "$1ex".</param>
public void AddSingular(string rule, string replacement)
{
    var singularRule = new Rule(rule, replacement);
    singulars.Add(singularRule);
}

/// <summary>
/// Pluralizes the provided input considering irregular words
/// </summary>
/// <param name="word">Word to be pluralized</param>
/// <param name="inputIsKnownToBeSingular">Normally you call Pluralize on singular words; but if you're unsure call it with false</param>
public string Pluralize(string word, bool inputIsKnownToBeSingular = true)
/// <summary>
/// Pluralizes the provided input considering irregular words
/// </summary>
/// <param name="word">Word to be pluralized</param>
/// <param name="inputIsKnownToBeSingular">Normally you call Pluralize on singular words; but if you're unsure call it with false</param>
public string Pluralize(string word, bool inputIsKnownToBeSingular = true)
{
var s = LetterS(word);
if (s != null)
{
var s = LetterS(word);
if (s != null)
{
return s + "s";
}
return s + "s";
}

var result = ApplyRules(_plurals, word, false);
var result = ApplyRules(plurals, word, false);

if (inputIsKnownToBeSingular)
{
return result ?? word;
}

var asSingular = ApplyRules(_singulars, word, false);
var asSingularAsPlural = ApplyRules(_plurals, asSingular, false);
if (asSingular != null && asSingular != word && asSingular + "s" != word && asSingularAsPlural == word && result != word)
{
return word;
}

return result;
if (inputIsKnownToBeSingular)
{
return result ?? word;
}

/// <summary>
/// Singularizes the provided input considering irregular words
/// </summary>
/// <param name="word">Word to be singularized</param>
/// <param name="inputIsKnownToBePlural">Normally you call Singularize on plural words; but if you're unsure call it with false</param>
/// <param name="skipSimpleWords">Skip singularizing single words that have an 's' on the end</param>
public string Singularize(string word, bool inputIsKnownToBePlural = true, bool skipSimpleWords = false)
var asSingular = ApplyRules(singulars, word, false);
var asSingularAsPlural = ApplyRules(plurals, asSingular, false);
if (asSingular != null &&
asSingular != word &&
asSingular + "s" != word &&
asSingularAsPlural == word &&
result != word)
{
var s = LetterS(word);
if (s != null)
{
return s;
}
return word;
}

var result = ApplyRules(_singulars, word, skipSimpleWords);
return result;
}

if (inputIsKnownToBePlural)
{
return result ?? word;
}
/// <summary>
/// Singularizes the provided input considering irregular words
/// </summary>
/// <param name="word">Word to be singularized</param>
/// <param name="inputIsKnownToBePlural">Normally you call Singularize on plural words; but if you're unsure call it with false</param>
/// <param name="skipSimpleWords">Skip singularizing single words that have an 's' on the end</param>
public string Singularize(string word, bool inputIsKnownToBePlural = true, bool skipSimpleWords = false)
{
var s = LetterS(word);
if (s != null)
{
return s;
}

// the Plurality is unknown so we should check all possibilities
var asPlural = ApplyRules(_plurals, word, false);
var asPluralAsSingular = ApplyRules(_singulars, asPlural, false);
if (asPlural != word && word + "s" != asPlural && asPluralAsSingular == word && result != word)
{
return word;
}
var result = ApplyRules(singulars, word, skipSimpleWords);

if (inputIsKnownToBePlural)
{
return result ?? word;
}

private string ApplyRules(IList<Rule> rules, string word, bool skipFirstRule)
// the Plurality is unknown so we should check all possibilities
var asPlural = ApplyRules(plurals, word, false);
var asPluralAsSingular = ApplyRules(singulars, asPlural, false);
if (asPlural == word ||
word + "s" == asPlural ||
asPluralAsSingular != word ||
result == word)
{
if (word == null)
{
return null;
}

if (word.Length < 1)
{
return word;
}
return result ?? word;
}

if (IsUncountable(word))
{
return word;
}
return word;
}

var result = word;
var end = skipFirstRule ? 1 : 0;
for (var i = rules.Count - 1; i >= end; i--)
{
if ((result = rules[i].Apply(word)) != null)
{
break;
}
}
return result != null ? MatchUpperCase(word, result) : result;
string ApplyRules(IList<Rule> rules, string word, bool skipFirstRule)
{
if (word == null)
{
return null;
}

private bool IsUncountable(string word)
if (word.Length < 1)
{
return _uncountables.Contains(word);
return word;
}

private string MatchUpperCase(string word, string replacement)
if (IsUncountable(word))
{
return char.IsUpper(word[0]) && char.IsLower(replacement[0]) ? char.ToUpper(replacement[0]) + replacement.Substring(1) : replacement;
return word;
}

/// <summary>
/// If the word is the letter s, singular or plural, return the letter s singular
/// </summary>
private string LetterS(string word)
var result = word;
var end = skipFirstRule ? 1 : 0;
for (var i = rules.Count - 1; i >= end; i--)
{
var s = _letterS.Match(word);
return s.Groups.Count > 1 ? s.Groups[1].Value : null;
if ((result = rules[i].Apply(word)) != null)
{
break;
}
}

private class Rule
if (result == null)
{
private readonly Regex _regex;
private readonly string _replacement;
return null;
}

public Rule(string pattern, string replacement)
{
_regex = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptionsUtil.Compiled);
_replacement = replacement;
}
return MatchUpperCase(word, result);
}

public string Apply(string word)
{
if (!_regex.IsMatch(word))
{
return null;
}
/// <summary>
/// Reports whether <paramref name="word"/> has been registered as uncountable.
/// The lookup uses whatever comparer the uncountables set was created with.
/// </summary>
bool IsUncountable(string word)
{
    return uncountables.Contains(word);
}

return _regex.Replace(word, _replacement);
/// <summary>
/// Carries a leading capital over from the original word to its replacement:
/// when <paramref name="word"/> starts with an upper-case character and
/// <paramref name="replacement"/> starts with a lower-case one, the first
/// character of the replacement is upper-cased. Otherwise the replacement is
/// returned untouched.
/// </summary>
/// <param name="word">The original word; expected to be non-empty.</param>
/// <param name="replacement">The text produced by a rule for <paramref name="word"/>.</param>
/// <returns>The replacement, with its first character upper-cased when required.</returns>
static string MatchUpperCase(string word, string replacement)
{
    // A rule can rewrite a word to the empty string; there is no first
    // character to inspect then, so indexing would throw.
    if (replacement.Length == 0)
    {
        return replacement;
    }

    if (char.IsUpper(word[0]) && char.IsLower(replacement[0]))
    {
        return char.ToUpper(replacement[0]) + replacement.Substring(1);
    }

    return replacement;
}

/// <summary>
/// Recognises words that consist only of the letter "s" (any case, one or more
/// times) and returns their first character; returns null for any other word.
/// </summary>
string LetterS(string word)
{
    var match = letterS.Match(word);

    // A failed match exposes only the implicit group 0, so the captured
    // first letter is available only when more than one group is present.
    if (match.Groups.Count <= 1)
    {
        return null;
    }

    return match.Groups[1].Value;
}

/// <summary>
/// A single inflection rule: a case-insensitive regular expression paired with
/// the replacement string to apply when the expression matches.
/// </summary>
class Rule(string pattern, string replacement)
{
    // Built once per rule; RegexOptionsUtil.Compiled is defined elsewhere in the project.
    readonly Regex regex = new(pattern, RegexOptions.IgnoreCase | RegexOptionsUtil.Compiled);

    /// <summary>
    /// Applies the rule to <paramref name="word"/>.
    /// </summary>
    /// <returns>The rewritten word, or null when the pattern does not match.</returns>
    public string Apply(string word) =>
        regex.IsMatch(word)
            ? regex.Replace(word, replacement)
            : null;
}
}
}

0 comments on commit 8970d74

Please sign in to comment.