-
Notifications
You must be signed in to change notification settings - Fork 970
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
reduce some string allocation in Vocabulary (#1355)
- Loading branch information
1 parent
62941ba
commit 8970d74
Showing
2 changed files
with
153 additions
and
156 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
global using System.Globalization; | ||
global using System.Text.RegularExpressions; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,202 +1,198 @@ | ||
using System.Text.RegularExpressions; | ||
|
||
namespace Humanizer | ||
namespace Humanizer; | ||
|
||
/// <summary> | ||
/// A container for exceptions to simple pluralization/singularization rules. | ||
/// Vocabularies.Default contains an extensive list of rules for US English. | ||
/// At this time, multiple vocabularies and removing existing rules are not supported. | ||
/// </summary> | ||
public class Vocabulary | ||
{ | ||
internal Vocabulary() | ||
{ | ||
} | ||
|
||
readonly List<Rule> plurals = []; | ||
readonly List<Rule> singulars = []; | ||
readonly HashSet<string> uncountables = new(StringComparer.CurrentCultureIgnoreCase); | ||
readonly Regex letterS = new("^([sS])[sS]*$"); | ||
|
||
/// <summary> | ||
/// A container for exceptions to simple pluralization/singularization rules. | ||
/// Vocabularies.Default contains an extensive list of rules for US English. | ||
/// At this time, multiple vocabularies and removing existing rules are not supported. | ||
/// Adds a word to the vocabulary which cannot easily be pluralized/singularized by RegEx, e.g. "person" and "people". | ||
/// </summary> | ||
public class Vocabulary | ||
/// <param name="singular">The singular form of the irregular word, e.g. "person".</param> | ||
/// <param name="plural">The plural form of the irregular word, e.g. "people".</param> | ||
/// <param name="matchEnding">True to match these words on their own as well as at the end of longer words. False, otherwise.</param> | ||
public void AddIrregular(string singular, string plural, bool matchEnding = true) | ||
{ | ||
internal Vocabulary() | ||
if (matchEnding) | ||
{ | ||
var singularSubstring = singular.Substring(1); | ||
var pluralSubString = plural.Substring(1); | ||
AddPlural($"({singular[0]}){singularSubstring}$", $"$1{pluralSubString}"); | ||
AddSingular($"({plural[0]}){pluralSubString}$", $"$1{singularSubstring}"); | ||
} | ||
|
||
private readonly List<Rule> _plurals = new List<Rule>(); | ||
private readonly List<Rule> _singulars = new List<Rule>(); | ||
private readonly HashSet<string> _uncountables = new(StringComparer.CurrentCultureIgnoreCase); | ||
private readonly Regex _letterS = new Regex("^([sS])[sS]*$"); | ||
|
||
/// <summary> | ||
/// Adds a word to the vocabulary which cannot easily be pluralized/singularized by RegEx, e.g. "person" and "people". | ||
/// </summary> | ||
/// <param name="singular">The singular form of the irregular word, e.g. "person".</param> | ||
/// <param name="plural">The plural form of the irregular word, e.g. "people".</param> | ||
/// <param name="matchEnding">True to match these words on their own as well as at the end of longer words. False, otherwise.</param> | ||
public void AddIrregular(string singular, string plural, bool matchEnding = true) | ||
else | ||
{ | ||
if (matchEnding) | ||
{ | ||
AddPlural("(" + singular[0] + ")" + singular.Substring(1) + "$", "$1" + plural.Substring(1)); | ||
AddSingular("(" + plural[0] + ")" + plural.Substring(1) + "$", "$1" + singular.Substring(1)); | ||
} | ||
else | ||
{ | ||
AddPlural($"^{singular}$", plural); | ||
AddSingular($"^{plural}$", singular); | ||
} | ||
AddPlural($"^{singular}$", plural); | ||
AddSingular($"^{plural}$", singular); | ||
} | ||
} | ||
|
||
/// <summary> | ||
/// Adds an uncountable word to the vocabulary, e.g. "fish". Will be ignored when plurality is changed. | ||
/// </summary> | ||
/// <param name="word">Word to be added to the list of uncountables.</param> | ||
public void AddUncountable(string word) | ||
{ | ||
_uncountables.Add(word); | ||
} | ||
/// <summary> | ||
/// Adds an uncountable word to the vocabulary, e.g. "fish". Will be ignored when plurality is changed. | ||
/// </summary> | ||
/// <param name="word">Word to be added to the list of uncountables.</param> | ||
public void AddUncountable(string word) => | ||
uncountables.Add(word); | ||
|
||
/// <summary> | ||
/// Adds a rule to the vocabulary that does not follow trivial rules for pluralization, e.g. "bus" -> "buses" | ||
/// </summary> | ||
/// <param name="rule">RegEx to be matched, case insensitive, e.g. "(bus)$"</param> | |
/// <param name="replacement">RegEx replacement e.g. "$1es"</param> | |
public void AddPlural(string rule, string replacement) | ||
{ | ||
_plurals.Add(new Rule(rule, replacement)); | ||
} | ||
/// <summary> | ||
/// Adds a rule to the vocabulary that does not follow trivial rules for pluralization, e.g. "bus" -> "buses" | ||
/// </summary> | ||
/// <param name="rule">RegEx to be matched, case insensitive, e.g. "(bus)$"</param> | |
/// <param name="replacement">RegEx replacement e.g. "$1es"</param> | |
public void AddPlural(string rule, string replacement) => | ||
plurals.Add(new(rule, replacement)); | ||
|
||
/// <summary> | ||
/// Adds a rule to the vocabulary that does not follow trivial rules for singularization, e.g. "vertices/indices" -> "vertex/index" | |
/// </summary> | ||
/// <param name="rule">RegEx to be matched, case insensitive, e.g. "(vert|ind)ices$"</param> | |
/// <param name="replacement">RegEx replacement e.g. "$1ex"</param> | ||
public void AddSingular(string rule, string replacement) | ||
{ | ||
_singulars.Add(new Rule(rule, replacement)); | ||
} | ||
/// <summary> | ||
/// Adds a rule to the vocabulary that does not follow trivial rules for singularization, e.g. "vertices/indices" -> "vertex/index" | |
/// </summary> | ||
/// <param name="rule">RegEx to be matched, case insensitive, e.g. "(vert|ind)ices$"</param> | |
/// <param name="replacement">RegEx replacement e.g. "$1ex"</param> | ||
public void AddSingular(string rule, string replacement) => | ||
singulars.Add(new(rule, replacement)); | ||
|
||
/// <summary> | ||
/// Pluralizes the provided input considering irregular words | ||
/// </summary> | ||
/// <param name="word">Word to be pluralized</param> | ||
/// <param name="inputIsKnownToBeSingular">Normally you call Pluralize on singular words; but if you're unsure call it with false</param> | ||
public string Pluralize(string word, bool inputIsKnownToBeSingular = true) | ||
/// <summary> | ||
/// Pluralizes the provided input considering irregular words | ||
/// </summary> | ||
/// <param name="word">Word to be pluralized</param> | ||
/// <param name="inputIsKnownToBeSingular">Normally you call Pluralize on singular words; but if you're unsure call it with false</param> | ||
public string Pluralize(string word, bool inputIsKnownToBeSingular = true) | ||
{ | ||
var s = LetterS(word); | ||
if (s != null) | ||
{ | ||
var s = LetterS(word); | ||
if (s != null) | ||
{ | ||
return s + "s"; | ||
} | ||
return s + "s"; | ||
} | ||
|
||
var result = ApplyRules(_plurals, word, false); | ||
var result = ApplyRules(plurals, word, false); | ||
|
||
if (inputIsKnownToBeSingular) | ||
{ | ||
return result ?? word; | ||
} | ||
|
||
var asSingular = ApplyRules(_singulars, word, false); | ||
var asSingularAsPlural = ApplyRules(_plurals, asSingular, false); | ||
if (asSingular != null && asSingular != word && asSingular + "s" != word && asSingularAsPlural == word && result != word) | ||
{ | ||
return word; | ||
} | ||
|
||
return result; | ||
if (inputIsKnownToBeSingular) | ||
{ | ||
return result ?? word; | ||
} | ||
|
||
/// <summary> | ||
/// Singularizes the provided input considering irregular words | ||
/// </summary> | ||
/// <param name="word">Word to be singularized</param> | ||
/// <param name="inputIsKnownToBePlural">Normally you call Singularize on plural words; but if you're unsure call it with false</param> | ||
/// <param name="skipSimpleWords">Skip singularizing single words that have an 's' on the end</param> | ||
public string Singularize(string word, bool inputIsKnownToBePlural = true, bool skipSimpleWords = false) | ||
var asSingular = ApplyRules(singulars, word, false); | ||
var asSingularAsPlural = ApplyRules(plurals, asSingular, false); | ||
if (asSingular != null && | ||
asSingular != word && | ||
asSingular + "s" != word && | ||
asSingularAsPlural == word && | ||
result != word) | ||
{ | ||
var s = LetterS(word); | ||
if (s != null) | ||
{ | ||
return s; | ||
} | ||
return word; | ||
} | ||
|
||
var result = ApplyRules(_singulars, word, skipSimpleWords); | ||
return result; | ||
} | ||
|
||
if (inputIsKnownToBePlural) | ||
{ | ||
return result ?? word; | ||
} | ||
/// <summary> | ||
/// Singularizes the provided input considering irregular words | ||
/// </summary> | ||
/// <param name="word">Word to be singularized</param> | ||
/// <param name="inputIsKnownToBePlural">Normally you call Singularize on plural words; but if you're unsure call it with false</param> | ||
/// <param name="skipSimpleWords">Skip singularizing single words that have an 's' on the end</param> | ||
public string Singularize(string word, bool inputIsKnownToBePlural = true, bool skipSimpleWords = false) | ||
{ | ||
var s = LetterS(word); | ||
if (s != null) | ||
{ | ||
return s; | ||
} | ||
|
||
// the Plurality is unknown so we should check all possibilities | ||
var asPlural = ApplyRules(_plurals, word, false); | ||
var asPluralAsSingular = ApplyRules(_singulars, asPlural, false); | ||
if (asPlural != word && word + "s" != asPlural && asPluralAsSingular == word && result != word) | ||
{ | ||
return word; | ||
} | ||
var result = ApplyRules(singulars, word, skipSimpleWords); | ||
|
||
if (inputIsKnownToBePlural) | ||
{ | ||
return result ?? word; | ||
} | ||
|
||
private string ApplyRules(IList<Rule> rules, string word, bool skipFirstRule) | ||
// the Plurality is unknown so we should check all possibilities | ||
var asPlural = ApplyRules(plurals, word, false); | ||
var asPluralAsSingular = ApplyRules(singulars, asPlural, false); | ||
if (asPlural == word || | ||
word + "s" == asPlural || | ||
asPluralAsSingular != word || | ||
result == word) | ||
{ | ||
if (word == null) | ||
{ | ||
return null; | ||
} | ||
|
||
if (word.Length < 1) | ||
{ | ||
return word; | ||
} | ||
return result ?? word; | ||
} | ||
|
||
if (IsUncountable(word)) | ||
{ | ||
return word; | ||
} | ||
return word; | ||
} | ||
|
||
var result = word; | ||
var end = skipFirstRule ? 1 : 0; | ||
for (var i = rules.Count - 1; i >= end; i--) | ||
{ | ||
if ((result = rules[i].Apply(word)) != null) | ||
{ | ||
break; | ||
} | ||
} | ||
return result != null ? MatchUpperCase(word, result) : result; | ||
string ApplyRules(IList<Rule> rules, string word, bool skipFirstRule) | ||
{ | ||
if (word == null) | ||
{ | ||
return null; | ||
} | ||
|
||
private bool IsUncountable(string word) | ||
if (word.Length < 1) | ||
{ | ||
return _uncountables.Contains(word); | ||
return word; | ||
} | ||
|
||
private string MatchUpperCase(string word, string replacement) | ||
if (IsUncountable(word)) | ||
{ | ||
return char.IsUpper(word[0]) && char.IsLower(replacement[0]) ? char.ToUpper(replacement[0]) + replacement.Substring(1) : replacement; | ||
return word; | ||
} | ||
|
||
/// <summary> | ||
/// If the word is the letter s, singular or plural, return the letter s singular | ||
/// </summary> | ||
private string LetterS(string word) | ||
var result = word; | ||
var end = skipFirstRule ? 1 : 0; | ||
for (var i = rules.Count - 1; i >= end; i--) | ||
{ | ||
var s = _letterS.Match(word); | ||
return s.Groups.Count > 1 ? s.Groups[1].Value : null; | ||
if ((result = rules[i].Apply(word)) != null) | ||
{ | ||
break; | ||
} | ||
} | ||
|
||
private class Rule | ||
if (result == null) | ||
{ | ||
private readonly Regex _regex; | ||
private readonly string _replacement; | ||
return null; | ||
} | ||
|
||
public Rule(string pattern, string replacement) | ||
{ | ||
_regex = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptionsUtil.Compiled); | ||
_replacement = replacement; | ||
} | ||
return MatchUpperCase(word, result); | ||
} | ||
|
||
public string Apply(string word) | ||
{ | ||
if (!_regex.IsMatch(word)) | ||
{ | ||
return null; | ||
} | ||
bool IsUncountable(string word) => | ||
uncountables.Contains(word); | ||
|
||
return _regex.Replace(word, _replacement); | ||
static string MatchUpperCase(string word, string replacement) => | ||
char.IsUpper(word[0]) && | ||
char.IsLower(replacement[0]) ? char.ToUpper(replacement[0]) + replacement.Substring(1) : replacement; | ||
|
||
/// <summary> | ||
/// If the word is the letter s, singular or plural, return the letter s singular | ||
/// </summary> | ||
string LetterS(string word) | ||
{ | ||
var s = letterS.Match(word); | ||
return s.Groups.Count > 1 ? s.Groups[1].Value : null; | ||
} | ||
|
||
class Rule(string pattern, string replacement) | ||
{ | ||
private readonly Regex regex = new(pattern, RegexOptions.IgnoreCase | RegexOptionsUtil.Compiled); | ||
|
||
public string Apply(string word) | ||
{ | ||
if (!regex.IsMatch(word)) | ||
{ | ||
return null; | ||
} | ||
|
||
return regex.Replace(word, replacement); | ||
} | ||
} | ||
} | ||
} |