Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improved BDD Unicode table representation in NonBacktracking engine #61142

Merged
merged 5 commits into from
Nov 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,16 @@ internal sealed class BDD : IComparable
/// </summary>
private static readonly long[] s_trueRepresentation = new long[] { 1 };

/// <summary>
/// Representation of False for compact serialization of BDDs.
/// </summary>
private static readonly byte[] s_falseRepresentationCompact = new byte[] { 0 };

/// <summary>
/// Representation of True for compact serialization of BDDs.
/// </summary>
private static readonly byte[] s_trueRepresentationCompact = new byte[] { 1 };

internal BDD(int ordinal, BDD? one, BDD? zero)
{
One = one;
Expand Down Expand Up @@ -300,6 +310,49 @@ public long[] Serialize()
return res;
}

/// <summary>
/// Serialize this BDD into a byte array.
/// This method is not valid for MTBDDs where some elements may be negative.
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
/// </summary>
public byte[] SerializeToBytes()
{
if (IsEmpty)
return s_falseRepresentationCompact;

if (IsFull)
return s_trueRepresentationCompact;

// in other cases make use of the general serializer to long[]
long[] serialized = Serialize();

// get the maximal element from the array
long m = 0;
for (int i = 0; i < serialized.Length; i++)
{
// make sure this serialization is not applied to MTBDDs
Debug.Assert(serialized[i] > 0);
m = Math.Max(serialized[i], m);
}

// k is the number of bytes needed to represent the maximal element
int k = m <= 0xFFFF ? 2 : (m <= 0xFF_FFFF ? 3 : (m <= 0xFFFF_FFFF ? 4 : (m <= 0xFF_FFFF_FFFF ? 5 : (m <= 0xFFFF_FFFF_FFFF ? 6 : (m <= 0xFF_FFFF_FFFF_FFFF ? 7 : 8)))));

// the result will contain k as the first element and the number of serialized elements times k
byte[] result = new byte[(k * serialized.Length) + 1];
result[0] = (byte)k;
for (int i=0; i < serialized.Length; i += 1)
{
long serialized_i = serialized[i];
// add the serialized longs as k-byte subsequences
for (int j = 1; j <= k; j++)
{
result[(i * k) + j] = (byte)serialized_i;
serialized_i = serialized_i >> 8;
}
}
return result;
}

/// <summary>
/// Recreates a BDD from a ulong array that has been created using Serialize.
/// Is executed using a lock on algebra (if algebra != null) in a single thread mode.
Expand Down Expand Up @@ -353,6 +406,70 @@ public static BDD Deserialize(long[] arcs, BDDAlgebra algebra)
return nodes[k - 1];
}

/// <summary>
/// Recreates a BDD from a byte array that has been created using SerializeToBytes.
/// </summary>
public static BDD Deserialize(byte[] bytes, BDDAlgebra algebra)
{
if (bytes.Length == 1)
{
return bytes[0] == 0 ? False : True;
}

// here bytes represents an array of longs with k = the number of bytes used per long
int k = (int)bytes[0];

// gets the i'th element from the underlying array of longs represented by bytes
long Get(int i)
{
long l = 0;
for (int j = k; j > 0; j--)
{
l = (l << 8) | bytes[(k * i) + j];
}
return l;
}

// n is the total nr of longs that corresponds also to the total number of BDD nodes needed
int n = (bytes.Length - 1) / k;

// make sure the represented nr of longs divides precisely without remainder
Debug.Assert((bytes.Length - 1) % k == 0);

// the number of bits used for ordinals and node identifiers are stored in the first two longs
int ordinal_bits = (int)Get(0);
int node_bits = (int)Get(1);

// create bit masks for the sizes of ordinals and node identifiers
long ordinal_mask = (1 << ordinal_bits) - 1;
long node_mask = (1 << node_bits) - 1;
BitLayout(ordinal_bits, node_bits, out int zero_node_shift, out int one_node_shift, out int ordinal_shift);

// store BDD nodes by their id when they are created
BDD[] nodes = new BDD[n];
nodes[0] = False;
nodes[1] = True;

for (int i = 2; i < n; i++)
{
// represents the triple (ordinal, one, zero)
long arc = Get(i);

// reconstruct the ordinal and child identifiers for a non-terminal
int ord = (int)((arc >> ordinal_shift) & ordinal_mask);
int oneId = (int)((arc >> one_node_shift) & node_mask);
int zeroId = (int)((arc >> zero_node_shift) & node_mask);

// the BDD nodes for the children are guaranteed to exist already
// because of the topological order they were serialized by
Debug.Assert(oneId < i && zeroId < i);
nodes[i] = algebra.GetOrCreateBDD(ord, nodes[oneId], nodes[zeroId]);
}

//the result is the last BDD in the nodes array
return nodes[n - 1];
}

/// <summary>
/// Use this bit layout in the serialization
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ namespace System.Text.RegularExpressions.Symbolic
/// </summary>
internal sealed class CharSetSolver : BDDAlgebra, ICharAlgebra<BDD>
{
/// <summary>BDDs for all characters for fast lookup.</summary>
private readonly BDD[] _charPredTable = new BDD[char.MaxValue + 1];
/// <summary>BDDs for all ASCII characters for fast lookup.</summary>
private readonly BDD[] _charPredTable = new BDD[128];
private readonly Unicode.IgnoreCaseTransformer _ignoreCase;
internal readonly BDD _nonAscii;

Expand All @@ -40,10 +40,23 @@ public BDD CharConstraint(char c, bool ignoreCase = false, string? culture = nul
else
{
//individual character BDDs are always fixed
return _charPredTable[c] ??= CreateSetFrom(c, 15);
BDD[] charPredTable = _charPredTable;
return c < charPredTable.Length ?
charPredTable[c] ??= CreateBDDFromChar(c) :
CreateBDDFromChar(c);
}
}

private BDD CreateBDDFromChar(ushort c)
{
BDD bdd = BDD.True;
for (int k = 0; k < 16; k++)
{
bdd = (c & (1 << k)) == 0 ? GetOrCreateBDD(k, BDD.False, bdd) : GetOrCreateBDD(k, bdd, BDD.False);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there cheaper ways to build up a BDD? Maybe the caching involved helps, but it seems like otherwise this is going to incrementally build up the BDD by creating 15 intermediate ones that are then thrown away?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are better ways, I think, but this would involve using e.g. a designated array and non-object base representation with own memory-management over that array.
However this incremental build only happens once per ASCII character, I think it is negligible.

}
return bdd;
}

/// <summary>
/// Make a CharSet from all the characters in the range from m to n.
/// Returns the empty set if n is less than m
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,16 @@ public static void WriteInt64ArrayInitSyntax(StreamWriter sw, long[] values)
}
sw.Write("}");
}

public static void WriteByteArrayInitSyntax(StreamWriter sw, byte[] values)
{
sw.Write("new byte[] {");
for (int i = 0; i < values.Length; i++)
{
sw.Write($" 0x{values[i]:X}, ");
}
sw.Write("}");
}
}
#endif
}

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -41,70 +41,54 @@ private static void WriteIgnoreCaseBDD(StreamWriter sw)
sw.WriteLine(" /// <summary>Serialized BDD for mapping characters to their case-ignoring equivalence classes in the default (en-US) culture.</summary>");

var solver = new CharSetSolver();
Dictionary<char, BDD> ignoreCase = ComputeIgnoreCaseDictionary(solver, new CultureInfo(DefaultCultureName));
List<EquivalenceClass> ignoreCaseEquivalenceClasses = ComputeIgnoreCaseEquivalenceClasses(solver, new CultureInfo(DefaultCultureName));
BDD ignorecase = solver.False;
foreach (KeyValuePair<char, BDD> kv in ignoreCase)
foreach (EquivalenceClass ec in ignoreCaseEquivalenceClasses)
{
BDD a = solver.CreateCharSetFromRange(kv.Key, kv.Key);
BDD b = kv.Value;
ignorecase = solver.Or(ignorecase, solver.And(solver.ShiftLeft(a, 16), b));
// Create the Cartesian product of ec._set with itself
BDD crossproduct = solver.And(solver.ShiftLeft(ec._set, 16), ec._set);
// Add the product into the overall lookup table
ignorecase = solver.Or(ignorecase, crossproduct);
}

sw.Write(" public static readonly long[] IgnoreCaseEnUsSerializedBDD = ");
GeneratorHelper.WriteInt64ArrayInitSyntax(sw, ignorecase.Serialize());
sw.Write(" public static readonly byte[] IgnoreCaseEnUsSerializedBDD = ");
GeneratorHelper.WriteByteArrayInitSyntax(sw, ignorecase.SerializeToBytes());
sw.WriteLine(";");
}

private static Dictionary<char, BDD> ComputeIgnoreCaseDictionary(CharSetSolver solver, CultureInfo culture)
private static List<EquivalenceClass> ComputeIgnoreCaseEquivalenceClasses(CharSetSolver solver, CultureInfo culture)
{
CultureInfo originalCulture = CultureInfo.CurrentCulture;
try
{
CultureInfo.CurrentCulture = culture;
var ignoreCase = new Dictionary<char, EquivalenceClass>();
var sets = new List<EquivalenceClass>();

var ignoreCase = new Dictionary<char, BDD>();
for (uint i = 65; i <= 0xFFFF; i++)
{
char C = (char)i;
char c = char.ToLower(C, culture);

for (uint i = 0; i <= 0xFFFF; i++)
if (c == C)
{
char c = (char)i;
char cUpper = char.ToUpper(c);
char cLower = char.ToLower(c);

if (cUpper == cLower)
{
continue;
}

// c may be different from both cUpper as well as cLower.
// Make sure that the regex engine considers c as being equivalent to cUpper and cLower, else ignore c.
// In some cases c != cU but the regex engine does not consider the chacarters equivalent wrt the ignore-case option.
if (Regex.IsMatch($"{cUpper}{cLower}", $"^(?i:\\u{i:X4}\\u{i:X4})$"))
{
BDD equiv = solver.False;

if (ignoreCase.ContainsKey(c))
equiv = solver.Or(equiv, ignoreCase[c]);

if (ignoreCase.ContainsKey(cUpper))
equiv = solver.Or(equiv, ignoreCase[cUpper]);

if (ignoreCase.ContainsKey(cLower))
equiv = solver.Or(equiv, ignoreCase[cLower]);

// Make sure all characters are included initially or when some is still missing
equiv = solver.Or(equiv, solver.Or(solver.CreateCharSetFromRange(c, c), solver.Or(solver.CreateCharSetFromRange(cUpper, cUpper), solver.CreateCharSetFromRange(cLower, cLower))));

// Update all the members with their case-invariance equivalence classes
foreach (char d in solver.GenerateAllCharacters(equiv))
ignoreCase[d] = equiv;
}
continue;
}

return ignoreCase;
EquivalenceClass? ec;
if (!ignoreCase.TryGetValue(c, out ec))
{
ec = new EquivalenceClass(solver.CharConstraint(c));
ignoreCase[c] = ec;
sets.Add(ec);
}
ec._set = solver.Or(ec._set, solver.CharConstraint(C));
}
finally
return sets;
}

private class EquivalenceClass
{
public BDD _set;
public EquivalenceClass(BDD set)
{
CultureInfo.CurrentCulture = originalCulture;
_set = set;
}
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,10 @@ private IgnoreCaseRelation EnsureDefault()
if (_relationDefault is null)
{
BDD instance = BDD.Deserialize(Unicode.IgnoreCaseRelation.IgnoreCaseEnUsSerializedBDD, _solver);
byte[] tmp = instance.SerializeToBytes();
BDD instance2 = BDD.Deserialize(tmp, _solver);
if (instance != instance2)
throw new Exception();
BDD instanceDomain = _solver.ShiftRight(instance, 16); // represents the set of all case-sensitive characters in the default culture.
_relationDefault = new IgnoreCaseRelation(instance, instanceDomain);
}
Expand Down
Loading