Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bpo-34751: improved hash function for tuples #9471

Merged
merged 2 commits into from
Oct 28, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 93 additions & 18 deletions Lib/test/test_tuple.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,29 +62,104 @@ def f():
yield i
self.assertEqual(list(tuple(f())), list(range(1000)))

def test_hash(self):
# See SF bug 942952: Weakness in tuple hash
# The hash should:
# be non-commutative
# should spread-out closely spaced values
# should not exhibit cancellation in tuples like (x,(x,y))
# should be distinct from element hashes: hash(x)!=hash((x,))
# This test exercises those cases.
# For a pure random hash and N=50, the expected number of occupied
# buckets when tossing 252,600 balls into 2**32 buckets
# is 252,592.6, or about 7.4 expected collisions. The
# standard deviation is 2.73. On a box with 64-bit hash
# codes, no collisions are expected. Here we accept no
# more than 15 collisions. Any worse and the hash function
# is sorely suspect.

# Various tests for hashing of tuples to check that we get few collisions.
#
# Earlier versions of the tuple hash algorithm had collisions
# reported at:
# - https://bugs.python.org/issue942952
# - https://bugs.python.org/issue34751
#
# Notes:
# - The hash of tuples is deterministic: if the test passes once on a given
# system, it will always pass. So the probabilities mentioned in the
# test_hash functions below should be interpreted assuming that the
# hashes are random.
# - Due to the structure in the testsuite inputs, collisions are not
# independent. For example, if hash((a,b)) == hash((c,d)), then also
# hash((a,b,x)) == hash((c,d,x)). But the quoted probabilities assume
# independence anyway.
# - We limit the hash to 32 bits in the tests to have a good test on
# 64-bit systems too. Furthermore, this is also a sanity check that the
# lower 32 bits of a 64-bit hash are sufficiently random too.
def test_hash1(self):
# Check for hash collisions between small integers in range(50) and
# certain tuples and nested tuples of such integers.
N=50
base = list(range(N))
xp = [(i, j) for i in base for j in base]
inps = base + [(i, j) for i in base for j in xp] + \
[(i, j) for i in xp for j in base] + xp + list(zip(base))
collisions = len(inps) - len(set(map(hash, inps)))
self.assertTrue(collisions <= 15)
self.assertEqual(len(inps), 252600)
hashes = set(hash(x) % 2**32 for x in inps)
collisions = len(inps) - len(hashes)

# For a pure random 32-bit hash and N = 252,600 test items, the
# expected number of collisions equals
#
# 2**(-32) * N(N-1)/2 = 7.4
#
# We allow up to 15 collisions, which suffices to make the test
# pass with 99.5% confidence.
self.assertLessEqual(collisions, 15)

def test_hash2(self):
# Check for hash collisions between small integers (positive and
# negative), tuples and nested tuples of such integers.

# All numbers in the interval [-n, ..., n] except -1 because
# hash(-1) == hash(-2).
n = 5
A = [x for x in range(-n, n+1) if x != -1]

B = A + [(a,) for a in A]

L2 = [(a,b) for a in A for b in A]
L3 = L2 + [(a,b,c) for a in A for b in A for c in A]
L4 = L3 + [(a,b,c,d) for a in A for b in A for c in A for d in A]

# T = list of testcases. These consist of all (possibly nested
# at most 2 levels deep) tuples containing at most 4 items from
# the set A.
T = A
T += [(a,) for a in B + L4]
T += [(a,b) for a in L3 for b in B]
T += [(a,b) for a in L2 for b in L2]
T += [(a,b) for a in B for b in L3]
T += [(a,b,c) for a in B for b in B for c in L2]
T += [(a,b,c) for a in B for b in L2 for c in B]
T += [(a,b,c) for a in L2 for b in B for c in B]
T += [(a,b,c,d) for a in B for b in B for c in B for d in B]
self.assertEqual(len(T), 345130)
hashes = set(hash(x) % 2**32 for x in T)
collisions = len(T) - len(hashes)

# For a pure random 32-bit hash and N = 345,130 test items, the
# expected number of collisions equals
#
# 2**(-32) * N(N-1)/2 = 13.9
#
# We allow up to 20 collisions, which suffices to make the test
# pass with 95.5% confidence.
self.assertLessEqual(collisions, 20)

def test_hash3(self):
# Check for hash collisions between tuples containing 0.0 and 0.5.
# The hashes of 0.0 and 0.5 itself differ only in one high bit.
# So this implicitly tests propagation of high bits to low bits.
from itertools import product
T = list(product([0.0, 0.5], repeat=18))
self.assertEqual(len(T), 262144)
hashes = set(hash(x) % 2**32 for x in T)
collisions = len(T) - len(hashes)

# For a pure random 32-bit hash and N = 262,144 test items, the
# expected number of collisions equals
#
# 2**(-32) * N(N-1)/2 = 8.0
#
# We allow up to 15 collisions, which suffices to make the test
# pass with 99.1% confidence.
self.assertLessEqual(collisions, 15)

def test_repr(self):
l0 = tuple()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
The hash function for tuples is now based on xxHash
which gives better collision results on (formerly) pathological cases.
Additionally, on 64-bit systems it improves tuple hashes in general.
Patch by Jeroen Demeyer with substantial contributions by Tim Peters.
71 changes: 46 additions & 25 deletions Objects/tupleobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -333,39 +333,60 @@ tuplerepr(PyTupleObject *v)
return NULL;
}

/* The addend 82520, was selected from the range(0, 1000000) for
generating the greatest number of prime multipliers for tuples
up to length eight:

1082527, 1165049, 1082531, 1165057, 1247581, 1330103, 1082533,
1330111, 1412633, 1165069, 1247599, 1495177, 1577699

Tests have shown that it's not worth to cache the hash value, see
issue #9685.
/* Hash for tuples. This is a slightly simplified version of the xxHash
non-cryptographic hash:
- we do not use any parallelism, there is only 1 accumulator.
- we drop the final mixing since this is just a permutation of the
output space: it does not help against collisions.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not comfortable dropping the final permutation.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Final mixing" here refers to the xxHash spec's post-loop avalanche code, which is one long serialized critical path spanning 8 instructions including 2 multiplies. I don't believe that's what you (Raymond) have in mind here at all.

If you're talking about merely adding a large constant, ya, that can't hurt (except to add a cycle to the critical path), but xxHash doesn't do it, and I'm at a loss to dream up "a reason" for why it might do any good.

One of my goals here is to make it as brainless as possible to replace this code with the guts of xxHash version 2, if and when someone in real life finds a horrible case that's added to the SMHasher test suite. So I would really like to see a "good reason" to deviate at all from the spec.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the final permutation, a single addend would suffice.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we can't identify the purpose of doing this, how could we know whether it suffices? Best I can tell, you believe it helped with nested tuples in some other function, but one of radically different structure. In this function, the result of any nested tuple's hash is immediately multiplied by _PyHASH_XXPRIME_2 inside the loop, which is a far more disruptive permutation than adding a constant.

What it does as-is is powerful enough that we don't have any hint of a problem in either of the intensely "nested tuple" tests we have now. I'd be astonished if adding a constant hurt the test results - but even more astonished if it helped. I really don't want to add a line of code for which the only comment I could write is:

/* This isn't part of the xxHash spec, and removing it has
 * no effect on any test we have, nor can we identify a
 * reason for why it's here.  DO NOT CHANGE!
 */

😉

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For reasons I can't fully defend, I think the final addend is important and I'm not comfortable dropping that final step out of the tuplehash. The original xxHash has a complex permutation step, but I think an addend will suffice to amplify effects between levels of nesting. It costs one clock, is possibly beneficial, and is utterly harmless. Unless you're dead set against it, let's not battle over this one.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't find it again, but I left a comment somewhere saying that making that change (predictably) made no significant difference in any of the tests. Just seemingly randomly made insignificant differences in the number of collisions in about 20% of the tests. So keeping it in is OK by me, but you come up with a comment to explain it ;-)

Explained before why xxHash does what it does: it's striving for avalanche perfection, and there aren't enough permutations inside the loop for each bit of the last input or two to have a decent chance of affecting all the other bits in the full-width hash code. So they pile up more permutations outside the loop, specifically designed for their avalanche properties.

But they're permutations: two full-width hash codes collide after their avalanche code if and only if they collided before their avalanche code. We do care about collisions, but don't care about avalanche perfection, so their avalanche code serves no purpose we care about.

Merely adding a constant would have done nothing to help xxHash meet its avalanche goals; it's quite clear to me why their post-loop code is as long-winded as it is.

tim-one marked this conversation as resolved.
Show resolved Hide resolved
- at the end, we mangle the length with a single constant.
For the xxHash specification, see
https://github.com/Cyan4973/xxHash/blob/master/doc/xxhash_spec.md

Below are the official constants from the xxHash specification. Optimizing
compilers should emit a single "rotate" instruction for the
_PyHASH_XXROTATE() expansion. If that doesn't happen for some important
platform, the macro could be changed to expand to a platform-specific rotate
spelling instead.
*/
/* xxHash prime constants and rotate amount, sized to the platform's
   Py_uhash_t: the 64-bit (XXH64) variants when Py_uhash_t is wider than
   4 bytes, otherwise the 32-bit (XXH32) variants. */
#if SIZEOF_PY_UHASH_T > 4
#define _PyHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
#define _PyHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
#define _PyHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
#define _PyHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */
#else
#define _PyHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
#define _PyHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
#define _PyHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
#define _PyHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */
#endif

/* Tests have shown that it's not worth to cache the hash value, see
   https://bugs.python.org/issue9685 */
static Py_hash_t
tuplehash(PyTupleObject *v)
{
    Py_ssize_t i, len = Py_SIZE(v);
    PyObject **item = v->ob_item;

    /* Simplified xxHash round loop: a single accumulator, no parallel
       lanes and no final avalanche step. */
    Py_uhash_t acc = _PyHASH_XXPRIME_5;
    for (i = 0; i < len; i++) {
        /* PyObject_Hash() returns -1 on error; the unsigned cast keeps
           the comparison well-defined. */
        Py_uhash_t lane = PyObject_Hash(item[i]);
        if (lane == (Py_uhash_t)-1) {
            return -1;
        }
        acc += lane * _PyHASH_XXPRIME_2;
        acc = _PyHASH_XXROTATE(acc);
        acc *= _PyHASH_XXPRIME_1;
    }

    /* Add input length, mangled to keep the historical value of hash(()). */
    acc += len ^ (_PyHASH_XXPRIME_5 ^ 3527539UL);

    /* -1 is reserved by the hash protocol as an error indicator, so map
       it to an arbitrary other value. */
    if (acc == (Py_uhash_t)-1) {
        return 1546275796;
    }
    return acc;
}

static Py_ssize_t
Expand Down