
Commit 53880cf

CI/TST: Make test_vector_resize more deterministic (pandas-dev#46602)
(cherry picked from commit bea02f3)
1 parent a7a7481 commit 53880cf

1 file changed: +166 −39 lines changed

pandas/tests/libs/test_hashtable.py

Lines changed: 166 additions & 39 deletions
@@ -1,4 +1,5 @@
 from contextlib import contextmanager
+import struct
 import tracemalloc
 
 import numpy as np
@@ -77,16 +78,16 @@ def test_get_set_contains_len(self, table_type, dtype):
         with pytest.raises(KeyError, match=str(index + 2)):
             table.get_item(index + 2)
 
-    def test_map(self, table_type, dtype, writable):
-        # PyObjectHashTable has no map-method
-        if table_type != ht.PyObjectHashTable:
+    def test_map_keys_to_values(self, table_type, dtype, writable):
+        # only Int64HashTable has this method
+        if table_type == ht.Int64HashTable:
             N = 77
             table = table_type()
             keys = np.arange(N).astype(dtype)
             vals = np.arange(N).astype(np.int64) + N
             keys.flags.writeable = writable
             vals.flags.writeable = writable
-            table.map(keys, vals)
+            table.map_keys_to_values(keys, vals)
             for i in range(N):
                 assert table.get_item(keys[i]) == i + N
 
@@ -165,19 +166,139 @@ def test_get_state(self, table_type, dtype):
         assert "n_buckets" in state
         assert "upper_bound" in state
 
-    def test_no_reallocation(self, table_type, dtype):
-        for N in range(1, 110):
-            keys = np.arange(N).astype(dtype)
-            preallocated_table = table_type(N)
-            n_buckets_start = preallocated_table.get_state()["n_buckets"]
-            preallocated_table.map_locations(keys)
-            n_buckets_end = preallocated_table.get_state()["n_buckets"]
-            # original number of buckets was enough:
-            assert n_buckets_start == n_buckets_end
-            # check with clean table (not too much preallocated)
-            clean_table = table_type()
-            clean_table.map_locations(keys)
-            assert n_buckets_start == clean_table.get_state()["n_buckets"]
+    @pytest.mark.parametrize("N", range(1, 110))
+    def test_no_reallocation(self, table_type, dtype, N):
+        keys = np.arange(N).astype(dtype)
+        preallocated_table = table_type(N)
+        n_buckets_start = preallocated_table.get_state()["n_buckets"]
+        preallocated_table.map_locations(keys)
+        n_buckets_end = preallocated_table.get_state()["n_buckets"]
+        # original number of buckets was enough:
+        assert n_buckets_start == n_buckets_end
+        # check with clean table (not too much preallocated)
+        clean_table = table_type()
+        clean_table.map_locations(keys)
+        assert n_buckets_start == clean_table.get_state()["n_buckets"]
+
+
+class TestHashTableUnsorted:
+    # TODO: moved from test_algos; may be redundancies with other tests
+    def test_string_hashtable_set_item_signature(self):
+        # GH#30419 fix typing in StringHashTable.set_item to prevent segfault
+        tbl = ht.StringHashTable()
+
+        tbl.set_item("key", 1)
+        assert tbl.get_item("key") == 1
+
+        with pytest.raises(TypeError, match="'key' has incorrect type"):
+            # key arg typed as string, not object
+            tbl.set_item(4, 6)
+        with pytest.raises(TypeError, match="'val' has incorrect type"):
+            tbl.get_item(4)
+
+    def test_lookup_nan(self, writable):
+        # GH#21688 ensure we can deal with readonly memory views
+        xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
+        xs.setflags(write=writable)
+        m = ht.Float64HashTable()
+        m.map_locations(xs)
+        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))
+
+    def test_add_signed_zeros(self):
+        # GH#21866 inconsistent hash-function for float64
+        # default hash-function would lead to different hash-buckets
+        # for 0.0 and -0.0 if there are more than 2^30 hash-buckets
+        # but this would mean 16GB
+        N = 4  # 12 * 10**8 would trigger the error, if you have enough memory
+        m = ht.Float64HashTable(N)
+        m.set_item(0.0, 0)
+        m.set_item(-0.0, 0)
+        assert len(m) == 1  # 0.0 and -0.0 are equivalent
+
+    def test_add_different_nans(self):
+        # GH#21866 inconsistent hash-function for float64
+        # create different nans from bit-patterns:
+        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
+        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
+        assert NAN1 != NAN1
+        assert NAN2 != NAN2
+        # default hash function would lead to different hash-buckets
+        # for NAN1 and NAN2 even if there are only 4 buckets:
+        m = ht.Float64HashTable()
+        m.set_item(NAN1, 0)
+        m.set_item(NAN2, 0)
+        assert len(m) == 1  # NAN1 and NAN2 are equivalent
+
+    def test_lookup_overflow(self, writable):
+        xs = np.array([1, 2, 2**63], dtype=np.uint64)
+        # GH 21688 ensure we can deal with readonly memory views
+        xs.setflags(write=writable)
+        m = ht.UInt64HashTable()
+        m.map_locations(xs)
+        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))
+
+    @pytest.mark.parametrize("nvals", [0, 10])  # resizing to 0 is special case
+    @pytest.mark.parametrize(
+        "htable, uniques, dtype, safely_resizes",
+        [
+            (ht.PyObjectHashTable, ht.ObjectVector, "object", False),
+            (ht.StringHashTable, ht.ObjectVector, "object", True),
+            (ht.Float64HashTable, ht.Float64Vector, "float64", False),
+            (ht.Int64HashTable, ht.Int64Vector, "int64", False),
+            (ht.Int32HashTable, ht.Int32Vector, "int32", False),
+            (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False),
+        ],
+    )
+    def test_vector_resize(
+        self, writable, htable, uniques, dtype, safely_resizes, nvals
+    ):
+        # Test for memory errors after internal vector
+        # reallocations (GH 7157)
+        # Changed from using np.random.rand to range
+        # which could cause flaky CI failures when safely_resizes=False
+        vals = np.array(range(1000), dtype=dtype)
+
+        # GH 21688 ensures we can deal with read-only memory views
+        vals.setflags(write=writable)
+
+        # initialise instances; cannot initialise in parametrization,
+        # as otherwise external views would be held on the array (which is
+        # one of the things this test is checking)
+        htable = htable()
+        uniques = uniques()
+
+        # get_labels may append to uniques
+        htable.get_labels(vals[:nvals], uniques, 0, -1)
+        # to_array() sets an external_view_exists flag on uniques.
+        tmp = uniques.to_array()
+        oldshape = tmp.shape
+
+        # subsequent get_labels() calls can no longer append to it
+        # (except for StringHashTables + ObjectVector)
+        if safely_resizes:
+            htable.get_labels(vals, uniques, 0, -1)
+        else:
+            with pytest.raises(ValueError, match="external reference.*"):
+                htable.get_labels(vals, uniques, 0, -1)
+
+        uniques.to_array()  # should not raise here
+        assert tmp.shape == oldshape
+
+    @pytest.mark.parametrize(
+        "hashtable",
+        [
+            ht.PyObjectHashTable,
+            ht.StringHashTable,
+            ht.Float64HashTable,
+            ht.Int64HashTable,
+            ht.Int32HashTable,
+            ht.UInt64HashTable,
+        ],
+    )
+    def test_hashtable_large_sizehint(self, hashtable):
+        # GH#22729 smoketest for not raising when passing a large size_hint
+        size_hint = np.iinfo(np.uint32).max + 1
+        hashtable(size_hint=size_hint)
 
 
 class TestPyObjectHashTableWithNans:
@@ -282,19 +403,19 @@ def test_tracemalloc_for_empty_StringHashTable():
     assert get_allocated_khash_memory() == 0
 
 
-def test_no_reallocation_StringHashTable():
-    for N in range(1, 110):
-        keys = np.arange(N).astype(np.compat.unicode).astype(np.object_)
-        preallocated_table = ht.StringHashTable(N)
-        n_buckets_start = preallocated_table.get_state()["n_buckets"]
-        preallocated_table.map_locations(keys)
-        n_buckets_end = preallocated_table.get_state()["n_buckets"]
-        # original number of buckets was enough:
-        assert n_buckets_start == n_buckets_end
-        # check with clean table (not too much preallocated)
-        clean_table = ht.StringHashTable()
-        clean_table.map_locations(keys)
-        assert n_buckets_start == clean_table.get_state()["n_buckets"]
+@pytest.mark.parametrize("N", range(1, 110))
+def test_no_reallocation_StringHashTable(N):
+    keys = np.arange(N).astype(np.compat.unicode).astype(np.object_)
+    preallocated_table = ht.StringHashTable(N)
+    n_buckets_start = preallocated_table.get_state()["n_buckets"]
+    preallocated_table.map_locations(keys)
+    n_buckets_end = preallocated_table.get_state()["n_buckets"]
+    # original number of buckets was enough:
+    assert n_buckets_start == n_buckets_end
+    # check with clean table (not too much preallocated)
+    clean_table = ht.StringHashTable()
+    clean_table.map_locations(keys)
+    assert n_buckets_start == clean_table.get_state()["n_buckets"]
 
 
 @pytest.mark.parametrize(
@@ -322,15 +443,6 @@ def test_get_set_contains_len(self, table_type, dtype):
         assert index in table
         assert table.get_item(index) == 41
 
-    def test_map(self, table_type, dtype):
-        N = 332
-        table = table_type()
-        keys = np.full(N, np.nan, dtype=dtype)
-        vals = (np.arange(N) + N).astype(np.int64)
-        table.map(keys, vals)
-        assert len(table) == 1
-        assert table.get_item(np.nan) == 2 * N - 1
-
     def test_map_locations(self, table_type, dtype):
         N = 10
         table = table_type()
@@ -468,6 +580,21 @@ def test_unique_label_indices_intp(writable):
     tm.assert_numpy_array_equal(result, expected)
 
 
+def test_unique_label_indices():
+
+    a = np.random.randint(1, 1 << 10, 1 << 15).astype(np.intp)
+
+    left = ht.unique_label_indices(a)
+    right = np.unique(a, return_index=True)[1]
+
+    tm.assert_numpy_array_equal(left, right, check_dtype=False)
+
+    a[np.random.choice(len(a), 10)] = -1
+    left = ht.unique_label_indices(a)
+    right = np.unique(a, return_index=True)[1][1:]
+    tm.assert_numpy_array_equal(left, right, check_dtype=False)
+
+
 @pytest.mark.parametrize(
     "dtype",
     [