diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9ed851439df..9c5ba7dcced 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@
 - PR #1828 JSON Reader: add suport for bool8 columns
 - PR #1665 Add the point-in-polygon GIS function
 - PR #1863 Series and Dataframe methods for all and any
+- PR #1917 Adds an index hashing method
 
 ## Improvements
 - PR #1538 Replacing LesserRTTI with inequality_comparator
diff --git a/python/cudf/dataframe/index.py b/python/cudf/dataframe/index.py
index 7eb0b893a1b..52ffcb4a6a2 100644
--- a/python/cudf/dataframe/index.py
+++ b/python/cudf/dataframe/index.py
@@ -23,6 +23,7 @@
 from cudf.comm.serialize import register_distributed_serializer
 
 import cudf.bindings.copying as cpp_copying
+import cudf.bindings.hash as cpp_hash
 
 
 class Index(object):
@@ -530,6 +531,31 @@ def find_label_range(self, first, last):
             end += 1
         return begin, end
 
+    def hash_index(self):
+        """Hash the values of this index and return them as a new Series.
+
+        Returns
+        -------
+        Series
+            One hash value per index entry, computed into an int32
+            buffer and made non-negative with ``abs``.
+        """
+        from cudf.dataframe.series import Series
+
+        initial_hash_values = None
+        buf = Buffer(rmm.device_array(len(self), dtype=np.int32))
+        result = NumericalColumn(data=buf, dtype=buf.dtype)
+
+        _hash = cpp_hash.hash_columns([self.as_column()],
+                                      result, initial_hash_values)
+
+        sr = Series(_hash)
+
+        # hash_columns can yield negative int32 values, so abs() folds them
+        # into the non-negative range; switch to np.uint32 when libcudf
+        # supports it. NOTE(review): confirm abs() handles the int32 minimum.
+        return abs(sr)
+
 
 
 class DatetimeIndex(GenericIndex):
     # TODO this constructor should take a timezone or something to be
diff --git a/python/cudf/tests/test_index.py b/python/cudf/tests/test_index.py
index 822549e9e46..f4425335cf2 100644
--- a/python/cudf/tests/test_index.py
+++ b/python/cudf/tests/test_index.py
@@ -123,6 +123,24 @@ def test_categorical_index():
     assert_eq(pdf.index, gdf2.index)
 
 
+@pytest.mark.parametrize('index_name', [
+    'num_idx',
+    'cat_idx',
+])
+def test_hashing_index(index_name):
+    pdf = pd.DataFrame()
+    pdf['num_idx'] = [1, 2, 3, 1]
+    pdf['cat_idx'] = pd.Categorical(['a', 'b', 'c', 'a'])
+    gdf = DataFrame.from_pandas(pdf)
+    sr = gdf.set_index(index_name).index.hash_index()
+
+    # hashes must be non-negative (zero included) for modulo calculation
+    assert_eq(sr, sr[sr >= 0])
+    assert len(sr) == len(pdf[index_name])
+    assert sr.iloc[0] == sr.iloc[-1]
+    assert len(sr.unique()) == len(sr) - 1
+
+
 def test_pandas_as_index():
     # Define Pandas Indexes
     pdf_int_index = pd.Int64Index([1, 2, 3, 4, 5])