Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate Min Hashing APIs to pylibcudf #17021

Merged
merged 15 commits into from
Oct 11, 2024
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ nvtext
edit_distance
generate_ngrams
jaccard
minhash
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
minhash
=======

.. automodule:: pylibcudf.nvtext.minhash
    :members:
101 changes: 26 additions & 75 deletions python/cudf/cudf/_lib/nvtext/minhash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,93 +2,44 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.minhash cimport (
minhash as cpp_minhash,
minhash64 as cpp_minhash64,
word_minhash as cpp_word_minhash,
word_minhash64 as cpp_word_minhash64,
)
from pylibcudf.libcudf.types cimport size_type

from cudf._lib.column cimport Column


@acquire_spill_lock()
def minhash(Column strings, Column seeds, int width):

cdef column_view c_strings = strings.view()
cdef size_type c_width = width
cdef column_view c_seeds = seeds.view()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_minhash(
c_strings,
c_seeds,
c_width
)
)

return Column.from_unique_ptr(move(c_result))
from pylibcudf import nvtext


@acquire_spill_lock()
def minhash64(Column strings, Column seeds, int width):

cdef column_view c_strings = strings.view()
cdef size_type c_width = width
cdef column_view c_seeds = seeds.view()
cdef unique_ptr[column] c_result
def minhash(Column input, Column seeds, int width=4):
result = nvtext.minhash.minhash(
input.to_pylibcudf(mode="read"),
seeds.to_pylibcudf(mode="read"),
width,
)
return Column.from_pylibcudf(result)

with nogil:
c_result = move(
cpp_minhash64(
c_strings,
c_seeds,
c_width
)
)

return Column.from_unique_ptr(move(c_result))
@acquire_spill_lock()
def minhash64(Column input, Column seeds, int width=4):
result = nvtext.minhash.minhash64(
input.to_pylibcudf(mode="read"),
seeds.to_pylibcudf(mode="read"),
width,
)
return Column.from_pylibcudf(result)


@acquire_spill_lock()
def word_minhash(Column input, Column seeds):

cdef column_view c_input = input.view()
cdef column_view c_seeds = seeds.view()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_word_minhash(
c_input,
c_seeds
)
)

return Column.from_unique_ptr(move(c_result))
result = nvtext.minhash.word_minhash(
input.to_pylibcudf(mode="read"),
seeds.to_pylibcudf(mode="read"),
)
return Column.from_pylibcudf(result)


@acquire_spill_lock()
def word_minhash64(Column input, Column seeds):

cdef column_view c_input = input.view()
cdef column_view c_seeds = seeds.view()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_word_minhash64(
c_input,
c_seeds
)
)

return Column.from_unique_ptr(move(c_result))
result = nvtext.minhash.word_minhash64(
input.to_pylibcudf(mode="read"),
seeds.to_pylibcudf(mode="read"),
)
return Column.from_pylibcudf(result)
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/libcudf/concatenate.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr
from libcpp.vector cimport vector
from pylibcudf.libcudf.column.column cimport column, column_view
from pylibcudf.libcudf.table.table cimport table, table_view
from pylibcudf.libcudf.utilities.host_span cimport host_span
from pylibcudf.libcudf.utilities.span cimport host_span

from rmm._lib.device_buffer cimport device_buffer

Expand Down
1 change: 0 additions & 1 deletion python/pylibcudf/pylibcudf/libcudf/groupby.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ from pylibcudf.libcudf.types cimport (
size_type,
sorted,
)
from pylibcudf.libcudf.utilities.host_span cimport host_span

# workaround for https://github.com/cython/cython/issues/3885
ctypedef const scalar constscalar
Expand Down
15 changes: 15 additions & 0 deletions python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
Original file line number Diff line number Diff line change
@@ -1,13 +1,22 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t, uint64_t
from libcpp.memory cimport unique_ptr
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.libcudf.utilities.span cimport device_span


cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:

cdef unique_ptr[column] minhash(
const column_view &strings,
const numeric_scalar[uint32_t] seed,
const size_type width,
) except +

cdef unique_ptr[column] minhash(
const column_view &strings,
const column_view &seeds,
Expand All @@ -20,6 +29,12 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
const size_type width,
) except +

cdef unique_ptr[column] minhash64(
const column_view &strings,
const numeric_scalar[uint64_t] seed,
const size_type width,
) except +

cdef unique_ptr[column] word_minhash(
const column_view &input,
const column_view &seeds
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

from libcpp.vector cimport vector

Expand All @@ -7,3 +7,6 @@ cdef extern from "cudf/utilities/span.hpp" namespace "cudf" nogil:
cdef cppclass host_span[T]:
host_span() except +
host_span(vector[T]) except +
cdef cppclass device_span[T]:
device_span()
Matt711 marked this conversation as resolved.
Show resolved Hide resolved
device_span(device_span other) except +
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx)
set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
3 changes: 2 additions & 1 deletion python/pylibcudf/pylibcudf/nvtext/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport edit_distance, generate_ngrams, jaccard
from . cimport edit_distance, generate_ngrams, jaccard, minhash

__all__ = [
"edit_distance",
"generate_ngrams",
"jaccard",
"minhash"
]
3 changes: 2 additions & 1 deletion python/pylibcudf/pylibcudf/nvtext/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import edit_distance, generate_ngrams, jaccard
from . import edit_distance, generate_ngrams, jaccard, minhash

__all__ = [
"edit_distance",
"generate_ngrams",
"jaccard",
"minhash",
]
18 changes: 18 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/minhash.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t, uint64_t
from pylibcudf.column cimport Column
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar

# Fused type with two members, Column and Scalar. An argument declared as
# ColumnOrScalar accepts either one; the declarations in this file use it
# for the `seeds` parameter of minhash and minhash64.
ctypedef fused ColumnOrScalar:
Column
Scalar

# minhash / minhash64: `seeds` is the fused ColumnOrScalar type, so either a
# Column or a Scalar may be passed. `width=*` is the .pxd spelling for an
# optional argument; its default value is given in the implementation file,
# not in this declaration.
cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*)

cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*)

# word_minhash / word_minhash64: `seeds` is declared as a Column only, and
# there is no `width` parameter.
cpdef Column word_minhash(Column input, Column seeds)

cpdef Column word_minhash64(Column input, Column seeds)
Loading
Loading