Skip to content

Commit

Permalink
Merge pull request #15 from dantegd/enh-cudf-references
Browse files Browse the repository at this point in the history
[REVIEW] FIX change pygdf references to cudf
  • Loading branch information
dantegd authored Oct 30, 2018
2 parents bd7824b + 3967b4c commit 65aeaef
Show file tree
Hide file tree
Showing 16 changed files with 117 additions and 117 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# From: https://github.com/rapidsai/pygdf/blob/master/Dockerfile
FROM pygdf
# From: https://github.com/rapidsai/cudf/blob/master/Dockerfile
FROM cudf

ADD ml-prims /cuML/ml-prims
ADD cuML /cuML/cuML
Expand Down
14 changes: 7 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Machine learning is a fundamental capability of RAPIDS. cuML is a suite of libra

The cuML repository contains:

1. ***python***: Python based GPU Dataframe (GDF) machine learning package that takes [cuDF](https://github.com/rapidsai/cudf-alpha) dataframes as input. cuML connects the data to C++/CUDA based cuML and ml-prims libraries without ever leaving GPU memory.
1. ***python***: Python based GPU Dataframe (GDF) machine learning package that takes [cuDF](https://github.com/rapidsai/cudf) dataframes as input. cuML connects the data to C++/CUDA based cuML and ml-prims libraries without ever leaving GPU memory.

2. ***cuML***: C++/CUDA machine learning algorithms. This library currently includes the following five algorithms:
a. Single GPU Truncated Singular Value Decomposition (tSVD),
Expand Down Expand Up @@ -47,12 +47,12 @@ To use cuML, it must be cloned and built in an environment that already has the

List of dependencies:

1. zlib
2. cmake (>= 3.8, version 3.11.4 is recommended and there are issues with version 3.12)
3. CUDA (>= 9.0)
4. Cython (>= 0.28)
5. gcc (>=5.4.0)
6. [cuDF](https://github.com/rapidsai/cudf-alpha)
1. [cuDF](https://github.com/rapidsai/cudf) (>=0.2.0)
2. zlib
3. cmake (>= 3.8, version 3.11.4 is recommended and there are issues with version 3.12)
4. CUDA (>= 9.0)
5. Cython (>= 0.28)
6. gcc (>=5.4.0)
7. faiss-gpu (>=1.4.0) - To install with conda: ```conda install -c pytorch faiss-gpu```

### Setup steps
Expand Down
2 changes: 1 addition & 1 deletion conda_environments/builddocs_py35.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ dependencies:
- python=3.5.*
- pytest
- cudatoolkit=9.2
- pygdf=0.1.0a3.*
- cudf=0.2.0
- numba>=0.40.0dev
- pandas=0.20.*
- pyarrow=0.10.*
Expand Down
2 changes: 1 addition & 1 deletion python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ List of dependencies:
4. Cython (>= 0.28)
5. gcc (>=5.4.0)
6. nvcc
7. [cuDF](https://github.com/gpuopenanalytics/pygdf)
7. [cuDF](https://github.com/rapidsai/cudf)

### Setup steps

Expand Down
38 changes: 19 additions & 19 deletions python/cuML/dbscan/dbscan_wrapper.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
cimport c_dbscan
import numpy as np
from numba import cuda
import pygdf
import cudf
from libcpp cimport bool
import ctypes
from libc.stdint cimport uintptr_t
Expand All @@ -30,11 +30,11 @@ class DBSCAN:
.. code-block:: python
import pygdf
import cudf
from cuML import DBSCAN
import numpy as np
gdf_float = pygdf.DataFrame()
gdf_float = cudf.DataFrame()
gdf_float['0']=np.asarray([1.0,2.0,5.0],dtype=np.float32)
gdf_float['1']=np.asarray([4.0,2.0,1.0],dtype=np.float32)
gdf_float['2']=np.asarray([4.0,2.0,1.0],dtype=np.float32)
Expand All @@ -59,7 +59,7 @@ class DBSCAN:
self.eps = eps
self.min_samples = min_samples
self.labels_ = None

def _get_ctype_ptr(self, obj):
# The manner to access the pointers in the gdf's might change, so
# encapsulating access in the following 3 methods. They might also be
Expand All @@ -78,7 +78,7 @@ class DBSCAN:
Parameters
----------
X : PyGDF DataFrame
X : cuDF DataFrame
Dense matrix (floats or doubles) of shape (n_samples, n_features)
"""

Expand All @@ -90,39 +90,39 @@ class DBSCAN:
self.gdf_datatype = np.dtype(x[0])
self.n_rows = len(X)
self.n_cols = len(X._cols)

cdef uintptr_t input_ptr = self._get_gdf_as_matrix_ptr(X)
self.labels_ = pygdf.Series(np.zeros(self.n_rows, dtype=np.int32))
self.labels_ = cudf.Series(np.zeros(self.n_rows, dtype=np.int32))
cdef uintptr_t labels_ptr = self._get_column_ptr(self.labels_)

if self.gdf_datatype.type == np.float32:
c_dbscan.dbscanFit(<float*>input_ptr,
<int> self.n_rows,
<int> self.n_cols,
<float> self.eps,
c_dbscan.dbscanFit(<float*>input_ptr,
<int> self.n_rows,
<int> self.n_cols,
<float> self.eps,
<int> self.min_samples,
<int*> labels_ptr)
else:
c_dbscan.dbscanFit(<double*>input_ptr,
<int> self.n_rows,
<int> self.n_cols,
<double> self.eps,
c_dbscan.dbscanFit(<double*>input_ptr,
<int> self.n_rows,
<int> self.n_cols,
<double> self.eps,
<int> self.min_samples,
<int*> labels_ptr)


def fit_predict(self, X):
"""
Performs clustering on input_gdf and returns cluster labels.
Parameters
----------
X : PyGDF DataFrame
Dense matrix (floats or doubles) of shape (n_samples, n_features),
X : cuDF DataFrame
Dense matrix (floats or doubles) of shape (n_samples, n_features),
Returns
-------
y : PyGDF Series, shape (n_samples)
y : cuDF Series, shape (n_samples)
cluster labels
"""
self.fit(X)
Expand Down
20 changes: 10 additions & 10 deletions python/cuML/knn/knn_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import faiss
import numpy as np
import pandas as pd
import pygdf
import cudf

class KNNparams:
def __init__(self,n_gpus):
Expand All @@ -26,15 +26,15 @@ class KNN:
"""
Create a DataFrame, fill it with data, and compute KNN:
.. code-block:: python
import pygdf
import cudf
from cuML import KNN
import numpy as np
np_float = np.array([
[1.,2.,3.], # 1st point
[1.,2.,3.], # 1st point
[1.,2.,4.], # 2nd point
[2.,2.,4.] # 3rd point
]).astype('float32')
gdf_float = pygdf.DataFrame()
gdf_float = cudf.DataFrame()
gdf_float['dim_0'] = np.ascontiguousarray(np_float[:,0])
gdf_float['dim_1'] = np.ascontiguousarray(np_float[:,1])
gdf_float['dim_2'] = np.ascontiguousarray(np_float[:,2])
Expand Down Expand Up @@ -88,18 +88,18 @@ def fit(self,X):
def query(self,X,k):
X = self.to_nparray(X)
D,I = self.gpu_index.search(X, k)
D = self.to_pygdf(D,col='distance')
I = self.to_pygdf(I,col='index')
D = self.to_cudf(D,col='distance')
I = self.to_cudf(I,col='index')
return D,I

def to_nparray(self,x):
if isinstance(x,pygdf.DataFrame):
if isinstance(x,cudf.DataFrame):
x = x.to_pandas()
return np.ascontiguousarray(x)

def to_pygdf(self,df,col=''):
# convert pandas dataframe to pygdf dataframe
def to_cudf(self,df,col=''):
# convert pandas dataframe to cudf dataframe
if isinstance(df,np.ndarray):
df = pd.DataFrame({'%s_neighbor_%d'%(col,i):df[:,i] for i in range(df.shape[1])})
pdf = pygdf.DataFrame.from_pandas(df)
pdf = cudf.DataFrame.from_pandas(df)
return pdf
4 changes: 2 additions & 2 deletions python/cuML/pca/pca_random_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from cuML import PCA
from sklearn.decomposition import PCA as PCA_SKL
import pygdf
import cudf
import numpy as np
import pandas as pd
import time
Expand All @@ -31,7 +31,7 @@
df = pd.DataFrame(data)#, index = index)
df = df.astype('float32')

gdf = pygdf.DataFrame.from_pandas(df)
gdf = cudf.DataFrame.from_pandas(df)

print("\ninput:")

Expand Down
52 changes: 26 additions & 26 deletions python/cuML/pca/pca_wrapper.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ cimport c_pca
import numpy as np
cimport numpy as np
from numba import cuda
import pygdf
import cudf
from libcpp cimport bool
import ctypes
from libc.stdint cimport uintptr_t
Expand All @@ -44,11 +44,11 @@ class PCA:
.. code-block:: python
import pygdf
import cudf
from cuML import PCA
import numpy as np
gdf_float = pygdf.DataFrame()
gdf_float = cudf.DataFrame()
gdf_float['0']=np.asarray([1.0,2.0,5.0],dtype=np.float32)
gdf_float['1']=np.asarray([4.0,2.0,1.0],dtype=np.float32)
gdf_float['2']=np.asarray([4.0,2.0,1.0],dtype=np.float32)
Expand Down Expand Up @@ -80,28 +80,28 @@ class PCA:
1 -0.72165036 -0.48949987 -0.4895003
explained variance:
0 8.510402
1 0.48959687
explained variance ratio:
0 0.9456003
1 0.054399658
singular values:
0 4.1256275
1 0.9895422
mean:
0 2.6666667
1 2.3333333
2 2.3333333
noise variance:
0 0.0
transformed matrix:
Expand Down Expand Up @@ -164,16 +164,16 @@ class PCA:
dtype=self.gdf_datatype))
self.components_ = cuda.to_device(np.zeros(n_components*n_cols,
dtype=self.gdf_datatype))
self.explained_variance_ = pygdf.Series(
self.explained_variance_ = cudf.Series(
np.zeros(n_components,
dtype=self.gdf_datatype))
self.explained_variance_ratio_ = pygdf.Series(
self.explained_variance_ratio_ = cudf.Series(
np.zeros(n_components,
dtype=self.gdf_datatype))
self.mean_ = pygdf.Series(np.zeros(n_cols, dtype=self.gdf_datatype))
self.singular_values_ = pygdf.Series(np.zeros(n_components,
self.mean_ = cudf.Series(np.zeros(n_cols, dtype=self.gdf_datatype))
self.singular_values_ = cudf.Series(np.zeros(n_components,
dtype=self.gdf_datatype))
self.noise_variance_ = pygdf.Series(np.zeros(1,
self.noise_variance_ = cudf.Series(np.zeros(1,
dtype=self.gdf_datatype))

def _get_ctype_ptr(self, obj):
Expand All @@ -194,7 +194,7 @@ class PCA:
Parameters
----------
X : PyGDF DataFrame
X : cuDF DataFrame
Dense matrix (floats or doubles) of shape (n_samples, n_features)
Returns
Expand Down Expand Up @@ -275,7 +275,7 @@ class PCA:
<double*> noise_vars_ptr,
params)

components_gdf = pygdf.DataFrame()
components_gdf = cudf.DataFrame()
for i in range(0, params.n_cols):
components_gdf[str(i)] = self.components_[i*params.n_components:(i+1)*params.n_components]

Expand All @@ -293,21 +293,21 @@ class PCA:
Parameters
----------
X : PyGDF DataFrame, shape (n_samples, n_features)
X : cuDF DataFrame, shape (n_samples, n_features)
training data (floats or doubles), where n_samples is the number of samples, and n_features is the number of features.
Returns
-------
X_new : PyGDF DataFrame, shape (n_samples, n_components)
X_new : cuDF DataFrame, shape (n_samples, n_components)
"""
self.fit(X, _transform=True)
X_new = pygdf.DataFrame()
X_new = cudf.DataFrame()
num_rows = self.params.n_rows

for i in range(0, self.params.n_components):
X_new[str(i)] = self.trans_input_[i*num_rows:(i+1)*num_rows]

return X_new
return X_new

def inverse_transform(self, X):
"""
Expand All @@ -317,12 +317,12 @@ class PCA:
Parameters
----------
X : PyGDF DataFrame, shape (n_samples, n_components)
X : cuDF DataFrame, shape (n_samples, n_components)
New data (floats or doubles), where n_samples is the number of samples and n_components is the number of components.
Returns
-------
X_original : PyGDF DataFrame, shape (n_samples, n_features)
X_original : cuDF DataFrame, shape (n_samples, n_features)
"""
cpdef c_pca.paramsPCA params
Expand Down Expand Up @@ -362,12 +362,12 @@ class PCA:
<double*> input_ptr,
params)

X_original = pygdf.DataFrame()
X_original = cudf.DataFrame()
for i in range(0, params.n_cols):
X_original[str(i)] = input_data[i*params.n_rows:(i+1)*params.n_rows]


return X_original
return X_original

def transform(self, X):
"""
Expand All @@ -377,12 +377,12 @@ class PCA:
Parameters
----------
X : PyGDF DataFrame, shape (n_samples, n_features)
X : cuDF DataFrame, shape (n_samples, n_features)
New data (floats or doubles), where n_samples is the number of samples and n_features is the number of features.
Returns
-------
X_new : PyGDF DataFrame, shape (n_samples, n_components)
X_new : cuDF DataFrame, shape (n_samples, n_components)
"""
cpdef c_pca.paramsPCA params
Expand Down Expand Up @@ -422,7 +422,7 @@ class PCA:
<double*> mean_ptr,
params)

X_new = pygdf.DataFrame()
X_new = cudf.DataFrame()
for i in range(0, params.n_components):
X_new[str(i)] = trans_input_data[i*params.n_rows:(i+1)*params.n_rows]

Expand Down
Loading

0 comments on commit 65aeaef

Please sign in to comment.