Merge pull request #15 from dantegd/enh-cudf-references

[REVIEW] FIX change pygdf references to cudf
rapidsai · Oct 30, 2018 · 65aeaef · 65aeaef
2 parents bd7824b + 3967b4c
commit 65aeaef
Show file tree

Hide file tree

Showing 16 changed files with 117 additions and 117 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,5 +1,5 @@
-# From: https://github.com/rapidsai/pygdf/blob/master/Dockerfile
-FROM pygdf
+# From: https://github.com/rapidsai/cudf/blob/master/Dockerfile
+FROM cudf
 
 ADD ml-prims /cuML/ml-prims
 ADD cuML /cuML/cuML

diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@ Machine learning is a fundamental capability of RAPIDS. cuML is a suite of libra
 
 The cuML repository contains:
 
-1. ***python***: Python based GPU Dataframe (GDF) machine learning package that takes [cuDF](https://github.com/rapidsai/cudf-alpha) dataframes as input. cuML connects the data to C++/CUDA based cuML and ml-prims libraries without ever leaving GPU memory.
+1. ***python***: Python based GPU Dataframe (GDF) machine learning package that takes [cuDF](https://github.com/rapidsai/cudf) dataframes as input. cuML connects the data to C++/CUDA based cuML and ml-prims libraries without ever leaving GPU memory.
 
 2. ***cuML***: C++/CUDA machine learning algorithms. This library currently includes the following five algorithms;
    a. Single GPU Truncated Singular Value Decomposition (tSVD),
@@ -47,12 +47,12 @@ To use cuML, it must be cloned and built in an environment that already has the
 
 List of dependencies:
 
-1. zlib
-2. cmake (>= 3.8, version 3.11.4 is recommended and there are issues with version 3.12)
-3. CUDA (>= 9.0)
-4. Cython (>= 0.28)
-5. gcc (>=5.4.0)
-6. [cuDF](https://github.com/rapidsai/cudf-alpha)
+1. [cuDF](https://github.com/rapidsai/cudf) (>=0.2.0)
+2. zlib
+3. cmake (>= 3.8, version 3.11.4 is recommended and there are issues with version 3.12)
+4. CUDA (>= 9.0)
+5. Cython (>= 0.28)
+6. gcc (>=5.4.0)
 7. faiss-gpu (>=1.4.0) - To install with conda: ```conda install -c pytorch faiss-gpu```
 
 ### Setup steps

diff --git a/conda_environments/builddocs_py35.yml b/conda_environments/builddocs_py35.yml
@@ -8,7 +8,7 @@ dependencies:
 - python=3.5.*
 - pytest
 - cudatoolkit=9.2
-- pygdf=0.1.0a3.*
+- cudf=0.2.0
 - numba>=0.40.0dev
 - pandas=0.20.*
 - pyarrow=0.10.*

diff --git a/python/README.md b/python/README.md
@@ -10,7 +10,7 @@ List of dependencies:
 4. Cython (>= 0.28)
 5. gcc (>=5.4.0)
 6. nvcc
-7. [cuDF](https://github.com/gpuopenanalytics/pygdf)
+7. [cuDF](https://github.com/rapidsai/cudf)
 
 ### Setup steps
 

diff --git a/python/cuML/dbscan/dbscan_wrapper.pyx b/python/cuML/dbscan/dbscan_wrapper.pyx
@@ -17,7 +17,7 @@
 cimport c_dbscan
 import numpy as np
 from numba import cuda
-import pygdf
+import cudf
 from libcpp cimport bool
 import ctypes
 from libc.stdint cimport uintptr_t
@@ -30,11 +30,11 @@ class DBSCAN:
 
     .. code-block:: python
 
-            import pygdf
+            import cudf
             from cuML import DBSCAN
             import numpy as np
 
-            gdf_float = pygdf.DataFrame()
+            gdf_float = cudf.DataFrame()
             gdf_float['0']=np.asarray([1.0,2.0,5.0],dtype=np.float32)
             gdf_float['1']=np.asarray([4.0,2.0,1.0],dtype=np.float32)
             gdf_float['2']=np.asarray([4.0,2.0,1.0],dtype=np.float32)
@@ -59,7 +59,7 @@ class DBSCAN:
         self.eps = eps
         self.min_samples = min_samples
         self.labels_ = None
-        
+
     def _get_ctype_ptr(self, obj):
         # The manner to access the pointers in the gdf's might change, so
         # encapsulating access in the following 3 methods. They might also be
@@ -78,7 +78,7 @@ class DBSCAN:
 
             Parameters
             ----------
-            X : PyGDF DataFrame
+            X : cuDF DataFrame
                Dense matrix (floats or doubles) of shape (n_samples, n_features)
         """
 
@@ -90,39 +90,39 @@ class DBSCAN:
         self.gdf_datatype = np.dtype(x[0])
         self.n_rows = len(X)
         self.n_cols = len(X._cols)
-        
+
         cdef uintptr_t input_ptr = self._get_gdf_as_matrix_ptr(X)
-        self.labels_ = pygdf.Series(np.zeros(self.n_rows, dtype=np.int32))
+        self.labels_ = cudf.Series(np.zeros(self.n_rows, dtype=np.int32))
         cdef uintptr_t labels_ptr = self._get_column_ptr(self.labels_)
 
         if self.gdf_datatype.type == np.float32:
-            c_dbscan.dbscanFit(<float*>input_ptr, 
-                               <int> self.n_rows, 
-                               <int> self.n_cols, 
-                               <float> self.eps, 
+            c_dbscan.dbscanFit(<float*>input_ptr,
+                               <int> self.n_rows,
+                               <int> self.n_cols,
+                               <float> self.eps,
                                <int> self.min_samples,
 		               <int*> labels_ptr)
         else:
-            c_dbscan.dbscanFit(<double*>input_ptr, 
-                               <int> self.n_rows, 
-                               <int> self.n_cols, 
-                               <double> self.eps, 
+            c_dbscan.dbscanFit(<double*>input_ptr,
+                               <int> self.n_rows,
+                               <int> self.n_cols,
+                               <double> self.eps,
                                <int> self.min_samples,
 		               <int*> labels_ptr)
 
-    
+
     def fit_predict(self, X):
         """
             Performs clustering on input_gdf and returns cluster labels.
 
             Parameters
             ----------
-            X : PyGDF DataFrame
-              Dense matrix (floats or doubles) of shape (n_samples, n_features), 
+            X : cuDF DataFrame
+              Dense matrix (floats or doubles) of shape (n_samples, n_features)
 
             Returns
             -------
-            y : PyGDF Series, shape (n_samples)
+            y : cuDF Series, shape (n_samples)
               cluster labels
         """
         self.fit(X)

diff --git a/python/cuML/knn/knn_wrapper.py b/python/cuML/knn/knn_wrapper.py
@@ -16,7 +16,7 @@
 import faiss
 import numpy as np
 import pandas as pd
-import pygdf
+import cudf
 
 class KNNparams:
     def __init__(self,n_gpus):
@@ -26,15 +26,15 @@ class KNN:
     """
     Create a DataFrame, fill it with data, and compute KNN:
     .. code-block:: python
-        import pygdf
+        import cudf
         from cuML import KNN
         import numpy as np
         np_float = np.array([
-                [1.,2.,3.], # 1st point 
+                [1.,2.,3.], # 1st point
                 [1.,2.,4.], # 2nd point
                 [2.,2.,4.]  # 3rd point
             ]).astype('float32')
-        gdf_float = pygdf.DataFrame()
+        gdf_float = cudf.DataFrame()
         gdf_float['dim_0'] = np.ascontiguousarray(np_float[:,0])
         gdf_float['dim_1'] = np.ascontiguousarray(np_float[:,1])
         gdf_float['dim_2'] = np.ascontiguousarray(np_float[:,2])
@@ -88,18 +88,18 @@ def fit(self,X):
     def query(self,X,k):
         X = self.to_nparray(X)
         D,I = self.gpu_index.search(X, k)
-        D = self.to_pygdf(D,col='distance')
-        I = self.to_pygdf(I,col='index')
+        D = self.to_cudf(D,col='distance')
+        I = self.to_cudf(I,col='index')
         return D,I
 
     def to_nparray(self,x):
-        if isinstance(x,pygdf.DataFrame):
+        if isinstance(x,cudf.DataFrame):
             x = x.to_pandas()
         return np.ascontiguousarray(x)
 
-    def to_pygdf(self,df,col=''):
-        # convert pandas dataframe to pygdf dataframe
+    def to_cudf(self,df,col=''):
+        # convert pandas dataframe to cudf dataframe
         if isinstance(df,np.ndarray):
             df = pd.DataFrame({'%s_neighbor_%d'%(col,i):df[:,i] for i in range(df.shape[1])})
-        pdf = pygdf.DataFrame.from_pandas(df)
+        pdf = cudf.DataFrame.from_pandas(df)
         return pdf
diff --git a/python/cuML/pca/pca_random_test.py b/python/cuML/pca/pca_random_test.py
@@ -15,7 +15,7 @@
 
 from cuML import PCA
 from sklearn.decomposition import PCA as PCA_SKL
-import pygdf
+import cudf
 import numpy as np
 import pandas as pd
 import time
@@ -31,7 +31,7 @@
 df = pd.DataFrame(data)#, index = index)
 df = df.astype('float32')
 
-gdf = pygdf.DataFrame.from_pandas(df)
+gdf = cudf.DataFrame.from_pandas(df)
 
 print("\ninput:")
 

diff --git a/python/cuML/pca/pca_wrapper.pyx b/python/cuML/pca/pca_wrapper.pyx
@@ -17,7 +17,7 @@ cimport c_pca
 import numpy as np
 cimport numpy as np
 from numba import cuda
-import pygdf
+import cudf
 from libcpp cimport bool
 import ctypes
 from libc.stdint cimport uintptr_t
@@ -44,11 +44,11 @@ class PCA:
 
     .. code-block:: python
 
-        import pygdf
+        import cudf
         from cuML import PCA
         import numpy as np
 
-        gdf_float = pygdf.DataFrame()
+        gdf_float = cudf.DataFrame()
         gdf_float['0']=np.asarray([1.0,2.0,5.0],dtype=np.float32)
         gdf_float['1']=np.asarray([4.0,2.0,1.0],dtype=np.float32)
         gdf_float['2']=np.asarray([4.0,2.0,1.0],dtype=np.float32)
@@ -80,28 +80,28 @@ class PCA:
                       1 -0.72165036 -0.48949987  -0.4895003
 
           explained variance:
-                      
+
                       0   8.510402
                       1 0.48959687
 
           explained variance ratio:
-                       
+
                        0   0.9456003
                        1 0.054399658
 
           singular values:
-                     
+
                      0 4.1256275
                      1 0.9895422
 
           mean:
-          
+
                     0 2.6666667
                     1 2.3333333
                     2 2.3333333
 
           noise variance:
-                
+
                 0  0.0
 
           transformed matrix:
@@ -164,16 +164,16 @@ class PCA:
                                                     dtype=self.gdf_datatype))
         self.components_ = cuda.to_device(np.zeros(n_components*n_cols,
                                                    dtype=self.gdf_datatype))
-        self.explained_variance_ = pygdf.Series(
+        self.explained_variance_ = cudf.Series(
                                       np.zeros(n_components,
                                                dtype=self.gdf_datatype))
-        self.explained_variance_ratio_ = pygdf.Series(
+        self.explained_variance_ratio_ = cudf.Series(
                                             np.zeros(n_components,
                                                      dtype=self.gdf_datatype))
-        self.mean_ = pygdf.Series(np.zeros(n_cols, dtype=self.gdf_datatype))
-        self.singular_values_ = pygdf.Series(np.zeros(n_components,
+        self.mean_ = cudf.Series(np.zeros(n_cols, dtype=self.gdf_datatype))
+        self.singular_values_ = cudf.Series(np.zeros(n_components,
                                                       dtype=self.gdf_datatype))
-        self.noise_variance_ = pygdf.Series(np.zeros(1,
+        self.noise_variance_ = cudf.Series(np.zeros(1,
                                                      dtype=self.gdf_datatype))
 
     def _get_ctype_ptr(self, obj):
@@ -194,7 +194,7 @@ class PCA:
 
         Parameters
         ----------
-        X : PyGDF DataFrame
+        X : cuDF DataFrame
           Dense matrix (floats or doubles) of shape (n_samples, n_features)
 
         Returns
@@ -275,7 +275,7 @@ class PCA:
                                       <double*> noise_vars_ptr,
                                       params)
 
-        components_gdf = pygdf.DataFrame()
+        components_gdf = cudf.DataFrame()
         for i in range(0, params.n_cols):
             components_gdf[str(i)] = self.components_[i*params.n_components:(i+1)*params.n_components]
 
@@ -293,21 +293,21 @@ class PCA:
 
         Parameters
         ----------
-        X : PyGDF DataFrame, shape (n_samples, n_features)
+        X : cuDF DataFrame, shape (n_samples, n_features)
           training data (floats or doubles), where n_samples is the number of samples, and n_features is the number of features.
 
         Returns
         -------
-        X_new : PyGDF DataFrame, shape (n_samples, n_components)
+        X_new : cuDF DataFrame, shape (n_samples, n_components)
         """
         self.fit(X, _transform=True)
-        X_new = pygdf.DataFrame()
+        X_new = cudf.DataFrame()
         num_rows = self.params.n_rows
 
         for i in range(0, self.params.n_components):
             X_new[str(i)] = self.trans_input_[i*num_rows:(i+1)*num_rows]
 
-        return X_new 
+        return X_new
 
     def inverse_transform(self, X):
         """
@@ -317,12 +317,12 @@ class PCA:
 
         Parameters
         ----------
-        X : PyGDF DataFrame, shape (n_samples, n_components)
+        X : cuDF DataFrame, shape (n_samples, n_components)
             New data (floats or doubles), where n_samples is the number of samples and n_components is the number of components.
 
         Returns
         -------
-        X_original : PyGDF DataFrame, shape (n_samples, n_features)
+        X_original : cuDF DataFrame, shape (n_samples, n_features)
 
         """
         cpdef c_pca.paramsPCA params
@@ -362,12 +362,12 @@ class PCA:
                                       <double*> input_ptr,
                                       params)
 
-        X_original = pygdf.DataFrame()
+        X_original = cudf.DataFrame()
         for i in range(0, params.n_cols):
             X_original[str(i)] = input_data[i*params.n_rows:(i+1)*params.n_rows]
 
 
-        return X_original 
+        return X_original
 
     def transform(self, X):
         """
@@ -377,12 +377,12 @@ class PCA:
 
         Parameters
         ----------
-        X : PyGDF DataFrame, shape (n_samples, n_features)
+        X : cuDF DataFrame, shape (n_samples, n_features)
             New data (floats or doubles), where n_samples is the number of samples and n_features is the number of features.
 
         Returns
         -------
-        X_new : PyGDF DataFrame, shape (n_samples, n_components)
+        X_new : cuDF DataFrame, shape (n_samples, n_components)
 
         """
         cpdef c_pca.paramsPCA params
@@ -422,7 +422,7 @@ class PCA:
                                <double*> mean_ptr,
                                params)
 
-        X_new = pygdf.DataFrame()
+        X_new = cudf.DataFrame()
         for i in range(0, params.n_components):
             X_new[str(i)] = trans_input_data[i*params.n_rows:(i+1)*params.n_rows]