Skip to content

Commit

Permalink
Revert "Adding queries 25, 26 and 30 to be reviewed (#241)"
Browse files Browse the repository at this point in the history
This reverts commit 9ae8a4d.
  • Loading branch information
VibhuJawa authored Mar 14, 2022
1 parent 9ae8a4d commit dd002e8
Show file tree
Hide file tree
Showing 5 changed files with 14 additions and 41 deletions.
15 changes: 4 additions & 11 deletions gpu_bdb/bdb_tools/q25_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
# limitations under the License.
#

import dask.dataframe as dd
import dask_cudf

from bdb_tools.utils import train_clustering_model
Expand All @@ -34,7 +33,6 @@ def read_tables(config, c=None):
data_format=config["file_format"],
basepath=config["data_dir"],
split_row_groups=config["split_row_groups"],
backend=config["backend"],
)

ss_cols = ["ss_customer_sk", "ss_sold_date_sk", "ss_ticket_number", "ss_net_paid"]
Expand Down Expand Up @@ -67,15 +65,10 @@ def get_clusters(client, ml_input_df):
results_dict = client.compute(*ml_tasks, sync=True)

output = ml_input_df.index.to_frame().reset_index(drop=True)

if isinstance(ml_input_df, cudf.DataFrame):
labels_final = dask_cudf.from_cudf(
results_dict["cid_labels"], npartitions=output.npartitions
)
else:
labels_final = dd.from_cudf(
results_dict["cid_labels"], npartitions=output.npartitions
)

labels_final = dask_cudf.from_cudf(
results_dict["cid_labels"], npartitions=output.npartitions
)
output["label"] = labels_final.reset_index()[0]

# Sort based on CDH6.1 q25-result formatting
Expand Down
1 change: 0 additions & 1 deletion gpu_bdb/bdb_tools/q26_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def read_tables(config, c=None):
data_format=config["file_format"],
basepath=config["data_dir"],
split_row_groups=config["split_row_groups"],
backend=config["backend"],
)

ss_cols = ["ss_customer_sk", "ss_item_sk"]
Expand Down
1 change: 0 additions & 1 deletion gpu_bdb/bdb_tools/q30_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ def read_tables(config, c=None):
data_format=config["file_format"],
basepath=config["data_dir"],
split_row_groups=config["split_row_groups"],
backend=config["backend"],
)

item_cols = ["i_category_id", "i_item_sk"]
Expand Down
19 changes: 5 additions & 14 deletions gpu_bdb/queries/q25/gpu_bdb_query_25_dask_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,28 +36,19 @@
from dask import delayed

def get_clusters(client, ml_input_df):
import dask.dataframe as dd
import dask_cudf
import cudf
import pandas as pd


ml_tasks = [
delayed(train_clustering_model)(df, N_CLUSTERS, CLUSTER_ITERATIONS, N_ITER)
for df in ml_input_df.to_delayed()
]
results_dict = client.compute(*ml_tasks, sync=True)

output = ml_input_df.index.to_frame().reset_index(drop=True)

if isinstance(ml_input_df, cudf.DataFrame):
labels_final = dask_cudf.from_cudf(
results_dict["cid_labels"], npartitions=output.npartitions
)
else:
labels_final = dd.from_pandas(
pd.DataFrame(results_dict["cid_labels"]), npartitions=output.npartitions
)


labels_final = dask_cudf.from_cudf(
results_dict["cid_labels"], npartitions=output.npartitions
)
output["label"] = labels_final.reset_index()[0]

# Based on CDH6.1 q25-result formatting
Expand Down
19 changes: 5 additions & 14 deletions gpu_bdb/queries/q26/gpu_bdb_query_26_dask_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,8 @@
from dask import delayed

def get_clusters(client, kmeans_input_df):
import dask.dataframe as dd
import dask_cudf
import cudf
import pandas as pd


ml_tasks = [
delayed(train_clustering_model)(df, N_CLUSTERS, CLUSTER_ITERATIONS, N_ITER)
for df in kmeans_input_df.to_delayed()
Expand All @@ -49,16 +46,10 @@ def get_clusters(client, kmeans_input_df):
results_dict = client.compute(*ml_tasks, sync=True)

output = kmeans_input_df.index.to_frame().reset_index(drop=True)

if isinstance(kmeans_input_df, cudf.DataFrame):
labels_final = dask_cudf.from_cudf(
results_dict["cid_labels"], npartitions=output.npartitions
)
else:
labels_final = dd.from_pandas(
pd.DataFrame(results_dict["cid_labels"]), npartitions=output.npartitions
)


labels_final = dask_cudf.from_cudf(
results_dict["cid_labels"], npartitions=output.npartitions
)
output["label"] = labels_final.reset_index()[0]

# Based on CDH6.1 q26-result formatting
Expand Down

0 comments on commit dd002e8

Please sign in to comment.