-
Notifications
You must be signed in to change notification settings - Fork 6.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Pipeline Step Caching Example Notebook (#3638)
* feature: pipeline caching notebook example * change: initialize notebook * feature: pipeline caching notebook example and tuning notebook adjustment * fix: example notebook * change: README * fix: notebook code * fix: grammar * fix: more grammar * fix: pr syntax and remove dataset * fix: updated paths * fix: tuning notebook formatting * fix: more path corrections Co-authored-by: Brock Wade <[email protected]>
- Loading branch information
1 parent
da8d4df
commit 2439f36
Showing
5 changed files
with
1,134 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
90 changes: 90 additions & 0 deletions
90
sagemaker-pipelines/tabular/caching/artifacts/code/processing/preprocessing.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
import argparse | ||
import os | ||
import requests | ||
import tempfile | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from sklearn.compose import ColumnTransformer | ||
from sklearn.impute import SimpleImputer | ||
from sklearn.pipeline import Pipeline | ||
from sklearn.preprocessing import StandardScaler, OneHotEncoder | ||
|
||
|
||
# Since we get a headerless CSV file, we specify the column names here.
# Order must match the column order of the raw abalone-dataset.csv.
feature_columns_names = [
    "sex",
    "length",
    "diameter",
    "height",
    "whole_weight",
    "shucked_weight",
    "viscera_weight",
    "shell_weight",
]
# Name of the target column, appended after the features when reading the CSV.
label_column = "rings"

# Explicit dtypes for read_csv: "sex" is categorical text, the rest are floats.
feature_columns_dtype = {
    "sex": str,
    "length": np.float64,
    "diameter": np.float64,
    "height": np.float64,
    "whole_weight": np.float64,
    "shucked_weight": np.float64,
    "viscera_weight": np.float64,
    "shell_weight": np.float64,
}
# The label column is also read as float64.
label_column_dtype = {"rings": np.float64}
|
||
|
||
def merge_two_dicts(x, y):
    """Return a new dict with the entries of *x* updated by *y*.

    Neither input mapping is mutated; on duplicate keys, values from *y* win.
    """
    # {**x, **y} builds the merged copy in one expression (Python 3.5+),
    # replacing the manual copy()/update() dance.
    return {**x, **y}
|
||
|
||
if __name__ == "__main__":
    # SageMaker Processing mounts inputs/outputs under this prefix.
    base_dir = "/opt/ml/processing"

    # Load the headerless abalone CSV with explicit column names and dtypes.
    df = pd.read_csv(
        f"{base_dir}/input/abalone-dataset.csv",
        header=None,
        names=feature_columns_names + [label_column],
        dtype=merge_two_dicts(feature_columns_dtype, label_column_dtype),
    )

    # Every feature except the categorical "sex" column is numeric:
    # impute missing values with the median, then standardize.
    numeric_features = list(feature_columns_names)
    numeric_features.remove("sex")
    numeric_transformer = Pipeline(
        steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
    )

    # "sex" is categorical: impute with a constant sentinel, then one-hot encode,
    # ignoring categories unseen at fit time instead of raising.
    categorical_features = ["sex"]
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    preprocess = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )

    # Use the shared label_column constant instead of re-hardcoding "rings",
    # so the label name is defined in exactly one place.
    y = df.pop(label_column)
    X_pre = preprocess.fit_transform(df)
    y_pre = y.to_numpy().reshape(len(y), 1)

    # Place the label in the first column of the output matrix.
    # NOTE(review): presumably the downstream trainer expects label-first CSVs — confirm.
    X = np.concatenate((y_pre, X_pre), axis=1)

    # Shuffle, then split 70% / 15% / 15% into train / validation / test.
    np.random.shuffle(X)
    train, validation, test = np.split(X, [int(0.7 * len(X)), int(0.85 * len(X))])

    # Write headerless, index-free CSVs to the per-split output channels.
    pd.DataFrame(train).to_csv(f"{base_dir}/train/train.csv", header=False, index=False)
    pd.DataFrame(validation).to_csv(
        f"{base_dir}/validation/validation.csv", header=False, index=False
    )
    pd.DataFrame(test).to_csv(f"{base_dir}/test/test.csv", header=False, index=False)
94 changes: 94 additions & 0 deletions
94
sagemaker-pipelines/tabular/caching/artifacts/code/processing/preprocessing_2.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
import argparse | ||
import os | ||
import requests | ||
import tempfile | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from sklearn.compose import ColumnTransformer | ||
from sklearn.impute import SimpleImputer | ||
from sklearn.pipeline import Pipeline | ||
from sklearn.preprocessing import StandardScaler, OneHotEncoder | ||
|
||
|
||
# Since we get a headerless CSV file, we specify the column names here.
# Order must match the column order of the raw abalone-dataset.csv.
feature_columns_names = [
    "sex",
    "length",
    "diameter",
    "height",
    "whole_weight",
    "shucked_weight",
    "viscera_weight",
    "shell_weight",
]
# Name of the target column, appended after the features when reading the CSV.
label_column = "rings"

# Explicit dtypes for read_csv: "sex" is categorical text, the rest are floats.
feature_columns_dtype = {
    "sex": str,
    "length": np.float64,
    "diameter": np.float64,
    "height": np.float64,
    "whole_weight": np.float64,
    "shucked_weight": np.float64,
    "viscera_weight": np.float64,
    "shell_weight": np.float64,
}
# The label column is also read as float64.
label_column_dtype = {"rings": np.float64}
|
||
|
||
def merge_two_dicts(x, y):
    """Combine two mappings into a fresh dict.

    Entries from *y* override entries from *x* on key collisions;
    neither argument is modified.
    """
    merged = dict(x)
    merged.update(y)
    return merged
|
||
|
||
if __name__ == "__main__":
    # SageMaker Processing mounts inputs/outputs under this prefix.
    base_dir = "/opt/ml/processing"

    # Load the headerless abalone CSV with explicit column names and dtypes.
    df = pd.read_csv(
        f"{base_dir}/input/abalone-dataset.csv",
        header=None,
        names=feature_columns_names + [label_column],
        dtype=merge_two_dicts(feature_columns_dtype, label_column_dtype),
    )

    # Every feature except the categorical "sex" column is numeric:
    # impute missing values with the median, then standardize.
    numeric_features = list(feature_columns_names)
    numeric_features.remove("sex")
    numeric_transformer = Pipeline(
        steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
    )

    # "sex" is categorical: impute with a constant sentinel, then one-hot encode,
    # ignoring categories unseen at fit time instead of raising.
    categorical_features = ["sex"]
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    preprocess = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )

    # Use the shared label_column constant instead of re-hardcoding "rings",
    # so the label name is defined in exactly one place.
    y = df.pop(label_column)
    X_pre = preprocess.fit_transform(df)
    y_pre = y.to_numpy().reshape(len(y), 1)

    # Place the label in the first column of the output matrix.
    # NOTE(review): presumably the downstream trainer expects label-first CSVs — confirm.
    X = np.concatenate((y_pre, X_pre), axis=1)

    # Shuffle, then split 70% / 15% / 15% into train / validation / test.
    np.random.shuffle(X)
    train, validation, test = np.split(X, [int(0.7 * len(X)), int(0.85 * len(X))])

    # Write headerless, index-free CSVs to the per-split output channels.
    pd.DataFrame(train).to_csv(f"{base_dir}/train/train.csv", header=False, index=False)
    pd.DataFrame(validation).to_csv(
        f"{base_dir}/validation/validation.csv", header=False, index=False
    )
    pd.DataFrame(test).to_csv(f"{base_dir}/test/test.csv", header=False, index=False)
|
||
def additional_method():
    """Do nothing; exists solely so this file's content hash differs.

    Changing the script source alters its checksum, which is what the
    caching example relies on to distinguish the two preprocessing files.
    """
    return None
Oops, something went wrong.