Showing 1 changed file with 80 additions and 17 deletions.
# Databricks notebook source
# FUTURE SERVICE PRINCIPAL STUFF FOR MOUNTING: OAuth config for an abfss/ADLS
# mount; not used by the storage-key mount below, kept for a later switch.
configs = {"fs.azure.account.auth.type": "OAuth",
           "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
           "fs.azure.account.oauth2.client.id": dbutils.secrets.get(scope="data-lake", key="sp-app-id"),  # Service Principal App ID
           "fs.azure.account.oauth2.client.secret": dbutils.secrets.get(scope="data-lake", key="sp-password"),  # Service Principal Key
           "fs.azure.account.oauth2.client.endpoint": dbutils.secrets.get(scope="data-lake", key="sp-token-endpoint")}  # directory (tenant) ID

# Pull storage and workspace settings from the "data-lake" secret scope.
secrets = {}
secrets["blob_name"] = dbutils.secrets.get(scope="data-lake", key="blob-name")
secrets["blob_key"] = dbutils.secrets.get(scope="data-lake", key="blob-key")
secrets["container_name"] = dbutils.secrets.get(scope="data-lake", key="container-name")
secrets["subscription_id"] = dbutils.secrets.get(scope="data-lake", key="subscription-id")
secrets["resource_group"] = dbutils.secrets.get(scope="data-lake", key="resource-group")
secrets["ml_workspace_name"] = dbutils.secrets.get(scope="data-lake", key="ml-workspace-name")
secrets["alg_state"] = dbutils.secrets.get(scope="data-lake", key="alg-state")
try:
    secrets["created_by"] = dbutils.secrets.get(scope="data-lake", key="created-by")
except Exception:
    print("created-by secret not found; falling back to hard-coded created_by")
    secrets["created_by"] = "dacrook"

# Create the storage-key config to pass into mount.
configs = {"fs.azure.account.key." + secrets["blob_name"] + ".blob.core.windows.net": secrets["blob_key"]}

# Optionally, you can add <your-directory-name> to the source URI of your mount point.
# The earlier OAuth/abfss mount is kept for reference but disabled, since mounting
# the same mount point twice raises an error:
# dbutils.fs.mount(
#     source = dbutils.secrets.get(scope="data-lake", key="datalake-fqdn"),
#     mount_point = "/mnt/datalake",
#     extra_configs = configs)
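# Re-run guard (a sketch, not in the original): dbutils.fs.mount raises if the
# mount point already exists, so unmount any stale mount first when iterating
# interactively.
if any(m.mountPoint == "/mnt/datalake" for m in dbutils.fs.mounts()):
    dbutils.fs.unmount("/mnt/datalake")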
# Mount the external blob storage as mount point /mnt/datalake for data storage.
dbutils.fs.mount(source = "wasbs://" + secrets["container_name"] + "@" + secrets["blob_name"] + ".blob.core.windows.net/",
                 mount_point = "/mnt/datalake",
                 extra_configs = configs)

# COMMAND ----------
# Display the files in the mounted folder.
dbutils.fs.ls("dbfs:/mnt/datalake")

census = spark.read.format('csv').options(header='true', inferSchema='true').load('/mnt/datalake/AdultCensusIncome.csv')
census.printSchema()
display(census)
# NOTE: the CSV headers carry a leading space, hence " fnlwgt" and " hours-per-week".
display(census.select("age", " fnlwgt", " hours-per-week"))
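
# Optional cleanup (a sketch, not in the original): strip the leading spaces from
# the column names. Kept in a separate variable so the space-prefixed selects
# elsewhere in this notebook still work against the original frame.
census_clean = census.select([census[c].alias(c.strip()) for c in census.columns])
display(census_clean.select("age", "fnlwgt", "hours-per-week"))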

# COMMAND ----------
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
import pickle

# There is a race condition between display and unmounting in the interactive
# environment, so the unmount is deferred to the end of the notebook:
# dbutils.fs.unmount("/mnt/datalake")

# Pull the feature and target columns into NumPy arrays.
x = np.array(census.select("age", " hours-per-week").collect()).reshape(-1, 2)
y = np.array(census.select(" fnlwgt").collect()).reshape(-1, 1)

# Split data & train scalers on the training portion only.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=777, shuffle=True)
x_scaler = StandardScaler().fit(x_train)
y_scaler = StandardScaler().fit(y_train)

# Transform all data with the fitted scalers.
x_train = x_scaler.transform(x_train)
x_test = x_scaler.transform(x_test)
y_train = y_scaler.transform(y_train)
y_test = y_scaler.transform(y_test)

model = LinearRegression().fit(x_train, y_train)
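
# Equivalent formulation (a sketch, not the author's code): an sklearn Pipeline
# bundles the x-scaler and the regressor so they are fit, pickled, and applied
# as one object; the y-scaling would still be handled separately.
from sklearn.pipeline import Pipeline
pipeline = Pipeline([("scale", StandardScaler()), ("regress", LinearRegression())])
# pipeline.fit(x_raw, y) would then take the raw (unscaled) feature matrix.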

# Predict in scaled space, then map back to the original units.
y_predicted = y_scaler.inverse_transform(model.predict(x_test))

# Score against the *unscaled* test targets; the original compared the scaled
# y_test with unscaled predictions, which mixes units.
y_test_orig = y_scaler.inverse_transform(y_test)
mae = mean_absolute_error(y_test_orig, y_predicted)
r2 = r2_score(y_test_orig, y_predicted)

print(mae)
print(r2)
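
# Context for the scores (a sketch, not in the original): compare against a
# mean-predicting baseline; a useful model should beat DummyRegressor's MAE.
from sklearn.dummy import DummyRegressor
baseline = DummyRegressor(strategy="mean").fit(x_train, y_train)
baseline_mae = mean_absolute_error(y_test_orig, y_scaler.inverse_transform(baseline.predict(x_test)))
print("baseline MAE:", baseline_mae)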

# Write files to driver-local storage.
import os
# Also works: "/dbfs/tmp/models/worst_regression/dacrook/"
# NOTE: the original prefix "file:/models/worst_regression/" breaks Python's
# open()/os.makedirs, which do not understand the file: scheme; use a plain path.
prefix = "/models/worst_regression/"
if not os.path.exists(prefix):
    os.makedirs(prefix)
with open(prefix + "x_scaler.pkl", "wb") as handle:
    pickle.dump(x_scaler, handle)
with open(prefix + "y_scaler.pkl", "wb") as handle:
    pickle.dump(y_scaler, handle)
with open(prefix + "model.pkl", "wb") as handle:
    pickle.dump(model, handle)
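
# Round-trip check (a sketch, not in the original): reload the pickle and
# confirm the restored model reproduces the in-memory predictions.
with open(prefix + "model.pkl", "rb") as handle:
    restored = pickle.load(handle)
assert np.allclose(restored.predict(x_test), model.predict(x_test))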

# Create an Azure ML Model out of it, tagged with state and creator.
from azureml.core.workspace import Workspace
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.core.model import Model
print("Logging in - navigate to https://microsoft.com/devicelogin and enter the code printed below")
az_ws = Workspace(secrets["subscription_id"], secrets["resource_group"], secrets["ml_workspace_name"])
print("Logged in and workspace retrieved.")
Model.register(az_ws, model_path=prefix, model_name="worst_regression",
               tags={"state": secrets["alg_state"], "created_by": secrets["created_by"]})
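
# Non-interactive alternative (a sketch): ServicePrincipalAuthentication is
# imported above but unused; with a tenant-id secret (hypothetical key name
# "sp-tenant-id") the workspace could be retrieved without the device login.
# sp_auth = ServicePrincipalAuthentication(
#     tenant_id=dbutils.secrets.get(scope="data-lake", key="sp-tenant-id"),
#     service_principal_id=dbutils.secrets.get(scope="data-lake", key="sp-app-id"),
#     service_principal_password=dbutils.secrets.get(scope="data-lake", key="sp-password"))
# az_ws = Workspace(secrets["subscription_id"], secrets["resource_group"],
#                   secrets["ml_workspace_name"], auth=sp_auth)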

# Finally, unmount the mount point.
dbutils.fs.unmount("/mnt/datalake")