diff --git a/.gitignore b/.gitignore
index 894a44c..beefb97 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,9 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+# AZ ML TOOLING GENERATED FILES
+/Project_One/dockerfile
+/Project_One/my_runbuild_local.cmd
+/Project_One/notebooks/myenv.yml
+/Project_One-Tests/dockerfile
\ No newline at end of file
diff --git a/Project_One-Tests/__init__.py b/Project_One-Tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Project_One-Tests/dockerfile.base b/Project_One-Tests/dockerfile.base
new file mode 100644
index 0000000..d571695
--- /dev/null
+++ b/Project_One-Tests/dockerfile.base
@@ -0,0 +1,18 @@
+FROM 
+
+RUN pip install pytest
+RUN pip install pytest-cov
+
+COPY . /var/azureml-app/tests
+RUN mkdir /var/azureml-app/tests/junit
+
+COPY ./__init__.py /var/azureml-app
+
+RUN chmod +x /var/azureml-app/tests/runtests.sh
+
+WORKDIR "/var/azureml-app"
+
+#Ensures python print lines are printed out
+ENV PYTHONUNBUFFERED 1
+
+CMD ["bash", "/var/azureml-app/tests/runtests.sh"]
diff --git a/Project_One-Tests/runtests.sh b/Project_One-Tests/runtests.sh
new file mode 100644
index 0000000..254a704
--- /dev/null
+++ b/Project_One-Tests/runtests.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+pytest --doctest-modules --junitxml=/var/azureml-app/tests/junit/test-results.xml --cov=/var/azureml-app/inference_code/ /var/azureml-app/tests --cov-report=xml:/var/azureml-app/tests/junit/coverage.xml --cov-report=html:/var/azureml-app/tests/junit/cov_html
\ No newline at end of file
diff --git a/Project_One-Tests/runtests_local.cmd b/Project_One-Tests/runtests_local.cmd
new file mode 100644
index 0000000..35a3473
--- /dev/null
+++ b/Project_One-Tests/runtests_local.cmd
@@ -0,0 +1,23 @@
+:: Remove containers that could be running
+docker stop mltests
+docker rm mltests
+
+:: Build AML Container
+:: cd ..
+:: cd ./Project_One
+:: docker build -t mlbuild .
+:: docker run --name mlbuild --rm --privileged -v /var/run/docker.sock:/var/run/docker.sock mlbuild
+
+REM cd ..
+REM cd ./Project_One
+REM cmd runbuild_local.cmd
+
+:: TODO: Get generated container ID & replace token in docker file
+powershell -Command "$dict = (gc c:/ml_temp/artifacts/artifacts.json) | ConvertFrom-JSON; (gc dockerfile.base) -replace '', $dict.image_location | Out-File dockerfile -Encoding utf8"
+
+cd ..
+cd ./Project_One-Tests
+docker build -t mltests .
+
+if not exist "C:\ml_temp\artifacts\test_results" mkdir C:\ml_temp\artifacts\test_results
+docker run --name mltests --privileged -v c:/ml_temp/artifacts/test_results:/var/azureml-app/tests/junit mltests
\ No newline at end of file
diff --git a/Project_One-Tests/runtests_pipeline.sh b/Project_One-Tests/runtests_pipeline.sh
new file mode 100644
index 0000000..167a345
--- /dev/null
+++ b/Project_One-Tests/runtests_pipeline.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+set -euo pipefail
+IFS=$'\n\t'
+
+# -e: immediately exit if any command has a non-zero exit status
+# -o pipefail: prevents errors in a pipeline from being masked
+# the new IFS value is less likely to cause confusing bugs when looping over arrays or arguments (e.g. $@)
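+
+# Hedged note: artifacts.json is written to /artifacts/ by Project_One/build.py
+# (mounted to ml_temp/artifacts on the agent); it is assumed to look like
+#   {"image_location": "<registry>/<image>:<tag>"}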
+cd ml_temp/artifacts
+str=$(jq -r '.image_location' artifacts.json)
+
+echo "################### Image to be tested ################### : " $str
+cd /
+# $(System.DefaultWorkingDirectory) and $(Agent.HomeDirectory) are Azure Pipelines macros, expanded before this script runs
+cd $(System.DefaultWorkingDirectory)/Project_One-Tests
+echo "################### Updating Tests Docker File ################### "
+sed "s||${str}|g" dockerfile.base > dockerfile
+
+echo "################### Logging into ACR ################### "
+docker login $ACR_NAME -u $ACR_USER -p $ACR_PASSWORD
+echo "################### Building MLTESTS Image ################### "
+docker build -t mltests .
+echo "################### Running MLTests Container and Conducting Tests ################### "
+docker run --name mltests -v $(Agent.HomeDirectory)/ml_temp/artifacts/test_results:/var/azureml-app/tests/junit mltests
+echo "################### Ending Test Sequence ################### "
+sudo chown -R $(id -u):$(id -u) $(Agent.HomeDirectory)/ml_temp/artifacts/test_results/cov_html/
diff --git a/Project_One-Tests/test_model.py b/Project_One-Tests/test_model.py
new file mode 100644
index 0000000..49b86c5
--- /dev/null
+++ b/Project_One-Tests/test_model.py
@@ -0,0 +1,17 @@
+import sys
+import os
+import pytest
+sys.path.append("../azureml-app/") # make the deployed app root importable (inference_code lives there)
+from inference_code.model_class import MyModel
+
+class TestModel(object):
+    """
+    Tests for the MyModel wrapper.
+    """
+    def setUp(self):  # unittest-style hook; pytest does not call this
+        pass
+
+    def test_init(self):
+        m = MyModel()
+        m.init()
+        assert(m.x_scaler is not None)
diff --git a/Project_One/__init__.py b/Project_One/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Project_One/build.py b/Project_One/build.py
new file mode 100644
index 0000000..8db924f
--- /dev/null
+++ b/Project_One/build.py
@@ -0,0 +1,81 @@
+from azureml.core.workspace import Workspace
+from azureml.core.authentication import ServicePrincipalAuthentication
+from azureml.core.model import Model
+from azureml.core.image import ContainerImage, Image
+from azureml.core.conda_dependencies import CondaDependencies
+import os
+from os import walk
+import shutil
+import json
+
+def resolve_sub_id():
+    return os.environ["SUBSCRIPTION_ID"]
+
+def resolve_rg():
+    return os.environ["RESOURCE_GROUP"]
+
+def resolve_workspace_name():
+    return os.environ["WORKSPACE_NAME"]
+
+def resolve_state():
+    return os.environ["STATE"]
+
+def resolve_author():
+    return os.environ["AUTHOR"]
+
+def resolve_model_name():
+    return os.environ["MODEL_NAME"]
+
+def resolve_image_name():
+    return os.environ["IMAGE_NAME"]
+
+def run():
+    print("entered run")
+    variables_received = "sub_id: {}, rg: {}, work_name: {}, state: {}, author: {}, model_name: {}" \
+        .format(resolve_sub_id(),
+                resolve_rg(),
+                resolve_workspace_name(),
+                resolve_state(),
+                resolve_author(),
+                resolve_model_name())
+    print(variables_received)
+
+    az_ws = Workspace(resolve_sub_id(), resolve_rg(), resolve_workspace_name())
+    print("initialized workspace")
+    #Get & Download model
+    model = Model(az_ws, name=resolve_model_name(), tags={"state" : resolve_state(), "created_by" : resolve_author()})
+    print("initialized model")
+    model.download(target_dir="./assets/")
+    print("downloaded model assets")
+    #TODO: remove workaround for ml sdk dropping assets into /assets/dacrook folder when files dropped to consistent location
+    for dir_p, _, f_n in walk("./assets"):
+        for f in f_n:
+            abs_path = os.path.abspath(os.path.join(dir_p, f))
+            shutil.move(abs_path, "./assets/" + f)
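+    # hedged illustration: a model file downloaded to the hypothetical
+    # ./assets/dacrook/model.pkl ends up at ./assets/model.pkl after this loop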
open("myenv.yml","w") as f: + f.write(my_env.serialize_to_string()) + image_config = ContainerImage.image_configuration(execution_script = "score.py", + runtime="python", + conda_file="myenv.yml", + dependencies=["assets", "inference_code"], + tags={"state" : resolve_state(), "created_by" : resolve_author()}) + print("configured image") + #TODO: use this once model is dropped to a consistent location +# image = Image.create(workspace = az_ws, name=resolve_image_name(), models=[model], image_config = image_config) + image = Image.create(workspace = az_ws, name=resolve_image_name(), models=[model], image_config = image_config) + image.wait_for_creation() + print("created image") + if(image.creation_state != "Succeeded"): + raise Exception("Failed to create image.") + print("image location: {}".format(image.image_location)) + artifacts = {"image_location" : image.image_location} + if(not os.path.exists("/artifacts/")): + os.makedirs("/artifacts/") + with open("/artifacts/artifacts.json", "w") as outjson: + json.dump(artifacts, outjson) + +if __name__ == "__main__": + run() \ No newline at end of file diff --git a/Project_One/dockerfile.base b/Project_One/dockerfile.base new file mode 100644 index 0000000..e3eba5e --- /dev/null +++ b/Project_One/dockerfile.base @@ -0,0 +1,26 @@ +FROM continuumio/miniconda3 + +RUN apt-get update -y +RUN apt-get upgrade -y + +RUN apt-get install build-essential -y + +RUN pip install --upgrade pip setuptools wheel + +ADD requirements.txt /prereqs/ +RUN pip install -r ./prereqs/requirements.txt + +COPY . . + +ENV SUBSCRIPTION_ID= \ +RESOURCE_GROUP= \ +WORKSPACE_NAME= \ +STATE= \ +AUTHOR= \ +MODEL_NAME= \ +IMAGE_NAME= + +#Ensures python print lines are printed out +ENV PYTHONUNBUFFERED 1 + +CMD ["python", "build.py"] \ No newline at end of file diff --git a/Project_One/inference_code/__init__.py b/Project_One/inference_code/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Project_One/inference_code/model_class.py b/Project_One/inference_code/model_class.py new file mode 100644 index 0000000..0cdb13a --- /dev/null +++ b/Project_One/inference_code/model_class.py @@ -0,0 +1,39 @@ +""" +@Description: Model wrapper class for testability. +@Author: David Crook +@Author_Email: DaCrook@Microsoft.com + +Copyright (c) Microsoft Corporation. All rights reserved. +Licensed under the MIT License. 
+""" +import pickle +import sys +sys.path.append("../azureml-app/") +from inference_code.utility import transform_input + +class MyModel: + x_scaler = None + y_scaler = None + model = None + + def init(self): + root_path = "./assets/" + with open(root_path + "x_scaler.pkl", "rb") as xfile: + self.x_scaler = pickle.load(xfile) + with open(root_path + "y_scaler.pkl", "rb") as yfile: + self.y_scaler = pickle.load(yfile) + with open(root_path + "model.pkl", "rb") as mfile: + self.model = pickle.load(mfile) + + def predict(self, input_package): + """ + input_package: json formatted string of the form + {"age": integer, "hours-per-week" : double, "sex" : string, "occupation" string} + + returns json formatted string of the form: {"estimated_wages" : float} + """ + x = transform_input(input_package) + x = self.x_scaler.transform(x) + y = self.model.predict(x) + y = self.y_scaler.inverse_transform(y) + return y \ No newline at end of file diff --git a/Project_One/inference_code/utility.py b/Project_One/inference_code/utility.py new file mode 100644 index 0000000..ceafa5b --- /dev/null +++ b/Project_One/inference_code/utility.py @@ -0,0 +1,27 @@ +""" +@Description: Utility class for transformation of the data package +@Author: David Crook +@Author_Email: DaCrook@Microsoft.com + +Copyright (c) Microsoft Corporation. All rights reserved. +Licensed under the MIT License. +""" +import json +import numpy as np + +def transform_input(input_package): + """ + input_package: raw json input package as agreed upon + returns: numpy array of correct format without pre-processing + """ + d = json.loads(input_package) + # Add extra processing for some reason. + x = np.array([d["age"], d["hours-per-week"]]).transpose() + return x + +def transform_output(y): + """ + takes raw output from model and transforms it into the agreed upon interface for worldly consumption + """ + d = {"estimated_wages" : y} + return json.dumps(d) \ No newline at end of file diff --git a/Project_One/notebooks/submit_run_db.py b/Project_One/notebooks/submit_run_db.py new file mode 100644 index 0000000..c6f74b6 --- /dev/null +++ b/Project_One/notebooks/submit_run_db.py @@ -0,0 +1,82 @@ +import os +import azureml.core +from azureml.core.runconfig import JarLibrary +from azureml.core.compute import ComputeTarget, DatabricksCompute +from azureml.exceptions import ComputeTargetException +from azureml.core import Workspace, Experiment +from azureml.pipeline.core import Pipeline, PipelineData +from azureml.pipeline.steps import DatabricksStep +from azureml.core.datastore import Datastore +from azureml.data.data_reference import DataReference +from azureml.core.conda_dependencies import CondaDependencies +import ast + +def resolve_dependencies(): + """ + ENV VAR OF FORM: "['numpy', 'scikit-learn', 'azureml-sdk']" + """ + dep_list = ast.literal_eval(os.environ["DEP_LIST"]) + return dep_list + +def resolve_compute_name(): + return os.environ["COMPUTE_NAME"] + +def resolve_rg(): + return os.environ["RESOURCE_GROUP"] + +def resolve_db_workspace_name(): + return os.environ["DB_WORKSPACE_NAME"] + +def resolve_db_access_token(): + return os.environ["DB_ACCESS_TOKEN"] + +def resolve_script_name(): + return os.environ["SCRIPT_NAME"] + +def resolve_subscription_id(): + return os.environ["SUBSCRIPTION_ID"] + +def resolve_ml_workspace_name(): + return os.environ["ML_WORKSPACE_NAME"] + +def resolve_source_directory(): + return os.environ["SOURCE_DIR"] + +def resolve_db_cluster_id(): + return os.environ["DB_CLUSTER_ID"] + +my_env = 
diff --git a/Project_One/notebooks/submit_run_db.py b/Project_One/notebooks/submit_run_db.py
new file mode 100644
index 0000000..c6f74b6
--- /dev/null
+++ b/Project_One/notebooks/submit_run_db.py
@@ -0,0 +1,82 @@
+import os
+import azureml.core
+from azureml.core.runconfig import JarLibrary
+from azureml.core.compute import ComputeTarget, DatabricksCompute
+from azureml.exceptions import ComputeTargetException
+from azureml.core import Workspace, Experiment
+from azureml.pipeline.core import Pipeline, PipelineData
+from azureml.pipeline.steps import DatabricksStep
+from azureml.core.datastore import Datastore
+from azureml.data.data_reference import DataReference
+from azureml.core.conda_dependencies import CondaDependencies
+import ast
+
+def resolve_dependencies():
+    """
+    ENV VAR OF FORM: "['numpy', 'scikit-learn', 'azureml-sdk']"
+    """
+    dep_list = ast.literal_eval(os.environ["DEP_LIST"])
+    return dep_list
+
+def resolve_compute_name():
+    return os.environ["COMPUTE_NAME"]
+
+def resolve_rg():
+    return os.environ["RESOURCE_GROUP"]
+
+def resolve_db_workspace_name():
+    return os.environ["DB_WORKSPACE_NAME"]
+
+def resolve_db_access_token():
+    return os.environ["DB_ACCESS_TOKEN"]
+
+def resolve_script_name():
+    return os.environ["SCRIPT_NAME"]
+
+def resolve_subscription_id():
+    return os.environ["SUBSCRIPTION_ID"]
+
+def resolve_ml_workspace_name():
+    return os.environ["ML_WORKSPACE_NAME"]
+
+def resolve_source_directory():
+    return os.environ["SOURCE_DIR"]
+
+def resolve_db_cluster_id():
+    return os.environ["DB_CLUSTER_ID"]
+
+my_env = CondaDependencies.create(conda_packages=resolve_dependencies())
+
+with open("myenv.yml","w") as f:
+    f.write(my_env.serialize_to_string())
+
+
+ws = Workspace(resolve_subscription_id(), resolve_rg(), resolve_ml_workspace_name())
+
+
+config = DatabricksCompute.attach_configuration(
+    resource_group = resolve_rg(),
+    workspace_name = resolve_db_workspace_name(),
+    access_token = resolve_db_access_token())
+databricks_compute = ComputeTarget.attach(ws, resolve_compute_name(), config)
+databricks_compute.wait_for_completion(True)
+
+dbPythonInLocalMachineStep = DatabricksStep(
+    name="DBPythonInLocalMachine",
+    python_script_name=resolve_script_name(),
+    source_directory=resolve_source_directory(),
+    run_name='DB_Worst_Regression_Run',
+    compute_target=databricks_compute,
+    existing_cluster_id=resolve_db_cluster_id(),
+    allow_reuse=True
+)
+
+
+steps = [dbPythonInLocalMachineStep]
+pipeline = Pipeline(workspace=ws, steps=steps)
+pipeline_run = Experiment(ws, 'DB_Python_Local_demo').submit(pipeline)
+pipeline_run.wait_for_completion()
+
+
+#from azureml.widgets import RunDetails
+#RunDetails(pipeline_run).show()
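+
+# Hedged invocation sketch (variable names taken from the resolve_* helpers above;
+# values illustrative):
+#   export SUBSCRIPTION_ID=... RESOURCE_GROUP=... ML_WORKSPACE_NAME=...
+#   export DB_WORKSPACE_NAME=... DB_ACCESS_TOKEN=... DB_CLUSTER_ID=... COMPUTE_NAME=...
+#   export SCRIPT_NAME=train_model.py SOURCE_DIR=./notebooks
+#   export DEP_LIST="['numpy', 'scikit-learn', 'azureml-sdk']"
+#   python submit_run_db.py
\ No newline at end of file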
dbutils.fs.mount( source = "abfss://" + secrets["container_name"] + "@" + secrets["storage_account_name"] + ".dfs.core.windows.net", #blobcontainername@storageaccount mount_point = "/mnt/datalake", @@ -29,10 +54,19 @@ except Exception as e: print("already mounted; no need to do so.") +# COMMAND ---------- + +#display the files in the folder +dbutils.fs.ls("dbfs:/mnt/datalake") + +# COMMAND ---------- + census = sqlContext.read.format('csv').options(header='true', inferSchema='true').load('/mnt/datalake/AdultCensusIncome.csv') census.printSchema() display(census.select("age", " fnlwgt", " hours-per-week")) +# COMMAND ---------- + import numpy as np from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler @@ -40,9 +74,18 @@ from sklearn.metrics import mean_absolute_error, r2_score import pickle +# COMMAND ---------- + x = np.array(census.select("age", " hours-per-week").collect()).reshape(-1,2) y = np.array(census.select(" fnlwgt").collect()).reshape(-1,1) +# COMMAND ---------- + +x +y + +# COMMAND ---------- + #Split data & Train Scalers x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state = 777, shuffle = True) x_scaler = StandardScaler().fit(x_train) @@ -64,6 +107,8 @@ print(mae) print(r2) +# COMMAND ---------- + #Write Files to local file storage import os #Also works: "/dbfs/tmp/models/worst_regression/dacrook/" @@ -81,10 +126,15 @@ from azureml.core.workspace import Workspace from azureml.core.authentication import ServicePrincipalAuthentication from azureml.core.model import Model -print("Logging In - navigate to https://microsoft.com/devicelogin and enter the code from the print out below") -az_ws = Workspace(secrets["subscription_id"], secrets["resource_group"], secrets["ml_workspace_name"]) +az_sp = ServicePrincipalAuthentication(secrets["sp_tenant_id"], secrets["sp_app_id"], secrets["sp_password"]) #tenant id +az_ws = Workspace(secrets["subscription_id"], secrets["resource_group"], secrets["ml_workspace_name"], auth= az_sp) print("Logged in and workspace retreived.") Model.register(az_ws, model_path = prefix, model_name = "worst_regression", tags={"state" : secrets["alg_state"], "created_by" : secrets["created_by"]}) +# COMMAND ---------- + #finally unmount the mount. 
-dbutils.fs.unmount("/mnt/datalake") +try: + dbutils.fs.unmount("/mnt/datalake") +except Exception as e: + print("already unmounted; no need to unmount again.") \ No newline at end of file diff --git a/Project_One/requirements.txt b/Project_One/requirements.txt new file mode 100644 index 0000000..8a76702 --- /dev/null +++ b/Project_One/requirements.txt @@ -0,0 +1 @@ +azureml-sdk \ No newline at end of file diff --git a/Project_One/run_build_local.cmd b/Project_One/run_build_local.cmd new file mode 100644 index 0000000..f688f9d --- /dev/null +++ b/Project_One/run_build_local.cmd @@ -0,0 +1,28 @@ +docker stop mlbuild +docker rm mlbuild + +if not exist "C:\ml_temp\artifacts" mkdir C:\ml_temp\artifacts + +:: REPLACE TOKENS IN dockerfile +set ml_subscription_id="" +set ml_resource_group="" +set ml_workspace_name="" +set ml_alg_state="" +set ml_alg_author="" +set ml_model_name="" +set ml_image_name="" + +powershell -Command "(gc dockerfile.base) -replace '', '%ml_subscription_id%' | Out-File dockerfile -Encoding utf8" +powershell -Command "(gc dockerfile) -replace '', '%ml_resource_group%' | Out-File dockerfile -Encoding utf8" +powershell -Command "(gc dockerfile) -replace '', '%ml_workspace_name%' | Out-File dockerfile -Encoding utf8" +powershell -Command "(gc dockerfile) -replace '', '%ml_alg_state%' | Out-File dockerfile -Encoding utf8" +powershell -Command "(gc dockerfile) -replace '', '%ml_alg_author%' | Out-File dockerfile -Encoding utf8" +powershell -Command "(gc dockerfile) -replace '', '%ml_model_name%' | Out-File dockerfile -Encoding utf8" +powershell -Command "(gc dockerfile) -replace '', '%ml_image_name%' | Out-File dockerfile -Encoding utf8" + +docker build -t mlbuild . +:: docker run --name mlbuild --rm --privileged -v /var/run/docker.sock:/var/run/docker.sock -v c:/Users/%USERNAME%/.azure:/root/.azure mlbuild +docker run --name mlbuild --privileged -v c:/ml_temp/artifacts:/artifacts/ -v c:/Users/%USERNAME%/.azure/:/root/.azure/ mlbuild + +:: use env variables & use a git ignore to hide settings. +:: possibly have a local build vs remote build cmd/.sh files. diff --git a/Project_One/runbuild_pipeline.sh b/Project_One/runbuild_pipeline.sh new file mode 100644 index 0000000..d2c51ad --- /dev/null +++ b/Project_One/runbuild_pipeline.sh @@ -0,0 +1,29 @@ +#!/bin/bash +set -euo pipefail +IFS=$'\n\t' + +# -e: immediately exit if any command has a non-zero exit status +# -o: prevents errors in a pipeline from being masked +# IFS new value is less likely to cause confusing bugs when looping arrays or arguments (e.g. $@) + +#Creating Artifact and Test Results Directories +mkdir ml_temp && cd ml_temp +mkdir artifacts && cd artifacts +mkdir test_results + +#Switching to Project Directory +cd / +cd $(System.DefaultWorkingDirectory)/Project_One + +#Docker Build Inf Container +echo "Building Inference Container" +docker build -t mlbuild . 
diff --git a/Project_One/runbuild_pipeline.sh b/Project_One/runbuild_pipeline.sh
new file mode 100644
index 0000000..d2c51ad
--- /dev/null
+++ b/Project_One/runbuild_pipeline.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+set -euo pipefail
+IFS=$'\n\t'
+
+# -e: immediately exit if any command has a non-zero exit status
+# -o pipefail: prevents errors in a pipeline from being masked
+# the new IFS value is less likely to cause confusing bugs when looping over arrays or arguments (e.g. $@)
+
+#Creating Artifact and Test Results Directories
+mkdir ml_temp && cd ml_temp
+mkdir artifacts && cd artifacts
+mkdir test_results
+
+#Switching to Project Directory
+cd /
+cd $(System.DefaultWorkingDirectory)/Project_One
+
+#Docker Build Inference Container
+echo "Building Inference Container"
+docker build -t mlbuild .
+
+#Run Built Container
+echo "Running Inference Container"
+docker run -e SUBSCRIPTION_ID=$(SUBSCRIPTION_ID) -e RESOURCE_GROUP=$(RESOURCE_GROUP) -e WORKSPACE_NAME=$(WORKSPACE_NAME) -e STATE=$(STATE) -e AUTHOR=$(AUTHOR) -e MODEL_NAME=$(MODEL_NAME) -e IMAGE_NAME=$(IMAGE_NAME) --name mlbuild --rm -v $(Agent.HomeDirectory)/ml_temp/artifacts:/artifacts/ -v /home/vsts/.azure/:/root/.azure/ mlbuild
+
+
+
+
+
diff --git a/Project_One/score.py b/Project_One/score.py
new file mode 100644
index 0000000..dd37e9b
--- /dev/null
+++ b/Project_One/score.py
@@ -0,0 +1,12 @@
+import json
+from inference_code.model_class import MyModel
+
+MODEL = None
+
+def init():
+    global MODEL
+    MODEL = MyModel()
+    MODEL.init()
+
+def run(input_package):
+    return MODEL.predict(input_package)
\ No newline at end of file
diff --git a/README.md b/README.md
index 797d816..a4579b6 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,20 @@
-# DataBricks_ML_DevOps
-Showcase Dev Ops for ML practices with Azure DataBricks
+# Introduction
+TODO: Give a short introduction of your project. Let this section explain the objectives or the motivation behind this project.
+
+# Getting Started
+TODO: Guide users through getting your code up and running on their own system. In this section you can talk about:
+1. Installation process
+2. Software dependencies
+3. Latest releases
+4. API references
+
+# Build and Test
+TODO: Describe and show how to build your code and run the tests.
+
+# Contribute
+TODO: Explain how other users and developers can contribute to make your code better.
+
+If you want to learn more about creating good readme files, refer to the following [guidelines](https://www.visualstudio.com/en-us/docs/git/create-a-readme). You can also seek inspiration from the readme files below:
+- [ASP.NET Core](https://github.com/aspnet/Home)
+- [Visual Studio Code](https://github.com/Microsoft/vscode)
+- [Chakra Core](https://github.com/Microsoft/ChakraCore)
\ No newline at end of file