diff --git a/.github/ISSUE_TEMPLATE/example-request.md b/.github/ISSUE_TEMPLATE/example-request.md index ece3f7ae07..4a02f80224 100644 --- a/.github/ISSUE_TEMPLATE/example-request.md +++ b/.github/ISSUE_TEMPLATE/example-request.md @@ -18,3 +18,6 @@ A clear and concise description of the use case for this example. **Describe what other services (other than SageMaker) are involved*** + +**Describe which dataset could be used. Provide its location in s3://sagemaker-sample-files or another source.** + diff --git a/README.md b/README.md index 33e436ff61..20947747e4 100644 --- a/README.md +++ b/README.md @@ -60,8 +60,10 @@ These examples provide a gentle introduction to machine learning concepts as the These examples introduce SageMaker's hyperparameter tuning functionality which helps deliver the best possible predictions by running a large number of training jobs to determine which hyperparameter values are the most impactful. - [XGBoost Tuning](hyperparameter_tuning/xgboost_direct_marketing) shows how to use SageMaker hyperparameter tuning to improve your model fits for the [Targeted Direct Marketing](introduction_to_applying_machine_learning/xgboost_direct_marketing) task. +- [BlazingText Tuning](hyperparameter_tuning/blazingtext_text_classification_20_newsgroups) shows how to use SageMaker hyperparameter tuning with the BlazingText built-in algorithm and 20_newsgroups dataset. - [TensorFlow Tuning](hyperparameter_tuning/tensorflow_mnist) shows how to use SageMaker hyperparameter tuning with the pre-built TensorFlow container and MNIST dataset. - [MXNet Tuning](hyperparameter_tuning/mxnet_mnist) shows how to use SageMaker hyperparameter tuning with the pre-built MXNet container and MNIST dataset. +- [HuggingFace Tuning](hyperparameter_tuning/huggingface_multiclass_text_classification_20_newsgroups) shows how to use SageMaker hyperparameter tuning with the pre-built HuggingFace container and 20_newsgroups dataset. 
- [Keras BYO Tuning](hyperparameter_tuning/keras_bring_your_own) shows how to use SageMaker hyperparameter tuning with a custom container running a Keras convolutional network on CIFAR-10 data. - [R BYO Tuning](hyperparameter_tuning/r_bring_your_own) shows how to use SageMaker hyperparameter tuning with the custom container from the [Bring Your Own R Algorithm](advanced_functionality/r_bring_your_own) example. - [Analyzing Results](hyperparameter_tuning/analyze_results) is a shared notebook that can be used after each of the above notebooks to provide analysis on how training jobs with different hyperparameters performed. @@ -244,6 +246,7 @@ These examples show you how to use model-packages and algorithms from AWS Market - [Evaluating ML models from AWS Marketplace for person counting use case](aws_marketplace/using_model_packages/evaluating_aws_marketplace_models_for_person_counting_use_case) will show you how to use two AWS Marketplace GluonCV pre-trained ML models for person counting use case and evaluate each model for performance in different types of crowd images. - [Using Dataset Products](aws_marketplace/using_data) - [Using Dataset Product from AWS Data Exchange with ML model from AWS Marketplace](aws_marketplace/using_data/using_data_with_ml_model) is a sample notebook which shows how a dataset from AWS Data Exchange can be used with an ML Model Package from AWS Marketplace. + - [Using Shutterstock Image Datasets to train Image Classification Models](aws_marketplace/using_data/image_classification_with_shutterstock_image_datasets) provides a detailed walkthrough on how to use the [Free Sample: Images & Metadata of “Whole Foods” Shoppers](https://aws.amazon.com/marketplace/pp/prodview-y6xuddt42fmbu?qid=1623195111604&sr=0-1&ref_=srh_res_product_title#offers) from Shutterstock's Image Datasets to train a multi-label image classification model using Shutterstock's pre-labeled image assets. 
You can learn more about this implementation [from this blog post](https://aws.amazon.com/blogs/awsmarketplace/using-shutterstocks-image-datasets-to-train-your-computer-vision-models/). ## :balance_scale: License diff --git a/_static/aws-ux-shortbread/index.js b/_static/aws-ux-shortbread/index.js deleted file mode 100644 index e0cf788307..0000000000 --- a/_static/aws-ux-shortbread/index.js +++ /dev/null @@ -1,3 +0,0 @@ -/*! Version: 1.0.13 */ -!function(e,c){if("object"==typeof exports&&"object"==typeof module)module.exports=c();else if("function"==typeof define&&define.amd)define([],c);else{var a=c();for(var t in a)("object"==typeof exports?exports:e)[t]=a[t]}}(window,(function(){return function(e){var c={};function a(t){if(c[t])return c[t].exports;var n=c[t]={i:t,l:!1,exports:{}};return e[t].call(n.exports,n,n.exports,a),n.l=!0,n.exports}return a.m=e,a.c=c,a.d=function(e,c,t){a.o(e,c)||Object.defineProperty(e,c,{enumerable:!0,get:t})},a.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},a.t=function(e,c){if(1&c&&(e=a(e)),8&c)return e;if(4&c&&"object"==typeof e&&e&&e.__esModule)return e;var t=Object.create(null);if(a.r(t),Object.defineProperty(t,"default",{enumerable:!0,value:e}),2&c&&"string"!=typeof e)for(var n in e)a.d(t,n,function(c){return e[c]}.bind(null,n));return t},a.n=function(e){var c=e&&e.__esModule?function(){return e.default}:function(){return e};return a.d(c,"a",c),c},a.o=function(e,c){return Object.prototype.hasOwnProperty.call(e,c)},a.p="",a(a.s=13)}([function(e,c,a){"use strict";var t=this&&this.__assign||function(){return(t=Object.assign||function(e){for(var c,a=1,t=arguments.length;a0&&r.forEach((function(c){if("string"==typeof c)l.appendChild(t.createTextNode(c));else if("number"==typeof c)l.appendChild(t.createTextNode(r.toString()));else{if(null===c)throw Error("Unsupported child type 
"+c);e(l,c,t,!0)}})),i?c.appendChild(l):c.insertBefore(l,c.firstChild)}},function(e,c,a){"use strict";var t,n=function(){return void 0===t&&(t=Boolean(window&&document&&document.all&&!window.atob)),t},i=function(){var e={};return function(c){if(void 0===e[c]){var a=document.querySelector(c);if(window.HTMLIFrameElement&&a instanceof window.HTMLIFrameElement)try{a=a.contentDocument.head}catch(e){a=null}e[c]=a}return e[c]}}(),o=[];function s(e){for(var c=-1,a=0;a-1?"awsccc-Rtl":"";function b(){return document.querySelector("div[data-id="+s.BANNER_ID+"]")}function f(){return document.querySelector("div[data-id="+s.CUSTOMIZE_ID+"]")}function h(e,c){var a=document.querySelector("label[data-id=awsccc-u-cb-"+e+"-label]"),t=a.classList,n=a.querySelector("input");c?(n.setAttribute("checked",""),t.add("awsccc-u-cb-checkbox-active")):(t.remove("awsccc-u-cb-checkbox-active"),n.removeAttribute("checked")),n.setAttribute("aria-checked",""+c)}var g=function(e){var c=e.event,a=e.category;"checkbox"!==c.target.getAttribute("type")&&"awsccc-cs-s-title"!==c.target.getAttribute("class")||h(a,!p(a))},m=function(c){return function(a,t){var n=b().querySelector("div[data-id=awsccc-cb-tabstart]");document.querySelector("div[data-id="+s.CUSTOMIZE_ID+"]").style.display="none",b().style.display="none",n.setAttribute("tabindex","-1"),e.onSaveConsent(a),document.body.classList.remove("awsccc-cs-modal-open"),e.log("info")(c,{detail:"Save Consent Clicked",source:t,cookie:e.getConsentCookie()})}},k=function(e){"Escape"===e.key&&x()},v=function(){return e.getConsentCookie()||u.DEFAULT_COOKIE},w=function(c){var a;a=v(),o.COOKIE_CATEGORIES.filter((function(e){return e!==o.ESSENTIAL})).forEach((function(e){h(e,a[e])})),f().addEventListener("keydown",k),f().style.display="block",document.body.classList.add("awsccc-cs-modal-open");var 
t=document.querySelectorAll("div[data-id="+s.TABTRAP_ID+"]");l.convertToArray(t).forEach((function(e,c){0===c&&e.focus({preventScroll:!0}),e.setAttribute("tabindex","0")})),e.log("info")("customizeCookies",{detail:"Customize Consent Clicked",source:c,cookie:e.getConsentCookie()})},x=function(){f().removeEventListener("keydown",k),f().style.display="none",document.body.classList.remove("awsccc-cs-modal-open");var c=f().querySelectorAll("div[data-id="+s.TABTRAP_ID+"]");(l.convertToArray(c).forEach((function(e){e.setAttribute("tabindex","-1")})),"block"===b().style.display)&&b().querySelector("div[data-id=awsccc-cb-tabstart]").focus({preventScroll:!0});e.onModalClose&&e.onModalClose()};return d.default((function(){document.querySelector("#"+s.CONTAINER_ID)||t.render(e.parent||document.body,t.act("div",{id:s.CONTAINER_ID},t.act("div",{id:s.APP_ID,class:a},t.act(n.default,{showConsentSelector:w,handleSaveClick:m("acceptAll"),localizedText:c.consentBanner,hasConsoleNavFooter:e.hasConsoleNavFooter}),t.act(i.default,{consentState:v(),handleSaveClick:m("customize"),handleCheckboxToggle:g,localizedText:c.consentSelector,closeConsentSelector:x,darkModeEnabled:e.hasConsoleNavFooter}))))})),{showConsentSelector:function(e){d.default((function(){w(e)}))},showBanner:function(e){d.default((function(){var c;c=b().querySelector("div[data-id=awsccc-cb-tabstart]"),b().style.display="block",c.setAttribute("tabindex","0"),c.focus({preventScroll:!0}),e()}))}}}c.isChecked=p,c.default={createShortbreadUi:function(e){return b(e)}}},function(e,c,a){"use strict";var t=this&&this.__assign||function(){return(t=Object.assign||function(e){for(var c,a=1,t=arguments.length;a0)try{var o=JSON.parse(atob(n[n.length-1]));return 1===(t=o).e&&"number"==typeof t.p&&"number"==typeof t.f&&"number"==typeof t.a&&"string"==typeof t.i&&"string"==typeof t.v?{essential:1===(a=o).e,performance:1===a.p,functional:1===a.f,advertising:1===a.a,id:a.i,version:a.v}:void i("getCookie",{detail:"Cookie format is not 
valid",cookie:o})}catch(e){return void i("getCookie",{detail:"Error parsing cookie",cookie:n[n.length-1]})}}function s(e){document.cookie=e}c.getConsentCookie=function(e,c){void 0===e&&(e=function(){return document.cookie});var a=o(e(),c);if(a)return{essential:a.essential,performance:a.performance,functional:a.functional,advertising:a.advertising}},c.setConsentCookie=function(e,c,a,r,l,u,d,p){void 0===c&&(c=".aws.amazon.com"),void 0===a&&(a=i.DEFAULT_COOKIE_AGE),void 0===r&&(r=n.default),void 0===l&&(l=s);var b,f=function(e){void 0===e&&(e=function(){return document.cookie});var c=o(e());if(c&&c.id)return c.id}()||r(u,d,p),h=t(t({},e),{id:f,version:i.COOKIE_VERSION}),g={e:(b=h).essential?1:0,p:b.performance?1:0,f:b.functional?1:0,a:b.advertising?1:0,i:b.id,v:b.version};return l("awsccc="+btoa(JSON.stringify(g))+"; domain="+c+"; path=/; max-age="+a+"; secure=true; SameSite=Lax"),h}},function(e,c,a){"use strict";Object.defineProperty(c,"__esModule",{value:!0});var t=a(46);c.default=function(e,c,a){void 0===c&&(c=t.v4),void 0===a&&(a=function(){return"ts-"+Date.now().toString()});var n=e?e("error"):function(){};try{return c()}catch(e){return n("uuid",{detail:"Error generating UUID",errorMessage:e.message||""}),a()}}},function(e,c,a){"use strict";Object.defineProperty(c,"__esModule",{value:!0}),c.queryGeolocationByHttpGetRequest=c.timestampUrl=c.QUERY_PARAM_KEY=void 0;var t=a(12);c.QUERY_PARAM_KEY="awsccc",c.timestampUrl=function(e){if(-1!==e.indexOf("?")){var a=e.split("?");e=a[0]+"?"+c.QUERY_PARAM_KEY+"="+Date.now()+"&"+a[1]}else{if(-1===e.indexOf("#"))return e+"?"+c.QUERY_PARAM_KEY+"="+Date.now();a=e.split("#");e=a[0]+"?"+c.QUERY_PARAM_KEY+"="+Date.now()+"#"+a[1]}return e},c.queryGeolocationByHttpGetRequest=function(e,a,n){function i(c,a,t,n,i){c("info")("geolocationLatency",{metric:a,region:t,detail:n,url:e,status:i.status})}return void 0===e&&(e="https://prod.tools.shortbread.aws.dev/1x1.png"),void 0===a&&(a=5e3),void 0===n&&(n=t.DEFAULT_LOGGER),function(o,s){void 
0===s&&(s=n||t.DEFAULT_LOGGER);var r=Date.now(),l=new XMLHttpRequest;l.addEventListener("load",(function(){var e=403===l.status?"NON-EU":"EU";i(s,Date.now()-r,e,"Geolocation Response Received",l),o(e)})),l.addEventListener("timeout",(function(){o("EU");var c="Geolocation Request Timed out";i(s,a,"EU",c,l),s("error")("geolocationRequestTimeout",{url:e,timeoutSetting:a,detail:c})})),l.open("GET",c.timestampUrl(e)),l.timeout=a,l.send()}},c.default=c.queryGeolocationByHttpGetRequest},function(e,c,a){"use strict";var t=this&&this.__assign||function(){return(t=Object.assign||function(e){for(var c,a=1,t=arguments.length;aspan{color:#687078}",""]),e.exports=c},function(e,c,a){"use strict";Object.defineProperty(c,"__esModule",{value:!0});var t=a(0),n=a(6),i=a(3);a(35);var o=a(5);c.default=function(e){var c=e.handleSaveClick,a=e.handleCancelClick,s=e.localizedText;return t.act("div",{id:"awsccc-cs-f-c"},t.act(n.default,{dataId:i.CUSTOMIZE_CANCEL_BTN_ID,variant:"secondary",events:{onclick:a},text:s["button-cancel"],props:{"aria-label":s["button-cancel-aria-label"]}}),t.act(n.default,{dataId:i.CUSTOMIZE_SAVE_BTN_ID,variant:"primary",events:{onclick:function(){c({essential:!0,performance:o.isChecked("performance"),functional:o.isChecked("functional"),advertising:o.isChecked("advertising")},"preferencesModal")}},text:s["button-save"],props:{"aria-label":s["button-save-aria-label"]}}))}},function(e,c,a){var t=a(1),n=a(36);"string"==typeof(n=n.__esModule?n.default:n)&&(n=[[e.i,n,""]]);var i={insert:"head",singleton:!1};t(n,i);e.exports=n.locals||{}},function(e,c,a){(c=a(2)(!1)).push([e.i,"#awsccc-sb-ux-c #awsccc-sb-a.awsccc-Rtl #awsccc-cs-f-c{text-align:left}#awsccc-sb-ux-c #awsccc-sb-a #awsccc-cs-f-c{text-decoration:none;padding:10px 20px;text-align:right;border-top:1px solid #eaeded;display:flex;justify-content:center;flex-wrap:wrap}#awsccc-sb-ux-c #awsccc-sb-a #awsccc-cs-f-c .awsccc-u-btn{margin-left:10px}#awsccc-sb-ux-c #awsccc-sb-a #awsccc-cs-f-c 
.awsccc-u-btn.awsccc-u-btn-secondary{background-color:#fff;border-color:#fff;color:#545b64;margin-bottom:6px}#awsccc-sb-ux-c #awsccc-sb-a #awsccc-cs-f-c .awsccc-u-btn.awsccc-u-btn-secondary:hover{color:#000;background-color:#fafafa}@media screen and (min-width: 700px){#awsccc-sb-ux-c #awsccc-sb-a #awsccc-cs-f-c{display:block}#awsccc-sb-ux-c #awsccc-sb-a #awsccc-cs-f-c .awsccc-u-btn.awsccc-u-btn-secondary{margin-bottom:0}}",""]),e.exports=c},function(e,c,a){var t=a(1),n=a(38);"string"==typeof(n=n.__esModule?n.default:n)&&(n=[[e.i,n,""]]);var i={insert:"head",singleton:!1};t(n,i);e.exports=n.locals||{}},function(e,c,a){(c=a(2)(!1)).push([e.i,"#awsccc-sb-ux-c #awsccc-sb-a #awsccc-cs-container{display:flex;align-items:center;justify-items:center;bottom:0;left:0;right:0;top:0;position:fixed;z-index:10002;outline:0;overflow:hidden}#awsccc-sb-ux-c #awsccc-sb-a #awsccc-cs-container-inner{max-width:820px;box-sizing:border-box;outline:none;margin:10px auto;width:calc(100vw - 20px)}#awsccc-sb-ux-c #awsccc-sb-a #awsccc-cs-content{background-color:#fff;border-radius:0;box-sizing:border-box;margin-bottom:0;word-wrap:break-word;box-shadow:0 1px 1px 0 rgba(0,28,36,.3),1px 1px 1px 0 rgba(0,28,36,.15),-1px 1px 1px 0 rgba(0,28,36,.15)}#awsccc-sb-ux-c #awsccc-sb-a #awsccc-cs-header{background-color:#fafafa;padding:19px 20px;border-bottom:1px solid #eaeded}#awsccc-sb-ux-c #awsccc-sb-a #awsccc-cs-title{min-width:0;word-break:break-word;color:#16191f;flex:auto}#awsccc-sb-ux-c #awsccc-sb-a #awsccc-cs-title h2{font-size:18px;font-weight:700;margin:0}#awsccc-sb-ux-c #awsccc-sb-a #awsccc-cs-modalBody{overflow-y:auto;max-height:calc(100vh - 200px);padding:19px 20px}@media screen and (max-width: 480px){#awsccc-sb-ux-c #awsccc-sb-a #awsccc-cs-modalBody{max-height:calc(100vh - 275px)}}#awsccc-sb-ux-c #awsccc-sb-a #awsccc-cs-modalOverlay{background-color:rgba(242,243,243,.9);position:fixed;z-index:10001;right:0;top:0;bottom:0;left:0}#awsccc-sb-ux-c #awsccc-sb-a 
.dark-mode-enabled{background-color:#2a2e33}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-modalOverlay{background-color:rgba(22,25,31,.8)}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-header{background-color:#21252c;border-bottom:1px solid #414750}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-title h2{color:#eaeded}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-modalBody{background-color:#2a2e33}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-modalBody #awsccc-cs-i-container{border-bottom:1px solid #414750}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-modalBody #awsccc-cs-i-container span{color:#eaeded}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-modalBody .awsccc-cs-s-container{border-bottom:1px solid #414750}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-modalBody .awsccc-cs-s-container h3{color:#eaeded}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-modalBody .awsccc-cs-s-container p{color:#eaeded}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-modalBody .awsccc-cs-s-container span{color:#eaeded}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-modalBody .awsccc-cs-s-container rect{fill:#1a2029}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-modalBody .awsccc-cs-s-container .awsccc-cs-s-text{border-top:0}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-modalBody .awsccc-cs-s-container .awscc-u-cb-checkbox-poly-line{display:none}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-modalBody .awsccc-cs-s-container .awsccc-u-cb-checkbox-active .awscc-u-cb-checkbox-poly-line{display:inline-block}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-modalBody .awsccc-cs-s-container .awsccc-u-cb-checkbox-active .awscc-u-cb-checkbox-rect{fill:#00a1c9;stroke:#00a1c9}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-f-c{border-top:1px solid #414750;background-color:#2a2e33}#awsccc-sb-ux-c #awsccc-sb-a 
.dark-mode-enabled #awsccc-cs-f-c .awsccc-u-btn-secondary{background-color:#2a2e33}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-f-c .awsccc-u-btn-secondary span{color:#d5dbdb}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-f-c .awsccc-u-btn-secondary:hover{background-color:#21252c}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-f-c .awsccc-u-btn-primary span{color:#16191f}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-l-container span{color:#eaeded}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-l-container span a{display:inline-block}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-l-container span a span{color:#44b9d6}#awsccc-sb-ux-c #awsccc-sb-a .dark-mode-enabled #awsccc-cs-l-container path{color:#44b9d6}div[data-id=awsccc-cs]{display:none}",""]),e.exports=c},function(e,c,a){"use strict";Object.defineProperty(c,"__esModule",{value:!0});var t=a(0);a(40),c.default=function(){return t.act("div",{class:"awsccc-u-i-open-c"},t.act("svg",{class:"awsccc-u-i-open",viewBox:"0 0 16 16",focusable:"false","aria-hidden":"true"},t.act("path",{class:"awsccc-stroke-linecap-square",d:"M10 2h4v4"}),t.act("path",{d:"M6 10l8-8"}),t.act("path",{class:"awsccc-stroke-linejoin-round",d:"M14 9.048V14H2V2h5"})))}},function(e,c,a){var t=a(1),n=a(41);"string"==typeof(n=n.__esModule?n.default:n)&&(n=[[e.i,n,""]]);var i={insert:"head",singleton:!1};t(n,i);e.exports=n.locals||{}},function(e,c,a){(c=a(2)(!1)).push([e.i,"#awsccc-sb-ux-c #awsccc-sb-a .awsccc-u-i-open-c{display:inline-block;vertical-align:middle;line-height:1em;padding-left:.3em}#awsccc-sb-ux-c #awsccc-sb-a .awsccc-u-i-open-c svg{stroke-width:2px;pointer-events:none;fill:none;padding-bottom:1px;height:10px;width:10px}#awsccc-sb-ux-c #awsccc-sb-a .awsccc-u-i-open-c svg .awsccc-stroke-linecap-square{stroke-linecap:square}#awsccc-sb-ux-c #awsccc-sb-a .awsccc-u-i-open-c svg .awsccc-stroke-linejoin-round{stroke-linejoin:round}#awsccc-sb-ux-c #awsccc-sb-a 
.awsccc-u-i-open-c svg path{stroke:currentColor}",""]),e.exports=c},function(e,c,a){var t=a(1),n=a(43);"string"==typeof(n=n.__esModule?n.default:n)&&(n=[[e.i,n,""]]);var i={insert:"head",singleton:!1};t(n,i);e.exports=n.locals||{}},function(e,c,a){(c=a(2)(!1)).push([e.i,'#awsccc-sb-ux-c #awsccc-sb-a *{font-family:"Amazon Ember","HelveticaNeue","Helvetica Neue","Amazon Ember",Roboto,"Roboto-Regular","Amazon Ember",Helvetica,Arial,sans-serif;font-size:14px;line-height:21px;color:#16191f;text-align:left;background:none;border:0}#awsccc-sb-ux-c #awsccc-sb-a.awsccc-Rtl *{direction:rtl;text-align:right}#awsccc-sb-ux-c #awsccc-sb-a.awsccc-Rtl .awsccc-cs-s-container .awsccc-cs-s-action{right:auto;left:20px}@media screen and (min-width: 1020px){#awsccc-sb-ux-c #awsccc-sb-a.awsccc-Rtl #awsccc-cb-c #awsccc-cb-title{padding-right:40px}}#awsccc-sb-ux-c #awsccc-sb-a a,#awsccc-sb-ux-c #awsccc-sb-a a>span,#awsccc-sb-ux-c #awsccc-sb-a a svg path{color:#0073bb;text-decoration:none}#awsccc-sb-ux-c #awsccc-sb-a a:hover,#awsccc-sb-ux-c #awsccc-sb-a a>span:hover,#awsccc-sb-ux-c #awsccc-sb-a a svg path:hover{color:#0073bb;text-decoration:underline}#awsccc-sb-ux-c #awsccc-sb-a .awsccc-tab-helper{outline:0;text-decoration:none}.awsccc-cs-modal-open{overflow:hidden;-webkit-box-sizing:border-box;box-sizing:border-box}',""]),e.exports=c},function(e,c,a){"use strict";Object.defineProperty(c,"__esModule",{value:!0}),c.convertToArray=c.update=void 0,c.update=function(e,c){return Object.keys(c).forEach((function(a){e[a]=c[a]})),e},c.convertToArray=function(e){return Array.prototype.slice.call(e)}},function(e,c,a){"use strict";Object.defineProperty(c,"__esModule",{value:!0});c.default=function(e,c,a){function t(){c.removeEventListener("DOMContentLoaded",t),a.removeEventListener("load",t),e()}void 0===c&&(c=document),void 0===a&&(a=window),"loading"!==c.readyState?a.setTimeout(e):(c.addEventListener("DOMContentLoaded",t),a.addEventListener("load",t))}},function(e,c,a){"use 
strict";a.r(c),a.d(c,"v1",(function(){return b})),a.d(c,"v3",(function(){return y})),a.d(c,"v4",(function(){return C})),a.d(c,"v5",(function(){return A}));var t="undefined"!=typeof crypto&&crypto.getRandomValues&&crypto.getRandomValues.bind(crypto)||"undefined"!=typeof msCrypto&&"function"==typeof msCrypto.getRandomValues&&msCrypto.getRandomValues.bind(msCrypto),n=new Uint8Array(16);function i(){if(!t)throw new Error("crypto.getRandomValues() not supported. See https://github.com/uuidjs/uuid#getrandomvalues-not-supported");return t(n)}for(var o=[],s=0;s<256;++s)o.push((s+256).toString(16).substr(1));var r,l,u=function(e,c){var a=c||0,t=o;return(t[e[a+0]]+t[e[a+1]]+t[e[a+2]]+t[e[a+3]]+"-"+t[e[a+4]]+t[e[a+5]]+"-"+t[e[a+6]]+t[e[a+7]]+"-"+t[e[a+8]]+t[e[a+9]]+"-"+t[e[a+10]]+t[e[a+11]]+t[e[a+12]]+t[e[a+13]]+t[e[a+14]]+t[e[a+15]]).toLowerCase()},d=0,p=0;var b=function(e,c,a){var t=c&&a||0,n=c||[],o=(e=e||{}).node||r,s=void 0!==e.clockseq?e.clockseq:l;if(null==o||null==s){var b=e.random||(e.rng||i)();null==o&&(o=r=[1|b[0],b[1],b[2],b[3],b[4],b[5]]),null==s&&(s=l=16383&(b[6]<<8|b[7]))}var f=void 0!==e.msecs?e.msecs:Date.now(),h=void 0!==e.nsecs?e.nsecs:p+1,g=f-d+(h-p)/1e4;if(g<0&&void 0===e.clockseq&&(s=s+1&16383),(g<0||f>d)&&void 0===e.nsecs&&(h=0),h>=1e4)throw new Error("uuid.v1(): Can't create more than 10M uuids/sec");d=f,p=h,l=s;var m=(1e4*(268435455&(f+=122192928e5))+h)%4294967296;n[t++]=m>>>24&255,n[t++]=m>>>16&255,n[t++]=m>>>8&255,n[t++]=255&m;var k=f/4294967296*1e4&268435455;n[t++]=k>>>8&255,n[t++]=255&k,n[t++]=k>>>24&15|16,n[t++]=k>>>16&255,n[t++]=s>>>8|128,n[t++]=255&s;for(var v=0;v<6;++v)n[t+v]=o[v];return c||u(n)};var f=function(e,c,a){function t(e,t,n,i){var o=n&&i||0;if("string"==typeof e&&(e=function(e){e=unescape(encodeURIComponent(e));for(var c=[],a=0;a>>9<<4)+1}function g(e,c){var a=(65535&e)+(65535&c);return(e>>16)+(c>>16)+(a>>16)<<16|65535&a}function m(e,c,a,t,n,i){return g((o=g(g(c,e),g(t,i)))<<(s=n)|o>>>32-s,a);var o,s}function k(e,c,a,t,n,i,o){return 
m(c&a|~c&t,e,c,n,i,o)}function v(e,c,a,t,n,i,o){return m(c&t|a&~t,e,c,n,i,o)}function w(e,c,a,t,n,i,o){return m(c^a^t,e,c,n,i,o)}function x(e,c,a,t,n,i,o){return m(a^(c|~t),e,c,n,i,o)}var y=f("v3",48,(function(e){if("string"==typeof e){var c=unescape(encodeURIComponent(e));e=new Uint8Array(c.length);for(var a=0;a>5]>>>t%32&255,i=parseInt("0123456789abcdef".charAt(n>>>4&15)+"0123456789abcdef".charAt(15&n),16);c.push(i)}return c}(function(e,c){e[c>>5]|=128<>5]|=(255&e[t/8])<>>32-c}var A=f("v5",80,(function(e){var c=[1518500249,1859775393,2400959708,3395469782],a=[1732584193,4023233417,2562383102,271733878,3285377520];if("string"==typeof e){var t=unescape(encodeURIComponent(e));e=[];for(var n=0;n>>0;v=k,k=m,m=_(g,30)>>>0,g=h,h=y}a[0]=a[0]+h>>>0,a[1]=a[1]+g>>>0,a[2]=a[2]+m>>>0,a[3]=a[3]+k>>>0,a[4]=a[4]+v>>>0}return[a[0]>>24&255,a[0]>>16&255,a[0]>>8&255,255&a[0],a[1]>>24&255,a[1]>>16&255,a[1]>>8&255,255&a[1],a[2]>>24&255,a[2]>>16&255,a[2]>>8&255,255&a[2],a[3]>>24&255,a[3]>>16&255,a[3]>>8&255,255&a[3],a[4]>>24&255,a[4]>>16&255,a[4]>>8&255,255&a[4]]}))}])})); -//# sourceMappingURL=index.js.map \ No newline at end of file diff --git a/_static/aws-ux-shortbread/init.js b/_static/aws-ux-shortbread/init.js deleted file mode 100644 index f8cb6ace3b..0000000000 --- a/_static/aws-ux-shortbread/init.js +++ /dev/null @@ -1,34 +0,0 @@ -(function (w) { -w.URLSearchParams = w.URLSearchParams || function (searchString) { - var self = this; - self.searchString = searchString; - self.get = function (name) { - var results = new RegExp('[\?&]' + name + '=([^&#]*)').exec(self.searchString); - if (results === null) { - return null; - } - else { - return decodeURI(results[1]) || 0; - } - }; -} -})(window); - -const queryString = window.location.search; -const urlParams = new URLSearchParams(queryString); -const lang = urlParams.get('lang') -window.onload = function () { -var domainName = window.location.hostname; - -// remove an instance of shortbread if already exists -var 
existingShortbreadEl = document.getElementById("awsccc-sb-ux-c"); -existingShortbreadEl && existingShortbreadEl.remove(); - -var shortbread = AWSCShortbread({ - domain: domainName, - language: lang, - //queryGeolocation: function (geolocatedIn) { geolocatedIn("EU") }, -}); - -shortbread.checkForCookieConsent(); -} diff --git a/advanced_functionality/README.md b/advanced_functionality/README.md index 37f509fb17..275662f511 100644 --- a/advanced_functionality/README.md +++ b/advanced_functionality/README.md @@ -8,6 +8,7 @@ These examples that showcase unique functionality available in Amazon SageMaker. - [Encrypting Your Data](handling_kms_encrypted_data) shows how to use Server Side KMS encrypted data with Amazon SageMaker training. The IAM role used for S3 access needs to have permissions to encrypt and decrypt data with the KMS key. - [Using Parquet Data](parquet_to_recordio_protobuf) shows how to bring [Parquet](https://parquet.apache.org/) data sitting in S3 into an Amazon SageMaker Notebook and convert it into the recordIO-protobuf format that many SageMaker algorithms consume. - [Connecting to Redshift](working_with_redshift_data) demonstrates how to copy data from Redshift to S3 and vice-versa without leaving Amazon SageMaker Notebooks. +- [Bring Your Own scikit-learn Model](scikit_learn_bring_your_own_model) shows how to use Amazon SageMaker scikit-learn container to bring a pre-trained model to a realtime hosted endpoint without ever needing to think about REST APIs. - [Bring Your Own XGBoost Model](xgboost_bring_your_own_model) shows how to use Amazon SageMaker Algorithms containers to bring a pre-trained model to a realtime hosted endpoint without ever needing to think about REST APIs. - [Bring Your Own k-means Model](kmeans_bring_your_own_model) shows how to take a model that's been fit elsewhere and use Amazon SageMaker Algorithms containers to host it. 
- [Installing the R Kernel](install_r_kernel) shows how to install the R kernel into an Amazon SageMaker Notebook Instance. diff --git a/advanced_functionality/autogluon-tabular-containers/AutoGluon_Tabular_SageMaker_Containers.ipynb b/advanced_functionality/autogluon-tabular-containers/AutoGluon_Tabular_SageMaker_Containers.ipynb new file mode 100644 index 0000000000..c717e35c94 --- /dev/null +++ b/advanced_functionality/autogluon-tabular-containers/AutoGluon_Tabular_SageMaker_Containers.ipynb @@ -0,0 +1,765 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8cc97d20", + "metadata": {}, + "source": [ + "# AutoGluon Tabular with Deep Learning Containers on SageMaker" + ] + }, + { + "cell_type": "markdown", + "id": "ed0a4f78", + "metadata": {}, + "source": [ + "[AutoGluon](https://github.com/awslabs/autogluon) automates machine learning tasks enabling you to easily achieve strong predictive performance in your applications. With just a few lines of code, you can train and deploy high-accuracy deep learning models on tabular, image, and text data.\n", + "This example shows how to use AutoGluon-Tabular with Amazon SageMaker by applying [pre-built deep learning containers](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#autogluon-training-containers)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "4f94e506", + "metadata": {}, + "source": [ + "# Prerequisites" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "711d1879", + "metadata": {}, + "outputs": [], + "source": [ + "import sagemaker\n", + "import pandas as pd\n", + "import numpy as np\n", + "from ag_model import AutoGluonTraining, AutoGluonInferenceModel, AutoGluonTabularPredictor\n", + "from sagemaker import utils\n", + "from sagemaker.serializers import CSVSerializer\n", + "import os\n", + "\n", + "role = sagemaker.get_execution_role()\n", + "sagemaker_session = sagemaker.session.Session()\n", + "region = sagemaker_session._region_name\n", + "\n", + "bucket = sagemaker_session.default_bucket()\n", + "s3_prefix = f\"autogluon_sm/{utils.sagemaker_timestamp()}\"\n", + "output_path = f\"s3://{bucket}/{s3_prefix}/output/\"" + ] + }, + { + "cell_type": "markdown", + "id": "187ab748", + "metadata": {}, + "source": [ + "### Get the data\n", + "We'll be using the [Adult Census dataset](https://archive.ics.uci.edu/ml/datasets/adult) for this exercise. \n", + "This data was extracted from the [1994 Census bureau database](http://www.census.gov/en.html) by Ronny Kohavi and Barry Becker (Data Mining and Visualization, Silicon Graphics), with the task being to predict if an individual person makes over 50K a year. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07c8b18f", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88e63263", + "metadata": {}, + "outputs": [], + "source": [ + "columns = [\n", + "    \"age\",\n", + "    \"workclass\",\n", + "    \"fnlwgt\",\n", + "    \"education\",\n", + "    \"education-num\",\n", + "    \"marital-status\",\n", + "    \"occupation\",\n", + "    \"relationship\",\n", + "    \"race\",\n", + "    \"sex\",\n", + "    \"capital-gain\",\n", + "    \"capital-loss\",\n", + "    \"hours-per-week\",\n", + "    \"native-country\",\n", + "    \"class\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d62cf065", + "metadata": {}, + "outputs": [], + "source": [ + "df_train = pd.read_csv(\n", + "    \"s3://sagemaker-sample-files/datasets/tabular/uci_adult/adult.data\", header=None\n", + ")\n", + "df_train.columns = columns\n", + "df_train.to_csv(\"data/train.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "008c78e1", + "metadata": {}, + "outputs": [], + "source": [ + "df_test = pd.read_csv(\n", + "    \"s3://sagemaker-sample-files/datasets/tabular/uci_adult/adult.test\", header=None, skiprows=1\n", + ")\n", + "df_test.columns = columns\n", + "df_test[\"class\"] = df_test[\"class\"].map(\n", + "    {\n", + "        \" <=50K.\": \" <=50K\",\n", + "        \" >50K.\": \" >50K\",\n", + "    }\n", + ")\n", + "df_test.to_csv(\"data/test.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "47a9b23e", + "metadata": {}, + "source": [ + "# Training" + ] + }, + { + "cell_type": "markdown", + "id": "fa06ade7", + "metadata": {}, + "source": [ + "Users can create their own training/inference scripts using [SageMaker Python SDK examples](https://sagemaker.readthedocs.io/en/stable/overview.html#prepare-a-training-script).\n", + "The scripts we created allow you to pass the AutoGluon configuration as a YAML file (located in the `config` directory).\n", + "\n", 
+ "We are using [official AutoGluon Deep Learning Container images](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#autogluon-training-containers) with custom training scripts (see `scripts/` directory)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "255bbb99", + "metadata": {}, + "outputs": [], + "source": [ + "ag = AutoGluonTraining(\n", + " role=role,\n", + " entry_point=\"scripts/tabular_train.py\",\n", + " region=region,\n", + " instance_count=1,\n", + " instance_type=\"ml.m5.2xlarge\",\n", + " framework_version=\"0.3.1\",\n", + " base_job_name=\"autogluon-tabular-train\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8966dc5c", + "metadata": {}, + "source": [ + "Upload the data to S3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1d2add2", + "metadata": {}, + "outputs": [], + "source": [ + "s3_prefix = f\"autogluon_sm/{utils.sagemaker_timestamp()}\"\n", + "train_input = ag.sagemaker_session.upload_data(\n", + " path=os.path.join(\"data\", \"train.csv\"), key_prefix=s3_prefix\n", + ")\n", + "eval_input = ag.sagemaker_session.upload_data(\n", + " path=os.path.join(\"data\", \"test.csv\"), key_prefix=s3_prefix\n", + ")\n", + "config_input = ag.sagemaker_session.upload_data(\n", + " path=os.path.join(\"config\", \"config-med.yaml\"), key_prefix=s3_prefix\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3fc2f9d7", + "metadata": {}, + "source": [ + "### Fit The Model\n", + "For local training, set `instance_type` to `local`.\n", + "\n", + "For non-local training, the recommended instance type is `ml.m5.2xlarge`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d10602e", + "metadata": {}, + "outputs": [], + "source": [ + "job_name = utils.unique_name_from_base(\"test-autogluon-image\")\n", + "ag.fit({\"config\": config_input, \"train\": train_input, \"test\": eval_input}, job_name=job_name)" + ] + }, + { + "cell_type": "markdown", + "id": "21eaae5c", + "metadata": {}, + "source": [ + "### Model export\n", + "\n", + "AutoGluon models are portable: everything needed to deploy a trained model is in the tarball created by SageMaker.\n", + "\n", + "The artifact can be used locally, on EC2/ECS/EKS or served via SageMaker Inference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a67917d", + "metadata": {}, + "outputs": [], + "source": [ + "!aws s3 cp {ag.model_data} ." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23542165", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!ls -alF model.tar.gz" + ] + }, + { + "cell_type": "markdown", + "id": "79c1c176", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "source": [ + "# Endpoint Deployment" + ] + }, + { + "cell_type": "markdown", + "id": "bec424d5", + "metadata": {}, + "source": [ + "Upload the model we trained earlier" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71bb5010", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "endpoint_name = sagemaker.utils.unique_name_from_base(\"sagemaker-autogluon-serving-trained-model\")\n", + "\n", + "model_data = sagemaker_session.upload_data(\n", + " path=os.path.join(\".\", \"model.tar.gz\"), key_prefix=f\"{endpoint_name}/models\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3c23409b", + "metadata": {}, + "source": [ + "Deploy remote or local endpoint" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "47de1b5e", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "model = AutoGluonInferenceModel(\n", + " model_data=model_data,\n", + " role=role,\n", + " region=region,\n", + " framework_version=\"0.3.1\",\n", + " source_dir=\"scripts\",\n", + " entry_point=\"tabular_serve.py\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8659a03f", + "metadata": {}, + "outputs": [], + "source": [ + "instance_type = \"ml.m5.2xlarge\"\n", + "# instance_type = 'local'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b377372d", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "predictor = model.deploy(\n", + " initial_instance_count=1, serializer=CSVSerializer(), instance_type=instance_type\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5cf19121", + "metadata": {}, + "source": [ + "### Predict on unlabeled test data\n", + "\n", + "Remove target variable (`class`) from the data and get predictions for a sample of 100 rows using the deployed endpoint." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38b676af", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "df = pd.read_csv(\"data/test.csv\")\n", + "data = df.drop(columns=\"class\")[:100].values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c1c8854", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "preds = predictor.predict(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12191d9c", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "p = pd.DataFrame({\"preds\": pd.DataFrame(preds)[0], \"actual\": df[\"class\"][: len(preds)]})\n", + "p.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9aa09241", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "print(f\"{(p.preds==p.actual).astype(int).sum()}/{len(p)} are correct\")" + ] + }, + { + "cell_type": "markdown", + "id": "cf65ac2e", + "metadata": {}, + "source": [ + "### Cleanup Endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "972620d2", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "predictor.delete_endpoint()" + ] + }, + { + "cell_type": "markdown", + "id": "2828456e", + "metadata": {}, + "source": [ + "# Batch Transform\n", + "\n", + "Deploying a trained model to a hosted endpoint has been available in SageMaker since launch and is a great way to provide real-time predictions to a service like a website or mobile app. 
But, if the goal is to generate predictions from a trained model on a large dataset where minimizing latency isn’t a concern, then the batch transform functionality may be easier, more scalable, and more appropriate.\n", + "\n", + "[Read more about Batch Transform](https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86befaf0", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "endpoint_name = sagemaker.utils.unique_name_from_base(\n", + " \"sagemaker-autogluon-batch_transform-trained-model\"\n", + ")\n", + "\n", + "model_data = sagemaker_session.upload_data(\n", + " path=os.path.join(\".\", \"model.tar.gz\"), key_prefix=f\"{endpoint_name}/models\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32c9ac73", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "model = AutoGluonInferenceModel(\n", + " model_data=model_data,\n", + " role=role,\n", + " region=region,\n", + " framework_version=\"0.3.1\",\n", + " entry_point=\"tabular_serve-batch.py\",\n", + " source_dir=\"scripts\",\n", + " predictor_cls=AutoGluonTabularPredictor,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7b521b5", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "transformer = model.transformer(\n", + " instance_count=1,\n", + " instance_type=\"ml.m5.2xlarge\",\n", + " strategy=\"MultiRecord\",\n", + " max_payload=6,\n", + " max_concurrent_transforms=1,\n", + " output_path=output_path,\n", + " accept=\"application/json\",\n", + " assemble_with=\"Line\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": 
"463aa8ff", + "metadata": {}, + "source": [ + "Prepare data for batch transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "209ecf3a", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "pd.read_csv(f\"data/test.csv\")[:100].to_csv(\"data/test_no_header.csv\", header=False, index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "1b05b046", + "metadata": {}, + "source": [ + "Upload data to sagemaker session" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b34e930", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "test_input = transformer.sagemaker_session.upload_data(\n", + " path=os.path.join(\"data\", \"test_no_header.csv\"), key_prefix=s3_prefix\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91459b63", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "transformer.transform(\n", + " test_input,\n", + " input_filter=\"$[:14]\", # filter-out target variable\n", + " split_type=\"Line\",\n", + " content_type=\"text/csv\",\n", + " output_filter=\"$['class']\", # keep only prediction class in the output\n", + ")\n", + "\n", + "transformer.wait()" + ] + }, + { + "cell_type": "markdown", + "id": "0c6ab00e", + "metadata": {}, + "source": [ + "Download batch transform outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc840f3a", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!aws s3 cp {transformer.output_path[:-1]}/test_no_header.csv.out ." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c505891", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "p = pd.concat(\n", + " [\n", + " pd.read_json(\"test_no_header.csv.out\", orient=\"index\")\n", + " .sort_index()\n", + " .rename(columns={0: \"preds\"}),\n", + " pd.read_csv(\"data/test.csv\")[[\"class\"]].iloc[:100].rename(columns={\"class\": \"actual\"}),\n", + " ],\n", + " axis=1,\n", + ")\n", + "p.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0a8d5db", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "print(f\"{(p.preds==p.actual).astype(int).sum()}/{len(p)} are correct\")" + ] + }, + { + "cell_type": "markdown", + "id": "15658e6e", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "source": [ + "# Conclusion\n", + "\n", + "In this tutorial we successfully trained an AutoGluon model and explored a few options for deploying it using SageMaker. Any of the sections of this tutorial (training/endpoint inference/batch inference) can be used independently (e.g., train locally, deploy to SageMaker, or vice versa).\n", + "\n", + "Next steps:\n", + "* [Learn more](https://auto.gluon.ai) about AutoGluon, explore [tutorials](https://auto.gluon.ai/stable/tutorials/index.html).\n", + "* Explore [SageMaker inference documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/deploy-model.html)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09168df8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "Python 3 (MXNet 1.8 Python 3.7 CPU Optimized)", + "language": "python", + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/mxnet-1.8-cpu-py37-ubuntu16.04-v1" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/advanced_functionality/autogluon-tabular-containers/README.md b/advanced_functionality/autogluon-tabular-containers/README.md new file mode 100644 index 0000000000..2caf270ec3 --- /dev/null +++ b/advanced_functionality/autogluon-tabular-containers/README.md @@ -0,0 +1,6 @@ + + +# AutoGluon Tabular with Amazon SageMaker + +[AutoGluon](https://github.com/awslabs/autogluon) automates machine learning tasks enabling you to easily achieve strong predictive performance in your applications. With just a few lines of code, you can train and deploy high-accuracy deep learning models on tabular, image, and text data. +This example shows how to use AutoGluon-Tabular with Amazon SageMaker by applying [pre-built deep learning containers](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#autogluon-training-containers). 
\ No newline at end of file diff --git a/advanced_functionality/autogluon-tabular-containers/ag_model.py b/advanced_functionality/autogluon-tabular-containers/ag_model.py new file mode 100644 index 0000000000..8ffd607c77 --- /dev/null +++ b/advanced_functionality/autogluon-tabular-containers/ag_model.py @@ -0,0 +1,64 @@ +from sagemaker.estimator import Framework +from sagemaker.predictor import Predictor +from sagemaker.mxnet import MXNetModel +from sagemaker.mxnet.model import MXNetPredictor +from sagemaker import utils +from sagemaker.serializers import CSVSerializer +from sagemaker.deserializers import StringDeserializer + + +ACCOUNT = 763104351884 +ECR_TRAINING_REPO = "autogluon-training" +ECR_INFERENCE_REPO = "autogluon-inference" +TRAINING_IMAGE_CPU = "cpu-py37-ubuntu18.04" +TRAINING_IMAGE_GPU = "gpu-py37-cu102-ubuntu18.04" +INFERENCE_IMAGE_CPU = "cpu-py37-ubuntu16.04" + + +class AutoGluonTraining(Framework): + def __init__( + self, + entry_point, + region, + framework_version, + image_type="cpu", + source_dir=None, + hyperparameters=None, + **kwargs, + ): + image = TRAINING_IMAGE_GPU if image_type == "gpu" else TRAINING_IMAGE_CPU + image = f"{framework_version}-{image}" + image_uri = f"{ACCOUNT}.dkr.ecr.{region}.amazonaws.com/{ECR_TRAINING_REPO}:{image}" + super().__init__(entry_point, source_dir, hyperparameters, image_uri=image_uri, **kwargs) + + def _configure_distribution(self, distributions): + return + + def create_model( + self, + model_server_workers=None, + role=None, + vpc_config_override=None, + entry_point=None, + source_dir=None, + dependencies=None, + image_name=None, + **kwargs, + ): + return None + + +class AutoGluonTabularPredictor(Predictor): + def __init__(self, *args, **kwargs): + super().__init__( + *args, serializer=CSVSerializer(), deserializer=StringDeserializer(), **kwargs + ) + + +class AutoGluonInferenceModel(MXNetModel): + def __init__(self, model_data, role, entry_point, region, framework_version, **kwargs): + image = 
f"{framework_version}-{INFERENCE_IMAGE_CPU}" + image_uri = f"{ACCOUNT}.dkr.ecr.{region}.amazonaws.com/{ECR_INFERENCE_REPO}:{image}" + super().__init__( + model_data, role, entry_point, image_uri=image_uri, framework_version="1.8.0", **kwargs + ) diff --git a/advanced_functionality/autogluon-tabular-containers/config/config-full.yaml b/advanced_functionality/autogluon-tabular-containers/config/config-full.yaml new file mode 100644 index 0000000000..3f8dacb61f --- /dev/null +++ b/advanced_functionality/autogluon-tabular-containers/config/config-full.yaml @@ -0,0 +1,14 @@ +# AutoGluon Predictor constructor arguments +# - see https://github.com/awslabs/autogluon/blob/ef3a5312dc2eaa0c6afde042d671860ac42cbafb/tabular/src/autogluon/tabular/predictor/predictor.py#L51-L159 +ag_predictor_args: + eval_metric: roc_auc + label: class + +# AutoGluon Predictor.fit arguments +# - see https://github.com/awslabs/autogluon/blob/ef3a5312dc2eaa0c6afde042d671860ac42cbafb/tabular/src/autogluon/tabular/predictor/predictor.py#L280-L651 +ag_fit_args: + presets: "best_quality" + +output_prediction_format: csv # predictions output format: csv or parquet +feature_importance: true # calculate and save feature importance if true +leaderboard: true # save leaderboard output if true \ No newline at end of file diff --git a/advanced_functionality/autogluon-tabular-containers/config/config-med.yaml b/advanced_functionality/autogluon-tabular-containers/config/config-med.yaml new file mode 100644 index 0000000000..42e4f9a182 --- /dev/null +++ b/advanced_functionality/autogluon-tabular-containers/config/config-med.yaml @@ -0,0 +1,22 @@ +# AutoGluon Predictor constructor arguments +# - see https://github.com/awslabs/autogluon/blob/ef3a5312dc2eaa0c6afde042d671860ac42cbafb/tabular/src/autogluon/tabular/predictor/predictor.py#L51-L159 +ag_predictor_args: + eval_metric: roc_auc + label: class + +# AutoGluon Predictor.fit arguments +# - see 
https://github.com/awslabs/autogluon/blob/ef3a5312dc2eaa0c6afde042d671860ac42cbafb/tabular/src/autogluon/tabular/predictor/predictor.py#L280-L651 +ag_fit_args: + hyperparameters: +# GBM: +# num_boost_round: 20 +# NN: +# num_epochs: 2 + presets: "medium_quality_faster_train" + num_bag_folds: 2 + num_bag_sets: 1 + num_stack_levels: 0 + +output_prediction_format: csv # predictions output format: csv or parquet +feature_importance: true # calculate and save feature importance if true +leaderboard: true # save leaderboard output if true \ No newline at end of file diff --git a/advanced_functionality/autogluon-tabular-containers/scripts/tabular_serve-batch.py b/advanced_functionality/autogluon-tabular-containers/scripts/tabular_serve-batch.py new file mode 100644 index 0000000000..cdc0c3cb74 --- /dev/null +++ b/advanced_functionality/autogluon-tabular-containers/scripts/tabular_serve-batch.py @@ -0,0 +1,37 @@ +from autogluon.tabular import TabularPredictor +import os +import json +from io import StringIO +import pandas as pd +import numpy as np + + +def model_fn(model_dir): + """loads model from previously saved artifact""" + model = TabularPredictor.load(model_dir) + globals()["column_names"] = model.feature_metadata_in.get_features() + + return model + + +def transform_fn(model, request_body, input_content_type, output_content_type="application/json"): + if input_content_type == "text/csv": + buf = StringIO(request_body) + data = pd.read_csv(buf, header=None) + num_cols = len(data.columns) + if num_cols != len(column_names): + raise Exception( + f"Invalid data format. 
Input data has {num_cols} while the model expects {len(column_names)}" + ) + + else: + data.columns = column_names + + else: + raise Exception(f"{input_content_type} content type not supported") + + pred = model.predict(data) + pred_proba = model.predict_proba(data) + prediction = pd.concat([pred, pred_proba], axis=1) + + return prediction.to_json(), output_content_type diff --git a/advanced_functionality/autogluon-tabular-containers/scripts/tabular_serve.py b/advanced_functionality/autogluon-tabular-containers/scripts/tabular_serve.py new file mode 100644 index 0000000000..e2d5c427e8 --- /dev/null +++ b/advanced_functionality/autogluon-tabular-containers/scripts/tabular_serve.py @@ -0,0 +1,39 @@ +from autogluon.tabular import TabularPredictor +import os +import json +from io import StringIO +import pandas as pd +import numpy as np + + +def model_fn(model_dir): + """loads model from previously saved artifact""" + model = TabularPredictor.load(model_dir) + globals()["column_names"] = model.feature_metadata_in.get_features() + + return model + + +def transform_fn(model, request_body, input_content_type, output_content_type="application/json"): + + if input_content_type == "text/csv": + buf = StringIO(request_body) + data = pd.read_csv(buf, header=None) + num_cols = len(data.columns) + + if num_cols != len(column_names): + raise Exception( + f"Invalid data format. 
Input data has {num_cols} while the model expects {len(column_names)}" + ) + + else: + data.columns = column_names + + else: + raise Exception(f"{input_content_type} content type not supported") + + pred = model.predict(data) + pred_proba = model.predict_proba(data) + prediction = pd.concat([pred, pred_proba], axis=1).values + + return json.dumps(prediction.tolist()), output_content_type diff --git a/advanced_functionality/autogluon-tabular-containers/scripts/tabular_train.py b/advanced_functionality/autogluon-tabular-containers/scripts/tabular_train.py new file mode 100644 index 0000000000..cf178e66b5 --- /dev/null +++ b/advanced_functionality/autogluon-tabular-containers/scripts/tabular_train.py @@ -0,0 +1,98 @@ +import argparse +import os +from pprint import pprint + +import yaml +from autogluon.tabular import TabularDataset, TabularPredictor + + +def get_input_path(path): + file = os.listdir(path)[0] + if len(os.listdir(path)) > 1: + print(f"WARN: more than one file is found in {channel} directory") + print(f"Using {file}") + filename = f"{path}/{file}" + return filename + + +def get_env_if_present(name): + result = None + if name in os.environ: + result = os.environ[name] + return result + + +if __name__ == "__main__": + # Disable Autotune + os.environ["MXNET_CUDNN_AUTOTUNE_DEFAULT"] = "0" + + # ------------------------------------------------------------ Args parsing + print("Starting AG") + parser = argparse.ArgumentParser() + + # Data, model, and output directories + parser.add_argument( + "--output-data-dir", type=str, default=get_env_if_present("SM_OUTPUT_DATA_DIR") + ) + parser.add_argument("--model-dir", type=str, default=get_env_if_present("SM_MODEL_DIR")) + parser.add_argument("--n_gpus", type=str, default=get_env_if_present("SM_NUM_GPUS")) + parser.add_argument("--training_dir", type=str, default=get_env_if_present("SM_CHANNEL_TRAIN")) + parser.add_argument( + "--test_dir", type=str, required=False, default=get_env_if_present("SM_CHANNEL_TEST") + ) + 
parser.add_argument("--ag_config", type=str, default=get_env_if_present("SM_CHANNEL_CONFIG")) + + args, _ = parser.parse_known_args() + + print(f"Args: {args}") + + # See SageMaker-specific environment variables: https://sagemaker.readthedocs.io/en/stable/overview.html#prepare-a-training-script + os.makedirs(args.output_data_dir, mode=0o777, exist_ok=True) + + config_file = get_input_path(args.ag_config) + with open(config_file) as f: + config = yaml.safe_load(f) # AutoGluon-specific config + + if args.n_gpus: + config["num_gpus"] = int(args.n_gpus) + + print("Running training job with the config:") + pprint(config) + + # ---------------------------------------------------------------- Training + + train_file = get_input_path(args.training_dir) + train_data = TabularDataset(train_file) + + ag_predictor_args = config["ag_predictor_args"] + ag_predictor_args["path"] = args.model_dir + ag_fit_args = config["ag_fit_args"] + + predictor = TabularPredictor(**ag_predictor_args).fit(train_data, **ag_fit_args) + + # --------------------------------------------------------------- Inference + + if args.test_dir: + test_file = get_input_path(args.test_dir) + test_data = TabularDataset(test_file) + + # Predictions + y_pred_proba = predictor.predict_proba(test_data) + if config.get("output_prediction_format", "csv") == "parquet": + y_pred_proba.to_parquet(f"{args.output_data_dir}/predictions.parquet") + else: + y_pred_proba.to_csv(f"{args.output_data_dir}/predictions.csv") + + # Leaderboard + if config.get("leaderboard", False): + lb = predictor.leaderboard(test_data, silent=False) + lb.to_csv(f"{args.output_data_dir}/leaderboard.csv") + + # Feature importance + if config.get("feature_importance", False): + feature_importance = predictor.feature_importance(test_data) + feature_importance.to_csv(f"{args.output_data_dir}/feature_importance.csv") + else: + if config.get("leaderboard", False): + lb = predictor.leaderboard(silent=False) + 
lb.to_csv(f"{args.output_data_dir}/leaderboard.csv") diff --git a/advanced_functionality/handling_kms_encrypted_data/handling_kms_encrypted_data.ipynb b/advanced_functionality/handling_kms_encrypted_data/handling_kms_encrypted_data.ipynb index 33e55939c8..e683b4ecf0 100644 --- a/advanced_functionality/handling_kms_encrypted_data/handling_kms_encrypted_data.ipynb +++ b/advanced_functionality/handling_kms_encrypted_data/handling_kms_encrypted_data.ipynb @@ -67,14 +67,15 @@ "import numpy as np\n", "import re\n", "from sagemaker import get_execution_role\n", + "import sagemaker\n", "\n", "region = boto3.Session().region_name\n", "\n", "role = get_execution_role()\n", "\n", - "kms_key_arn = \"\"\n", + "kms_key = \"\"\n", "\n", - "bucket = \"\" # put your s3 bucket name here, and create s3 bucket\n", + "bucket = sagemaker.Session().default_bucket()\n", "prefix = \"sagemaker/DEMO-kms\"\n", "# customize to your bucket where you have stored the data\n", "bucket_path = \"s3://{}\".format(bucket)" @@ -90,7 +91,11 @@ "\n", "### Data ingestion\n", "\n", - "We, first, read the dataset from an existing repository into memory. This processing could be done *in situ* by Amazon Athena, Apache Spark in Amazon EMR, Amazon Redshift, etc., assuming the dataset is present in the appropriate location. Then, the next step would be to transfer the data to S3 for use in training. For small datasets, such as the one used below, reading into memory isn't onerous, though it would be for larger datasets." + "We, first, read the dataset from an existing repository into memory. This processing could be done *in situ* by Amazon Athena, Apache Spark in Amazon EMR, Amazon Redshift, etc., assuming the dataset is present in the appropriate location. Then, the next step would be to transfer the data to S3 for use in training. 
For small datasets, such as the one used below, reading into memory isn't onerous, though it would be for larger datasets.\n", + "\n", + "This example uses the California Housing dataset, initially published in:\n", + "\n", + "> Pace, R. Kelley, and Ronald Barry. \"Sparse spatial autoregressions.\" Statistics & Probability Letters 33.3 (1997): 291-297." ] }, { @@ -99,16 +104,16 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.datasets import load_boston\n", + "from sklearn.datasets import fetch_california_housing\n", "\n", - "boston = load_boston()\n", - "X = boston[\"data\"]\n", - "y = boston[\"target\"]\n", - "feature_names = boston[\"feature_names\"]\n", + "california = fetch_california_housing()\n", + "X = california[\"data\"]\n", + "y = california[\"target\"]\n", + "feature_names = california[\"feature_names\"]\n", "data = pd.DataFrame(X, columns=feature_names)\n", "target = pd.DataFrame(y, columns={\"MEDV\"})\n", "data[\"MEDV\"] = y\n", - "local_file_name = \"boston.csv\"\n", + "local_file_name = \"california_housing.csv\"\n", "data.to_csv(local_file_name, header=False, index=False)" ] }, @@ -140,7 +145,7 @@ "outputs": [], "source": [ "def write_file(X, y, fname, include_labels=True):\n", - " feature_names = boston[\"feature_names\"]\n", + " feature_names = california[\"feature_names\"]\n", " data = pd.DataFrame(X, columns=feature_names)\n", " if include_labels:\n", " data.insert(0, \"MEDV\", y)\n", @@ -180,7 +185,7 @@ "\n", "data_train = open(train_file, \"rb\")\n", "key_train = \"{}/train/{}\".format(prefix, train_file)\n", - "kms_key_id = kms_key_arn.split(\":key/\")[1]\n", + "kms_key_id = kms_key.split(\":key/\")[1]\n", "\n", "print(\"Put object...\")\n", "s3.put_object(\n", @@ -227,7 +232,7 @@ "source": [ "## Training the SageMaker XGBoost model\n", "\n", - "Now that we have our data in S3, we can begin training. We'll use Amazon SageMaker XGboost algorithm as an example to demonstrate model training. 
Note that nothing needs to be changed in the way you'd call the training algorithm. The only requirement for training to succeed is that the IAM role (`role`) used for S3 access has permissions to encrypt and decrypt data with the KMS key (`kms_key_arn`). You can set these permissions using the instructions [here](http://docs.aws.amazon.com/kms/latest/developerguide/key-policies.html#key-policy-default-allow-users). If the permissions aren't set, you'll get the `Data download failed` error. Specify a `VolumeKmsKeyId` in the training job parameters to have the volume attached to the ML compute instance encrypted using key provided." + "Now that we have our data in S3, we can begin training. We'll use Amazon SageMaker XGboost algorithm as an example to demonstrate model training. Note that nothing needs to be changed in the way you'd call the training algorithm. The only requirement for training to succeed is that the IAM role (`role`) used for S3 access has permissions to encrypt and decrypt data with the KMS key (`kms_key`). You can set these permissions using the instructions [here](http://docs.aws.amazon.com/kms/latest/developerguide/key-policies.html#key-policy-default-allow-users). If the permissions aren't set, you'll get the `Data download failed` error. Specify a `VolumeKmsKeyId` in the training job parameters to have the volume attached to the ML compute instance encrypted using key provided." 
] }, { @@ -236,9 +241,11 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.amazon.amazon_estimator import get_image_uri\n", + "from sagemaker import image_uris\n", "\n", - "container = get_image_uri(boto3.Session().region_name, \"xgboost\")" + "container = image_uris.retrieve(\n", + " region=boto3.Session().region_name, framework=\"xgboost\", version=\"latest\"\n", + ")" ] }, { @@ -262,7 +269,7 @@ " \"InstanceCount\": 1,\n", " \"InstanceType\": \"ml.m4.4xlarge\",\n", " \"VolumeSizeInGB\": 5,\n", - " \"VolumeKmsKeyId\": kms_key_arn,\n", + " \"VolumeKmsKeyId\": kms_key,\n", " },\n", " \"TrainingJobName\": job_name,\n", " \"HyperParameters\": {\n", @@ -379,7 +386,7 @@ "print(endpoint_config_name)\n", "create_endpoint_config_response = client.create_endpoint_config(\n", " EndpointConfigName=endpoint_config_name,\n", - " KmsKeyId=kms_key_arn,\n", + " KmsKeyId=kms_key,\n", " ProductionVariants=[\n", " {\n", " \"InstanceType\": \"ml.m4.xlarge\",\n", @@ -509,7 +516,7 @@ "metadata": {}, "source": [ "## Run batch prediction using batch transform\n", - "Create a transform job to do batch prediction using the trained model. Similar to the training section above, the execution role assumed by this notebook must have permissions to encrypt and decrypt data with the KMS key (`kms_key_arn`) used for S3 server-side encryption. Similar to training, specify a `VolumeKmsKeyId` so that the volume attached to the transform instance is encrypted using the key provided." + "Create a transform job to do batch prediction using the trained model. Similar to the training section above, the execution role assumed by this notebook must have permissions to encrypt and decrypt data with the KMS key (`kms_key`) used for S3 server-side encryption. Similar to training, specify a `VolumeKmsKeyId` so that the volume attached to the transform instance is encrypted using the key provided." 
] }, { @@ -542,7 +549,7 @@ " \"TransformResources\": {\n", " \"InstanceCount\": 1,\n", " \"InstanceType\": \"ml.c4.xlarge\",\n", - " \"VolumeKmsKeyId\": kms_key_arn,\n", + " \"VolumeKmsKeyId\": kms_key,\n", " },\n", "}\n", "\n", @@ -605,7 +612,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.13" }, "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." }, diff --git a/advanced_functionality/multi_model_bring_your_own/container/Dockerfile b/advanced_functionality/multi_model_bring_your_own/container/Dockerfile index 9b72aa65eb..4191801e22 100644 --- a/advanced_functionality/multi_model_bring_your_own/container/Dockerfile +++ b/advanced_functionality/multi_model_bring_your_own/container/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:16.04 +FROM ubuntu:18.04 # Set a docker label to advertise multi-model support on the container LABEL com.amazonaws.sagemaker.capabilities.multi-models=true diff --git a/advanced_functionality/multi_model_pytorch/pytorch_multi_model_endpoint.ipynb b/advanced_functionality/multi_model_pytorch/pytorch_multi_model_endpoint.ipynb index 56e92f319f..9bb6153a32 100644 --- a/advanced_functionality/multi_model_pytorch/pytorch_multi_model_endpoint.ipynb +++ b/advanced_functionality/multi_model_pytorch/pytorch_multi_model_endpoint.ipynb @@ -2,16 +2,16 @@ "cells": [ { "cell_type": "markdown", - "id": "2d49d1af", + "id": "14a93ca6", "metadata": {}, "source": [ "# Amazon SageMaker Multi-Model 
Endpoints using PyTorch\n", "\n", "> *This notebook works well with SageMaker Studio kernel `Python 3 (Data Science)`, or SageMaker Notebook Instance kernel `conda_python3`*\n", "\n", - "With [Amazon SageMaker multi-model endpoints](https://docs.aws.amazon.com/sagemaker/latest/dg/multi-model-endpoints.html), customers can create an endpoint that seamlessly hosts up to thousands of models. These endpoints are well suited to use cases where any one of a large number of models, which can be served from a common inference container, needs to be invokable on-demand and where it is acceptable for infrequently invoked models to incur some additional latency. For applications which require consistently low inference latency, a traditional endpoint is still the best choice.\n", + "With [Amazon SageMaker multi-model endpoints](https://docs.aws.amazon.com/sagemaker/latest/dg/multi-model-endpoints.html), customers can create an endpoint that seamlessly hosts up to thousands of models. These endpoints are well suited to use cases where any one of many models, which can be served from a common inference container, needs to be callable on-demand and where it is acceptable for infrequently invoked models to incur some additional latency. 
For applications which require consistently low inference latency, a traditional endpoint is still the best choice.\n", "\n", - "In some cases where the variable latency is tolerable and cost optimization is more important, customers may also decide to use MMEs for A/B/n testing, in place of the more typical [production variant based strategy discussed here](https://aws.amazon.com/blogs/machine-learning/a-b-testing-ml-models-in-production-using-amazon-sagemaker/).\n", + "In some cases where the variable latency is tolerable, and cost optimization is more important, customers may also decide to use MMEs for A/B/n testing, in place of the more typical [production variant based strategy discussed here](https://aws.amazon.com/blogs/machine-learning/a-b-testing-ml-models-in-production-using-amazon-sagemaker/).\n", "\n", "To demonstrate how multi-model endpoints can be created and used, this notebook provides an example using models trained with the [SageMaker PyTorch framework container](https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html). 
We'll take an A/B scenario for simplicity, training and deploying just two models to our endpoint.\n", "\n", @@ -23,7 +23,7 @@ }, { "cell_type": "markdown", - "id": "385984fe", + "id": "446701de", "metadata": {}, "source": [ "## Contents\n", @@ -44,7 +44,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9c1d5b02", + "id": "c7c6f24b", "metadata": {}, "outputs": [], "source": [ @@ -75,7 +75,7 @@ }, { "cell_type": "markdown", - "id": "411388e1", + "id": "10d1b8ac", "metadata": {}, "source": [ "## The example use case: MNIST\n", @@ -88,7 +88,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e79cc834", + "id": "f94bafde", "metadata": {}, "outputs": [], "source": [ @@ -135,7 +135,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d9b3681e", + "id": "bdd2050d", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "01272e25", + "id": "6c1caf03", "metadata": {}, "source": [ "## Train multiple models\n", @@ -160,7 +160,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a5bf5989", + "id": "8f74a102", "metadata": {}, "outputs": [], "source": [ @@ -179,7 +179,9 @@ " entry_point=\"train.py\",\n", " source_dir=\"code\", # directory of your training script\n", " role=role,\n", - " framework_version=\"1.8\",\n", + " # At the time of writing, this example gives a deployment error in container v1.8.1 with\n", + " # upgraded TorchServe: so specifically setting \"1.8.0\". 
But \"1.7\" and \"1.6\" should be fine.\n", + " framework_version=\"1.8.0\",\n", " py_version=\"py3\",\n", " instance_type=\"ml.c4.xlarge\",\n", " instance_count=1,\n", @@ -194,7 +196,7 @@ }, { "cell_type": "markdown", - "id": "024bd190", + "id": "7f3b6817", "metadata": {}, "source": [ "By default, calling the [SageMaker Python SDK](https://sagemaker.readthedocs.io/en/stable/)'s [Estimator.fit()](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html#sagemaker.estimator.EstimatorBase.fit) method waits for the training job to complete, streaming progress information and logs to the notebook.\n", @@ -207,7 +209,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d29f6c77", + "id": "d48aad81", "metadata": { "scrolled": true }, @@ -225,7 +227,7 @@ }, { "cell_type": "markdown", - "id": "2fabc90f", + "id": "30112fc5", "metadata": {}, "source": [ "## Check single-model deployment\n", @@ -236,7 +238,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1548d75b", + "id": "dcead338", "metadata": {}, "outputs": [], "source": [ @@ -246,7 +248,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0ed21ed1", + "id": "59f58085", "metadata": {}, "outputs": [], "source": [ @@ -261,7 +263,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f48c5876", + "id": "9b9e4430", "metadata": {}, "outputs": [], "source": [ @@ -282,7 +284,7 @@ }, { "cell_type": "markdown", - "id": "04c7172c", + "id": "84737f76", "metadata": {}, "source": [ "Assuming the test worked, this endpoint is no longer needed so can be disposed:" @@ -291,7 +293,7 @@ { "cell_type": "code", "execution_count": null, - "id": "45c360fe", + "id": "c60c2486", "metadata": {}, "outputs": [], "source": [ @@ -300,7 +302,7 @@ }, { "cell_type": "markdown", - "id": "60a9485f", + "id": "9acd33e1", "metadata": {}, "source": [ "## Create the Multi-Model Endpoint with the SageMaker SDK\n", @@ -313,7 +315,7 @@ { "cell_type": "code", "execution_count": null, - "id": "13c842ac", + "id": 
"9b8b69ac", "metadata": {}, "outputs": [], "source": [ @@ -322,7 +324,7 @@ }, { "cell_type": "markdown", - "id": "1fea2e91", + "id": "7d7c9f75", "metadata": {}, "source": [ "### Create the Amazon SageMaker MultiDataModel entity\n", @@ -331,13 +333,13 @@ "\n", "You can create a MultiDataModel by directly passing in a `sagemaker.model.Model` object - in which case, the Endpoint will inherit information about the image to use, as well as any environmental variables, network isolation, etc., once the MultiDataModel is deployed.\n", "\n", - "In addition, a MultiDataModel can also be created without explictly passing a `sagemaker.model.Model` object. Please refer to the documentation for additional details." + "In addition, a MultiDataModel can also be created without explicitly passing a `sagemaker.model.Model` object. Please refer to the documentation for additional details." ] }, { "cell_type": "code", "execution_count": null, - "id": "7367b356", + "id": "f60d597d", "metadata": {}, "outputs": [], "source": [ @@ -350,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "127ae6ea", + "id": "f50efe26", "metadata": {}, "outputs": [], "source": [ @@ -364,7 +366,7 @@ }, { "cell_type": "markdown", - "id": "1318f613", + "id": "f6abb202", "metadata": {}, "source": [ "### Deploy the Multi-Model Endpoint\n", @@ -375,7 +377,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11ae8761", + "id": "a86e8be2", "metadata": {}, "outputs": [], "source": [ @@ -396,14 +398,14 @@ }, { "cell_type": "markdown", - "id": "e81b7f70", + "id": "fbe8e6dc", "metadata": {}, "source": [ "### Our endpoint has launched! Let's look at what models are available to the endpoint!\n", "\n", - "By 'available', what we mean is, what model artfiacts are currently stored under the S3 prefix we defined when setting up the `MultiDataModel` above i.e. 
`model_data_prefix`.\n", + "By 'available', we mean the model artifacts that are currently stored under the S3 prefix we defined when setting up the `MultiDataModel` above, i.e. `model_data_prefix`.\n", "\n", - "Currently, since we have no artifacts (i.e. `tar.gz` files) stored under our defined S3 prefix, our endpoint, will have no models 'available' to serve inference requests.\n", + "Currently, since we have no artifacts (i.e. `tar.gz` files) stored under our defined S3 prefix, our endpoint will have no models 'available' to serve inference requests.\n", "\n", "We will demonstrate how to make models 'available' to our endpoint below." ] @@ -411,7 +413,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b2c2c450", + "id": "1a174381", "metadata": {}, "outputs": [], "source": [ @@ -421,7 +423,7 @@ }, { "cell_type": "markdown", - "id": "e30c0200", + "id": "e42ae3e9", "metadata": {}, "source": [ "### Dynamically deploying models to the endpoint\n", @@ -445,7 +447,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b82fd397", + "id": "387900c3", "metadata": {}, "outputs": [], "source": [ @@ -457,7 +459,7 @@ }, { "cell_type": "markdown", - "id": "04aa25b7", + "id": "4d1470f7", "metadata": {}, "source": [ "### Our models are ready to invoke!\n", @@ -468,7 +470,7 @@ { "cell_type": "code", "execution_count": null, - "id": "692db269", + "id": "26261ce4", "metadata": {}, "outputs": [], "source": [ @@ -477,7 +479,7 @@ }, { "cell_type": "markdown", - "id": "817bddd5", + "id": "289d28e7", "metadata": {}, "source": [ "## Get predictions from the endpoint\n", @@ -490,7 +492,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bb9ad0e6", + "id": "04b1de15", "metadata": {}, "outputs": [], "source": [ @@ -506,7 +508,7 @@ }, { "cell_type": "markdown", - "id": "b661cde2", + "id": "2880e078", "metadata": {}, "source": [ "## Updating a model\n", @@ -520,7 +522,7 @@ }, { "cell_type": "markdown", - "id": "951d6b63", + "id": "5bb05ed2", "metadata": {}, 
"source": [ "## Clean up\n", @@ -531,7 +533,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b25ccbaa", + "id": "deee6c61", "metadata": {}, "outputs": [], "source": [ @@ -541,7 +543,7 @@ { "cell_type": "code", "execution_count": null, - "id": "518ac926", + "id": "ffe61045", "metadata": {}, "outputs": [], "source": [] diff --git a/advanced_functionality/pipe_bring_your_own/Dockerfile b/advanced_functionality/pipe_bring_your_own/Dockerfile index 31ded3bb06..fc12c2b764 100644 --- a/advanced_functionality/pipe_bring_your_own/Dockerfile +++ b/advanced_functionality/pipe_bring_your_own/Dockerfile @@ -1,5 +1,7 @@ +ARG region + # SageMaker PyTorch image -FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.5.1-cpu-py36-ubuntu16.04 +FROM 763104351884.dkr.ecr.${region}.amazonaws.com/pytorch-training:1.5.1-cpu-py36-ubuntu16.04 ENV PATH="/opt/ml/code:${PATH}" diff --git a/advanced_functionality/pipe_bring_your_own/pipe_bring_your_own.ipynb b/advanced_functionality/pipe_bring_your_own/pipe_bring_your_own.ipynb index 0e695822a0..32e05c3714 100644 --- a/advanced_functionality/pipe_bring_your_own/pipe_bring_your_own.ipynb +++ b/advanced_functionality/pipe_bring_your_own/pipe_bring_your_own.ipynb @@ -46,10 +46,14 @@ "\n", "Let's start by specifying:\n", "\n", - "- S3 URIs `s3_training_input` and `s3_model_output` that you want to use for training input and model data respectively. These should be within the same region as the Notebook Instance, training, and hosting. Since the \"algorithm\" you're building here doesn't really have any specific data-format, feel free to point `s3_training_input` to any s3 dataset you have, the bigger the dataset the better to test the raw IO throughput performance. For this example, the Boston Housing dataset will be copied over to your s3 bucket.\n", + "- S3 URIs `s3_training_input` and `s3_model_output` that you want to use for training input and model data respectively. 
These should be within the same region as the Notebook Instance, training, and hosting. Since the \"algorithm\" you're building here doesn't really have any specific data-format, feel free to point `s3_training_input` to any s3 dataset you have, the bigger the dataset the better to test the raw IO throughput performance. For this example, the California Housing dataset will be copied over to your s3 bucket.\n", "- The `training_instance_type` to use for training. More powerful instance types have more CPU and bandwidth which would result in higher throughput.\n", "- The IAM role arn used to give training access to your data.\n", "\n", + "The California Housing dataset was originally published in:\n", + "\n", + "> Pace, R. Kelley, and Ronald Barry. \\\"Sparse spatial autoregressions.\\\" Statistics & Probability Letters 33.3 (1997): 291-297.\n", + "\n", "### Permissions\n", "\n", "Running this notebook requires permissions in addition to the normal `SageMakerFullAccess` permissions. This is because you'll be creating a new repository in Amazon ECR. The easiest way to add these permissions is simply to add the managed policy `AmazonEC2ContainerRegistryFullAccess` to the role that you used to start your notebook instance. There's no need to restart your notebook instance when you do this, the new permissions will be available immediately." 
@@ -67,8 +71,7 @@ "import pandas as pd\n", "import sagemaker\n", "\n", - "# to load the boston housing dataset\n", - "from sklearn.datasets import *\n", + "from sklearn.datasets import fetch_california_housing\n", "\n", "# Get SageMaker session & default S3 bucket\n", "role = sagemaker.get_execution_role()\n", @@ -110,9 +113,9 @@ "metadata": {}, "outputs": [], "source": [ - "filename = \"boston_house.csv\"\n", + "filename = \"california_housing.csv\"\n", "# Download files from sklearns.datasets\n", - "tabular_data = load_boston()\n", + "tabular_data = fetch_california_housing()\n", "tabular_data_full = pd.DataFrame(tabular_data.data, columns=tabular_data.feature_names)\n", "tabular_data_full[\"target\"] = pd.DataFrame(tabular_data.target)\n", "tabular_data_full.to_csv(filename, index=False)" @@ -198,7 +201,9 @@ "outputs": [], "source": [ "%%sh\n", - "aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com" + "REGION=$(aws configure get region)\n", + "account=$(aws sts get-caller-identity --query Account --output text)\n", + "aws ecr get-login-password --region ${REGION} | docker login --username AWS --password-stdin ${account}.dkr.ecr.${REGION}.amazonaws.com" ] }, { @@ -215,7 +220,7 @@ "outputs": [], "source": [ "%%sh\n", - "docker build -t pipe_bring_your_own ." + "docker build -t pipe_bring_your_own . --build-arg region=$(aws configure get region)" ] }, { @@ -319,7 +324,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.6.13" }, "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. 
This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." }, diff --git a/advanced_functionality/pytorch_bring_your_own_gan/build_gan_with_pytorch.ipynb b/advanced_functionality/pytorch_bring_your_own_gan/build_gan_with_pytorch.ipynb new file mode 100644 index 0000000000..3806057b0b --- /dev/null +++ b/advanced_functionality/pytorch_bring_your_own_gan/build_gan_with_pytorch.ipynb @@ -0,0 +1,928 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build GAN (Generative Adversarial Networks) with PyTorch and SageMaker" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### About GAN\n", + "\n", + "Generative Adversarial Network (GAN) is a generative machine learning model, which is widely used in advertising, games, entertainment, media, pharmaceuticals and other industries. It can be used to create fictional characters and scenes, simulate facial aging, change image styles, produce chemical formulas, and so on.\n", + "\n", + "GAN was proposed by Ian Goodfellow in 2014. It is a deep neural network architecture consisting of a generator network and a discriminator network. The generator network generates \"fake\" data and tries to deceive the discriminator network; the discriminator network authenticates the generated data and tries to correctly identify all \"fake\" data. Over the training iterations, the two networks continue to evolve and compete until they reach an equilibrium state (reference: Nash equilibrium) in which the discriminator network can no longer recognize \"fake\" data, and the training ends.\n", + "\n", + "This example will lead you to build a GAN model leveraging the PyTorch framework, introducing GAN from the perspective of engineering practice, and opening a new and interesting AI/ML experience in generative models." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Environment setup\n", + "Upgrade packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --upgrade pip sagemaker awscli boto3 numpy ipywidgets\n", + "!pip install Pillow==7.1.2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create folders" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p data src tmp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download data\n", + "There are many public datasets on the Internet, which are very helpful for machine learning engineering and scientific research, such as algorithm study and evaluation. We will use the MNIST dataset of handwritten digits to train a GAN model and eventually generate some fake \"handwritten\" digits." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.s3 import S3Downloader as s3down\n", + "\n", + "s3down.download('s3://sagemaker-sample-files/datasets/image/MNIST/pytorch/', './data')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "isConfigCell": true + }, + "source": [ + "### Data preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "isConfigCell": true + }, + "source": [ + "The PyTorch framework has a torchvision.datasets package, which provides access to a number of datasets. You can use the following commands to read the pre-downloaded MNIST dataset from local storage for later use.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "isConfigCell": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "from torchvision import datasets\n", + "\n", + "dataroot = './data'\n", + "trainset = datasets.MNIST(root=dataroot, train=True, 
download=False)\n", + "testset = datasets.MNIST(root=dataroot, train=False, download=False)\n", + "print(trainset)\n", + "print(testset)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "isConfigCell": true + }, + "source": [ + "SageMaker SDK will create a default Amazon S3 bucket for you to access various files and data, that you may need in the machine learning engineering lifecycle. We can get the name of this bucket through the default_bucket method of the sagemaker.session.Session class in the SageMaker SDK." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "isConfigCell": true + }, + "outputs": [], + "source": [ + "from sagemaker.session import Session\n", + "\n", + "sess = Session()\n", + "\n", + "# S3 bucket for saving code and model artifacts.\n", + "# Feel free to specify a different bucket here if you wish.\n", + "bucket = sess.default_bucket()\n", + "prefix = 'byos-pytorch-gan'\n", + "\n", + "# Location to save your custom code in tar.gz format.\n", + "s3_custom_code_upload_location = f's3://{bucket}/{prefix}/customcode'\n", + "\n", + "# Location where results of model training are saved.\n", + "s3_model_artifacts_location = f's3://{bucket}/{prefix}/artifacts/'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The SageMaker SDK provides tools for operating AWS services. For example, the S3Downloader class is used to download objects in S3, and the S3Uploader is used to upload local files to S3. You will upload the dataset files to Amazon S3 for model training. 
During model training, we do not download data from the Internet; this avoids the network latency of fetching the data remotely and the possible security risks of direct Internet access.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import os\n", + "from sagemaker.s3 import S3Uploader as s3up\n", + "\n", + "s3_data_location = s3up.upload(os.path.join(dataroot, \"MNIST\"), f\"s3://{bucket}/{prefix}/data/mnist\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DCGAN (Deep Convolutional Generative Adversarial Networks) is a variant in the GAN family. This architecture essentially leverages Deep Convolutional Neural Networks to generate images belonging to a given distribution from noisy data using the Generator-Discriminator framework." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile src/train.py\n", + "\n", + "from __future__ import print_function\n", + "import argparse\n", + "import json\n", + "import logging\n", + "import os\n", + "import sys\n", + "import random\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.parallel\n", + "import torch.nn.functional as F\n", + "\n", + "import torch.optim as optim\n", + "import torch.backends.cudnn as cudnn\n", + "import torch.utils.data\n", + "import torchvision.datasets as dset\n", + "import torchvision.transforms as transforms\n", + "import torchvision.utils as vutils\n", + "\n", + "\n", + "cudnn.benchmark = True\n", + "\n", + "logger = logging.getLogger(__name__)\n", + "logger.setLevel(logging.DEBUG)\n", + "logger.addHandler(logging.StreamHandler(sys.stdout))\n", + "\n", + "\n", + "class Generator(nn.Module):\n", + " def __init__(self, *, nz, nc, ngf, ngpu=1):\n", + " super(Generator, self).__init__()\n", + " self.ngpu = ngpu\n", + " self.main = nn.Sequential(\n", + " # input is Z, going into a convolution\n", + " nn.ConvTranspose2d( nz, ngf * 8, 4, 1, 0, bias=False),\n", + " nn.BatchNorm2d(ngf * 8),\n", + " nn.ReLU(True),\n", + " # state size. (ngf*8) x 4 x 4\n", + " nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),\n", + " nn.BatchNorm2d(ngf * 4),\n", + " nn.ReLU(True),\n", + " # state size. (ngf*4) x 8 x 8\n", + " nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),\n", + " nn.BatchNorm2d(ngf * 2),\n", + " nn.ReLU(True),\n", + " # state size. (ngf*2) x 16 x 16\n", + " nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False),\n", + " nn.BatchNorm2d(ngf),\n", + " nn.ReLU(True),\n", + " # state size. (ngf) x 32 x 32\n", + " nn.ConvTranspose2d( ngf, nc, 4, 2, 1, bias=False),\n", + " nn.Tanh()\n", + " # state size. 
(nc) x 64 x 64\n", + " )\n", + "\n", + " def forward(self, input):\n", + " if input.is_cuda and self.ngpu > 1:\n", + " output = nn.parallel.data_parallel(self.main, input, range(self.ngpu))\n", + " else:\n", + " output = self.main(input)\n", + " return output\n", + "\n", + " def save(self, path, *, filename=None, device='cpu'):\n", + " # recommended way from http://pytorch.org/docs/master/notes/serialization.html\n", + " self.to(device)\n", + " if not filename is None:\n", + " path = os.path.join(path, filename)\n", + " torch.save(self.state_dict(), path)\n", + "\n", + " def load(self, path, *, filename=None):\n", + " if not filename is None:\n", + " path = os.path.join(path, filename)\n", + " with open(path, 'rb') as f:\n", + " self.load_state_dict(torch.load(f))\n", + "\n", + " \n", + "class Discriminator(nn.Module):\n", + " def __init__(self, *, nc, ndf, ngpu=1):\n", + " super(Discriminator, self).__init__()\n", + " self.ngpu = ngpu\n", + " self.main = nn.Sequential(\n", + " # input is (nc) x 64 x 64\n", + " nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),\n", + " nn.LeakyReLU(0.2, inplace=True),\n", + " # state size. (ndf) x 32 x 32\n", + " nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),\n", + " nn.BatchNorm2d(ndf * 2),\n", + " nn.LeakyReLU(0.2, inplace=True),\n", + " # state size. (ndf*2) x 16 x 16\n", + " nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),\n", + " nn.BatchNorm2d(ndf * 4),\n", + " nn.LeakyReLU(0.2, inplace=True),\n", + " # state size. (ndf*4) x 8 x 8\n", + " nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),\n", + " nn.BatchNorm2d(ndf * 8),\n", + " nn.LeakyReLU(0.2, inplace=True),\n", + " # state size. 
(ndf*8) x 4 x 4\n", + " nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),\n", + " nn.Sigmoid()\n", + " )\n", + "\n", + " def forward(self, input):\n", + " if input.is_cuda and self.ngpu > 1:\n", + " output = nn.parallel.data_parallel(self.main, input, range(self.ngpu))\n", + " else:\n", + " output = self.main(input)\n", + "\n", + " return output.view(-1, 1).squeeze(1)\n", + "\n", + " def save(self, path, *, filename=None, device='cpu'):\n", + " # recommended way from http://pytorch.org/docs/master/notes/serialization.html\n", + " self.to(device)\n", + " if not filename is None:\n", + " path = os.path.join(path, filename)\n", + " torch.save(self.state_dict(), path)\n", + "\n", + "\n", + " def load(self, path, *, filename=None):\n", + " if not filename is None:\n", + " path = os.path.join(path, filename)\n", + " with open(path, 'rb') as f:\n", + " self.load_state_dict(torch.load(f))\n", + " \n", + " \n", + "class DCGAN(object):\n", + " \"\"\"\n", + " A wrapper class for Generator and Discriminator,\n", + " 'train_step' method is for single batch training.\n", + " \"\"\"\n", + "\n", + " fixed_noise = None\n", + " criterion = None\n", + " device = None\n", + " netG = None\n", + " netD = None\n", + " optimizerG = None\n", + " optimizerD = None\n", + " nz = None\n", + " nc = None\n", + " ngf = None\n", + " ndf = None\n", + " real_cpu = None\n", + " \n", + " def __init__(self, *, batch_size, nz, nc, ngf, ndf, device, weights_init,\n", + " learning_rate, betas, real_label, fake_label):\n", + "\n", + " super(DCGAN, self).__init__()\n", + "\n", + " import torch\n", + " \n", + " self.nz = nz\n", + " self.nc = nc\n", + " self.ngf = ngf\n", + " self.ndf = ndf\n", + " \n", + " self.real_label = real_label\n", + " self.fake_label = fake_label\n", + " \n", + " self.fixed_noise = torch.randn(batch_size, nz, 1, 1, device=device)\n", + " self.criterion = nn.BCELoss()\n", + " self.device = device\n", + " \n", + " self.netG = Generator(nz=nz, nc=nc, ngf=ngf).to(device)\n", + " # 
print(netG)\n", + " self.netD = Discriminator(nc=nc, ndf=ndf).to(device)\n", + " # print(netD)\n", + " \n", + " self.netG.apply(weights_init)\n", + " self.netD.apply(weights_init)\n", + " \n", + " # setup optimizer\n", + " self.optimizerG = optim.Adam(self.netG.parameters(), lr=learning_rate, betas=betas)\n", + " self.optimizerD = optim.Adam(self.netD.parameters(), lr=learning_rate, betas=betas)\n", + "\n", + "\n", + " def train_step(self, data, *, epoch, epochs):\n", + " import torch\n", + "\n", + " ############################\n", + " # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))\n", + " ###########################\n", + " # train with real\n", + " self.netD.zero_grad()\n", + " self.real_cpu = data[0]\n", + " real = data[0].to(self.device)\n", + " batch_size = real.size(0)\n", + " label = torch.full((batch_size,), self.real_label, device=self.device)\n", + " \n", + " output = self.netD(real).view(-1)\n", + " errD_real = self.criterion(output, label)\n", + " errD_real.backward()\n", + " D_x = output.mean().item()\n", + "\n", + "\n", + " # train with fake\n", + " noise = torch.randn(batch_size, self.nz, 1, 1, device=self.device)\n", + " fake = self.netG(noise)\n", + " label.fill_(self.fake_label)\n", + " output = self.netD(fake.detach()).view(-1)\n", + " errD_fake = self.criterion(output, label)\n", + " errD_fake.backward()\n", + " D_G_z1 = output.mean().item()\n", + " errD = errD_real + errD_fake\n", + " self.optimizerD.step()\n", + " \n", + "\n", + " ############################\n", + " # (2) Update G network: maximize log(D(G(z)))\n", + " ###########################\n", + " self.netG.zero_grad()\n", + " label.fill_(self.real_label) # fake labels are real for generator cost\n", + " output = self.netD(fake).view(-1)\n", + " errG = self.criterion(output, label)\n", + " errG.backward()\n", + " D_G_z2 = output.mean().item()\n", + " self.optimizerG.step()\n", + "\n", + "\n", + " return errG.item(), errD.item(), D_x, D_G_z1, D_G_z2\n", + " \n", + "\n", 
+ " \n", + "# custom weights initialization called on netG and netD\n", + "def weights_init(m):\n", + " classname = m.__class__.__name__\n", + " if classname.find('Conv') != -1:\n", + " torch.nn.init.normal_(m.weight, 0.0, 0.02)\n", + " elif classname.find('BatchNorm') != -1:\n", + " torch.nn.init.normal_(m.weight, 1.0, 0.02)\n", + " torch.nn.init.zeros_(m.bias)\n", + "\n", + "\n", + "def log_batch(epoch, epochs, batch, batches, errD, errG, D_x, D_G_z1, D_G_z2, *, log_interval=10, output_dir):\n", + "\n", + " if batch % log_interval == 0:\n", + " logger.info(f\"Epoch[{epoch}/{epochs}], Batch[{batch}/{batches}], \" +\n", + " f\"Loss_D: {errD:.4}, Loss_G: {errG:.4}, D(x): {D_x:.4}, D(G(z)): {D_G_z1:.4}/{D_G_z2:.4}\")\n", + "\n", + "\n", + "\n", + "def get_device(use_cuda):\n", + " import torch\n", + "\n", + " device = \"cpu\"\n", + " num_gpus = 0\n", + " \n", + " if torch.cuda.is_available():\n", + " if use_cuda:\n", + " device = \"cuda\"\n", + " torch.cuda.set_device(0)\n", + " num_gpus = torch.cuda.device_count()\n", + " else:\n", + " logger.debug(\"WARNING: You have a CUDA device, so you should probably run with --cuda 1\")\n", + "\n", + " logger.debug(f\"Number of gpus available: {num_gpus}\")\n", + " \n", + " return device, num_gpus\n", + "\n", + "\n", + "def train(dataloader, hps, test_batch_size,\n", + " device, model_dir, output_dir, seed, log_interval):\n", + " \n", + " epochs = hps['epochs']\n", + " batch_size = hps['batch-size']\n", + " nz = hps['nz']\n", + " ngf = hps['ngf']\n", + " ndf = hps['ndf']\n", + " learning_rate = hps['learning-rate']\n", + " beta1 = hps['beta1']\n", + " \n", + " dcgan = DCGAN(batch_size=batch_size, nz=nz, nc=1, ngf=ngf, ndf=ndf,\n", + " device=device, weights_init=weights_init, learning_rate=learning_rate,\n", + " betas=(beta1, 0.999), real_label=1, fake_label=0)\n", + "\n", + " for epoch in range(epochs):\n", + " batches = len(dataloader)\n", + " for batch, data in enumerate(dataloader, 0):\n", + " errG, errD, D_x, D_G_z1, 
D_G_z2 = dcgan.train_step(data,\n", + " epoch=epoch, epochs=epochs)\n", + " \n", + " log_batch(epoch, epochs, batch, batches, errD, errG,\n", + " D_x, D_G_z1, D_G_z2, log_interval=log_interval, output_dir=output_dir)\n", + "\n", + " \n", + " save_model(model_dir, dcgan.netG)\n", + "\n", + " return\n", + "\n", + "\n", + "def save_model(model_dir, model):\n", + " logger.info(\"Saving the model.\")\n", + " model.save(model_dir, filename=\"model.pth\")\n", + "\n", + " \n", + "def load_model(model_dir, device=None):\n", + " logger.info(\"Loading the model.\")\n", + " if device is None:\n", + " device = get_training_device_name(1)\n", + "\n", + " netG.load(model_dir, filename=\"model.pth\", device=device)\n", + "\n", + " return netG\n", + "\n", + "\n", + "def parse_args():\n", + " # Training settings\n", + " parser = argparse.ArgumentParser(description='PyTorch Example')\n", + " \n", + " parser.add_argument('--batch-size', type=int, default=1000, metavar='N',\n", + " help='input batch size (default: 1000)')\n", + " parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',\n", + " help='input batch size for testing (default: 1000)')\n", + " parser.add_argument('--seed', type=int, default=None, metavar='S',\n", + " help='random seed')\n", + " parser.add_argument('--log-interval', type=int, default=10, metavar='N',\n", + " help='how many batches to wait before logging training status')\n", + "\n", + " parser.add_argument('--save-model', action='store_true', default=False,\n", + " help='For Saving the current Model')\n", + "\n", + " parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR', None))\n", + " \n", + " parser.add_argument('--cuda', type=int, default=1)\n", + " parser.add_argument('--num-gpus', type=int, default=os.environ.get('SM_NUM_GPUS', None))\n", + " \n", + " parser.add_argument('--pin-memory', type=bool, default=os.environ.get('SM_PIN_MEMORY', False))\n", + "\n", + " parser.add_argument('--data-dir', 
required=False, default=None, help='path to data dir')\n", + " parser.add_argument('--workers', type=int, help='number of data loading workers', default=2)\n", + " parser.add_argument('--output-dir', default=os.environ.get('SM_OUTPUT_DATA_DIR', None), help='folder to output images and model checkpoints')\n", + " parser.add_argument('--hps', default=os.environ.get('SM_HPS', None), help='Hyperparameters')\n", + " \n", + " return parser.parse_known_args()\n", + "\n", + "\n", + "def get_datasets(*, dataroot='/opt/ml/input/data', classes=None):\n", + "\n", + " dataset = dset.MNIST(root=dataroot,\n", + " transform=transforms.Compose([\n", + " transforms.Resize(64),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.5,), (0.5,)),\n", + " ]))\n", + "\n", + " return dataset\n", + " \n", + "\n", + "if __name__ == '__main__':\n", + " args, unknown = parse_args()\n", + " \n", + " # get training options\n", + " hps = json.loads(args.hps)\n", + "\n", + " try:\n", + " os.makedirs(args.output_dir)\n", + " except OSError:\n", + " pass\n", + "\n", + " if args.seed is None:\n", + " random_seed = random.randint(1, 10000)\n", + " logger.debug(f\"Generated Random Seed: {random_seed}\")\n", + " cudnn.benchmark = True\n", + " else:\n", + " logger.debug(f\"Provided Random Seed: {args.seed}\")\n", + " random_seed = args.seed\n", + " cudnn.deterministic = True\n", + " cudnn.benchmark = False\n", + " \n", + " random.seed(random_seed)\n", + " torch.manual_seed(random_seed)\n", + "\n", + " pin_memory=args.pin_memory\n", + " num_workers = int(args.workers)\n", + " \n", + " device, num_gpus = get_device(args.cuda)\n", + " \n", + " if device == 'cuda':\n", + " num_workers = 1\n", + " pin_memory = True\n", + "\n", + " \n", + " if args.data_dir is None:\n", + " input_dir = os.environ.get('SM_INPUT_DIR', None)\n", + " if input_dir is None and str(args.dataset).lower() != 'fake':\n", + " raise ValueError(f\"`--data-dir` parameter is required for dataset \\\"{args.dataset}\\\"\")\n", + 
"\n", + " dataroot = input_dir + \"/data\"\n", + " else:\n", + " dataroot = args.data_dir\n", + "\n", + " dataset = get_datasets(dataroot=dataroot)\n", + "\n", + " assert dataset\n", + " dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size,\n", + " shuffle=True, num_workers=num_workers, pin_memory=pin_memory)\n", + "\n", + " \n", + " train(dataloader, hps, args.test_batch_size, device, args.model_dir, args.output_dir, args.seed, args.log_interval)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Per sagemaker.get_execution_role() method, the notebook can get the role pre-assigned to the notebook instance. This role will be used to obtain training resources, such as downloading training framework images, allocating Amazon EC2 instances, and so on." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from sagemaker import get_execution_role\n", + "\n", + "# IAM execution role that gives SageMaker access to resources in your AWS account.\n", + "# We can use the SageMaker Python SDK to get the role from our notebook environment. \n", + "role = get_execution_role()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The hyperparameters, that used in the model training tasks, can be defined in the notebook so that it is separated from the algorithm and training code. The hyperparameters are passed in when the training task is created and dynamically combined with the training task." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "hps = {\n", + " 'seed': 0,\n", + " 'learning-rate': 0.0002,\n", + " 'epochs': 18,\n", + " 'pin-memory': 1,\n", + " 'beta1': 0.5,\n", + " 'nz': 100,\n", + " 'ngf': 28,\n", + " 'ndf': 28,\n", + " 'batch-size': 128,\n", + " 'log-interval': 20,\n", + " }\n", + "\n", + "\n", + "str_hps = json.dumps(hps, indent = 4)\n", + "print(str_hps)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The ```PyTorch``` class from the sagemaker.pytorch package is an estimator for the PyTorch framework; it can be used to create and execute training tasks, as well as to deploy trained models. In the parameter list, ``instance_type`` is used to specify the instance type, such as CPU or GPU instances. The directory containing the training script and the model code is specified by ``source_dir``, and the training script file name must be clearly defined by ``entry_point``. These parameters will be passed to the training task along with other parameters, and they determine the environment settings of the training task." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from sagemaker.pytorch import PyTorch\n", + "\n", + "estimator = PyTorch(role=role,\n", + " entry_point='train.py',\n", + " source_dir='./src',\n", + " output_path=s3_model_artifacts_location,\n", + " code_location=s3_custom_code_upload_location,\n", + " instance_count=1,\n", + " instance_type='ml.g4dn.2xlarge',\n", + " framework_version='1.5.0',\n", + " py_version='py3',\n", + " hyperparameters=hps,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Please pay special attention to the ``use_spot_instances`` parameter (named ``train_use_spot_instances`` in version 1 of the SageMaker Python SDK). The value of ``True`` means that you want to use SPOT instances first. 
Since machine learning training usually requires a large amount of computing resources to run for a long time, leveraging SPOT instances can help you control your cost. SPOT instances can cost up to 90% less than on-demand instances; the actual savings depend on the instance type, region, and time." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You have created a PyTorch object, and you can use it to fit pre-uploaded data on Amazon S3. The following command will initiate the training task, and the training data will be imported into the training environment in the form of an input channel named **MNIST**. When the training task starts, the training data has already been downloaded from S3 to the local file system of the training instance, and the training script ```train.py``` will load the data from the local disk afterwards." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Start training\n", + "estimator.fit({\"MNIST\": s3_data_location}, wait=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Depending on the training instance you choose, the training process may last from tens of minutes to several hours. It is recommended to set the ``wait`` parameter to ``False``; this option detaches the notebook from the training task. In scenarios with long training time and many training logs, it can prevent the notebook context from being lost due to network interruption or session timeout. After the notebook is detached from the training task, the output will be temporarily invisible. You can execute the following code, and the notebook will obtain and resume the previous training session." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%time\n", + "from sagemaker.estimator import Estimator\n", + "\n", + "# Attaching previous training session\n", + "training_job_name = estimator.latest_training_job.name\n", + "attached_estimator = Estimator.attach(training_job_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since the model was designed to leverage the GPU power to accelerate training, it will be much faster on GPU instances than on CPU instances. For example, the g4dn.2xlarge instance will take about 12 minutes, while the c5.xlarge instance may take more than 6 hours. The current model does not support multi-instance training, so instance_count parameter, with value more than 1, will not bring extra benefits in training time optimisation.\n", + "\n", + "When the training completes, the trained model will be uploaded to S3. The upload location is specified by the `output_path` parameter provided when creating the `PyTorch` object." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "isConfigCell": true + }, + "source": [ + "### Model verification" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "isConfigCell": true + }, + "source": [ + "You will download the trained model from Amazon S3 to the local file system of the instance where the notebook is located. 
The following code will load the model, generate a picture with random noise as input, and then display the picture.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "isConfigCell": true + }, + "outputs": [], + "source": [ + "from sagemaker.s3 import S3Downloader as s3down\n", + "\n", + "model_url = attached_estimator.model_data\n", + "s3down.download(model_url, './tmp')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "isConfigCell": true + }, + "outputs": [], + "source": [ + "!tar -zxf tmp/model.tar.gz -C ./tmp" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "isConfigCell": true + }, + "source": [ + "Execute the following instructions to load the trained model, and generate a set of \"handwritten\" digits." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "isConfigCell": true + }, + "outputs": [], + "source": [ + "def generate_fake_handwriting(model, *, num_images, nz, device=None):\n", + "\n", + " import torch\n", + " import torchvision.utils as vutils\n", + " from io import BytesIO\n", + " from PIL import Image\n", + " \n", + "\n", + " z = torch.randn(num_images, nz, 1, 1, device=device)\n", + " fake = model(z)\n", + "\n", + " imgio = BytesIO()\n", + " vutils.save_image(fake.detach(), imgio, normalize=True, format=\"PNG\")\n", + " img = Image.open(imgio)\n", + " \n", + " return img\n", + "\n", + "\n", + "def load_model(path, *, model_cls=None, params=None, filename=None, device=None, strict=True):\n", + "\n", + " import os\n", + " import torch\n", + " \n", + " model_pt_path = path\n", + " if not filename is None:\n", + " model_pt_path = os.path.join(path, filename)\n", + " \n", + " if device is None:\n", + " device = 'cpu'\n", + " \n", + " if not model_cls is None:\n", + " model = model_cls(**params)\n", + " model.load_state_dict(torch.load(model_pt_path, map_location=torch.device(device)), strict=strict)\n", + " else:\n", + " model = 
torch.jit.load(model_pt_path, map_location=torch.device(device))\n", + "\n", + " model.to(device)\n", + " \n", + " return model\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "isConfigCell": true + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import torch\n", + "from src.train import Generator\n", + "\n", + "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", + "\n", + "params = {'nz': hps['nz'], 'nc': 1, 'ngf': hps['ngf']}\n", + "model = load_model(\"./tmp/model.pth\", model_cls=Generator, params=params, device=device, strict=False)\n", + "img = generate_fake_handwriting(model, num_images=64, nz=hps['nz'], device=device)\n", + "\n", + "plt.imshow(np.asarray(img))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "isConfigCell": true + }, + "source": [ + "### Clean up\n", + "\n", + "The following cell prints the commands that remove the files generated by this notebook from S3 and local storage. Run the printed commands in a terminal.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "isConfigCell": true + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "print(f\"aws s3 rm --recursive s3://{bucket}/{prefix}\")\n", + "print(f\"rm -rf {os.path.abspath(dataroot)}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The PyTorch framework, one of the most popular deep learning frameworks, is widely recognised and applied, and has become one of the de facto mainstream frameworks.\n", + "\n", + "Amazon SageMaker is tightly integrated with a variety of AWS services, such as Amazon EC2 instances of various types and sizes, Amazon S3, Amazon ECR, etc., providing an end-to-end, consistent machine learning experience for all framework practitioners. 
Amazon SageMaker continues to support mainstream machine learning frameworks, including PyTorch. Machine learning algorithms and models developed with PyTorch can be easily transplanted to the Amazon SageMaker environment. By using Amazon SageMaker's fully managed Jupyter Notebook, SPOT training instances, Amazon Elastic Container Registry, SageMaker SDK, and so on, the complexity of machine learning engineering and infrastructure operation is reduced, productivity and efficiency are improved, and operation and maintenance costs are reduced.\n", + "\n", + "DCGAN is a landmark in the field of generative adversarial networks, and it is the cornerstone of many complex GANs today. We will explore some of the most recent and interesting variants of GAN in later examples.\n", + "\n", + "We hope that the introduction and engineering practice in this example help you understand the principles and engineering methods of GANs in general." + ] + } + ], + "metadata": { + "instance_type": "ml.g4dn.xlarge", + "kernelspec": { + "display_name": "Python 3 (PyTorch 1.6 Python 3.6 GPU Optimized)", + "language": "python", + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/pytorch-1.6-gpu-py36-cu110-ubuntu18.04-v3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + }, + "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License.", + "toc-autonumbering": false, + "toc-showcode": false, + "toc-showmarkdowntxt": false + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/advanced_functionality/scikit_learn_bring_your_own_model/code/inference.py b/advanced_functionality/scikit_learn_bring_your_own_model/code/inference.py new file mode 100644 index 0000000000..586143a9c2 --- /dev/null +++ b/advanced_functionality/scikit_learn_bring_your_own_model/code/inference.py @@ -0,0 +1,17 @@ +import os +import joblib + +def predict_fn(input_object, model): + ########################################### + # Do your custom preprocessing logic here # + ########################################### + + print("calling model") + predictions = model.predict(input_object) + return predictions + + +def model_fn(model_dir): + print("loading model.joblib from: {}".format(model_dir)) + loaded_model = joblib.load(os.path.join(model_dir, "model.joblib")) + return loaded_model diff --git a/advanced_functionality/scikit_learn_bring_your_own_model/code/requirements.txt b/advanced_functionality/scikit_learn_bring_your_own_model/code/requirements.txt new file mode 100644 index 0000000000..1944c20d04 --- /dev/null +++ b/advanced_functionality/scikit_learn_bring_your_own_model/code/requirements.txt @@ -0,0 +1,3 @@ +boto3 +requests +nltk \ No newline at end of file diff --git a/advanced_functionality/scikit_learn_bring_your_own_model/scikit_learn_bring_your_own_model.ipynb b/advanced_functionality/scikit_learn_bring_your_own_model/scikit_learn_bring_your_own_model.ipynb new file mode 100644 index 0000000000..6510b6951d --- /dev/null +++ b/advanced_functionality/scikit_learn_bring_your_own_model/scikit_learn_bring_your_own_model.ipynb @@ -0,0 +1,494 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0d1bf0a9", + "metadata": {}, + "source": [ + "# Amazon SageMaker scikit-learn Bring Your Own Model\n", + "_**Hosting a 
pre-trained scikit-learn Model in Amazon SageMaker scikit-learn Container**_\n", + "\n", + "---\n", + "\n", + "---\n", + "\n", + "## Background\n", + "\n", + "Amazon SageMaker includes functionality to support a hosted notebook environment, distributed, serverless training, and real-time hosting. We think it works best when all three of these services are used together, but they can also be used independently. Some use cases may only require hosting. Maybe the model was trained prior to Amazon SageMaker existing, in a different service.\n", + "\n", + "This notebook shows how to use a pre-trained scikit-learn model with the Amazon SageMaker scikit-learn container to quickly create a hosted endpoint for that model.\n", + "We use the California Housing dataset, present in Scikit-Learn: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html. The California Housing dataset was originally published in:\n", + "\n", + "> Pace, R. Kelley, and Ronald Barry. \"Sparse spatial auto-regressions.\" Statistics & Probability Letters 33.3 (1997): 291-297.\n", + "\n", + "---\n", + "## Setup\n", + "\n", + "Let's start by specifying:\n", + "\n", + "* AWS region.\n", + "* The IAM role arn used to give learning and hosting access to your data.\n", + "* The S3 bucket that you want to use for training and model data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc7bec22", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import boto3\n", + "import re\n", + "import json\n", + "import pandas as pd\n", + "import numpy as np\n", + "import sagemaker\n", + "from sagemaker import get_execution_role\n", + "from sagemaker.sklearn.model import SKLearnModel\n", + "from sklearn.datasets import fetch_california_housing\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "\n", + "region = boto3.Session().region_name\n", + "\n", + "role = get_execution_role()\n", + "\n", + "bucket = sagemaker.Session().default_bucket()\n", + "prefix = \"sagemaker/DEMO-sklearn-byo-model\"\n", + "\n", + "print(f\"bucket: {bucket}\")" + ] + }, + { + "cell_type": "markdown", + "id": "61240ddf", + "metadata": {}, + "source": [ + "## Prepare data for model inference\n", + "\n", + "We load the California housing dataset from sklearn, and will use it to invoke SageMaker Endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a10d2340", + "metadata": {}, + "outputs": [], + "source": [ + "data = fetch_california_housing()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cee9c5a", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(\n", + " data.data, data.target, test_size=0.25, random_state=42\n", + ")\n", + "\n", + "# we don't train a model, so we will need only the testing data\n", + "testX = pd.DataFrame(X_test, columns=data.feature_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca83f5b0", + "metadata": {}, + "outputs": [], + "source": [ + "testX.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "e9f9c09f", + "metadata": {}, + "source": [ + "## Download a pre-trained model file\n", + "\n", + "Download a pretrained Scikit-Learn Random Forest model.\n", + "\n", + "We used the California Housing dataset, 
present in Scikit-Learn: https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset to train the model.\n", + "\n", + "For more details on how to train the model with Amazon SageMaker, please refer to the [Develop, Train, Optimize and Deploy Scikit-Learn Random Forest notebook](https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/scikit_learn_randomforest/Sklearn_on_SageMaker_end2end.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ccaece6", + "metadata": {}, + "outputs": [], + "source": [ + "!aws s3 cp s3://aws-ml-blog/artifacts/scikit_learn_bring_your_own_model/model.joblib ." + ] + }, + { + "cell_type": "markdown", + "id": "a425243b", + "metadata": {}, + "source": [ + "### Compress the model file into a GZIP tar archive \n", + "\n", + "Note that the model file name must satisfy the regular expression pattern: `^[a-zA-Z0-9](-*[a-zA-Z0-9])*;`. The model file needs to be tar-zipped. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47b9153e", + "metadata": {}, + "outputs": [], + "source": [ + "model_file_name = \"model.joblib\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa131b42", + "metadata": {}, + "outputs": [], + "source": [ + "!tar czvf model.tar.gz $model_file_name" + ] + }, + { + "cell_type": "markdown", + "id": "6ea55281", + "metadata": {}, + "source": [ + "## Upload the pre-trained model `model.tar.gz` file to S3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "385ef0d6", + "metadata": {}, + "outputs": [], + "source": [ + "fObj = open(\"model.tar.gz\", \"rb\")\n", + "key = os.path.join(prefix, \"model.tar.gz\")\n", + "boto3.Session().resource(\"s3\").Bucket(bucket).Object(key).upload_fileobj(fObj)" + ] + }, + { + "cell_type": "markdown", + "id": "2f6e19f0", + "metadata": {}, + "source": [ + "## Set up hosting for the model\n", + "\n", + "This involves creating a SageMaker model from the model 
file previously uploaded to S3." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b848a666", + "metadata": {}, + "outputs": [], + "source": [ + "model_data = \"s3://{}/{}\".format(bucket, key)\n", + "print(f\"model data: {model_data}\")" + ] + }, + { + "cell_type": "markdown", + "id": "8f2c9b40", + "metadata": {}, + "source": [ + "### Write the Inference Script\n", + "\n", + "When using endpoints with the Amazon SageMaker managed `Scikit Learn` container, we need to provide an entry point script for inference that will **at least** load the saved model.\n", + "\n", + "After the SageMaker model server has loaded your model by calling `model_fn`, SageMaker will serve your model. Model serving is the process of responding to inference requests, received by SageMaker `InvokeEndpoint` API calls.\n", + "\n", + "\n", + "We will also implement the `predict_fn()` function that takes the deserialized request object and performs inference against the loaded model.\n", + "\n", + "We will now create this script and call it `inference.py` and store it at the root of a directory called `code`.\n", + "\n", + "**Note:** You would modify the script below to implement your own inferencing logic.\n", + "\n", + "Additional information on model loading and model serving for scikit-learn on SageMaker can be found in the [SageMaker Scikit-learn Model Server documentation](https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/using_sklearn.html#deploy-a-scikit-learn-model)\n", + "\n", + "There are also several functions for hosting which we won't define:\n", + " - `input_fn()` - Takes request data and deserializes the data into an object for prediction.\n", + " - `output_fn()` - Takes the result of prediction and serializes this according to the response content type.\n", + "\n", + "These will take on their default values as described in the [SageMaker Scikit-learn Serve a Model 
documentation](https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/using_sklearn.html#serve-a-model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8e0fe49", + "metadata": {}, + "outputs": [], + "source": [ + "!pygmentize ./code/inference.py" + ] + }, + { + "cell_type": "markdown", + "id": "1ad63dfa", + "metadata": {}, + "source": [ + "### Installing additional Python dependencies\n", + "\n", + "It may also be necessary to supply a `requirements.txt` file to ensure any necessary dependencies are installed in the container along with the script. For this script, in addition to the Python standard libraries, we showcase how to install the `boto3`, `requests`, and `nltk` libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98372621", + "metadata": {}, + "outputs": [], + "source": [ + "!pygmentize ./code/requirements.txt" + ] + }, + { + "cell_type": "markdown", + "id": "93217e26", + "metadata": {}, + "source": [ + "### Deploy with Python SDK\n", + "\n", + "Here we showcase the process of creating a model from S3 artifacts, which can be used to deploy a model that was trained in a different session or even outside of SageMaker." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c408083", + "metadata": {}, + "outputs": [], + "source": [ + "model = SKLearnModel(\n", + " role=role,\n", + " model_data=model_data,\n", + " framework_version=\"0.23-1\",\n", + " py_version=\"py3\",\n", + " source_dir=\"code\",\n", + " entry_point=\"inference.py\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5e05fe2b", + "metadata": {}, + "source": [ + "### Create endpoint\n", + "Lastly, you create the endpoint that serves up the model, by specifying the name and configuration defined above. The end result is an endpoint that can be validated and incorporated into production applications. This takes 5-10 minutes to complete." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b06fb61a", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "predictor = model.deploy(instance_type=\"ml.t2.medium\", initial_instance_count=1)" + ] + }, + { + "cell_type": "markdown", + "id": "e7d0c01a", + "metadata": {}, + "source": [ + "## Validate the model for use\n", + "Now you can obtain the endpoint from the client library using the result from previous operations and generate predictions from the model using that endpoint." + ] + }, + { + "cell_type": "markdown", + "id": "9407488f", + "metadata": {}, + "source": [ + "### Invoke with the Python SDK" + ] + }, + { + "cell_type": "markdown", + "id": "2b263bbf", + "metadata": {}, + "source": [ + "Let's generate predictions for the test data prepared earlier." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79227345", + "metadata": {}, + "outputs": [], + "source": [ + "# the SKLearnPredictor does the serialization from pandas for us\n", + "predictions = predictor.predict(testX[data.feature_names])\n", + "print(predictions)" + ] + }, + { + "cell_type": "markdown", + "id": "5a30e6af", + "metadata": {}, + "source": [ + "### Alternative: invoke with `boto3`\n", + "\n", + "This is useful when invoking the model from external clients, e.g. Lambda Functions, or other micro-services." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "775c1b18", + "metadata": {}, + "outputs": [], + "source": [ + "runtime = boto3.client(\"sagemaker-runtime\")" + ] + }, + { + "cell_type": "markdown", + "id": "2f34e183", + "metadata": {}, + "source": [ + "#### Option 1: `csv` serialization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "783a343d", + "metadata": {}, + "outputs": [], + "source": [ + "# csv serialization\n", + "response = runtime.invoke_endpoint(\n", + " EndpointName=predictor.endpoint,\n", + " Body=testX[data.feature_names].to_csv(header=False, index=False).encode(\"utf-8\"),\n", + " ContentType=\"text/csv\",\n", + ")\n", + "\n", + "print(response[\"Body\"].read())" + ] + }, + { + "cell_type": "markdown", + "id": "f78ab683", + "metadata": {}, + "source": [ + "#### Option 2: `npy` serialization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9bb751f6", + "metadata": {}, + "outputs": [], + "source": [ + "# npy serialization\n", + "from io import BytesIO\n", + "\n", + "\n", + "# Serialise numpy ndarray as bytes\n", + "buffer = BytesIO()\n", + "# Assuming testX is a data frame\n", + "np.save(buffer, testX[data.feature_names].values)\n", + "\n", + "response = runtime.invoke_endpoint(\n", + " EndpointName=predictor.endpoint, Body=buffer.getvalue(), ContentType=\"application/x-npy\"\n", + ")\n", + "\n", + "print(response[\"Body\"].read())" + ] + }, + { + "cell_type": "markdown", + "id": "a3b13de5", + "metadata": {}, + "source": [ + "### (Optional) Delete the Endpoint\n", + "\n", + "If you're ready to be done with this notebook, please run the delete_endpoint line in the cell below. This will remove the hosted endpoint you created and avoid any charges from a stray instance being left on." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5398938", + "metadata": {}, + "outputs": [], + "source": [ + "predictor.delete_endpoint()" + ] + }, + { + "cell_type": "markdown", + "id": "9145ac7c", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "id": "4543c65a", + "metadata": {}, + "source": [ + "In this notebook you successfully deployed a pre-trained scikit-learn model with the Amazon SageMaker scikit-learn container to quickly create a hosted endpoint for that model.\n", + "You then used the Python SDK and `boto3` to invoke the endpoint with `csv` payload, and then with `npy` payload to get predictions from the model.\n", + "\n", + "As next steps you can try to [Automatically Scale Amazon SageMaker Models](https://docs.aws.amazon.com/sagemaker/latest/dg/endpoint-auto-scaling.html), [Register and Deploy Models with Model Registry](https://docs.aws.amazon.com/sagemaker/latest/dg/model-registry.html) or [Train your Model with Amazon SageMaker](https://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-training.html).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4937421", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "anaconda-cloud": {}, + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "Python 3 (Data Science)", + "language": "python", + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + }, + "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. 
A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_functionality/xgboost_bring_your_own_model/out.sh b/advanced_functionality/xgboost_bring_your_own_model/out.sh new file mode 100644 index 0000000000..f268b7b511 --- /dev/null +++ b/advanced_functionality/xgboost_bring_your_own_model/out.sh @@ -0,0 +1,324 @@ +c4b301c88fc1:amazon-sagemaker-examples julkroll$ git diff +diff --git a/advanced_functionality/xgboost_bring_your_own_model/xgboost_bring_your_own_model.ipynb b/advanced_functionality/xgboost_bring_your_own_model/xgboost_bring_your_own_model.ipynb +index 8df40914..f9b5a511 100644 +--- a/advanced_functionality/xgboost_bring_your_own_model/xgboost_bring_your_own_model.ipynb ++++ b/advanced_functionality/xgboost_bring_your_own_model/xgboost_bring_your_own_model.ipynb +@@ -1,18 +1,7 @@ + { + "cells": [ +- { +- "cell_type": "code", +- "execution_count": null, +- "id": "canadian-powell", +- "metadata": {}, +- "outputs": [], +- "source": [ +- "!pip install -Uq xgboost" +- ] +- }, + { + "cell_type": "markdown", +- "id": "animal-static", + "metadata": {}, + "source": [ + "# Amazon SageMaker XGBoost Bring Your Own Model\n", +@@ -54,7 +43,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "comic-jonathan", + "metadata": {}, + "outputs": [], + "source": [ +@@ -77,7 +65,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "revolutionary-egypt", + "metadata": {}, + "outputs": [], + "source": [ +@@ -88,30 +75,15 @@ + }, + { + "cell_type": "markdown", +- "id": "second-traffic", + "metadata": {}, + "source": [ + "## Optionally, train a scikit learn XGBoost model\n", + "\n", +- "These steps are 
optional and are needed to generate the scikit-learn model that will eventually be hosted using the SageMaker Algorithm contained. \n", +- "\n", +- "### Install XGboost\n", +- "Note that for conda based installation, you'll need to change the Notebook kernel to the environment with conda and Python3. " +- ] +- }, +- { +- "cell_type": "code", +- "execution_count": null, +- "id": "expanded-dress", +- "metadata": {}, +- "outputs": [], +- "source": [ +- "!conda install -y -c conda-forge xgboost==0.90" ++ "These steps are optional and are needed to generate the scikit-learn model that will eventually be hosted using the SageMaker Algorithm contained. \n" + ] + }, + { + "cell_type": "markdown", +- "id": "little-still", + "metadata": {}, + "source": [ + "### Fetch the dataset" +@@ -120,7 +92,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "injured-crawford", + "metadata": {}, + "outputs": [], + "source": [ +@@ -129,7 +100,6 @@ + }, + { + "cell_type": "markdown", +- "id": "tough-facial", + "metadata": {}, + "source": [ + "### Prepare the dataset for training" +@@ -138,7 +108,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "bright-powder", + "metadata": {}, + "outputs": [], + "source": [ +@@ -162,7 +131,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "wooden-thesis", + "metadata": {}, + "outputs": [], + "source": [ +@@ -178,7 +146,6 @@ + }, + { + "cell_type": "markdown", +- "id": "strong-height", + "metadata": {}, + "source": [ + "### Train the XGBClassifier" +@@ -187,7 +154,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "sought-genome", + "metadata": {}, + "outputs": [], + "source": [ +@@ -202,7 +168,6 @@ + }, + { + "cell_type": "markdown", +- "id": "patient-endorsement", + "metadata": {}, + "source": [ + "### Save the trained model file\n", +@@ -212,7 +177,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "arctic-retail", + "metadata": {}, + "outputs": [], + "source": [ +@@ 
-223,7 +187,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "fatty-chapel", + "metadata": {}, + "outputs": [], + "source": [ +@@ -232,7 +195,6 @@ + }, + { + "cell_type": "markdown", +- "id": "forced-illustration", + "metadata": {}, + "source": [ + "## Upload the pre-trained model to S3" +@@ -241,7 +203,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "molecular-admission", + "metadata": {}, + "outputs": [], + "source": [ +@@ -252,7 +213,6 @@ + }, + { + "cell_type": "markdown", +- "id": "willing-miami", + "metadata": {}, + "source": [ + "## Set up hosting for the model\n", +@@ -264,19 +224,17 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "juvenile-glossary", + "metadata": {}, + "outputs": [], + "source": [ +- "from sagemaker.amazon.amazon_estimator import get_image_uri\n", ++ "from sagemaker import image_uris\n", + "\n", +- "container = get_image_uri(boto3.Session().region_name, \"xgboost\", \"0.90-2\")" ++ "container = image_uris.retrieve(region=boto3.Session().region_name, framework=\"xgboost\", version=\"0.90-2\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, +- "id": "competitive-mozambique", + "metadata": {}, + "outputs": [], + "source": [ +@@ -303,7 +261,6 @@ + }, + { + "cell_type": "markdown", +- "id": "announced-affect", + "metadata": {}, + "source": [ + "### Create endpoint configuration\n", +@@ -314,7 +271,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "alike-experience", + "metadata": {}, + "outputs": [], + "source": [ +@@ -340,7 +296,6 @@ + }, + { + "cell_type": "markdown", +- "id": "otherwise-wiring", + "metadata": {}, + "source": [ + "### Create endpoint\n", +@@ -350,7 +305,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "experienced-makeup", + "metadata": {}, + "outputs": [], + "source": [ +@@ -380,7 +334,6 @@ + }, + { + "cell_type": "markdown", +- "id": "specific-sheffield", + "metadata": {}, + "source": [ + "## Validate the model for 
use\n", +@@ -390,7 +343,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "pediatric-subject", + "metadata": {}, + "outputs": [], + "source": [ +@@ -399,7 +351,6 @@ + }, + { + "cell_type": "markdown", +- "id": "saving-ghost", + "metadata": {}, + "source": [ + "Lets generate the prediction for a single datapoint. We'll pick one from the test data generated earlier." +@@ -408,7 +359,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "polish-laugh", + "metadata": {}, + "outputs": [], + "source": [ +@@ -423,7 +373,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "reported-coalition", + "metadata": {}, + "outputs": [], + "source": [ +@@ -447,7 +396,6 @@ + }, + { + "cell_type": "markdown", +- "id": "pursuant-cemetery", + "metadata": {}, + "source": [ + "### Post process the output\n", +@@ -457,7 +405,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "broken-individual", + "metadata": {}, + "outputs": [], + "source": [ +@@ -469,7 +416,6 @@ + }, + { + "cell_type": "markdown", +- "id": "going-popularity", + "metadata": {}, + "source": [ + "### (Optional) Delete the Endpoint\n", +@@ -480,7 +426,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "foster-steps", + "metadata": {}, + "outputs": [], + "source": [ +@@ -490,10 +435,11 @@ + ], + "metadata": { + "anaconda-cloud": {}, ++ "instance_type": "ml.t3.medium", + "kernelspec": { +- "display_name": "Environment (conda_anaconda3)", ++ "display_name": "Python 3 (Data Science)", + "language": "python", +- "name": "conda_anaconda3" ++ "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0" + }, + "language_info": { + "codemirror_mode": { +@@ -504,10 +450,11 @@ + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", +- "pygments_lexer": "ipython3" ++ "pygments_lexer": "ipython3", ++ "version": "3.6.13" + }, + "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. 
All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." + }, + "nbformat": 4, + "nbformat_minor": 5 +-} +\ No newline at end of file ++} +c4b301c88fc1:amazon-sagemaker-examples julkroll$ diff --git a/advanced_functionality/xgboost_bring_your_own_model/out.txt b/advanced_functionality/xgboost_bring_your_own_model/out.txt new file mode 100644 index 0000000000..f268b7b511 --- /dev/null +++ b/advanced_functionality/xgboost_bring_your_own_model/out.txt @@ -0,0 +1,324 @@ +c4b301c88fc1:amazon-sagemaker-examples julkroll$ git diff +diff --git a/advanced_functionality/xgboost_bring_your_own_model/xgboost_bring_your_own_model.ipynb b/advanced_functionality/xgboost_bring_your_own_model/xgboost_bring_your_own_model.ipynb +index 8df40914..f9b5a511 100644 +--- a/advanced_functionality/xgboost_bring_your_own_model/xgboost_bring_your_own_model.ipynb ++++ b/advanced_functionality/xgboost_bring_your_own_model/xgboost_bring_your_own_model.ipynb +@@ -1,18 +1,7 @@ + { + "cells": [ +- { +- "cell_type": "code", +- "execution_count": null, +- "id": "canadian-powell", +- "metadata": {}, +- "outputs": [], +- "source": [ +- "!pip install -Uq xgboost" +- ] +- }, + { + "cell_type": "markdown", +- "id": "animal-static", + "metadata": {}, + "source": [ + "# Amazon SageMaker XGBoost Bring Your Own Model\n", +@@ -54,7 +43,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "comic-jonathan", + "metadata": {}, + "outputs": [], + "source": [ +@@ -77,7 +65,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "revolutionary-egypt", + 
"metadata": {}, + "outputs": [], + "source": [ +@@ -88,30 +75,15 @@ + }, + { + "cell_type": "markdown", +- "id": "second-traffic", + "metadata": {}, + "source": [ + "## Optionally, train a scikit learn XGBoost model\n", + "\n", +- "These steps are optional and are needed to generate the scikit-learn model that will eventually be hosted using the SageMaker Algorithm contained. \n", +- "\n", +- "### Install XGboost\n", +- "Note that for conda based installation, you'll need to change the Notebook kernel to the environment with conda and Python3. " +- ] +- }, +- { +- "cell_type": "code", +- "execution_count": null, +- "id": "expanded-dress", +- "metadata": {}, +- "outputs": [], +- "source": [ +- "!conda install -y -c conda-forge xgboost==0.90" ++ "These steps are optional and are needed to generate the scikit-learn model that will eventually be hosted using the SageMaker Algorithm contained. \n" + ] + }, + { + "cell_type": "markdown", +- "id": "little-still", + "metadata": {}, + "source": [ + "### Fetch the dataset" +@@ -120,7 +92,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "injured-crawford", + "metadata": {}, + "outputs": [], + "source": [ +@@ -129,7 +100,6 @@ + }, + { + "cell_type": "markdown", +- "id": "tough-facial", + "metadata": {}, + "source": [ + "### Prepare the dataset for training" +@@ -138,7 +108,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "bright-powder", + "metadata": {}, + "outputs": [], + "source": [ +@@ -162,7 +131,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "wooden-thesis", + "metadata": {}, + "outputs": [], + "source": [ +@@ -178,7 +146,6 @@ + }, + { + "cell_type": "markdown", +- "id": "strong-height", + "metadata": {}, + "source": [ + "### Train the XGBClassifier" +@@ -187,7 +154,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "sought-genome", + "metadata": {}, + "outputs": [], + "source": [ +@@ -202,7 +168,6 @@ + }, + { + "cell_type": "markdown", +- "id": 
"patient-endorsement", + "metadata": {}, + "source": [ + "### Save the trained model file\n", +@@ -212,7 +177,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "arctic-retail", + "metadata": {}, + "outputs": [], + "source": [ +@@ -223,7 +187,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "fatty-chapel", + "metadata": {}, + "outputs": [], + "source": [ +@@ -232,7 +195,6 @@ + }, + { + "cell_type": "markdown", +- "id": "forced-illustration", + "metadata": {}, + "source": [ + "## Upload the pre-trained model to S3" +@@ -241,7 +203,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "molecular-admission", + "metadata": {}, + "outputs": [], + "source": [ +@@ -252,7 +213,6 @@ + }, + { + "cell_type": "markdown", +- "id": "willing-miami", + "metadata": {}, + "source": [ + "## Set up hosting for the model\n", +@@ -264,19 +224,17 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "juvenile-glossary", + "metadata": {}, + "outputs": [], + "source": [ +- "from sagemaker.amazon.amazon_estimator import get_image_uri\n", ++ "from sagemaker import image_uris\n", + "\n", +- "container = get_image_uri(boto3.Session().region_name, \"xgboost\", \"0.90-2\")" ++ "container = image_uris.retrieve(region=boto3.Session().region_name, framework=\"xgboost\", version=\"0.90-2\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, +- "id": "competitive-mozambique", + "metadata": {}, + "outputs": [], + "source": [ +@@ -303,7 +261,6 @@ + }, + { + "cell_type": "markdown", +- "id": "announced-affect", + "metadata": {}, + "source": [ + "### Create endpoint configuration\n", +@@ -314,7 +271,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "alike-experience", + "metadata": {}, + "outputs": [], + "source": [ +@@ -340,7 +296,6 @@ + }, + { + "cell_type": "markdown", +- "id": "otherwise-wiring", + "metadata": {}, + "source": [ + "### Create endpoint\n", +@@ -350,7 +305,6 @@ + { + "cell_type": "code", + 
"execution_count": null, +- "id": "experienced-makeup", + "metadata": {}, + "outputs": [], + "source": [ +@@ -380,7 +334,6 @@ + }, + { + "cell_type": "markdown", +- "id": "specific-sheffield", + "metadata": {}, + "source": [ + "## Validate the model for use\n", +@@ -390,7 +343,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "pediatric-subject", + "metadata": {}, + "outputs": [], + "source": [ +@@ -399,7 +351,6 @@ + }, + { + "cell_type": "markdown", +- "id": "saving-ghost", + "metadata": {}, + "source": [ + "Lets generate the prediction for a single datapoint. We'll pick one from the test data generated earlier." +@@ -408,7 +359,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "polish-laugh", + "metadata": {}, + "outputs": [], + "source": [ +@@ -423,7 +373,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "reported-coalition", + "metadata": {}, + "outputs": [], + "source": [ +@@ -447,7 +396,6 @@ + }, + { + "cell_type": "markdown", +- "id": "pursuant-cemetery", + "metadata": {}, + "source": [ + "### Post process the output\n", +@@ -457,7 +405,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "broken-individual", + "metadata": {}, + "outputs": [], + "source": [ +@@ -469,7 +416,6 @@ + }, + { + "cell_type": "markdown", +- "id": "going-popularity", + "metadata": {}, + "source": [ + "### (Optional) Delete the Endpoint\n", +@@ -480,7 +426,6 @@ + { + "cell_type": "code", + "execution_count": null, +- "id": "foster-steps", + "metadata": {}, + "outputs": [], + "source": [ +@@ -490,10 +435,11 @@ + ], + "metadata": { + "anaconda-cloud": {}, ++ "instance_type": "ml.t3.medium", + "kernelspec": { +- "display_name": "Environment (conda_anaconda3)", ++ "display_name": "Python 3 (Data Science)", + "language": "python", +- "name": "conda_anaconda3" ++ "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0" + }, + "language_info": { + "codemirror_mode": { +@@ -504,10 
+450,11 @@ + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", +- "pygments_lexer": "ipython3" ++ "pygments_lexer": "ipython3", ++ "version": "3.6.13" + }, + "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." + }, + "nbformat": 4, + "nbformat_minor": 5 +-} +\ No newline at end of file ++} +c4b301c88fc1:amazon-sagemaker-examples julkroll$ diff --git a/async-inference/Async-Inference-Walkthrough.ipynb b/async-inference/Async-Inference-Walkthrough.ipynb new file mode 100755 index 0000000000..d8175871a9 --- /dev/null +++ b/async-inference/Async-Inference-Walkthrough.ipynb @@ -0,0 +1,683 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Amazon SageMaker Asynchronous Inference\n", + "_**A new near real-time Inference option for generating machine learning model predictions**_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Table of Contents**\n", + "\n", + "* [Background](#background)\n", + "* [Notebook Scope](#scope)\n", + "* [Overview and sample end to end flow](#overview)\n", + "* [Section 1 - Setup](#setup) \n", + " * [Create Model](#createmodel)\n", + " * [Create EndpointConfig](#endpoint-config)\n", + " * [Create Endpoint](#create-endpoint)\n", + " * [Setup AutoScaling policy (Optional)](#setup-autoscaling)\n", + "* [Section 2 - Using the Endpoint](#endpoint) \n", + " * [Invoke Endpoint](#invoke-endpoint)\n", + " * [Check Output Location](#check-output)\n", + " * 
[Multiple Invocations](#multiple-invoke) \n", + "* [Section 3 - Clean up](#clean)\n", + "\n", + "### Background \n", + "Amazon SageMaker Asynchronous Inference is a new capability in SageMaker that queues incoming requests and processes them asynchronously. SageMaker currently offers two inference options for customers to deploy machine learning models: 1) a real-time option for low-latency workloads 2) Batch transform, an offline option to process inference requests on batches of data available upfront. Real-time inference is suited for workloads with payload sizes of less than 6 MB and that require inference requests to be processed within 60 seconds. Batch transform is suitable for offline inference on batches of data. \n", + "\n", + "Asynchronous inference is a new inference option for near real-time inference needs. Requests can take up to 15 minutes to process and have payload sizes of up to 1 GB. Asynchronous inference is suitable for workloads that do not have sub-second latency requirements and have relaxed latency requirements. For example, you might need to process an inference on a large image of several MBs within 5 minutes. In addition, asynchronous inference endpoints let you control costs by scaling down the endpoint instance count to zero when they are idle, so you only pay when your endpoints are processing requests. \n", + "\n", + "### Notebook scope \n", + "This notebook provides an introduction to the SageMaker Asynchronous inference capability. This notebook will cover the steps required to create an asynchronous inference endpoint and test it with some sample requests. \n", + "\n", + "### Overview and sample end to end flow \n", + "Asynchronous inference endpoints have many similarities (and some key differences) compared to real-time endpoints. The process to create asynchronous endpoints is similar to real-time endpoints. You need to create: a model, an endpoint configuration, and then an endpoint.
However, there are specific configuration parameters specific to asynchronous inference endpoints which we will explore below. \n", + "\n", + "Invocation of asynchronous endpoints differ from real-time endpoints. Rather than pass request payload inline with the request, you upload the payload to Amazon S3 and pass an Amazon S3 URI as a part of the request. Upon receiving the request, SageMaker provides you with a token with the output location where the result will be placed once processed. Internally, SageMaker maintains a queue with these requests and processes them. During endpoint creation, you can optionally specify an Amazon SNS topic to receive success or error notifications. Once you receive the notification that your inference request has been successfully processed, you can access the result in the output Amazon S3 location. \n", + "\n", + "The diagram below provides a visual overview of the end-to-end flow with Asynchronous inference endpoint." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![title](images/e2e.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We're about to work with the [Titanic dataset](https://www.openml.org/d/40945)[1]. From the dataset documentation:\n", + "\n", + "> The original Titanic dataset, describing the survival status of individual passengers on the Titanic. The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. The principal source for data about Titanic passengers is the Encyclopedia Titanica. The datasets used here were begun by a variety of researchers. One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. 
Findlay.\n", + ">\n", + "> Thomas Cason of UVa has greatly updated and improved the Titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created.\n", + ">\n", + "> For more information about how this dataset was constructed:\n", + "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt\n", + ">\n", + "> [1] Author: Frank E. Harrell Jr., Thomas Cason\n", + "Source: [Vanderbilt Biostatistics](http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.html)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 1. Setup " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First we ensure we have an updated version of boto3, which includes the latest SageMaker features:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import the required python libraries:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install --upgrade pip --quiet\n", + "!pip install -U awscli --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sagemaker\n", + "import boto3\n", + "from time import gmtime, strftime\n", + "\n", + "boto_session = boto3.session.Session()\n", + "sm_session = sagemaker.session.Session()\n", + "sm_client = boto_session.client(\"sagemaker\")\n", + "sm_runtime = boto_session.client(\"sagemaker-runtime\")\n", + "region = boto_session.region_name" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify your IAM role. 
Go to the AWS IAM console (https://console.aws.amazon.com/iam/home) and add the following policies to your IAM Role:\n", + "* AmazonSageMakerFullAccess\n", + "* Amazon S3 access: Apply this to get and put objects in your Amazon S3 bucket. Replace `bucket_name` with the name of your Amazon S3 bucket: \n", + "\n", + "```json\n", + "{\n", + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\n", + " \"Action\": [\n", + " \"s3:GetObject\",\n", + " \"s3:PutObject\",\n", + " \"s3:AbortMultipartUpload\",\n", + " \"s3:ListBucket\"\n", + " ],\n", + " \"Effect\": \"Allow\",\n", + " \"Resource\": \"arn:aws:s3:::bucket_name/*\"\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "\n", + "* (Optional) Amazon SNS access: Add `sns:Publish` on the topics you define. Apply this if you plan to use Amazon SNS to receive notifications.\n", + "\n", + "```json\n", + "{\n", + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\n", + " \"Action\": [\n", + " \"sns:Publish\"\n", + " ],\n", + " \"Effect\": \"Allow\",\n", + " \"Resource\": \"arn:aws:sns:us-east-2:123456789012:MyTopic\"\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "\n", + "* (Optional) KMS decrypt and encrypt permissions, if your Amazon S3 bucket is encrypted." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download the Input files and model from S3 bucket\n", + "!aws s3 cp --recursive s3://sagemaker-sample-files/models/async-inference/input-files/ input/\n", + "!aws s3 cp s3://sagemaker-sample-files/models/async-inference/demo-xgboost-model.tar.gz model/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify your SageMaker IAM Role (`sm_role`) and Amazon S3 bucket (`s3_bucket`). You can optionally use a default SageMaker Session IAM Role and Amazon S3 bucket. Make sure the role you use has the necessary permissions for SageMaker, Amazon S3, and optionally Amazon SNS."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sm_role = sagemaker.get_execution_role()\n", + "# Feel free to use your own role here\n", + "# sm_role = \"arn:aws:iam::123456789012:role/sagemaker-custom-role\"\n", + "print(f\"Using Role: {sm_role}\")\n", + "s3_bucket = sm_session.default_bucket()\n", + "print(f\"Will use bucket '{s3_bucket}' for storing all resources related to this notebook\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bucket_prefix = \"async-inference-demo\"\n", + "resource_name = \"AsyncInferenceDemo\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, you will create a model with `CreateModel`, an endpoint configuration with `CreateEndpointConfig`, and then an endpoint with the `CreateEndpoint` API.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.1 Create Model \n", + "Specify the location of the pre-trained model stored in Amazon S3. This example uses a pre-trained XGBoost model named `demo-xgboost-model.tar.gz`. The full Amazon S3 URI is stored in a string variable `model_url`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_s3_key = f\"{bucket_prefix}/demo-xgboost-model.tar.gz\"\n", + "model_url = f\"s3://{s3_bucket}/{model_s3_key}\"\n", + "print(f\"Uploading Model to {model_url}\")\n", + "\n", + "with open(\"model/demo-xgboost-model.tar.gz\", \"rb\") as model_file:\n", + " boto_session.resource(\"s3\").Bucket(s3_bucket).Object(model_s3_key).upload_fileobj(model_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify a primary container.
For the primary container, you specify the Docker image that contains inference code, artifacts (from prior training), and a custom environment map that the inference code uses when you deploy the model for predictions. In this example, we specify an XGBoost built-in algorithm container image." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker import image_uris\n", + "\n", + "# Specify an AWS container image and region as desired\n", + "container = image_uris.retrieve(region=region, framework=\"xgboost\", version=\"0.90-1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a model by specifying the `ModelName`, the `ExecutionRoleArn` (the ARN of the IAM role that Amazon SageMaker can assume to access model artifacts and Docker images for deployment), and the `PrimaryContainer`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_name = resource_name.format(\"Model\")\n", + "create_model_response = sm_client.create_model(\n", + " ModelName=model_name,\n", + " ExecutionRoleArn=sm_role,\n", + " PrimaryContainer={\n", + " \"Image\": container,\n", + " \"ModelDataUrl\": model_url,\n", + " },\n", + ")\n", + "\n", + "print(f\"Created Model: {create_model_response['ModelArn']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 Create EndpointConfig " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have a model, create an endpoint configuration with `CreateEndpointConfig`. Amazon SageMaker hosting services use this configuration to deploy models. In the configuration, you identify one or more models, created using the `CreateModel` API, to deploy and the resources that you want Amazon SageMaker to provision. Specify the `AsyncInferenceConfig` object and provide an output Amazon S3 location for `OutputConfig`.
You can optionally specify Amazon SNS topics on which to send notifications about prediction results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "endpoint_config_name = resource_name.format(\"EndpointConfig\")\n", + "create_endpoint_config_response = sm_client.create_endpoint_config(\n", + " EndpointConfigName=endpoint_config_name,\n", + " ProductionVariants=[\n", + " {\n", + " \"VariantName\": \"variant1\",\n", + " \"ModelName\": model_name,\n", + " \"InstanceType\": \"ml.m5.xlarge\",\n", + " \"InitialInstanceCount\": 1,\n", + " }\n", + " ],\n", + " AsyncInferenceConfig={\n", + " \"OutputConfig\": {\n", + " \"S3OutputPath\": f\"s3://{s3_bucket}/{bucket_prefix}/output\",\n", + " # Optionally specify Amazon SNS topics\n", + " # \"NotificationConfig\": {\n", + " # \"SuccessTopic\": \"arn:aws:sns:us-east-2:123456789012:MyTopic\",\n", + " # \"ErrorTopic\": \"arn:aws:sns:us-east-2:123456789012:MyTopic\",\n", + " # }\n", + " },\n", + " \"ClientConfig\": {\"MaxConcurrentInvocationsPerInstance\": 4},\n", + " },\n", + ")\n", + "print(f\"Created EndpointConfig: {create_endpoint_config_response['EndpointConfigArn']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.3 Create Endpoint " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have your model and endpoint configuration, use the `CreateEndpoint` API to create your endpoint. The endpoint name must be unique within an AWS Region in your AWS account." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "endpoint_name = resource_name.format(\"Endpoint\")\n", + "create_endpoint_response = sm_client.create_endpoint(\n", + " EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name\n", + ")\n", + "print(f\"Created Endpoint: {create_endpoint_response['EndpointArn']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Validate that the endpoint is created before invoking it:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "waiter = sm_client.get_waiter(\"endpoint_in_service\")\n", + "print(\"Waiting for endpoint to create...\")\n", + "waiter.wait(EndpointName=endpoint_name)\n", + "resp = sm_client.describe_endpoint(EndpointName=endpoint_name)\n", + "print(f\"Endpoint Status: {resp['EndpointStatus']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.4 Setup AutoScaling policy (Optional) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This section describes how to configure autoscaling on your asynchronous endpoint using Application Autoscaling. You need to first register your endpoint variant with Application Autoscaling, define a scaling policy, and then apply the scaling policy. In this configuration, we use a custom metric, `CustomizedMetricSpecification`, called `ApproximateBacklogSizePerInstance`. Please refer to the SageMaker Developer guide for a detailed list of metrics available with your asynchronous inference endpoint." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client = boto3.client(\n", + " \"application-autoscaling\"\n", + ") # Common class representing Application Auto Scaling for SageMaker amongst other services\n", + "\n", + "resource_id = (\n", + " \"endpoint/\" + endpoint_name + \"/variant/\" + \"variant1\"\n", + ") # This is the format in which application autoscaling references the endpoint\n", + "\n", + "# Configure Autoscaling on asynchronous endpoint down to zero instances\n", + "response = client.register_scalable_target(\n", + " ServiceNamespace=\"sagemaker\",\n", + " ResourceId=resource_id,\n", + " ScalableDimension=\"sagemaker:variant:DesiredInstanceCount\",\n", + " MinCapacity=0,\n", + " MaxCapacity=5,\n", + ")\n", + "\n", + "response = client.put_scaling_policy(\n", + " PolicyName=\"Invocations-ScalingPolicy\",\n", + " ServiceNamespace=\"sagemaker\", # The namespace of the AWS service that provides the resource.\n", + " ResourceId=resource_id, # Endpoint name\n", + " ScalableDimension=\"sagemaker:variant:DesiredInstanceCount\", # SageMaker supports only Instance Count\n", + " PolicyType=\"TargetTrackingScaling\", # 'StepScaling'|'TargetTrackingScaling'\n", + " TargetTrackingScalingPolicyConfiguration={\n", + " \"TargetValue\": 5.0, # The target value for the metric. 
- here the metric is - ApproximateBacklogSizePerInstance\n", + " \"CustomizedMetricSpecification\": {\n", + " \"MetricName\": \"ApproximateBacklogSizePerInstance\",\n", + " \"Namespace\": \"AWS/SageMaker\",\n", + " \"Dimensions\": [{\"Name\": \"EndpointName\", \"Value\": endpoint_name}],\n", + " \"Statistic\": \"Average\",\n", + " },\n", + " \"ScaleInCooldown\": 600, # The cooldown period helps you prevent your Auto Scaling group from launching or terminating\n", + " # additional instances before the effects of previous activities are visible.\n", + " # You can configure the length of time based on your instance startup time or other application needs.\n", + " # ScaleInCooldown - The amount of time, in seconds, after a scale in activity completes before another scale in activity can start.\n", + " \"ScaleOutCooldown\": 300 # ScaleOutCooldown - The amount of time, in seconds, after a scale out activity completes before another scale out activity can start.\n", + " # 'DisableScaleIn': True|False - Indicates whether scale in by the target tracking policy is disabled.\n", + " # If the value is true, scale in is disabled and the target tracking policy won't remove capacity from the scalable resource.\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The endpoint is now ready for invocation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "--- \n", + "## 2. Using the Endpoint \n", + "\n", + "### 2.1 Uploading the Request Payload " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, you need to upload the request to Amazon S3. We define a function called `upload_file` to make it easier to make multiple invocations in a later step."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "\n", + "def upload_file(input_location):\n", + " prefix = f\"{bucket_prefix}/input\"\n", + " return sm_session.upload_data(\n", + " input_location,\n", + " bucket=sm_session.default_bucket(),\n", + " key_prefix=prefix,\n", + " extra_args={\"ContentType\": \"text/libsvm\"},\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_1_location = \"input/test_point_0.libsvm\"\n", + "input_1_s3_location = upload_file(input_1_location)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1 Invoke Endpoint " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get inferences from the model hosted at your asynchronous endpoint with `InvokeEndpointAsync`. Specify the location of your inference data in the `InputLocation` field and the name of your endpoint for `EndpointName`. The response payload contains the output Amazon S3 location where the result will be placed. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = sm_runtime.invoke_endpoint_async(\n", + " EndpointName=endpoint_name, InputLocation=input_1_s3_location\n", + ")\n", + "output_location = response[\"OutputLocation\"]\n", + "print(f\"OutputLocation: {output_location}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Check Output Location " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check the output location to see if the inference has been processed. 
We make multiple requests (beginning of the `while True` statement in the `get_output` function) every two seconds until there is an output of the inference request: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import urllib, time\n", + "from botocore.exceptions import ClientError\n", + "\n", + "\n", + "def get_output(output_location):\n", + " output_url = urllib.parse.urlparse(output_location)\n", + " bucket = output_url.netloc\n", + " key = output_url.path[1:]\n", + " while True:\n", + " try:\n", + " return sm_session.read_s3_file(bucket=output_url.netloc, key_prefix=output_url.path[1:])\n", + " except ClientError as e:\n", + " if e.response[\"Error\"][\"Code\"] == \"NoSuchKey\":\n", + " print(\"waiting for output...\")\n", + " time.sleep(2)\n", + " continue\n", + " raise" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output = get_output(output_location)\n", + "print(f\"Output: {output}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3 Multiple Invocations " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following shows how you can invoke the endpoint with multiple requests:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inferences = []\n", + "for i in range(25):\n", + " input_file = f\"input/test_point_{i}.libsvm\"\n", + " input_file_s3_location = upload_file(input_file)\n", + " print(f\"Invoking Endpoint with {input_file}\")\n", + " response = sm_runtime.invoke_endpoint_async(\n", + " EndpointName=endpoint_name, InputLocation=input_file_s3_location\n", + " )\n", + " output_location = response[\"OutputLocation\"]\n", + " inferences += [(input_file, output_location)]\n", + " time.sleep(0.5)\n", + "\n", + "for input_file, output_location in inferences:\n", + " output = 
get_output(output_location)\n", + " print(f\"Input File: {input_file}, Output: {output}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Clean up " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you enabled auto-scaling for your endpoint, ensure you deregister the endpoint as a scalable target before deleting the endpoint. To do this, run the following:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "response = client.deregister_scalable_target(\n", + " ServiceNamespace='sagemaker',\n", + " ResourceId='resource_id',\n", + " ScalableDimension='sagemaker:variant:DesiredInstanceCount'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remember to delete your endpoint after use as you will be charged for the instances used in this Demo. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sm_client.delete_endpoint(EndpointName=endpoint_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may also want to delete any other resources you might have created such as SNS topics, S3 objects, etc." 
+ ] + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/async-inference/HuggingFace-Async-Inference-Walkthrough.ipynb b/async-inference/HuggingFace-Async-Inference-Walkthrough.ipynb new file mode 100644 index 0000000000..56e31e622a --- /dev/null +++ b/async-inference/HuggingFace-Async-Inference-Walkthrough.ipynb @@ -0,0 +1,498 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Amazon SageMaker Asynchronous Inference with Hugging Face Model\n", + "_**A new near real-time Inference option for generating machine learning model predictions**_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Table of Contents**\n", + "\n", + "* [Background](#background)\n", + "* [Notebook Scope](#scope)\n", + "* [Overview and sample end to end flow](#overview)\n", + "* [Section 1 - Setup](#setup) \n", + " * [Create Model](#createmodel)\n", + " * [Create EndpointConfig](#endpoint-config)\n", + " * [Create Endpoint](#create-endpoint)\n", + "* [Section 2 - Using the Endpoint](#endpoint) \n", + " * [Invoke Endpoint](#invoke-endpoint)\n", + " * [Check Output Location](#check-output) \n", + "* [Section 3 - Clean up](#clean)\n", + "\n", + "### Background \n", + "Amazon SageMaker Asynchronous Inference is a new capability in SageMaker that queues incoming requests and processes them asynchronously. 
SageMaker currently offers two inference options for customers to deploy machine learning models: 1) a real-time option for low-latency workloads 2) Batch transform, an offline option to process inference requests on batches of data available upfront. Real-time inference is suited for workloads with payload sizes of less than 6 MB and require inference requests to be processed within 60 seconds. Batch transform is suitable for offline inference on batches of data. \n", + "\n", + "Asynchronous inference is a new inference option for near real-time inference needs. Requests can take up to 15 minutes to process and have payload sizes of up to 1 GB. Asynchronous inference is suitable for workloads that do not have sub-second latency requirements and have relaxed latency requirements. For example, you might need to process an inference on a large image of several MBs within 5 minutes. In addition, asynchronous inference endpoints let you control costs by scaling down endpoints instance count to zero when they are idle, so you only pay when your endpoints are processing requests. \n", + "\n", + "### Notebook scope \n", + "This notebook provides an introduction on how to use the SageMaker Asynchronous inference capability with Hugging Face models. This notebook will cover the steps required to create an Asynchronous inference endpoint and test it with some sample requests. \n", + "\n", + "### Overview \n", + "Asynchronous inference endpoints have many similarities (and some key differences) compared to real-time endpoints. The process to create asynchronous endpoints is similar to real-time endpoints. You need to create: a model, an endpoint configuration, and then an endpoint. However, there are specific configuration parameters specific to asynchronous inference endpoints which we will explore below. \n", + "\n", + "Invocation of asynchronous endpoints differ from real-time endpoints. 
Rather than passing the request payload inline with the request, you upload the payload to Amazon S3 and pass an Amazon S3 URI as a part of the request. Upon receiving the request, SageMaker provides you with a token with the output location where the result will be placed once processed. Internally, SageMaker maintains a queue with these requests and processes them. During endpoint creation, you can optionally specify an Amazon SNS topic to receive success or error notifications. Once you receive the notification that your inference request has been successfully processed, you can access the result in the output Amazon S3 location. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 1. Setup " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First we ensure we have an updated version of SageMaker, which includes the latest SageMaker features:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import the required Python libraries:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install --upgrade pip --quiet\n", + "!pip install -U awscli --quiet\n", + "!pip install --upgrade sagemaker --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from time import gmtime, strftime\n", + "from sagemaker import image_uris\n", + "import sagemaker\n", + "import logging\n", + "import boto3\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logger = logging.getLogger(\"__name__\")\n", + "logger.setLevel(logging.DEBUG)\n", + "logger.addHandler(logging.StreamHandler())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logger.info(f\"Using SageMaker version: {sagemaker.__version__}\")" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "region = sagemaker.Session().boto_region_name\n", + "role = sagemaker.get_execution_role()\n", + "boto3.setup_default_session(region_name=region)\n", + "boto_session = boto3.Session(region_name=region)\n", + "sm_session = sagemaker.session.Session()\n", + "sagemaker_client = boto_session.client(\"sagemaker\")\n", + "sm_runtime = boto_session.client(\"sagemaker-runtime\")\n", + "s3_bucket = sm_session.default_bucket()\n", + "current_timestamp = strftime(\"%m-%d-%H-%M\", gmtime())\n", + "logger.info(f\"Region = {region}\")\n", + "logger.info(f\"Role = {role}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify your IAM role. Go to the AWS IAM console (https://console.aws.amazon.com/iam/home) and add the following policies to your IAM Role:\n", + "* SageMakerFullAccessPolicy\n", + "* Amazon S3 access: Apply this to get and put objects in your Amazon S3 bucket. Replace `bucket_name` with the name of your Amazon S3 bucket: \n", + "\n", + "```json\n", + "{\n", + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\n", + " \"Action\": [\n", + " \"s3:GetObject\",\n", + " \"s3:PutObject\",\n", + " \"s3:AbortMultipartUpload\",\n", + " \"s3:ListBucket\"\n", + " ],\n", + " \"Effect\": \"Allow\",\n", + " \"Resource\": \"arn:aws:s3:::bucket_name/*\"\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "\n", + "* (Optional) Amazon SNS access: Add `sns:Publish` on the topics you define. Apply this if you plan to use Amazon SNS to receive notifications.\n", + "\n", + "```json\n", + "{\n", + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\n", + " \"Action\": [\n", + " \"sns:Publish\"\n", + " ],\n", + " \"Effect\": \"Allow\",\n", + " \"Resource\": \"arn:aws:sns:us-east-2:123456789012:MyTopic\"\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "\n", + "* (Optional) KMS decrypt, encrypt if your Amazon S3 bucket is encrypted."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify your SageMaker IAM Role (`role`) and Amazon S3 bucket. You can optionally use a default SageMaker Session IAM Role and Amazon S3 bucket. Make sure the role you use has the necessary permissions for SageMaker, Amazon S3, and optionally Amazon SNS." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.1 Create Model \n", + "Specify the location of the pre-trained model stored in Amazon S3. This example uses a pre-trained Hugging Face model (https://huggingface.co/finiteautomata/beto-sentiment-analysis) packaged as sentimentanalysis.tar.gz. The full Amazon S3 URI is stored in a string variable `MODEL_DATA_URL`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MODEL_DATA_URL = \"s3://asyncendpointexperiment/sentimentanalysis.tar.gz\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify a primary container. For the primary container, you specify the Docker image that contains inference code, artifacts (from prior training), and a custom environment map that the inference code uses when you deploy the model for predictions. In this example, we retrieve the appropriate container image by specifying the right framework version and framework details. In this case, we retrieve the container image associated with the Hugging Face framework.
For further details on right container images to use for your use case please refer to this link https://github.com/awsdocs/amazon-sagemaker-developer-guide/blob/master/doc_source/ and look in to appropriate ecr folder pertaining to the region of your interest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ecr_image = image_uris.retrieve(\n", + " framework=\"huggingface\",\n", + " region=region,\n", + " version=\"4.6.1\",\n", + " image_scope=\"inference\",\n", + " base_framework_version=\"pytorch1.7.1\",\n", + " py_version=\"py36\",\n", + " container_version=\"ubuntu18.04\",\n", + " instance_type=\"ml.m5.xlarge\",\n", + ")\n", + "ecr_image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_name = f\"beto-sentiment-analysis-async\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a model by specifying the `ModelName`, the `ExecutionRoleARN` (the ARN of the IAM role that Amazon SageMaker can assume to access model artifacts/ docker images for deployment), and the `PrimaryContainer`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = sagemaker_client.create_model(\n", + " ModelName=model_name,\n", + " ExecutionRoleArn=role,\n", + " PrimaryContainer={\n", + " \"Image\": ecr_image,\n", + " \"ModelDataUrl\": MODEL_DATA_URL,\n", + " \"Environment\": {\n", + " \"HF_MODEL_ID\": \"finiteautomata/beto-sentiment-analysis\",\n", + " \"HF_TASK\": \"text-classification\",\n", + " \"SAGEMAKER_CONTAINER_LOG_LEVEL\": \"20\",\n", + " \"SAGEMAKER_REGION\": region,\n", + " },\n", + " },\n", + ")\n", + "model_arn = response[\"ModelArn\"]\n", + "\n", + "logger.info(f\"Created Model: {model_arn}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "endpoint_config_name = model_name" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 Create EndpointConfig " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have a model, create an endpoint configuration with CreateEndpointConfig. Amazon SageMaker hosting services uses this configuration to deploy models. In the configuration, you identify one or more models that were created with the CreateModel API, to deploy the resources that you want Amazon SageMaker to provision. Specify the AsyncInferenceConfig object and provide an output Amazon S3 location for OutputConfig. You can optionally specify Amazon SNS topics on which to send notifications about prediction results."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = sagemaker_client.create_endpoint_config(\n", + " EndpointConfigName=endpoint_config_name,\n", + " ProductionVariants=[\n", + " {\n", + " \"VariantName\": \"variant-1\",\n", + " \"ModelName\": model_name,\n", + " \"InstanceType\": \"ml.m5.xlarge\",\n", + " \"InitialInstanceCount\": 1,\n", + " }\n", + " ],\n", + " AsyncInferenceConfig={\n", + " \"OutputConfig\": {\n", + " \"S3OutputPath\": f\"s3://{s3_bucket}/output\",\n", + " # Optionally specify Amazon SNS topics\n", + " # \"NotificationConfig\": {\n", + " # \"SuccessTopic\": \"arn:aws:sns:us-east-2:123456789012:MyTopic\",\n", + " # \"ErrorTopic\": \"arn:aws:sns:us-east-2:123456789012:MyTopic\",\n", + " # }\n", + " },\n", + " },\n", + ")\n", + "endpoint_config_arn = response[\"EndpointConfigArn\"]\n", + "logger.info(f\"Created EndpointConfig: {endpoint_config_arn}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.3 Create Endpoint " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have your model and endpoint configuration, use the CreateEndpoint API to create your endpoint. The endpoint name must be unique within an AWS Region in your AWS account." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "endpoint_name = model_name\n", + "response = sagemaker_client.create_endpoint(\n", + " EndpointName=\"HuggingFaceAsyncEndpoint\", EndpointConfigName=\"beto-sentiment-analysis-async\"\n", + ")\n", + "endpoint_arn = response[\"EndpointArn\"]\n", + "logger.info(f\"Created Endpoint: {endpoint_arn}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "--- \n", + "## 2. 
Using the Endpoint \n", + "\n", + "### 2.1 Uploading the Request Payload " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sample input.json placed in the input location\n", + "\n", + "{\"inputs\": [\"I like you. I love you\",\"This is sad\",\"am so happy that i want to cry\",\"async endpoints are awesome\"]}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_s3_location = f\"s3://{s3_bucket}/input/input.json\"\n", + "print(input_s3_location)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1 Invoke Endpoint " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get inferences from the model hosted at your asynchronous endpoint with InvokeEndpointAsync. Specify the location of your inference data in the InputLocation field and the name of your endpoint for EndpointName. The response payload contains the output Amazon S3 location where the result will be placed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = sm_runtime.invoke_endpoint_async(\n", + " EndpointName=\"HuggingFaceAsyncEndpoint\",\n", + " InputLocation=input_s3_location,\n", + " ContentType=\"application/json\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Check Output Location " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check the output location to see if the inference has been processed." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sample inference output processed and placed in the output location\n", + "\n", + "[{\"label\":\"POS\",\"score\":0.9982852339744568},{\"label\":\"NEG\",\"score\":0.9333241581916809},{\"label\":\"POS\",\"score\":0.595783531665802},{\"label\":\"NEU\",\"score\":0.9964613318443298}]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Summary & Clean up " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To summarize, in this notebook we learned how to use the SageMaker Asynchronous inference capability with pre-trained Hugging Face models." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you enabled auto-scaling for your endpoint, ensure you deregister the endpoint as a scalable target before deleting the endpoint. To do this, run the following:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = client.deregister_scalable_target(\n", + " ServiceNamespace=\"sagemaker\",\n", + " ResourceId=\"resource_id\",\n", + " ScalableDimension=\"sagemaker:variant:DesiredInstanceCount\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remember to delete your endpoint after use as you will be charged for the instances used in this Demo. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may also want to delete any other resources you might have created such as SNS topics, S3 objects, etc."
+ ] + } + ], + "metadata": { + "instance_type": "ml.g4dn.xlarge", + "kernelspec": { + "display_name": "Python 3 (PyTorch 1.6 Python 3.6 GPU Optimized)", + "language": "python", + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:eu-west-1:470317259841:image/pytorch-1.6-gpu-py36-cu110-ubuntu18.04-v3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/async-inference/images/e2e.png b/async-inference/images/e2e.png new file mode 100644 index 0000000000..8cf8b7bc30 Binary files /dev/null and b/async-inference/images/e2e.png differ diff --git a/autopilot/custom-feature-selection/Feature_selection_autopilot.ipynb b/autopilot/custom-feature-selection/Feature_selection_autopilot.ipynb index 88a303e4b8..45aa1b4801 100644 --- a/autopilot/custom-feature-selection/Feature_selection_autopilot.ipynb +++ b/autopilot/custom-feature-selection/Feature_selection_autopilot.ipynb @@ -49,16 +49,16 @@ }, "outputs": [], "source": [ - "# S3 prefix\n", - "bucket = \"qqnl-autopilot\"\n", - "prefix = \"reuse-autopilot-blog\"\n", - "\n", "import sagemaker\n", "import os\n", "from sagemaker import get_execution_role\n", "\n", "sagemaker_session = sagemaker.Session()\n", "\n", + "# S3 prefix\n", + "bucket = sagemaker_session.default_bucket()\n", + "prefix = \"reuse-autopilot-blog\"\n", + "\n", "# Get a SageMaker-compatible role used by this Notebook Instance.\n", "role = get_execution_role()" ] @@ -176,11 +176,11 @@ "\n", "import argparse\n", "import csv\n", + "import joblib\n", "import json\n", "import numpy as np\n", "import pandas as pd\n", "\n", - "from sklearn.externals import joblib\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.svm import SVR\n", @@ -319,7 
+319,7 @@ "\n", "* __entry_point__: The path to the Python script SageMaker runs for training and prediction.\n", "* __role__: Role ARN\n", - "* __train_instance_type__ *(optional)*: The type of SageMaker instances for training. __Note__: Because Scikit-learn does not natively support GPU training, Sagemaker Scikit-learn does not currently support training on GPU instance types.\n", + "* __instance_type__ *(optional)*: The type of SageMaker instances for training. __Note__: Because Scikit-learn does not natively support GPU training, Sagemaker Scikit-learn does not currently support training on GPU instance types.\n", "* __sagemaker_session__ *(optional)*: The session used to train on Sagemaker.\n", "\n", "To see the code for the SKLearn Estimator, see here: https://github.com/aws/sagemaker-python-sdk/tree/master/src/sagemaker/sklearn" @@ -342,8 +342,10 @@ " entry_point=script_path,\n", " role=role,\n", " output_path=model_output_path,\n", - " train_instance_type=\"ml.c4.xlarge\",\n", + " instance_type=\"ml.c4.xlarge\",\n", " sagemaker_session=None,\n", + " framework_version=\"0.23-1\",\n", + " py_version=\"py3\",\n", ")\n", "\n", "sklearn_preprocessor.fit({\"train\": train_input})" @@ -387,7 +389,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.externals import joblib\n", + "import joblib\n", "\n", "feature_list = list(joblib.load(\"selected_feature_names.joblib\"))\n", "print(feature_list)" @@ -692,7 +694,7 @@ "pipeline_endpoint_config_name = \"pipeline-blog-endpoint-config-\" + time_stamp\n", "pipeline_endpoint_name = \"pipeline-blog-endpoint-\" + time_stamp\n", "\n", - "sklearn_image = sklearn_preprocessor.image_name\n", + "sklearn_image = sklearn_preprocessor.image_uri\n", "container_1_source = sklearn_preprocessor.latest_training_job.describe()[\"HyperParameters\"][\n", " \"sagemaker_submit_directory\"\n", "][1:-1]\n", @@ -795,19 +797,18 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.predictor import RealTimePredictor, 
csv_serializer\n", - "from sagemaker.content_types import CONTENT_TYPE_CSV\n", + "from sagemaker.predictor import Predictor\n", + "from sagemaker.serializers import IdentitySerializer\n", + "from sagemaker.deserializers import CSVDeserializer\n", "\n", - "predictor = RealTimePredictor(\n", - " endpoint=pipeline_endpoint_name,\n", - " serializer=csv_serializer,\n", + "predictor = Predictor(\n", + " endpoint_name=pipeline_endpoint_name,\n", " sagemaker_session=sagemaker_session,\n", - " content_type=CONTENT_TYPE_CSV,\n", - " accept=CONTENT_TYPE_CSV,\n", + " serializer=IdentitySerializer(content_type=\"text/csv\"),\n", + " deserializer=CSVDeserializer(),\n", ")\n", "\n", - "predictor.content_type = \"text/csv\"\n", - "predictor.predict(test_data.to_csv(sep=\",\", header=True, index=False)).decode(\"utf-8\")" + "predictor.predict(test_data.to_csv(sep=\",\", header=True, index=False))" ] }, { @@ -827,13 +828,6 @@ "sm_client = sagemaker_session.boto_session.client(\"sagemaker\")\n", "sm_client.delete_endpoint(EndpointName=pipeline_endpoint_name)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -853,7 +847,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/autopilot/custom-feature-selection/sklearn_feature_selection.py b/autopilot/custom-feature-selection/sklearn_feature_selection.py index e63633f754..3954fb2022 100644 --- a/autopilot/custom-feature-selection/sklearn_feature_selection.py +++ b/autopilot/custom-feature-selection/sklearn_feature_selection.py @@ -2,6 +2,7 @@ import argparse import csv +import joblib import json import os import shutil @@ -19,7 +20,6 @@ transformer, worker, ) -from sklearn.externals import joblib from sklearn.feature_selection import RFE, SelectKBest, f_regression, mutual_info_regression from sklearn.impute import SimpleImputer from sklearn.pipeline 
import Pipeline diff --git a/aws_marketplace/README.md b/aws_marketplace/README.md index 4b67dea7c1..c2c327ce08 100644 --- a/aws_marketplace/README.md +++ b/aws_marketplace/README.md @@ -40,6 +40,8 @@ These examples show you how to use model-packages and algorithms from AWS Market - [Using Dataset Products](using_data) - [Using dataset from AWS Data Exchange with ML model from AWS Marketplace](using_data/using_data_with_ml_model) is a sample notebook which shows how a dataset from AWS Data Exchange can be used with an ML Model Package from AWS Marketplace. + - [Using Shutterstock Image Datasets to train Image Classification Models](using_data/image_classification_with_shutterstock_image_datasets) provides a detailed walkthrough on how to use the [Free Sample: Images & Metadata of “Whole Foods” Shoppers](https://aws.amazon.com/marketplace/pp/prodview-y6xuddt42fmbu?qid=1623195111604&sr=0-1&ref_=srh_res_product_title#offers) from Shutterstock's Image Datasets to train a multi-label image classification model using Shutterstock's pre-labeled image assets. You can learn more about this implementation [from this blog post](https://aws.amazon.com/blogs/awsmarketplace/using-shutterstocks-image-datasets-to-train-your-computer-vision-models/). + ## FAQ *What do I need in order to get started?* diff --git a/aws_marketplace/index.rst b/aws_marketplace/index.rst index 383aeb5270..7b5c0475a0 100644 --- a/aws_marketplace/index.rst +++ b/aws_marketplace/index.rst @@ -38,7 +38,7 @@ Use AWS Data Exchange products .. 
toctree:: :maxdepth: 0 - using_data/using_data_with_ml_model/using-dataset-product-from-aws-data-exchange-with-ml-model-from-aws-marketplace + using_data/image_classification_with_shutterstock_image_datasets/image-classification-with-shutterstock-datasets Use AWS Marketplace algorithms diff --git a/aws_marketplace/using_algorithms/autogluon/autogluon_tabular_marketplace.ipynb b/aws_marketplace/using_algorithms/autogluon/autogluon_tabular_marketplace.ipynb index 67bc1c3d81..7181491f84 100644 --- a/aws_marketplace/using_algorithms/autogluon/autogluon_tabular_marketplace.ipynb +++ b/aws_marketplace/using_algorithms/autogluon/autogluon_tabular_marketplace.ipynb @@ -105,26 +105,16 @@ "algorithm_arn = AlgorithmArnProvider.get_algorithm_arn(region)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import subprocess\n", - "\n", - "subprocess.run(\"apt-get update -y\", shell=True)\n", - "subprocess.run(\"apt install unzip\", shell=True)" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 3: Get the data\n", "\n", - "In this example we'll use the direct-marketing dataset to build a binary classification model that predicts whether customers will accept or decline a marketing offer. \n", - "First we'll download the data and split it into train and test sets. AutoGluon does not require a separate validation set (it uses bagged k-fold cross-validation)." + "In this example we'll use the [UCI Machine Learning Repository: Adult Data Set](https://archive.ics.uci.edu/ml/datasets/adult) [1] to build a binary classification model that predicts whether a person's annual income exceeds $50K. \n", + "First we'll download the pre-split train and test sets. AutoGluon does not require a separate validation set (it uses bagged k-fold cross-validation).\n", + "\n", + "[1] Dua, D. and Graff, C. (2019). [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml).
Irvine, CA: University of California, School of Information and Computer Science." ] }, { @@ -133,23 +123,17 @@ "metadata": {}, "outputs": [], "source": [ - "# Download and unzip the data\n", - "subprocess.run(\n", - " f\"aws s3 cp --region {region} s3://sagemaker-sample-data-{region}/autopilot/direct_marketing/bank-additional.zip .\",\n", - " shell=True,\n", - ")\n", - "subprocess.run(\"unzip -qq -o bank-additional.zip\", shell=True)\n", - "subprocess.run(\"rm bank-additional.zip\", shell=True)\n", - "\n", - "local_data_path = \"./bank-additional/bank-additional-full.csv\"\n", - "data = pd.read_csv(local_data_path)\n", + "# Download the data\n", + "s3 = boto3.client(\"s3\")\n", + "s3.download_file(\"autogluon\", \"datasets/Inc/train.csv\", \"train.csv\")\n", + "s3.download_file(\"autogluon\", \"datasets/Inc/test.csv\", \"test.csv\")\n", "\n", "# Split train/test data\n", - "train = data.sample(frac=0.7, random_state=42)\n", - "test = data.drop(train.index)\n", + "train = pd.read_csv('train.csv')\n", + "test = pd.read_csv('test.csv')\n", "\n", "# Split test X/y\n", - "label = \"y\"\n", + "label = \"class\"\n", "y_test = test[label]\n", "X_test = test.drop(columns=[label])" ] @@ -220,7 +204,7 @@ "outputs": [], "source": [ "# Define required label and optional additional parameters\n", - "init_args = {\"label\": \"y\"}\n", + "init_args = {\"label\": \"class\"}\n", "\n", "# Define additional parameters\n", "fit_args = {\n", @@ -434,7 +418,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/aws_marketplace/using_data/image_classification_with_shutterstock_image_datasets/README.md b/aws_marketplace/using_data/image_classification_with_shutterstock_image_datasets/README.md new file mode 100644 index 0000000000..451b073693 --- /dev/null +++ b/aws_marketplace/using_data/image_classification_with_shutterstock_image_datasets/README.md @@ -0,0 +1,25 @@ +## Using 
Shutterstock's image datasets to train a multi-label image classification model + +Welcome! This directory contains the Amazon SageMaker notebook code used in the blog post [**Using Shutterstock's image datasets to train your computer vision models**](https://aws.amazon.com/blogs/awsmarketplace/using-shutterstocks-image-datasets-to-train-your-computer-vision-models/). + + +For this example, we use the Free Sample: Images & Metadata of “Whole Foods” Shoppers dataset from Shutterstock’s Image Datasets to demonstrate how to train a multi-label image classification model using Shutterstock’s prelabeled image assets. This dataset can be found in the [AWS Data Exchange](https://aws.amazon.com/data-exchange/) and contains images of Whole Foods shoppers. Each image is tagged with 7-50 keywords describing what is seen in the image. + + +You can get started using this notebook by following the steps outlined in [**Using Shutterstock's image datasets to train your computer vision models**](https://aws.amazon.com/blogs/awsmarketplace/using-shutterstocks-image-datasets-to-train-your-computer-vision-models/). + +At a high level, setup involves these 6 steps: + +1. Subscribe to the [Free Sample: Images & Metadata of “Whole Foods” Shoppers dataset](https://aws.amazon.com/marketplace/pp/prodview-y6xuddt42fmbu?sr=0-1&ref_=beagle&applicationId=AWSMPContessa) from Shutterstock’s Image Datasets. Export this dataset to an S3 bucket. +2. Create an Amazon SageMaker Notebook instance by navigating to the [Amazon SageMaker console](https://console.aws.amazon.com/sagemaker/home), choosing **Notebook instances** from the *Notebook* section, and selecting **Create notebook**. For the development of this notebook, we used an `ml.t2.medium`. Make sure the SageMaker role has access to your Shutterstock Image Dataset S3 bucket. Note that charges apply. +3. Once your notebook instance is ready, click **Open Jupyter** and navigate to the **SageMaker Examples** tab.
Scroll down to find the **AWS Marketplace** section and select it to expand and view the notebooks available. +4. Locate the notebook named `image-classification-with-shutterstock-datasets.ipynb`, select **Use**, and then select **Create copy** to copy the notebook into your environment. +5. Locate the `TO DO` items to fill in the name of the S3 bucket that is being used to store your image datasets. Also be sure to enter any prefixes (if applicable) that the images are stored under. +6. Walk through the notebook step-by-step and select **Run** to run each cell of code. + +## License + +This library is licensed under the MIT-0 License. See the LICENSE file. + +By subscribing to the dataset [Free Sample: Images & Metadata of "Whole Foods" Shoppers](https://aws.amazon.com/marketplace/pp/prodview-y6xuddt42fmbu?sr=0-1&ref_=beagle&applicationId=AWSMPContessa), you are also subject to the terms defined in the dataset's [Usage information](https://aws.amazon.com/marketplace/pp/prodview-y6xuddt42fmbu?ref_=srh_res_product_title#usage). + diff --git a/aws_marketplace/using_data/image_classification_with_shutterstock_image_datasets/image-classification-with-shutterstock-datasets.ipynb b/aws_marketplace/using_data/image_classification_with_shutterstock_image_datasets/image-classification-with-shutterstock-datasets.ipynb new file mode 100644 index 0000000000..5aaa01fb43 --- /dev/null +++ b/aws_marketplace/using_data/image_classification_with_shutterstock_image_datasets/image-classification-with-shutterstock-datasets.ipynb @@ -0,0 +1,532 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Using Shutterstock's Image datasets to train a multi-label image classification model" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "# Introduction\n", + "\n", + "This example of **multi-label image classification** trains the **Amazon SageMaker 1P image classification algorithm**.
We will use the Amazon SageMaker image classification algorithm in transfer learning mode to fine-tune a pre-trained model (trained on ImageNet data) to learn to classify a new multi-label dataset. The pre-trained model will be fine-tuned using the [Free Sample: Images & Metadata of “Whole Foods” Shoppers dataset from Shutterstock’s Image Datasets](https://aws.amazon.com/marketplace/pp/prodview-y6xuddt42fmbu?ref_=srh_res_product_title). \n", + "\n", + "You can learn more about this implementation from [**Using Shutterstock's image datasets to train your computer vision models**](https://aws.amazon.com/blogs/awsmarketplace/using-shutterstocks-image-datasets-to-train-your-computer-vision-models/).\n", + "\n", + "To get started, we need to set up the environment with a few prerequisite steps, for permissions, configurations, and so on.\n", + "\n", + "Note: This notebook is a modification of the existing [SageMaker example notebook for multi-label image classification](https://github.com/aws/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/imageclassification_mscoco_multi_label/Image-classification-multilabel-lst.ipynb)." + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Prerequisites\n", + "\n", + "### Step 1: Set permissions and environment variables\n", + "\n", + "Here we set up the pointers and authentication to AWS services. There are four parts to this:\n", + "\n", + "* The roles used to give learning and hosting access to your data.
This will automatically be obtained from the role used to start the notebook\n", + "* The S3 bucket that you want to use for SageMaker training and model data\n", + "* The S3 bucket that is storing the images from the Shutterstock Image Dataset for Whole Foods Shoppers\n", + "* The SageMaker image classification docker image (which we will not need to change)\n", + "\n", + "**FILL IN YOUR S3 BUCKET NAME:** For our `images_bucket` variable, replace the empty string `\"\"` with the name of the S3 bucket that contains your Shutterstock Image Dataset. If you stored your images under a prefix, please also update the `images_bucket_prefix` variable with this prefix, including a trailing `/`." + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "import boto3\n", + "import sagemaker\n", + "from sagemaker import get_execution_role\n", + "from sagemaker.amazon.amazon_estimator import get_image_uri\n", + "\n", + "# SageMaker notebook execution role\n", + "role = get_execution_role()\n", + "print(\"Sagemaker Execution Role: {}\".format(role))\n", + "\n", + "# SageMaker S3 Bucket for training and model data\n", + "sagemaker_bucket = sagemaker.Session().default_bucket()\n", + "sagemaker_bucket_prefix = \"wholefoods-ic-multilabel\"\n", + "print(\"Using S3 Bucket for SageMaker training and model data: {}\".format(sagemaker_bucket))\n", + "\n", + "# SageMaker Training image for image classification\n", + "training_image = sagemaker.image_uris.retrieve(\n", + " region=sagemaker.Session().boto_region_name, framework=\"image-classification\", version=\"latest\"\n", + ")\n", + "print(\"Using Training Image: {}\".format(training_image))\n", + "\n", + "\n", + "images_adx_directory = \"adx_free_data_sample/wholefoods/\"\n", + "\n", + "# TO DO: enter the name of the S3 bucket that contains your Shutterstock Image Dataset\n", + "images_bucket = \"\"\n", + "\n", + "# TO DO: enter the prefix (if applicable) where your images are stored, if you stored them under an additional prefix\n", +
"images_bucket_prefix = \"\"\n", + "\n", + "print(\"Using S3 Bucket with Training Images: {}\".format(images_bucket))\n", + "\n", + "# Pointer to image metadata file\n", + "image_metadata_filename = \"AWS_Wholefood Shoppers_ Metadata.csv\"" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Data Preparation\n", + "\n", + "The Shutterstock Image Datasets are collections of curated images from Shutterstock’s library of more than 370 million images. You can subscribe to one of the preexisting collections, such as Food & Beverage, Clothing, or Hospitality, or you can work with the Shutterstock Data Exchange team to request a custom collection of images for your use case. Each image includes a descriptive title with up to 200 characters and an optimal 7-50 keywords. \n", + "\n", + "For this example, we use the [Free Sample: Images & Metadata of “Whole Foods” Shoppers dataset from Shutterstock’s Image Datasets](https://aws.amazon.com/marketplace/pp/prodview-y6xuddt42fmbu?ref_=srh_res_product_title) to demonstrate how to train a multi-label image classification model using Shutterstock’s pre-labeled image assets. This dataset contains images of Whole Foods shoppers. Each image is tagged with 30-50 keywords describing what is seen in the image.\n", + "\n", + "An image can contain objects of multiple keywords. Because the purpose of this notebook is to show how the Shutterstock dataset can be used to train for multi-label image classification, we will limit the keywords that we are training on for demonstration purposes. We have a limited set of 200 images in our free sample.\n", + "\n", + "To start, we will train the model to learn to classify the first 5 keywords that appear in our dataset.\n", + "\n", + "1. Adult\n", + "2. Beautiful\n", + "3. Buying\n", + "4. Car\n", + "5. Casual\n", + "\n", + "\n", + "The image classification algorithm can take two types of input formats. 
The first is a [recordIO file format](https://mxnet.apache.org/versions/1.7.0/api/faq/recordio) and the other is an LST file format. We will use the LST file format for training. " + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Step 2: Parse the metadata .csv file to create a .lst file for training\n", + "A .lst file is a tab-separated file with three columns that contains a list of image files. The first column specifies the image index, the second column specifies the class label index for the image, and the third column specifies the relative path of the image file. The image index in the first column should be unique across all the images. \n", + "\n", + "The Whole Foods Shoppers dataset comes with a metadata file in CSV format. We will use this to generate our .lst file. This CSV metadata file has the following headers in the first row: 'ASSET_ID' | 'COLLECTION' |'KEYWORDS' | 'DESCRIPTION'. \n", + "\n" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "import numpy as np\n", + "import csv\n", + "\n", + "origin_file = images_bucket_prefix + image_metadata_filename\n", + "destination_file = \"./%s\" % image_metadata_filename\n", + "\n", + "s3 = boto3.resource(\"s3\")\n", + "s3.Bucket(images_bucket).download_file(origin_file, destination_file)\n", + "\n", + "# Create a dictionary (key: keyword, value: list of associated images IDs)\n", + "keyword_images = {}\n", + "\n", + "# convert the metadata CSV file into a nested list\n", + "with open(destination_file, \"r\", encoding=\"UTF-8\", newline=\"\") as f:\n", + " reader = csv.reader(f)\n", + " data = list(reader)\n", + "\n", + " # CSV header in Row 1 should be ['ASSET_ID, 'COLLECTION', 'KEYWORDS', 'DESCRIPTION']\n", + " print(\"Data Columns: {}\".format(data[0]))\n", + "\n", + " # Remove header row\n", + " data.pop(0)\n", + "\n", + " # Map the keywords to the associated image IDs\n", + " for row in data:\n", + " image_id = row[0]\n", + " 
image_keywords = row[2].split(\",\")\n", + "\n", + " for word in image_keywords:\n", + " if word not in keyword_images:\n", + " keyword_images[word] = []\n", + " keyword_images[word].append(image_id)\n", + "\n", + "\n", + "# Train for the first 5 keywords\n", + "num_training_keywords = 5\n", + "training_keywords = list(keyword_images.keys())[0:num_training_keywords]\n", + "\n", + "print(\"Training for the following Keywords: {}\".format(training_keywords))\n", + "\n", + "# obtain image ids and labels for images with these 5 keywords\n", + "image_ids = []\n", + "labels = []\n", + "\n", + "for index, key in enumerate(training_keywords):\n", + " for image_id in keyword_images[key]:\n", + " if image_id in image_ids:\n", + " labels[image_ids.index(image_id)][index] = 1\n", + " else:\n", + " image_ids.append(image_id)\n", + " labels.append(np.zeros(len(training_keywords), dtype=np.int))\n", + " labels[-1][index] = 1\n", + "\n", + "# Construct the LST file from the image ids and labels\n", + "# The first column is the image index, the last is the image filename\n", + "# and the second to last but one are the labels\n", + "with open(\"image.lst\", \"w\") as fp:\n", + " sum_labels = labels[0]\n", + " for ind, image_id in enumerate(image_ids):\n", + " image_path = \"{}.jpg\".format(image_id)\n", + " label_h = labels[ind]\n", + " sum_labels += label_h\n", + " fp.write(str(ind) + \"\\t\")\n", + " for j in label_h:\n", + " fp.write(str(j) + \"\\t\")\n", + " fp.write(image_path)\n", + " fp.write(\"\\n\")\n", + " fp.close()\n", + "print(sum_labels)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "training_keywords" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Step 3: Create the training and validation datasets\n", + "Create training and validation set by splitting the LST file. Limit images to the top 150 images. 
Use 70% of the images for training and 30% of the images for validation." + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "!shuf -n 150 image.lst > im.lst\n", + "!head -n 105 im.lst > sstkimagestrain.lst\n", + "!tail -n 45 im.lst > sstkimagesval.lst\n", + "!head sstkimagestrain.lst\n", + "!wc -l sstkimagestrain.lst\n", + "!wc -l sstkimagesval.lst" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Step 4: Copy the training data to your SageMaker S3 Bucket\n", + "Upload the data into the SageMaker S3 bucket. The images are uploaded under the train and validation prefixes. The LST files are uploaded under train_lst and validation_lst prefixes. " + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "# Four channels: train, validation, train_lst, and validation_lst\n", + "s3_train = \"s3://{}/{}/train/\".format(sagemaker_bucket, sagemaker_bucket_prefix)\n", + "s3_validation = \"s3://{}/{}/validation/\".format(sagemaker_bucket, sagemaker_bucket_prefix)\n", + "s3_train_lst = \"s3://{}/{}/train_lst/\".format(sagemaker_bucket, sagemaker_bucket_prefix)\n", + "s3_validation_lst = \"s3://{}/{}/validation_lst/\".format(sagemaker_bucket, sagemaker_bucket_prefix)\n", + "\n", + "\n", + "s3_img_source = \"s3://{}/{}{}\".format(images_bucket, images_bucket_prefix, images_adx_directory)\n", + "\n", + "\n", + "# upload the image files to train and validation channels\n", + "!aws s3 sync $s3_img_source $s3_train\n", + "!aws s3 sync $s3_img_source $s3_validation\n", + "\n", + "# upload the LST files to train_lst and validation_lst channels\n", + "!aws s3 cp sstkimagestrain.lst $s3_train_lst --quiet\n", + "!aws s3 cp sstkimagesval.lst $s3_validation_lst --quiet" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Training our Multi-Label Image Classification Model\n", + "Now that we have prepared our training 
data, we are ready to train our image classification model. First, we will create a SageMaker Estimator object. This estimator will launch the training job.\n", + "\n", + "### Step 5: Set the training parameters\n", + "There are two kinds of parameters that need to be set for training. The first are the parameters for the training job. These include:\n", + "\n", + "* **Training instance count**: This is the number of instances on which to run the training. When the number of instances is greater than one, then the image classification algorithm will run in distributed settings. \n", + "* **Training instance type**: This indicates the type of machine on which to run the training. Here, we will use ml.p2.xlarge.\n", + "* **Output path**: This is the S3 folder in which the training output is stored" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "s3_output_location = \"s3://{}/{}/output\".format(sagemaker_bucket, sagemaker_bucket_prefix)\n", + "multilabel_ic = sagemaker.estimator.Estimator(\n", + " training_image,\n", + " role,\n", + " instance_count=1,\n", + " instance_type=\"ml.p2.xlarge\",\n", + " volume_size=50,\n", + " max_run=360000,\n", + " input_mode=\"File\",\n", + " output_path=s3_output_location,\n", + " sagemaker_session=sagemaker.Session(),\n", + ")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Step 6: Set the algorithm parameters\n", + "\n", + "Apart from the above set of parameters, there are hyperparameters that are specific to the algorithm. These are:\n", + "\n", + "* **num_layers**: The number of layers (depth) for the network.\n", + "* **use_pretrained_model**: Set to 1 to use pretrained model for transfer learning.\n", + "* **image_shape**: The input image dimensions,'num_channels, height, width', for the network. It should be no larger than the actual image size.
The number of channels should be same as the actual image.\n", + "* **num_classes**: This is the number of output classes for the dataset.\n", + "* **mini_batch_size**: The number of training samples used for each mini batch. In distributed training, the number of training samples used per batch will be N * mini_batch_size where N is the number of hosts on which training is run\n", + "* **resize**: Resize the image before using it for training. The images are resized so that the shortest side is of this parameter. If the parameter is not set, then the training data is used as such without resizing.\n", + "* **epochs**: Number of training epochs\n", + "* **learning_rate**: Learning rate for training\n", + "* **num_training_samples**: This is the total number of training samples. \n", + "* **use_weighted_loss**: This parameter is used to balance the influence of the positive and negative samples within the dataset.\n", + "* **augmentation_type**: This parameter determines the type of augmentation used for training. It can take on three values, 'crop', 'crop_color' and 'crop_color_transform'\n", + "* **precision_dtype**: The data type precision used during training. Using ``float16`` can lead to faster training with minimal drop in accuracy, particularly on P3 machines. 
By default, the parameter is set to ``float32``\n", + "* **multi_label**: Set multi_label to 1 for multi-label processing\n", + "\n", + "You can find a detailed description of all the algorithm parameters at https://docs.aws.amazon.com/sagemaker/latest/dg/IC-Hyperparameter.html" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "multilabel_ic.set_hyperparameters(\n", + " num_layers=18,\n", + " use_pretrained_model=1,\n", + " image_shape=\"3,224,224\",\n", + " num_classes=5,\n", + " mini_batch_size=25,\n", + " resize=256,\n", + " epochs=5,\n", + " learning_rate=0.001,\n", + " num_training_samples=100,\n", + " use_weighted_loss=1,\n", + " augmentation_type=\"crop_color_transform\",\n", + " precision_dtype=\"float32\",\n", + " multi_label=1,\n", + ")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Step 7: Specify the input data \n", + "Set the data type and channels used for training. In this training, we use application/x-image content type that require individual images and LST file for data input. 
" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "train_data = sagemaker.inputs.TrainingInput(\n", + " s3_train,\n", + " distribution=\"FullyReplicated\",\n", + " content_type=\"application/x-image\",\n", + " s3_data_type=\"S3Prefix\",\n", + ")\n", + "validation_data = sagemaker.inputs.TrainingInput(\n", + " s3_validation,\n", + " distribution=\"FullyReplicated\",\n", + " content_type=\"application/x-image\",\n", + " s3_data_type=\"S3Prefix\",\n", + ")\n", + "train_data_lst = sagemaker.inputs.TrainingInput(\n", + " s3_train_lst,\n", + " distribution=\"FullyReplicated\",\n", + " content_type=\"application/x-image\",\n", + " s3_data_type=\"S3Prefix\",\n", + ")\n", + "validation_data_lst = sagemaker.inputs.TrainingInput(\n", + " s3_validation_lst,\n", + " distribution=\"FullyReplicated\",\n", + " content_type=\"application/x-image\",\n", + " s3_data_type=\"S3Prefix\",\n", + ")\n", + "data_channels = {\n", + " \"train\": train_data,\n", + " \"validation\": validation_data,\n", + " \"train_lst\": train_data_lst,\n", + " \"validation_lst\": validation_data_lst,\n", + "}" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Step 8: Train the model\n", + "Start training by calling the fit method in the estimator." + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "multilabel_ic.fit(inputs=data_channels, logs=True)" + ], + "outputs": [], + "metadata": { + "scrolled": true + } + }, + { + "cell_type": "markdown", + "source": [ + "# Inference\n", + "\n", + "### Step 9: Deploy the Model for Inference\n", + "\n", + "Once we have trained the model, we want to use it to perform inference. In this case, we will be predicting keywords for the image. \n", + "\n", + "First, let's deploy the model. You can deploy the created model by using the deploy method in the estimator." 
+ ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "from sagemaker.serializers import IdentitySerializer\n", + "\n", + "ic_classifier = multilabel_ic.deploy(\n", + " initial_instance_count=1,\n", + " instance_type=\"ml.m4.xlarge\",\n", + " serializer=IdentitySerializer(content_type=\"application/x-image\"),\n", + ")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Step 10: Evaluate the model\n", + "\n", + "Next, we evaluate one of our images through the network for inference. The network outputs probabilities for each of the keywords that we are training on. As can be seen from this example, the network output is pretty good even with training for only 5 epochs." + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "import json\n", + "from PIL import Image\n", + "\n", + "\n", + "# The following image has not been used to train or validate the model yet. 
We will use it to test the model.\n", + "file_name = \"1721370184.jpg\"\n", + "print(\"Training on Image File: {}\".format(file_name))\n", + "\n", + "# Download and display the image\n", + "s3 = boto3.resource(\"s3\")\n", + "s3.Bucket(images_bucket).download_file(\n", + " (images_adx_directory + file_name), (\"./{}\".format(file_name))\n", + ")\n", + "with open(file_name, \"rb\") as image:\n", + " f = image.read()\n", + " b = bytearray(f)\n", + " # display the image\n", + " im = Image.open(image)\n", + " im.thumbnail([500, 500])\n", + " display(im)\n", + "\n", + "# Run the model prediction on the downloaded image\n", + "results = ic_classifier.predict(b)\n", + "prob = json.loads(results)\n", + "for idx, val in enumerate(training_keywords):\n", + " print(\"{}:{} \".format(training_keywords[idx], prob[idx]), end=\"\")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "After running the above lines of code, our notebook will display the image and print out the probabilities of each of the keywords that we are training on. From here, we can expand the amount of keywords that we train on, fine-tune our parameters, or add additional images to further improve the accuracy of our model.\n", + "\n", + "\n", + "## Clean up\n", + "You can use the following command to delete the SageMaker model endpoint. The endpoint that is created above is persistent and would consume resources till it is deleted. It is good to delete the endpoint when it is not being used." 
+ ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "ic_classifier.delete_endpoint()" + ], + "outputs": [], + "metadata": {} + } + ], + "metadata": { + "celltoolbar": "Raw Cell Format", + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + }, + "notice": "Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License.", + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "metadata": { + "collapsed": false + }, + "source": [] + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/aws_marketplace/using_data/using_data_with_ml_model/using-dataset-product-from-aws-data-exchange-with-ml-model-from-aws-marketplace.ipynb b/aws_marketplace/using_data/using_data_with_ml_model/using-dataset-product-from-aws-data-exchange-with-ml-model-from-aws-marketplace.ipynb deleted file mode 100644 index 419fbcb49f..0000000000 --- a/aws_marketplace/using_data/using_data_with_ml_model/using-dataset-product-from-aws-data-exchange-with-ml-model-from-aws-marketplace.ipynb +++ /dev/null @@ -1,507 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "33386bb3", - "metadata": {}, - "source": [ - "## Using dataset product from AWS Data Exchange with ML model from AWS Marketplace\n", - "\n", - "This sample notebook shows how to perform machine learning on third-party datasets from [AWS Data Exchange](https://aws.amazon.com/data-exchange/) using a pre-trained ML Model.\n", - "\n", - "In this notebook, you will subscribe to a dataset listed by shutterstock in AWS Data Exchange. You will then export the dataset to an S3 bucket, and then download it to your local environment. You will also subscribe to Resnet 18, an open ML model from AWS Marketplace and deploy it in form an Amazon SageMaker Endpoint. Finally, you will perform inference.\n", - "\n", - "\n", - "### Contents:\n", - "* [Pre-requisites](#Pre-requisites)\n", - "* [Introduction](#Introduction)\n", - "* [Explore dataset](#Explore-dataset)\n", - "* [Perform inference](#Perform-inference)\n", - "* [Cleanup](#Cleanup)\n", - "\n", - "\n", - "#### Usage instructions\n", - "You can run this notebook one cell at a time (By using Shift+Enter for running a cell)." 
- ] - }, - { - "cell_type": "markdown", - "id": "b230f601", - "metadata": {}, - "source": [ - "### Pre-requisites:\n", - "\n", - "#### Pre-requisite 1:\n", - "This sample notebook assumes a subscription to the [500 Image & Metadata Free Sample dataset](https://console.aws.amazon.com/dataexchange/home?region=us-east-1#/products/prodview-2h52yl4q6jrjw) has been created and data has been exported into an S3 bucket.\n", - "\n", - "If you have not done this already, please follow these steps: \n", - "\n", - "#### Subscribe to data from AWS Data Exchange:\n", - "1. Open the [500 Image & Metadata Free Sample dataset](https://console.aws.amazon.com/dataexchange/home?region=us-east-1#/products/prodview-2h52yl4q6jrjw) from AWS Data Exchange console.\n", - "2. Read the overview and other information such as pricing, usage, support. \n", - "3. Choose __Continue to Subscribe__.\n", - "4. If your organization agrees to subscription terms, pricing information, and Data subscription agreement, then review/update the renewal settings and choose __Subscribe__.\n", - "5. Once subscription has been successfully created (This step may take 5-10 minutes), you will find the dataset listed in the [__Subscriptions__](https://console.aws.amazon.com/dataexchange/home?region=us-east-1#/subscriptions) section of the console\n", - "6. From [subscription page](https://console.aws.amazon.com/dataexchange/home?region=us-east-1#/subscriptions), open **Shutterstock dataset**, and for this use-case, choose the __retail_trials-bathbodyworks__ dataset.\n", - "7. Select the revision and then choose **Export to Amazon S3**.\n", - "8. Select appropriate bucket and once the export job has completed, open the s3 bucket you chose in preceding step and then copy the S3 URL of the data folder by choosing **Copy S3 URI** and specify the same in following cell." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c042e2bb", - "metadata": {}, - "outputs": [], - "source": [ - "# Please specify S3 location in which dataset has been exported.\n", - "dataset_export_location = \"\"\n", - "# dataset_export_location='s3://bucket/adx_free_data_sample/'" - ] - }, - { - "cell_type": "markdown", - "id": "edd03dc7", - "metadata": {}, - "source": [ - "#### Pre-requisite 2:\n", - "\n", - "This sample notebook assumes a subscription to the [Resnet 18 ML Model](https://aws.amazon.com/marketplace/pp/prodview-rte234xioxzqu) has been created and an endpoint has been deployed.If you have not done this already, please follow these steps:\n", - "\n", - "\n", - "#### Subscribe and deploy ML Model from AWS Marketplace:\n", - "1. Open the [Resnet 18 ML Model from AWS Marketplace listing](https://aws.amazon.com/marketplace/pp/prodview-rte234xioxzqu) from AWS Marketplace.\n", - "2. Read the **Highlights** section and then **product overview** section of the listing.\n", - "3. View **usage information** and then **additional resources**.\n", - "4. Note the supported instance types.\n", - "5. Next, click on **Continue to subscribe**.\n", - "6. Review **End user license agreement**, **support terms**, as well as **pricing information**.\n", - "7. **\"Accept Offer\"** button needs to be clicked if your organization agrees with EULA, pricing information as well as support terms.\n", - "8. Choose **Continue to Configuration**.\n", - "9. Leave **AWS CloudFormation** as the selected option and if this is the first time you are using Amazon SageMaker, \n", - "under *Configure for AWS CloudFormation*, choose **Create and use a new service role** and ***Any S3 bucket**, and then select **Launch CloudFormation Template**.\n", - "10. In CloudFormation console, choose **Create Stack**\n", - "11. After you have launched AWS CloudFormation template, wait for the newly launched AWS CloudFormation stack's status to change to **Create Complete**. 
\n", - "12. Open Outputs tab of the CloudFormation stack and then copy the value corresponding to **EndpointName** and specify the same in following cell." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f75db16a", - "metadata": {}, - "outputs": [], - "source": [ - "endpoint_name = \"Endpoint-ResNet-18-1\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "41a4a77a", - "metadata": {}, - "outputs": [], - "source": [ - "# Import necessary libraries.\n", - "import math\n", - "import re\n", - "import os\n", - "import json\n", - "import time\n", - "\n", - "import glob\n", - "import pandas as pd\n", - "\n", - "import boto3\n", - "import sagemaker\n", - "from sagemaker import AlgorithmEstimator\n", - "from sagemaker import get_execution_role\n", - "from IPython.display import Image\n", - "\n", - "s3 = boto3.client(\"s3\")\n", - "sagemaker_session = sagemaker.Session()\n", - "region = sagemaker_session.boto_region_name\n", - "runtime = boto3.client(\"runtime.sagemaker\")\n", - "\n", - "content_type = \"application/x-image\"\n", - "predictions = []\n", - "\n", - "s3_bucket = f\"jumpstart-cache-prod-{region}\"\n", - "s3.download_file(s3_bucket, \"inference-notebook-assets/ImageNetLabels.txt\", \"ImageNetLabels.txt\")\n", - "with open(\"ImageNetLabels.txt\", \"r\") as file:\n", - " class_id_to_label = file.read().splitlines()[1::]" - ] - }, - { - "cell_type": "markdown", - "id": "f8b3497a", - "metadata": {}, - "source": [ - "### Introduction\n", - "\n", - "You work for a super cool startup, which lets you bring your pet to the office. The startup is expanding and culture is pretty friendly. Your office is on a large campus provided by a tech incubator. The campus itself is well-equipped with safety cameras.\n", - "\n", - "You bring your little shih-tzu dog, affectionately called Toffee, to work. Because of his friendly nature, he quickly becomes the most popular dog on entire campus. 
He loves visiting all his friends and you have to find Toffee every day before leaving work.\n", - "\n", - "Since the campus is large, it is hard to physically go everywhere and find your dog. You typically end up with security and have to go through hundreds of cameras to find Toffee before you can leave for the day.\n", - "\n", - "In this workshop, you will develop new skills which you can use to build a software that security team can use to help people find their dog. For this workshop, you don’t need to worry about finding a campus and setting up cameras. Shutterstock has provided a dataset that you will use for the analysis. As part of pre-requisites of this notebook, you should already have subscribed to the dataset and specified the s3 location in dataset_export_location variable." - ] - }, - { - "cell_type": "markdown", - "id": "f1a4996d", - "metadata": {}, - "source": [ - "### Explore dataset" - ] - }, - { - "cell_type": "markdown", - "id": "ba59a3c5", - "metadata": {}, - "source": [ - "Next, you will load the dataset from S3 into your local execution environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ef6f579f", - "metadata": {}, - "outputs": [], - "source": [ - "!aws s3 sync $dataset_export_location data" - ] - }, - { - "cell_type": "markdown", - "id": "38ffb9e2", - "metadata": {}, - "source": [ - "Load the camera footage into a dictionary so you can easily do a lookup." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "596ecf8b", - "metadata": {}, - "outputs": [], - "source": [ - "camera_footage = {}\n", - "counter = 1\n", - "for subdir, dirs, files in os.walk(\"data\"):\n", - " for file in files:\n", - " camera_footage[counter] = subdir + \"/\" + file\n", - " counter = counter + 1\n", - "\n", - "print(\"Total \", (counter - 1), \" cameras were found\")\n", - "\n", - "\n", - "def get_camera_id(value):\n", - " for (\n", - " key,\n", - " val,\n", - " ) in camera_footage.items(): # for name, age in dictionary.iteritems(): (for Python 2.x)\n", - " if value in val:\n", - " return key" - ] - }, - { - "cell_type": "markdown", - "id": "11fa6f4c", - "metadata": {}, - "source": [ - "See what footage from camera #1 looks like" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1562bd13", - "metadata": {}, - "outputs": [], - "source": [ - "def show_cam_footage(camera_id):\n", - " return Image(url=camera_footage[camera_id], width=400, height=800)\n", - "\n", - "\n", - "camera_id = get_camera_id(\"1634351818.jpg\")\n", - "show_cam_footage(camera_id)" - ] - }, - { - "cell_type": "markdown", - "id": "cd916d1a", - "metadata": {}, - "source": [ - "Looks like you are looking at camera located in the grocery store of the campus. Try footage from another camera." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f0e726ad", - "metadata": {}, - "outputs": [], - "source": [ - "camera_id = get_camera_id(\"1821728006.jpg\")\n", - "show_cam_footage(camera_id)" - ] - }, - { - "cell_type": "markdown", - "id": "4d39abf6", - "metadata": {}, - "source": [ - "That's Stacy from your team giving a treat to her golden retriever!" - ] - }, - { - "cell_type": "markdown", - "id": "197e5ccd", - "metadata": {}, - "source": [ - "Now you need to identify a way to catalog all the different dogs and cats so that you can look them up easily. 
For this purpose, you will use an ML model that can identify 1000 different image classes including many popular dog and cat breeds as shown in following table.\n", - "\n", - "\n", - "| Class | dog | |\n", - "|------------------|-----|---|\n", - "| redbone | dog | |\n", - "| shih-tzu | dog | |\n", - "| collie | dog | |\n", - "| basset | dog | |\n", - "| malamute | dog | |\n", - "| beagle | dog | |\n", - "| pug | dog | |\n", - "| golden retriever | dog | |\n", - "| tabby | cat | |\n", - "| siamese cat | cat | |" - ] - }, - { - "cell_type": "markdown", - "id": "b011bd61", - "metadata": {}, - "source": [ - "### Perform inference" - ] - }, - { - "cell_type": "markdown", - "id": "01aead45", - "metadata": {}, - "source": [ - "As part of pre-requisite#2, you have already deployed the ML model and configured the endpoint name in 'endpoint_name' variable. Now you are ready to perform inference. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "deb8a0a4", - "metadata": {}, - "outputs": [], - "source": [ - "# The following method sends picture corresponding to camera_id specified to the ML model\n", - "# and returns you the classes found.\n", - "\n", - "\n", - "def perform_inference(camera_id):\n", - "\n", - " with open(camera_footage[camera_id], \"rb\") as file:\n", - " body = file.read()\n", - "\n", - " # Perform inference by calling invoke_endpoint API\n", - " response = runtime.invoke_endpoint(\n", - " EndpointName=endpoint_name, ContentType=content_type, Body=body\n", - " )\n", - "\n", - " # Parse the inference response and load top 10 classes found into a dictionary.\n", - " prediction = json.loads(response[\"Body\"].read())\n", - " prediction_ids = sorted(\n", - " range(len(prediction)), key=lambda index: prediction[index], reverse=True\n", - " )[:10]\n", - " for id in prediction_ids:\n", - " predictions.append([camera_id, class_id_to_label[id].lower(), 100 * prediction[id]])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"id": "13c592fe", - "metadata": {}, - "outputs": [], - "source": [ - "# Perform inference on all cameras\n", - "for id in camera_footage:\n", - " perform_inference(id)\n", - "\n", - "# Load the inference results into a pandas datafram so you can easily look it up.\n", - "df = pd.DataFrame(predictions, columns=[\"camera_id\", \"entity\", \"probability_measure\"])" - ] - }, - { - "cell_type": "markdown", - "id": "04dc9563", - "metadata": {}, - "source": [ - "Now that our catalog containing image classes for all cameras is ready, you can look-up the classes identified by the Resnet-18 machine learning model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d8ce325b", - "metadata": {}, - "outputs": [], - "source": [ - "print(\"-------------------------------------------------\")\n", - "print(\"Image classes summary for cam-\", camera_id)\n", - "print(\"-------------------------------------------------\")\n", - "print(df[df[\"camera_id\"] == camera_id])\n", - "show_cam_footage(camera_id)" - ] - }, - { - "cell_type": "markdown", - "id": "8d3cf380", - "metadata": {}, - "source": [ - "You can see how the ML model was able to identify the golden retriever with high probability measure value." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ae444514", - "metadata": {}, - "outputs": [], - "source": [ - "# Following function accepts the pet catagory and returns results\n", - "# that meet the probability_measure threshold.\n", - "def find_my_pet(catagory, probability_measure):\n", - " images = []\n", - " entries = df[\n", - " (df[\"entity\"] == catagory) & (df[\"probability_measure\"] > probability_measure)\n", - " ].sort_values(\"probability_measure\", ascending=False)\n", - " for entry in entries.iterrows():\n", - " print(\n", - " \"Camera-id:\"\n", - " + str(entry[1][\"camera_id\"])\n", - " + \" -> \"\n", - " + str(entry[1][\"probability_measure\"])\n", - " )\n", - " display(Image(url=camera_footage[entry[1][\"camera_id\"]], width=400, height=800))" - ] - }, - { - "cell_type": "markdown", - "id": "a1fcdab0", - "metadata": {}, - "source": [ - "Now its time to find Toffee. Specify a **pet_category** and a **probability_measure** value to see all cameras that have the specified pet." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "afa5cba2", - "metadata": {}, - "outputs": [], - "source": [ - "pet_category = \"shih-tzu\"\n", - "probability_measure_threshold = 10\n", - "find_my_pet(pet_category, probability_measure_threshold)" - ] - }, - { - "cell_type": "markdown", - "id": "3c5cb197", - "metadata": {}, - "source": [ - "You can now try specifying different values for the **pet_category** and **probability_measure** variables to see how model behaves." - ] - }, - { - "cell_type": "markdown", - "id": "101be7f8", - "metadata": {}, - "source": [ - "Congratulations, you have learnt how pre-trained ML models can be used to extract insights from data." - ] - }, - { - "cell_type": "markdown", - "id": "f3fe07a9", - "metadata": {}, - "source": [ - "### Next Steps\n", - "As a next step, i recommend you to:\n", - "1. 
Explore [AWS Data Exchange](https://console.aws.amazon.com/dataexchange/home?region=us-east-1#/products) and identify the dataset that will help you solve your business problems. If you can't find a dataset you are looking for, you can also [request dataset products](https://console.aws.amazon.com/dataexchange/home?region=us-east-1#/products/product-request)\n", - "2. Explore [ML Models from AWS Marketplace](https://aws.amazon.com/marketplace/search/results?page=1&filters=fulfillment_options&fulfillment_options=SageMaker&ref_=header_nav_dm_sagemaker) and identify which ML model can help you build differentiating features. If you have any questions or need a custom ML model, you can contact AWS Marketplace team on aws-mp-bd-ml@amazon.com." - ] - }, - { - "cell_type": "markdown", - "id": "a05174b2", - "metadata": {}, - "source": [ - "### Cleanup" - ] - }, - { - "cell_type": "markdown", - "id": "4d1aa396", - "metadata": {}, - "source": [ - "To avoid charges to your AWS account when not running your invocation, you will need to delete your endpoint. You will not be charged for keeping your endpoint config or model.\n", - "\n", - "You can visit CloudFormation to delete the stack you created.\n" - ] - }, - { - "cell_type": "markdown", - "id": "09392f63", - "metadata": {}, - "source": [ - "Finally, if the AWS Marketplace subscription was created just for the experiment and you want to unsubscribe to the product, here are the steps that can be followed.\n", - "Before you cancel the subscription, ensure that you do not have any [deployable model](https://console.aws.amazon.com/sagemaker/home#/models) created from the model package or using the algorithm. Note - You can find this information by looking at the container name associated with the model. \n", - "\n", - "**Steps to un-subscribe to product from AWS Marketplace**:\n", - "1. 
Navigate to __Machine Learning__ tab on [__Your Software subscriptions page__](https://aws.amazon.com/marketplace/ai/library?productType=ml&ref_=lbr_tab_ml)\n", - "2. Locate the listing that you need to cancel subscription for, and then __Cancel Subscription__ can be clicked to cancel the subscription.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38ebd1f8", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "conda_python3", - "language": "python", - "name": "conda_python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/aws_sagemaker_studio/sagemaker-python-sdk/scikit_learn_iris/Scikit-learn Estimator Example With Batch Transform.ipynb b/aws_sagemaker_studio/sagemaker-python-sdk/scikit_learn_iris/Scikit-learn Estimator Example With Batch Transform.ipynb index 39a1dc86b3..15ba1bbf63 100644 --- a/aws_sagemaker_studio/sagemaker-python-sdk/scikit_learn_iris/Scikit-learn Estimator Example With Batch Transform.ipynb +++ b/aws_sagemaker_studio/sagemaker-python-sdk/scikit_learn_iris/Scikit-learn Estimator Example With Batch Transform.ipynb @@ -222,7 +222,7 @@ "import os\n", "\n", "from sklearn import tree\n", - "from sklearn.externals import joblib\n", + "import joblib\n", "\n", "\n", "if __name__ == '__main__':\n", @@ -249,8 +249,8 @@ " train_data = pd.concat(raw_data)\n", "\n", " # labels are in the first column\n", - " train_y = train_data.ix[:,0]\n", - " train_X = train_data.ix[:,1:]\n", + " train_y = train_data.loc[:,0]\n", + " train_X = train_data.loc[:,1:]\n", "\n", " # Here we support a single hyperparameter, 'max_leaf_nodes'. 
Note that you can add as many\n", " # as your training my require in the ArgumentParser above.\n", diff --git a/aws_sagemaker_studio/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet_studio/gluoncv_ssd_mobilenet_neo_studio.ipynb b/aws_sagemaker_studio/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet_studio/gluoncv_ssd_mobilenet_neo_studio.ipynb new file mode 100644 index 0000000000..cf527d3d0d --- /dev/null +++ b/aws_sagemaker_studio/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet_studio/gluoncv_ssd_mobilenet_neo_studio.ipynb @@ -0,0 +1,453 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Deploy pre-trained GluonCV SSD Mobilenet model with SageMaker Neo\n", + "\n", + "1. [Introduction](#Introduction)\n", + "2. [Setup](#Setup)\n", + " 1. [Import SSD Mobilenet model from MXNet GluonCV](#Import-SSD-Mobilenet-model-from-MXNet-GluonCV)\n", + " 2. [Upload model to S3](#Upload-model-to-S3)\n", + " 3. [Use sagemaker MXNetModel to load pretrained MXNet model](#Use-sagemaker-MXNetModel-to-load-pretrained-MXNet-model)\n", + "3. [Compile the pre-trained model using SageMaker Neo](#Compile-the-pre-trained-model-using-SageMaker-Neo)\n", + "4. [Deploy the compiled model and request Inferences](#Deploy-the-compiled-model-and-request-Inferences)\n", + "5. [Delete the Endpoint](#Delete-the-Endpoint)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "This example demonstrates how to load a pre-trained MXNet GluonCV SSD model, optimize the trained model using SageMaker Neo, and host the model." + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Setup\n", + "\n", + "Before getting started, make sure to select `Python 3 (Data Science)` kernel. Ensure that `Apache MXNet` and `OpenCV` packages are installed in the kernel.\n", + "\n", + "Next, we need to define a few variables and obtain certain permissions that will be needed later in the example. 
These are:\n", + "\n", + "* A SageMaker session\n", + "* IAM role to give learning, storage & hosting access to your data\n", + "* An S3 bucket, a folder & sub folders that will be used to store data and artifact\n", + "\n", + "To start, we need to upgrade the [SageMaker SDK for Python](https://sagemaker.readthedocs.io/en/stable/v2.html) to v2.33.0 or greater and latest MXNet GluonCV and restart the kernel." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install mxnet\n", + "!apt-get update\n", + "!apt-get install -y python3-opencv\n", + "!pip install --upgrade \"sagemaker>=2.33.0\" gluoncv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we need an AWS account role with SageMaker access. This role is used to give SageMaker access to your data in S3. We also create a session." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sagemaker\n", + "from sagemaker import get_execution_role\n", + "\n", + "role = get_execution_role()\n", + "sess = sagemaker.Session()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then need an S3 bucket that would be used for storing the model artifacts generated after training and compilation, training data and custom code. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# S3 bucket and folders for saving code and model artifacts.\n", + "# Feel free to specify different bucket/folders here if you wish.\n", + "bucket = sess.default_bucket()\n", + "folder = \"DEMO-ObjectDetection-SSD-MobileNet\"\n", + "pretrained_model_sub_folder = folder + \"/pretrained-model\"\n", + "compilation_output_sub_folder = folder + \"/compilation-output\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To easily visualize the detection outputs we also define the following function. The function visualizes the high-confidence predictions with bounding box by filtering out low-confidence detections." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "def visualize_detection(img_file, dets, classes=[], thresh=0.6):\n", + " \"\"\"\n", + " visualize detections in one image\n", + " Parameters:\n", + " ----------\n", + " img_file : str\n", + " path of the image file to display\n", + " dets : list or numpy.array\n", + " ssd detections as [class_ids, scores, bboxes], each with a leading batch axis\n", + " each row is one object\n", + " classes : tuple or list of str\n", + " class names\n", + " thresh : float\n", + " score threshold\n", + " \"\"\"\n", + " import random\n", + " import matplotlib.pyplot as plt\n", + " import matplotlib.image as mpimg\n", + " from matplotlib.patches import Rectangle\n", + "\n", + " img = mpimg.imread(img_file)\n", + " plt.imshow(img)\n", + " height = img.shape[0]\n", + " width = img.shape[1]\n", + " colors = dict()\n", + " klasses = dets[0][0]\n", + " scores = dets[1][0]\n", + " bbox = dets[2][0]\n", + " for i in range(len(klasses)): # one iteration per detection, not per class name\n", + " klass = klasses[i][0]\n", + " score = scores[i][0]\n", + " x0, y0, x1, y1 = bbox[i]\n", + " if score < thresh:\n", + " continue\n", 
+ " cls_id = int(klass)\n", + " if cls_id not in colors:\n", + 
" colors[cls_id] = (random.random(), random.random(), random.random())\n", + " xmin = int(x0 * width / 512)\n", + " ymin = int(y0 * height / 512)\n", + " xmax = int(x1 * width / 512)\n", + " ymax = int(y1 * height / 512)\n", + " rect = Rectangle(\n", + " (xmin, ymin),\n", + " xmax - xmin,\n", + " ymax - ymin,\n", + " fill=False,\n", + " edgecolor=colors[cls_id],\n", + " linewidth=3.5,\n", + " )\n", + " plt.gca().add_patch(rect)\n", + " class_name = str(cls_id)\n", + " if classes and len(classes) > cls_id:\n", + " class_name = classes[cls_id]\n", + " plt.gca().text(\n", + " xmin,\n", + " ymin - 2,\n", + " \"{:s} {:.3f}\".format(class_name, score),\n", + " bbox=dict(facecolor=colors[cls_id], alpha=0.5),\n", + " fontsize=12,\n", + " color=\"white\",\n", + " )\n", + " plt.tight_layout(rect=[0, 0, 2, 2])\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing object categories\n", + "object_categories = [\n", + " \"aeroplane\",\n", + " \"bicycle\",\n", + " \"bird\",\n", + " \"boat\",\n", + " \"bottle\",\n", + " \"bus\",\n", + " \"car\",\n", + " \"cat\",\n", + " \"chair\",\n", + " \"cow\",\n", + " \"diningtable\",\n", + " \"dog\",\n", + " \"horse\",\n", + " \"motorbike\",\n", + " \"person\",\n", + " \"pottedplant\",\n", + " \"sheep\",\n", + " \"sofa\",\n", + " \"train\",\n", + " \"tvmonitor\",\n", + "]\n", + "\n", + "# Setting a threshold 0.20 will only plot detection results that have a confidence score greater than 0.20\n", + "threshold = 0.20" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we load the test image into the memory. The test image used in this notebook is from [PEXELS](https://www.pexels.com/) which remains unseen until the time of prediction." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import PIL.Image\n", + "import numpy as np\n", + "\n", + "test_file = \"test.jpg\"\n", + "test_image = PIL.Image.open(test_file)\n", + "test_image = np.asarray(test_image.resize((512, 512)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import SSD Mobilenet model from MXNet GluonCV\n", + "This example uses pre-trained MXNet GluonCV SSD model initially published in:\n", + "> Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector. ECCV 2016." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import mxnet as mx\n", + "import gluoncv as gcv\n", + "import tarfile\n", + "\n", + "net = gcv.model_zoo.get_model(\"ssd_512_mobilenet1.0_voc\", pretrained=True)\n", + "net.hybridize()\n", + "net(mx.nd.ones((1, 3, 512, 512)))\n", + "net.export(\"model\")\n", + "tar = tarfile.open(\"ssd_512_mobilenet1.0_voc.tar.gz\", \"w:gz\")\n", + "\n", + "for name in [\"model-0000.params\", \"model-symbol.json\"]:\n", + " tar.add(name)\n", + "tar.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Upload model to S3\n", + "Upload the pre-trained model to the S3 bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pretrained_model_path = sess.upload_data(\n", + " path=\"ssd_512_mobilenet1.0_voc.tar.gz\", bucket=bucket, key_prefix=pretrained_model_sub_folder\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we need to setup training and compilation output locations in S3, where the respective model artifacts will be dumped. 
We also setup the s3 location for training data and custom code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# S3 Location to save the model artifact after training\n", + "s3_pretrained_model_location = \"s3://{}/{}\".format(bucket, pretrained_model_sub_folder)\n", + "\n", + "# S3 Location to save the model artifact after compilation\n", + "s3_compilation_output_location = \"s3://{}/{}\".format(bucket, compilation_output_sub_folder)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use sagemaker MXNetModel to load pretrained MXNet model\n", + "When loading the model, user is expected to provide the `entry_point` script required by the model. We set `MMS_DEFAULT_RESPONSE_TIMEOUT` environment variable to `500` for MXNet model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% H\n" + } + }, + "outputs": [], + "source": [ + "from sagemaker.mxnet.model import MXNetModel\n", + "from sagemaker.mxnet import MXNetPredictor\n", + "\n", + "pre_trained_model = MXNetModel(\n", + " model_data=pretrained_model_path,\n", + " predictor_cls=MXNetPredictor,\n", + " framework_version=\"1.8\",\n", + " role=role,\n", + " sagemaker_session=sess,\n", + " entry_point=\"ssd_entry_point.py\",\n", + " py_version=\"py3\",\n", + " env={\"MMS_DEFAULT_RESPONSE_TIMEOUT\": \"500\"},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compile the pre-trained model using SageMaker Neo\n", + "\n", + "After loading the pretrained model we can use SageMaker Neo's ``compile()`` API to compile the pretrained model. When calling ``compile()``, the user is expected to provide all the correct input shapes required by the model for successful compilation. 
We also specify the target instance family, the name of our IAM execution role, S3 bucket to which the compiled model would be stored.\n", + "\n", + "For this example, we will choose `ml_p3` as the target instance family while compiling the trained model. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "import time\n", + "\n", + "compiled_model = pre_trained_model.compile(\n", + " job_name=\"ssd-512-mobilenet-{}\".format(time.strftime(\"%Y%m%d%I%M%S\")),\n", + " target_instance_family=\"ml_p3\",\n", + " input_shape={\"data\": [1, 3, 512, 512]},\n", + " role=role,\n", + " framework=\"mxnet\",\n", + " framework_version=\"1.8\",\n", + " output_path=s3_compilation_output_location,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploy the compiled model and request Inferences\n", + "\n", + "We have to deploy the compiled model within the instance family for which the trained model was compiled. Since we have compiled for `ml_p3` we can deploy to any `ml.p3` instance type. For this example we will choose `ml.p3.2xlarge`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "neo_object_detector = compiled_model.deploy(initial_instance_count=1, instance_type=\"ml.p3.2xlarge\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "response = neo_object_detector.predict(test_image)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize the detections.\n", + "visualize_detection(test_file, response, object_categories, threshold)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Delete the Endpoint\n", + "Having an endpoint running will incur some costs. 
Therefore, as an optional clean-up job, you can delete it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Endpoint name: \" + neo_object_detector.endpoint_name)\n", + "neo_object_detector.delete_endpoint()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_mxnet_p36", + "language": "python", + "name": "conda_mxnet_p36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/aws_sagemaker_studio/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet_studio/ssd_entry_point.py b/aws_sagemaker_studio/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet_studio/ssd_entry_point.py new file mode 100644 index 0000000000..bfb27dc713 --- /dev/null +++ b/aws_sagemaker_studio/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet_studio/ssd_entry_point.py @@ -0,0 +1,278 @@ +import io +import json +import logging +import os + +import numpy as np +import PIL.Image + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + +# ------------------------------------------------------------ # +# Training methods # +# ------------------------------------------------------------ # + +import argparse +import glob +import time +import warnings + +import mxnet as mx +from mxnet import autograd, gluon, nd + + +def parse_args(): + """Parse the command-line options for SSD training and return the argparse namespace.""" + parser = argparse.ArgumentParser(description="Train SSD networks.") + parser.add_argument( + "--network", type=str, default="ssd_512_mobilenet1.0_voc", help="Network name" + ) + parser.add_argument( + "--data-shape", type=int, default=512, help="Input data shape, use 300, 512." 
+ ) + parser.add_argument("--batch-size", type=int, default=32, help="Training mini-batch size") + parser.add_argument( + "--num-workers", + "-j", + dest="num_workers", + type=int, + default=4, + help="Number of data workers, you can use larger " + "number to accelerate data loading, if your CPU and GPUs are powerful.", + ) + parser.add_argument( + "--gpus", type=str, default="0", help="Training with GPUs, you can specify 1,3 for example." + ) + parser.add_argument("--epochs", type=int, default=240, help="Training epochs.") + parser.add_argument( + "--start-epoch", + type=int, + default=0, + help="Starting epoch for resuming, default is 0 for new training. " + "You can specify it to 100 for example to start from 100 epoch.", + ) + parser.add_argument( + "--log-interval", type=int, default=100, help="Logging mini-batch interval. Default is 100." + ) + parser.add_argument("--lr", type=float, default=0.001, help="Learning rate, default is 0.001") + parser.add_argument( + "--lr-decay", type=float, default=0.1, help="decay rate of learning rate. default is 0.1." + ) + parser.add_argument( + "--lr-decay-epoch", + type=str, + default="160,200", + help="epochs at which learning rate decays. 
default is 160,200.", + ) + parser.add_argument("--momentum", type=float, default=0.9, help="SGD momentum, default is 0.9") + parser.add_argument("--wd", type=float, default=0.0005, help="Weight decay, default is 5e-4") + + return parser.parse_args() + + +def get_dataloader(net, data_shape, batch_size, num_workers, ctx): + """Get dataloader.""" + + from gluoncv import data as gdata + from gluoncv.data.batchify import Pad, Stack, Tuple + from gluoncv.data.transforms.presets.ssd import SSDDefaultTrainTransform + + width, height = data_shape, data_shape + # use fake data to generate fixed anchors for target generation + with autograd.train_mode(): + _, _, anchors = net(mx.nd.zeros((1, 3, height, width), ctx)) + anchors = anchors.as_in_context(mx.cpu()) + batchify_fn = Tuple(Stack(), Stack(), Stack()) # stack image, cls_targets, box_targets + train_dataset = gdata.RecordFileDetection( + os.path.join(os.environ["SM_CHANNEL_TRAIN"], "train.rec") + ) + train_loader = gluon.data.DataLoader( + train_dataset.transform(SSDDefaultTrainTransform(width, height, anchors)), + batch_size, + True, + batchify_fn=batchify_fn, + last_batch="rollover", + num_workers=num_workers, + ) + return train_loader + + +def train(net, train_data, ctx, args): + """Training pipeline""" + + import gluoncv as gcv + + net.collect_params().reset_ctx(ctx) + + trainer = gluon.Trainer( + net.collect_params(), + "sgd", + {"learning_rate": args.lr, "wd": args.wd, "momentum": args.momentum}, + update_on_kvstore=None, + ) + + # lr decay policy + lr_decay = float(args.lr_decay) + lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(",") if ls.strip()]) + + mbox_loss = gcv.loss.SSDMultiBoxLoss() + ce_metric = mx.metric.Loss("CrossEntropy") + smoothl1_metric = mx.metric.Loss("SmoothL1") + + # set up logger + logging.basicConfig() + logger = logging.getLogger() + logger.setLevel(logging.INFO) + logger.info(args) + logger.info("Start training from [Epoch {}]".format(args.start_epoch)) + best_map = [0] + 
+ for epoch in range(args.start_epoch, args.epochs): + while lr_steps and epoch >= lr_steps[0]: + new_lr = trainer.learning_rate * lr_decay + lr_steps.pop(0) + trainer.set_learning_rate(new_lr) + logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr)) + ce_metric.reset() + smoothl1_metric.reset() + tic = time.time() + btic = time.time() + net.hybridize(static_alloc=True, static_shape=True) + + for i, batch in enumerate(train_data): + data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0) + cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0) + box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0) + + with autograd.record(): + cls_preds = [] + box_preds = [] + for x in data: + cls_pred, box_pred, _ = net(x) + cls_preds.append(cls_pred) + box_preds.append(box_pred) + sum_loss, cls_loss, box_loss = mbox_loss( + cls_preds, box_preds, cls_targets, box_targets + ) + autograd.backward(sum_loss) + # since we have already normalized the loss, we don't want to normalize + # by batch-size anymore + trainer.step(1) + + local_batch_size = int(args.batch_size) + ce_metric.update(0, [l * local_batch_size for l in cls_loss]) + smoothl1_metric.update(0, [l * local_batch_size for l in box_loss]) + if args.log_interval and not (i + 1) % args.log_interval: + name1, loss1 = ce_metric.get() + name2, loss2 = smoothl1_metric.get() + logger.info( + "[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}".format( + epoch, i, args.batch_size / (time.time() - btic), name1, loss1, name2, loss2 + ) + ) + btic = time.time() + + name1, loss1 = ce_metric.get() + name2, loss2 = smoothl1_metric.get() + logger.info( + "[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}".format( + epoch, (time.time() - tic), name1, loss1, name2, loss2 + ) + ) + current_map = 0.0 + + # save model + net.set_nms(nms_thresh=0.45, nms_topk=400, post_nms=100) + net(mx.nd.ones((1, 3, 512, 512), ctx=ctx[0])) + 
net.export("%s/model" % os.environ["SM_MODEL_DIR"]) + return net + + +if __name__ == "__main__": + + from gluoncv import model_zoo + + args = parse_args() + + ctx = [mx.gpu(int(i)) for i in args.gpus.split(",") if i.strip()] + ctx = ctx if ctx else [mx.cpu()] + + net = model_zoo.get_model(args.network, pretrained=False, ctx=ctx) + net.initialize(ctx=mx.gpu(0)) + train_loader = get_dataloader(net, args.data_shape, args.batch_size, args.num_workers, ctx[0]) + + train(net, train_loader, ctx, args) + +# ------------------------------------------------------------ # +# Hosting methods for Neo compiled model # +# ------------------------------------------------------------ # + + +def model_fn(model_dir): + """ + Load the gluon model. Called once when hosting service starts. + :param: model_dir The directory where model files are stored. + :return: a model (in this case a Gluon network) + """ + logging.info("Invoking user-defined model_fn") + import neomx # noqa: F401 + + # change context to mx.cpu() when optimizing and deploying with Neo for CPU endpoints + ctx = mx.gpu() + net = gluon.SymbolBlock.imports( + "%s/compiled-symbol.json" % model_dir, + ["data"], + "%s/compiled-0000.params" % model_dir, + ctx=ctx, + ) + net.hybridize(static_alloc=True, static_shape=True) + # run warm-up inference on empty data + warmup_data = mx.nd.empty((1, 3, 512, 512), ctx=ctx) + class_IDs, scores, bounding_boxes = net(warmup_data) + + return net + + +def transform_fn(net, data, content_type, output_content_type): + """ + pre-process the incoming payload, perform prediction & convert the prediction output into response payload + """ + logging.info("Invoking user-defined transform_fn") + + import gluoncv as gcv + + # change context to mx.cpu() when optimizing and deploying with Neo for CPU endpoints + ctx = mx.gpu() + + """ + pre-processing + """ + # decode json string into numpy array + data = json.loads(data) + + # preprocess image + x, image = 
gcv.data.transforms.presets.ssd.transform_test(mx.nd.array(data), 512) + + # load image onto right context + x = x.as_in_context(ctx) + + """ + prediction/inference + """ + class_IDs, scores, bounding_boxes = net(x) + + """ + post-processing + """ + # create list of results + result = [ + class_IDs.asnumpy().tolist(), + scores.asnumpy().tolist(), + bounding_boxes.asnumpy().tolist(), + ] + + # decode as json string + response_body = json.dumps(result) + + return response_body, output_content_type diff --git a/aws_sagemaker_studio/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet_studio/test.jpg b/aws_sagemaker_studio/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet_studio/test.jpg new file mode 100644 index 0000000000..2eab5780e7 Binary files /dev/null and b/aws_sagemaker_studio/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet_studio/test.jpg differ diff --git a/aws_sagemaker_studio/sagemaker_studio_image_build/tensorflow_bring_your_own/tensorflow_bring_your_own.ipynb b/aws_sagemaker_studio/sagemaker_studio_image_build/tensorflow_bring_your_own/tensorflow_bring_your_own.ipynb index a72d7f4252..7da665d941 100644 --- a/aws_sagemaker_studio/sagemaker_studio_image_build/tensorflow_bring_your_own/tensorflow_bring_your_own.ipynb +++ b/aws_sagemaker_studio/sagemaker_studio_image_build/tensorflow_bring_your_own/tensorflow_bring_your_own.ipynb @@ -6,7 +6,7 @@ "source": [ "# Building your own TensorFlow container from Amazon SageMaker Studio\n", "\n", - "**STUDIO KERNEL NOTE:** If you are prompted for Kernel, choose 'Python 3 (TensorFlow CPU Optimized) \n", + "**STUDIO KERNEL NOTE:** If you are prompted for Kernel, choose 'Python 3 (TensorFlow 2.1 Python 3.6 CPU Optimized) \n", "\n", "With Amazon SageMaker, you can package your own algorithms that can then be trained and deployed in the SageMaker environment. 
This notebook guides you through an example using TensorFlow that shows you how to build a Docker container for SageMaker and use it for training and inference. \n", "\n", @@ -300,9 +300,10 @@ ] }, { - "cell_type": "raw", + "cell_type": "markdown", "metadata": {}, "source": [ + "```\n", "{\n", " \"Version\": \"2012-10-17\",\n", " \"Statement\": [\n", @@ -323,7 +324,8 @@ " \"Action\": \"sts:AssumeRole\"\n", " }\n", " ]\n", - "}" + "}\n", + "```" ] }, { @@ -338,17 +340,20 @@ " \n", " * Open [Policies](https://console.aws.amazon.com/iam/home#/policies) in IAM\n", " * Click **Create policy**\n", - " * Select the JSON tab and copy/paste the policy below" + " * Select the JSON tab and copy/paste the printed result of the policy below" ] }, { - "cell_type": "raw", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "{\n", + "print(\n", + " f\"\"\"{{\n", " \"Version\": \"2012-10-17\",\n", " \"Statement\": [\n", - " {\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"codebuild:DeleteProject\",\n", @@ -357,26 +362,26 @@ " \"codebuild:StartBuild\"\n", " ],\n", " \"Resource\": \"arn:aws:codebuild:*:*:project/sagemaker-studio*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": \"logs:CreateLogStream\",\n", " \"Resource\": \"arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"logs:GetLogEvents\",\n", " \"logs:PutLogEvents\"\n", " ],\n", " \"Resource\": \"arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*:log-stream:*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": \"logs:CreateLogGroup\",\n", " \"Resource\": \"*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"ecr:CreateRepository\",\n", @@ -391,13 +396,13 @@ " \"ecr:PutImage\"\n", " ],\n", " \"Resource\": 
\"arn:aws:ecr:*:*:repository/sagemaker-studio*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": \"ecr:GetAuthorizationToken\",\n", " \"Resource\": \"*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"s3:GetObject\",\n", @@ -405,34 +410,35 @@ " \"s3:PutObject\"\n", " ],\n", " \"Resource\": \"arn:aws:s3:::sagemaker-*/*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"s3:CreateBucket\"\n", " ],\n", " \"Resource\": \"arn:aws:s3:::sagemaker*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"iam:GetRole\",\n", " \"iam:ListRoles\"\n", " ],\n", " \"Resource\": \"*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": \"iam:PassRole\",\n", - " \"Resource\": \"arn:aws:iam::*:role/*\",\n", - " \"Condition\": {\n", - " \"StringLikeIfExists\": {\n", + " \"Resource\": \"{role}\",\n", + " \"Condition\": {{\n", + " \"StringLikeIfExists\": {{\n", " \"iam:PassedToService\": \"codebuild.amazonaws.com\"\n", - " }\n", - " }\n", - " }\n", + " }}\n", + " }}\n", + " }}\n", " ]\n", - "}" + "}}\"\"\"\n", + ")" ] }, { @@ -446,7 +452,7 @@ "We now need to attach our policy to the Execution Role attached to this notebook environment. 
\n", "\n", " * Go back to [Roles](https://console.aws.amazon.com/iam/home#/roles) in IAM\n", - " * Select the SageMaker Execution Role from abovee\n", + " * Select the SageMaker Execution Role from above\n", " * On the **Permissions** tab, click **Attach policies**\n", " * Search for the Policy we created above `Studio-Image-Build-Policy`\n", " * Select the policy and click **Attach policy**" @@ -647,9 +653,10 @@ "\n", "estimator = Estimator(\n", " role=role,\n", - " train_instance_count=1,\n", - " train_instance_type=instance_type,\n", + " instance_count=1,\n", + " instance_type=instance_type,\n", " image_name=ecr_image,\n", + " image_uri=ecr_image,\n", " hyperparameters=hyperparameters,\n", ")\n", "\n", @@ -740,8 +747,6 @@ "import imageio as imageio\n", "import numpy\n", "\n", - "from sagemaker.predictor import json_serializer, json_deserializer\n", - "\n", "image = imageio.imread(\"data/cat.png\")\n", "print(image.shape)\n", "\n", @@ -758,20 +763,16 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "jupyter": { - "source_hidden": true - } - }, + "metadata": {}, "outputs": [], "source": [ "# The request and response format is JSON for TensorFlow Serving.\n", "# For more information: https://www.tensorflow.org/serving/api_rest#predict_api\n", - "predictor.accept = \"application/json\"\n", - "predictor.content_type = \"application/json\"\n", + "from sagemaker.serializers import JSONSerializer\n", + "from sagemaker.deserializers import JSONDeserializer\n", "\n", - "predictor.serializer = json_serializer\n", - "predictor.deserializer = json_deserializer\n", + "predictor.serializer = JSONSerializer()\n", + "predictor.deserializer = JSONDeserializer()\n", "\n", "# For more information on the predictor class.\n", "# https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/predictor.py\n", @@ -821,9 +822,9 @@ "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Python 3 (TensorFlow CPU Optimized)", + 
"display_name": "Python 3 (TensorFlow 2.1 Python 3.6 CPU Optimized)", "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/tensorflow-1.15-cpu-py36" + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/tensorflow-2.1-cpu-py36" }, "language_info": { "codemirror_mode": { @@ -835,7 +836,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/aws_sagemaker_studio/sagemaker_studio_image_build/xgboost_bring_your_own/Batch_Transform_BYO_XGB.ipynb b/aws_sagemaker_studio/sagemaker_studio_image_build/xgboost_bring_your_own/Batch_Transform_BYO_XGB.ipynb index cb56247297..b32999e4f1 100644 --- a/aws_sagemaker_studio/sagemaker_studio_image_build/xgboost_bring_your_own/Batch_Transform_BYO_XGB.ipynb +++ b/aws_sagemaker_studio/sagemaker_studio_image_build/xgboost_bring_your_own/Batch_Transform_BYO_XGB.ipynb @@ -61,8 +61,8 @@ "source": [ "import sys\n", "\n", - "#!{sys.executable} -m pip install \"sagemaker-experiments\"\n", - "#!{sys.executable} -m pip install \"sagemaker-studio-image-build\"" + "!{sys.executable} -m pip install \"sagemaker-experiments\"\n", + "!{sys.executable} -m pip install \"sagemaker-studio-image-build\"" ] }, { @@ -102,13 +102,31 @@ " }\n", " ]\n", " }\n", - " \n", - "2) Permissions attached to the execution role to execute a build in AWS CodeBuild, create ECR repository and push images to ECR \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2) Permissions attached to the execution role to execute a build in AWS CodeBuild, create ECR repository and push images to ECR " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker import get_execution_role\n", + "\n", + "role = get_execution_role()\n", "\n", - " {\n", + "print(\n", + " f\"\"\"{{\n", " \"Version\": 
\"2012-10-17\",\n", " \"Statement\": [\n", - " {\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"codebuild:DeleteProject\",\n", @@ -117,26 +135,26 @@ " \"codebuild:StartBuild\"\n", " ],\n", " \"Resource\": \"arn:aws:codebuild:*:*:project/sagemaker-studio*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": \"logs:CreateLogStream\",\n", " \"Resource\": \"arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"logs:GetLogEvents\",\n", " \"logs:PutLogEvents\"\n", " ],\n", " \"Resource\": \"arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*:log-stream:*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": \"logs:CreateLogGroup\",\n", " \"Resource\": \"*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"ecr:CreateRepository\",\n", @@ -151,13 +169,13 @@ " \"ecr:PutImage\"\n", " ],\n", " \"Resource\": \"arn:aws:ecr:*:*:repository/sagemaker-studio*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": \"ecr:GetAuthorizationToken\",\n", " \"Resource\": \"*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"s3:GetObject\",\n", @@ -165,34 +183,35 @@ " \"s3:PutObject\"\n", " ],\n", " \"Resource\": \"arn:aws:s3:::sagemaker-*/*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"s3:CreateBucket\"\n", " ],\n", " \"Resource\": \"arn:aws:s3:::sagemaker*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"iam:GetRole\",\n", " \"iam:ListRoles\"\n", " ],\n", " \"Resource\": \"*\"\n", - " },\n", - " {\n", + " }},\n", + " {{\n", " \"Effect\": \"Allow\",\n", " \"Action\": \"iam:PassRole\",\n", - " \"Resource\": 
\"arn:aws:iam::*:role/*\",\n", - " \"Condition\": {\n", - " \"StringLikeIfExists\": {\n", + " \"Resource\": \"{role}\",\n", + " \"Condition\": {{\n", + " \"StringLikeIfExists\": {{\n", " \"iam:PassedToService\": \"codebuild.amazonaws.com\"\n", - " }\n", - " }\n", - " }\n", + " }}\n", + " }}\n", + " }}\n", " ]\n", - "}" + "}}\"\"\"\n", + ")" ] }, { @@ -213,9 +232,7 @@ "# Let's inspect the role we have created for our notebook here:\n", "import boto3\n", "import sagemaker\n", - "from sagemaker import get_execution_role\n", "\n", - "role = get_execution_role()\n", "sess = sagemaker.Session()\n", "region = boto3.session.Session().region_name\n", "print(\"Region = {}\".format(region))\n", @@ -672,13 +689,6 @@ "!sm-docker build ." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "metadata": {}, @@ -972,21 +982,14 @@ "\n", "Remember to delete your datasets in the Amazon S3 bucket you used for this notebook." 
] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Environment (conda_anaconda3)", + "display_name": "Python 3 (Data Science)", "language": "python", - "name": "conda_anaconda3" + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" }, "language_info": { "codemirror_mode": { @@ -998,7 +1001,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.10" } }, "nbformat": 4, diff --git a/aws_sagemaker_studio/sagemaker_studio_image_build/xgboost_bring_your_own/Dockerfile b/aws_sagemaker_studio/sagemaker_studio_image_build/xgboost_bring_your_own/Dockerfile index a86178c32b..241992a55f 100644 --- a/aws_sagemaker_studio/sagemaker_studio_image_build/xgboost_bring_your_own/Dockerfile +++ b/aws_sagemaker_studio/sagemaker_studio_image_build/xgboost_bring_your_own/Dockerfile @@ -26,7 +26,7 @@ RUN apt-get -y update && apt-get install -y --no-install-recommends \ # linking them together. Likewise, pip leaves the install caches populated which uses # a significant amount of space. These optimizations save a fair amount of space in the # image, which reduces start up time. -RUN wget https://bootstrap.pypa.io/3.3/get-pip.py && python3.6 get-pip.py && \ +RUN wget https://bootstrap.pypa.io/get-pip.py && python3.6 get-pip.py && \ pip install --upgrade pip && \ pip install smdebug numpy==1.16.2 scipy==1.2.1 scikit-learn==0.19.1 xgboost==0.90 pandas==0.22.0 flask gevent gunicorn && \ (cd /usr/local/lib/python3.6/dist-packages/scipy/.libs; rm *; ln ../../numpy/.libs/* .) 
&& \ diff --git a/aws_sagemaker_studio/sagemaker_studio_image_build/xgboost_bring_your_own/xgboost/train b/aws_sagemaker_studio/sagemaker_studio_image_build/xgboost_bring_your_own/xgboost/train index 2af6234353..1673320be1 100644 --- a/aws_sagemaker_studio/sagemaker_studio_image_build/xgboost_bring_your_own/xgboost/train +++ b/aws_sagemaker_studio/sagemaker_studio_image_build/xgboost_bring_your_own/xgboost/train @@ -13,7 +13,6 @@ import sys import traceback import pandas as pd -import smdebug.xgboost as smd import xgboost as xgb print("Libraries imported") diff --git a/conf.py b/conf.py index bd7367642f..9582bceb21 100644 --- a/conf.py +++ b/conf.py @@ -62,6 +62,4 @@ # For Adobe Analytics html_js_files = [ "https://a0.awsstatic.com/s_code/js/3.0/awshome_s_code.js", - "aws-ux-shortbread/index.js", - "aws-ux-shortbread/init.js", ] diff --git a/contrib/template.ipynb b/contrib/template.ipynb index 2a5ed6b069..6da9e256fe 100644 --- a/contrib/template.ipynb +++ b/contrib/template.ipynb @@ -107,10 +107,9 @@ "\n", "~~~\n", "\n", - "To do select a particular dataset, assign choosen_data_set below to be one of 'diabetes', 'california', or 'boston' where each name corresponds to the it's respective dataset.\n", + "To select a particular dataset, assign chosen_data_set below to be 'diabetes' or 'california', where each name corresponds to its respective dataset.\n", "\n", - "'boston' : boston house data\n", - "'california' : california house data\n", + "'california' : california housing data\n", "'diabetes' : diabetes data\n", "\n", "~~~\n" @@ -125,13 +124,12 @@ "data_sets = {\n", " \"diabetes\": \"load_diabetes()\",\n", " \"california\": \"fetch_california_housing()\",\n", - " \"boston\": \"load_boston()\",\n", "}\n", "\n", - "# Change choosen_data_set variable to one of the data sets above.\n", - "choosen_data_set = \"california\"\n", - "assert choosen_data_set in data_sets.keys()\n", - "print(\"I selected the '{}' dataset!\".format(choosen_data_set))" + "# Change 
chosen_data_set variable to one of the data sets above.\n", + "chosen_data_set = \"california\"\n", + "assert chosen_data_set in data_sets.keys()\n", + "print(\"I selected the '{}' dataset!\".format(chosen_data_set))" ] }, { @@ -161,7 +159,7 @@ "%store -r Y_train\n", "%store -r Y_test\n", "%store -r Y_val\n", - "%store -r choosen_data_set" + "%store -r chosen_data_set" ] }, { diff --git a/end_to_end/fraud_detection/5-pipeline-e2e.ipynb b/end_to_end/fraud_detection/5-pipeline-e2e.ipynb index 13a2616118..a775a9e381 100644 --- a/end_to_end/fraud_detection/5-pipeline-e2e.ipynb +++ b/end_to_end/fraud_detection/5-pipeline-e2e.ipynb @@ -508,9 +508,10 @@ " \"--customers-table-name\",\n", " customers_table,\n", " \"--region\",\n", - " region\n", + " region,\n", " ],\n", " code=create_dataset_script_uri,\n", + " depends_on=[claims_flow_step.name, customers_flow_step.name],\n", ")" ] }, diff --git a/end_to_end/fraud_detection/claims_flow_template b/end_to_end/fraud_detection/claims_flow_template index 31341c8fb2..be5be730aa 100644 --- a/end_to_end/fraud_detection/claims_flow_template +++ b/end_to_end/fraud_detection/claims_flow_template @@ -27,7 +27,8 @@ "name": "default", "sampling": { "sampling_method": "sample_by_ratio", - "sample_ratio": 1 + "sample_ratio": 1, + "seed": 123456789 } } ] diff --git a/end_to_end/fraud_detection/customers_flow_template b/end_to_end/fraud_detection/customers_flow_template index 1cf6467434..da44bd9b21 100644 --- a/end_to_end/fraud_detection/customers_flow_template +++ b/end_to_end/fraud_detection/customers_flow_template @@ -27,7 +27,8 @@ "name": "default", "sampling": { "sampling_method": "sample_by_ratio", - "sample_ratio": 1 + "sample_ratio": 1, + "seed": 123456789 } } ] diff --git a/ground_truth_labeling_jobs/README.md b/ground_truth_labeling_jobs/README.md index baf048f0cb..79a5fb1d91 100644 --- a/ground_truth_labeling_jobs/README.md +++ b/ground_truth_labeling_jobs/README.md @@ -13,3 +13,4 @@ These examples provide quick walkthroughs to 
get you up and running with the lab - [Ground Truth Conversion Scripts](ground_truth_conversion_scripts) provides a conversion script for the output of Ground Truth semantic segmentation manifest to Common Objects in Context (COCO) format - [3D Point Cloud Demo](3d_point_cloud_demo) demonstrates the Amazon SageMaker Ground Truth's annotation workflow for 3D point cloud data types. - [3D Point Cloud Input Data Processing](3d_point_cloud_input_data_processing) demonstrates how you can pre-process your 3D point cloud input data to create an object tracking job labeling job. +- [Labeling Adjustment Job Adaptation](labeling_adjustment_job_adaptation) is a utility script to help you remove individual unnecessary labels from manifest files so that you can successfully launch label adjustment jobs with SageMaker Ground Truth. diff --git a/ground_truth_labeling_jobs/labeling_adjustment_job_adaptation/labeling_adjustment_job_adaptation.ipynb b/ground_truth_labeling_jobs/labeling_adjustment_job_adaptation/labeling_adjustment_job_adaptation.ipynb new file mode 100644 index 0000000000..c4dcd9c8e6 --- /dev/null +++ b/ground_truth_labeling_jobs/labeling_adjustment_job_adaptation/labeling_adjustment_job_adaptation.ipynb @@ -0,0 +1,272 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a56a7e16", + "metadata": {}, + "source": [ + "# Labeling Adjustment Job Adaptation\n", + "\n", + "## Labeling Adjustment Jobs\n", + "\n", + "This notebook is focusing on creation of Labeling Adjustment Jobs in SageMaker Ground Truth.\n", + "\n", + "More details about the usage of label adjustment jobs as well as their creation can be found in official documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/sms-verification-data.html\n", + "\n", + "## Customer use case description\n", + "\n", + "The example provided here is given for the bounding box labeling job with multiple object detection on image data. 
\n", + "\n", + "Once your customer has originally labeled their dataset for object detection and trained their first models it is possible that the business requirements and priorities might change. Therefore, individual original objects which we did want to detect originally with our model might become irrelevant for further detection and should be removed, while we would want to add additional labels to be detected in our dataset.\n", + "\n", + "This will require the original dataset to be re-labeled with the labeling adjustment job displaying the already existing labels which we want to keep while removing the labels which are not anymore in target scope. The current SageMaker Ground Truth UI enables us to remove unwanted labels from the labeling team workforce UI before launching labeling adjustment job, which will also remove the labels visually from each individual image displayed to the labeling team.\n", + "\n", + "However, jobs launched in this way will fail on every example image during consolidation stage where the labels have not been adjusted by the labeling team. To avoid this issue, we need to process the existing output manifest file and remove all the unwanted labels from the manifest file directly before launching labeling adjustment job.\n", + "\n", + "The script provided in this notebook accepts as input a set of labels to remove from the output manifest file, and the name of the labeling job containing the output manifest file to adjust. It will generate the cleaned output manifest file with only target labels removed from the latest labeling job that can be used to safely launch label adjustment job." 
+ ] + }, + { + "cell_type": "markdown", + "id": "13912066", + "metadata": {}, + "source": [ + "### Function code" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d38f5df4", + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "import os\n", + "import botocore\n", + "import json\n", + "\n", + "sagemaker_client = boto3.client(\"sagemaker\")\n", + "s3_client = boto3.client(\"s3\")\n", + "\n", + "##### Helper function for communication with aws services (sagemaker and s3)\n", + "def get_labeling_job_output_manifest_file_location(\n", + " labeling_job_name: str, sagemaker_client: botocore.client\n", + ") -> str:\n", + " \"\"\"\n", + " # ref: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_labeling_job\n", + " \"\"\"\n", + " s3_output_location = sagemaker_client.describe_labeling_job(LabelingJobName=labeling_job_name)[\n", + " \"OutputConfig\"\n", + " ][\"S3OutputPath\"]\n", + " manifest_file_relative_path_from_output_location = \"{}/manifests/output/output.manifest\".format(\n", + " labeling_job_name\n", + " )\n", + " output_manifest_absolute_path = os.path.join(\n", + " s3_output_location, manifest_file_relative_path_from_output_location\n", + " )\n", + "\n", + " return output_manifest_absolute_path\n", + "\n", + "\n", + "def get_labeling_job_attribute_name(\n", + " labeling_job_name: str, sagemaker_client: botocore.client\n", + ") -> str:\n", + " \"\"\"\n", + " # ref: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.describe_labeling_job\n", + " \"\"\"\n", + " labeling_job_attribute_name = sagemaker_client.describe_labeling_job(\n", + " LabelingJobName=labeling_job_name\n", + " )[\"LabelAttributeName\"]\n", + " return labeling_job_attribute_name\n", + "\n", + "\n", + "def split_bucket_key_from_s3_path(s3_full_path: str) -> (str, str):\n", + " \"\"\"\n", + " full s3 path in format: s3://BUCKET/KEY\n", + " 
\"\"\"\n", + " split_location = s3_full_path[5:].find(\"/\") + 5\n", + " return s3_full_path[5:split_location], s3_full_path[split_location + 1 :]\n", + "\n", + "\n", + "def read_s3_file(file_path: str, s3_client: botocore.client):\n", + " bucket_name, key = split_bucket_key_from_s3_path(file_path)\n", + " response = s3_client.get_object(Bucket=bucket_name, Key=key)[\"Body\"].read()\n", + " return response\n", + "\n", + "\n", + "def save_file_to_s3(file_path: str, object_to_save, s3_client: botocore.client):\n", + " bucket_name, key = split_bucket_key_from_s3_path(file_path)\n", + " s3_client.put_object(Body=object_to_save, Bucket=bucket_name, Key=key)\n", + "\n", + "\n", + "#### Helper functions to process the output.manifest file and cleanup unnecessary labels\n", + "def get_class_ids_for_removable_labels(label_annotations_metadata, labels_to_remove):\n", + " class_ids_for_removable_labels = []\n", + " for label in labels_to_remove:\n", + " for key, value in label_annotations_metadata[\"class-map\"].items():\n", + " if value == label:\n", + " class_ids_for_removable_labels.append(key)\n", + " del label_annotations_metadata[\"class-map\"][key]\n", + " break\n", + " return label_annotations_metadata, class_ids_for_removable_labels\n", + "\n", + "\n", + "def clean_up_annotations(label_annotations, class_ids_to_remove):\n", + " removed_annotation_positions = []\n", + " new_annotations_list = []\n", + " for i in range(len(label_annotations[\"annotations\"])):\n", + " if str(label_annotations[\"annotations\"][i][\"class_id\"]) in class_ids_to_remove:\n", + " removed_annotation_positions.append(i)\n", + " else:\n", + " new_annotations_list.append(label_annotations[\"annotations\"][i])\n", + " label_annotations[\"annotations\"] = new_annotations_list\n", + " return label_annotations, removed_annotation_positions\n", + "\n", + "\n", + "def clean_up_metadata(label_annotations_metadata, removed_marked_labels_positions):\n", + " for i in 
range(len(removed_marked_labels_positions)):\n", + " del label_annotations_metadata[\"objects\"][removed_marked_labels_positions[i] - i]\n", + "\n", + " label_annotations_metadata[\"adjustment-status\"] = \"adjusted\"\n", + " return label_annotations_metadata\n", + "\n", + "\n", + "#### Main function to remove all the unnecessary labels from manifest file\n", + "def remove_labels_from_output_manifest_file(\n", + " remove_labels: list, marked_labels: list, labeling_job_attribute_name: str\n", + "):\n", + " \"\"\"\n", + " remove_labels (list[str]): list of labels we want to remove from output.manifest file\n", + " marked_labels (list[marked_labels_per_document]): content of output.manifest file marked labels per document\n", + " format of marked_labels_per_document:\n", + " 'all_keys': ['source-ref', 'category', 'category-metadata','chain-job-name','chain-job-name-metadata']\n", + " 'category' (chain-job-name): ['image_size', 'annotations']\n", + " 'category-metadata' (chain-job-name-metadata): ['objects', 'class-map', 'type', 'human-annotated', 'creation-date', 'job-name', 'adjustment-status']\n", + " labeling_job_attribute_name (str): name of the labeling job attribute to find adequate annotations and annotations_meta data to be adjusted\n", + " \"\"\"\n", + " nmb_keys_previous = len(list(marked_labels[0].keys()))\n", + " total_nmb_of_removed_marked_labels = 0\n", + "\n", + " for label in marked_labels:\n", + " nmb_keys = len(list(label.keys()))\n", + " if nmb_keys_previous != nmb_keys:\n", + " assert \"Label does not have same amount of keys as others! 
This is unexpected behaviour since each should have same amount of jobs run...\"\n", + "\n", + " latest_annotations_name = labeling_job_attribute_name\n", + " latest_annotations_metadata_name = \"{}-metadata\".format(labeling_job_attribute_name)\n", + "\n", + " (\n", + " label[latest_annotations_metadata_name],\n", + " class_ids_to_remove,\n", + " ) = get_class_ids_for_removable_labels(\n", + " label[latest_annotations_metadata_name], remove_labels\n", + " )\n", + "\n", + " # every labeling job class-map should have one label mentioned only once, but not every class needs to be present\n", + " assert len(class_ids_to_remove) <= len(remove_labels)\n", + "\n", + " label[latest_annotations_name], removed_marked_labels_positions = clean_up_annotations(\n", + " label[latest_annotations_name], class_ids_to_remove\n", + " )\n", + " label[latest_annotations_metadata_name] = clean_up_metadata(\n", + " label[latest_annotations_metadata_name], removed_marked_labels_positions\n", + " )\n", + " total_nmb_of_removed_marked_labels += len(removed_marked_labels_positions)\n", + "\n", + " # this will log for you the total number of labels that have been removed from your manifest file\n", + " # you can use it to check the expectations depending on how many labels of the target type to be removed,\n", + " # was expected in input manifest file\n", + " print(\"In total we have removed {} marked labels.\".format(total_nmb_of_removed_marked_labels))\n", + " return marked_labels\n", + "\n", + "\n", + "def main_function(\n", + " labeling_job_name, remove_labels, path_to_save_results_to, sagemaker_client, s3_client\n", + "):\n", + " output_file_path = get_labeling_job_output_manifest_file_location(\n", + " labeling_job_name, sagemaker_client\n", + " )\n", + " output_file_content = read_s3_file(output_file_path, s3_client)\n", + "\n", + " labels = []\n", + " for line in output_file_content.splitlines():\n", + " labels.append(json.loads(line))\n", + "\n", + " cleaned_labels = 
remove_labels_from_output_manifest_file(\n", + " remove_labels, labels, get_labeling_job_attribute_name(labeling_job_name, sagemaker_client)\n", + " )\n", + " # you can uncomment this to generate a smaller output file for testing\n", + " # cleaned_labels = cleaned_labels[:15]\n", + "\n", + " # function to save back all the marked labels to cleaned up manifest file\n", + " output_manifest_cleaned_content = \"\"\n", + " for clean_label in cleaned_labels:\n", + " output_manifest_cleaned_content = (\n", + " output_manifest_cleaned_content + json.dumps(clean_label) + \"\\n\"\n", + " )\n", + "\n", + " save_file_to_s3(path_to_save_results_to, output_manifest_cleaned_content, s3_client)" + ] + }, + { + "cell_type": "markdown", + "id": "098d85e8", + "metadata": {}, + "source": [ + "### Parameter setup and script execution" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ea0a5772", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In total we have removed 4 marked labels.\n" + ] + } + ], + "source": [ + "#### program execution\n", + "\n", + "# these are input parameters to adjust\n", + "labeling_job_name = \"\"\n", + "remove_labels = [\"\", \"\", \"\", \"\"]\n", + "\n", + "path_to_save_results_to = (\n", + " \"s3:////output.manifest\"\n", + ")\n", + "\n", + "main_function(\n", + " labeling_job_name, remove_labels, path_to_save_results_to, sagemaker_client, s3_client\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git 
a/hyperparameter_tuning/blazingtext_text_classification_20_newsgroups/hpo_blazingtext_text_classification_20_newsgroups.ipynb b/hyperparameter_tuning/blazingtext_text_classification_20_newsgroups/hpo_blazingtext_text_classification_20_newsgroups.ipynb new file mode 100644 index 0000000000..ca0a7475bf --- /dev/null +++ b/hyperparameter_tuning/blazingtext_text_classification_20_newsgroups/hpo_blazingtext_text_classification_20_newsgroups.ipynb @@ -0,0 +1,1139 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Text Classification with Amazon SageMaker BlazingText and Hyperparameter Tuning\n", + "\n", + "Automatic model tuning, also known as hyperparameter tuning, finds the best version of a model by running many jobs that test a range of hyperparameters on your dataset. You choose the tunable hyperparameters, a range of values for each, and an objective metric. You choose the objective metric from the metrics that the algorithm computes. Automatic model tuning searches the hyperparameters chosen to find the combination of values that result in the model that optimizes the objective metric.\n", + "\n", + "\n", + "## Introduction\n", + "\n", + "Text Classification can be used to solve various use-cases like sentiment analysis, spam detection, hashtag prediction etc. This notebook demonstrates the use of SageMaker BlazingText to perform supervised binary/multi class with single or multi label text classification. BlazingText can train the model on more than a billion words in a couple of minutes using a multi-core CPU or a GPU, while achieving performance on par with the state-of-the-art deep learning text classification algorithms. BlazingText extends the `fastText` text classifier to leverage GPU acceleration using custom `CUDA` kernels." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install Python packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "!{sys.executable} -m pip install \"scikit_learn==0.20.0\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Let's start by specifying:\n", + "\n", + "- The S3 bucket and prefix that you want to use for training and model data. This should be within the same region as the Notebook Instance, training, and hosting. If you don't specify a bucket, SageMaker SDK will create a default bucket following a pre-defined naming convention in the same region. \n", + "- The IAM role ARN used to give SageMaker access to your data. It can be fetched using the **get_execution_role** method from sagemaker python SDK." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "isConfigCell": true + }, + "outputs": [], + "source": [ + "import sagemaker\n", + "from sagemaker import get_execution_role\n", + "import json\n", + "import boto3\n", + "import pandas as pd\n", + "import re\n", + "import string\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "sess = sagemaker.Session()\n", + "\n", + "role = get_execution_role()\n", + "print(\n", + " role\n", + ") # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf\n", + "\n", + "bucket = sess.default_bucket() # Replace with your own bucket name if needed\n", + "print(bucket)\n", + "prefix = \"blazingtext/supervised/20_newsgroups\" # Replace with the prefix under which you want to store the data if needed" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Preparation\n", + "\n", + "Now we'll download a dataset from the web on which we want to train the text classification model. 
BlazingText expects a single preprocessed text file with space separated tokens and each line of the file should contain a single sentence and the corresponding label(s) prefixed by \"\\__label\\__\".\n", + "\n", + "In this example, let us train the text classification model on the [`20 newsgroups dataset`](http://qwone.com/~jason/20Newsgroups/). The `20 newsgroups dataset` consists of 20000 messages taken from 20 Usenet newsgroups." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import shutil\n", + "\n", + "data_dir = \"20_newsgroups_bulk\"\n", + "if os.path.exists(data_dir): # cleanup existing data folder\n", + " shutil.rmtree(data_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!aws s3 cp s3://sagemaker-sample-files/datasets/text/20_newsgroups/20_newsgroups_bulk.tar.gz ." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!tar xzf 20_newsgroups_bulk.tar.gz\n", + "!ls 20_newsgroups_bulk" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file_list = [os.path.join(data_dir, f) for f in os.listdir(data_dir)]\n", + "print(\"Number of files:\", len(file_list))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "documents_count = 0\n", + "for file in file_list:\n", + " df = pd.read_csv(file, header=None, names=[\"text\"])\n", + " documents_count = documents_count + df.shape[0]\n", + "print(\"Number of documents:\", documents_count)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "categories_list = [f.split(\"/\")[1] for f in file_list]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"categories_list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let us inspect the dataset to get some understanding about how the data and the labels are provided in the dataset. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"./20_newsgroups_bulk/rec.motorcycles\", header=None, names=[\"text\"])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"text\"][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"./20_newsgroups_bulk/comp.sys.mac.hardware\", header=None, names=[\"text\"])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"text\"][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see from the above, there is a single file for each class in the dataset. Each record is just a plain-text paragraph with a header, body, footer and quotes. We will need to process them into a suitable data format." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preprocessing\n", + "We need to preprocess the training data into **space separated tokenized text** format which can be consumed by the `BlazingText` algorithm. Also, as mentioned previously, the class label(s) should be prefixed with `__label__` and it should be present in the same line along with the original sentence. We'll use the `nltk` library to tokenize the input sentences from `20 newsgroups dataset`." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download the `nltk` tokenizer and other libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.tokenize import word_tokenize\n", + "\n", + "nltk.download(\"punkt\")\n", + "nltk.download(\"stopwords\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets.twenty_newsgroups import (\n", + " strip_newsgroup_header,\n", + " strip_newsgroup_quoting,\n", + " strip_newsgroup_footer,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following function will remove the header, footer and quotes (of earlier messages in each text)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def strip_newsgroup_item(item):\n", + " item = strip_newsgroup_header(item)\n", + " item = strip_newsgroup_quoting(item)\n", + " item = strip_newsgroup_footer(item)\n", + " return item" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following function will take care of stop words removal, removing leading/trailing whitespace, extra space, tabs, and HTML tags/markup." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's get a list of stop words from the NLTK library\n", + "stop_words = stopwords.words(\"english\")\n", + "\n", + "\n", + "def process_text(texts):\n", + " final_text_list = []\n", + " for text in texts:\n", + "\n", + " # Check if the sentence is a missing value\n", + " if isinstance(text, str) == False:\n", + " text = \"\"\n", + "\n", + " filtered_sentence = []\n", + "\n", + " # Lowercase\n", + " text = text.lower()\n", + "\n", + " # Remove leading/trailing whitespace, extra space, tabs, 
and HTML tags/markups\n", + " text = text.strip()\n", + " text = re.sub(\"\\[.*?\\]\", \"\", text)\n", + " text = re.sub(\"https?://\\S+|www\\.\\S+\", \"\", text)\n", + " text = re.sub(\"<.*?>+\", \"\", text)\n", + " text = re.sub(\"[%s]\" % re.escape(string.punctuation), \"\", text)\n", + " text = re.sub(\"\\n\", \"\", text)\n", + " text = re.sub(\"\\w*\\d\\w*\", \"\", text)\n", + "\n", + " for w in word_tokenize(text):\n", + " # We are applying some custom filtering here, feel free to try different things\n", + " # Check if it is not numeric and its length>2 and not in stop words\n", + " if (not w.isnumeric()) and (len(w) > 2) and (w not in stop_words):\n", + " filtered_sentence.append(w)\n", + " final_string = \" \".join(filtered_sentence) # final string of cleaned words\n", + "\n", + " final_text_list.append(final_string)\n", + "\n", + " return final_text_list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will read each of the `20_newsgroups` dataset files, call `strip_newsgroup_item` and `process_text` functions we defined earlier, and then aggregate all data into one dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df = pd.DataFrame()\n", + "\n", + "for file in file_list:\n", + " print(f\"Processing {file}\")\n", + " label = file.split(\"/\")[1]\n", + " df = pd.read_csv(file, header=None, names=[\"text\"])\n", + " df[\"text\"] = df[\"text\"].apply(strip_newsgroup_item)\n", + " df[\"text\"] = process_text(df[\"text\"].tolist())\n", + " df[\"label\"] = label\n", + " all_categories_df = all_categories_df.append(df, ignore_index=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's inspect how many categories there are in our dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df[\"label\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In our dataset there are 20 categories, which is too many, so we will combine the sub-categories." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# replace to politics\n", + "all_categories_df[\"label\"].replace(\n", + " {\n", + " \"talk.politics.misc\": \"politics\",\n", + " \"talk.politics.guns\": \"politics\",\n", + " \"talk.politics.mideast\": \"politics\",\n", + " },\n", + " inplace=True,\n", + ")\n", + "\n", + "# replace to recreational\n", + "all_categories_df[\"label\"].replace(\n", + " {\n", + " \"rec.sport.hockey\": \"recreational\",\n", + " \"rec.sport.baseball\": \"recreational\",\n", + " \"rec.autos\": \"recreational\",\n", + " \"rec.motorcycles\": \"recreational\",\n", + " },\n", + " inplace=True,\n", + ")\n", + "\n", + "# replace to religion\n", + "all_categories_df[\"label\"].replace(\n", + " {\n", + " \"soc.religion.christian\": \"religion\",\n", + " \"talk.religion.misc\": \"religion\",\n", + " \"alt.atheism\": \"religion\",\n", + " },\n", + " inplace=True,\n", + ")\n", + "\n", + "# replace to computer\n", + "all_categories_df[\"label\"].replace(\n", + " {\n", + " \"comp.windows.x\": \"computer\",\n", + " \"comp.sys.ibm.pc.hardware\": \"computer\",\n", + " \"comp.os.ms-windows.misc\": \"computer\",\n", + " \"comp.graphics\": \"computer\",\n", + " \"comp.sys.mac.hardware\": \"computer\",\n", + " },\n", + " inplace=True,\n", + ")\n", + "# replace to sales\n", + "all_categories_df[\"label\"].replace({\"misc.forsale\": \"sales\"}, inplace=True)\n", + "\n", + "# replace to science\n", + "all_categories_df[\"label\"].replace(\n", + " {\n", + " \"sci.crypt\": \"science\",\n", + " \"sci.electronics\": \"science\",\n", + " \"sci.med\": \"science\",\n", + " 
\"sci.space\": \"science\",\n", + " },\n", + " inplace=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we are left with 6 categories, which is much better." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df[\"label\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's calculate number of words for each row." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df[\"word_count\"] = all_categories_df[\"text\"].apply(lambda x: len(str(x).split()))\n", + "all_categories_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's get basic statistics about the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df[\"word_count\"].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the mean value is around 86 words. However, there are outliers, such as a text with 6179 words. This can make it harder for the model to result in good performance. We will take care to drop those rows." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's drop empty rows first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "no_text = all_categories_df[all_categories_df[\"word_count\"] == 0]\n", + "print(len(no_text))\n", + "\n", + "# drop these rows\n", + "all_categories_df.drop(no_text.index, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's drop the rows that are longer than 128 words. This is done to make it easy for the model to train without outliers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "long_text = all_categories_df[all_categories_df[\"word_count\"] > 128]\n", + "print(len(long_text))\n", + "\n", + "# drop these rows\n", + "all_categories_df.drop(long_text.index, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df[\"label\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's get basic statistics about the dataset after our outlier fixes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df[\"word_count\"].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This looks much more balanced." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we drop the `word_count` column, as we will not need it anymore." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df.drop(columns=\"word_count\", axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We partition the dataset into an 80% training set and a 20% validation set." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train, validation = train_test_split(all_categories_df, test_size=0.2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def save_to_csv_with_prefix(df, file_name):\n", + " df[\"text\"] = \"__label__\" + df[\"label\"] + \" \" + df[\"text\"]\n", + " df.drop(columns=\"label\", axis=1, inplace=True)\n", + " records = df[\"text\"].values.tolist()\n", + " print(len(records))\n", + " f = open(file_name, \"w\")\n", + " for element in records:\n", + " f.write(element + \"\\n\")\n", + " f.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "save_to_csv_with_prefix(train, \"20_newsgroups.train\")\n", + "save_to_csv_with_prefix(validation, \"20_newsgroups.validation\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let us inspect the train and the validation datasets after the preprocessing, to get an understanding of how the data and the labels now look." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!head 20_newsgroups.train -n 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!head 20_newsgroups.validation -n 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We need to upload the data to S3 so that it can be consumed by SageMaker to execute training jobs. We'll use the SageMaker Python SDK to upload these two files to the bucket and prefix location that we have set above. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_channel = prefix + \"/train\"\n", + "validation_channel = prefix + \"/validation\"\n", + "\n", + "sess.upload_data(path=\"20_newsgroups.train\", bucket=bucket, key_prefix=train_channel)\n", + "sess.upload_data(path=\"20_newsgroups.validation\", bucket=bucket, key_prefix=validation_channel)\n", + "\n", + "s3_train_data = \"s3://{}/{}\".format(bucket, train_channel)\n", + "s3_validation_data = \"s3://{}/{}\".format(bucket, validation_channel)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we need to set up an output location at S3, where the model artifacts will be stored. These artifacts are also the output of the algorithm's training job." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s3_output_location = \"s3://{}/{}/output\".format(bucket, prefix)\n", + "print(s3_output_location)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up hyperparameter tuning job\n", + "Now that we are done with all the setup that is needed, we are ready to train our BlazingText model. To begin, let us create an `Estimator` object. This estimator will launch the training job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "region_name = boto3.Session().region_name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "container = sagemaker.image_uris.retrieve(\"blazingtext\", region_name, \"1\")\n", + "print(\"Using SageMaker BlazingText container: {} ({})\".format(container, region_name))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training the BlazingText model for supervised text classification" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "BlazingText supports a *supervised* mode for text classification. It extends the `FastText` text classifier to leverage GPU acceleration using custom `CUDA` kernels.\n", + "The model can be trained on more than a billion words in a couple of minutes using a multi-core CPU or a GPU, while achieving performance on par with the state-of-the-art deep learning text classification algorithms.\n", + "For more information, please refer to the [algorithm documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/blazingtext.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's define the SageMaker `Estimator` with resource configurations and hyperparameters to train Text Classification on `20 newsgroups` dataset, using \"supervised\" mode on a `c4.4xlarge` instance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "estimator = sagemaker.estimator.Estimator(\n", + " container,\n", + " role,\n", + " instance_count=1,\n", + " instance_type=\"ml.c4.4xlarge\",\n", + " volume_size=30,\n", + " max_run=360000,\n", + " input_mode=\"File\",\n", + " output_path=s3_output_location,\n", + " hyperparameters={\n", + " \"mode\": \"supervised\",\n", + " \"epochs\": 25,\n", + " \"min_count\": 2,\n", + " \"early_stopping\": True,\n", + " \"patience\": 4,\n", + " \"min_epochs\": 5,\n", + " \"word_ngrams\": 1,\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we've defined our estimator we can specify the hyperparameters we'd like to tune and their possible values. We have three different types of hyperparameters.\n", + "- Categorical parameters need to take one value from a discrete set. We define this by passing the list of possible values to `CategoricalParameter(list)`\n", + "- Continuous parameters can take any real number value between the minimum and maximum value, defined by `ContinuousParameter(min, max)`\n", + "- Integer parameters can take any integer value between the minimum and maximum value, defined by `IntegerParameter(min, max)`\n", + "\n", + "*Note, if possible, it's almost always best to specify a value as the least restrictive type. For example, tuning learning rate as a continuous value between 0.01 and 0.2 is likely to yield a better result than tuning as a categorical parameter with values 0.01, 0.1, 0.15, or 0.2.*\n", + "\n", + "Refer to [BlazingText Hyperparameters](https://docs.aws.amazon.com/sagemaker/latest/dg/blazingtext_hyperparameters.html) in the Amazon SageMaker documentation for the complete list of hyperparameters." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.tuner import (\n", + " IntegerParameter,\n", + " CategoricalParameter,\n", + " ContinuousParameter,\n", + " HyperparameterTuner,\n", + ")\n", + "\n", + "hyperparameter_ranges = {\n", + " \"learning_rate\": ContinuousParameter(0.05, 0.15),\n", + " \"vector_dim\": IntegerParameter(32, 300),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we'll specify the objective metric that we'd like to tune and its definition, which includes the regular expression (Regex) needed to extract that metric from the CloudWatch logs of the training job. Since we are using built-in `BlazingText` algorithm here, it emits two predefined metrics: `train:mean_rho` and `validation:accuracy`, and we elected to monitor `validation:accuracy` as you can see below. In this case, we only need to specify the metric name and do not need to provide regex. If you bring your own algorithm, your algorithm emits metrics by itself. In that case, you'll need to add a `MetricDefinition` object here to define the format of those metrics through regex, so that SageMaker knows how to extract those metrics from your CloudWatch logs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "objective_metric_name = \"validation:accuracy\"\n", + "objective_type = \"Maximize\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we'll create a `HyperparameterTuner` object, to which we pass:\n", + "- The `BlazingText` estimator we created above\n", + "- Our hyperparameter ranges\n", + "- Objective metric name and definition\n", + "- Tuning resource configurations such as Number of training jobs to run in total and how many training jobs can be run in parallel." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tuner = HyperparameterTuner(\n", + " estimator,\n", + " objective_metric_name,\n", + " hyperparameter_ranges,\n", + " max_jobs=6,\n", + " max_parallel_jobs=2,\n", + " objective_type=objective_type,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that the hyper-parameters are set up, let us prepare the handshake between our data channels and the algorithm. To do this, we need to create the `sagemaker.inputs.TrainingInput` objects from our data channels. These objects are then put in a simple dictionary, which the algorithm consumes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data = sagemaker.inputs.TrainingInput(\n", + " s3_train_data,\n", + " distribution=\"FullyReplicated\",\n", + " content_type=\"text/plain\",\n", + " s3_data_type=\"S3Prefix\",\n", + ")\n", + "validation_data = sagemaker.inputs.TrainingInput(\n", + " s3_validation_data,\n", + " distribution=\"FullyReplicated\",\n", + " content_type=\"text/plain\",\n", + " s3_data_type=\"S3Prefix\",\n", + ")\n", + "data_channels = {\"train\": train_data, \"validation\": validation_data}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have our `Estimator` object, we have set the hyper-parameters for this object, and we have our data channels linked with the algorithm. The only remaining thing to do is to train the algorithm. The following command will train the algorithm. Training the algorithm involves a few steps. Firstly, the instance that we requested while creating the `Estimator` classes is provisioned and is set up with the appropriate libraries. Then, the data from our channels are downloaded into the instance. Once this is done, the training job begins. The provisioning and data downloading will take some time, depending on the size of the data. 
Therefore, it might be a few minutes before we start getting training logs for our training jobs. The data logs will also print out Accuracy on the validation data for every epoch after training job has executed `min_epochs`. This metric is a proxy for the quality of the algorithm.\n", + "\n", + "A \"Job complete\" message will be printed once the job has finished. The trained model can be found in the S3 bucket that was set up as `output_path` in the estimator." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Launch hyperparameter tuning job\n", + "Now we can launch a hyperparameter tuning job by calling *fit()* function. After the hyperparameter tuning job is created, we can go to SageMaker console to track the progress of the hyperparameter tuning job until it is completed.\n", + "\n", + "This should take around 12 minutes to complete." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "tuner.fit(inputs=data_channels, logs=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analyze Results of a Hyperparameter Tuning job\n", + "\n", + "Once you have completed a tuning job, (or even while the job is still running) you can use the code below to analyze the results to understand how each hyperparameter affects the quality of the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sm_client = boto3.Session().client(\"sagemaker\")\n", + "\n", + "tuning_job_name = tuner.latest_tuning_job.name\n", + "tuning_job_name" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Track hyperparameter tuning job progress\n", + "After you launch a tuning job, you can see its progress by calling `describe_hyper_parameter_tuning_job` API. The output from `describe_hyper_parameter_tuning_job` is a JSON object that contains information about the current state of the tuning job. 
You can call `list_training_jobs_for_tuning_job` to see a detailed list of the training jobs that the tuning job launched." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tuning_job_result = sm_client.describe_hyper_parameter_tuning_job(\n", + " HyperParameterTuningJobName=tuning_job_name\n", + ")\n", + "\n", + "status = tuning_job_result[\"HyperParameterTuningJobStatus\"]\n", + "if status != \"Completed\":\n", + " print(\"Reminder: the tuning job has not been completed.\")\n", + "\n", + "job_count = tuning_job_result[\"TrainingJobStatusCounters\"][\"Completed\"]\n", + "print(\"%d training jobs have completed\" % job_count)\n", + "\n", + "is_minimize = (\n", + " tuning_job_result[\"HyperParameterTuningJobConfig\"][\"HyperParameterTuningJobObjective\"][\"Type\"]\n", + " != \"Maximize\"\n", + ")\n", + "objective_name = tuning_job_result[\"HyperParameterTuningJobConfig\"][\n", + " \"HyperParameterTuningJobObjective\"\n", + "][\"MetricName\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "\n", + "if tuning_job_result.get(\"BestTrainingJob\", None):\n", + " print(\"Best model found so far:\")\n", + " pprint(tuning_job_result[\"BestTrainingJob\"])\n", + "else:\n", + " print(\"No training jobs have reported results yet.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetch all results as `DataFrame`\n", + "We can list hyperparameters and objective metrics of all training jobs and pick up the training job with the best objective metric." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "tuner_analytics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)\n", + "\n", + "full_df = tuner_analytics.dataframe()\n", + "\n", + "if len(full_df) > 0:\n", + " df = full_df[full_df[\"FinalObjectiveValue\"] > -float(\"inf\")]\n", + " if len(df) > 0:\n", + " df = df.sort_values(\"FinalObjectiveValue\", ascending=is_minimize)\n", + " print(\"Number of training jobs with valid objective: %d\" % len(df))\n", + " print({\"lowest\": min(df[\"FinalObjectiveValue\"]), \"highest\": max(df[\"FinalObjectiveValue\"])})\n", + " pd.set_option(\"display.max_colwidth\", -1) # Don't truncate TrainingJobName\n", + " else:\n", + " print(\"No training jobs have reported valid results yet.\")\n", + "\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploy the best trained model\n", + "Once the training is done, we can deploy the trained model as an Amazon SageMaker real-time hosted endpoint. This will allow us to make predictions (or inference) from the model. Note that we don't have to host on the same type of instance that we used to train, because usually for inference, less compute power is needed than for training, and in addition, instance endpoints will be up and running for long, it's advisable to choose a cheaper instance for inference.\n", + "\n", + "- `ml.c4.4xlarge` - Compute Optimized instances are ideal for compute bound applications that benefit from high performance processors.\n", + "- `ml.m4.xlarge` - General purpose instances provide a balance of compute, memory and networking resources, and can be used for a variety of diverse workloads." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.serializers import JSONSerializer\n", + "\n", + "text_classifier = tuner.deploy(\n", + " initial_instance_count=1, instance_type=\"ml.m4.xlarge\", serializer=JSONSerializer()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Use JSON format for inference\n", + "BlazingText supports `application/json` as the content-type for inference. The payload should contain a list of sentences with the key as \"**instances**\" while being passed to the endpoint." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentences = [\n", + " \"The modem is an internal AT/(E)ISA 8-bit card (just a little longer than a half-card).\",\n", + " \"In the cage I usually wave to bikers. They usually don't wave back. My wife thinks it's strange but I don't care.\",\n", + " \"Voyager has the unusual luck to be on a stable trajectory out of the solar system.\",\n", + "]\n", + "\n", + "# using the same processing logic that we used during data preparation for training\n", + "processed_sentences = process_text(sentences)\n", + "\n", + "print(processed_sentences)\n", + "\n", + "payload = {\"instances\": processed_sentences}\n", + "\n", + "response = text_classifier.predict(payload)\n", + "\n", + "predictions = json.loads(response)\n", + "print(json.dumps(predictions, indent=2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, the model will return only one prediction, the one with the highest probability. 
For retrieving the top k predictions, you can set `k` in the configuration as shown below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "payload = {\"instances\": processed_sentences, \"configuration\": {\"k\": 2}}\n", + "\n", + "response = text_classifier.predict(payload)\n", + "\n", + "predictions = json.loads(response)\n", + "print(json.dumps(predictions, indent=2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Clean up\n", + "Endpoints should be deleted when no longer in use, since (per the [SageMaker pricing page](https://aws.amazon.com/sagemaker/pricing/)) they're billed by time deployed.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text_classifier.delete_endpoint()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "Python 3 (Data Science)", + "language": "python", + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + }, + "notice": "Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License." + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/hyperparameter_tuning/huggingface_multiclass_text_classification_20_newsgroups/code/train.py b/hyperparameter_tuning/huggingface_multiclass_text_classification_20_newsgroups/code/train.py new file mode 100644 index 0000000000..bc67dae161 --- /dev/null +++ b/hyperparameter_tuning/huggingface_multiclass_text_classification_20_newsgroups/code/train.py @@ -0,0 +1,101 @@ +from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer +from sklearn.metrics import accuracy_score, precision_recall_fscore_support +from datasets import load_from_disk +from datasets import load_metric +import numpy as np +import random +import logging +import sys +import argparse +import os +import torch + + +metric = load_metric("accuracy") + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + + # hyperparameters sent by the client are passed as command-line arguments to the script. 
+ parser.add_argument("--epochs", type=int, default=3) + parser.add_argument("--train-batch-size", type=int, default=32) + parser.add_argument("--num_labels", type=int, default=6) + parser.add_argument("--eval-batch-size", type=int, default=64) + parser.add_argument("--warmup_steps", type=int, default=500) + parser.add_argument("--model_name", type=str) + parser.add_argument("--learning_rate", type=str, default=5e-5) + + # Data, model, and output directories + parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) + parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) + parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"]) + parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"]) + + args, _ = parser.parse_known_args() + + # Set up logging + logger = logging.getLogger(__name__) + + logging.basicConfig( + level=logging.getLevelName("INFO"), + handlers=[logging.StreamHandler(sys.stdout)], + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + + # load datasets + train_dataset = load_from_disk(args.training_dir) + test_dataset = load_from_disk(args.test_dir) + + logger.info(f" loaded train_dataset length is: {len(train_dataset)}") + logger.info(f" loaded test_dataset length is: {len(test_dataset)}") + + # compute metrics function for multi class classification + def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + # download model from model hub + model = AutoModelForSequenceClassification.from_pretrained(args.model_name, num_labels=args.num_labels) + tokenizer = AutoTokenizer.from_pretrained(args.model_name) + + # define training args + training_args = TrainingArguments( + output_dir=args.model_dir, + 
+ num_train_epochs=args.epochs, + per_device_train_batch_size=args.train_batch_size, + per_device_eval_batch_size=args.eval_batch_size, + warmup_steps=args.warmup_steps, + evaluation_strategy="epoch", + logging_dir=f"{args.output_data_dir}/logs", + learning_rate=float(args.learning_rate), + ) + + # create Trainer instance + trainer = Trainer( + model=model, + args=training_args, + compute_metrics=compute_metrics, + train_dataset=train_dataset, + eval_dataset=test_dataset, + tokenizer=tokenizer, + ) + + # train model + trainer.train() + + # evaluate model + eval_result = trainer.evaluate(eval_dataset=test_dataset) + + # writes eval result to file which can be accessed later in s3 output + with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer: + print(f"***** Eval results *****") + for key, value in sorted(eval_result.items()): + writer.write(f"{key} = {value}\n") + + # Saves the model to s3 + trainer.save_model(args.model_dir) + diff --git a/hyperparameter_tuning/huggingface_multiclass_text_classification_20_newsgroups/hpo_huggingface_text_classification_20_newsgroups.ipynb b/hyperparameter_tuning/huggingface_multiclass_text_classification_20_newsgroups/hpo_huggingface_text_classification_20_newsgroups.ipynb new file mode 100644 index 0000000000..0029cfa3d0 --- /dev/null +++ b/hyperparameter_tuning/huggingface_multiclass_text_classification_20_newsgroups/hpo_huggingface_text_classification_20_newsgroups.ipynb @@ -0,0 +1,1377 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Text Classification with Amazon SageMaker HuggingFace and Hyperparameter Tuning\n", + "\n", + "Automatic model tuning, also known as hyperparameter tuning, finds the best version of a model by running many jobs that test a range of hyperparameters on your dataset. You choose the tunable hyperparameters, a range of values for each, and an objective metric. You choose the objective metric from the metrics that the algorithm computes. 
Automatic model tuning searches the hyperparameters chosen to find the combination of values that result in the model that optimizes the objective metric.\n", + "\n", + "\n", + "## Introduction\n", + "\n", + "Text Classification can be used to solve various use-cases like sentiment analysis, spam detection, hashtag prediction etc. \n", + "\n", + "\n", + "This notebook demonstrates the use of the [HuggingFace `transformers` library](https://huggingface.co/transformers/) together with a custom Amazon sagemaker-sdk extension to fine-tune a pre-trained transformer on multi class text classification. In particular, the pre-trained model will be fine-tuned using the [`20 newsgroups dataset`](http://qwone.com/~jason/20Newsgroups/). To get started, we need to set up the environment with a few prerequisite steps, for permissions, configurations, and so on." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install Python packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "!{sys.executable} -m pip install \"scikit_learn==0.20.0\" \"sagemaker>=2.48.0\" \"transformers==4.6.1\" \"datasets[s3]==1.6.2\" \"nltk==3.4.4\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you run this notebook in SageMaker Studio, you need to make sure `ipywidgets` is installed and restart the kernel, so please uncomment the code in the next cell, and run it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# %%capture\n", + "# import IPython\n", + "# import sys\n", + "\n", + "# !{sys.executable} -m pip install ipywidgets\n", + "# IPython.Application.instance().kernel.do_shutdown(True) # has to restart kernel so changes are used" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Let's start by specifying:\n", + "\n", + "- The S3 bucket and prefix that you want to use for training and model data. This should be within the same region as the Notebook Instance, training, and hosting. If you don't specify a bucket, SageMaker SDK will create a default bucket following a pre-defined naming convention in the same region. \n", + "- The IAM role ARN used to give SageMaker access to your data. It can be fetched using the **get_execution_role** method from sagemaker python SDK." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "isConfigCell": true + }, + "outputs": [], + "source": [ + "import sagemaker\n", + "from sagemaker import get_execution_role\n", + "import json\n", + "import boto3\n", + "import pandas as pd\n", + "import re\n", + "import string\n", + "from sklearn.model_selection import train_test_split\n", + "import sagemaker.huggingface\n", + "\n", + "\n", + "sess = sagemaker.Session()\n", + "\n", + "role = get_execution_role()\n", + "print(\n", + " role\n", + ") # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf\n", + "\n", + "bucket = sess.default_bucket() # Replace with your own bucket name if needed\n", + "print(bucket)\n", + "s3_prefix = \"huggingface/20_newsgroups\" # Replace with the prefix under which you want to store the data if needed" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Preparation\n", + "\n", + "Now we'll download a dataset from the web on which we want to train the 
text classification model.\n", + "\n", + "In this example, let us train the text classification model on the [`20 newsgroups dataset`](http://qwone.com/~jason/20Newsgroups/). The `20 newsgroups dataset` consists of 20000 messages taken from 20 Usenet newsgroups." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import shutil\n", + "\n", + "data_dir = \"20_newsgroups_bulk\"\n", + "if os.path.exists(data_dir): # cleanup existing data folder\n", + " shutil.rmtree(data_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!aws s3 cp s3://sagemaker-sample-files/datasets/text/20_newsgroups/20_newsgroups_bulk.tar.gz ." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!tar xzf 20_newsgroups_bulk.tar.gz\n", + "!ls 20_newsgroups_bulk" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file_list = [os.path.join(data_dir, f) for f in os.listdir(data_dir)]\n", + "print(\"Number of files:\", len(file_list))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "documents_count = 0\n", + "for file in file_list:\n", + " df = pd.read_csv(file, header=None, names=[\"text\"])\n", + " documents_count = documents_count + df.shape[0]\n", + "print(\"Number of documents:\", documents_count)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's inspect the dataset files and analyze the categories." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "categories_list = [f.split(\"/\")[1] for f in file_list]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "categories_list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the dataset consists of 20 topics, each in a different file." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let us inspect the dataset to get some understanding about how the data and the label are provided in the dataset. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"./20_newsgroups_bulk/rec.motorcycles\", header=None, names=[\"text\"])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"text\"][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"./20_newsgroups_bulk/comp.sys.mac.hardware\", header=None, names=[\"text\"])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"text\"][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see from the above, there is a single file for each class in the dataset. Each record is just plain-text paragraphs with a header, body, footer and quotes. We will need to process them into a suitable data format." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preprocessing\n", + "We need to preprocess the dataset to remove the header, footer, quotes, leading/trailing whitespace, extra spaces, tabs, and HTML tags/markups. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download the `nltk` tokenizer and other libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import nltk\n", + "from nltk.tokenize import word_tokenize\n", + "\n", + "nltk.download(\"punkt\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets.twenty_newsgroups import (\n", + " strip_newsgroup_header,\n", + " strip_newsgroup_quoting,\n", + " strip_newsgroup_footer,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This following function will remove the header, footer and quotes (of earlier messages in each text)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def strip_newsgroup_item(item):\n", + " item = strip_newsgroup_header(item)\n", + " item = strip_newsgroup_quoting(item)\n", + " item = strip_newsgroup_footer(item)\n", + " return item" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following function will take care of removing leading/trailing whitespace, extra spaces, tabs, and HTML tags/markups." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def process_text(texts):\n", + " final_text_list = []\n", + " for text in texts:\n", + "\n", + " # Check if the sentence is a missing value\n", + " if isinstance(text, str) == False:\n", + " text = \"\"\n", + "\n", + " filtered_sentence = []\n", + "\n", + " # Lowercase\n", + " text = text.lower()\n", + "\n", + " # Remove leading/trailing whitespace, extra space, tabs, and HTML tags/markups\n", + " text = text.strip()\n", + " text = re.sub(\"\\[.*?\\]\", \"\", text)\n", + " text = re.sub(\"https?://\\S+|www\\.\\S+\", \"\", text)\n", + " text = re.sub(\"<.*?>+\", \"\", text)\n", + " text = re.sub(\"[%s]\" % re.escape(string.punctuation), \"\", text)\n", + " text = re.sub(\"\\n\", \"\", text)\n", + " text = re.sub(\"\\w*\\d\\w*\", \"\", text)\n", + "\n", + " for w in word_tokenize(text):\n", + " # We are applying some custom filtering here, feel free to try different things\n", + " # Check if it is not numeric\n", + " if not w.isnumeric():\n", + " filtered_sentence.append(w)\n", + " final_string = \" \".join(filtered_sentence) # final string of cleaned words\n", + "\n", + " final_text_list.append(final_string)\n", + "\n", + " return final_text_list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will read each of the `20_newsgroups` dataset files, call `strip_newsgroup_item` and `process_text` functions we defined earlier, and then aggregate all data into one dataframe." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df = pd.DataFrame()\n", + "\n", + "for file in file_list:\n", + " print(f\"Processing {file}\")\n", + " label = file.split(\"/\")[1]\n", + " df = pd.read_csv(file, header=None, names=[\"text\"])\n", + " df[\"text\"] = df[\"text\"].apply(strip_newsgroup_item)\n", + " df[\"text\"] = process_text(df[\"text\"].tolist())\n", + " df[\"label\"] = label\n", + " all_categories_df = all_categories_df.append(df, ignore_index=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's inspect how many categories there are in our dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df[\"label\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In our dataset there are 20 categories, which is too many, so we will combine the sub-categories." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# replace to politics\n", + "all_categories_df[\"label\"].replace(\n", + " {\n", + " \"talk.politics.misc\": \"politics\",\n", + " \"talk.politics.guns\": \"politics\",\n", + " \"talk.politics.mideast\": \"politics\",\n", + " },\n", + " inplace=True,\n", + ")\n", + "\n", + "# replace to recreational\n", + "all_categories_df[\"label\"].replace(\n", + " {\n", + " \"rec.sport.hockey\": \"recreational\",\n", + " \"rec.sport.baseball\": \"recreational\",\n", + " \"rec.autos\": \"recreational\",\n", + " \"rec.motorcycles\": \"recreational\",\n", + " },\n", + " inplace=True,\n", + ")\n", + "\n", + "# replace to religion\n", + "all_categories_df[\"label\"].replace(\n", + " {\n", + " \"soc.religion.christian\": \"religion\",\n", + " \"talk.religion.misc\": \"religion\",\n", + " \"alt.atheism\": \"religion\",\n", + " },\n", + " inplace=True,\n", + ")\n", + "\n", + "# replace to computer\n", + "all_categories_df[\"label\"].replace(\n", + " {\n", + " \"comp.windows.x\": \"computer\",\n", + " \"comp.sys.ibm.pc.hardware\": \"computer\",\n", + " \"comp.os.ms-windows.misc\": \"computer\",\n", + " \"comp.graphics\": \"computer\",\n", + " \"comp.sys.mac.hardware\": \"computer\",\n", + " },\n", + " inplace=True,\n", + ")\n", + "# replace to sales\n", + "all_categories_df[\"label\"].replace({\"misc.forsale\": \"sales\"}, inplace=True)\n", + "\n", + "# replace to science\n", + "all_categories_df[\"label\"].replace(\n", + " {\n", + " \"sci.crypt\": \"science\",\n", + " \"sci.electronics\": \"science\",\n", + " \"sci.med\": \"science\",\n", + " \"sci.space\": \"science\",\n", + " },\n", + " inplace=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we are left with 6 categories, which is much better." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df[\"label\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's calculate number of words for each row." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df[\"word_count\"] = all_categories_df[\"text\"].apply(lambda x: len(str(x).split()))\n", + "all_categories_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's get basic statistics about the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df[\"word_count\"].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the mean value is around 159 words. However, there are outliers, such as a text with 11351 words. This can make it harder for the model to result in good performance. We will take care to drop those rows." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's drop empty rows first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "no_text = all_categories_df[all_categories_df[\"word_count\"] == 0]\n", + "print(len(no_text))\n", + "\n", + "# drop these rows\n", + "all_categories_df.drop(no_text.index, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's drop the rows that are longer than 256 words, as it is a length close to the mean value of the word count. This is done to make it easy for the model to train without outliers. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "long_text = all_categories_df[all_categories_df[\"word_count\"] > 256]\n", + "print(len(long_text))\n", + "\n", + "# drop these rows\n", + "all_categories_df.drop(long_text.index, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df[\"label\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's get basic statistics about the dataset after our outliers fixes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df[\"word_count\"].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This looks much more balanced." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we drop the `word_count` columns as we will not need it anymore." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df.drop(columns=\"word_count\", axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's convert categorical label to integer number, in order to prepare the dataset for training." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "categories = all_categories_df[\"label\"].unique().tolist()\n", + "categories" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "categories.index(\"recreational\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df[\"label\"] = all_categories_df[\"label\"].apply(lambda x: categories.index(x))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_categories_df[\"label\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We partition the dataset into 80% training and 20% validation set and save to `csv` files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_df, test_df = train_test_split(all_categories_df, test_size=0.2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_df.to_csv(\"train.csv\", index=None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_df.to_csv(\"test.csv\", index=None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's inspect the label distribution in the training dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_df[\"label\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's inspect the label distribution in the test dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_df[\"label\"].value_counts()" + ] + }, + { + "cell_type": "markdown", 
+ "metadata": {}, + "source": [ + "## Tokenization " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A tokenizer is in charge of preparing the inputs for a model. The library contains tokenizers for all the models. Most of the tokenizers are available in two flavors: a full Python implementation and a “Fast” implementation based on the Rust library [tokenizers](https://github.com/huggingface/tokenizers). The “Fast” implementations allow:\n", + "\n", + " - A significant speed-up in particular when doing batched tokenization.\n", + " - Additional methods to map between the original string (characters and words) and the token space (e.g. getting the index of the token comprising a given character or the span of characters corresponding to a given token). " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "from transformers import AutoTokenizer\n", + "\n", + "# tokenizer used in preprocessing\n", + "tokenizer_name = \"distilbert-base-uncased\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# download tokenizer\n", + "tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load train and test datasets\n", + "\n", + "Let's create a [Dataset](https://huggingface.co/docs/datasets/loading_datasets.html) from the local training and test `csv` files we saved earlier." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = load_dataset(\"csv\", data_files={\"train\": \"train.csv\", \"test\": \"test.csv\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset[\"train\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset[\"train\"][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset[\"test\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset[\"test\"][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# tokenizer helper function\n", + "def tokenize(batch):\n", + " return tokenizer(batch[\"text\"], padding=\"max_length\", truncation=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_dataset = dataset[\"train\"]\n", + "test_dataset = dataset[\"test\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tokenize train and test datasets\n", + "\n", + "Let's tokenize the train dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_dataset = train_dataset.map(tokenize, batched=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's tokenize the test dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_dataset = test_dataset.map(tokenize, batched=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "### Set format for PyTorch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_dataset = train_dataset.rename_column(\"label\", \"labels\")\n", + "train_dataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n", + "test_dataset = test_dataset.rename_column(\"label\", \"labels\")\n", + "test_dataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Uploading data to `sagemaker_session_bucket`\n", + "\n", + "After we processed the datasets, we are going to upload it to S3." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import botocore\n", + "from datasets.filesystems import S3FileSystem\n", + "\n", + "s3 = S3FileSystem()\n", + "\n", + "# save train_dataset to s3\n", + "training_input_path = f\"s3://{sess.default_bucket()}/{s3_prefix}/train\"\n", + "train_dataset.save_to_disk(training_input_path, fs=s3)\n", + "\n", + "# save test_dataset to s3\n", + "test_input_path = f\"s3://{sess.default_bucket()}/{s3_prefix}/test\"\n", + "test_dataset.save_to_disk(test_input_path, fs=s3)\n", + "\n", + "print(training_input_path)\n", + "print(test_input_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up hyperparameter tuning job\n", + "Now that we are done with all the setup that is needed, we are ready to train our HuggingFace model. To begin, let us create a `HuggingFace` estimator object. This estimator will launch the training job." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training the HuggingFace model for supervised text classification\n", + "\n", + "In order to create a sagemaker training job we need a `HuggingFace` Estimator. The Estimator handles end-to-end Amazon SageMaker training and deployment tasks. 
In an Estimator we define which fine-tuning script should be used as `entry_point`, which `instance_type` should be used, which `hyperparameters` are passed in, and so on.\n", + "\n", + "\n", + "\n", + "```python\n", + "huggingface_estimator = HuggingFace(entry_point='train.py',\n", + " source_dir='./code',\n", + " instance_type='ml.p3.2xlarge',\n", + " instance_count=1,\n", + " volume_size=256,\n", + " role=role,\n", + " transformers_version='4.6',\n", + " pytorch_version='1.7',\n", + " py_version='py36',\n", + " hyperparameters = {'epochs': 1,\n", + " 'model_name':'distilbert-base-uncased',\n", + " 'num_labels': 6\n", + " })\n", + "```\n", + "\n", + "When we create a SageMaker training job, SageMaker takes care of starting and managing all the required EC2 instances for us with the `huggingface` container, uploads the provided fine-tuning script `train.py` and downloads the data from our `sagemaker_session_bucket` into the container at `/opt/ml/input/data`. Then, it starts the training job by running:\n", + "\n", + "```bash\n", + "/opt/conda/bin/python train.py --epochs 1 --model_name distilbert-base-uncased --num_labels 6\n", + "```\n", + "\n", + "The `hyperparameters` you define in the `HuggingFace` estimator are passed in as named arguments. \n", + "\n", + "SageMaker provides useful properties about the training environment through various environment variables, including the following:\n", + "\n", + "* `SM_MODEL_DIR`: A string that represents the path where the training job writes the model artifacts to. After training, artifacts in this directory are uploaded to S3 for model hosting.\n", + "\n", + "* `SM_NUM_GPUS`: An integer representing the number of GPUs available to the host.\n", + "\n", + "* `SM_CHANNEL_XXXX`: A string that represents the path to the directory that contains the input data for the specified channel. 
For example, if you specify two input channels in the HuggingFace estimator’s fit call, named `train` and `test`, the environment variables `SM_CHANNEL_TRAIN` and `SM_CHANNEL_TEST` are set.\n", + "\n", + "\n", + "To run your training job locally you can define `instance_type='local'` or `instance_type='local-gpu'` for `gpu` usage. _Note: this does not work within SageMaker Studio._\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We create a `metric_definitions` list of dictionaries, each containing a regex-based definition that is used to parse the job logs and extract a metric." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metric_definitions = [\n", + " {\"Name\": \"loss\", \"Regex\": \"'loss': ([0-9]+(.|e\-)[0-9]+),?\"},\n", + " {\"Name\": \"learning_rate\", \"Regex\": \"'learning_rate': ([0-9]+(.|e\-)[0-9]+),?\"},\n", + " {\"Name\": \"eval_loss\", \"Regex\": \"'eval_loss': ([0-9]+(.|e\-)[0-9]+),?\"},\n", + " {\"Name\": \"eval_accuracy\", \"Regex\": \"'eval_accuracy': ([0-9]+(.|e\-)[0-9]+),?\"},\n", + " {\"Name\": \"eval_f1\", \"Regex\": \"'eval_f1': ([0-9]+(.|e\-)[0-9]+),?\"},\n", + " {\"Name\": \"eval_precision\", \"Regex\": \"'eval_precision': ([0-9]+(.|e\-)[0-9]+),?\"},\n", + " {\"Name\": \"eval_recall\", \"Regex\": \"'eval_recall': ([0-9]+(.|e\-)[0-9]+),?\"},\n", + " {\"Name\": \"eval_runtime\", \"Regex\": \"'eval_runtime': ([0-9]+(.|e\-)[0-9]+),?\"},\n", + " {\n", + " \"Name\": \"eval_samples_per_second\",\n", + " \"Regex\": \"'eval_samples_per_second': ([0-9]+(.|e\-)[0-9]+),?\",\n", + " },\n", + " {\"Name\": \"epoch\", \"Regex\": \"'epoch': ([0-9]+(.|e\-)[0-9]+),?\"},\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.huggingface import HuggingFace\n", + "\n", + "# hyperparameters, which are passed into the training job\n", + "hyperparameters = {\"epochs\": 1, \"model_name\": 
\"distilbert-base-uncased\", \"num_labels\": 6}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's define the SageMaker `HuggingFace` estimator with resource configurations and hyperparameters to train Text Classification on `20 newsgroups` dataset, running on a `p3.2xlarge` instance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "huggingface_estimator = HuggingFace(\n", + " entry_point=\"train.py\",\n", + " source_dir=\"./code\",\n", + " instance_type=\"ml.p3.2xlarge\",\n", + " instance_count=1,\n", + " volume_size=256,\n", + " role=role,\n", + " transformers_version=\"4.6\",\n", + " pytorch_version=\"1.7\",\n", + " py_version=\"py36\",\n", + " hyperparameters=hyperparameters,\n", + " metric_definitions=metric_definitions,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we've defined our estimator we can specify the hyperparameters we'd like to tune and their possible values. We have three different types of hyperparameters.\n", + "- Categorical parameters need to take one value from a discrete set. We define this by passing the list of possible values to `CategoricalParameter(list)`\n", + "- Continuous parameters can take any real number value between the minimum and maximum value, defined by `ContinuousParameter(min, max)`\n", + "- Integer parameters can take any integer value between the minimum and maximum value, defined by `IntegerParameter(min, max)`\n", + "\n", + "*Note, if possible, it's almost always best to specify a value as the least restrictive type. 
For example, tuning learning rate as a continuous value between 0.01 and 0.2 is likely to yield a better result than tuning as a categorical parameter with values 0.01, 0.1, 0.15, or 0.2.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.tuner import (\n", + " IntegerParameter,\n", + " CategoricalParameter,\n", + " ContinuousParameter,\n", + " HyperparameterTuner,\n", + ")\n", + "\n", + "hyperparameter_ranges = {\n", + " \"train_batch_size\": IntegerParameter(8, 32),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we'll specify the objective metric that we'd like to tune and its definition, which includes the regular expression (Regex) needed to extract that metric from the CloudWatch logs of the training job. If you bring your own algorithm, your algorithm emits metrics by itself. In that case, you'll need to add a `MetricDefinition` object here to define the format of those metrics through regex, so that SageMaker knows how to extract those metrics from your CloudWatch logs.\n", + "\n", + "In this case, we elected to monitor `eval_accuracy` as you can see below. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "objective_metric_name = \"eval_accuracy\"\n", + "objective_type = \"Maximize\"\n", + "hpo_metric_definitions = [\n", + " {\"Name\": \"eval_accuracy\", \"Regex\": \"'eval_accuracy': ([0-9]+(.|e\\-)[0-9]+),?\"}\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we'll create a `HyperparameterTuner` object, to which we pass:\n", + "- The `HuggingFace` estimator we created above\n", + "- Our hyperparameter ranges\n", + "- Objective metric name and definition\n", + "- Tuning resource configurations such as Number of training jobs to run in total and how many training jobs can be run in parallel." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tuner = HyperparameterTuner(\n", + " huggingface_estimator,\n", + " objective_metric_name,\n", + " hyperparameter_ranges,\n", + " hpo_metric_definitions,\n", + " max_jobs=6,\n", + " max_parallel_jobs=3,\n", + " objective_type=objective_type,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Launch hyperparameter tuning job\n", + "Now we can launch a hyperparameter tuning job by calling the `fit()` function. After the hyperparameter tuning job is created, we can go to the SageMaker console to track the progress of the hyperparameter tuning job until it is completed.\n", + "\n", + "This should take around 28 minutes to complete." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "tuner.fit({\"train\": training_input_path, \"test\": test_input_path}, logs=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analyze Results of a Hyperparameter Tuning job\n", + "\n", + "Once you have completed a tuning job (or even while the job is still running), you can use the code below to analyze the results to understand how each hyperparameter affects the quality of the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sm_client = boto3.Session().client(\"sagemaker\")\n", + "\n", + "tuning_job_name = tuner.latest_tuning_job.name\n", + "tuning_job_name" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Track hyperparameter tuning job progress\n", + "After you launch a tuning job, you can see its progress by calling the `describe_hyper_parameter_tuning_job` API. The output from `describe_hyper_parameter_tuning_job` is a JSON object that contains information about the current state of the tuning job. 
You can call `list_training_jobs_for_hyper_parameter_tuning_job` to see a detailed list of the training jobs that the tuning job launched." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tuning_job_result = sm_client.describe_hyper_parameter_tuning_job(\n", + " HyperParameterTuningJobName=tuning_job_name\n", + ")\n", + "\n", + "status = tuning_job_result[\"HyperParameterTuningJobStatus\"]\n", + "if status != \"Completed\":\n", + " print(\"Reminder: the tuning job has not been completed.\")\n", + "\n", + "job_count = tuning_job_result[\"TrainingJobStatusCounters\"][\"Completed\"]\n", + "print(\"%d training jobs have completed\" % job_count)\n", + "\n", + "is_minimize = (\n", + " tuning_job_result[\"HyperParameterTuningJobConfig\"][\"HyperParameterTuningJobObjective\"][\"Type\"]\n", + " != \"Maximize\"\n", + ")\n", + "objective_name = tuning_job_result[\"HyperParameterTuningJobConfig\"][\n", + " \"HyperParameterTuningJobObjective\"\n", + "][\"MetricName\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "\n", + "if tuning_job_result.get(\"BestTrainingJob\", None):\n", + " print(\"Best model found so far:\")\n", + " pprint(tuning_job_result[\"BestTrainingJob\"])\n", + "else:\n", + " print(\"No training jobs have reported results yet.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetch all results as `DataFrame`\n", + "We can list hyperparameters and objective metrics of all training jobs and pick out the training job with the best objective metric." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "tuner_analytics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)\n", + "\n", + "full_df = tuner_analytics.dataframe()\n", + "\n", + "if len(full_df) > 0:\n", + " df = full_df[full_df[\"FinalObjectiveValue\"] > -float(\"inf\")]\n", + " if len(df) > 0:\n", + " df = df.sort_values(\"FinalObjectiveValue\", ascending=is_minimize)\n", + " print(\"Number of training jobs with valid objective: %d\" % len(df))\n", + " print({\"lowest\": min(df[\"FinalObjectiveValue\"]), \"highest\": max(df[\"FinalObjectiveValue\"])})\n", + " pd.set_option(\"display.max_colwidth\", -1) # Don't truncate TrainingJobName\n", + " else:\n", + " print(\"No training jobs have reported valid results yet.\")\n", + "\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploy the best trained model\n", + "Once the training is done, we can deploy the trained model as an Amazon SageMaker real-time hosted endpoint. This will allow us to make predictions (or inference) from the model. Note that we don't have to host on the same type of instance that we used to train. Inference usually needs less compute power than training, and because endpoints stay up and running for a long time, it's advisable to choose a cheaper instance for inference.\n", + "\n", + "- `ml.p3.2xlarge` - part of the P3 instance family, which delivers high performance compute in the cloud with up to 8 NVIDIA® V100 Tensor Core GPUs and up to 100 `Gbps` of networking throughput for machine learning and HPC applications.\n", + "- `ml.g4dn.xlarge` - part of the G4dn instance family, the industry’s most cost-effective and versatile GPU instances for deploying machine learning models such as image classification, object detection, and speech recognition, and for graphics-intensive applications such as remote graphics workstations, game streaming, and graphics rendering." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "predictor = tuner.deploy(1, \"ml.g4dn.xlarge\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, we use the returned predictor object to call the endpoint." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def predict_sentence(sentence):\n", + " result = predictor.predict({\"inputs\": sentence})\n", + " index = int(result[0][\"label\"].split(\"LABEL_\")[1])\n", + " print(categories[index])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentences = [\n", + " \"The modem is an internal AT/(E)ISA 8-bit card (just a little longer than a half-card).\",\n", + " \"In the cage I usually wave to bikers. They usually don't wave back. My wife thinks it's strange but I don't care.\",\n", + " \"Voyager has the unusual luck to be on a stable trajectory out of the solar system.\",\n", + "]\n", + "\n", + "# using the same processing logic that we used during data preparation for training\n", + "processed_sentences = process_text(sentences)\n", + "\n", + "for sentence in processed_sentences:\n", + " predict_sentence(sentence)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Clean up\n", + "Endpoints should be deleted when no longer in use, since (per the [SageMaker pricing page](https://aws.amazon.com/sagemaker/pricing/)) they're billed by time deployed.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "predictor.delete_endpoint()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "Python 3 (PyTorch 1.6 Python 3.6 CPU Optimized)", + "language": 
"python", + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/pytorch-1.6-cpu-py36-ubuntu16.04-v1" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + }, + "notice": "Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/index.rst b/index.rst index f823354748..1f5fb324ed 100644 --- a/index.rst +++ b/index.rst @@ -43,6 +43,12 @@ On SageMaker Studio, you will need to open a terminal, go to your home folder, t end_to_end/fraud_detection/index end_to_end/music_recommendation/index +.. toctree:: + :maxdepth: 1 + :caption: Patterns + + patterns/ml_gateway/index + .. toctree:: :maxdepth: 1 diff --git a/ingest_data/011_Ingest_tabular_data_v1.ipynb b/ingest_data/011_Ingest_tabular_data_v1.ipynb index 8b76220430..c8214d3a6b 100644 --- a/ingest_data/011_Ingest_tabular_data_v1.ipynb +++ b/ingest_data/011_Ingest_tabular_data_v1.ipynb @@ -13,21 +13,21 @@ "* Using AWS native methods to directly access your data. You can also use AWS native packages like `s3fs` and `aws data wrangler` to access your data directly. 
\n", "\n", "We will demonstrate how to ingest the following tabular (structured) into a notebook for further analysis:\n", - "## Tabular data: Boston Housing Data\n", - "The [Boston House](https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html) contains information collected by the U.S Census Service concerning housing in the area of Boston Mass. We will use the data set to showcase how to ingest tabular data into S3, and for further pre-processing and feature engineering. The dataset contains the following columns (506 rows):\n", - "* `CRIM` - per capita crime rate by town\n", - "* `ZN` - proportion of residential land zoned for lots over 25,000 sq.ft.\n", - "* `INDUS` - proportion of non-retail business acres per town.\n", - "* `CHAS` - Charles River dummy variable (1 if tract bounds river; 0 otherwise)\n", - "* `NOX` - nitric oxides concentration (parts per 10 million)\n", - "* `RM` - average number of rooms per dwelling\n", - "* `AGE` - proportion of owner-occupied units built prior to 1940\n", - "* `DIS` - weighted distances to five Boston employment centres\n", - "* `RAD` - index of accessibility to radial highways\n", - "* `TAX` - full-value property-tax rate per \\$10,000\n", - "* `PTRATIO` - pupil-teacher ratio by town\n", - "* `B` - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n", - "* `LSTAT` - \\% lower status of the population" + "## Tabular data: California Housing Data\n", + "The [California Housing](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html) dataset contains information from the 1990 California census. We will use the data set to showcase how to ingest tabular data into S3, and for further pre-processing and feature engineering. 
The dataset contains the following columns:\n", + "\n", + "* `MedInc` - median income in the block group.\n", + "* `HouseAge` - median house age in the block group.\n", + "* `AveRooms` - average number of rooms per household.\n", + "* `AveBedrms` - average number of bedrooms per household.\n", + "* `Population` - block group population.\n", + "* `AveOccup` - average number of household members.\n", + "* `Latitude` - block group latitude.\n", + "* `Longitude` - block group longitude.\n", + "\n", + "The California Housing dataset was originally published in:\n", + "\n", + "> Pace, R. Kelley, and Ronald Barry. \"Sparse spatial autoregressions.\" Statistics & Probability Letters 33.3 (1997): 291-297." ] }, { @@ -58,8 +58,8 @@ "import s3fs\n", "import sagemaker\n", "\n", - "# to load the boston housing dataset\n", - "from sklearn.datasets import *" + "# to load the California housing dataset\n", + "from sklearn.datasets import fetch_california_housing" ] }, { @@ -72,8 +72,8 @@ "sagemaker_session = sagemaker.Session()\n", "s3 = sagemaker_session.boto_session.resource(\"s3\")\n", "bucket = sagemaker_session.default_bucket() # replace with your own bucket name if you have one\n", - "prefix = \"data/tabular/boston_house\"\n", - "filename = \"boston_house.csv\"" + "prefix = \"data/tabular/california_housing\"\n", + "filename = \"california_housing.csv\"" ] }, { @@ -103,10 +103,10 @@ "outputs": [], "source": [ "# download files from tabular data source location\n", - "tabular_data = load_boston()\n", + "tabular_data = fetch_california_housing()\n", "tabular_data_full = pd.DataFrame(tabular_data.data, columns=tabular_data.feature_names)\n", "tabular_data_full[\"target\"] = pd.DataFrame(tabular_data.target)\n", - "tabular_data_full.to_csv(\"boston_house.csv\", index=False)" + "tabular_data_full.to_csv(\"california_housing.csv\", index=False)" ] }, { @@ -219,14 +219,6 @@ "wr_data = wr.s3.read_csv(path=data_wr_location, nrows=5)\n", "wr_data.head()" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Citation\n", - "Boston Housing data, Harrison, D. and Rubinfeld, D.L. 
`Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978." - ] } ], "metadata": { @@ -245,7 +237,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/ingest_data/02_Ingest_data_with_Athena_v1.ipynb b/ingest_data/02_Ingest_data_with_Athena_v1.ipynb index a8c0718393..24ff1cf846 100644 --- a/ingest_data/02_Ingest_data_with_Athena_v1.ipynb +++ b/ingest_data/02_Ingest_data_with_Athena_v1.ipynb @@ -95,7 +95,7 @@ "from sagemaker import get_execution_role\n", "import os\n", "import sys\n", - "from sklearn.datasets import *\n", + "from sklearn.datasets import fetch_california_housing\n", "import pandas as pd\n", "from botocore.exceptions import ClientError\n", "\n", @@ -110,8 +110,8 @@ "s3 = sagemaker_session.boto_session.resource(\"s3\")\n", "role = sagemaker.get_execution_role()\n", "role_name = role.split(\"/\")[-1]\n", - "prefix = \"data/tabular/boston_house\"\n", - "filename = \"boston_house.csv\"" + "prefix = \"data/tabular/california_housing\"\n", + "filename = \"california_housing.csv\"" ] }, { @@ -121,6 +121,15 @@ "### Download data from online resources and write data to S3" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This example uses the California Housing dataset, which was originally published in:\n", + "\n", + "> Pace, R. Kelley, and Ronald Barry. \"Sparse spatial autoregressions.\" Statistics & Probability Letters 33.3 (1997): 291-297." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -147,10 +156,10 @@ "metadata": {}, "outputs": [], "source": [ - "tabular_data = load_boston()\n", + "tabular_data = fetch_california_housing()\n", "tabular_data_full = pd.DataFrame(tabular_data.data, columns=tabular_data.feature_names)\n", "tabular_data_full[\"target\"] = pd.DataFrame(tabular_data.target)\n", - "tabular_data_full.to_csv(\"boston_house.csv\", index=False)\n", + "tabular_data_full.to_csv(\"california_housing.csv\", index=False)\n", "\n", "upload_to_s3(bucket, \"data/tabular\", filename)" ] @@ -186,7 +195,9 @@ " print(\"IAMFullAccessPolicy Already Attached\")\n", "except ClientError as e:\n", " if e.response[\"Error\"][\"Code\"] == \"AccessDenied\":\n", - " print(\"You need to attach the IAMFullAccess policy in order to attach policy to the role.\")\n", + " print(\n", + " \"ERROR: You need to attach the IAMFullAccess policy in order to attach policy to the role.\"\n", + " )\n", " else:\n", " print(\"Unexpected error: %s\" % e)" ] @@ -325,7 +336,7 @@ "outputs": [], "source": [ "# Set Athena database name\n", - "database_name = \"tabularbh\"" + "database_name = \"tabular_california_housing\"" ] }, { @@ -379,7 +390,7 @@ "outputs": [], "source": [ "prefix = \"data/tabular\"\n", - "filename_key = \"boston_house\"" + "filename_key = \"california_housing\"" ] }, { @@ -397,7 +408,7 @@ "metadata": {}, "outputs": [], "source": [ - "table_name_csv = \"boston_house_athena\"" + "table_name_csv = \"california_housing_athena\"" ] }, { @@ -407,21 +418,17 @@ "outputs": [], "source": [ "# SQL statement to execute\n", + "\n", "statement = \"\"\"CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(\n", - " CRIM double,\n", - " ZN double,\n", - " INDUS double,\n", - " CHAS double,\n", - " NOX double,\n", - " RM double,\n", - " AGE double,\n", - " DIS double, \n", - " RAD double, \n", - " TAX double,\n", - " PTRATIO double, \n", - " B double, \n", - " LSTAT double,\n", - " target double\n", + " MedInc double,\n", + " 
HouseAge double,\n", + " AveRooms double,\n", + " AveBedrms double,\n", + " Population double,\n", + " AveOccup double,\n", + " Latitude double,\n", + " Longitude double, \n", + " MedValue double\n", "\n", ") ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\\\n' LOCATION '{}'\n", "TBLPROPERTIES ('skip.header.line.count'='1')\"\"\".format(\n", @@ -546,7 +553,6 @@ "metadata": {}, "source": [ "### Citation\n", - "Boston Housing data, Harrison, D. and Rubinfeld, D.L. `Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.\n", "\n", "Data Science On AWS workshops, Chris Fregly, Antje Barth, https://www.datascienceonaws.com/" ] @@ -568,7 +574,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/ingest_data/03_Ingest_data_with_Redshift_v3.ipynb b/ingest_data/03_Ingest_data_with_Redshift_v3.ipynb index a5a81b5501..9efa17400e 100644 --- a/ingest_data/03_Ingest_data_with_Redshift_v3.ipynb +++ b/ingest_data/03_Ingest_data_with_Redshift_v3.ipynb @@ -82,28 +82,29 @@ "" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset\n", + "\n", + "This example uses the California Housing dataset, which was originally published in:\n", + "\n", + "> Pace, R. Kelley, and Ronald Barry. \"Sparse spatial autoregressions.\" Statistics & Probability Letters 33.3 (1997): 291-297." 
+ ] + }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mWARNING: You are using pip version 20.0.2; however, version 20.2.4 is available.\n", - "You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.\u001b[0m\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "%pip install -qU 'sagemaker>=2.15.0' 'PyAthena==1.10.7' 'awswrangler==1.2.0' 'SQLAlchemy==1.3.13'" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -127,13 +128,13 @@ "sagemaker_session = sagemaker.Session()\n", "bucket = sagemaker_session.default_bucket() # replace with your own bucket name if you have one\n", "role = sagemaker.get_execution_role()\n", - "prefix = \"data/tabular/boston_house\"\n", - "filename = \"boston_house.csv\"" + "prefix = \"data/tabular/california_housing\"\n", + "filename = \"california_housing.csv\"" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -146,17 +147,9 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Your Role name used to create this notebook is: AmazonSageMaker-ExecutionRole-20201006T125078\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "role_name = role.split(\"/\")[-1]\n", "print(\"Your Role name used to create this notebook is: {}\".format(role_name))" @@ -171,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -191,22 +184,14 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Writing to s3://sagemaker-us-east-2-060356833389/data/tabular/boston_house.csv\n" - ] - } - ], - "source": [ - "tabular_data = load_boston()\n", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tabular_data = fetch_california_housing()\n", "tabular_data_full = pd.DataFrame(tabular_data.data, columns=tabular_data.feature_names)\n", "tabular_data_full[\"target\"] = pd.DataFrame(tabular_data.target)\n", - "tabular_data_full.to_csv(\"boston_house.csv\", index=False)\n", + "tabular_data_full.to_csv(\"california_housing.csv\", index=False)\n", "\n", "upload_to_s3(bucket, \"data/tabular\", filename)" ] @@ -221,7 +206,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -239,17 +224,9 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Role already exists\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Create Role\n", "iam_redshift_role_name = \"Tabular_Redshift\"\n", @@ -268,18 +245,9 @@ }, { "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Your Role arn used to create a Redshift Cluster is: arn:aws:iam::060356833389:role/Tabular_Redshift\n", - "arn:aws:iam::060356833389:role/Tabular_Redshift\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# get role arn\n", "role_rs = iam.get_role(RoleName=\"Tabular_Redshift\")\n", @@ -301,7 +269,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -314,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -393,17 +361,9 
@@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Policy already exists\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "try:\n", " policy_redshift_s3 = iam.create_policy(\n", @@ -422,17 +382,9 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Policy already exists\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "try:\n", " policy_redshift_athena = iam.create_policy(\n", @@ -459,7 +411,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -477,7 +429,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -505,17 +457,9 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Policy SecretsManagerReadWrite has been succesfully attached to role: AmazonSageMaker-ExecutionRole-20201006T125078\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# making sure you have secret manager policy attached to role\n", "try:\n", @@ -533,17 +477,9 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Policy AmazonRedshiftFullAccess has been succesfully attached to role: AmazonSageMaker-ExecutionRole-20201006T125078\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# making sure you have RedshiftFullAccess policy attached to role\n", "from botocore.exceptions import ClientError\n", @@ -573,7 +509,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ @@ -588,9 +524,13 @@ "source": [ "### Create Secret in Secrets Manager\n", "\n", + "__Your IAM role will need permission to create a secret and get its value.__ This can be accomplished with the SecretsManagerReadWrite managed policy.\n", + "\n", "AWS Secrets Manager is a service that enables you to easily rotate, manage, and retrieve database credentials, API keys, and other secrets throughout their lifecycle. Using Secrets Manager, you can secure and manage secrets used to access resources in the AWS Cloud, on third-party services, and on-premises.\n", "\n", - "*note that `MasterUserPassword` must contain at least 1 upper case letter and at least 1 decimal digit." + "*note that `MasterUserPassword` must contain at least 1 upper case letter and at least 1 decimal digit.\n", + "\n", + "Ensure that you change the secret password to be unique and secure." ] }, { @@ -604,8 +544,8 @@ "try:\n", " response = secretsmanager.create_secret(\n", " Name=\"tabular_redshift_login\",\n", - " Description=\"Boston House data New Cluster Redshift Login\",\n", - " SecretString='[{\"username\":\"awsuser\"},{\"password\":\"Bostonhouse1\"}]',\n", + " Description=\"California Housing data New Cluster Redshift Login\",\n", + " SecretString='[{\"username\":\"awsuser\"},{\"password\":\"Californiahousing1\"}]',\n", " Tags=[\n", " {\"Key\": \"name\", \"Value\": \"tabular_redshift_login\"},\n", " ],\n", @@ -643,7 +583,7 @@ "# Set up parameters\n", "# Redshift configuration parameters\n", "redshift_cluster_identifier = \"redshiftdemo\"\n", - "database_name = \"bostonhouse\"\n", + "database_name = \"california_housing\"\n", "cluster_type = \"multi-node\"\n", "\n", "node_type = \"dc2.large\"\n", @@ -698,7 +638,7 @@ " NumberOfNodes=int(number_nodes),\n", " MasterUsername=master_user_name,\n", " MasterUserPassword=master_user_pw,\n", - " ClusterSubnetGroupName=\"cluster-subnet-group-1\", # you can either specify an existing subnet group (change this to 
your Subnet Group name), or specify your security group below\n", + " # ClusterSubnetGroupName=\"\", # you can either specify an existing subnet group (change this to your Subnet Group name), or use the security group ID that was retrieved above\n", " IamRoles=[iam_role_redshift_arn],\n", " VpcSecurityGroupIds=[security_group_id],\n", " Port=5439,\n", @@ -790,15 +730,15 @@ "source": [ "redshift_cluster_identifier = \"redshiftdemo\"\n", "\n", - "database_name_redshift = \"bostonhouse\"\n", - "database_name_athena = \"tabularbh\"\n", + "database_name_redshift = \"california_housing\"\n", + "database_name_athena = \"tabular_california_housing\"\n", "\n", "redshift_port = \"5439\"\n", "\n", "schema_redshift = \"redshift\"\n", "schema_spectrum = \"spectrum\"\n", "\n", - "table_name_csv = \"boston_house_athena\"" + "table_name_csv = \"california_housing_athena\"" ] }, { @@ -817,7 +757,7 @@ "# check cluster status\n", "response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)\n", "cluster_status = response[\"Clusters\"][0][\"ClusterStatus\"]\n", - "print(cluster_status)" + "print(\"Cluster status is:\", cluster_status)" ] }, { @@ -972,20 +912,15 @@ "statement = \"\"\"\n", "rollback;\n", "create table if not exists redshift.{}(\n", - " CRIM float,\n", - " ZN float,\n", - " INDUS float,\n", - " CHAS float,\n", - " NOX float,\n", - " RM float,\n", - " AGE float,\n", - " DIS float, \n", - " RAD float, \n", - " TAX float,\n", - " PTRATIO float, \n", - " B float, \n", - " LSTAT float,\n", - " target float)\"\"\".format(\n", + " MedInc float,\n", + " HouseAge float,\n", + " AveRooms float,\n", + " AveBedrms float,\n", + " Population float,\n", + " AveOccup float,\n", + " Latitude float,\n", + " Longitude float, \n", + " MedValue float)\"\"\".format(\n", " table_name_redshift\n", ")\n", "\n", @@ -1063,20 +998,15 @@ "statement = \"\"\"\n", "rollback;\n", "create table if not exists redshift.{}(\n", - " CRIM float,\n", - " ZN float,\n", - " INDUS 
float,\n", - " CHAS float,\n", - " NOX float,\n", - " RM float,\n", - " AGE float,\n", - " DIS float, \n", - " RAD float, \n", - " TAX float,\n", - " PTRATIO float, \n", - " B float, \n", - " LSTAT float,\n", - " target float)\"\"\".format(\n", + " MedInc float,\n", + " HouseAge float,\n", + " AveRooms float,\n", + " AveBedrms float,\n", + " Population float,\n", + " AveOccup float,\n", + " Latitude float,\n", + " Longitude float, \n", + " MedValue float)\"\"\".format(\n", " table_name_redshift\n", ")\n", "\n", @@ -1099,7 +1029,7 @@ "outputs": [], "source": [ "table_name_redshift = table_name_csv + \"_\" + \"redshift_copy\"\n", - "data_s3_path = \"s3://sagemaker-us-east-2-060356833389/data/tabular/boston_house/boston_house.csv\"\n", + "data_s3_path = \"s3://{}/data/tabular/california_housing/california_housing.csv\".format(bucket)\n", "statement = \"\"\"\n", "rollback;\n", "copy redshift.{} \n", @@ -1174,6 +1104,18 @@ "https://aws-data-wrangler.readthedocs.io/en/latest/stubs/awswrangler.db.get_engine.html#awswrangler.db.get_engine" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "private_ip = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)[\"Clusters\"][\n", + " 0\n", + "][\"ClusterNodes\"][0][\"PrivateIPAddress\"]\n", + "print(\"Private IP address is: \", private_ip)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -1182,7 +1124,7 @@ "source": [ "engine = wr.db.get_engine(\n", " db_type=\"postgresql\",\n", - " host=\"10.0.14.121\", # Private IP address of your Redshift Cluster\n", + " host=private_ip, # Private IP address of your Redshift Cluster\n", " port=redshift_port,\n", " database=database_name_redshift,\n", " user=master_user_name,\n", @@ -1213,7 +1155,6 @@ "metadata": {}, "source": [ "### Citation\n", - "Boston Housing data, Harrison, D. and Rubinfeld, D.L. `Hedonic prices and the demand for clean air', J. Environ. 
Economics & Management, vol.5, 81-102, 1978.\n", "\n", "Data Science On AWS workshops, Chris Fregly, Antje Barth, https://www.datascienceonaws.com/" ] @@ -1221,9 +1162,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "conda_python3", "language": "python", - "name": "python3" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { @@ -1235,7 +1176,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/ingest_data/index.rst b/ingest_data/index.rst index cb08f40cba..28092253d9 100644 --- a/ingest_data/index.rst +++ b/ingest_data/index.rst @@ -30,7 +30,7 @@ Athena You can use Amazon Athena as a data source for SageMaker. Athena is a serverless interactive query service that makes it easy to analyze your S3 data with standard SQL. -This example runs the Boston housing dataset and uses PyAthena, a Python client for Athena, and `awswrangler`, a Pandas-like interface to many AWS data platforms. +This example runs the California housing dataset and uses PyAthena, a Python client for Athena, and `awswrangler`, a Pandas-like interface to many AWS data platforms. .. toctree:: :maxdepth: 1 @@ -43,7 +43,7 @@ EMR You can use Amazon EMR as a data source for SageMaker. While EMR supports is used for processing large amounts of data from a variety of sources, SageMaker-EMR examples focus on Apache Spark. -This example runs the Boston housing dataset. +This example runs the California housing dataset. .. toctree:: :maxdepth: 1 @@ -56,7 +56,7 @@ Redshift You can use Amazon Redshift as a data source for SageMaker. Redshift is a fully managed data warehouse that allows you to run complex analytic queries against petabytes of structured data. -This example runs the Boston housing dataset and uses `awswrangler`, a Pandas-like interface to many AWS data platforms. 
+This example runs the California housing dataset and uses `awswrangler`, a Pandas-like interface to many AWS data platforms. .. toctree:: diff --git a/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-lst-format-highlevel.ipynb b/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-lst-format-highlevel.ipynb index 054180e53c..9bb4f6e966 100644 --- a/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-lst-format-highlevel.ipynb +++ b/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-lst-format-highlevel.ipynb @@ -7,7 +7,7 @@ "# Image classification training with image format demo\n", "\n", "1. [Introduction](#Introduction)\n", - "2. [Prerequisites and Preprocessing](#Prequisites-and-Preprocessing)\n", + "2. [Prerequisites and Preprocessing](#Prerequisites-and-Preprocessing)\n", " 1. [Permissions and environment variables](#Permissions-and-environment-variables)\n", " 2. [Prepare the data](#Prepare-the-data)\n", "3. [Fine-tuning The Image Classification Model](#Fine-tuning-the-Image-classification-model)\n", @@ -22,7 +22,7 @@ "source": [ "## Introduction\n", "\n", - "Welcome to our end-to-end example of the image classification algorithm training with image format. In this demo, we will use the Amazon sagemaker image classification algorithm in transfer learning mode to fine-tune a pre-trained model (trained on imagenet data) to learn to classify a new dataset. In particular, the pre-trained model will be fine-tuned using [caltech-256 dataset](http://www.vision.caltech.edu/Image_Datasets/Caltech256/). \n", + "Welcome to our end-to-end example of the image classification algorithm training with image format. In this demo, we will use the Amazon SageMaker image classification algorithm in transfer learning mode to fine-tune a pre-trained model (trained on ImageNet data) to learn to classify a new dataset. 
In particular, the pre-trained model will be fine-tuned using the [Caltech-256 dataset](http://www.vision.caltech.edu/Image_Datasets/Caltech256/). \n", "\n", "To get started, we need to set up the environment with a few prerequisite steps, for permissions, configurations, and so on." ] @@ -31,7 +31,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Prequisites and Preprocessing\n", + "## Prerequisites and Preprocessing\n", "\n", "### Permissions and environment variables\n", "\n", @@ -39,7 +39,7 @@ "\n", "* The roles used to give learning and hosting access to your data. This will automatically be obtained from the role used to start the notebook\n", "* The S3 bucket that you want to use for training and model data\n", - "* The Amazon sagemaker image classification docker image which need not be changed" + "* The Amazon SageMaker image classification docker image which need not be changed" ] }, { @@ -66,9 +66,9 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.amazon.amazon_estimator import get_image_uri\n", + "from sagemaker import image_uris\n", "\n", - "training_image = get_image_uri(sess.boto_region_name, \"image-classification\", repo_version=\"latest\")\n", + "training_image = image_uris.retrieve(region=sess.boto_region_name, framework=\"image-classification\")\n", "print(training_image)" ] }, @@ -78,9 +78,9 @@ "source": [ "\n", "### Prepare the data\n", - "The caltech 256 dataset consist of images from 257 categories (the last one being a clutter category) and has 30k images with a minimum of 80 images and a maximum of about 800 images per category. \n", + "The Caltech-256 dataset consists of images from 257 categories (the last one being a clutter category) and has 30k images with a minimum of 80 images and a maximum of about 800 images per category. \n", "\n", - "The image classification algorithm can take two types of input formats.
The first is a [RecordIO format](https://mxnet.incubator.apache.org/tutorials/basic/record_io.html) (content type: application/x-recordio) and the other is a [lst format](https://mxnet.incubator.apache.org/how_to/recordio.html?highlight=im2rec) (content type: application/x-image). Files for both these formats are available at http://data.dmlc.ml/mxnet/data/caltech-256/. In this example, we will use the lst format for training and use the training/validation split [specified here](http://data.dmlc.ml/mxnet/data/caltech-256/)." + "The image classification algorithm can take two types of input formats. The first is a [RecordIO format](https://mxnet.incubator.apache.org/tutorials/basic/record_io.html) (content type: application/x-recordio) and the other is a [lst format](https://mxnet.incubator.apache.org/how_to/recordio.html?highlight=im2rec) (content type: application/x-image). Files for both these formats are available at http://data.dmlc.ml/mxnet/data/caltech-256/. In this example, we will use the lst format for training and use the training/validation split [specified here](http://data.dmlc.ml/mxnet/data/caltech-256/)." ] }, { @@ -89,6 +89,7 @@ "metadata": {}, "outputs": [], "source": [ + "import boto3\n", "import os\n", "import urllib.request\n", "\n", @@ -100,7 +101,12 @@ "\n", "\n", "# Caltech-256 image files\n", - "download(\"http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar\")\n", + "s3 = boto3.client(\"s3\")\n", + "s3.download_file(\n", + " \"sagemaker-sample-files\",\n", + " \"datasets/image/caltech-256/256_ObjectCategories.tar\",\n", + " \"256_ObjectCategories.tar\",\n", + ")\n", "!tar -xf 256_ObjectCategories.tar\n", "\n", "# Tool for creating lst file\n", @@ -179,7 +185,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we have all the data stored in S3 bucket. The image and lst files will be converted to RecordIO file internelly by the image classification algorithm.
But if you want do the conversion, the following cell shows how to do it using the [im2rec](https://github.com/apache/incubator-mxnet/blob/master/tools/im2rec.py) tool. Note that this is just an example of creating RecordIO files. We are **_not_** using them for training in this notebook. More details on creating RecordIO files can be found in this [tutorial](https://mxnet.incubator.apache.org/how_to/recordio.html?highlight=im2rec)." + "Now we have all the data stored in S3 bucket. The image and lst files will be converted to RecordIO file internally by the image classification algorithm. But if you want to do the conversion, the following cell shows how to do it using the [im2rec](https://github.com/apache/incubator-mxnet/blob/master/tools/im2rec.py) tool. Note that this is just an example of creating RecordIO files. We are **_not_** using them for training in this notebook. More details on creating RecordIO files can be found in this [tutorial](https://mxnet.incubator.apache.org/how_to/recordio.html?highlight=im2rec)." ] }, { @@ -204,7 +210,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Before training the model, we need to setup the training parameters. The next section will explain the parameters in detail." + "Before training the model, we need to set up the training parameters. The next section will explain the parameters in detail." ] }, { @@ -231,13 +237,14 @@ "ic = sagemaker.estimator.Estimator(\n", " training_image,\n", " role,\n", - " train_instance_count=1,\n", - " train_instance_type=\"ml.p2.xlarge\",\n", - " train_volume_size=50,\n", - " train_max_run=360000,\n", + " instance_count=1,\n", + " instance_type=\"ml.p2.xlarge\",\n", + " volume_size=50,\n", + " max_run=360000,\n", " input_mode=\"File\",\n", " output_path=s3_output_location,\n", " sagemaker_session=sess,\n", + " num_classes=257,\n", ")" ] }, @@ -247,11 +254,11 @@ "source": [ "Apart from the above set of parameters, there are hyperparameters that are specific to the algorithm. 
These are:\n", "\n", - "* **num_layers**: The number of layers (depth) for the network. We use 18 in this samples but other values such as 50, 152 can be used.\n", + "* **num_layers**: The number of layers (depth) for the network. We use 18 in this sample but other values such as 50, 152 can be used.\n", "* **use_pretrained_model**: Set to 1 to use pretrained model for transfer learning.\n", "* **image_shape**: The input image dimensions,'num_channels, height, width', for the network. It should be no larger than the actual image size. The number of channels should be same as the actual image.\n", - "* **num_classes**: This is the number of output classes for the new dataset. Imagenet was trained with 1000 output classes but the number of output classes can be changed for fine-tuning. For caltech, we use 257 because it has 256 object categories + 1 clutter class.\n", - "* **num_training_samples**: This is the total number of training samples. It is set to 15240 for caltech dataset with the current split.\n", + "* **num_classes**: This is the number of output classes for the new dataset. ImageNet was trained with 1000 output classes but the number of output classes can be changed for fine-tuning. For Caltech, we use 257 because it has 256 object categories + 1 clutter class.\n", + "* **num_training_samples**: This is the total number of training samples. It is set to 15240 for the Caltech dataset with the current split.\n", "* **mini_batch_size**: The number of training samples used for each mini batch. 
In distributed training, the number of training samples used per batch will be N * mini_batch_size where N is the number of hosts on which training is run.\n", "* **epochs**: Number of training epochs.\n", "* **learning_rate**: Learning rate for training.\n", @@ -297,28 +304,28 @@ "metadata": {}, "outputs": [], "source": [ - "train_data = sagemaker.session.s3_input(\n", + "train_data = sagemaker.inputs.TrainingInput(\n", " s3train,\n", " distribution=\"FullyReplicated\",\n", - " content_type=\"application/x-image\",\n", + " content_type=\"application/jpeg\",\n", " s3_data_type=\"S3Prefix\",\n", ")\n", - "validation_data = sagemaker.session.s3_input(\n", + "validation_data = sagemaker.inputs.TrainingInput(\n", " s3validation,\n", " distribution=\"FullyReplicated\",\n", - " content_type=\"application/x-image\",\n", + " content_type=\"application/jpeg\",\n", " s3_data_type=\"S3Prefix\",\n", ")\n", - "train_data_lst = sagemaker.session.s3_input(\n", + "train_data_lst = sagemaker.inputs.TrainingInput(\n", " s3train_lst,\n", " distribution=\"FullyReplicated\",\n", - " content_type=\"application/x-image\",\n", + " content_type=\"application/jpeg\",\n", " s3_data_type=\"S3Prefix\",\n", ")\n", - "validation_data_lst = sagemaker.session.s3_input(\n", + "validation_data_lst = sagemaker.inputs.TrainingInput(\n", " s3validation_lst,\n", " distribution=\"FullyReplicated\",\n", - " content_type=\"application/x-image\",\n", + " content_type=\"application/jpeg\",\n", " s3_data_type=\"S3Prefix\",\n", ")\n", "\n", @@ -380,8 +387,13 @@ "metadata": {}, "outputs": [], "source": [ - "!wget -O /tmp/test.jpg http://www.vision.caltech.edu/Image_Datasets/Caltech256/images/008.bathtub/008_0007.jpg\n", "file_name = \"/tmp/test.jpg\"\n", + "s3.download_file(\n", + " \"sagemaker-sample-files\",\n", + " \"datasets/image/caltech-256/256_ObjectCategories/008.bathtub/008_0007.jpg\",\n", + " file_name,\n", + ")\n", + "\n", "# test image\n", "from IPython.display import Image\n", "\n", @@ -396,16 
+408,17 @@ "source": [ "import json\n", "import numpy as np\n", + "from sagemaker.serializers import IdentitySerializer\n", "\n", "with open(file_name, \"rb\") as f:\n", " payload = f.read()\n", - " payload = bytearray(payload)\n", "\n", - "ic_classifier.content_type = \"application/x-image\"\n", + "ic_classifier.serializer = IdentitySerializer(\"image/jpeg\")\n", "result = json.loads(ic_classifier.predict(payload))\n", "# the result will output the probabilities for all classes\n", "# find the class with maximum probability and print the class index\n", "index = np.argmax(result)\n", + "\n", "object_categories = [\n", " \"ak47\",\n", " \"american-flag\",\n", @@ -703,7 +716,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.13" }, "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." }, diff --git a/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-lst-format.ipynb b/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-lst-format.ipynb index 9f6f1633d6..c7fda2a025 100644 --- a/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-lst-format.ipynb +++ b/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-lst-format.ipynb @@ -7,7 +7,7 @@ "# Image classification training with image format\n", "\n", "1. [Introduction](#Introduction)\n", - "2. 
[Prerequisites and Preprocessing](#Prequisites-and-Preprocessing)\n", + "2. [Prerequisites and Preprocessing](#Prerequisites-and-Preprocessing)\n", " 1. [Permissions and environment variables](#Permissions-and-environment-variables)\n", " 2. [Prepare the data](#Prepare-the-data)\n", "3. [Fine-tuning The Image Classification Model](#Fine-tuning-the-Image-classification-model)\n", @@ -29,7 +29,7 @@ "source": [ "## Introduction\n", "\n", - "Welcome to our end-to-end example of the image classification algorithm training with image format. In this demo, we will use the Amazon sagemaker image classification algorithm in transfer learning mode to fine-tune a pre-trained model (trained on imagenet data) to learn to classify a new dataset. In particular, the pre-trained model will be fine-tuned using [caltech-256 dataset](http://www.vision.caltech.edu/Image_Datasets/Caltech256/). \n", + "Welcome to our end-to-end example of the image classification algorithm training with image format. In this demo, we will use the Amazon SageMaker image classification algorithm in transfer learning mode to fine-tune a pre-trained model (trained on ImageNet data) to learn to classify a new dataset. In particular, the pre-trained model will be fine-tuned using the [Caltech-256 dataset](http://www.vision.caltech.edu/Image_Datasets/Caltech256/). \n", "\n", "To get started, we need to set up the environment with a few prerequisite steps, for permissions, configurations, and so on." ] @@ -38,7 +38,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Prequisites and Preprocessing\n", + "## Prerequisites and Preprocessing\n", "\n", "### Permissions and environment variables\n", "\n", @@ -46,7 +46,7 @@ "\n", "* The roles used to give learning and hosting access to your data. 
This will automatically be obtained from the role used to start the notebook\n", "* The S3 bucket that you want to use for training and model data\n", - "* The Amazon sagemaker image classification docker image which need not be changed" + "* The Amazon SageMaker image classification docker image which need not be changed" ] }, { @@ -62,14 +62,17 @@ "source": [ "%%time\n", "import boto3\n", + "import sagemaker\n", "from sagemaker import get_execution_role\n", - "from sagemaker.amazon.amazon_estimator import get_image_uri\n", + "from sagemaker import image_uris\n", "\n", "role = get_execution_role()\n", "\n", - "bucket = \"<>\" # customize to your bucket\n", + "bucket = sagemaker.session.Session().default_bucket()\n", "\n", - "training_image = get_image_uri(boto3.Session().region_name, \"image-classification\")" + "training_image = image_uris.retrieve(\n", + " region=boto3.Session().region_name, framework=\"image-classification\"\n", + ")" ] }, { @@ -79,7 +82,7 @@ "## Fine-tuning the Image classification model\n", "\n", "### Prepare the data\n", - "The caltech 256 dataset consist of images from 257 categories (the last one being a clutter category) and has 30k images with a minimum of 80 images and a maximum of about 800 images per category. \n", + "The Caltech-256 dataset consists of images from 257 categories (the last one being a clutter category) and has 30k images with a minimum of 80 images and a maximum of about 800 images per category. \n", "\n", "The image classification algorithm can take two types of input formats. The first is a [RecordIO format](https://mxnet.incubator.apache.org/tutorials/basic/record_io.html) (content type: application/x-recordio) and the other is a [lst format](https://mxnet.incubator.apache.org/how_to/recordio.html?highlight=im2rec) (content type: application/x-image). Files for both these formats are available at http://data.dmlc.ml/mxnet/data/caltech-256/.
In this example, we will use the lst format for training and use the training/validation split [specified here](http://data.dmlc.ml/mxnet/data/caltech-256/)." ] @@ -103,7 +106,12 @@ "\n", "\n", "# Caltech-256 image files\n", - "download(\"http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar\")\n", + "s3 = boto3.client(\"s3\")\n", + "s3.download_file(\n", + " \"sagemaker-sample-files\",\n", + " \"datasets/image/caltech-256/256_ObjectCategories.tar\",\n", + " \"256_ObjectCategories.tar\",\n", + ")\n", "!tar -xf 256_ObjectCategories.tar\n", "\n", "# Tool for creating lst file\n", @@ -188,7 +196,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we have all the data stored in S3 bucket. The image and lst files will be converted to RecordIO file internelly by the image classification algorithm. But if you want do the conversion, the following cell shows how to do it using the [im2rec](https://github.com/apache/incubator-mxnet/blob/master/tools/im2rec.py) tool. Note that this is just an example of creating RecordIO files. We are **_not_** using them for training in this notebook. More details on creating RecordIO files can be found in this [tutorial](https://mxnet.incubator.apache.org/how_to/recordio.html?highlight=im2rec)." + "Now we have all the data stored in S3 bucket. The image and lst files will be converted to RecordIO file internally by the image classification algorithm. But if you want to do the conversion, the following cell shows how to do it using the [im2rec](https://github.com/apache/incubator-mxnet/blob/master/tools/im2rec.py) tool. Note that this is just an example of creating RecordIO files. We are **_not_** using them for training in this notebook. More details on creating RecordIO files can be found in this [tutorial](https://mxnet.incubator.apache.org/how_to/recordio.html?highlight=im2rec)."
] }, { @@ -215,7 +223,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Before training the model, we need to setup the training parameters. The next section will explain the parameters in detail." + "Before training the model, we need to set up the training parameters. The next section will explain the parameters in detail." ] }, { @@ -233,10 +241,10 @@ "\n", "Apart from the above set of parameters, there are hyperparameters that are specific to the algorithm. These are:\n", "\n", - "* **num_layers**: The number of layers (depth) for the network. We use 18 in this samples but other values such as 50, 152 can be used.\n", + "* **num_layers**: The number of layers (depth) for the network. We use 18 in this sample but other values such as 50, 152 can be used.\n", "* **image_shape**: The input image dimensions,'num_channels, height, width', for the network. It should be no larger than the actual image size. The number of channels should be same as the actual image.\n", - "* **num_training_samples**: This is the total number of training samples. It is set to 15240 for caltech dataset with the current split.\n", - "* **num_classes**: This is the number of output classes for the new dataset. Imagenet was trained with 1000 output classes but the number of output classes can be changed for fine-tuning. For caltech, we use 257 because it has 256 object categories + 1 clutter class.\n", + "* **num_training_samples**: This is the total number of training samples. It is set to 15240 for the Caltech dataset with the current split.\n", + "* **num_classes**: This is the number of output classes for the new dataset. ImageNet was trained with 1000 output classes but the number of output classes can be changed for fine-tuning. For Caltech, we use 257 because it has 256 object categories + 1 clutter class.\n", "* **mini_batch_size**: The number of training samples used for each mini batch. 
In distributed training, the number of training samples used per batch will be N * mini_batch_size where N is the number of hosts on which training is run.\n", "* **epochs**: Number of training epochs.\n", "* **learning_rate**: Learning rate for training.\n", @@ -250,7 +258,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, "isConfigCell": true }, "outputs": [], @@ -286,7 +293,7 @@ "metadata": {}, "source": [ "### Training\n", - "Run the training using Amazon sagemaker CreateTrainingJob API" + "Run the training using Amazon SageMaker CreateTrainingJob API" ] }, { @@ -492,7 +499,9 @@ "model_data = info[\"ModelArtifacts\"][\"S3ModelArtifacts\"]\n", "print(model_data)\n", "\n", - "hosting_image = get_image_uri(boto3.Session().region_name, \"image-classification\")\n", + "hosting_image = image_uris.retrieve(\n", + " region=boto3.Session().region_name, framework=\"image-classification\"\n", + ")\n", "\n", "primary_container = {\n", " \"Image\": hosting_image,\n", @@ -1045,8 +1054,13 @@ }, "outputs": [], "source": [ - "!wget -O /tmp/test.jpg http://www.vision.caltech.edu/Image_Datasets/Caltech256/images/008.bathtub/008_0007.jpg\n", "file_name = \"/tmp/test.jpg\"\n", + "s3.download_file(\n", + " \"sagemaker-sample-files\",\n", + " \"datasets/image/caltech-256/256_ObjectCategories/008.bathtub/008_0007.jpg\",\n", + " file_name,\n", + ")\n", + "\n", "# test image\n", "from IPython.display import Image\n", "\n", @@ -1076,6 +1090,7 @@ "# the result will output the probabilities for all classes\n", "# find the class with maximum probability and print the class index\n", "index = np.argmax(result)\n", + "\n", "object_categories = [\n", " \"ak47\",\n", " \"american-flag\",\n", @@ -1376,7 +1391,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.13" }, "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." }, diff --git a/introduction_to_amazon_algorithms/object2vec_movie_recommendation/object2vec_movie_recommendation.ipynb b/introduction_to_amazon_algorithms/object2vec_movie_recommendation/object2vec_movie_recommendation.ipynb index 565ebabfae..14ea46ac15 100644 --- a/introduction_to_amazon_algorithms/object2vec_movie_recommendation/object2vec_movie_recommendation.ipynb +++ b/introduction_to_amazon_algorithms/object2vec_movie_recommendation/object2vec_movie_recommendation.ipynb @@ -636,8 +636,9 @@ "source": [ "import boto3\n", "import os\n", + "import sagemaker\n", "\n", - "bucket = \"\" # Customize your own bucket name\n", + "bucket = sagemaker.session.Session().default_bucket()\n", "input_prefix = \"object2vec/movielens/input\"\n", "output_prefix = \"object2vec/movielens/output\"" ] @@ -655,7 +656,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.session import s3_input\n", + "from sagemaker.inputs import TrainingInput\n", "\n", "s3_client = boto3.client(\"s3\")\n", "input_paths = {}\n", @@ -666,7 +667,7 @@ " fname = \"{}_r.jsonl\".format(data_name)\n", " data_path = os.path.join(\"s3://\", bucket, pre_key, fname)\n", " s3_client.upload_file(fname, bucket, os.path.join(pre_key, fname))\n", - " input_paths[data_name] = s3_input(\n", + " input_paths[data_name] = TrainingInput(\n", " data_path, distribution=\"ShardedByS3Key\", content_type=\"application/jsonlines\"\n", " )\n", " print(\"Uploaded {} data to {} and defined input path\".format(data_name, data_path))\n", @@ -696,9 +697,9 @@ 
"print(role)\n", "\n", "## Get docker image of ObjectToVec algorithm\n", - "from sagemaker.amazon.amazon_estimator import get_image_uri\n", + "from sagemaker import image_uris\n", "\n", - "container = get_image_uri(boto3.Session().region_name, \"object2vec\")" + "container = image_uris.retrieve(region=boto3.Session().region_name, framework=\"object2vec\")" ] }, { @@ -762,8 +763,8 @@ "regressor = sagemaker.estimator.Estimator(\n", " container,\n", " role,\n", - " train_instance_count=1,\n", - " train_instance_type=\"ml.p2.xlarge\",\n", + " instance_count=1,\n", + " instance_type=\"ml.p2.xlarge\",\n", " output_path=output_path,\n", " sagemaker_session=sess,\n", ")\n", @@ -806,12 +807,11 @@ "outputs": [], "source": [ "# import numpy as np\n", - "from sagemaker.predictor import json_serializer, json_deserializer\n", + "from sagemaker.serializers import JSONSerializer\n", + "from sagemaker.deserializers import JSONDeserializer\n", "\n", "# create a model using the trained algorithm\n", - "regression_model = regressor.create_model(\n", - " serializer=json_serializer, deserializer=json_deserializer, content_type=\"application/json\"\n", - ")" + "regression_model = regressor.create_model()" ] }, { @@ -821,7 +821,13 @@ "outputs": [], "source": [ "# deploy the model\n", - "predictor = regression_model.deploy(initial_instance_count=1, instance_type=\"ml.m4.xlarge\")" + "predictor = regression_model.deploy(\n", + " serializer=JSONSerializer(),\n", + " deserializer=JSONDeserializer(),\n", + " content_type=\"application/json\",\n", + " initial_instance_count=1,\n", + " instance_type=\"ml.m4.xlarge\",\n", + ")" ] }, { @@ -886,7 +892,7 @@ " pre_key = os.path.join(input_prefix, \"recommendation\", f\"{data_name}\")\n", " data_path = os.path.join(\"s3://\", bucket, pre_key, fname)\n", " s3_client.upload_file(fname, bucket, os.path.join(pre_key, fname))\n", - " input_paths[data_name] = s3_input(\n", + " input_paths[data_name] = TrainingInput(\n", " data_path, 
distribution=\"ShardedByS3Key\", content_type=\"application/jsonlines\"\n", " )\n", " print(\"Uploaded data to {}\".format(data_path))" @@ -949,8 +955,8 @@ "classifier = sagemaker.estimator.Estimator(\n", " container,\n", " role,\n", - " train_instance_count=1,\n", - " train_instance_type=\"ml.p2.xlarge\",\n", + " instance_count=1,\n", + " instance_type=\"ml.p2.xlarge\",\n", " output_path=output_path,\n", " sagemaker_session=sess,\n", ")\n", @@ -975,11 +981,15 @@ "metadata": {}, "outputs": [], "source": [ - "classification_model = classifier.create_model(\n", - " serializer=json_serializer, deserializer=json_deserializer, content_type=\"application/json\"\n", - ")\n", + "classification_model = classifier.create_model()\n", "\n", - "predictor_2 = classification_model.deploy(initial_instance_count=1, instance_type=\"ml.m4.xlarge\")" + "predictor_2 = classification_model.deploy(\n", + " serializer=JSONSerializer(),\n", + " deserializer=JSONDeserializer(),\n", + " content_type=\"application/json\",\n", + " initial_instance_count=1,\n", + " instance_type=\"ml.m4.xlarge\",\n", + ")" ] }, { @@ -1127,7 +1137,7 @@ }, "outputs": [], "source": [ - "movie_id_to_examine = \"\" # Customize the movie ID you want to examine" + "movie_id_to_examine = 195 # Customize the movie ID you want to examine" ] }, { @@ -1161,8 +1171,8 @@ "outputs": [], "source": [ "## clean up\n", - "sess.delete_endpoint(predictor.endpoint)\n", - "sess.delete_endpoint(predictor_2.endpoint)" + "predictor.delete_endpoint()\n", + "predictor_2.delete_endpoint()" ] } ], @@ -1183,7 +1193,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.6" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/introduction_to_applying_machine_learning/README.md b/introduction_to_applying_machine_learning/README.md index 0f7eea5e82..e39c21994d 100644 --- a/introduction_to_applying_machine_learning/README.md +++ b/introduction_to_applying_machine_learning/README.md @@ -4,7 +4,6 @@ 
These examples provide a gentle introduction to machine learning concepts as they are applied in practical use cases across a variety of sectors. -- [Targeted Direct Marketing](xgboost_direct_marketing) predicts potential customers that are most likely to convert based on customer and aggregate level metrics, using Amazon SageMaker's implementation of [XGBoost](https://github.com/dmlc/xgboost). - [Predicting Customer Churn](xgboost_customer_churn) uses customer interaction and service usage data to find those most likely to churn, and then walks through the cost/benefit trade-offs of providing retention incentives. This uses Amazon SageMaker's implementation of [XGBoost](https://github.com/dmlc/xgboost) to create a highly predictive model. - [Time-series Forecasting](linear_time_series_forecast) generates a forecast for topline product demand using Amazon SageMaker's Linear Learner algorithm. - [Cancer Prediction](breast_cancer_prediction) predicts Breast Cancer based on features derived from images, using SageMaker's Linear Learner. 
diff --git a/introduction_to_applying_machine_learning/ntm_20newsgroups_topic_modeling/ntm_20newsgroups_topic_model.ipynb b/introduction_to_applying_machine_learning/ntm_20newsgroups_topic_modeling/ntm_20newsgroups_topic_model.ipynb index ae6056e57a..50c0b83b59 100644 --- a/introduction_to_applying_machine_learning/ntm_20newsgroups_topic_modeling/ntm_20newsgroups_topic_model.ipynb +++ b/introduction_to_applying_machine_learning/ntm_20newsgroups_topic_modeling/ntm_20newsgroups_topic_model.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "compound-allen", + "id": "000c40b8", "metadata": { "papermill": { "duration": 0.038556, @@ -27,7 +27,7 @@ }, { "cell_type": "markdown", - "id": "large-aviation", + "id": "8f02a2ca", "metadata": { "papermill": { "duration": 0.072845, @@ -84,7 +84,27 @@ }, { "cell_type": "markdown", - "id": "ranging-failure", + "id": "5833a4a3", + "metadata": {}, + "source": [ + "# Install Python packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb10078e", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "!{sys.executable} -m pip install \"scikit_learn==0.20.0\" \"nltk==3.4.4\"" + ] + }, + { + "cell_type": "markdown", + "id": "1ad81a71", "metadata": { "papermill": { "duration": 0.037545, @@ -108,7 +128,7 @@ }, { "cell_type": "markdown", - "id": "prepared-coupon", + "id": "da015914", "metadata": { "papermill": { "duration": 0.037456, @@ -127,8 +147,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "public-seller", + "execution_count": null, + "id": "8207a187", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:05:07.806750Z", @@ -150,220 +170,96 @@ "import os\n", "import shutil\n", "\n", - "data_dir = \"20_newsgroups\"\n", + "data_dir = \"20_newsgroups_bulk\"\n", "if os.path.exists(data_dir): # cleanup existing data folder\n", " shutil.rmtree(data_dir)" ] }, { "cell_type": "markdown", - "id": "average-squad", - "metadata": { - "papermill": { - 
"duration": 0.037312, - "end_time": "2021-06-08T21:05:07.883506", - "exception": false, - "start_time": "2021-06-08T21:05:07.846194", - "status": "completed" - }, - "tags": [] - }, + "id": "85509162", + "metadata": {}, "source": [ - "Now we can download the data. *Please review the following Acknowledgements, Copyright Information, and Availability notice before downloading the data.*" + "Now we can download the data. We download the [`20 newsgroups dataset`](http://qwone.com/~jason/20Newsgroups/). The `20 newsgroups dataset` consists of 20000 messages taken from 20 Usenet newsgroups." ] }, { "cell_type": "code", - "execution_count": 8, - "id": "through-specialist", - "metadata": { - "execution": { - "iopub.execute_input": "2021-06-08T21:05:07.962889Z", - "iopub.status.busy": "2021-06-08T21:05:07.962309Z", - "iopub.status.idle": "2021-06-08T21:05:09.030495Z", - "shell.execute_reply": "2021-06-08T21:05:09.030946Z" - }, - "papermill": { - "duration": 1.11015, - "end_time": "2021-06-08T21:05:09.031100", - "exception": false, - "start_time": "2021-06-08T21:05:07.920950", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2021-06-08 23:14:05-- https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/20_newsgroups.tar.gz\n", - "Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252\n", - "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 17332201 (17M) [application/x-httpd-php]\n", - "Saving to: ‘20_newsgroups.tar.gz.1’\n", - "\n", - "20_newsgroups.tar.g 100%[===================>] 16.53M 29.2MB/s in 0.6s \n", - "\n", - "2021-06-08 23:14:06 (29.2 MB/s) - ‘20_newsgroups.tar.gz.1’ saved [17332201/17332201]\n", - "\n" - ] - } - ], + "execution_count": null, + "id": "3582587d", + "metadata": {}, + "outputs": [], "source": [ - "# **Acknowledgements, Copyright Information, and Availability**\n", - "# You may use this material free of charge for any educational purpose,\n", - "# provided attribution is given in any lectures or publications that make use of this material.\n", - "#\n", - "# Source: https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/20newsgroups.data.html\n", - "\n", - "!wget https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/20_newsgroups.tar.gz" + "!aws s3 cp s3://sagemaker-sample-files/datasets/text/20_newsgroups/20_newsgroups_bulk.tar.gz ." ] }, { - "cell_type": "markdown", - "id": "mineral-richardson", - "metadata": { - "papermill": { - "duration": 0.038722, - "end_time": "2021-06-08T21:05:09.108277", - "exception": false, - "start_time": "2021-06-08T21:05:09.069555", - "status": "completed" - }, - "tags": [] - }, + "cell_type": "code", + "execution_count": null, + "id": "ae91cb76", + "metadata": {}, + "outputs": [], "source": [ - "In the next 2 cells, we unpack the data set and extract a list of the files" + "!tar xzf 20_newsgroups_bulk.tar.gz\n", + "!ls 20_newsgroups_bulk" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "employed-birmingham", - "metadata": { - "execution": { - "iopub.execute_input": "2021-06-08T21:05:09.189173Z", - "iopub.status.busy": "2021-06-08T21:05:09.188622Z", - "iopub.status.idle": "2021-06-08T21:05:09.964498Z", - "shell.execute_reply": "2021-06-08T21:05:09.963864Z" - }, - "papermill": { - "duration": 0.818146, - "end_time": "2021-06-08T21:05:09.964625", - "exception": false, - 
"start_time": "2021-06-08T21:05:09.146479", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "alt.atheism\t\t rec.autos\t sci.space\n", - "comp.graphics\t\t rec.motorcycles soc.religion.christian\n", - "comp.os.ms-windows.misc rec.sport.baseball talk.politics.guns\n", - "comp.sys.ibm.pc.hardware rec.sport.hockey talk.politics.mideast\n", - "comp.sys.mac.hardware\t sci.crypt\t talk.politics.misc\n", - "comp.windows.x\t\t sci.electronics talk.religion.misc\n", - "misc.forsale\t\t sci.med\n" - ] - } - ], + "execution_count": null, + "id": "dcec2989", + "metadata": {}, + "outputs": [], "source": [ - "!tar -xzf 20_newsgroups.tar.gz\n", - "!ls 20_newsgroups" + "file_list = [os.path.join(data_dir, f) for f in os.listdir(data_dir)]\n", + "print(\"Number of files:\", len(file_list))" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "developed-monte", - "metadata": { - "execution": { - "iopub.execute_input": "2021-06-08T21:05:10.048277Z", - "iopub.status.busy": "2021-06-08T21:05:10.047405Z", - "iopub.status.idle": "2021-06-08T21:05:10.091088Z", - "shell.execute_reply": "2021-06-08T21:05:10.090517Z" - }, - "papermill": { - "duration": 0.087547, - "end_time": "2021-06-08T21:05:10.091211", - "exception": false, - "start_time": "2021-06-08T21:05:10.003664", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of documents: 19997\n" - ] - } - ], + "execution_count": null, + "id": "c1b44ef4", + "metadata": {}, + "outputs": [], "source": [ - "folders = [\n", - " os.path.join(data_dir, f)\n", - " for f in sorted(os.listdir(data_dir))\n", - " if os.path.isdir(os.path.join(data_dir, f))\n", - "]\n", - "file_list = [os.path.join(d, f) for d in folders for f in os.listdir(d)]\n", - "print(\"Number of documents:\", len(file_list))" + "import pandas as pd\n", + "\n", + "\n", + "documents_count = 0\n", + "for file in 
file_list:\n", + " df = pd.read_csv(file, header=None, names=[\"text\"])\n", + " documents_count = documents_count + df.shape[0]\n", + "print(\"Number of documents:\", documents_count)" ] }, { "cell_type": "markdown", - "id": "destroyed-nothing", - "metadata": { - "papermill": { - "duration": 0.038869, - "end_time": "2021-06-08T21:05:10.169455", - "exception": false, - "start_time": "2021-06-08T21:05:10.130586", - "status": "completed" - }, - "tags": [] - }, + "id": "95ccaa85", + "metadata": {}, "source": [ - "Here we read in the content of all the files and remove the header, footer and quotes (of earlier messages in each email)." + "The following function removes the header, footer, and quoted text (earlier messages quoted within each post)." ] }, { "cell_type": "code", - "execution_count": 11, - "id": "about-lover", - "metadata": { - "execution": { - "iopub.execute_input": "2021-06-08T21:05:10.252718Z", - "iopub.status.busy": "2021-06-08T21:05:10.251761Z", - "iopub.status.idle": "2021-06-08T21:05:12.857853Z", - "shell.execute_reply": "2021-06-08T21:05:12.857307Z" - }, - "papermill": { - "duration": 2.649839, - "end_time": "2021-06-08T21:05:12.858015", - "exception": false, - "start_time": "2021-06-08T21:05:10.208176", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.datasets.twenty_newsgroups module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.datasets. 
Anything that cannot be imported from sklearn.datasets is now part of the private API.\n", - " warnings.warn(message, FutureWarning)\n" - ] - } - ], + "execution_count": null, + "id": "62348ad5", + "metadata": {}, + "outputs": [], + "source": [ + "def strip_newsgroup_item(item):\n", + " item = strip_newsgroup_header(item)\n", + " item = strip_newsgroup_quoting(item)\n", + " item = strip_newsgroup_footer(item)\n", + " return item" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee198539", + "metadata": {}, + "outputs": [], "source": [ "from sklearn.datasets.twenty_newsgroups import (\n", " strip_newsgroup_header,\n", @@ -372,19 +268,18 @@ ")\n", "\n", "data = []\n", - "for f in file_list:\n", - " with open(f, \"rb\") as fin:\n", - " content = fin.read().decode(\"latin1\")\n", - " # remove header, quoting and footer\n", - " content = strip_newsgroup_header(content)\n", - " content = strip_newsgroup_quoting(content)\n", - " content = strip_newsgroup_footer(content)\n", - " data.append(content)" + "\n", + "for file in file_list:\n", + " print(f\"Processing {file}\")\n", + " label = file.split(\"/\")[1]\n", + " df = pd.read_csv(file, header=None, names=[\"text\"])\n", + " df[\"text\"] = df[\"text\"].apply(strip_newsgroup_item)\n", + " data.extend(df[\"text\"].tolist())" ] }, { "cell_type": "markdown", - "id": "educated-sigma", + "id": "2028ec9a", "metadata": { "papermill": { "duration": 0.039066, @@ -401,8 +296,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "id": "plastic-right", + "execution_count": null, + "id": "6cca6efd", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:05:13.025327Z", @@ -419,27 +314,14 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "['But it\\'s STILL HAPPENING. That\\'s the entire point. Only last month, John\\nMajor hailed it as a great victory that he had personally secured a sale of\\narms to Saudi Arabia. 
The same month, we sold jet fighters to the same\\nIndonesian government that\\'s busy killing the East Timorese.\\n\\nIt\\'s all very well to say \"Oops, we made a boo-boo, better clean up the\\nmistake\", but the US and UK *keep* making the *same* mistake. They do it so\\noften that I can\\'t believe it\\'s not deliberate. This suspicion is reinforced\\nby the fact that the mistake is an extremely profitable one for a decrepit\\neconomy reliant on arms sales.\\n\\n\\nNo, I thought both were terrible.\\n',\n", - " 'Does anybody have Bobby\\'s post in which he said something like \"I don\\'t\\nknow why there are more men than women in islamic countries. Maybe it\\'s\\natheists killing the female children\"? It\\'s my personal favorite!\\n',\n", - " 'MC> Theory of Creationism: MY theistic view of the theory of\\nMC> creationism, (there are many others) is stated in Genesis\\nMC> 1. In the beginning God created the heavens and the earth.\\n\\nAnd which order of Creation do you accept?\\tThe story of creation is one of the\\nmany places in the Bible where the Story contradicts itself. 
The following is\\nan example...\\n\\nGEN 1:25 And God made the beast of the earth after his kind, and cattle \\nafter their kind, and every thing that creepeth upon the earth after his\\nkind: and God saw that it was good.\\nGEN 1:26 And God said, Let us make man in our image, after our likeness: \\nand let them have dominion over the fish of the sea, and over the fowl of\\nthe air, and over the cattle, and over all the earth, and over every\\ncreeping thing that creepeth upon the earth.\\n\\nGEN 2:18 And the LORD God said, It is not good that the man should be\\nalone; I will make him an help meet for him.\\nGEN 2:19 And out of the ground the LORD God formed every beast of the \\nfield, and every fowl of the air; and brought them unto Adam to see what he\\nwould call them: and whatsoever Adam called every living creature, that was\\nthe name thereof.\\n\\nEven your Bible cannot agree on how things were created. Why should we\\nbelieve in it?']" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "data[10:13]" ] }, { "cell_type": "markdown", - "id": "strange-lithuania", + "id": "cbdb1fe8", "metadata": { "papermill": { "duration": 0.039457, @@ -473,7 +355,7 @@ }, { "cell_type": "markdown", - "id": "marked-queen", + "id": "8ed0ed38", "metadata": { "papermill": { "duration": 0.039099, @@ -491,8 +373,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "id": "polish-operator", + "execution_count": null, + "id": "ac6fd71a", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:05:13.272646Z", @@ -509,28 +391,8 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: nltk in /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (3.4.5)\n", - "Requirement already satisfied: six in /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from nltk) (1.14.0)\n" - ] - }, - { - "name": 
"stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...\n", - "[nltk_data] Package punkt is already up-to-date!\n", - "[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...\n", - "[nltk_data] Package wordnet is already up-to-date!\n" - ] - } - ], + "outputs": [], "source": [ - "!pip install nltk\n", "import nltk\n", "\n", "nltk.download(\"punkt\")\n", @@ -556,7 +418,7 @@ }, { "cell_type": "markdown", - "id": "burning-manufacturer", + "id": "8198b7db", "metadata": { "papermill": { "duration": 0.042907, @@ -573,8 +435,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "minus-belarus", + "execution_count": null, + "id": "fd806401", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:05:17.904594Z", @@ -591,31 +453,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tokenizing and counting, this may take a few minutes...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/sklearn/feature_extraction/text.py:385: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ha', 'le', 'u', 'wa'] not in stop_words.\n", - " 'stop_words.' % sorted(inconsistent))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "vocab size: 2000\n", - "Done. 
Time elapsed: 48.74s\n" - ] - } - ], + "outputs": [], "source": [ "import time\n", "import numpy as np\n", @@ -647,7 +485,7 @@ }, { "cell_type": "markdown", - "id": "centered-brazilian", + "id": "5eeaf288", "metadata": { "papermill": { "duration": 0.043706, @@ -664,8 +502,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "motivated-arbitration", + "execution_count": null, + "id": "e1459db7", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:06:22.299481Z", @@ -682,16 +520,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "removed short docs (<25 words)\n", - "(9677, 2000)\n" - ] - } - ], + "outputs": [], "source": [ "threshold = 25\n", "vectors = vectors[\n", @@ -705,7 +534,7 @@ }, { "cell_type": "markdown", - "id": "sustained-teaching", + "id": "1530abcb", "metadata": { "papermill": { "duration": 0.043682, @@ -722,8 +551,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "christian-spread", + "execution_count": null, + "id": "c2652ee9", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:06:22.487668Z", @@ -740,51 +569,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " int64\n", - " (0, 1814)\t1\n", - " (0, 1008)\t1\n", - " (0, 1274)\t1\n", - " (0, 137)\t1\n", - " (0, 1593)\t1\n", - " (0, 1046)\t1\n", - " (0, 1525)\t1\n", - " (0, 1337)\t1\n", - " (0, 1167)\t1\n", - " (0, 1861)\t1\n", - " (0, 200)\t1\n", - " (0, 1025)\t1\n", - " (0, 1758)\t1\n", - " (0, 770)\t1\n", - " (0, 785)\t1\n", - " (0, 1799)\t1\n", - " (0, 768)\t1\n", - " (0, 729)\t2\n", - " (0, 1024)\t1\n", - " (0, 961)\t1\n", - " (0, 750)\t1\n", - " (0, 763)\t1\n", - " (0, 155)\t1\n", - " (0, 1249)\t1\n", - " (0, 1132)\t1\n", - " (0, 1945)\t2\n", - " (0, 558)\t1\n", - " (0, 1181)\t1\n", - " (0, 559)\t1\n", - " (0, 1630)\t1\n", - " (0, 87)\t1\n", - " (0, 1178)\t1\n", - " (0, 1273)\t1\n", - " (0, 1749)\t2\n", - " (0, 1568)\t1\n", - " (0, 
1315)\t2\n" - ] - } - ], + "outputs": [], "source": [ "print(type(vectors), vectors.dtype)\n", "print(vectors[0])" @@ -792,7 +577,7 @@ }, { "cell_type": "markdown", - "id": "intermediate-dairy", + "id": "062c066f", "metadata": { "papermill": { "duration": 0.044141, @@ -809,8 +594,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "id": "naughty-glory", + "execution_count": null, + "id": "3987a91b", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:06:22.671001Z", @@ -827,15 +612,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " float32\n" - ] - } - ], + "outputs": [], "source": [ "import scipy.sparse as sparse\n", "\n", @@ -845,7 +622,7 @@ }, { "cell_type": "markdown", - "id": "organized-memphis", + "id": "f775a26c", "metadata": { "papermill": { "duration": 0.044211, @@ -866,8 +643,8 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "awful-blackberry", + "execution_count": null, + "id": "60a187f4", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:06:22.856890Z", @@ -900,8 +677,8 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "varied-turkish", + "execution_count": null, + "id": "15ad2043", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:06:22.956233Z", @@ -918,22 +695,14 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(7741, 2000) (968, 2000) (968, 2000)\n" - ] - } - ], + "outputs": [], "source": [ "print(train_vectors.shape, test_vectors.shape, val_vectors.shape)" ] }, { "cell_type": "markdown", - "id": "structured-hampshire", + "id": "c00cb142", "metadata": { "papermill": { "duration": 0.044527, @@ -953,7 +722,7 @@ }, { "cell_type": "markdown", - "id": "characteristic-craft", + "id": "d655334d", "metadata": { "papermill": { "duration": 0.045019, @@ -975,8 +744,8 @@ }, { "cell_type": "code", - "execution_count": 20, - "id": "fuzzy-borough", + "execution_count": null, + 
"id": "47d50738", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:06:23.231174Z", @@ -1009,8 +778,8 @@ }, { "cell_type": "code", - "execution_count": 21, - "id": "automated-republic", + "execution_count": null, + "id": "147c2113", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:06:24.948559Z", @@ -1028,17 +797,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training set location s3://sagemaker-us-west-2-688520471316/20newsgroups/train\n", - "Validation set location s3://sagemaker-us-west-2-688520471316/20newsgroups/val\n", - "Trained model will be saved at s3://sagemaker-us-west-2-688520471316/20newsgroups/output\n" - ] - } - ], + "outputs": [], "source": [ "prefix = \"20newsgroups\"\n", "\n", @@ -1056,7 +815,7 @@ }, { "cell_type": "markdown", - "id": "reserved-salem", + "id": "37aa22f3", "metadata": { "papermill": { "duration": 0.045114, @@ -1077,8 +836,8 @@ }, { "cell_type": "code", - "execution_count": 22, - "id": "dental-major", + "execution_count": null, + "id": "9525de29", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:06:25.135640Z", @@ -1103,8 +862,8 @@ }, { "cell_type": "code", - "execution_count": 23, - "id": "recognized-forum", + "execution_count": null, + "id": "b39be48d", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:06:25.236469Z", @@ -1150,8 +909,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "id": "public-guard", + "execution_count": null, + "id": "8c27fc2d", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:06:25.334393Z", @@ -1168,23 +927,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Uploaded data to s3://sagemaker-us-west-2-688520471316/20newsgroups/train/train_part0.pbr\n", - "Uploaded data to s3://sagemaker-us-west-2-688520471316/20newsgroups/train/train_part1.pbr\n", - "Uploaded data to 
s3://sagemaker-us-west-2-688520471316/20newsgroups/train/train_part2.pbr\n", - "Uploaded data to s3://sagemaker-us-west-2-688520471316/20newsgroups/train/train_part3.pbr\n", - "Uploaded data to s3://sagemaker-us-west-2-688520471316/20newsgroups/train/train_part4.pbr\n", - "Uploaded data to s3://sagemaker-us-west-2-688520471316/20newsgroups/train/train_part5.pbr\n", - "Uploaded data to s3://sagemaker-us-west-2-688520471316/20newsgroups/train/train_part6.pbr\n", - "Uploaded data to s3://sagemaker-us-west-2-688520471316/20newsgroups/train/train_part7.pbr\n", - "Uploaded data to s3://sagemaker-us-west-2-688520471316/20newsgroups/val/val_part0.pbr\n" - ] - } - ], + "outputs": [], "source": [ "split_convert_upload(\n", " train_vectors, bucket=bucket, prefix=train_prefix, fname_template=\"train_part{}.pbr\", n_parts=8\n", @@ -1196,7 +939,7 @@ }, { "cell_type": "markdown", - "id": "streaming-trace", + "id": "ca066694", "metadata": { "papermill": { "duration": 0.048076, @@ -1216,7 +959,7 @@ }, { "cell_type": "markdown", - "id": "capital-lobby", + "id": "86b297e8", "metadata": { "papermill": { "duration": 0.047668, @@ -1233,8 +976,8 @@ }, { "cell_type": "code", - "execution_count": 25, - "id": "figured-forwarding", + "execution_count": null, + "id": "b7e5589e", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:06:28.188477Z", @@ -1255,12 +998,13 @@ "source": [ "import boto3\n", "from sagemaker.image_uris import retrieve\n", + "\n", "container = retrieve(\"ntm\", boto3.Session().region_name)" ] }, { "cell_type": "markdown", - "id": "northern-dancing", + "id": "0eaf3809", "metadata": { "papermill": { "duration": 0.047712, @@ -1277,8 +1021,8 @@ }, { "cell_type": "code", - "execution_count": 26, - "id": "listed-progressive", + "execution_count": null, + "id": "db0842c1", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:06:28.395130Z", @@ -1312,7 +1056,7 @@ }, { "cell_type": "markdown", - "id": "caring-arctic", + "id": "95a405f8", "metadata": { 
"papermill": { "duration": 0.047662, @@ -1339,8 +1083,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "id": "unlimited-ocean", + "execution_count": null, + "id": "772f52ed", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:06:28.618626Z", @@ -1373,7 +1117,7 @@ }, { "cell_type": "markdown", - "id": "collect-theory", + "id": "f1aa584b", "metadata": { "papermill": { "duration": 0.04791, @@ -1395,8 +1139,8 @@ }, { "cell_type": "code", - "execution_count": 28, - "id": "confused-absorption", + "execution_count": null, + "id": "3b50dfa4", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:06:28.815330Z", @@ -1416,12 +1160,13 @@ "outputs": [], "source": [ "from sagemaker.inputs import TrainingInput\n", + "\n", "s3_train = TrainingInput(s3_train_data, distribution=\"ShardedByS3Key\")" ] }, { "cell_type": "markdown", - "id": "wired-clause", + "id": "2e0d6c2f", "metadata": { "papermill": { "duration": 0.047552, @@ -1438,8 +1183,8 @@ }, { "cell_type": "code", - "execution_count": 29, - "id": "reverse-syndrome", + "execution_count": null, + "id": "c9e4b1e2", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:06:29.014408Z", @@ -1457,3813 +1202,14 @@ "scrolled": true, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2021-06-08 23:15:03 Starting - Starting the training job...\n", - "2021-06-08 23:15:06 Starting - Launching requested ML instances......\n", - "2021-06-08 23:16:16 Starting - Preparing the instances for training......\n", - "2021-06-08 23:17:21 Downloading - Downloading input data\n", - "2021-06-08 23:17:21 Training - Downloading the training image.....\u001b[35mDocker entrypoint called with argument(s): train\u001b[0m\n", - "\u001b[35mRunning default environment configuration script\u001b[0m\n", - "\u001b[35m/opt/amazon/lib/python3.7/site-packages/jsonref.py:8: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' 
is deprecated since Python 3.3,and in 3.9 it will stop working\n", - " from collections import Mapping, MutableMapping, Sequence\u001b[0m\n", - "\u001b[34mDocker entrypoint called with argument(s): train\u001b[0m\n", - "\u001b[34mRunning default environment configuration script\u001b[0m\n", - "\u001b[34m/opt/amazon/lib/python3.7/site-packages/jsonref.py:8: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working\n", - " from collections import Mapping, MutableMapping, Sequence\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:14 INFO 139703431448384] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/default-input.json: {'encoder_layers': 'auto', 'mini_batch_size': '256', 'epochs': '50', 'encoder_layers_activation': 'sigmoid', 'optimizer': 'adadelta', 'tolerance': '0.001', 'num_patience_epochs': '3', 'batch_norm': 'false', 'rescale_gradient': '1.0', 'clip_gradient': 'Inf', 'weight_decay': '0.0', 'learning_rate': '0.01', 'sub_sample': '1.0', '_tuning_objective_metric': '', '_data_format': 'record', '_num_gpus': 'auto', '_num_kv_servers': 'auto', '_kvstore': 'auto_gpu'}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:14 INFO 139703431448384] Merging with provided configuration from /opt/ml/input/config/hyperparameters.json: {'feature_dim': '2000', 'num_topics': '20', 'num_patience_epochs': '5', 'epochs': '100', 'tolerance': '0.001', 'mini_batch_size': '128'}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:14 INFO 139703431448384] Final configuration: {'encoder_layers': 'auto', 'mini_batch_size': '128', 'epochs': '100', 'encoder_layers_activation': 'sigmoid', 'optimizer': 'adadelta', 'tolerance': '0.001', 'num_patience_epochs': '5', 'batch_norm': 'false', 'rescale_gradient': '1.0', 'clip_gradient': 'Inf', 'weight_decay': '0.0', 'learning_rate': '0.01', 'sub_sample': '1.0', '_tuning_objective_metric': '', '_data_format': 'record', '_num_gpus': 
'auto', '_num_kv_servers': 'auto', '_kvstore': 'auto_gpu', 'feature_dim': '2000', 'num_topics': '20'}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:14 INFO 139703431448384] nvidia-smi: took 0.028 seconds to run.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:14 INFO 139703431448384] nvidia-smi identified 0 GPUs.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:14 INFO 140403258652480] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/default-input.json: {'encoder_layers': 'auto', 'mini_batch_size': '256', 'epochs': '50', 'encoder_layers_activation': 'sigmoid', 'optimizer': 'adadelta', 'tolerance': '0.001', 'num_patience_epochs': '3', 'batch_norm': 'false', 'rescale_gradient': '1.0', 'clip_gradient': 'Inf', 'weight_decay': '0.0', 'learning_rate': '0.01', 'sub_sample': '1.0', '_tuning_objective_metric': '', '_data_format': 'record', '_num_gpus': 'auto', '_num_kv_servers': 'auto', '_kvstore': 'auto_gpu'}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:14 INFO 140403258652480] Merging with provided configuration from /opt/ml/input/config/hyperparameters.json: {'feature_dim': '2000', 'num_topics': '20', 'num_patience_epochs': '5', 'epochs': '100', 'tolerance': '0.001', 'mini_batch_size': '128'}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:14 INFO 140403258652480] Final configuration: {'encoder_layers': 'auto', 'mini_batch_size': '128', 'epochs': '100', 'encoder_layers_activation': 'sigmoid', 'optimizer': 'adadelta', 'tolerance': '0.001', 'num_patience_epochs': '5', 'batch_norm': 'false', 'rescale_gradient': '1.0', 'clip_gradient': 'Inf', 'weight_decay': '0.0', 'learning_rate': '0.01', 'sub_sample': '1.0', '_tuning_objective_metric': '', '_data_format': 'record', '_num_gpus': 'auto', '_num_kv_servers': 'auto', '_kvstore': 'auto_gpu', 'feature_dim': '2000', 'num_topics': '20'}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:14 INFO 140403258652480] nvidia-smi: took 0.028 seconds to run.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:14 INFO 140403258652480] 
nvidia-smi identified 0 GPUs.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:15 INFO 139703431448384] Launching parameter server for role scheduler\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:15 INFO 139703431448384] {'ENVROOT': '/opt/amazon', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'cpp', 'HOSTNAME': 'ip-10-0-132-203.us-west-2.compute.internal', 'TRAINING_JOB_NAME': 'ntm-2021-06-08-23-15-03-276', 'NVIDIA_REQUIRE_CUDA': 'cuda>=9.0', 'TRAINING_JOB_ARN': 'arn:aws:sagemaker:us-west-2:688520471316:training-job/ntm-2021-06-08-23-15-03-276', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/v2/credentials/fd94bbd0-d2ad-41d3-b8d9-c2e03adae0b0', 'CANONICAL_ENVROOT': '/opt/amazon', 'PYTHONUNBUFFERED': 'TRUE', 'NVIDIA_VISIBLE_DEVICES': 'void', 'LD_LIBRARY_PATH': '/opt/amazon/lib/python3.7/site-packages/cv2/../../../../lib:/usr/local/nvidia/lib64:/opt/amazon/lib', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'AWS_EXECUTION_ENV': 'AWS_ECS_EC2', 'PATH': '/opt/amazon/bin:/usr/local/nvidia/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/amazon/bin:/opt/amazon/bin', 'MXNET_STORAGE_FALLBACK_LOG_VERBOSE': '0', 'PWD': '/', 'LANG': 'en_US.utf8', 'AWS_REGION': 'us-west-2', 'SAGEMAKER_METRICS_DIRECTORY': '/opt/ml/output/metrics/sagemaker', 'HOME': '/root', 'SHLVL': '1', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION': '2', 'OMP_NUM_THREADS': '2', 'DMLC_INTERFACE': 'eth0', 'ECS_CONTAINER_METADATA_URI': 'http://169.254.170.2/v3/60b36e67-87ed-4157-a58b-cacd43209060', 'ECS_CONTAINER_METADATA_URI_V4': 'http://169.254.170.2/v4/60b36e67-87ed-4157-a58b-cacd43209060', 'SAGEMAKER_HTTP_PORT': '8080', 'SAGEMAKER_DATA_PATH': '/opt/ml'}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:15 INFO 139703431448384] envs={'ENVROOT': '/opt/amazon', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'cpp', 'HOSTNAME': 'ip-10-0-132-203.us-west-2.compute.internal', 'TRAINING_JOB_NAME': 'ntm-2021-06-08-23-15-03-276', 'NVIDIA_REQUIRE_CUDA': 'cuda>=9.0', 'TRAINING_JOB_ARN': 
'arn:aws:sagemaker:us-west-2:688520471316:training-job/ntm-2021-06-08-23-15-03-276', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/v2/credentials/fd94bbd0-d2ad-41d3-b8d9-c2e03adae0b0', 'CANONICAL_ENVROOT': '/opt/amazon', 'PYTHONUNBUFFERED': 'TRUE', 'NVIDIA_VISIBLE_DEVICES': 'void', 'LD_LIBRARY_PATH': '/opt/amazon/lib/python3.7/site-packages/cv2/../../../../lib:/usr/local/nvidia/lib64:/opt/amazon/lib', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'AWS_EXECUTION_ENV': 'AWS_ECS_EC2', 'PATH': '/opt/amazon/bin:/usr/local/nvidia/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/amazon/bin:/opt/amazon/bin', 'MXNET_STORAGE_FALLBACK_LOG_VERBOSE': '0', 'PWD': '/', 'LANG': 'en_US.utf8', 'AWS_REGION': 'us-west-2', 'SAGEMAKER_METRICS_DIRECTORY': '/opt/ml/output/metrics/sagemaker', 'HOME': '/root', 'SHLVL': '1', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION': '2', 'OMP_NUM_THREADS': '2', 'DMLC_INTERFACE': 'eth0', 'ECS_CONTAINER_METADATA_URI': 'http://169.254.170.2/v3/60b36e67-87ed-4157-a58b-cacd43209060', 'ECS_CONTAINER_METADATA_URI_V4': 'http://169.254.170.2/v4/60b36e67-87ed-4157-a58b-cacd43209060', 'SAGEMAKER_HTTP_PORT': '8080', 'SAGEMAKER_DATA_PATH': '/opt/ml', 'DMLC_ROLE': 'scheduler', 'DMLC_PS_ROOT_URI': '10.0.132.203', 'DMLC_PS_ROOT_PORT': '9000', 'DMLC_NUM_SERVER': '2', 'DMLC_NUM_WORKER': '2'}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:15 INFO 139703431448384] Launching parameter server for role server\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:15 INFO 139703431448384] {'ENVROOT': '/opt/amazon', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'cpp', 'HOSTNAME': 'ip-10-0-132-203.us-west-2.compute.internal', 'TRAINING_JOB_NAME': 'ntm-2021-06-08-23-15-03-276', 'NVIDIA_REQUIRE_CUDA': 'cuda>=9.0', 'TRAINING_JOB_ARN': 'arn:aws:sagemaker:us-west-2:688520471316:training-job/ntm-2021-06-08-23-15-03-276', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/v2/credentials/fd94bbd0-d2ad-41d3-b8d9-c2e03adae0b0', 'CANONICAL_ENVROOT': '/opt/amazon', 'PYTHONUNBUFFERED': 
'TRUE', 'NVIDIA_VISIBLE_DEVICES': 'void', 'LD_LIBRARY_PATH': '/opt/amazon/lib/python3.7/site-packages/cv2/../../../../lib:/usr/local/nvidia/lib64:/opt/amazon/lib', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'AWS_EXECUTION_ENV': 'AWS_ECS_EC2', 'PATH': '/opt/amazon/bin:/usr/local/nvidia/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/amazon/bin:/opt/amazon/bin', 'MXNET_STORAGE_FALLBACK_LOG_VERBOSE': '0', 'PWD': '/', 'LANG': 'en_US.utf8', 'AWS_REGION': 'us-west-2', 'SAGEMAKER_METRICS_DIRECTORY': '/opt/ml/output/metrics/sagemaker', 'HOME': '/root', 'SHLVL': '1', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION': '2', 'OMP_NUM_THREADS': '2', 'DMLC_INTERFACE': 'eth0', 'ECS_CONTAINER_METADATA_URI': 'http://169.254.170.2/v3/60b36e67-87ed-4157-a58b-cacd43209060', 'ECS_CONTAINER_METADATA_URI_V4': 'http://169.254.170.2/v4/60b36e67-87ed-4157-a58b-cacd43209060', 'SAGEMAKER_HTTP_PORT': '8080', 'SAGEMAKER_DATA_PATH': '/opt/ml'}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:15 INFO 139703431448384] envs={'ENVROOT': '/opt/amazon', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'cpp', 'HOSTNAME': 'ip-10-0-132-203.us-west-2.compute.internal', 'TRAINING_JOB_NAME': 'ntm-2021-06-08-23-15-03-276', 'NVIDIA_REQUIRE_CUDA': 'cuda>=9.0', 'TRAINING_JOB_ARN': 'arn:aws:sagemaker:us-west-2:688520471316:training-job/ntm-2021-06-08-23-15-03-276', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/v2/credentials/fd94bbd0-d2ad-41d3-b8d9-c2e03adae0b0', 'CANONICAL_ENVROOT': '/opt/amazon', 'PYTHONUNBUFFERED': 'TRUE', 'NVIDIA_VISIBLE_DEVICES': 'void', 'LD_LIBRARY_PATH': '/opt/amazon/lib/python3.7/site-packages/cv2/../../../../lib:/usr/local/nvidia/lib64:/opt/amazon/lib', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'AWS_EXECUTION_ENV': 'AWS_ECS_EC2', 'PATH': '/opt/amazon/bin:/usr/local/nvidia/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/amazon/bin:/opt/amazon/bin', 'MXNET_STORAGE_FALLBACK_LOG_VERBOSE': '0', 'PWD': '/', 'LANG': 'en_US.utf8', 'AWS_REGION': 
'us-west-2', 'SAGEMAKER_METRICS_DIRECTORY': '/opt/ml/output/metrics/sagemaker', 'HOME': '/root', 'SHLVL': '1', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION': '2', 'OMP_NUM_THREADS': '2', 'DMLC_INTERFACE': 'eth0', 'ECS_CONTAINER_METADATA_URI': 'http://169.254.170.2/v3/60b36e67-87ed-4157-a58b-cacd43209060', 'ECS_CONTAINER_METADATA_URI_V4': 'http://169.254.170.2/v4/60b36e67-87ed-4157-a58b-cacd43209060', 'SAGEMAKER_HTTP_PORT': '8080', 'SAGEMAKER_DATA_PATH': '/opt/ml', 'DMLC_ROLE': 'server', 'DMLC_PS_ROOT_URI': '10.0.132.203', 'DMLC_PS_ROOT_PORT': '9000', 'DMLC_NUM_SERVER': '2', 'DMLC_NUM_WORKER': '2'}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:15 INFO 139703431448384] Environment: {'ENVROOT': '/opt/amazon', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'cpp', 'HOSTNAME': 'ip-10-0-132-203.us-west-2.compute.internal', 'TRAINING_JOB_NAME': 'ntm-2021-06-08-23-15-03-276', 'NVIDIA_REQUIRE_CUDA': 'cuda>=9.0', 'TRAINING_JOB_ARN': 'arn:aws:sagemaker:us-west-2:688520471316:training-job/ntm-2021-06-08-23-15-03-276', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/v2/credentials/fd94bbd0-d2ad-41d3-b8d9-c2e03adae0b0', 'CANONICAL_ENVROOT': '/opt/amazon', 'PYTHONUNBUFFERED': 'TRUE', 'NVIDIA_VISIBLE_DEVICES': 'void', 'LD_LIBRARY_PATH': '/opt/amazon/lib/python3.7/site-packages/cv2/../../../../lib:/usr/local/nvidia/lib64:/opt/amazon/lib', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'AWS_EXECUTION_ENV': 'AWS_ECS_EC2', 'PATH': '/opt/amazon/bin:/usr/local/nvidia/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/amazon/bin:/opt/amazon/bin', 'MXNET_STORAGE_FALLBACK_LOG_VERBOSE': '0', 'PWD': '/', 'LANG': 'en_US.utf8', 'AWS_REGION': 'us-west-2', 'SAGEMAKER_METRICS_DIRECTORY': '/opt/ml/output/metrics/sagemaker', 'HOME': '/root', 'SHLVL': '1', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION': '2', 'OMP_NUM_THREADS': '2', 'DMLC_INTERFACE': 'eth0', 'ECS_CONTAINER_METADATA_URI': 'http://169.254.170.2/v3/60b36e67-87ed-4157-a58b-cacd43209060', 'ECS_CONTAINER_METADATA_URI_V4': 
'http://169.254.170.2/v4/60b36e67-87ed-4157-a58b-cacd43209060', 'SAGEMAKER_HTTP_PORT': '8080', 'SAGEMAKER_DATA_PATH': '/opt/ml', 'DMLC_ROLE': 'worker', 'DMLC_PS_ROOT_URI': '10.0.132.203', 'DMLC_PS_ROOT_PORT': '9000', 'DMLC_NUM_SERVER': '2', 'DMLC_NUM_WORKER': '2'}\u001b[0m\n", - "\u001b[34mProcess 33 is a shell:scheduler.\u001b[0m\n", - "\u001b[34mProcess 34 is a shell:server.\u001b[0m\n", - "\u001b[34mProcess 1 is a worker.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:15 INFO 139703431448384] Using default worker.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:15 INFO 139703431448384] Checkpoint loading and saving are disabled.\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:15.519] [tensorio] [warning] TensorIO is already initialized; ignoring the initialization routine.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:15 INFO 139703431448384] Initializing\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:15 INFO 139703431448384] None\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:15 INFO 139703431448384] vocab.txt\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:15 INFO 139703431448384] Vocab file is not provided\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:15 INFO 139703431448384] Number of GPUs being used: 0\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:15 INFO 139703431448384] Create Store: dist_async\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:15 INFO 140403258652480] Launching parameter server for role server\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:15 INFO 140403258652480] {'ENVROOT': '/opt/amazon', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'cpp', 'HOSTNAME': 'ip-10-0-172-252.us-west-2.compute.internal', 'TRAINING_JOB_NAME': 'ntm-2021-06-08-23-15-03-276', 'NVIDIA_REQUIRE_CUDA': 'cuda>=9.0', 'TRAINING_JOB_ARN': 'arn:aws:sagemaker:us-west-2:688520471316:training-job/ntm-2021-06-08-23-15-03-276', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/v2/credentials/5c199049-6e7a-415e-bc69-28a3e33c5f9a', 'CANONICAL_ENVROOT': '/opt/amazon', 'PYTHONUNBUFFERED': 'TRUE', 'NVIDIA_VISIBLE_DEVICES': 
'void', 'LD_LIBRARY_PATH': '/opt/amazon/lib/python3.7/site-packages/cv2/../../../../lib:/usr/local/nvidia/lib64:/opt/amazon/lib', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'AWS_EXECUTION_ENV': 'AWS_ECS_EC2', 'PATH': '/opt/amazon/bin:/usr/local/nvidia/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/amazon/bin:/opt/amazon/bin', 'MXNET_STORAGE_FALLBACK_LOG_VERBOSE': '0', 'PWD': '/', 'LANG': 'en_US.utf8', 'SAGEMAKER_METRICS_DIRECTORY': '/opt/ml/output/metrics/sagemaker', 'AWS_REGION': 'us-west-2', 'HOME': '/root', 'SHLVL': '1', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION': '2', 'OMP_NUM_THREADS': '2', 'ECS_CONTAINER_METADATA_URI': 'http://169.254.170.2/v3/60a11e7a-186d-4f37-91e9-9a832d095495', 'DMLC_INTERFACE': 'eth0', 'ECS_CONTAINER_METADATA_URI_V4': 'http://169.254.170.2/v4/60a11e7a-186d-4f37-91e9-9a832d095495', 'SAGEMAKER_HTTP_PORT': '8080', 'SAGEMAKER_DATA_PATH': '/opt/ml'}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:15 INFO 140403258652480] envs={'ENVROOT': '/opt/amazon', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'cpp', 'HOSTNAME': 'ip-10-0-172-252.us-west-2.compute.internal', 'TRAINING_JOB_NAME': 'ntm-2021-06-08-23-15-03-276', 'NVIDIA_REQUIRE_CUDA': 'cuda>=9.0', 'TRAINING_JOB_ARN': 'arn:aws:sagemaker:us-west-2:688520471316:training-job/ntm-2021-06-08-23-15-03-276', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/v2/credentials/5c199049-6e7a-415e-bc69-28a3e33c5f9a', 'CANONICAL_ENVROOT': '/opt/amazon', 'PYTHONUNBUFFERED': 'TRUE', 'NVIDIA_VISIBLE_DEVICES': 'void', 'LD_LIBRARY_PATH': '/opt/amazon/lib/python3.7/site-packages/cv2/../../../../lib:/usr/local/nvidia/lib64:/opt/amazon/lib', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'AWS_EXECUTION_ENV': 'AWS_ECS_EC2', 'PATH': '/opt/amazon/bin:/usr/local/nvidia/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/amazon/bin:/opt/amazon/bin', 'MXNET_STORAGE_FALLBACK_LOG_VERBOSE': '0', 'PWD': '/', 'LANG': 'en_US.utf8', 'SAGEMAKER_METRICS_DIRECTORY': 
'/opt/ml/output/metrics/sagemaker', 'AWS_REGION': 'us-west-2', 'HOME': '/root', 'SHLVL': '1', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION': '2', 'OMP_NUM_THREADS': '2', 'ECS_CONTAINER_METADATA_URI': 'http://169.254.170.2/v3/60a11e7a-186d-4f37-91e9-9a832d095495', 'DMLC_INTERFACE': 'eth0', 'ECS_CONTAINER_METADATA_URI_V4': 'http://169.254.170.2/v4/60a11e7a-186d-4f37-91e9-9a832d095495', 'SAGEMAKER_HTTP_PORT': '8080', 'SAGEMAKER_DATA_PATH': '/opt/ml', 'DMLC_ROLE': 'server', 'DMLC_PS_ROOT_URI': '10.0.132.203', 'DMLC_PS_ROOT_PORT': '9000', 'DMLC_NUM_SERVER': '2', 'DMLC_NUM_WORKER': '2'}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:15 INFO 140403258652480] Environment: {'ENVROOT': '/opt/amazon', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'cpp', 'HOSTNAME': 'ip-10-0-172-252.us-west-2.compute.internal', 'TRAINING_JOB_NAME': 'ntm-2021-06-08-23-15-03-276', 'NVIDIA_REQUIRE_CUDA': 'cuda>=9.0', 'TRAINING_JOB_ARN': 'arn:aws:sagemaker:us-west-2:688520471316:training-job/ntm-2021-06-08-23-15-03-276', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/v2/credentials/5c199049-6e7a-415e-bc69-28a3e33c5f9a', 'CANONICAL_ENVROOT': '/opt/amazon', 'PYTHONUNBUFFERED': 'TRUE', 'NVIDIA_VISIBLE_DEVICES': 'void', 'LD_LIBRARY_PATH': '/opt/amazon/lib/python3.7/site-packages/cv2/../../../../lib:/usr/local/nvidia/lib64:/opt/amazon/lib', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'AWS_EXECUTION_ENV': 'AWS_ECS_EC2', 'PATH': '/opt/amazon/bin:/usr/local/nvidia/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/amazon/bin:/opt/amazon/bin', 'MXNET_STORAGE_FALLBACK_LOG_VERBOSE': '0', 'PWD': '/', 'LANG': 'en_US.utf8', 'SAGEMAKER_METRICS_DIRECTORY': '/opt/ml/output/metrics/sagemaker', 'AWS_REGION': 'us-west-2', 'HOME': '/root', 'SHLVL': '1', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION': '2', 'OMP_NUM_THREADS': '2', 'ECS_CONTAINER_METADATA_URI': 'http://169.254.170.2/v3/60a11e7a-186d-4f37-91e9-9a832d095495', 'DMLC_INTERFACE': 'eth0', 'ECS_CONTAINER_METADATA_URI_V4': 
'http://169.254.170.2/v4/60a11e7a-186d-4f37-91e9-9a832d095495', 'SAGEMAKER_HTTP_PORT': '8080', 'SAGEMAKER_DATA_PATH': '/opt/ml', 'DMLC_ROLE': 'worker', 'DMLC_PS_ROOT_URI': '10.0.132.203', 'DMLC_PS_ROOT_PORT': '9000', 'DMLC_NUM_SERVER': '2', 'DMLC_NUM_WORKER': '2'}\u001b[0m\n", - "\u001b[35mProcess 34 is a shell:server.\u001b[0m\n", - "\u001b[35mProcess 1 is a worker.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:15 INFO 140403258652480] Using default worker.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:15 INFO 140403258652480] Checkpoint loading and saving are disabled.\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:15.517] [tensorio] [warning] TensorIO is already initialized; ignoring the initialization routine.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:15 INFO 140403258652480] Initializing\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:15 INFO 140403258652480] None\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:15 INFO 140403258652480] vocab.txt\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:15 INFO 140403258652480] Vocab file is not provided\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:15 INFO 140403258652480] Number of GPUs being used: 0\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:15 INFO 140403258652480] Create Store: dist_async\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194296.2641668, \"EndTime\": 1623194296.2641964, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"Meta\": \"init_train_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 0.0, \"count\": 1, \"min\": 0, \"max\": 0}, \"Total Batches Seen\": {\"sum\": 0.0, \"count\": 1, \"min\": 0, \"max\": 0}, \"Max Records Seen Between Resets\": {\"sum\": 0.0, \"count\": 1, \"min\": 0, \"max\": 0}, \"Max Batches Seen Between Resets\": {\"sum\": 0.0, \"count\": 1, \"min\": 0, \"max\": 0}, \"Reset Count\": {\"sum\": 0.0, \"count\": 1, \"min\": 0, \"max\": 0}, \"Number of Records Since Last Reset\": {\"sum\": 0.0, \"count\": 1, \"min\": 0, \"max\": 0}, 
\"Number of Batches Since Last Reset\": {\"sum\": 0.0, \"count\": 1, \"min\": 0, \"max\": 0}}}\n", - "\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:16.264] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 0, \"duration\": 757, \"num_examples\": 1, \"num_bytes\": 55596}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:16 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:16 INFO 139703431448384] # Starting training for epoch 1\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194296.2179184, \"EndTime\": 1623194296.217946, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"Meta\": \"init_train_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 0.0, \"count\": 1, \"min\": 0, \"max\": 0}, \"Total Batches Seen\": {\"sum\": 0.0, \"count\": 1, \"min\": 0, \"max\": 0}, \"Max Records Seen Between Resets\": {\"sum\": 0.0, \"count\": 1, \"min\": 0, \"max\": 0}, \"Max Batches Seen Between Resets\": {\"sum\": 0.0, \"count\": 1, \"min\": 0, \"max\": 0}, \"Reset Count\": {\"sum\": 0.0, \"count\": 1, \"min\": 0, \"max\": 0}, \"Number of Records Since Last Reset\": {\"sum\": 0.0, \"count\": 1, \"min\": 0, \"max\": 0}, \"Number of Batches Since Last Reset\": {\"sum\": 0.0, \"count\": 1, \"min\": 0, \"max\": 0}}}\n", - "\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:16.218] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 0, \"duration\": 705, \"num_examples\": 1, \"num_bytes\": 48048}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:16 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:16 INFO 140403258652480] # Starting training for epoch 1\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:17.362] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 2, \"duration\": 1097, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] # 
Finished training epoch 1 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] Loss (name: value) total: 7.021950817877246\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] Loss (name: value) kld: 0.01430272898587939\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] Loss (name: value) recons: 7.007648098853327\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] Loss (name: value) logppx: 7.021950817877246\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] #quality_metric: host=algo-1, epoch=1, train total_loss =7.021950817877246\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:17.369] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 0, \"duration\": 1849, \"num_examples\": 1, \"num_bytes\": 43464}\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:17.442] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 2, \"duration\": 73, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] Loss (name: value) total: 7.136931283133371\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] Loss (name: value) kld: 0.0017484314074473722\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] Loss (name: value) recons: 7.135182857513428\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] Loss (name: value) logppx: 7.136931283133371\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] #validation_score (1): 
7.136931283133371\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] Timing: train: 1.10s, val: 0.08s, epoch: 1.18s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] #progress_metric: host=algo-1, completed 1.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194296.2646484, \"EndTime\": 1623194297.4443984, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 0, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Total Batches Seen\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 2.0, \"count\": 1, \"min\": 2, \"max\": 2}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3278.2553720648184 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:17 INFO 139703431448384] # Starting training for epoch 2\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:17.275] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 2, \"duration\": 1056, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] # Finished training epoch 1 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 
140403258652480] Loss (name: value) total: 7.048994029721906\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] Loss (name: value) kld: 0.014254417152128994\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] Loss (name: value) recons: 7.034739609687559\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] Loss (name: value) logppx: 7.048994029721906\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] #quality_metric: host=algo-2, epoch=1, train total_loss =7.048994029721906\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:17.281] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 0, \"duration\": 1763, \"num_examples\": 1, \"num_bytes\": 43464}\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:17.377] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 2, \"duration\": 95, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] Loss (name: value) total: 7.139745439801898\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] Loss (name: value) kld: 0.0017864805746025272\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] Loss (name: value) recons: 7.137959071568081\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] Loss (name: value) logppx: 7.139745439801898\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] #validation_score (1): 7.139745439801898\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] Timing: train: 1.06s, val: 0.10s, epoch: 1.16s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] #progress_metric: host=algo-2, completed 1.0 % 
of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194296.2184074, \"EndTime\": 1623194297.379345, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 0, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Total Batches Seen\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 2.0, \"count\": 1, \"min\": 2, \"max\": 2}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3335.7898596094724 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:17 INFO 140403258652480] # Starting training for epoch 2\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:18.520] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 5, \"duration\": 1075, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] # Finished training epoch 2 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] Loss (name: value) total: 6.945022417652991\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] Loss (name: value) kld: 0.0031111678878416216\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] Loss (name: 
value) recons: 6.941911282077912\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] Loss (name: value) logppx: 6.945022417652991\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] #quality_metric: host=algo-1, epoch=2, train total_loss =6.945022417652991\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:18.681] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 5, \"duration\": 159, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] Loss (name: value) total: 7.119487626211984\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] Loss (name: value) kld: 0.0032733817185674396\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] Loss (name: value) recons: 7.116214207240513\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] Loss (name: value) logppx: 7.119487626211984\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] #validation_score (2): 7.119487626211984\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] Timing: train: 1.08s, val: 0.16s, epoch: 1.24s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] #progress_metric: host=algo-1, completed 2.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194297.4447153, \"EndTime\": 1623194298.6862426, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 1, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 7736.0, \"count\": 1, \"min\": 7736, \"max\": 7736}, \"Total Batches Seen\": {\"sum\": 62.0, \"count\": 1, \"min\": 62, \"max\": 62}, \"Max Records Seen Between 
Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 4.0, \"count\": 1, \"min\": 4, \"max\": 4}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3115.119614255585 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:18 INFO 139703431448384] # Starting training for epoch 3\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:18.564] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 5, \"duration\": 1185, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] # Finished training epoch 2 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] Loss (name: value) total: 6.961147154531171\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] Loss (name: value) kld: 0.003194315143064746\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] Loss (name: value) recons: 6.9579527608809935\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] Loss (name: value) logppx: 6.961147154531171\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] #quality_metric: host=algo-2, epoch=2, train total_loss =6.961147154531171\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:18.658] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 5, \"duration\": 91, 
\"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] Loss (name: value) total: 7.119496617998395\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] Loss (name: value) kld: 0.0033224847367299454\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] Loss (name: value) recons: 7.116174152919224\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] Loss (name: value) logppx: 7.119496617998395\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] #validation_score (2): 7.119496617998395\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] Timing: train: 1.19s, val: 0.10s, epoch: 1.29s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] #progress_metric: host=algo-2, completed 2.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194297.3795354, \"EndTime\": 1623194298.667446, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 1, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 7746.0, \"count\": 1, \"min\": 7746, \"max\": 7746}, \"Total Batches Seen\": {\"sum\": 62.0, \"count\": 1, \"min\": 62, \"max\": 62}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 4.0, \"count\": 1, \"min\": 4, \"max\": 4}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3006.7530111416822 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:18 INFO 140403258652480] # Starting training for epoch 3\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:19.860] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 8, \"duration\": 1173, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] # Finished training epoch 3 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] Loss (name: value) total: 6.9309429314828686\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] Loss (name: value) kld: 0.004128934038172085\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] Loss (name: value) recons: 6.926813971611761\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] Loss (name: value) logppx: 6.9309429314828686\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] #quality_metric: host=algo-1, epoch=3, train total_loss =6.9309429314828686\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:19.920] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 8, \"duration\": 1252, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:19 INFO 140403258652480] # Finished training epoch 3 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:19 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:19 INFO 140403258652480] Loss (name: value) total: 6.944729205100767\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:19 INFO 140403258652480] 
Loss (name: value) kld: 0.004006417156497557\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:19 INFO 140403258652480] Loss (name: value) recons: 6.940722815452084\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:19 INFO 140403258652480] Loss (name: value) logppx: 6.944729205100767\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:19 INFO 140403258652480] #quality_metric: host=algo-2, epoch=3, train total_loss =6.944729205100767\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:20.019] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 8, \"duration\": 95, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:20 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:20 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:20 INFO 140403258652480] Loss (name: value) total: 7.109448705400739\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:20 INFO 140403258652480] Loss (name: value) kld: 0.003918564678835017\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:20 INFO 140403258652480] Loss (name: value) recons: 7.105530193873814\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:20 INFO 140403258652480] Loss (name: value) logppx: 7.109448705400739\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:20 INFO 140403258652480] #validation_score (3): 7.109448705400739\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:20 INFO 140403258652480] Timing: train: 1.25s, val: 0.10s, epoch: 1.36s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:20 INFO 140403258652480] #progress_metric: host=algo-2, completed 3.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194298.6680233, \"EndTime\": 1623194300.024858, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 2, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 11619.0, \"count\": 1, \"min\": 11619, 
\"max\": 11619}, \"Total Batches Seen\": {\"sum\": 93.0, \"count\": 1, \"min\": 93, \"max\": 93}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 6.0, \"count\": 1, \"min\": 6, \"max\": 6}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:20 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=2854.06409378392 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:20 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:20 INFO 140403258652480] # Starting training for epoch 4\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:19.959] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 8, \"duration\": 96, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] Loss (name: value) total: 7.116677624838693\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] Loss (name: value) kld: 0.004577102804822581\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] Loss (name: value) recons: 7.112100533076695\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] Loss (name: value) logppx: 7.116677624838693\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] #validation_score (3): 7.116677624838693\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] Timing: train: 1.18s, val: 
0.10s, epoch: 1.28s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] #progress_metric: host=algo-1, completed 3.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194298.686546, \"EndTime\": 1623194299.9663444, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 2, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 11604.0, \"count\": 1, \"min\": 11604, \"max\": 11604}, \"Total Batches Seen\": {\"sum\": 93.0, \"count\": 1, \"min\": 93, \"max\": 93}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 6.0, \"count\": 1, \"min\": 6, \"max\": 6}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3022.0343676153866 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:19 INFO 139703431448384] # Starting training for epoch 4\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:20.939] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 11, \"duration\": 972, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:20 INFO 139703431448384] # Finished training epoch 4 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:20 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:20 INFO 139703431448384] Loss (name: value) total: 6.92934158155995\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:20 INFO 
139703431448384] Loss (name: value) kld: 0.005020001377429693\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:20 INFO 139703431448384] Loss (name: value) recons: 6.924321613004131\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:20 INFO 139703431448384] Loss (name: value) logppx: 6.92934158155995\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:20 INFO 139703431448384] #quality_metric: host=algo-1, epoch=4, train total_loss =6.92934158155995\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:21.076] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 11, \"duration\": 1051, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] # Finished training epoch 4 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] Loss (name: value) total: 6.940548423797853\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] Loss (name: value) kld: 0.005003343933382102\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] Loss (name: value) recons: 6.935545083015196\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] Loss (name: value) logppx: 6.940548423797853\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] #quality_metric: host=algo-2, epoch=4, train total_loss =6.940548423797853\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:21.152] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 11, \"duration\": 74, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] Loss 
(name: value) total: 7.105281829833984\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] Loss (name: value) kld: 0.005025317825909171\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] Loss (name: value) recons: 7.10025657926287\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] Loss (name: value) logppx: 7.105281829833984\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] #validation_score (4): 7.105281829833984\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] Timing: train: 1.05s, val: 0.08s, epoch: 1.13s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] #progress_metric: host=algo-2, completed 4.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194300.025165, \"EndTime\": 1623194301.1595345, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 3, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 15492.0, \"count\": 1, \"min\": 15492, \"max\": 15492}, \"Total Batches Seen\": {\"sum\": 124.0, \"count\": 1, \"min\": 124, \"max\": 124}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 8.0, \"count\": 1, \"min\": 8, \"max\": 8}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3413.730286856041 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:21 INFO 140403258652480] # Starting training for epoch 5\u001b[0m\n", - 
"\u001b[34m[2021-06-08 23:18:21.028] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 11, \"duration\": 87, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] Loss (name: value) total: 7.111966950552804\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] Loss (name: value) kld: 0.005739550172750439\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] Loss (name: value) recons: 7.106227466038296\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] Loss (name: value) logppx: 7.111966950552804\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] #validation_score (4): 7.111966950552804\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] Timing: train: 0.97s, val: 0.09s, epoch: 1.07s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] #progress_metric: host=algo-1, completed 4.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194299.96661, \"EndTime\": 1623194301.0333276, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 3, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 15472.0, \"count\": 1, \"min\": 15472, \"max\": 15472}, \"Total Batches Seen\": {\"sum\": 124.0, \"count\": 1, \"min\": 124, \"max\": 124}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 8.0, \"count\": 1, \"min\": 8, \"max\": 8}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, 
\"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3625.557345290125 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] # Starting training for epoch 5\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:22.114] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 14, \"duration\": 954, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] # Finished training epoch 5 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] Loss (name: value) total: 6.93538026655874\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] Loss (name: value) kld: 0.0059138025277324265\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] Loss (name: value) recons: 6.929466424449798\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] Loss (name: value) logppx: 6.93538026655874\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] #quality_metric: host=algo-2, epoch=5, train total_loss =6.93538026655874\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:21.986] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 14, \"duration\": 952, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] # Finished training epoch 5 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] Loss (name: value) total: 6.9223884728647045\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] Loss (name: value) kld: 0.005677949573333946\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] Loss (name: value) recons: 6.916710538248862\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] Loss (name: value) logppx: 6.9223884728647045\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:21 INFO 139703431448384] #quality_metric: host=algo-1, epoch=5, train total_loss =6.9223884728647045\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:22.058] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 14, \"duration\": 70, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:22 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:22 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:22 INFO 139703431448384] Loss (name: value) total: 7.098155226026263\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:22 INFO 139703431448384] Loss (name: value) kld: 0.005471409098910434\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:22 INFO 139703431448384] Loss (name: value) recons: 7.092683860233852\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:22 INFO 139703431448384] Loss (name: value) logppx: 7.098155226026263\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:22 INFO 139703431448384] #validation_score (5): 7.098155226026263\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:22 INFO 139703431448384] Timing: train: 0.95s, val: 0.08s, epoch: 1.03s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:22 INFO 139703431448384] #progress_metric: host=algo-1, completed 5.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194301.033632, \"EndTime\": 1623194302.0641418, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": 
\"algo-1\", \"Operation\": \"training\", \"epoch\": 4, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 19340.0, \"count\": 1, \"min\": 19340, \"max\": 19340}, \"Total Batches Seen\": {\"sum\": 155.0, \"count\": 1, \"min\": 155, \"max\": 155}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 10.0, \"count\": 1, \"min\": 10, \"max\": 10}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:22 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3752.8988635751152 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:22 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:22 INFO 139703431448384] # Starting training for epoch 6\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:22.218] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 14, \"duration\": 103, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] Loss (name: value) total: 7.094639165060861\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] Loss (name: value) kld: 0.005065387048359428\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] Loss (name: value) recons: 7.089573860168457\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] Loss (name: value) logppx: 7.094639165060861\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] #validation_score (5): 7.094639165060861\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] Timing: train: 0.96s, val: 0.11s, epoch: 1.06s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] #progress_metric: host=algo-2, completed 5.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194301.1598465, \"EndTime\": 1623194302.224869, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 4, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 19365.0, \"count\": 1, \"min\": 19365, \"max\": 19365}, \"Total Batches Seen\": {\"sum\": 155.0, \"count\": 1, \"min\": 155, \"max\": 155}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 10.0, \"count\": 1, \"min\": 10, \"max\": 10}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3635.967529855732 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:22 INFO 140403258652480] # Starting training for epoch 6\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:23.167] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 17, \"duration\": 940, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] # Finished training epoch 6 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 
140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] Loss (name: value) total: 6.933675193017529\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] Loss (name: value) kld: 0.007857177000973494\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] Loss (name: value) recons: 6.925818047215862\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] Loss (name: value) logppx: 6.933675193017529\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] #quality_metric: host=algo-2, epoch=6, train total_loss =6.933675193017529\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:23.029] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 17, \"duration\": 964, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] # Finished training epoch 6 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Loss (name: value) total: 6.918802434398282\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Loss (name: value) kld: 0.00695909055009965\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Loss (name: value) recons: 6.911843342165793\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Loss (name: value) logppx: 6.918802434398282\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] #quality_metric: host=algo-1, epoch=6, train total_loss =6.918802434398282\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:23.116] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 17, \"duration\": 85, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Finished scoring on 896 
examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Loss (name: value) total: 7.1021351133074075\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Loss (name: value) kld: 0.007741069953356471\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Loss (name: value) recons: 7.0943940707615445\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Loss (name: value) logppx: 7.1021351133074075\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] #validation_score (6): 7.1021351133074075\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] patience losses:[7.136931283133371, 7.119487626211984, 7.116677624838693, 7.111966950552804, 7.098155226026263] min patience loss:7.098155226026263 current loss:7.1021351133074075 absolute loss difference:0.003979887281144734\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Timing: train: 0.97s, val: 0.09s, epoch: 1.05s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] #progress_metric: host=algo-1, completed 6.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194302.0644732, \"EndTime\": 1623194303.1175935, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 5, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 23208.0, \"count\": 1, \"min\": 23208, \"max\": 23208}, \"Total Batches Seen\": {\"sum\": 186.0, \"count\": 1, \"min\": 186, \"max\": 186}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 12.0, \"count\": 1, \"min\": 12, \"max\": 12}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3672.328294612598 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] # Starting training for epoch 7\u001b[0m\n", - "\n", - "2021-06-08 23:18:11 Training - Training image download completed. 
Training in progress.\u001b[35m[2021-06-08 23:18:23.231] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 17, \"duration\": 63, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] Loss (name: value) total: 7.101483889988491\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] Loss (name: value) kld: 0.0077785625388579705\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] Loss (name: value) recons: 7.093705381665911\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] Loss (name: value) logppx: 7.101483889988491\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] #validation_score (6): 7.101483889988491\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] patience losses:[7.139745439801898, 7.119496617998395, 7.109448705400739, 7.105281829833984, 7.094639165060861] min patience loss:7.094639165060861 current loss:7.101483889988491 absolute loss difference:0.006844724927629997\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] Timing: train: 0.94s, val: 0.06s, epoch: 1.01s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] #progress_metric: host=algo-2, completed 6.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194302.2252653, \"EndTime\": 1623194303.2330728, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 5, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 23238.0, \"count\": 1, \"min\": 23238, \"max\": 23238}, \"Total Batches Seen\": {\"sum\": 186.0, \"count\": 1, \"min\": 186, \"max\": 186}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 12.0, \"count\": 1, \"min\": 12, \"max\": 12}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3842.388624220295 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:23 INFO 140403258652480] # Starting training for epoch 7\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:23.974] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 20, \"duration\": 856, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] # Finished training epoch 7 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 
139703431448384] Loss (name: value) total: 6.916796845774496\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Loss (name: value) kld: 0.008649216882223565\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Loss (name: value) recons: 6.908147592698374\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] Loss (name: value) logppx: 6.916796845774496\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:23 INFO 139703431448384] #quality_metric: host=algo-1, epoch=7, train total_loss =6.916796845774496\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:24.043] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 20, \"duration\": 67, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] Loss (name: value) total: 7.092641285487583\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] Loss (name: value) kld: 0.007715762925467321\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] Loss (name: value) recons: 7.084925447191511\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] Loss (name: value) logppx: 7.092641285487583\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] #validation_score (7): 7.092641285487583\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] patience losses:[7.119487626211984, 7.116677624838693, 7.111966950552804, 7.098155226026263, 7.1021351133074075] min patience loss:7.098155226026263 current loss:7.092641285487583 absolute loss difference:0.005513940538679485\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] Timing: train: 0.86s, val: 0.07s, epoch: 0.93s\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] #progress_metric: host=algo-1, completed 7.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194303.1178982, \"EndTime\": 1623194304.0474145, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 6, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 27076.0, \"count\": 1, \"min\": 27076, \"max\": 27076}, \"Total Batches Seen\": {\"sum\": 217.0, \"count\": 1, \"min\": 217, \"max\": 217}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 14.0, \"count\": 1, \"min\": 14, \"max\": 14}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=4160.689330512278 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] # Starting training for epoch 8\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:24.933] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 23, \"duration\": 885, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] # Finished training epoch 8 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] Loss (name: value) total: 6.908025887704665\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] Loss (name: 
value) kld: 0.010726808224834742\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] Loss (name: value) recons: 6.897299074357556\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] Loss (name: value) logppx: 6.908025887704665\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:24 INFO 139703431448384] #quality_metric: host=algo-1, epoch=8, train total_loss =6.908025887704665\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:24.194] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 20, \"duration\": 959, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] # Finished training epoch 7 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] Loss (name: value) total: 6.928057343729081\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] Loss (name: value) kld: 0.009298830878950896\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] Loss (name: value) recons: 6.918758549997883\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] Loss (name: value) logppx: 6.928057343729081\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] #quality_metric: host=algo-2, epoch=7, train total_loss =6.928057343729081\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:24.285] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 20, \"duration\": 90, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] Loss (name: value) total: 
7.101057393210275\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] Loss (name: value) kld: 0.011410580415810858\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] Loss (name: value) recons: 7.089646748134068\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] Loss (name: value) logppx: 7.101057393210275\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] #validation_score (7): 7.101057393210275\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] patience losses:[7.119496617998395, 7.109448705400739, 7.105281829833984, 7.094639165060861, 7.101483889988491] min patience loss:7.094639165060861 current loss:7.101057393210275 absolute loss difference:0.0064182281494140625\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] Bad epoch: loss has not improved (enough). Bad count:2\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] Timing: train: 0.96s, val: 0.09s, epoch: 1.05s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] #progress_metric: host=algo-2, completed 7.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194303.2335095, \"EndTime\": 1623194304.2873878, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 6, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 27111.0, \"count\": 1, \"min\": 27111, \"max\": 27111}, \"Total Batches Seen\": {\"sum\": 217.0, \"count\": 1, \"min\": 217, \"max\": 217}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 14.0, \"count\": 1, \"min\": 14, \"max\": 14}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, 
\"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3674.429608531523 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:24 INFO 140403258652480] # Starting training for epoch 8\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:25.009] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 23, \"duration\": 74, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] Loss (name: value) total: 7.107043198176792\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] Loss (name: value) kld: 0.01612756241645132\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] Loss (name: value) recons: 7.090915748051235\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] Loss (name: value) logppx: 7.107043198176792\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] #validation_score (8): 7.107043198176792\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] patience losses:[7.116677624838693, 7.111966950552804, 7.098155226026263, 7.1021351133074075, 7.092641285487583] min patience loss:7.092641285487583 current loss:7.107043198176792 absolute loss difference:0.014401912689208984\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] Timing: train: 0.89s, val: 0.08s, epoch: 0.96s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] #progress_metric: host=algo-1, completed 8.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194304.047655, \"EndTime\": 1623194305.011412, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 7, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 30944.0, \"count\": 1, \"min\": 30944, \"max\": 30944}, \"Total Batches Seen\": {\"sum\": 248.0, \"count\": 1, \"min\": 248, \"max\": 248}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 16.0, \"count\": 1, \"min\": 16, \"max\": 16}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=4012.8560928324187 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] # Starting training for epoch 9\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:25.944] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 26, \"duration\": 932, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] # Finished training epoch 9 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 
139703431448384] Loss (name: value) total: 6.897835677669894\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] Loss (name: value) kld: 0.014341895273255726\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] Loss (name: value) recons: 6.8834937849352436\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] Loss (name: value) logppx: 6.897835677669894\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:25 INFO 139703431448384] #quality_metric: host=algo-1, epoch=9, train total_loss =6.897835677669894\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:25.215] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 23, \"duration\": 927, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] # Finished training epoch 8 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] Loss (name: value) total: 6.921251077805796\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] Loss (name: value) kld: 0.01125404444493113\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] Loss (name: value) recons: 6.909997101752989\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] Loss (name: value) logppx: 6.921251077805796\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] #quality_metric: host=algo-2, epoch=8, train total_loss =6.921251077805796\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:25.297] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 23, \"duration\": 81, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 
140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] Loss (name: value) total: 7.077770573752267\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] Loss (name: value) kld: 0.009851935452648572\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] Loss (name: value) recons: 7.067918709346226\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] Loss (name: value) logppx: 7.077770573752267\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] #validation_score (8): 7.077770573752267\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] patience losses:[7.109448705400739, 7.105281829833984, 7.094639165060861, 7.101483889988491, 7.101057393210275] min patience loss:7.094639165060861 current loss:7.077770573752267 absolute loss difference:0.01686859130859375\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] Timing: train: 0.93s, val: 0.09s, epoch: 1.02s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] #progress_metric: host=algo-2, completed 8.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194304.2876914, \"EndTime\": 1623194305.3039777, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 7, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 30984.0, \"count\": 1, \"min\": 30984, \"max\": 30984}, \"Total Batches Seen\": {\"sum\": 248.0, \"count\": 1, \"min\": 248, \"max\": 248}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 16.0, \"count\": 1, \"min\": 16, \"max\": 16}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 
31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3809.479668714784 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:25 INFO 140403258652480] # Starting training for epoch 9\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:26.019] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 26, \"duration\": 74, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] Loss (name: value) total: 7.067824908665249\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] Loss (name: value) kld: 0.01927896429385458\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] Loss (name: value) recons: 7.048545973641532\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] Loss (name: value) logppx: 7.067824908665249\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] #validation_score (9): 7.067824908665249\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] patience losses:[7.111966950552804, 7.098155226026263, 7.1021351133074075, 7.092641285487583, 7.107043198176792] min patience loss:7.092641285487583 current loss:7.067824908665249 absolute loss difference:0.024816376822334618\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] Timing: train: 0.93s, val: 0.08s, epoch: 1.01s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] #progress_metric: host=algo-1, completed 9.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194305.011714, 
\"EndTime\": 1623194306.0244596, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 8, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 34812.0, \"count\": 1, \"min\": 34812, \"max\": 34812}, \"Total Batches Seen\": {\"sum\": 279.0, \"count\": 1, \"min\": 279, \"max\": 279}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 18.0, \"count\": 1, \"min\": 18, \"max\": 18}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3818.821442274331 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] # Starting training for epoch 10\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:26.943] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 29, \"duration\": 918, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] # Finished training epoch 10 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] Loss (name: value) total: 6.866616229857168\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] Loss (name: value) kld: 0.02988805096115797\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] Loss (name: value) recons: 6.836728161381137\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:18:26 INFO 139703431448384] Loss (name: value) logppx: 6.866616229857168\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:26 INFO 139703431448384] #quality_metric: host=algo-1, epoch=10, train total_loss =6.866616229857168\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:26.254] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 26, \"duration\": 948, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] # Finished training epoch 9 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] Loss (name: value) total: 6.904637463631168\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] Loss (name: value) kld: 0.018383676441566597\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] Loss (name: value) recons: 6.886253783779759\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] Loss (name: value) logppx: 6.904637463631168\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] #quality_metric: host=algo-2, epoch=9, train total_loss =6.904637463631168\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:26.326] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 26, \"duration\": 70, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] Loss (name: value) total: 7.051619393484933\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] Loss (name: value) kld: 0.025550063433391706\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 
140403258652480] Loss (name: value) recons: 7.026069368634905\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] Loss (name: value) logppx: 7.051619393484933\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] #validation_score (9): 7.051619393484933\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] patience losses:[7.105281829833984, 7.094639165060861, 7.101483889988491, 7.101057393210275, 7.077770573752267] min patience loss:7.077770573752267 current loss:7.051619393484933 absolute loss difference:0.026151180267333984\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] Timing: train: 0.95s, val: 0.08s, epoch: 1.03s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] #progress_metric: host=algo-2, completed 9.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194305.3045847, \"EndTime\": 1623194306.3326316, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 8, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 34857.0, \"count\": 1, \"min\": 34857, \"max\": 34857}, \"Total Batches Seen\": {\"sum\": 279.0, \"count\": 1, \"min\": 279, \"max\": 279}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 18.0, \"count\": 1, \"min\": 18, \"max\": 18}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3765.9284260893855 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:26 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 
23:18:26 INFO 140403258652480] # Starting training for epoch 10\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:27.014] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 29, \"duration\": 69, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:27 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:27 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:27 INFO 139703431448384] Loss (name: value) total: 7.012120178767613\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:27 INFO 139703431448384] Loss (name: value) kld: 0.03921731774296079\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:27 INFO 139703431448384] Loss (name: value) recons: 6.972902774810791\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:27 INFO 139703431448384] Loss (name: value) logppx: 7.012120178767613\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:27 INFO 139703431448384] #validation_score (10): 7.012120178767613\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:27 INFO 139703431448384] patience losses:[7.098155226026263, 7.1021351133074075, 7.092641285487583, 7.107043198176792, 7.067824908665249] min patience loss:7.067824908665249 current loss:7.012120178767613 absolute loss difference:0.055704729897636085\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:27 INFO 139703431448384] Timing: train: 0.92s, val: 0.07s, epoch: 0.99s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:27 INFO 139703431448384] #progress_metric: host=algo-1, completed 10.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194306.0247035, \"EndTime\": 1623194307.0191705, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 9, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 38680.0, \"count\": 1, \"min\": 38680, \"max\": 38680}, \"Total Batches Seen\": {\"sum\": 
310.0, \"count\": 1, \"min\": 310, \"max\": 310}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 20.0, \"count\": 1, \"min\": 20, \"max\": 20}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:27 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3889.006903746433 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:27 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:27 INFO 139703431448384] # Starting training for epoch 11\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:27.306] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 29, \"duration\": 972, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] # Finished training epoch 10 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] Loss (name: value) total: 6.860330039455045\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] Loss (name: value) kld: 0.03817620948557892\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] Loss (name: value) recons: 6.822153822068246\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] Loss (name: value) logppx: 6.860330039455045\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] #quality_metric: host=algo-2, epoch=10, train total_loss =6.860330039455045\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:27.413] [tensorio] [info] 
epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 29, \"duration\": 106, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] Loss (name: value) total: 7.009148188999721\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] Loss (name: value) kld: 0.03916592576674053\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] Loss (name: value) recons: 6.969982283455985\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] Loss (name: value) logppx: 7.009148188999721\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] #validation_score (10): 7.009148188999721\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] patience losses:[7.094639165060861, 7.101483889988491, 7.101057393210275, 7.077770573752267, 7.051619393484933] min patience loss:7.051619393484933 current loss:7.009148188999721 absolute loss difference:0.0424712044852118\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] Timing: train: 0.97s, val: 0.11s, epoch: 1.09s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] #progress_metric: host=algo-2, completed 10.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194306.3332198, \"EndTime\": 1623194307.4189389, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 9, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 38730.0, \"count\": 1, \"min\": 38730, \"max\": 38730}, \"Total Batches Seen\": {\"sum\": 310.0, \"count\": 1, \"min\": 310, \"max\": 310}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, 
\"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 20.0, \"count\": 1, \"min\": 20, \"max\": 20}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3566.7040638525095 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:27 INFO 140403258652480] # Starting training for epoch 11\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:27.999] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 32, \"duration\": 979, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:27 INFO 139703431448384] # Finished training epoch 11 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:27 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] Loss (name: value) total: 6.827744795430091\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] Loss (name: value) kld: 0.04285170258052887\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] Loss (name: value) recons: 6.78489310895243\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] Loss (name: value) logppx: 6.827744795430091\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] #quality_metric: host=algo-1, epoch=11, train total_loss =6.827744795430091\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:28.085] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 32, \"duration\": 84, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] Loss (name: value) total: 6.982710361480713\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] Loss (name: value) kld: 0.04274229811770575\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] Loss (name: value) recons: 6.939968109130859\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] Loss (name: value) logppx: 6.982710361480713\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] #validation_score (11): 6.982710361480713\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] patience losses:[7.1021351133074075, 7.092641285487583, 7.107043198176792, 7.067824908665249, 7.012120178767613] min patience loss:7.012120178767613 current loss:6.982710361480713 absolute loss difference:0.02940981728689973\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] Timing: train: 0.98s, val: 0.09s, epoch: 1.07s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] #progress_metric: host=algo-1, completed 11.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194307.019444, \"EndTime\": 1623194308.0911007, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 10, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 42548.0, \"count\": 1, \"min\": 42548, \"max\": 42548}, \"Total Batches Seen\": {\"sum\": 341.0, \"count\": 1, \"min\": 341, \"max\": 341}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 22.0, \"count\": 1, 
\"min\": 22, \"max\": 22}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3608.861403707412 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:28 INFO 139703431448384] # Starting training for epoch 12\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:28.313] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 32, \"duration\": 894, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] # Finished training epoch 11 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] Loss (name: value) total: 6.831307157393424\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] Loss (name: value) kld: 0.046623781113134276\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] Loss (name: value) recons: 6.784683385202961\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] Loss (name: value) logppx: 6.831307157393424\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] #quality_metric: host=algo-2, epoch=11, train total_loss =6.831307157393424\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:28.397] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 32, \"duration\": 82, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] Loss (name: value) total: 6.9772820472717285\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] Loss (name: value) kld: 0.04676898941397667\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] Loss (name: value) recons: 6.930512973240444\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] Loss (name: value) logppx: 6.9772820472717285\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] #validation_score (11): 6.9772820472717285\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] patience losses:[7.101483889988491, 7.101057393210275, 7.077770573752267, 7.051619393484933, 7.009148188999721] min patience loss:7.009148188999721 current loss:6.9772820472717285 absolute loss difference:0.03186614172799285\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] Timing: train: 0.90s, val: 0.09s, epoch: 0.98s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] #progress_metric: host=algo-2, completed 11.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194307.4191885, \"EndTime\": 1623194308.4029512, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 10, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 42603.0, \"count\": 1, \"min\": 42603, \"max\": 42603}, \"Total Batches Seen\": {\"sum\": 341.0, \"count\": 1, \"min\": 341, \"max\": 341}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 22.0, \"count\": 1, \"min\": 22, \"max\": 22}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number 
of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3935.5065904948706 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:28 INFO 140403258652480] # Starting training for epoch 12\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:29.007] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 35, \"duration\": 915, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] # Finished training epoch 12 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] Loss (name: value) total: 6.7968784224602485\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] Loss (name: value) kld: 0.04590961355115137\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] Loss (name: value) recons: 6.75096881774164\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] Loss (name: value) logppx: 6.7968784224602485\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] #quality_metric: host=algo-1, epoch=12, train total_loss =6.7968784224602485\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:29.079] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 35, \"duration\": 70, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] Loss (name: 
value) total: 6.960537365504673\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] Loss (name: value) kld: 0.05211935032691274\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] Loss (name: value) recons: 6.908417906079974\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] Loss (name: value) logppx: 6.960537365504673\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] #validation_score (12): 6.960537365504673\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] patience losses:[7.092641285487583, 7.107043198176792, 7.067824908665249, 7.012120178767613, 6.982710361480713] min patience loss:6.982710361480713 current loss:6.960537365504673 absolute loss difference:0.022172995976039722\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] Timing: train: 0.92s, val: 0.08s, epoch: 0.99s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] #progress_metric: host=algo-1, completed 12.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194308.0913901, \"EndTime\": 1623194309.084099, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 11, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 46416.0, \"count\": 1, \"min\": 46416, \"max\": 46416}, \"Total Batches Seen\": {\"sum\": 372.0, \"count\": 1, \"min\": 372, \"max\": 372}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 24.0, \"count\": 1, \"min\": 24, \"max\": 24}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 
139703431448384] #throughput_metric: host=algo-1, train throughput=3895.8831761034608 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:29 INFO 139703431448384] # Starting training for epoch 13\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:29.317] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 35, \"duration\": 913, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] # Finished training epoch 12 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] Loss (name: value) total: 6.808753951903312\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] Loss (name: value) kld: 0.05013171859806584\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] Loss (name: value) recons: 6.758622204103777\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] Loss (name: value) logppx: 6.808753951903312\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] #quality_metric: host=algo-2, epoch=12, train total_loss =6.808753951903312\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:29.397] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 35, \"duration\": 77, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] Loss (name: value) total: 6.9570103372846335\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] Loss (name: value) kld: 
0.04965064355305263\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] Loss (name: value) recons: 6.907359736306327\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] Loss (name: value) logppx: 6.9570103372846335\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] #validation_score (12): 6.9570103372846335\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] patience losses:[7.101057393210275, 7.077770573752267, 7.051619393484933, 7.009148188999721, 6.9772820472717285] min patience loss:6.9772820472717285 current loss:6.9570103372846335 absolute loss difference:0.020271709987095043\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] Timing: train: 0.92s, val: 0.09s, epoch: 1.00s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] #progress_metric: host=algo-2, completed 12.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194308.403702, \"EndTime\": 1623194309.4065828, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 11, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 46476.0, \"count\": 1, \"min\": 46476, \"max\": 46476}, \"Total Batches Seen\": {\"sum\": 372.0, \"count\": 1, \"min\": 372, \"max\": 372}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 24.0, \"count\": 1, \"min\": 24, \"max\": 24}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3860.6604048282793 records/second\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:29 INFO 140403258652480] # Starting training for epoch 13\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:30.004] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 38, \"duration\": 920, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] # Finished training epoch 13 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] Loss (name: value) total: 6.784047307506684\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] Loss (name: value) kld: 0.05167782204526086\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] Loss (name: value) recons: 6.732369480594512\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] Loss (name: value) logppx: 6.784047307506684\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] #quality_metric: host=algo-1, epoch=13, train total_loss =6.784047307506684\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:30.075] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 38, \"duration\": 69, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] Loss (name: value) total: 6.945607253483364\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] Loss (name: value) kld: 0.05221213772892952\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] Loss (name: value) recons: 
6.893395015171596\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] Loss (name: value) logppx: 6.945607253483364\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] #validation_score (13): 6.945607253483364\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] patience losses:[7.107043198176792, 7.067824908665249, 7.012120178767613, 6.982710361480713, 6.960537365504673] min patience loss:6.960537365504673 current loss:6.945607253483364 absolute loss difference:0.014930112021309228\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] Timing: train: 0.92s, val: 0.07s, epoch: 1.00s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] #progress_metric: host=algo-1, completed 13.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194309.0843346, \"EndTime\": 1623194310.0809343, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 12, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 50284.0, \"count\": 1, \"min\": 50284, \"max\": 50284}, \"Total Batches Seen\": {\"sum\": 403.0, \"count\": 1, \"min\": 403, \"max\": 403}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 26.0, \"count\": 1, \"min\": 26, \"max\": 26}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3880.729424338352 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:30 INFO 139703431448384] # Starting 
training for epoch 14\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:30.352] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 38, \"duration\": 944, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] # Finished training epoch 13 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] Loss (name: value) total: 6.790463997471717\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] Loss (name: value) kld: 0.05356326711273963\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] Loss (name: value) recons: 6.736900691063173\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] Loss (name: value) logppx: 6.790463997471717\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] #quality_metric: host=algo-2, epoch=13, train total_loss =6.790463997471717\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:30.431] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 38, \"duration\": 77, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] Loss (name: value) total: 6.949711663382394\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] Loss (name: value) kld: 0.06298925408295222\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] Loss (name: value) recons: 6.886722496577671\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] Loss (name: value) logppx: 6.949711663382394\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] #validation_score (13): 6.949711663382394\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] patience losses:[7.077770573752267, 7.051619393484933, 7.009148188999721, 6.9772820472717285, 6.9570103372846335] min patience loss:6.9570103372846335 current loss:6.949711663382394 absolute loss difference:0.007298673902239372\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] Timing: train: 0.95s, val: 0.08s, epoch: 1.03s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] #progress_metric: host=algo-2, completed 13.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194309.4069364, \"EndTime\": 1623194310.4379973, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 12, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 50349.0, \"count\": 1, \"min\": 50349, \"max\": 50349}, \"Total Batches Seen\": {\"sum\": 403.0, \"count\": 1, \"min\": 403, \"max\": 403}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 26.0, \"count\": 1, \"min\": 26, \"max\": 26}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3754.8299049650914 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:30 INFO 140403258652480] # Starting training for epoch 14\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:31.050] [tensorio] [info] epoch_stats={\"data_pipeline\": 
\"/opt/ml/input/data/train\", \"epoch\": 41, \"duration\": 969, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] # Finished training epoch 14 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] Loss (name: value) total: 6.768345082959821\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] Loss (name: value) kld: 0.05379751351692023\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] Loss (name: value) recons: 6.714547584133763\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] Loss (name: value) logppx: 6.768345082959821\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] #quality_metric: host=algo-1, epoch=14, train total_loss =6.768345082959821\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:31.121] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 41, \"duration\": 69, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] Loss (name: value) total: 6.93729373386928\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] Loss (name: value) kld: 0.05581598622458322\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] Loss (name: value) recons: 6.881477764674595\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] Loss (name: value) logppx: 6.93729373386928\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] #validation_score (14): 6.93729373386928\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:18:31 INFO 139703431448384] patience losses:[7.067824908665249, 7.012120178767613, 6.982710361480713, 6.960537365504673, 6.945607253483364] min patience loss:6.945607253483364 current loss:6.93729373386928 absolute loss difference:0.008313519614083553\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] Timing: train: 0.97s, val: 0.07s, epoch: 1.05s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] #progress_metric: host=algo-1, completed 14.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194310.081162, \"EndTime\": 1623194311.1264644, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 13, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 54152.0, \"count\": 1, \"min\": 54152, \"max\": 54152}, \"Total Batches Seen\": {\"sum\": 434.0, \"count\": 1, \"min\": 434, \"max\": 434}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 28.0, \"count\": 1, \"min\": 28, \"max\": 28}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3699.8321466599255 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:31 INFO 139703431448384] # Starting training for epoch 15\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:31.409] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 41, \"duration\": 969, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 
INFO 140403258652480] # Finished training epoch 14 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] Loss (name: value) total: 6.777322265409654\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] Loss (name: value) kld: 0.05655155463084098\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] Loss (name: value) recons: 6.720770716667175\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] Loss (name: value) logppx: 6.777322265409654\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] #quality_metric: host=algo-2, epoch=14, train total_loss =6.777322265409654\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:31.487] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 41, \"duration\": 76, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] Loss (name: value) total: 6.92958572932652\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] Loss (name: value) kld: 0.06050405012709754\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] Loss (name: value) recons: 6.869081701551165\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] Loss (name: value) logppx: 6.92958572932652\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] #validation_score (14): 6.92958572932652\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] patience losses:[7.051619393484933, 7.009148188999721, 6.9772820472717285, 6.9570103372846335, 6.949711663382394] min patience 
loss:6.949711663382394 current loss:6.92958572932652 absolute loss difference:0.020125934055873707\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] Timing: train: 0.97s, val: 0.08s, epoch: 1.05s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] #progress_metric: host=algo-2, completed 14.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194310.4386146, \"EndTime\": 1623194311.4932613, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 13, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 54222.0, \"count\": 1, \"min\": 54222, \"max\": 54222}, \"Total Batches Seen\": {\"sum\": 434.0, \"count\": 1, \"min\": 434, \"max\": 434}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 28.0, \"count\": 1, \"min\": 28, \"max\": 28}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3671.7420173793935 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:31 INFO 140403258652480] # Starting training for epoch 15\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:32.013] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 44, \"duration\": 886, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] # Finished training epoch 15 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 
139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] Loss (name: value) total: 6.756431210425593\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] Loss (name: value) kld: 0.05619427904246315\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] Loss (name: value) recons: 6.700236870396521\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] Loss (name: value) logppx: 6.756431210425593\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] #quality_metric: host=algo-1, epoch=15, train total_loss =6.756431210425593\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:32.094] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 44, \"duration\": 80, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] Loss (name: value) total: 6.931581974029541\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] Loss (name: value) kld: 0.06444319231169564\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] Loss (name: value) recons: 6.867138794490269\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] Loss (name: value) logppx: 6.931581974029541\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] #validation_score (15): 6.931581974029541\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] patience losses:[7.012120178767613, 6.982710361480713, 6.960537365504673, 6.945607253483364, 6.93729373386928] min patience loss:6.93729373386928 current loss:6.931581974029541 absolute loss difference:0.005711759839739372\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 
139703431448384] Timing: train: 0.89s, val: 0.09s, epoch: 0.97s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] #progress_metric: host=algo-1, completed 15.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194311.12675, \"EndTime\": 1623194312.1009548, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 14, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 58020.0, \"count\": 1, \"min\": 58020, \"max\": 58020}, \"Total Batches Seen\": {\"sum\": 465.0, \"count\": 1, \"min\": 465, \"max\": 465}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 30.0, \"count\": 1, \"min\": 30, \"max\": 30}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3969.844593964667 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:32 INFO 139703431448384] # Starting training for epoch 16\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:32.458] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 44, \"duration\": 959, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] # Finished training epoch 15 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] Loss (name: value) total: 
6.762902828954881\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] Loss (name: value) kld: 0.06072674670647229\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] Loss (name: value) recons: 6.702176090209715\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] Loss (name: value) logppx: 6.762902828954881\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] #quality_metric: host=algo-2, epoch=15, train total_loss =6.762902828954881\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:32.538] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 44, \"duration\": 78, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] Loss (name: value) total: 6.923871857779367\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] Loss (name: value) kld: 0.06473846627133233\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] Loss (name: value) recons: 6.859133379799979\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] Loss (name: value) logppx: 6.923871857779367\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] #validation_score (15): 6.923871857779367\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] patience losses:[7.009148188999721, 6.9772820472717285, 6.9570103372846335, 6.949711663382394, 6.92958572932652] min patience loss:6.92958572932652 current loss:6.923871857779367 absolute loss difference:0.005713871547153637\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] Timing: train: 0.97s, val: 0.08s, epoch: 1.05s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] 
#progress_metric: host=algo-2, completed 15.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194311.493514, \"EndTime\": 1623194312.544257, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 14, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 58095.0, \"count\": 1, \"min\": 58095, \"max\": 58095}, \"Total Batches Seen\": {\"sum\": 465.0, \"count\": 1, \"min\": 465, \"max\": 465}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 30.0, \"count\": 1, \"min\": 30, \"max\": 30}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3685.4618964161864 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:32 INFO 140403258652480] # Starting training for epoch 16\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:33.063] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 47, \"duration\": 962, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] # Finished training epoch 16 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] Loss (name: value) total: 6.7487992163627375\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] Loss (name: value) kld: 0.06103249519102035\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] Loss (name: value) recons: 6.687766698098952\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] Loss (name: value) logppx: 6.7487992163627375\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] #quality_metric: host=algo-1, epoch=16, train total_loss =6.7487992163627375\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:33.134] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 47, \"duration\": 68, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] Loss (name: value) total: 6.906581197466169\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] Loss (name: value) kld: 0.06776464677282742\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] Loss (name: value) recons: 6.8388165065220425\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] Loss (name: value) logppx: 6.906581197466169\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] #validation_score (16): 6.906581197466169\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] patience losses:[6.982710361480713, 6.960537365504673, 6.945607253483364, 6.93729373386928, 6.931581974029541] min patience loss:6.931581974029541 current loss:6.906581197466169 absolute loss difference:0.025000776563372185\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] Timing: train: 0.96s, val: 0.07s, epoch: 1.04s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] #progress_metric: host=algo-1, completed 16.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194312.1012487, \"EndTime\": 
1623194313.1397288, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 15, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 61888.0, \"count\": 1, \"min\": 61888, \"max\": 61888}, \"Total Batches Seen\": {\"sum\": 496.0, \"count\": 1, \"min\": 496, \"max\": 496}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 32.0, \"count\": 1, \"min\": 32, \"max\": 32}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3723.7373859741456 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:33 INFO 139703431448384] # Starting training for epoch 17\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:33.479] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 47, \"duration\": 934, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] # Finished training epoch 16 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] Loss (name: value) total: 6.74975081797569\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] Loss (name: value) kld: 0.06422678399230203\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] Loss (name: value) recons: 6.6855241137166175\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 
INFO 140403258652480] Loss (name: value) logppx: 6.74975081797569\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] #quality_metric: host=algo-2, epoch=16, train total_loss =6.74975081797569\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:33.545] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 47, \"duration\": 64, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] Loss (name: value) total: 6.904898847852435\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] Loss (name: value) kld: 0.06404854729771614\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] Loss (name: value) recons: 6.840850285121372\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] Loss (name: value) logppx: 6.904898847852435\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] #validation_score (16): 6.904898847852435\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] patience losses:[6.9772820472717285, 6.9570103372846335, 6.949711663382394, 6.92958572932652, 6.923871857779367] min patience loss:6.923871857779367 current loss:6.904898847852435 absolute loss difference:0.018973009926932072\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] Timing: train: 0.94s, val: 0.07s, epoch: 1.01s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] #progress_metric: host=algo-2, completed 16.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194312.5446615, \"EndTime\": 1623194313.5512521, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 15, \"Meta\": 
\"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 61968.0, \"count\": 1, \"min\": 61968, \"max\": 61968}, \"Total Batches Seen\": {\"sum\": 496.0, \"count\": 1, \"min\": 496, \"max\": 496}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 32.0, \"count\": 1, \"min\": 32, \"max\": 32}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3847.1250520424564 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:33 INFO 140403258652480] # Starting training for epoch 17\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:34.066] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 50, \"duration\": 925, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] # Finished training epoch 17 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] Loss (name: value) total: 6.7264841295057725\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] Loss (name: value) kld: 0.06508037003297959\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] Loss (name: value) recons: 6.661403782906071\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] Loss (name: value) logppx: 6.7264841295057725\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] 
#quality_metric: host=algo-1, epoch=17, train total_loss =6.7264841295057725\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:34.157] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 50, \"duration\": 87, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] Loss (name: value) total: 6.898202623639788\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] Loss (name: value) kld: 0.06815796505127635\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] Loss (name: value) recons: 6.830044542040143\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] Loss (name: value) logppx: 6.898202623639788\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] #validation_score (17): 6.898202623639788\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] patience losses:[6.960537365504673, 6.945607253483364, 6.93729373386928, 6.931581974029541, 6.906581197466169] min patience loss:6.906581197466169 current loss:6.898202623639788 absolute loss difference:0.00837857382638063\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] Timing: train: 0.93s, val: 0.10s, epoch: 1.02s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] #progress_metric: host=algo-1, completed 17.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194313.1402123, \"EndTime\": 1623194314.1630564, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 16, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 65756.0, \"count\": 1, \"min\": 65756, \"max\": 65756}, \"Total Batches Seen\": 
{\"sum\": 527.0, \"count\": 1, \"min\": 527, \"max\": 527}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 34.0, \"count\": 1, \"min\": 34, \"max\": 34}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3781.1410269856297 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:34 INFO 139703431448384] # Starting training for epoch 18\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:34.485] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 50, \"duration\": 933, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] # Finished training epoch 17 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] Loss (name: value) total: 6.73356645337997\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] Loss (name: value) kld: 0.06828877852568703\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] Loss (name: value) recons: 6.665277673352149\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] Loss (name: value) logppx: 6.73356645337997\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] #quality_metric: host=algo-2, epoch=17, train total_loss =6.73356645337997\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:34.579] [tensorio] [info] 
epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 50, \"duration\": 93, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] Loss (name: value) total: 6.883122852870396\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] Loss (name: value) kld: 0.06898650154471397\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] Loss (name: value) recons: 6.814136300768171\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] Loss (name: value) logppx: 6.883122852870396\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] #validation_score (17): 6.883122852870396\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] patience losses:[6.9570103372846335, 6.949711663382394, 6.92958572932652, 6.923871857779367, 6.904898847852435] min patience loss:6.904898847852435 current loss:6.883122852870396 absolute loss difference:0.02177599498203886\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] Timing: train: 0.93s, val: 0.10s, epoch: 1.03s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] #progress_metric: host=algo-2, completed 17.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194313.551498, \"EndTime\": 1623194314.5854766, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 16, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 65841.0, \"count\": 1, \"min\": 65841, \"max\": 65841}, \"Total Batches Seen\": {\"sum\": 527.0, \"count\": 1, \"min\": 527, \"max\": 527}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, 
\"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 34.0, \"count\": 1, \"min\": 34, \"max\": 34}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3745.138764726225 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:34 INFO 140403258652480] # Starting training for epoch 18\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:35.099] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 53, \"duration\": 935, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] # Finished training epoch 18 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] Loss (name: value) total: 6.713530228983972\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] Loss (name: value) kld: 0.06813806379514356\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] Loss (name: value) recons: 6.645392121807221\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] Loss (name: value) logppx: 6.713530228983972\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] #quality_metric: host=algo-1, epoch=18, train total_loss =6.713530228983972\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:35.183] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 53, \"duration\": 82, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] Loss (name: value) total: 6.8816390718732565\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] Loss (name: value) kld: 0.07851824377264295\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] Loss (name: value) recons: 6.803120681217739\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] Loss (name: value) logppx: 6.8816390718732565\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] #validation_score (18): 6.8816390718732565\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] patience losses:[6.945607253483364, 6.93729373386928, 6.931581974029541, 6.906581197466169, 6.898202623639788] min patience loss:6.898202623639788 current loss:6.8816390718732565 absolute loss difference:0.01656355176653168\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] Timing: train: 0.94s, val: 0.09s, epoch: 1.03s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] #progress_metric: host=algo-1, completed 18.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194314.1633422, \"EndTime\": 1623194315.1897638, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 17, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 69624.0, \"count\": 1, \"min\": 69624, \"max\": 69624}, \"Total Batches Seen\": {\"sum\": 558.0, \"count\": 1, \"min\": 558, \"max\": 558}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 36.0, \"count\": 1, 
\"min\": 36, \"max\": 36}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3767.8799355745973 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:35 INFO 139703431448384] # Starting training for epoch 19\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:35.505] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 53, \"duration\": 917, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] # Finished training epoch 18 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] Loss (name: value) total: 6.71918063394485\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] Loss (name: value) kld: 0.07071425573479745\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] Loss (name: value) recons: 6.64846638325722\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] Loss (name: value) logppx: 6.71918063394485\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] #quality_metric: host=algo-2, epoch=18, train total_loss =6.71918063394485\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:35.576] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 53, \"duration\": 68, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:18:35 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] Loss (name: value) total: 6.869148254394531\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] Loss (name: value) kld: 0.06843102563704763\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] Loss (name: value) recons: 6.800717285701206\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] Loss (name: value) logppx: 6.869148254394531\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] #validation_score (18): 6.869148254394531\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] patience losses:[6.949711663382394, 6.92958572932652, 6.923871857779367, 6.904898847852435, 6.883122852870396] min patience loss:6.883122852870396 current loss:6.869148254394531 absolute loss difference:0.013974598475864575\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] Timing: train: 0.92s, val: 0.07s, epoch: 0.99s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] #progress_metric: host=algo-2, completed 18.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194314.5858042, \"EndTime\": 1623194315.581066, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 17, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 69714.0, \"count\": 1, \"min\": 69714, \"max\": 69714}, \"Total Batches Seen\": {\"sum\": 558.0, \"count\": 1, \"min\": 558, \"max\": 558}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 36.0, \"count\": 1, \"min\": 36, \"max\": 36}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last 
Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3890.8638374609914 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:35 INFO 140403258652480] # Starting training for epoch 19\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:36.121] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 56, \"duration\": 931, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] # Finished training epoch 19 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] Loss (name: value) total: 6.700147301919999\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] Loss (name: value) kld: 0.07084768109263913\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] Loss (name: value) recons: 6.6292996521919\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] Loss (name: value) logppx: 6.700147301919999\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] #quality_metric: host=algo-1, epoch=19, train total_loss =6.700147301919999\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:36.199] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 56, \"duration\": 76, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] Loss (name: value) total: 
6.861605439867292\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] Loss (name: value) kld: 0.0726701415010861\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] Loss (name: value) recons: 6.788935320717948\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] Loss (name: value) logppx: 6.861605439867292\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] #validation_score (19): 6.861605439867292\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] patience losses:[6.93729373386928, 6.931581974029541, 6.906581197466169, 6.898202623639788, 6.8816390718732565] min patience loss:6.8816390718732565 current loss:6.861605439867292 absolute loss difference:0.02003363200596464\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] Timing: train: 0.93s, val: 0.08s, epoch: 1.01s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] #progress_metric: host=algo-1, completed 19.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194315.1900723, \"EndTime\": 1623194316.204166, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 18, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 73492.0, \"count\": 1, \"min\": 73492, \"max\": 73492}, \"Total Batches Seen\": {\"sum\": 589.0, \"count\": 1, \"min\": 589, \"max\": 589}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 38.0, \"count\": 1, \"min\": 38, \"max\": 38}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] 
#throughput_metric: host=algo-1, train throughput=3813.6669139598407 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:36 INFO 139703431448384] # Starting training for epoch 20\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:36.555] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 56, \"duration\": 973, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] # Finished training epoch 19 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] Loss (name: value) total: 6.708913426245412\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] Loss (name: value) kld: 0.07473494022363616\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] Loss (name: value) recons: 6.634178511558041\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] Loss (name: value) logppx: 6.708913426245412\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] #quality_metric: host=algo-2, epoch=19, train total_loss =6.708913426245412\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:36.626] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 56, \"duration\": 69, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] Loss (name: value) total: 6.881233283451626\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] Loss (name: value) kld: 0.08027186244726181\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] Loss (name: value) recons: 6.800961426326206\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] Loss (name: value) logppx: 6.881233283451626\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] #validation_score (19): 6.881233283451626\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] patience losses:[6.92958572932652, 6.923871857779367, 6.904898847852435, 6.883122852870396, 6.869148254394531] min patience loss:6.869148254394531 current loss:6.881233283451626 absolute loss difference:0.01208502905709441\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] Bad epoch: loss has not improved (enough). Bad count:1\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] Timing: train: 0.97s, val: 0.07s, epoch: 1.05s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] #progress_metric: host=algo-2, completed 19.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194315.581353, \"EndTime\": 1623194316.6274457, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 18, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 73587.0, \"count\": 1, \"min\": 73587, \"max\": 73587}, \"Total Batches Seen\": {\"sum\": 589.0, \"count\": 1, \"min\": 589, \"max\": 589}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 38.0, \"count\": 1, \"min\": 38, \"max\": 38}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] #throughput_metric: host=algo-2, 
train throughput=3701.85669887567 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:36 INFO 140403258652480] # Starting training for epoch 20\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:37.098] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 59, \"duration\": 893, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] # Finished training epoch 20 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] Loss (name: value) total: 6.689366098373167\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] Loss (name: value) kld: 0.07546150498092175\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] Loss (name: value) recons: 6.61390463382967\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] Loss (name: value) logppx: 6.689366098373167\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] #quality_metric: host=algo-1, epoch=20, train total_loss =6.689366098373167\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:37.170] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 59, \"duration\": 70, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] Loss (name: value) total: 6.854649407523019\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] Loss (name: value) kld: 0.07737864979675838\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 
139703431448384] Loss (name: value) recons: 6.777270589556013\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] Loss (name: value) logppx: 6.854649407523019\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] #validation_score (20): 6.854649407523019\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] patience losses:[6.931581974029541, 6.906581197466169, 6.898202623639788, 6.8816390718732565, 6.861605439867292] min patience loss:6.861605439867292 current loss:6.854649407523019 absolute loss difference:0.006956032344272778\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] Timing: train: 0.89s, val: 0.08s, epoch: 0.97s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] #progress_metric: host=algo-1, completed 20.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194316.2044725, \"EndTime\": 1623194317.1766822, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 19, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 77360.0, \"count\": 1, \"min\": 77360, \"max\": 77360}, \"Total Batches Seen\": {\"sum\": 620.0, \"count\": 1, \"min\": 620, \"max\": 620}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 40.0, \"count\": 1, \"min\": 40, \"max\": 40}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3977.9090391956092 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] \u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:18:37 INFO 139703431448384] # Starting training for epoch 21\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:37.539] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 59, \"duration\": 911, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] # Finished training epoch 20 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] Loss (name: value) total: 6.694458761522847\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] Loss (name: value) kld: 0.07785314904345621\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] Loss (name: value) recons: 6.616605570239406\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] Loss (name: value) logppx: 6.694458761522847\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] #quality_metric: host=algo-2, epoch=20, train total_loss =6.694458761522847\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:37.666] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 59, \"duration\": 124, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] Loss (name: value) total: 6.862727505820138\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] Loss (name: value) kld: 0.0882593042084149\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] Loss (name: value) recons: 6.774468285696847\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] Loss 
(name: value) logppx: 6.862727505820138\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] #validation_score (20): 6.862727505820138\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] patience losses:[6.923871857779367, 6.904898847852435, 6.883122852870396, 6.869148254394531, 6.881233283451626] min patience loss:6.869148254394531 current loss:6.862727505820138 absolute loss difference:0.006420748574393009\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] Timing: train: 0.91s, val: 0.13s, epoch: 1.04s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] #progress_metric: host=algo-2, completed 20.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194316.6277177, \"EndTime\": 1623194317.6726937, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 19, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 77460.0, \"count\": 1, \"min\": 77460, \"max\": 77460}, \"Total Batches Seen\": {\"sum\": 620.0, \"count\": 1, \"min\": 620, \"max\": 620}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 40.0, \"count\": 1, \"min\": 40, \"max\": 40}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3705.4310664211366 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:37 INFO 140403258652480] # Starting training for epoch 21\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:38.123] [tensorio] [info] 
epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 62, \"duration\": 946, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] # Finished training epoch 21 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] Loss (name: value) total: 6.676068590533349\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] Loss (name: value) kld: 0.07864594267260644\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] Loss (name: value) recons: 6.59742263824709\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] Loss (name: value) logppx: 6.676068590533349\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] #quality_metric: host=algo-1, epoch=21, train total_loss =6.676068590533349\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:38.189] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 62, \"duration\": 64, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] Loss (name: value) total: 6.880430153438023\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] Loss (name: value) kld: 0.10037313401699066\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] Loss (name: value) recons: 6.78005702154977\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] Loss (name: value) logppx: 6.880430153438023\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] #validation_score (21): 6.880430153438023\u001b[0m\n", 
- "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] patience losses:[6.906581197466169, 6.898202623639788, 6.8816390718732565, 6.861605439867292, 6.854649407523019] min patience loss:6.854649407523019 current loss:6.880430153438023 absolute loss difference:0.025780745915003678\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] Bad epoch: loss has not improved (enough). Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] Timing: train: 0.95s, val: 0.07s, epoch: 1.01s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] #progress_metric: host=algo-1, completed 21.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194317.1769848, \"EndTime\": 1623194318.1921291, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 20, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 81228.0, \"count\": 1, \"min\": 81228, \"max\": 81228}, \"Total Batches Seen\": {\"sum\": 651.0, \"count\": 1, \"min\": 651, \"max\": 651}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 42.0, \"count\": 1, \"min\": 42, \"max\": 42}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3809.7247015471785 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:38 INFO 139703431448384] # Starting training for epoch 22\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:38.602] [tensorio] [info] epoch_stats={\"data_pipeline\": 
\"/opt/ml/input/data/train\", \"epoch\": 62, \"duration\": 928, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] # Finished training epoch 21 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] Loss (name: value) total: 6.691170004106337\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] Loss (name: value) kld: 0.08345616254354676\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] Loss (name: value) recons: 6.6077139146866335\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] Loss (name: value) logppx: 6.691170004106337\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] #quality_metric: host=algo-2, epoch=21, train total_loss =6.691170004106337\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:38.683] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 62, \"duration\": 79, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] Loss (name: value) total: 6.859888144901821\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] Loss (name: value) kld: 0.08680688589811325\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] Loss (name: value) recons: 6.77308109828404\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] Loss (name: value) logppx: 6.859888144901821\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] #validation_score (21): 6.859888144901821\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:18:38 INFO 140403258652480] patience losses:[6.904898847852435, 6.883122852870396, 6.869148254394531, 6.881233283451626, 6.862727505820138] min patience loss:6.862727505820138 current loss:6.859888144901821 absolute loss difference:0.0028393609183172686\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] Timing: train: 0.93s, val: 0.09s, epoch: 1.02s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] #progress_metric: host=algo-2, completed 21.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194317.673201, \"EndTime\": 1623194318.6890996, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 20, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 81333.0, \"count\": 1, \"min\": 81333, \"max\": 81333}, \"Total Batches Seen\": {\"sum\": 651.0, \"count\": 1, \"min\": 651, \"max\": 651}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 42.0, \"count\": 1, \"min\": 42, \"max\": 42}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3811.877153877255 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:38 INFO 140403258652480] # Starting training for epoch 22\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:39.571] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 65, \"duration\": 881, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 
INFO 140403258652480] # Finished training epoch 22 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] Loss (name: value) total: 6.6779345543153825\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] Loss (name: value) kld: 0.0850946291560127\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] Loss (name: value) recons: 6.592839902447116\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] Loss (name: value) logppx: 6.6779345543153825\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] #quality_metric: host=algo-2, epoch=22, train total_loss =6.6779345543153825\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:39.653] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 65, \"duration\": 80, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] Loss (name: value) total: 6.845040798187256\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] Loss (name: value) kld: 0.08850874858243125\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] Loss (name: value) recons: 6.756531987871442\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] Loss (name: value) logppx: 6.845040798187256\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] #validation_score (22): 6.845040798187256\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] patience losses:[6.883122852870396, 6.869148254394531, 6.881233283451626, 6.862727505820138, 6.859888144901821] min patience 
loss:6.859888144901821 current loss:6.845040798187256 absolute loss difference:0.014847346714565113\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] Timing: train: 0.88s, val: 0.09s, epoch: 0.97s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] #progress_metric: host=algo-2, completed 22.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194318.6893473, \"EndTime\": 1623194319.6583855, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 21, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 85206.0, \"count\": 1, \"min\": 85206, \"max\": 85206}, \"Total Batches Seen\": {\"sum\": 682.0, \"count\": 1, \"min\": 682, \"max\": 682}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 44.0, \"count\": 1, \"min\": 44, \"max\": 44}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3996.1917370335336 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:39 INFO 140403258652480] # Starting training for epoch 23\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:40.607] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 68, \"duration\": 948, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] # Finished training epoch 23 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 
140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] Loss (name: value) total: 6.659779464044878\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] Loss (name: value) kld: 0.08626832957229306\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] Loss (name: value) recons: 6.573511092893539\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] Loss (name: value) logppx: 6.659779464044878\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] #quality_metric: host=algo-2, epoch=23, train total_loss =6.659779464044878\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:40.700] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 68, \"duration\": 90, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] Loss (name: value) total: 6.821001393454416\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] Loss (name: value) kld: 0.08628573162215096\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] Loss (name: value) recons: 6.734715734209333\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] Loss (name: value) logppx: 6.821001393454416\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] #validation_score (23): 6.821001393454416\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] patience losses:[6.869148254394531, 6.881233283451626, 6.862727505820138, 6.859888144901821, 6.845040798187256] min patience loss:6.845040798187256 current loss:6.821001393454416 absolute loss difference:0.024039404732840275\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 
140403258652480] Timing: train: 0.95s, val: 0.10s, epoch: 1.05s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] #progress_metric: host=algo-2, completed 23.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194319.6586356, \"EndTime\": 1623194320.7053936, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 22, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 89079.0, \"count\": 1, \"min\": 89079, \"max\": 89079}, \"Total Batches Seen\": {\"sum\": 713.0, \"count\": 1, \"min\": 713, \"max\": 713}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 46.0, \"count\": 1, \"min\": 46, \"max\": 46}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3699.511316339163 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:40 INFO 140403258652480] # Starting training for epoch 24\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:41.620] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 71, \"duration\": 914, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] # Finished training epoch 24 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] Loss (name: value) total: 
6.656355638657847\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] Loss (name: value) kld: 0.08939000471465049\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] Loss (name: value) recons: 6.566965664586713\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] Loss (name: value) logppx: 6.656355638657847\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] #quality_metric: host=algo-2, epoch=24, train total_loss =6.656355638657847\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:41.717] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 71, \"duration\": 96, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] Loss (name: value) total: 6.8199052810668945\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] Loss (name: value) kld: 0.08641789640699114\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] Loss (name: value) recons: 6.733487401689802\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] Loss (name: value) logppx: 6.8199052810668945\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] #validation_score (24): 6.8199052810668945\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] patience losses:[6.881233283451626, 6.862727505820138, 6.859888144901821, 6.845040798187256, 6.821001393454416] min patience loss:6.821001393454416 current loss:6.8199052810668945 absolute loss difference:0.0010961123875210532\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] Timing: train: 0.92s, val: 0.10s, epoch: 1.02s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] 
#progress_metric: host=algo-2, completed 24.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194320.705646, \"EndTime\": 1623194321.722293, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 23, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 92952.0, \"count\": 1, \"min\": 92952, \"max\": 92952}, \"Total Batches Seen\": {\"sum\": 744.0, \"count\": 1, \"min\": 744, \"max\": 744}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 48.0, \"count\": 1, \"min\": 48, \"max\": 48}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3809.137544599477 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:41 INFO 140403258652480] # Starting training for epoch 25\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:42.647] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 74, \"duration\": 925, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] # Finished training epoch 25 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] Loss (name: value) total: 6.646712899208069\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] Loss (name: value) kld: 0.09210404513343688\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] Loss (name: value) recons: 6.554608864168967\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] Loss (name: value) logppx: 6.646712899208069\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] #quality_metric: host=algo-2, epoch=25, train total_loss =6.646712899208069\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:42.738] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 74, \"duration\": 88, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] Loss (name: value) total: 6.8057683535984586\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] Loss (name: value) kld: 0.09298491052218846\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] Loss (name: value) recons: 6.712783472878592\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] Loss (name: value) logppx: 6.8057683535984586\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] #validation_score (25): 6.8057683535984586\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] patience losses:[6.862727505820138, 6.859888144901821, 6.845040798187256, 6.821001393454416, 6.8199052810668945] min patience loss:6.8199052810668945 current loss:6.8057683535984586 absolute loss difference:0.014136927468435978\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] Timing: train: 0.93s, val: 0.09s, epoch: 1.02s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] #progress_metric: host=algo-2, completed 25.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194321.7225623, \"EndTime\": 
1623194322.744094, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 24, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 96825.0, \"count\": 1, \"min\": 96825, \"max\": 96825}, \"Total Batches Seen\": {\"sum\": 775.0, \"count\": 1, \"min\": 775, \"max\": 775}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 50.0, \"count\": 1, \"min\": 50, \"max\": 50}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3790.728004228374 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:42 INFO 140403258652480] # Starting training for epoch 26\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:39.076] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 65, \"duration\": 882, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] # Finished training epoch 22 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] Loss (name: value) total: 6.665810604249278\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] Loss (name: value) kld: 0.0821861119400109\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] Loss (name: value) recons: 6.583624524454916\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 
139703431448384] Loss (name: value) logppx: 6.665810604249278\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] #quality_metric: host=algo-1, epoch=22, train total_loss =6.665810604249278\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:39.148] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 65, \"duration\": 69, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] Loss (name: value) total: 6.834024769919259\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] Loss (name: value) kld: 0.08342199559722628\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] Loss (name: value) recons: 6.750602858407157\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] Loss (name: value) logppx: 6.834024769919259\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] #validation_score (22): 6.834024769919259\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] patience losses:[6.898202623639788, 6.8816390718732565, 6.861605439867292, 6.854649407523019, 6.880430153438023] min patience loss:6.854649407523019 current loss:6.834024769919259 absolute loss difference:0.020624637603759766\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] Timing: train: 0.89s, val: 0.07s, epoch: 0.96s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] #progress_metric: host=algo-1, completed 22.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194318.1924777, \"EndTime\": 1623194319.1527398, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 21, \"Meta\": 
\"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 85096.0, \"count\": 1, \"min\": 85096, \"max\": 85096}, \"Total Batches Seen\": {\"sum\": 682.0, \"count\": 1, \"min\": 682, \"max\": 682}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 44.0, \"count\": 1, \"min\": 44, \"max\": 44}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=4027.447115882302 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:39 INFO 139703431448384] # Starting training for epoch 23\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:40.013] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 68, \"duration\": 860, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] # Finished training epoch 23 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Loss (name: value) total: 6.659317255020142\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Loss (name: value) kld: 0.08528950359792478\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Loss (name: value) recons: 6.574027769027218\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Loss (name: value) logppx: 6.659317255020142\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] 
#quality_metric: host=algo-1, epoch=23, train total_loss =6.659317255020142\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:40.102] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 68, \"duration\": 87, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Loss (name: value) total: 6.8279876708984375\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Loss (name: value) kld: 0.08783190378120967\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Loss (name: value) recons: 6.740155628749302\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Loss (name: value) logppx: 6.8279876708984375\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] #validation_score (23): 6.8279876708984375\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] patience losses:[6.8816390718732565, 6.861605439867292, 6.854649407523019, 6.880430153438023, 6.834024769919259] min patience loss:6.834024769919259 current loss:6.8279876708984375 absolute loss difference:0.0060370990208218345\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Timing: train: 0.86s, val: 0.09s, epoch: 0.95s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] #progress_metric: host=algo-1, completed 23.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194319.153012, \"EndTime\": 1623194320.107997, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 22, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 88964.0, \"count\": 1, \"min\": 88964, \"max\": 88964}, \"Total Batches 
Seen\": {\"sum\": 713.0, \"count\": 1, \"min\": 713, \"max\": 713}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 46.0, \"count\": 1, \"min\": 46, \"max\": 46}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=4049.786914226803 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] # Starting training for epoch 24\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:40.987] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 71, \"duration\": 879, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] # Finished training epoch 24 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Loss (name: value) total: 6.6417797573151125\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Loss (name: value) kld: 0.08622417755184635\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Loss (name: value) recons: 6.555555520519134\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] Loss (name: value) logppx: 6.6417797573151125\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:40 INFO 139703431448384] #quality_metric: host=algo-1, epoch=24, train total_loss =6.6417797573151125\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:41.055] [tensorio] [info] 
epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 71, \"duration\": 66, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] Loss (name: value) total: 6.84266642161778\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] Loss (name: value) kld: 0.09828001260757446\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] Loss (name: value) recons: 6.7443864686148505\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] Loss (name: value) logppx: 6.84266642161778\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] #validation_score (24): 6.84266642161778\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] patience losses:[6.861605439867292, 6.854649407523019, 6.880430153438023, 6.834024769919259, 6.8279876708984375] min patience loss:6.8279876708984375 current loss:6.84266642161778 absolute loss difference:0.01467875071934266\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] Timing: train: 0.88s, val: 0.07s, epoch: 0.95s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] #progress_metric: host=algo-1, completed 24.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194320.1082702, \"EndTime\": 1623194321.0569303, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 23, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 92832.0, \"count\": 1, \"min\": 92832, \"max\": 92832}, \"Total Batches Seen\": {\"sum\": 744.0, \"count\": 1, \"min\": 744, \"max\": 744}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 48.0, \"count\": 1, \"min\": 48, \"max\": 48}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=4076.811104982441 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] # Starting training for epoch 25\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:41.979] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 74, \"duration\": 921, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] # Finished training epoch 25 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 
139703431448384] Loss (name: value) total: 6.638374851595971\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] Loss (name: value) kld: 0.08941735267158478\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] Loss (name: value) recons: 6.548957474770084\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] Loss (name: value) logppx: 6.638374851595971\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:41 INFO 139703431448384] #quality_metric: host=algo-1, epoch=25, train total_loss =6.638374851595971\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:42.066] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 74, \"duration\": 85, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] Loss (name: value) total: 6.8112970760890414\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] Loss (name: value) kld: 0.08955710913453784\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] Loss (name: value) recons: 6.7217399052211215\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] Loss (name: value) logppx: 6.8112970760890414\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] #validation_score (25): 6.8112970760890414\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] patience losses:[6.854649407523019, 6.880430153438023, 6.834024769919259, 6.8279876708984375, 6.84266642161778] min patience loss:6.8279876708984375 current loss:6.8112970760890414 absolute loss difference:0.016690594809396053\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] Timing: train: 0.92s, val: 0.09s, epoch: 1.01s\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] #progress_metric: host=algo-1, completed 25.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194321.0572002, \"EndTime\": 1623194322.071541, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 24, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 96700.0, \"count\": 1, \"min\": 96700, \"max\": 96700}, \"Total Batches Seen\": {\"sum\": 775.0, \"count\": 1, \"min\": 775, \"max\": 775}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 50.0, \"count\": 1, \"min\": 50, \"max\": 50}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3812.8279960780224 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] # Starting training for epoch 26\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:42.996] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 77, \"duration\": 924, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] # Finished training epoch 26 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] Loss (name: value) total: 6.627557700680148\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] Loss (name: 
value) kld: 0.09210854029703525\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] Loss (name: value) recons: 6.535449158760809\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] Loss (name: value) logppx: 6.627557700680148\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:42 INFO 139703431448384] #quality_metric: host=algo-1, epoch=26, train total_loss =6.627557700680148\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:43.061] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 77, \"duration\": 63, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] Loss (name: value) total: 6.818722111838205\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] Loss (name: value) kld: 0.10457579791545868\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] Loss (name: value) recons: 6.714146341596331\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] Loss (name: value) logppx: 6.818722111838205\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] #validation_score (26): 6.818722111838205\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] patience losses:[6.880430153438023, 6.834024769919259, 6.8279876708984375, 6.84266642161778, 6.8112970760890414] min patience loss:6.8112970760890414 current loss:6.818722111838205 absolute loss difference:0.0074250357491632\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] Timing: train: 0.93s, val: 0.06s, epoch: 0.99s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] #progress_metric: host=algo-1, completed 26.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194322.071815, \"EndTime\": 1623194323.0623777, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 25, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 100568.0, \"count\": 1, \"min\": 100568, \"max\": 100568}, \"Total Batches Seen\": {\"sum\": 806.0, \"count\": 1, \"min\": 806, \"max\": 806}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 52.0, \"count\": 1, \"min\": 52, \"max\": 52}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3904.331669102928 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] # Starting training for epoch 27\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:43.998] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 80, \"duration\": 935, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] # Finished training epoch 27 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 
INFO 139703431448384] Loss (name: value) total: 6.614984527710946\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] Loss (name: value) kld: 0.09493106976151466\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] Loss (name: value) recons: 6.5200534059155375\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] Loss (name: value) logppx: 6.614984527710946\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:43 INFO 139703431448384] #quality_metric: host=algo-1, epoch=27, train total_loss =6.614984527710946\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:43.699] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 77, \"duration\": 948, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] # Finished training epoch 26 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] Loss (name: value) total: 6.63441482282454\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] Loss (name: value) kld: 0.09381848026908213\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] Loss (name: value) recons: 6.540596377465032\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] Loss (name: value) logppx: 6.63441482282454\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] #quality_metric: host=algo-2, epoch=26, train total_loss =6.63441482282454\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:43.768] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 77, \"duration\": 68, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 
INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] Loss (name: value) total: 6.793333053588867\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] Loss (name: value) kld: 0.09652209707668849\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] Loss (name: value) recons: 6.696810926709857\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] Loss (name: value) logppx: 6.793333053588867\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] #validation_score (26): 6.793333053588867\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] patience losses:[6.859888144901821, 6.845040798187256, 6.821001393454416, 6.8199052810668945, 6.8057683535984586] min patience loss:6.8057683535984586 current loss:6.793333053588867 absolute loss difference:0.012435300009591366\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] Timing: train: 0.96s, val: 0.07s, epoch: 1.03s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] #progress_metric: host=algo-2, completed 26.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194322.7444117, \"EndTime\": 1623194323.7742157, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 25, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 100698.0, \"count\": 1, \"min\": 100698, \"max\": 100698}, \"Total Batches Seen\": {\"sum\": 806.0, \"count\": 1, \"min\": 806, \"max\": 806}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 52.0, \"count\": 1, \"min\": 52, \"max\": 52}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last 
Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3760.334416982332 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:43 INFO 140403258652480] # Starting training for epoch 27\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:44.062] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 80, \"duration\": 62, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] Loss (name: value) total: 6.802752086094448\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] Loss (name: value) kld: 0.0908956772514752\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] Loss (name: value) recons: 6.711856365203857\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] Loss (name: value) logppx: 6.802752086094448\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] #validation_score (27): 6.802752086094448\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] patience losses:[6.834024769919259, 6.8279876708984375, 6.84266642161778, 6.8112970760890414, 6.818722111838205] min patience loss:6.8112970760890414 current loss:6.802752086094448 absolute loss difference:0.008544989994593521\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] Timing: train: 0.94s, val: 0.07s, epoch: 1.00s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] #progress_metric: host=algo-1, completed 27.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 
1623194323.0626183, \"EndTime\": 1623194324.067874, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 26, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 104436.0, \"count\": 1, \"min\": 104436, \"max\": 104436}, \"Total Batches Seen\": {\"sum\": 837.0, \"count\": 1, \"min\": 837, \"max\": 837}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 54.0, \"count\": 1, \"min\": 54, \"max\": 54}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3847.2626111239692 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] # Starting training for epoch 28\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:44.954] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 83, \"duration\": 886, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] # Finished training epoch 28 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] Loss (name: value) total: 6.614842334101277\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] Loss (name: value) kld: 0.09931317446452956\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] Loss (name: value) recons: 6.5155291941858104\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] Loss (name: value) logppx: 6.614842334101277\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:44 INFO 139703431448384] #quality_metric: host=algo-1, epoch=28, train total_loss =6.614842334101277\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:44.687] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 80, \"duration\": 912, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] # Finished training epoch 27 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] Loss (name: value) total: 6.636626032090956\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] Loss (name: value) kld: 0.09950101405622498\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] Loss (name: value) recons: 6.537124991416931\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] Loss (name: value) logppx: 6.636626032090956\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] #quality_metric: host=algo-2, epoch=27, train total_loss =6.636626032090956\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:44.776] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 80, \"duration\": 85, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] Loss (name: value) total: 6.79046276637486\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] Loss (name: value) kld: 0.09723434916564397\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] Loss (name: value) recons: 6.6932283129010886\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] Loss (name: value) logppx: 6.79046276637486\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] #validation_score (27): 6.79046276637486\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] patience losses:[6.845040798187256, 6.821001393454416, 6.8199052810668945, 6.8057683535984586, 6.793333053588867] min patience loss:6.793333053588867 current loss:6.79046276637486 absolute loss difference:0.00287028721400695\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] Timing: train: 0.91s, val: 0.09s, epoch: 1.01s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] #progress_metric: host=algo-2, completed 27.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194323.7745767, \"EndTime\": 1623194324.7804508, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 26, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 104571.0, \"count\": 1, \"min\": 104571, \"max\": 104571}, \"Total Batches Seen\": {\"sum\": 837.0, \"count\": 1, \"min\": 837, \"max\": 837}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 54.0, \"count\": 1, \"min\": 54, \"max\": 54}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3849.8329184385802 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] 
\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:44 INFO 140403258652480] # Starting training for epoch 28\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:45.053] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 83, \"duration\": 96, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] Loss (name: value) total: 6.793524197169712\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] Loss (name: value) kld: 0.10013756581715175\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] Loss (name: value) recons: 6.693386622837612\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] Loss (name: value) logppx: 6.793524197169712\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] #validation_score (28): 6.793524197169712\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] patience losses:[6.8279876708984375, 6.84266642161778, 6.8112970760890414, 6.818722111838205, 6.802752086094448] min patience loss:6.802752086094448 current loss:6.793524197169712 absolute loss difference:0.009227888924735694\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] Timing: train: 0.89s, val: 0.10s, epoch: 0.99s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] #progress_metric: host=algo-1, completed 28.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194324.0681894, \"EndTime\": 1623194325.057605, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 27, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 108304.0, \"count\": 1, \"min\": 108304, \"max\": 
108304}, \"Total Batches Seen\": {\"sum\": 868.0, \"count\": 1, \"min\": 868, \"max\": 868}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 56.0, \"count\": 1, \"min\": 56, \"max\": 56}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3908.7481221294665 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] # Starting training for epoch 29\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:45.972] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 86, \"duration\": 914, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] # Finished training epoch 29 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] Loss (name: value) total: 6.59899850045481\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] Loss (name: value) kld: 0.09868734065563448\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] Loss (name: value) recons: 6.50031118623672\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] Loss (name: value) logppx: 6.59899850045481\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:45 INFO 139703431448384] #quality_metric: host=algo-1, epoch=29, train total_loss =6.59899850045481\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:45.703] 
[tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 83, \"duration\": 922, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] # Finished training epoch 28 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] Loss (name: value) total: 6.6168160938447524\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] Loss (name: value) kld: 0.09838753313787522\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] Loss (name: value) recons: 6.518428598680804\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] Loss (name: value) logppx: 6.6168160938447524\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] #quality_metric: host=algo-2, epoch=28, train total_loss =6.6168160938447524\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:45.791] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 83, \"duration\": 86, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] Loss (name: value) total: 6.784268311091831\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] Loss (name: value) kld: 0.09896422496863774\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] Loss (name: value) recons: 6.685304233006069\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] Loss (name: value) logppx: 6.784268311091831\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] #validation_score (28): 
6.784268311091831\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] patience losses:[6.821001393454416, 6.8199052810668945, 6.8057683535984586, 6.793333053588867, 6.79046276637486] min patience loss:6.79046276637486 current loss:6.784268311091831 absolute loss difference:0.006194455283028866\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] Timing: train: 0.92s, val: 0.09s, epoch: 1.02s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] #progress_metric: host=algo-2, completed 28.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194324.7807548, \"EndTime\": 1623194325.7973247, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 27, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 108444.0, \"count\": 1, \"min\": 108444, \"max\": 108444}, \"Total Batches Seen\": {\"sum\": 868.0, \"count\": 1, \"min\": 868, \"max\": 868}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 56.0, \"count\": 1, \"min\": 56, \"max\": 56}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3809.361749546768 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:45 INFO 140403258652480] # Starting training for epoch 29\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:46.042] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 86, \"duration\": 68, \"num_examples\": 8, 
\"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] Loss (name: value) total: 6.791742052350726\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] Loss (name: value) kld: 0.10195284017494746\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] Loss (name: value) recons: 6.689789090837751\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] Loss (name: value) logppx: 6.791742052350726\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] #validation_score (29): 6.791742052350726\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] patience losses:[6.84266642161778, 6.8112970760890414, 6.818722111838205, 6.802752086094448, 6.793524197169712] min patience loss:6.793524197169712 current loss:6.791742052350726 absolute loss difference:0.0017821448189865308\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] Timing: train: 0.92s, val: 0.07s, epoch: 0.99s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] #progress_metric: host=algo-1, completed 29.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194325.0579338, \"EndTime\": 1623194326.0472052, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 28, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 112172.0, \"count\": 1, \"min\": 112172, \"max\": 112172}, \"Total Batches Seen\": {\"sum\": 899.0, \"count\": 1, \"min\": 899, \"max\": 899}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset 
Count\": {\"sum\": 58.0, \"count\": 1, \"min\": 58, \"max\": 58}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3909.381070949301 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] # Starting training for epoch 30\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:46.964] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 89, \"duration\": 916, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] # Finished training epoch 30 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] Loss (name: value) total: 6.598636688724641\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] Loss (name: value) kld: 0.1024318223879222\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] Loss (name: value) recons: 6.496204860748783\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] Loss (name: value) logppx: 6.598636688724641\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:46 INFO 139703431448384] #quality_metric: host=algo-1, epoch=30, train total_loss =6.598636688724641\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:46.713] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 86, \"duration\": 915, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] # Finished training epoch 29 on 3873 examples from 31 batches, each 
of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] Loss (name: value) total: 6.618988790819722\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] Loss (name: value) kld: 0.10295827873051167\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] Loss (name: value) recons: 6.516030557693973\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] Loss (name: value) logppx: 6.618988790819722\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] #quality_metric: host=algo-2, epoch=29, train total_loss =6.618988790819722\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:46.800] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 86, \"duration\": 85, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] Loss (name: value) total: 6.779223578316825\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] Loss (name: value) kld: 0.10176127403974533\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] Loss (name: value) recons: 6.677462237221854\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] Loss (name: value) logppx: 6.779223578316825\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] #validation_score (29): 6.779223578316825\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] patience losses:[6.8199052810668945, 6.8057683535984586, 6.793333053588867, 6.79046276637486, 6.784268311091831] min patience loss:6.784268311091831 current loss:6.779223578316825 absolute loss 
difference:0.005044732775006722\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] Timing: train: 0.92s, val: 0.09s, epoch: 1.01s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] #progress_metric: host=algo-2, completed 29.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194325.7975643, \"EndTime\": 1623194326.8060765, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 28, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 112317.0, \"count\": 1, \"min\": 112317, \"max\": 112317}, \"Total Batches Seen\": {\"sum\": 899.0, \"count\": 1, \"min\": 899, \"max\": 899}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 58.0, \"count\": 1, \"min\": 58, \"max\": 58}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3839.126784110173 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:46 INFO 140403258652480] # Starting training for epoch 30\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:47.028] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 89, \"duration\": 62, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:47 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:47 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:18:47 INFO 139703431448384] Loss (name: value) total: 6.782483509608677\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:47 INFO 139703431448384] Loss (name: value) kld: 0.10839060693979263\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:47 INFO 139703431448384] Loss (name: value) recons: 6.674092905861991\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:47 INFO 139703431448384] Loss (name: value) logppx: 6.782483509608677\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:47 INFO 139703431448384] #validation_score (30): 6.782483509608677\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:47 INFO 139703431448384] patience losses:[6.8112970760890414, 6.818722111838205, 6.802752086094448, 6.793524197169712, 6.791742052350726] min patience loss:6.791742052350726 current loss:6.782483509608677 absolute loss difference:0.009258542742048625\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:47 INFO 139703431448384] Timing: train: 0.92s, val: 0.07s, epoch: 0.99s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:47 INFO 139703431448384] #progress_metric: host=algo-1, completed 30.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194326.047535, \"EndTime\": 1623194327.0353074, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 29, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 116040.0, \"count\": 1, \"min\": 116040, \"max\": 116040}, \"Total Batches Seen\": {\"sum\": 930.0, \"count\": 1, \"min\": 930, \"max\": 930}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 60.0, \"count\": 1, \"min\": 60, \"max\": 60}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:18:47 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3914.838383619015 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:47 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:47 INFO 139703431448384] # Starting training for epoch 31\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:47.720] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 89, \"duration\": 913, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] # Finished training epoch 30 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] Loss (name: value) total: 6.602058433717297\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] Loss (name: value) kld: 0.10410269217625741\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] Loss (name: value) recons: 6.4979557721845564\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] Loss (name: value) logppx: 6.602058433717297\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] #quality_metric: host=algo-2, epoch=30, train total_loss =6.602058433717297\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:47.899] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 89, \"duration\": 178, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] Loss (name: value) total: 6.772921425955636\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] Loss (name: 
value) kld: 0.10418781638145447\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] Loss (name: value) recons: 6.668733596801758\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] Loss (name: value) logppx: 6.772921425955636\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] #validation_score (30): 6.772921425955636\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] patience losses:[6.8057683535984586, 6.793333053588867, 6.79046276637486, 6.784268311091831, 6.779223578316825] min patience loss:6.779223578316825 current loss:6.772921425955636 absolute loss difference:0.006302152361188362\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] Timing: train: 0.91s, val: 0.18s, epoch: 1.10s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] #progress_metric: host=algo-2, completed 30.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194326.8066654, \"EndTime\": 1623194327.9061403, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 29, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 116190.0, \"count\": 1, \"min\": 116190, \"max\": 116190}, \"Total Batches Seen\": {\"sum\": 930.0, \"count\": 1, \"min\": 930, \"max\": 930}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 60.0, \"count\": 1, \"min\": 60, \"max\": 60}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3522.087309914095 records/second\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:47 INFO 140403258652480] # Starting training for epoch 31\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:48.055] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 92, \"duration\": 1019, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] # Finished training epoch 31 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] Loss (name: value) total: 6.587060159252536\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] Loss (name: value) kld: 0.10416518590382991\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] Loss (name: value) recons: 6.482894958988313\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] Loss (name: value) logppx: 6.587060159252536\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] #quality_metric: host=algo-1, epoch=31, train total_loss =6.587060159252536\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:48.167] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 92, \"duration\": 107, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] Loss (name: value) total: 6.768149716513498\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] Loss (name: value) kld: 0.11289980156081063\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] Loss (name: value) recons: 
6.65524993624006\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] Loss (name: value) logppx: 6.768149716513498\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] #validation_score (31): 6.768149716513498\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] patience losses:[6.818722111838205, 6.802752086094448, 6.793524197169712, 6.791742052350726, 6.782483509608677] min patience loss:6.782483509608677 current loss:6.768149716513498 absolute loss difference:0.014333793095179459\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] Timing: train: 1.02s, val: 0.11s, epoch: 1.14s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] #progress_metric: host=algo-1, completed 31.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194327.0357637, \"EndTime\": 1623194328.173518, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 30, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 119908.0, \"count\": 1, \"min\": 119908, \"max\": 119908}, \"Total Batches Seen\": {\"sum\": 961.0, \"count\": 1, \"min\": 961, \"max\": 961}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 62.0, \"count\": 1, \"min\": 62, \"max\": 62}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3398.5439063599124 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:48 INFO 139703431448384] # Starting 
training for epoch 32\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:48.998] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 92, \"duration\": 1090, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:48 INFO 140403258652480] # Finished training epoch 31 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:48 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:48 INFO 140403258652480] Loss (name: value) total: 6.5946077608293105\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:48 INFO 140403258652480] Loss (name: value) kld: 0.10742388677693182\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:48 INFO 140403258652480] Loss (name: value) recons: 6.48718390926238\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:48 INFO 140403258652480] Loss (name: value) logppx: 6.5946077608293105\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:48 INFO 140403258652480] #quality_metric: host=algo-2, epoch=31, train total_loss =6.5946077608293105\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:49.092] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 92, \"duration\": 91, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:49 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:49 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:49 INFO 140403258652480] Loss (name: value) total: 6.761534009660993\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:49 INFO 140403258652480] Loss (name: value) kld: 0.11521699492420469\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:49 INFO 140403258652480] Loss (name: value) recons: 6.646316937037876\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:49 INFO 140403258652480] Loss (name: value) logppx: 6.761534009660993\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:18:49 INFO 140403258652480] #validation_score (31): 6.761534009660993\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:49 INFO 140403258652480] patience losses:[6.793333053588867, 6.79046276637486, 6.784268311091831, 6.779223578316825, 6.772921425955636] min patience loss:6.772921425955636 current loss:6.761534009660993 absolute loss difference:0.011387416294643238\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:49 INFO 140403258652480] Timing: train: 1.09s, val: 0.10s, epoch: 1.19s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:49 INFO 140403258652480] #progress_metric: host=algo-2, completed 31.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194327.906468, \"EndTime\": 1623194329.0968785, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 30, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 120063.0, \"count\": 1, \"min\": 120063, \"max\": 120063}, \"Total Batches Seen\": {\"sum\": 961.0, \"count\": 1, \"min\": 961, \"max\": 961}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 62.0, \"count\": 1, \"min\": 62, \"max\": 62}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:49 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3253.022301006313 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:49 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:49 INFO 140403258652480] # Starting training for epoch 32\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:50.010] [tensorio] [info] epoch_stats={\"data_pipeline\": 
\"/opt/ml/input/data/train\", \"epoch\": 95, \"duration\": 912, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] # Finished training epoch 32 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] Loss (name: value) total: 6.5838039267447686\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] Loss (name: value) kld: 0.10994393276351114\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] Loss (name: value) recons: 6.4738600100240395\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] Loss (name: value) logppx: 6.5838039267447686\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] #quality_metric: host=algo-2, epoch=32, train total_loss =6.5838039267447686\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:50.094] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 95, \"duration\": 83, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] Loss (name: value) total: 6.753398418426514\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] Loss (name: value) kld: 0.1077413484454155\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] Loss (name: value) recons: 6.645657130650112\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] Loss (name: value) logppx: 6.753398418426514\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] #validation_score (32): 6.753398418426514\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:18:50 INFO 140403258652480] patience losses:[6.79046276637486, 6.784268311091831, 6.779223578316825, 6.772921425955636, 6.761534009660993] min patience loss:6.761534009660993 current loss:6.753398418426514 absolute loss difference:0.008135591234479378\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] Timing: train: 0.91s, val: 0.09s, epoch: 1.00s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] #progress_metric: host=algo-2, completed 32.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194329.097209, \"EndTime\": 1623194330.101256, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 31, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 123936.0, \"count\": 1, \"min\": 123936, \"max\": 123936}, \"Total Batches Seen\": {\"sum\": 992.0, \"count\": 1, \"min\": 992, \"max\": 992}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 64.0, \"count\": 1, \"min\": 64, \"max\": 64}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3856.72275680519 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:50 INFO 140403258652480] # Starting training for epoch 33\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:51.048] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 98, \"duration\": 946, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 
INFO 140403258652480] # Finished training epoch 33 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] Loss (name: value) total: 6.590159473880645\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] Loss (name: value) kld: 0.11490659926447176\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] Loss (name: value) recons: 6.475252874435917\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] Loss (name: value) logppx: 6.590159473880645\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] #quality_metric: host=algo-2, epoch=33, train total_loss =6.590159473880645\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:51.140] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 98, \"duration\": 90, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] Loss (name: value) total: 6.754370212554932\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] Loss (name: value) kld: 0.1153566518000194\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] Loss (name: value) recons: 6.639013562883649\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] Loss (name: value) logppx: 6.754370212554932\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] #validation_score (33): 6.754370212554932\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] patience losses:[6.784268311091831, 6.779223578316825, 6.772921425955636, 6.761534009660993, 6.753398418426514] min patience 
loss:6.753398418426514 current loss:6.754370212554932 absolute loss difference:0.0009717941284179688\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] Bad epoch: loss has not improved (enough). Bad count:1\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] Timing: train: 0.95s, val: 0.09s, epoch: 1.04s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] #progress_metric: host=algo-2, completed 33.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194330.101636, \"EndTime\": 1623194331.1416678, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 32, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 127809.0, \"count\": 1, \"min\": 127809, \"max\": 127809}, \"Total Batches Seen\": {\"sum\": 1023.0, \"count\": 1, \"min\": 1023, \"max\": 1023}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 66.0, \"count\": 1, \"min\": 66, \"max\": 66}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3723.182502998456 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:51 INFO 140403258652480] # Starting training for epoch 34\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:52.037] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 101, \"duration\": 895, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] # 
Finished training epoch 34 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] Loss (name: value) total: 6.575856385692473\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] Loss (name: value) kld: 0.11619408200344732\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] Loss (name: value) recons: 6.459662279775066\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] Loss (name: value) logppx: 6.575856385692473\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] #quality_metric: host=algo-2, epoch=34, train total_loss =6.575856385692473\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:52.112] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 101, \"duration\": 72, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] Loss (name: value) total: 6.749467509133475\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] Loss (name: value) kld: 0.12622197504554475\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] Loss (name: value) recons: 6.623245511736188\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] Loss (name: value) logppx: 6.749467509133475\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] #validation_score (34): 6.749467509133475\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] patience losses:[6.779223578316825, 6.772921425955636, 6.761534009660993, 6.753398418426514, 6.754370212554932] min patience loss:6.753398418426514 
current loss:6.749467509133475 absolute loss difference:0.003930909293038631\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] Timing: train: 0.90s, val: 0.08s, epoch: 0.97s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] #progress_metric: host=algo-2, completed 34.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194331.1421874, \"EndTime\": 1623194332.1172612, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 33, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 131682.0, \"count\": 1, \"min\": 131682, \"max\": 131682}, \"Total Batches Seen\": {\"sum\": 1054.0, \"count\": 1, \"min\": 1054, \"max\": 1054}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 68.0, \"count\": 1, \"min\": 68, \"max\": 68}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3971.4835707448974 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:52 INFO 140403258652480] # Starting training for epoch 35\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:53.099] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 104, \"duration\": 981, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] # Finished training epoch 35 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] 
Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] Loss (name: value) total: 6.570859336083935\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] Loss (name: value) kld: 0.1189706643983241\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] Loss (name: value) recons: 6.4518886227761545\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] Loss (name: value) logppx: 6.570859336083935\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] #quality_metric: host=algo-2, epoch=35, train total_loss =6.570859336083935\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:53.165] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 104, \"duration\": 63, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] Loss (name: value) total: 6.766678537641253\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] Loss (name: value) kld: 0.11529988795518875\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] Loss (name: value) recons: 6.651378767830985\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] Loss (name: value) logppx: 6.766678537641253\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] #validation_score (35): 6.766678537641253\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] patience losses:[6.772921425955636, 6.761534009660993, 6.753398418426514, 6.754370212554932, 6.749467509133475] min patience loss:6.749467509133475 current loss:6.766678537641253 absolute loss difference:0.017211028507778003\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] Bad 
epoch: loss has not improved (enough). Bad count:1\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] Timing: train: 0.98s, val: 0.07s, epoch: 1.05s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] #progress_metric: host=algo-2, completed 35.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194332.1175148, \"EndTime\": 1623194333.1669867, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 34, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 135555.0, \"count\": 1, \"min\": 135555, \"max\": 135555}, \"Total Batches Seen\": {\"sum\": 1085.0, \"count\": 1, \"min\": 1085, \"max\": 1085}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 70.0, \"count\": 1, \"min\": 70, \"max\": 70}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3689.908805219308 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:53 INFO 140403258652480] # Starting training for epoch 36\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:49.161] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 95, \"duration\": 983, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] # Finished training epoch 32 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] Metrics for 
Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] Loss (name: value) total: 6.579069637483166\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] Loss (name: value) kld: 0.10942346674780692\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] Loss (name: value) recons: 6.469646188520616\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] Loss (name: value) logppx: 6.579069637483166\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] #quality_metric: host=algo-1, epoch=32, train total_loss =6.579069637483166\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:49.236] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 95, \"duration\": 72, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] Loss (name: value) total: 6.765038354056222\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] Loss (name: value) kld: 0.11022824261869703\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] Loss (name: value) recons: 6.6548100880214145\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] Loss (name: value) logppx: 6.765038354056222\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] #validation_score (32): 6.765038354056222\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] patience losses:[6.802752086094448, 6.793524197169712, 6.791742052350726, 6.782483509608677, 6.768149716513498] min patience loss:6.768149716513498 current loss:6.765038354056222 absolute loss difference:0.0031113624572753906\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] Timing: train: 
0.99s, val: 0.08s, epoch: 1.07s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] #progress_metric: host=algo-1, completed 32.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194328.1741936, \"EndTime\": 1623194329.2415597, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 31, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 123776.0, \"count\": 1, \"min\": 123776, \"max\": 123776}, \"Total Batches Seen\": {\"sum\": 992.0, \"count\": 1, \"min\": 992, \"max\": 992}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 64.0, \"count\": 1, \"min\": 64, \"max\": 64}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3623.4066763671613 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:49 INFO 139703431448384] # Starting training for epoch 33\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:50.117] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 98, \"duration\": 874, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] # Finished training epoch 33 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] Loss (name: value) total: 6.571246977775328\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] Loss (name: value) kld: 0.11036725204077459\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] Loss (name: value) recons: 6.460879710412795\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] Loss (name: value) logppx: 6.571246977775328\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] #quality_metric: host=algo-1, epoch=33, train total_loss =6.571246977775328\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:50.181] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 98, \"duration\": 62, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] Loss (name: value) total: 6.752309662955148\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] Loss (name: value) kld: 0.11210065760782786\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] Loss (name: value) recons: 6.640208925519671\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] Loss (name: value) logppx: 6.752309662955148\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] #validation_score (33): 6.752309662955148\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] patience losses:[6.793524197169712, 6.791742052350726, 6.782483509608677, 6.768149716513498, 6.765038354056222] min patience loss:6.765038354056222 current loss:6.752309662955148 absolute loss difference:0.012728691101074219\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] Timing: train: 0.88s, val: 0.07s, epoch: 0.94s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] #progress_metric: host=algo-1, completed 
33.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194329.2419796, \"EndTime\": 1623194330.1868086, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 32, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 127644.0, \"count\": 1, \"min\": 127644, \"max\": 127644}, \"Total Batches Seen\": {\"sum\": 1023.0, \"count\": 1, \"min\": 1023, \"max\": 1023}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 66.0, \"count\": 1, \"min\": 66, \"max\": 66}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=4093.336789617221 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:50 INFO 139703431448384] # Starting training for epoch 34\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:51.133] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 101, \"duration\": 946, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] # Finished training epoch 34 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] Loss (name: value) total: 6.572240337248771\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] Loss (name: value) kld: 0.11495615529917902\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 
139703431448384] Loss (name: value) recons: 6.457284154430512\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] Loss (name: value) logppx: 6.572240337248771\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] #quality_metric: host=algo-1, epoch=34, train total_loss =6.572240337248771\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:51.200] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 101, \"duration\": 65, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] Loss (name: value) total: 6.741094384874616\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] Loss (name: value) kld: 0.11860519647598267\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] Loss (name: value) recons: 6.622489043644497\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] Loss (name: value) logppx: 6.741094384874616\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] #validation_score (34): 6.741094384874616\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] patience losses:[6.791742052350726, 6.782483509608677, 6.768149716513498, 6.765038354056222, 6.752309662955148] min patience loss:6.752309662955148 current loss:6.741094384874616 absolute loss difference:0.01121527808053191\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] Timing: train: 0.95s, val: 0.07s, epoch: 1.02s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] #progress_metric: host=algo-1, completed 34.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194330.1870756, \"EndTime\": 1623194331.2051654, \"Dimensions\": 
{\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 33, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 131512.0, \"count\": 1, \"min\": 131512, \"max\": 131512}, \"Total Batches Seen\": {\"sum\": 1054.0, \"count\": 1, \"min\": 1054, \"max\": 1054}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 68.0, \"count\": 1, \"min\": 68, \"max\": 68}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3798.7854221472935 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:51 INFO 139703431448384] # Starting training for epoch 35\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:52.124] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 104, \"duration\": 918, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] # Finished training epoch 35 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] Loss (name: value) total: 6.558247773878036\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] Loss (name: value) kld: 0.11554081527696501\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] Loss (name: value) recons: 6.44270698870382\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] Loss (name: 
value) logppx: 6.558247773878036\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] #quality_metric: host=algo-1, epoch=35, train total_loss =6.558247773878036\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:52.185] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 104, \"duration\": 59, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] Loss (name: value) total: 6.740245342254639\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] Loss (name: value) kld: 0.1171108867440905\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] Loss (name: value) recons: 6.623134476797921\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] Loss (name: value) logppx: 6.740245342254639\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] #validation_score (35): 6.740245342254639\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] patience losses:[6.782483509608677, 6.768149716513498, 6.765038354056222, 6.752309662955148, 6.741094384874616] min patience loss:6.741094384874616 current loss:6.740245342254639 absolute loss difference:0.0008490426199774248\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] Timing: train: 0.92s, val: 0.07s, epoch: 0.99s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] #progress_metric: host=algo-1, completed 35.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194331.2054236, \"EndTime\": 1623194332.1928053, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 34, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 135380.0, \"count\": 1, \"min\": 135380, \"max\": 135380}, \"Total Batches Seen\": {\"sum\": 1085.0, \"count\": 1, \"min\": 1085, \"max\": 1085}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 70.0, \"count\": 1, \"min\": 70, \"max\": 70}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3916.825093070531 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:52 INFO 139703431448384] # Starting training for epoch 36\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:53.159] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 107, \"duration\": 965, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] # Finished training epoch 36 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:18:53 INFO 139703431448384] Loss (name: value) total: 6.554140636997838\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] Loss (name: value) kld: 0.11930101304765671\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] Loss (name: value) recons: 6.434839640894244\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] Loss (name: value) logppx: 6.554140636997838\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] #quality_metric: host=algo-1, epoch=36, train total_loss =6.554140636997838\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:53.227] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 107, \"duration\": 67, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] Loss (name: value) total: 6.735398360661098\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] Loss (name: value) kld: 0.11427459014313561\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] Loss (name: value) recons: 6.621123790740967\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] Loss (name: value) logppx: 6.735398360661098\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] #validation_score (36): 6.735398360661098\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] patience losses:[6.768149716513498, 6.765038354056222, 6.752309662955148, 6.741094384874616, 6.740245342254639] min patience loss:6.740245342254639 current loss:6.735398360661098 absolute loss difference:0.004846981593540356\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] Timing: train: 0.97s, val: 0.07s, epoch: 1.04s\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] #progress_metric: host=algo-1, completed 36.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194332.1930695, \"EndTime\": 1623194333.2319074, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 35, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 139248.0, \"count\": 1, \"min\": 139248, \"max\": 139248}, \"Total Batches Seen\": {\"sum\": 1116.0, \"count\": 1, \"min\": 1116, \"max\": 1116}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 72.0, \"count\": 1, \"min\": 72, \"max\": 72}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3722.9452512366547 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:53 INFO 139703431448384] # Starting training for epoch 37\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:54.080] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 107, \"duration\": 912, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] # Finished training epoch 36 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] Loss (name: value) total: 6.568975002534928\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] Loss 
(name: value) kld: 0.12304298783982953\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] Loss (name: value) recons: 6.445932003759569\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] Loss (name: value) logppx: 6.568975002534928\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] #quality_metric: host=algo-2, epoch=36, train total_loss =6.568975002534928\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:54.137] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 107, \"duration\": 56, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] Loss (name: value) total: 6.730280126844134\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] Loss (name: value) kld: 0.12150586609329496\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] Loss (name: value) recons: 6.6087742533002585\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] Loss (name: value) logppx: 6.730280126844134\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] #validation_score (36): 6.730280126844134\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] patience losses:[6.761534009660993, 6.753398418426514, 6.754370212554932, 6.749467509133475, 6.766678537641253] min patience loss:6.749467509133475 current loss:6.730280126844134 absolute loss difference:0.019187382289341137\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] Timing: train: 0.91s, val: 0.06s, epoch: 0.98s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] #progress_metric: host=algo-2, completed 36.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics 
{\"StartTime\": 1623194333.1672814, \"EndTime\": 1623194334.142825, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 35, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 139428.0, \"count\": 1, \"min\": 139428, \"max\": 139428}, \"Total Batches Seen\": {\"sum\": 1116.0, \"count\": 1, \"min\": 1116, \"max\": 1116}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 72.0, \"count\": 1, \"min\": 72, \"max\": 72}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3969.5678343725695 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:54 INFO 140403258652480] # Starting training for epoch 37\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:54.084] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 110, \"duration\": 851, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] # Finished training epoch 37 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] Loss (name: value) total: 6.552084976626981\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] Loss (name: value) kld: 0.12133233657767696\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] Loss (name: value) recons: 
6.430752623465754\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] Loss (name: value) logppx: 6.552084976626981\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] #quality_metric: host=algo-1, epoch=37, train total_loss =6.552084976626981\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:54.147] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 110, \"duration\": 62, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] Loss (name: value) total: 6.733829634530204\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] Loss (name: value) kld: 0.12001204171351024\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] Loss (name: value) recons: 6.6138176918029785\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] Loss (name: value) logppx: 6.733829634530204\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] #validation_score (37): 6.733829634530204\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] patience losses:[6.765038354056222, 6.752309662955148, 6.741094384874616, 6.740245342254639, 6.735398360661098] min patience loss:6.735398360661098 current loss:6.733829634530204 absolute loss difference:0.0015687261308947598\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] Timing: train: 0.85s, val: 0.07s, epoch: 0.92s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] #progress_metric: host=algo-1, completed 37.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194333.2321966, \"EndTime\": 1623194334.1526773, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", 
\"Operation\": \"training\", \"epoch\": 36, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 143116.0, \"count\": 1, \"min\": 143116, \"max\": 143116}, \"Total Batches Seen\": {\"sum\": 1147.0, \"count\": 1, \"min\": 1147, \"max\": 1147}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 74.0, \"count\": 1, \"min\": 74, \"max\": 74}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=4201.567671929211 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:54 INFO 139703431448384] # Starting training for epoch 38\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:55.084] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 110, \"duration\": 941, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] # Finished training epoch 37 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] Loss (name: value) total: 6.557133839976403\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] Loss (name: value) kld: 0.12426442292428785\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] Loss (name: value) recons: 6.432869422820307\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] Loss (name: value) logppx: 6.557133839976403\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] #quality_metric: host=algo-2, epoch=37, train total_loss =6.557133839976403\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:55.149] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 110, \"duration\": 63, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] Loss (name: value) total: 6.729712554386684\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] Loss (name: value) kld: 0.12496718444994517\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] Loss (name: value) recons: 6.604745388031006\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] Loss (name: value) logppx: 6.729712554386684\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] #validation_score (37): 6.729712554386684\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] patience losses:[6.753398418426514, 6.754370212554932, 6.749467509133475, 6.766678537641253, 6.730280126844134] min patience loss:6.730280126844134 current loss:6.729712554386684 absolute loss difference:0.0005675724574496499\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] Timing: train: 0.94s, val: 0.07s, epoch: 1.01s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] #progress_metric: host=algo-2, completed 37.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194334.1430824, \"EndTime\": 1623194335.1548357, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 36, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 143301.0, \"count\": 1, \"min\": 143301, \"max\": 143301}, \"Total Batches Seen\": {\"sum\": 1147.0, \"count\": 1, \"min\": 1147, \"max\": 1147}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 74.0, \"count\": 1, \"min\": 74, \"max\": 74}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3827.493181708941 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:55 INFO 140403258652480] # Starting training for epoch 38\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:55.115] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 113, \"duration\": 961, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] # Finished training epoch 38 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:18:55 INFO 139703431448384] Loss (name: value) total: 6.541282984518236\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] Loss (name: value) kld: 0.12293701594875704\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] Loss (name: value) recons: 6.418345962801287\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] Loss (name: value) logppx: 6.541282984518236\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] #quality_metric: host=algo-1, epoch=38, train total_loss =6.541282984518236\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:55.173] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 113, \"duration\": 57, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] Loss (name: value) total: 6.729048728942871\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] Loss (name: value) kld: 0.12496718444994517\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] Loss (name: value) recons: 6.604081562587193\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] Loss (name: value) logppx: 6.729048728942871\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] #validation_score (38): 6.729048728942871\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] patience losses:[6.752309662955148, 6.741094384874616, 6.740245342254639, 6.735398360661098, 6.733829634530204] min patience loss:6.733829634530204 current loss:6.729048728942871 absolute loss difference:0.004780905587332462\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] Timing: train: 0.96s, val: 0.06s, epoch: 1.03s\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] #progress_metric: host=algo-1, completed 38.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194334.1529512, \"EndTime\": 1623194335.179533, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 37, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 146984.0, \"count\": 1, \"min\": 146984, \"max\": 146984}, \"Total Batches Seen\": {\"sum\": 1178.0, \"count\": 1, \"min\": 1178, \"max\": 1178}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 76.0, \"count\": 1, \"min\": 76, \"max\": 76}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3767.256981651084 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:55 INFO 139703431448384] # Starting training for epoch 39\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:56.094] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 113, \"duration\": 938, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] # Finished training epoch 38 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] Loss (name: value) total: 6.55517364317371\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] Loss 
(name: value) kld: 0.12642149963686544\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] Loss (name: value) recons: 6.428752214677872\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] Loss (name: value) logppx: 6.55517364317371\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] #quality_metric: host=algo-2, epoch=38, train total_loss =6.55517364317371\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:56.174] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 113, \"duration\": 79, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] Loss (name: value) total: 6.722713470458984\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] Loss (name: value) kld: 0.12360682977097374\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] Loss (name: value) recons: 6.599106448037284\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] Loss (name: value) logppx: 6.722713470458984\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] #validation_score (38): 6.722713470458984\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] patience losses:[6.754370212554932, 6.749467509133475, 6.766678537641253, 6.730280126844134, 6.729712554386684] min patience loss:6.729712554386684 current loss:6.722713470458984 absolute loss difference:0.006999083927699878\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] Timing: train: 0.94s, val: 0.09s, epoch: 1.03s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] #progress_metric: host=algo-2, completed 38.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics 
{\"StartTime\": 1623194335.155098, \"EndTime\": 1623194336.1812508, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 37, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 147174.0, \"count\": 1, \"min\": 147174, \"max\": 147174}, \"Total Batches Seen\": {\"sum\": 1178.0, \"count\": 1, \"min\": 1178, \"max\": 1178}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 76.0, \"count\": 1, \"min\": 76, \"max\": 76}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3772.540235648207 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:56 INFO 140403258652480] # Starting training for epoch 39\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:56.063] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 116, \"duration\": 883, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] # Finished training epoch 39 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] Loss (name: value) total: 6.541344427293347\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] Loss (name: value) kld: 0.12446937782149162\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] Loss (name: value) recons: 
6.416875043222981\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] Loss (name: value) logppx: 6.541344427293347\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] #quality_metric: host=algo-1, epoch=39, train total_loss =6.541344427293347\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:56.131] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 116, \"duration\": 67, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] Loss (name: value) total: 6.733092171805246\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] Loss (name: value) kld: 0.12680137902498245\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] Loss (name: value) recons: 6.606290681021554\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] Loss (name: value) logppx: 6.733092171805246\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] #validation_score (39): 6.733092171805246\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] patience losses:[6.741094384874616, 6.740245342254639, 6.735398360661098, 6.733829634530204, 6.729048728942871] min patience loss:6.729048728942871 current loss:6.733092171805246 absolute loss difference:0.004043442862374569\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] Timing: train: 0.88s, val: 0.07s, epoch: 0.95s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] #progress_metric: host=algo-1, completed 39.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194335.1798508, \"EndTime\": 1623194336.133397, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 38, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 150852.0, \"count\": 1, \"min\": 150852, \"max\": 150852}, \"Total Batches Seen\": {\"sum\": 1209.0, \"count\": 1, \"min\": 1209, \"max\": 1209}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 78.0, \"count\": 1, \"min\": 78, \"max\": 78}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=4055.80578212713 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:56 INFO 139703431448384] # Starting training for epoch 40\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:57.025] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 119, \"duration\": 891, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] # Finished training epoch 40 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 
INFO 139703431448384] Loss (name: value) total: 6.531485234537432\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] Loss (name: value) kld: 0.12565534165309322\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] Loss (name: value) recons: 6.405829914154545\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] Loss (name: value) logppx: 6.531485234537432\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] #quality_metric: host=algo-1, epoch=40, train total_loss =6.531485234537432\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:57.110] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 116, \"duration\": 928, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] # Finished training epoch 39 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] Loss (name: value) total: 6.545046337189213\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] Loss (name: value) kld: 0.1267885029796631\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] Loss (name: value) recons: 6.418257905590918\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] Loss (name: value) logppx: 6.545046337189213\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] #quality_metric: host=algo-2, epoch=39, train total_loss =6.545046337189213\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:57.185] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 116, \"duration\": 73, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:18:57 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] Loss (name: value) total: 6.7241122382027765\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] Loss (name: value) kld: 0.1309032684990338\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] Loss (name: value) recons: 6.593208926064627\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] Loss (name: value) logppx: 6.7241122382027765\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] #validation_score (39): 6.7241122382027765\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] patience losses:[6.749467509133475, 6.766678537641253, 6.730280126844134, 6.729712554386684, 6.722713470458984] min patience loss:6.722713470458984 current loss:6.7241122382027765 absolute loss difference:0.0013987677437921064\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] Timing: train: 0.93s, val: 0.08s, epoch: 1.01s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] #progress_metric: host=algo-2, completed 39.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194336.1820056, \"EndTime\": 1623194337.1874182, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 38, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 151047.0, \"count\": 1, \"min\": 151047, \"max\": 151047}, \"Total Batches Seen\": {\"sum\": 1209.0, \"count\": 1, \"min\": 1209, \"max\": 1209}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 78.0, \"count\": 1, \"min\": 78, \"max\": 78}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3851.5507861938677 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:57 INFO 140403258652480] # Starting training for epoch 40\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:57.093] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 119, \"duration\": 67, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 
139703431448384] Loss (name: value) total: 6.724212918962751\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] Loss (name: value) kld: 0.12819062811987741\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] Loss (name: value) recons: 6.596022265298026\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] Loss (name: value) logppx: 6.724212918962751\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] #validation_score (40): 6.724212918962751\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] patience losses:[6.740245342254639, 6.735398360661098, 6.733829634530204, 6.729048728942871, 6.733092171805246] min patience loss:6.729048728942871 current loss:6.724212918962751 absolute loss difference:0.0048358099801202314\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] Timing: train: 0.89s, val: 0.07s, epoch: 0.97s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] #progress_metric: host=algo-1, completed 40.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194336.1336927, \"EndTime\": 1623194337.1003275, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 39, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 154720.0, \"count\": 1, \"min\": 154720, \"max\": 154720}, \"Total Batches Seen\": {\"sum\": 1240.0, \"count\": 1, \"min\": 1240, \"max\": 1240}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 80.0, \"count\": 1, \"min\": 80, \"max\": 80}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=4000.8838164082035 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:57 INFO 139703431448384] # Starting training for epoch 41\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:58.018] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 122, \"duration\": 917, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] # Finished training epoch 41 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] Loss (name: value) total: 6.527125789273169\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] Loss (name: value) kld: 0.12927909075252472\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] Loss (name: value) recons: 6.397846710297369\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] Loss (name: value) logppx: 6.527125789273169\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] #quality_metric: host=algo-1, epoch=41, train total_loss =6.527125789273169\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:58.086] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 119, \"duration\": 898, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] # Finished training epoch 40 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] Loss (name: value) total: 6.5433108652791665\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] 
Loss (name: value) kld: 0.1308426900856918\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] Loss (name: value) recons: 6.412468168043321\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] Loss (name: value) logppx: 6.5433108652791665\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] #quality_metric: host=algo-2, epoch=40, train total_loss =6.5433108652791665\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:58.203] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 119, \"duration\": 115, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] Loss (name: value) total: 6.717088086264474\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] Loss (name: value) kld: 0.12764419934579305\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] Loss (name: value) recons: 6.589443956102643\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] Loss (name: value) logppx: 6.717088086264474\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] #validation_score (40): 6.717088086264474\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] patience losses:[6.766678537641253, 6.730280126844134, 6.729712554386684, 6.722713470458984, 6.7241122382027765] min patience loss:6.722713470458984 current loss:6.717088086264474 absolute loss difference:0.005625384194510197\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] Timing: train: 0.90s, val: 0.12s, epoch: 1.02s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] #progress_metric: host=algo-2, completed 40.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics 
{\"StartTime\": 1623194337.1877112, \"EndTime\": 1623194338.2081888, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 39, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 154920.0, \"count\": 1, \"min\": 154920, \"max\": 154920}, \"Total Batches Seen\": {\"sum\": 1240.0, \"count\": 1, \"min\": 1240, \"max\": 1240}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 80.0, \"count\": 1, \"min\": 80, \"max\": 80}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3794.790813846275 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:58 INFO 140403258652480] # Starting training for epoch 41\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:58.098] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 122, \"duration\": 78, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] Loss (name: value) total: 6.7126873561314175\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] Loss (name: value) kld: 0.12672167590686254\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] Loss (name: value) recons: 
6.585965701511928\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] Loss (name: value) logppx: 6.7126873561314175\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] #validation_score (41): 6.7126873561314175\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] patience losses:[6.735398360661098, 6.733829634530204, 6.729048728942871, 6.733092171805246, 6.724212918962751] min patience loss:6.724212918962751 current loss:6.7126873561314175 absolute loss difference:0.011525562831333325\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] Timing: train: 0.92s, val: 0.08s, epoch: 1.00s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] #progress_metric: host=algo-1, completed 41.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194337.1006193, \"EndTime\": 1623194338.1028264, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 40, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 158588.0, \"count\": 1, \"min\": 158588, \"max\": 158588}, \"Total Batches Seen\": {\"sum\": 1271.0, \"count\": 1, \"min\": 1271, \"max\": 1271}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 82.0, \"count\": 1, \"min\": 82, \"max\": 82}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3858.8760847728986 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:58 INFO 139703431448384] # 
Starting training for epoch 42\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:59.134] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 122, \"duration\": 926, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] # Finished training epoch 41 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] Loss (name: value) total: 6.532584871015241\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] Loss (name: value) kld: 0.13088302578656905\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] Loss (name: value) recons: 6.401701881039527\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] Loss (name: value) logppx: 6.532584871015241\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] #quality_metric: host=algo-2, epoch=41, train total_loss =6.532584871015241\u001b[0m\n", - "\u001b[35m[2021-06-08 23:18:59.215] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 122, \"duration\": 78, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] Loss (name: value) total: 6.706733635493687\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] Loss (name: value) kld: 0.1264485770038196\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] Loss (name: value) recons: 6.58028507232666\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] Loss (name: value) logppx: 6.706733635493687\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] #validation_score (41): 6.706733635493687\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] patience losses:[6.730280126844134, 6.729712554386684, 6.722713470458984, 6.7241122382027765, 6.717088086264474] min patience loss:6.717088086264474 current loss:6.706733635493687 absolute loss difference:0.010354450770787338\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] Timing: train: 0.93s, val: 0.08s, epoch: 1.01s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] #progress_metric: host=algo-2, completed 41.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194338.208429, \"EndTime\": 1623194339.2214096, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 40, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 158793.0, \"count\": 1, \"min\": 158793, \"max\": 158793}, \"Total Batches Seen\": {\"sum\": 1271.0, \"count\": 1, \"min\": 1271, \"max\": 1271}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 82.0, \"count\": 1, \"min\": 82, \"max\": 82}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3822.783350406328 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:18:59 INFO 140403258652480] # Starting training for epoch 42\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:59.077] [tensorio] [info] epoch_stats={\"data_pipeline\": 
\"/opt/ml/input/data/train\", \"epoch\": 125, \"duration\": 974, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] # Finished training epoch 42 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] Loss (name: value) total: 6.518353666028669\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] Loss (name: value) kld: 0.12942565773283282\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] Loss (name: value) recons: 6.388928051917784\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] Loss (name: value) logppx: 6.518353666028669\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] #quality_metric: host=algo-1, epoch=42, train total_loss =6.518353666028669\u001b[0m\n", - "\u001b[34m[2021-06-08 23:18:59.149] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 125, \"duration\": 68, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] Loss (name: value) total: 6.713650703430176\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] Loss (name: value) kld: 0.12423721168722425\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] Loss (name: value) recons: 6.589413506644113\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] Loss (name: value) logppx: 6.713650703430176\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] #validation_score (42): 6.713650703430176\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:18:59 INFO 139703431448384] patience losses:[6.733829634530204, 6.729048728942871, 6.733092171805246, 6.724212918962751, 6.7126873561314175] min patience loss:6.7126873561314175 current loss:6.713650703430176 absolute loss difference:0.0009633472987582437\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] Bad epoch: loss has not improved (enough). Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] Timing: train: 0.98s, val: 0.07s, epoch: 1.05s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] #progress_metric: host=algo-1, completed 42.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194338.1031308, \"EndTime\": 1623194339.1503327, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 41, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 162456.0, \"count\": 1, \"min\": 162456, \"max\": 162456}, \"Total Batches Seen\": {\"sum\": 1302.0, \"count\": 1, \"min\": 1302, \"max\": 1302}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 84.0, \"count\": 1, \"min\": 84, \"max\": 84}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3693.0153558776256 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:18:59 INFO 139703431448384] # Starting training for epoch 43\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:00.177] [tensorio] [info] epoch_stats={\"data_pipeline\": 
\"/opt/ml/input/data/train\", \"epoch\": 125, \"duration\": 955, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] # Finished training epoch 42 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] Loss (name: value) total: 6.530491421299596\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] Loss (name: value) kld: 0.13190790001423128\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] Loss (name: value) recons: 6.39858349292509\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] Loss (name: value) logppx: 6.530491421299596\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] #quality_metric: host=algo-2, epoch=42, train total_loss =6.530491421299596\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:00.091] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 128, \"duration\": 940, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] # Finished training epoch 43 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] Loss (name: value) total: 6.515903580573298\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] Loss (name: value) kld: 0.13104634155188838\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] Loss (name: value) recons: 6.38485723926175\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] Loss (name: value) logppx: 6.515903580573298\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] #quality_metric: host=algo-1, epoch=43, train total_loss 
=6.515903580573298\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:00.160] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 128, \"duration\": 68, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] Loss (name: value) total: 6.710361003875732\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] Loss (name: value) kld: 0.1268262660929135\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] Loss (name: value) recons: 6.5835347175598145\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] Loss (name: value) logppx: 6.710361003875732\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] #validation_score (43): 6.710361003875732\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] patience losses:[6.729048728942871, 6.733092171805246, 6.724212918962751, 6.7126873561314175, 6.713650703430176] min patience loss:6.7126873561314175 current loss:6.710361003875732 absolute loss difference:0.0023263522556851157\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] Timing: train: 0.94s, val: 0.07s, epoch: 1.01s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] #progress_metric: host=algo-1, completed 43.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194339.1506224, \"EndTime\": 1623194340.1658373, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 42, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 166324.0, \"count\": 1, \"min\": 166324, \"max\": 166324}, \"Total Batches Seen\": {\"sum\": 1333.0, \"count\": 1, \"min\": 1333, 
\"max\": 1333}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 86.0, \"count\": 1, \"min\": 86, \"max\": 86}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3809.5269997790397 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:00 INFO 139703431448384] # Starting training for epoch 44\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:00.263] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 125, \"duration\": 84, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] Loss (name: value) total: 6.710041795458112\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] Loss (name: value) kld: 0.1371119118162564\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] Loss (name: value) recons: 6.572929927280971\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] Loss (name: value) logppx: 6.710041795458112\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] #validation_score (42): 6.710041795458112\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] patience losses:[6.729712554386684, 6.722713470458984, 6.7241122382027765, 6.717088086264474, 
6.706733635493687] min patience loss:6.706733635493687 current loss:6.710041795458112 absolute loss difference:0.00330815996442535\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] Bad epoch: loss has not improved (enough). Bad count:1\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] Timing: train: 0.96s, val: 0.09s, epoch: 1.04s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] #progress_metric: host=algo-2, completed 42.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194339.221726, \"EndTime\": 1623194340.2650375, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 41, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 162666.0, \"count\": 1, \"min\": 162666, \"max\": 162666}, \"Total Batches Seen\": {\"sum\": 1302.0, \"count\": 1, \"min\": 1302, \"max\": 1302}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 84.0, \"count\": 1, \"min\": 84, \"max\": 84}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3711.7166283907645 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:00 INFO 140403258652480] # Starting training for epoch 43\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:01.209] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 128, \"duration\": 943, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:19:01 INFO 140403258652480] # Finished training epoch 43 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] Loss (name: value) total: 6.523844388223464\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] Loss (name: value) kld: 0.13407012724107312\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] Loss (name: value) recons: 6.38977425713693\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] Loss (name: value) logppx: 6.523844388223464\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] #quality_metric: host=algo-2, epoch=43, train total_loss =6.523844388223464\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:01.092] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 131, \"duration\": 926, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] # Finished training epoch 44 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] Loss (name: value) total: 6.5130442073268275\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] Loss (name: value) kld: 0.13264407638099887\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] Loss (name: value) recons: 6.380400126980197\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] Loss (name: value) logppx: 6.5130442073268275\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] #quality_metric: host=algo-1, epoch=44, train total_loss =6.5130442073268275\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:01.165] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", 
\"epoch\": 131, \"duration\": 71, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] Loss (name: value) total: 6.710298265729632\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] Loss (name: value) kld: 0.13379278353282384\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] Loss (name: value) recons: 6.576505388532366\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] Loss (name: value) logppx: 6.710298265729632\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] #validation_score (44): 6.710298265729632\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] patience losses:[6.733092171805246, 6.724212918962751, 6.7126873561314175, 6.713650703430176, 6.710361003875732] min patience loss:6.710361003875732 current loss:6.710298265729632 absolute loss difference:6.273814610047168e-05\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] Timing: train: 0.93s, val: 0.08s, epoch: 1.01s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] #progress_metric: host=algo-1, completed 44.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194340.1661038, \"EndTime\": 1623194341.1725688, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 43, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 170192.0, \"count\": 1, \"min\": 170192, \"max\": 170192}, \"Total Batches Seen\": {\"sum\": 1364.0, \"count\": 1, \"min\": 1364, \"max\": 1364}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 88.0, \"count\": 1, \"min\": 88, \"max\": 88}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3842.526099983184 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:01 INFO 139703431448384] # Starting training for epoch 45\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:01.350] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 128, \"duration\": 140, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 
140403258652480] Loss (name: value) total: 6.708853381020682\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] Loss (name: value) kld: 0.13473705308777945\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] Loss (name: value) recons: 6.574116366250174\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] Loss (name: value) logppx: 6.708853381020682\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] #validation_score (43): 6.708853381020682\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] patience losses:[6.722713470458984, 6.7241122382027765, 6.717088086264474, 6.706733635493687, 6.710041795458112] min patience loss:6.706733635493687 current loss:6.708853381020682 absolute loss difference:0.0021197455269952314\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] Bad epoch: loss has not improved (enough). Bad count:2\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] Timing: train: 0.94s, val: 0.14s, epoch: 1.09s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] #progress_metric: host=algo-2, completed 43.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194340.26533, \"EndTime\": 1623194341.3551402, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 42, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 166539.0, \"count\": 1, \"min\": 166539, \"max\": 166539}, \"Total Batches Seen\": {\"sum\": 1333.0, \"count\": 1, \"min\": 1333, \"max\": 1333}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 86.0, \"count\": 1, \"min\": 86, \"max\": 86}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number 
of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3553.413373331604 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:01 INFO 140403258652480] # Starting training for epoch 44\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:02.337] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 134, \"duration\": 1164, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] # Finished training epoch 45 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] Loss (name: value) total: 6.505843812419522\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] Loss (name: value) kld: 0.1339105772635629\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] Loss (name: value) recons: 6.3719332910353135\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] Loss (name: value) logppx: 6.505843812419522\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] #quality_metric: host=algo-1, epoch=45, train total_loss =6.505843812419522\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:02.419] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 134, \"duration\": 80, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] Loss (name: 
value) total: 6.707942553928921\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] Loss (name: value) kld: 0.13432414936167852\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] Loss (name: value) recons: 6.573618480137417\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] Loss (name: value) logppx: 6.707942553928921\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] #validation_score (45): 6.707942553928921\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] patience losses:[6.724212918962751, 6.7126873561314175, 6.713650703430176, 6.710361003875732, 6.710298265729632] min patience loss:6.710298265729632 current loss:6.707942553928921 absolute loss difference:0.0023557118007113687\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] Timing: train: 1.17s, val: 0.09s, epoch: 1.25s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] #progress_metric: host=algo-1, completed 45.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194341.1728837, \"EndTime\": 1623194342.4255986, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 44, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 174060.0, \"count\": 1, \"min\": 174060, \"max\": 174060}, \"Total Batches Seen\": {\"sum\": 1395.0, \"count\": 1, \"min\": 1395, \"max\": 1395}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 90.0, \"count\": 1, \"min\": 90, \"max\": 90}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 
139703431448384] #throughput_metric: host=algo-1, train throughput=3087.3348290864224 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:02 INFO 139703431448384] # Starting training for epoch 46\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:02.499] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 131, \"duration\": 1140, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] # Finished training epoch 44 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] Loss (name: value) total: 6.518908566044223\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] Loss (name: value) kld: 0.13637751509105006\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] Loss (name: value) recons: 6.382531023794605\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] Loss (name: value) logppx: 6.518908566044223\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] #quality_metric: host=algo-2, epoch=44, train total_loss =6.518908566044223\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:02.626] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 131, \"duration\": 124, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] Loss (name: value) total: 6.705690996987479\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] Loss (name: value) kld: 
0.14096579168524062\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] Loss (name: value) recons: 6.564725194658552\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] Loss (name: value) logppx: 6.705690996987479\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] #validation_score (44): 6.705690996987479\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] patience losses:[6.7241122382027765, 6.717088086264474, 6.706733635493687, 6.710041795458112, 6.708853381020682] min patience loss:6.706733635493687 current loss:6.705690996987479 absolute loss difference:0.0010426385062078936\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] Timing: train: 1.15s, val: 0.14s, epoch: 1.29s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] #progress_metric: host=algo-2, completed 44.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194341.3567493, \"EndTime\": 1623194342.6422477, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 43, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 170412.0, \"count\": 1, \"min\": 170412, \"max\": 170412}, \"Total Batches Seen\": {\"sum\": 1364.0, \"count\": 1, \"min\": 1364, \"max\": 1364}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 88.0, \"count\": 1, \"min\": 88, \"max\": 88}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3012.4029201758276 records/second\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:02 INFO 140403258652480] # Starting training for epoch 45\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:03.495] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 137, \"duration\": 1069, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] # Finished training epoch 46 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] Loss (name: value) total: 6.49926794344379\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] Loss (name: value) kld: 0.13524002605868923\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] Loss (name: value) recons: 6.364027923153293\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] Loss (name: value) logppx: 6.49926794344379\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] #quality_metric: host=algo-1, epoch=46, train total_loss =6.49926794344379\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:03.587] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 137, \"duration\": 90, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] Loss (name: value) total: 6.702998774392264\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] Loss (name: value) kld: 0.1358571829540389\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] Loss (name: value) recons: 
6.567141532897949\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] Loss (name: value) logppx: 6.702998774392264\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] #validation_score (46): 6.702998774392264\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] patience losses:[6.7126873561314175, 6.713650703430176, 6.710361003875732, 6.710298265729632, 6.707942553928921] min patience loss:6.707942553928921 current loss:6.702998774392264 absolute loss difference:0.0049437795366564785\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] Timing: train: 1.07s, val: 0.10s, epoch: 1.17s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] #progress_metric: host=algo-1, completed 46.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194342.4259017, \"EndTime\": 1623194343.5926588, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 45, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 177928.0, \"count\": 1, \"min\": 177928, \"max\": 177928}, \"Total Batches Seen\": {\"sum\": 1426.0, \"count\": 1, \"min\": 1426, \"max\": 1426}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 92.0, \"count\": 1, \"min\": 92, \"max\": 92}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3314.7983560417915 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:03 INFO 139703431448384] # 
Starting training for epoch 47\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:03.739] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 134, \"duration\": 1094, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] # Finished training epoch 45 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] Loss (name: value) total: 6.516650665190912\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] Loss (name: value) kld: 0.13639985565696994\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] Loss (name: value) recons: 6.380250823113226\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] Loss (name: value) logppx: 6.516650665190912\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] #quality_metric: host=algo-2, epoch=45, train total_loss =6.516650665190912\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:03.857] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 134, \"duration\": 116, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] Loss (name: value) total: 6.702579157693045\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] Loss (name: value) kld: 0.1340148768254689\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] Loss (name: value) recons: 6.568564210619245\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] Loss (name: value) logppx: 6.702579157693045\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] #validation_score (45): 6.702579157693045\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] patience losses:[6.717088086264474, 6.706733635493687, 6.710041795458112, 6.708853381020682, 6.705690996987479] min patience loss:6.705690996987479 current loss:6.702579157693045 absolute loss difference:0.0031118392944335938\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] Timing: train: 1.10s, val: 0.12s, epoch: 1.22s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] #progress_metric: host=algo-2, completed 45.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194342.642528, \"EndTime\": 1623194343.8618708, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 44, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 174285.0, \"count\": 1, \"min\": 174285, \"max\": 174285}, \"Total Batches Seen\": {\"sum\": 1395.0, \"count\": 1, \"min\": 1395, \"max\": 1395}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 90.0, \"count\": 1, \"min\": 90, \"max\": 90}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3175.917047269426 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:03 INFO 140403258652480] # Starting training for epoch 46\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:04.755] [tensorio] [info] epoch_stats={\"data_pipeline\": 
\"/opt/ml/input/data/train\", \"epoch\": 140, \"duration\": 1161, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] # Finished training epoch 47 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] Loss (name: value) total: 6.499560429203894\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] Loss (name: value) kld: 0.13788391397364677\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] Loss (name: value) recons: 6.361676535298748\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] Loss (name: value) logppx: 6.499560429203894\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] #quality_metric: host=algo-1, epoch=47, train total_loss =6.499560429203894\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:04.858] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 140, \"duration\": 101, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] Loss (name: value) total: 6.702372210366385\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] Loss (name: value) kld: 0.13672810792922974\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] Loss (name: value) recons: 6.5656441279820035\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] Loss (name: value) logppx: 6.702372210366385\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] #validation_score (47): 6.702372210366385\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:19:04 INFO 139703431448384] patience losses:[6.713650703430176, 6.710361003875732, 6.710298265729632, 6.707942553928921, 6.702998774392264] min patience loss:6.702998774392264 current loss:6.702372210366385 absolute loss difference:0.0006265640258789062\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] Bad epoch: loss has not improved (enough). Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] Timing: train: 1.16s, val: 0.11s, epoch: 1.27s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] #progress_metric: host=algo-1, completed 47.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194343.592892, \"EndTime\": 1623194344.8644156, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 46, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 181796.0, \"count\": 1, \"min\": 181796, \"max\": 181796}, \"Total Batches Seen\": {\"sum\": 1457.0, \"count\": 1, \"min\": 1457, \"max\": 1457}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 94.0, \"count\": 1, \"min\": 94, \"max\": 94}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3041.6687987413034 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:04 INFO 139703431448384] # Starting training for epoch 48\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:05.185] [tensorio] [info] epoch_stats={\"data_pipeline\": 
\"/opt/ml/input/data/train\", \"epoch\": 137, \"duration\": 1318, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] # Finished training epoch 46 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] Loss (name: value) total: 6.517983694230357\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] Loss (name: value) kld: 0.13940050537067075\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] Loss (name: value) recons: 6.378583192825317\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] Loss (name: value) logppx: 6.517983694230357\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] #quality_metric: host=algo-2, epoch=46, train total_loss =6.517983694230357\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:05.982] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 143, \"duration\": 1117, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:05 INFO 139703431448384] # Finished training epoch 48 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:05 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:05 INFO 139703431448384] Loss (name: value) total: 6.492928397270941\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:05 INFO 139703431448384] Loss (name: value) kld: 0.13725061606495612\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:05 INFO 139703431448384] Loss (name: value) recons: 6.355677800793802\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:05 INFO 139703431448384] Loss (name: value) logppx: 6.492928397270941\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:05 INFO 139703431448384] #quality_metric: host=algo-1, epoch=48, train total_loss 
=6.492928397270941\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:05.299] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 137, \"duration\": 111, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] Loss (name: value) total: 6.700033528464181\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] Loss (name: value) kld: 0.1407191710812705\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] Loss (name: value) recons: 6.559314250946045\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] Loss (name: value) logppx: 6.700033528464181\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] #validation_score (46): 6.700033528464181\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] patience losses:[6.706733635493687, 6.710041795458112, 6.708853381020682, 6.705690996987479, 6.702579157693045] min patience loss:6.702579157693045 current loss:6.700033528464181 absolute loss difference:0.0025456292288641436\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] Timing: train: 1.32s, val: 0.12s, epoch: 1.44s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] #progress_metric: host=algo-2, completed 46.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194343.8621583, \"EndTime\": 1623194345.3066485, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 45, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 178158.0, \"count\": 1, \"min\": 178158, \"max\": 178158}, \"Total Batches Seen\": {\"sum\": 1426.0, \"count\": 1, \"min\": 1426, 
\"max\": 1426}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 92.0, \"count\": 1, \"min\": 92, \"max\": 92}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=2680.885149901129 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:05 INFO 140403258652480] # Starting training for epoch 47\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:06.074] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 143, \"duration\": 89, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] Loss (name: value) total: 6.696027483258929\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] Loss (name: value) kld: 0.13660213138375962\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] Loss (name: value) recons: 6.559425285884312\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] Loss (name: value) logppx: 6.696027483258929\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] #validation_score (48): 6.696027483258929\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] patience losses:[6.710361003875732, 6.710298265729632, 6.707942553928921, 6.702998774392264, 
6.702372210366385] min patience loss:6.702372210366385 current loss:6.696027483258929 absolute loss difference:0.006344727107456372\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] Timing: train: 1.12s, val: 0.10s, epoch: 1.22s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] #progress_metric: host=algo-1, completed 48.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194344.8647761, \"EndTime\": 1623194346.0803494, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 47, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 185664.0, \"count\": 1, \"min\": 185664, \"max\": 185664}, \"Total Batches Seen\": {\"sum\": 1488.0, \"count\": 1, \"min\": 1488, \"max\": 1488}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 96.0, \"count\": 1, \"min\": 96, \"max\": 96}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3181.6713028706604 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] # Starting training for epoch 49\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:06.974] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 146, \"duration\": 893, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] # Finished training epoch 49 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] Loss (name: value) total: 6.4892072869885356\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] Loss (name: value) kld: 0.13884203905059445\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] Loss (name: value) recons: 6.350365261877736\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] Loss (name: value) logppx: 6.4892072869885356\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:06 INFO 139703431448384] #quality_metric: host=algo-1, epoch=49, train total_loss =6.4892072869885356\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:07.041] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 146, \"duration\": 66, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] Loss (name: value) total: 6.692704064505441\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] Loss (name: value) kld: 0.1403298888887678\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] Loss (name: value) recons: 6.552374226706369\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] Loss (name: value) logppx: 6.692704064505441\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] #validation_score (49): 6.692704064505441\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] patience losses:[6.710298265729632, 6.707942553928921, 6.702998774392264, 6.702372210366385, 6.696027483258929] min patience loss:6.696027483258929 current loss:6.692704064505441 absolute loss difference:0.00332341875348785\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] Timing: train: 0.89s, val: 0.07s, epoch: 0.97s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] #progress_metric: host=algo-1, completed 49.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194346.080624, \"EndTime\": 1623194347.046409, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 48, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 189532.0, \"count\": 1, \"min\": 189532, \"max\": 189532}, \"Total Batches Seen\": {\"sum\": 1519.0, \"count\": 1, \"min\": 1519, \"max\": 1519}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 98.0, \"count\": 1, \"min\": 98, \"max\": 98}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=4004.4507931116873 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] # Starting training for epoch 50\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:06.355] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 140, \"duration\": 1048, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] # Finished training epoch 47 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] 
Loss (name: value) total: 6.504321375200825\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] Loss (name: value) kld: 0.13867616701510646\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] Loss (name: value) recons: 6.365645262502855\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] Loss (name: value) logppx: 6.504321375200825\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] #quality_metric: host=algo-2, epoch=47, train total_loss =6.504321375200825\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:06.424] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 140, \"duration\": 66, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] Loss (name: value) total: 6.707767895289829\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] Loss (name: value) kld: 0.1306630266564233\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] Loss (name: value) recons: 6.577104772840228\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] Loss (name: value) logppx: 6.707767895289829\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] #validation_score (47): 6.707767895289829\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] patience losses:[6.710041795458112, 6.708853381020682, 6.705690996987479, 6.702579157693045, 6.700033528464181] min patience loss:6.700033528464181 current loss:6.707767895289829 absolute loss difference:0.007734366825648209\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] Timing: train: 1.05s, val: 0.07s, epoch: 1.12s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] #progress_metric: host=algo-2, completed 47.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194345.3069897, \"EndTime\": 1623194346.4252515, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 46, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 182031.0, \"count\": 1, \"min\": 182031, \"max\": 182031}, \"Total Batches Seen\": {\"sum\": 1457.0, \"count\": 1, \"min\": 1457, \"max\": 1457}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 94.0, \"count\": 1, \"min\": 94, \"max\": 94}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3463.017229878596 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:06 INFO 140403258652480] # Starting training for epoch 48\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:07.977] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 149, \"duration\": 930, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] # Finished training epoch 50 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:19:07 INFO 139703431448384] Loss (name: value) total: 6.495367619299119\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] Loss (name: value) kld: 0.14314466222159325\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] Loss (name: value) recons: 6.352222950227799\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] Loss (name: value) logppx: 6.495367619299119\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:07 INFO 139703431448384] #quality_metric: host=algo-1, epoch=50, train total_loss =6.495367619299119\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:08.047] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 149, \"duration\": 68, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] Loss (name: value) total: 6.694850036076137\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] Loss (name: value) kld: 0.14365083192076003\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] Loss (name: value) recons: 6.551199231828962\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] Loss (name: value) logppx: 6.694850036076137\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] #validation_score (50): 6.694850036076137\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] patience losses:[6.707942553928921, 6.702998774392264, 6.702372210366385, 6.696027483258929, 6.692704064505441] min patience loss:6.692704064505441 current loss:6.694850036076137 absolute loss difference:0.0021459715706964033\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] Timing: train: 0.93s, val: 0.07s, epoch: 1.00s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] #progress_metric: host=algo-1, completed 50.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194347.0466795, \"EndTime\": 1623194348.048654, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 49, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 193400.0, \"count\": 1, \"min\": 193400, \"max\": 193400}, \"Total Batches Seen\": {\"sum\": 1550.0, \"count\": 1, \"min\": 1550, \"max\": 1550}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 100.0, \"count\": 1, \"min\": 100, \"max\": 100}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3859.88874735375 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] # Starting training for epoch 51\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:07.368] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 143, \"duration\": 943, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] # Finished training epoch 48 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:19:07 INFO 140403258652480] Loss (name: value) total: 6.5107019908966555\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] Loss (name: value) kld: 0.1420344395502921\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] Loss (name: value) recons: 6.368667525629843\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] Loss (name: value) logppx: 6.5107019908966555\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] #quality_metric: host=algo-2, epoch=48, train total_loss =6.5107019908966555\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:07.450] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 143, \"duration\": 79, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] Loss (name: value) total: 6.690619468688965\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] Loss (name: value) kld: 0.14526338875293732\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] Loss (name: value) recons: 6.545356137411935\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] Loss (name: value) logppx: 6.690619468688965\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] #validation_score (48): 6.690619468688965\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] patience losses:[6.708853381020682, 6.705690996987479, 6.702579157693045, 6.700033528464181, 6.707767895289829] min patience loss:6.700033528464181 current loss:6.690619468688965 absolute loss difference:0.009414059775216366\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] Timing: train: 0.94s, val: 0.09s, epoch: 1.03s\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] #progress_metric: host=algo-2, completed 48.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194346.4254868, \"EndTime\": 1623194347.4557197, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 47, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 185904.0, \"count\": 1, \"min\": 185904, \"max\": 185904}, \"Total Batches Seen\": {\"sum\": 1488.0, \"count\": 1, \"min\": 1488, \"max\": 1488}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 96.0, \"count\": 1, \"min\": 96, \"max\": 96}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3757.950017037911 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:07 INFO 140403258652480] # Starting training for epoch 49\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:08.993] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 152, \"duration\": 944, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] # Finished training epoch 51 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] Loss (name: value) total: 6.481423266472355\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] Loss 
(name: value) kld: 0.1413534471104222\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] Loss (name: value) recons: 6.340069790040293\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] Loss (name: value) logppx: 6.481423266472355\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:08 INFO 139703431448384] #quality_metric: host=algo-1, epoch=51, train total_loss =6.481423266472355\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:09.058] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 152, \"duration\": 63, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] Loss (name: value) total: 6.690475940704346\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] Loss (name: value) kld: 0.14489130675792694\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] Loss (name: value) recons: 6.545584610530308\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] Loss (name: value) logppx: 6.690475940704346\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] #validation_score (51): 6.690475940704346\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] patience losses:[6.702998774392264, 6.702372210366385, 6.696027483258929, 6.692704064505441, 6.694850036076137] min patience loss:6.692704064505441 current loss:6.690475940704346 absolute loss difference:0.002228123801095272\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] Timing: train: 0.95s, val: 0.07s, epoch: 1.01s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] #progress_metric: host=algo-1, completed 51.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics 
{\"StartTime\": 1623194348.048892, \"EndTime\": 1623194349.0634882, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 50, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 197268.0, \"count\": 1, \"min\": 197268, \"max\": 197268}, \"Total Batches Seen\": {\"sum\": 1581.0, \"count\": 1, \"min\": 1581, \"max\": 1581}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 102.0, \"count\": 1, \"min\": 102, \"max\": 102}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3810.8433105985796 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] # Starting training for epoch 52\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:08.432] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 146, \"duration\": 975, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] # Finished training epoch 49 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] Loss (name: value) total: 6.502394595453816\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] Loss (name: value) kld: 0.14330740309050005\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] Loss (name: value) recons: 
6.359087132638501\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] Loss (name: value) logppx: 6.502394595453816\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] #quality_metric: host=algo-2, epoch=49, train total_loss =6.502394595453816\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:08.516] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 146, \"duration\": 82, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] Loss (name: value) total: 6.689588614872524\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] Loss (name: value) kld: 0.14071246342999594\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] Loss (name: value) recons: 6.548876149313791\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] Loss (name: value) logppx: 6.689588614872524\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] #validation_score (49): 6.689588614872524\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] patience losses:[6.705690996987479, 6.702579157693045, 6.700033528464181, 6.707767895289829, 6.690619468688965] min patience loss:6.690619468688965 current loss:6.689588614872524 absolute loss difference:0.0010308538164407466\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] Timing: train: 0.98s, val: 0.09s, epoch: 1.07s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] #progress_metric: host=algo-2, completed 49.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194347.4564488, \"EndTime\": 1623194348.5219433, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", 
\"Operation\": \"training\", \"epoch\": 48, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 189777.0, \"count\": 1, \"min\": 189777, \"max\": 189777}, \"Total Batches Seen\": {\"sum\": 1519.0, \"count\": 1, \"min\": 1519, \"max\": 1519}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 98.0, \"count\": 1, \"min\": 98, \"max\": 98}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3634.3032000896687 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:08 INFO 140403258652480] # Starting training for epoch 50\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:09.958] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 155, \"duration\": 894, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] # Finished training epoch 52 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] Loss (name: value) total: 6.489485879098216\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] Loss (name: value) kld: 0.14502207802668696\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] Loss (name: value) recons: 6.344463802153064\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] Loss (name: value) logppx: 6.489485879098216\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:19:09 INFO 139703431448384] #quality_metric: host=algo-1, epoch=52, train total_loss =6.489485879098216\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:10.024] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 155, \"duration\": 64, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] Loss (name: value) total: 6.711028507777622\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] Loss (name: value) kld: 0.14876768844468252\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] Loss (name: value) recons: 6.562260900224958\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] Loss (name: value) logppx: 6.711028507777622\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] #validation_score (52): 6.711028507777622\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] patience losses:[6.702372210366385, 6.696027483258929, 6.692704064505441, 6.694850036076137, 6.690475940704346] min patience loss:6.690475940704346 current loss:6.711028507777622 absolute loss difference:0.020552567073276684\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] Timing: train: 0.90s, val: 0.07s, epoch: 0.96s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] #progress_metric: host=algo-1, completed 52.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194349.0641682, \"EndTime\": 1623194350.0256298, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 51, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 201136.0, \"count\": 1, \"min\": 201136, \"max\": 201136}, \"Total Batches Seen\": {\"sum\": 1612.0, \"count\": 1, \"min\": 1612, \"max\": 1612}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 104.0, \"count\": 1, \"min\": 104, \"max\": 104}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=4022.4303753153454 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] # Starting training for epoch 53\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:09.473] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 149, \"duration\": 951, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] # Finished training epoch 50 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:19:09 INFO 140403258652480] Loss (name: value) total: 6.504089320859602\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] Loss (name: value) kld: 0.14536532122761972\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] Loss (name: value) recons: 6.358723928851466\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] Loss (name: value) logppx: 6.504089320859602\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] #quality_metric: host=algo-2, epoch=50, train total_loss =6.504089320859602\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:09.557] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 149, \"duration\": 82, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] Loss (name: value) total: 6.723826135907855\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] Loss (name: value) kld: 0.14368316105433873\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] Loss (name: value) recons: 6.580142906733921\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] Loss (name: value) logppx: 6.723826135907855\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] #validation_score (50): 6.723826135907855\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] patience losses:[6.702579157693045, 6.700033528464181, 6.707767895289829, 6.690619468688965, 6.689588614872524] min patience loss:6.689588614872524 current loss:6.723826135907855 absolute loss difference:0.03423752103533051\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] Timing: train: 0.95s, val: 0.08s, epoch: 1.04s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] #progress_metric: host=algo-2, completed 50.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194348.522228, \"EndTime\": 1623194349.5593793, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 49, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 193650.0, \"count\": 1, \"min\": 193650, \"max\": 193650}, \"Total Batches Seen\": {\"sum\": 1550.0, \"count\": 1, \"min\": 1550, \"max\": 1550}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 100.0, \"count\": 1, \"min\": 100, \"max\": 100}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3733.728833731157 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:09 INFO 140403258652480] # Starting training for epoch 51\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:10.936] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 158, \"duration\": 910, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] # Finished training epoch 53 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:19:10 INFO 139703431448384] Loss (name: value) total: 6.479494613985861\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] Loss (name: value) kld: 0.14413599489677337\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] Loss (name: value) recons: 6.33535861199902\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] Loss (name: value) logppx: 6.479494613985861\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:10 INFO 139703431448384] #quality_metric: host=algo-1, epoch=53, train total_loss =6.479494613985861\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:11.008] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 158, \"duration\": 70, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] Loss (name: value) total: 6.687007495335171\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] Loss (name: value) kld: 0.15112232736178807\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] Loss (name: value) recons: 6.535885197775705\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] Loss (name: value) logppx: 6.687007495335171\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] #validation_score (53): 6.687007495335171\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] patience losses:[6.696027483258929, 6.692704064505441, 6.694850036076137, 6.690475940704346, 6.711028507777622] min patience loss:6.690475940704346 current loss:6.687007495335171 absolute loss difference:0.0034684453691751216\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] Timing: train: 0.91s, val: 0.08s, epoch: 0.99s\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] #progress_metric: host=algo-1, completed 53.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194350.0258949, \"EndTime\": 1623194351.0135972, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 52, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 205004.0, \"count\": 1, \"min\": 205004, \"max\": 205004}, \"Total Batches Seen\": {\"sum\": 1643.0, \"count\": 1, \"min\": 1643, \"max\": 1643}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 106.0, \"count\": 1, \"min\": 106, \"max\": 106}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3915.610333296728 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] # Starting training for epoch 54\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:10.485] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 152, \"duration\": 925, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] # Finished training epoch 51 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] Loss (name: value) total: 6.496892486849139\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] 
Loss (name: value) kld: 0.1444214610082488\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] Loss (name: value) recons: 6.352471067059424\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] Loss (name: value) logppx: 6.496892486849139\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] #quality_metric: host=algo-2, epoch=51, train total_loss =6.496892486849139\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:10.565] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 152, \"duration\": 78, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] Loss (name: value) total: 6.69028411592756\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] Loss (name: value) kld: 0.14619238887514388\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] Loss (name: value) recons: 6.544091701507568\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] Loss (name: value) logppx: 6.69028411592756\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] #validation_score (51): 6.69028411592756\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] patience losses:[6.700033528464181, 6.707767895289829, 6.690619468688965, 6.689588614872524, 6.723826135907855] min patience loss:6.689588614872524 current loss:6.69028411592756 absolute loss difference:0.0006955010550360186\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:2\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] Timing: train: 0.93s, val: 0.08s, epoch: 1.01s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] #progress_metric: host=algo-2, completed 51.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194349.5596795, \"EndTime\": 1623194350.5665076, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 50, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 197523.0, \"count\": 1, \"min\": 197523, \"max\": 197523}, \"Total Batches Seen\": {\"sum\": 1581.0, \"count\": 1, \"min\": 1581, \"max\": 1581}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 102.0, \"count\": 1, \"min\": 102, \"max\": 102}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3846.2023308954813 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:10 INFO 140403258652480] # Starting training for epoch 52\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:11.931] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 161, \"duration\": 917, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] # Finished training epoch 54 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:19:11 INFO 139703431448384] Loss (name: value) total: 6.474150634581043\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] Loss (name: value) kld: 0.1462471775950924\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] Loss (name: value) recons: 6.3279034514581\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] Loss (name: value) logppx: 6.474150634581043\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:11 INFO 139703431448384] #quality_metric: host=algo-1, epoch=54, train total_loss =6.474150634581043\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:12.013] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 161, \"duration\": 80, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] Loss (name: value) total: 6.68559319632394\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] Loss (name: value) kld: 0.14986510574817657\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] Loss (name: value) recons: 6.5357281139918735\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] Loss (name: value) logppx: 6.68559319632394\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] #validation_score (54): 6.68559319632394\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] patience losses:[6.692704064505441, 6.694850036076137, 6.690475940704346, 6.711028507777622, 6.687007495335171] min patience loss:6.687007495335171 current loss:6.68559319632394 absolute loss difference:0.0014142990112304688\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] Timing: train: 0.92s, val: 0.09s, epoch: 1.01s\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] #progress_metric: host=algo-1, completed 54.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194351.01386, \"EndTime\": 1623194352.0194588, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 53, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 208872.0, \"count\": 1, \"min\": 208872, \"max\": 208872}, \"Total Batches Seen\": {\"sum\": 1674.0, \"count\": 1, \"min\": 1674, \"max\": 1674}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 108.0, \"count\": 1, \"min\": 108, \"max\": 108}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3845.9155549665998 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] # Starting training for epoch 55\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:11.530] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 155, \"duration\": 963, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] # Finished training epoch 52 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] Loss (name: value) total: 6.492133998101758\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] Loss 
(name: value) kld: 0.14711126300596422\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] Loss (name: value) recons: 6.345022751439002\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] Loss (name: value) logppx: 6.492133998101758\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] #quality_metric: host=algo-2, epoch=52, train total_loss =6.492133998101758\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:11.603] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 155, \"duration\": 70, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] Loss (name: value) total: 6.683133602142334\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] Loss (name: value) kld: 0.1486606470176152\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] Loss (name: value) recons: 6.534473010471889\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] Loss (name: value) logppx: 6.683133602142334\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] #validation_score (52): 6.683133602142334\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] patience losses:[6.707767895289829, 6.690619468688965, 6.689588614872524, 6.723826135907855, 6.69028411592756] min patience loss:6.689588614872524 current loss:6.683133602142334 absolute loss difference:0.006455012730190113\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] Timing: train: 0.97s, val: 0.08s, epoch: 1.04s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] #progress_metric: host=algo-2, completed 52.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics 
{\"StartTime\": 1623194350.5668356, \"EndTime\": 1623194351.6093779, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 51, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 201396.0, \"count\": 1, \"min\": 201396, \"max\": 201396}, \"Total Batches Seen\": {\"sum\": 1612.0, \"count\": 1, \"min\": 1612, \"max\": 1612}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 104.0, \"count\": 1, \"min\": 104, \"max\": 104}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3714.4477919502697 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:11 INFO 140403258652480] # Starting training for epoch 53\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:12.968] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 164, \"duration\": 948, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] # Finished training epoch 55 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] Loss (name: value) total: 6.477433569969669\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] Loss (name: value) kld: 0.14812736261275508\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] Loss (name: value) recons: 
6.329306214086471\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] Loss (name: value) logppx: 6.477433569969669\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:12 INFO 139703431448384] #quality_metric: host=algo-1, epoch=55, train total_loss =6.477433569969669\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:13.045] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 164, \"duration\": 76, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] Loss (name: value) total: 6.690626212528774\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] Loss (name: value) kld: 0.14650257783276693\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] Loss (name: value) recons: 6.544123649597168\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] Loss (name: value) logppx: 6.690626212528774\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] #validation_score (55): 6.690626212528774\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] patience losses:[6.694850036076137, 6.690475940704346, 6.711028507777622, 6.687007495335171, 6.68559319632394] min patience loss:6.68559319632394 current loss:6.690626212528774 absolute loss difference:0.005033016204833984\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] Timing: train: 0.95s, val: 0.08s, epoch: 1.03s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] #progress_metric: host=algo-1, completed 55.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194352.0197818, \"EndTime\": 1623194353.046619, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 54, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 212740.0, \"count\": 1, \"min\": 212740, \"max\": 212740}, \"Total Batches Seen\": {\"sum\": 1705.0, \"count\": 1, \"min\": 1705, \"max\": 1705}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 110.0, \"count\": 1, \"min\": 110, \"max\": 110}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3766.3159420579204 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] # Starting training for epoch 56\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:12.580] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 158, \"duration\": 968, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] # Finished training epoch 53 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:19:12 INFO 140403258652480] Loss (name: value) total: 6.485062976037303\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] Loss (name: value) kld: 0.14708121397322224\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] Loss (name: value) recons: 6.337981743197287\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] Loss (name: value) logppx: 6.485062976037303\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] #quality_metric: host=algo-2, epoch=53, train total_loss =6.485062976037303\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:12.661] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 158, \"duration\": 79, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] Loss (name: value) total: 6.6823031561715265\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] Loss (name: value) kld: 0.15160976137433732\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] Loss (name: value) recons: 6.530693394797189\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] Loss (name: value) logppx: 6.6823031561715265\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] #validation_score (53): 6.6823031561715265\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] patience losses:[6.690619468688965, 6.689588614872524, 6.723826135907855, 6.69028411592756, 6.683133602142334] min patience loss:6.683133602142334 current loss:6.6823031561715265 absolute loss difference:0.0008304459708075029\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] Timing: train: 0.97s, val: 0.09s, epoch: 1.06s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] #progress_metric: host=algo-2, completed 53.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194351.6096275, \"EndTime\": 1623194352.6682808, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 52, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 205269.0, \"count\": 1, \"min\": 205269, \"max\": 205269}, \"Total Batches Seen\": {\"sum\": 1643.0, \"count\": 1, \"min\": 1643, \"max\": 1643}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 106.0, \"count\": 1, \"min\": 106, \"max\": 106}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3657.8578490690606 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:12 INFO 140403258652480] # Starting training for epoch 54\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:13.985] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 167, \"duration\": 938, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] # Finished training epoch 56 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:19:13 INFO 139703431448384] Loss (name: value) total: 6.4704614070154\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] Loss (name: value) kld: 0.14747269667925372\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] Loss (name: value) recons: 6.322988756241337\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] Loss (name: value) logppx: 6.4704614070154\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:13 INFO 139703431448384] #quality_metric: host=algo-1, epoch=56, train total_loss =6.4704614070154\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:14.048] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 167, \"duration\": 60, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] Loss (name: value) total: 6.722341878073556\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] Loss (name: value) kld: 0.15502436459064484\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] Loss (name: value) recons: 6.567317553928921\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] Loss (name: value) logppx: 6.722341878073556\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] #validation_score (56): 6.722341878073556\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] patience losses:[6.690475940704346, 6.711028507777622, 6.687007495335171, 6.68559319632394, 6.690626212528774] min patience loss:6.68559319632394 current loss:6.722341878073556 absolute loss difference:0.0367486817496161\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:2\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] Timing: train: 0.94s, val: 0.06s, epoch: 1.00s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] #progress_metric: host=algo-1, completed 56.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194353.0468552, \"EndTime\": 1623194354.0513842, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 55, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 216608.0, \"count\": 1, \"min\": 216608, \"max\": 216608}, \"Total Batches Seen\": {\"sum\": 1736.0, \"count\": 1, \"min\": 1736, \"max\": 1736}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 112.0, \"count\": 1, \"min\": 112, \"max\": 112}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3849.814296090972 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] # Starting training for epoch 57\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:13.633] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 161, \"duration\": 961, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] # Finished training epoch 54 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:19:13 INFO 140403258652480] Loss (name: value) total: 6.488760452116689\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] Loss (name: value) kld: 0.1498559324010726\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] Loss (name: value) recons: 6.338904480780324\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] Loss (name: value) logppx: 6.488760452116689\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] #quality_metric: host=algo-2, epoch=54, train total_loss =6.488760452116689\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:13.724] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 161, \"duration\": 89, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] Loss (name: value) total: 6.683248043060303\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] Loss (name: value) kld: 0.14626909792423248\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] Loss (name: value) recons: 6.536978994097028\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] Loss (name: value) logppx: 6.683248043060303\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] #validation_score (54): 6.683248043060303\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] patience losses:[6.689588614872524, 6.723826135907855, 6.69028411592756, 6.683133602142334, 6.6823031561715265] min patience loss:6.6823031561715265 current loss:6.683248043060303 absolute loss difference:0.0009448868887762529\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:2\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] Timing: train: 0.97s, val: 0.09s, epoch: 1.06s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] #progress_metric: host=algo-2, completed 54.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194352.668619, \"EndTime\": 1623194353.726517, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 53, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 209142.0, \"count\": 1, \"min\": 209142, \"max\": 209142}, \"Total Batches Seen\": {\"sum\": 1674.0, \"count\": 1, \"min\": 1674, \"max\": 1674}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 108.0, \"count\": 1, \"min\": 108, \"max\": 108}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3660.4022460116935 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:13 INFO 140403258652480] # Starting training for epoch 55\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:14.942] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 170, \"duration\": 889, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] # Finished training epoch 57 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:19:14 INFO 139703431448384] Loss (name: value) total: 6.468935070499297\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] Loss (name: value) kld: 0.15081719982047234\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] Loss (name: value) recons: 6.318117872361214\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] Loss (name: value) logppx: 6.468935070499297\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:14 INFO 139703431448384] #quality_metric: host=algo-1, epoch=57, train total_loss =6.468935070499297\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:15.012] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 170, \"duration\": 69, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] Loss (name: value) total: 6.675262928009033\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] Loss (name: value) kld: 0.1536182463169098\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] Loss (name: value) recons: 6.521644660404751\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] Loss (name: value) logppx: 6.675262928009033\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] #validation_score (57): 6.675262928009033\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] patience losses:[6.711028507777622, 6.687007495335171, 6.68559319632394, 6.690626212528774, 6.722341878073556] min patience loss:6.68559319632394 current loss:6.675262928009033 absolute loss difference:0.01033026831490691\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] Timing: train: 0.89s, val: 0.08s, epoch: 0.97s\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] #progress_metric: host=algo-1, completed 57.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194354.0518465, \"EndTime\": 1623194355.0205889, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 56, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 220476.0, \"count\": 1, \"min\": 220476, \"max\": 220476}, \"Total Batches Seen\": {\"sum\": 1767.0, \"count\": 1, \"min\": 1767, \"max\": 1767}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 114.0, \"count\": 1, \"min\": 114, \"max\": 114}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3992.221032803082 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] # Starting training for epoch 58\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:14.664] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 164, \"duration\": 937, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] # Finished training epoch 55 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] Loss (name: value) total: 6.485632561868237\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] 
Loss (name: value) kld: 0.1517157072742139\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] Loss (name: value) recons: 6.3339168448602\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] Loss (name: value) logppx: 6.485632561868237\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] #quality_metric: host=algo-2, epoch=55, train total_loss =6.485632561868237\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:14.742] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 164, \"duration\": 76, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] Loss (name: value) total: 6.684352057320731\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] Loss (name: value) kld: 0.14985676961285727\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] Loss (name: value) recons: 6.534495285579136\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] Loss (name: value) logppx: 6.684352057320731\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] #validation_score (55): 6.684352057320731\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] patience losses:[6.723826135907855, 6.69028411592756, 6.683133602142334, 6.6823031561715265, 6.683248043060303] min patience loss:6.6823031561715265 current loss:6.684352057320731 absolute loss difference:0.0020489011492044185\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:3\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] Timing: train: 0.94s, val: 0.08s, epoch: 1.02s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] #progress_metric: host=algo-2, completed 55.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194353.7268448, \"EndTime\": 1623194354.7444324, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 54, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 213015.0, \"count\": 1, \"min\": 213015, \"max\": 213015}, \"Total Batches Seen\": {\"sum\": 1705.0, \"count\": 1, \"min\": 1705, \"max\": 1705}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 110.0, \"count\": 1, \"min\": 110, \"max\": 110}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3805.4210203408975 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:14 INFO 140403258652480] # Starting training for epoch 56\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:15.969] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 173, \"duration\": 947, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] # Finished training epoch 58 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:19:15 INFO 139703431448384] Loss (name: value) total: 6.462677017334969\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] Loss (name: value) kld: 0.14947012174994714\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] Loss (name: value) recons: 6.313206945696185\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] Loss (name: value) logppx: 6.462677017334969\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:15 INFO 139703431448384] #quality_metric: host=algo-1, epoch=58, train total_loss =6.462677017334969\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:16.038] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 173, \"duration\": 68, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Loss (name: value) total: 6.681616374424526\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Loss (name: value) kld: 0.15669986818517959\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Loss (name: value) recons: 6.524916580745152\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Loss (name: value) logppx: 6.681616374424526\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] #validation_score (58): 6.681616374424526\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] patience losses:[6.687007495335171, 6.68559319632394, 6.690626212528774, 6.722341878073556, 6.675262928009033] min patience loss:6.675262928009033 current loss:6.681616374424526 absolute loss difference:0.006353446415492847\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Timing: train: 0.95s, val: 0.07s, epoch: 1.02s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] #progress_metric: host=algo-1, completed 58.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194355.0208404, \"EndTime\": 1623194356.0392976, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 57, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 224344.0, \"count\": 1, \"min\": 224344, \"max\": 224344}, \"Total Batches Seen\": {\"sum\": 1798.0, \"count\": 1, \"min\": 1798, \"max\": 1798}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 116.0, \"count\": 1, \"min\": 116, \"max\": 116}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3797.4045396858537 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] # Starting training for epoch 59\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:15.659] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 167, \"duration\": 914, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] # Finished training epoch 56 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:19:15 INFO 140403258652480] Loss (name: value) total: 6.478448490942678\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] Loss (name: value) kld: 0.15091760805056942\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] Loss (name: value) recons: 6.327530883973645\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] Loss (name: value) logppx: 6.478448490942678\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] #quality_metric: host=algo-2, epoch=56, train total_loss =6.478448490942678\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:15.752] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 167, \"duration\": 90, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] Loss (name: value) total: 6.68073490687779\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] Loss (name: value) kld: 0.14852941461971828\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] Loss (name: value) recons: 6.532205513545445\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] Loss (name: value) logppx: 6.68073490687779\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] #validation_score (56): 6.68073490687779\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] patience losses:[6.69028411592756, 6.683133602142334, 6.6823031561715265, 6.683248043060303, 6.684352057320731] min patience loss:6.6823031561715265 current loss:6.68073490687779 absolute loss difference:0.0015682492937365566\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] Timing: train: 0.92s, val: 0.10s, epoch: 1.01s\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] #progress_metric: host=algo-2, completed 56.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194354.7447479, \"EndTime\": 1623194355.7584586, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 55, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 216888.0, \"count\": 1, \"min\": 216888, \"max\": 216888}, \"Total Batches Seen\": {\"sum\": 1736.0, \"count\": 1, \"min\": 1736, \"max\": 1736}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 112.0, \"count\": 1, \"min\": 112, \"max\": 112}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3820.0819470201104 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:15 INFO 140403258652480] # Starting training for epoch 57\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:16.926] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 176, \"duration\": 887, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] # Finished training epoch 59 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Loss (name: value) total: 6.466279133673637\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] 
Loss (name: value) kld: 0.15278377299827914\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Loss (name: value) recons: 6.313495366804061\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Loss (name: value) logppx: 6.466279133673637\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] #quality_metric: host=algo-1, epoch=59, train total_loss =6.466279133673637\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:16.994] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 176, \"duration\": 65, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Loss (name: value) total: 6.680599689483643\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Loss (name: value) kld: 0.16217031649180821\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Loss (name: value) recons: 6.518429347446987\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Loss (name: value) logppx: 6.680599689483643\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] #validation_score (59): 6.680599689483643\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] patience losses:[6.68559319632394, 6.690626212528774, 6.722341878073556, 6.675262928009033, 6.681616374424526] min patience loss:6.675262928009033 current loss:6.680599689483643 absolute loss difference:0.005336761474609375\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:2\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] Timing: train: 0.89s, val: 0.07s, epoch: 0.96s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] #progress_metric: host=algo-1, completed 59.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194356.0395412, \"EndTime\": 1623194356.9949849, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 58, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 228212.0, \"count\": 1, \"min\": 228212, \"max\": 228212}, \"Total Batches Seen\": {\"sum\": 1829.0, \"count\": 1, \"min\": 1829, \"max\": 1829}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 118.0, \"count\": 1, \"min\": 118, \"max\": 118}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=4047.8074845015794 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:16 INFO 139703431448384] # Starting training for epoch 60\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:16.671] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 170, \"duration\": 908, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] # Finished training epoch 57 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:19:16 INFO 140403258652480] Loss (name: value) total: 6.480872527245553\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] Loss (name: value) kld: 0.15368555317963323\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] Loss (name: value) recons: 6.3271870036279\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] Loss (name: value) logppx: 6.480872527245553\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] #quality_metric: host=algo-2, epoch=57, train total_loss =6.480872527245553\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:16.763] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 170, \"duration\": 90, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] Loss (name: value) total: 6.692234856741769\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] Loss (name: value) kld: 0.16247833413737162\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] Loss (name: value) recons: 6.529756614140102\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] Loss (name: value) logppx: 6.692234856741769\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] #validation_score (57): 6.692234856741769\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] patience losses:[6.683133602142334, 6.6823031561715265, 6.683248043060303, 6.684352057320731, 6.68073490687779] min patience loss:6.68073490687779 current loss:6.692234856741769 absolute loss difference:0.011499949863979175\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] Timing: train: 0.91s, val: 0.09s, epoch: 1.01s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] #progress_metric: host=algo-2, completed 57.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194355.7587621, \"EndTime\": 1623194356.7654533, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 56, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 220761.0, \"count\": 1, \"min\": 220761, \"max\": 220761}, \"Total Batches Seen\": {\"sum\": 1767.0, \"count\": 1, \"min\": 1767, \"max\": 1767}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 114.0, \"count\": 1, \"min\": 114, \"max\": 114}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3845.6951594712605 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:16 INFO 140403258652480] # Starting training for epoch 58\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:17.871] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 179, \"duration\": 876, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] # Finished training epoch 60 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:19:17 INFO 139703431448384] Loss (name: value) total: 6.456482694995019\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] Loss (name: value) kld: 0.1519757499617915\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] Loss (name: value) recons: 6.304506940226401\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] Loss (name: value) logppx: 6.456482694995019\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] #quality_metric: host=algo-1, epoch=60, train total_loss =6.456482694995019\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:17.958] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 179, \"duration\": 85, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] Loss (name: value) total: 6.677128383091518\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] Loss (name: value) kld: 0.16019189996378763\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] Loss (name: value) recons: 6.516936506543841\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] Loss (name: value) logppx: 6.677128383091518\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] #validation_score (60): 6.677128383091518\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] patience losses:[6.690626212528774, 6.722341878073556, 6.675262928009033, 6.681616374424526, 6.680599689483643] min patience loss:6.675262928009033 current loss:6.677128383091518 absolute loss difference:0.0018654550824850347\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:3\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] Timing: train: 0.88s, val: 0.09s, epoch: 0.96s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] #progress_metric: host=algo-1, completed 60.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194356.9952252, \"EndTime\": 1623194357.959989, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 59, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 232080.0, \"count\": 1, \"min\": 232080, \"max\": 232080}, \"Total Batches Seen\": {\"sum\": 1860.0, \"count\": 1, \"min\": 1860, \"max\": 1860}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 120.0, \"count\": 1, \"min\": 120, \"max\": 120}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=4008.7094576830705 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:17 INFO 139703431448384] # Starting training for epoch 61\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:17.684] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 173, \"duration\": 918, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] # Finished training epoch 58 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:19:17 INFO 140403258652480] Loss (name: value) total: 6.477591126195846\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] Loss (name: value) kld: 0.15296106088546016\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] Loss (name: value) recons: 6.324630122030935\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] Loss (name: value) logppx: 6.477591126195846\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] #quality_metric: host=algo-2, epoch=58, train total_loss =6.477591126195846\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:17.759] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 173, \"duration\": 72, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] Loss (name: value) total: 6.677730355943952\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] Loss (name: value) kld: 0.1559468082019261\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] Loss (name: value) recons: 6.521783556256976\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] Loss (name: value) logppx: 6.677730355943952\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] #validation_score (58): 6.677730355943952\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] patience losses:[6.6823031561715265, 6.683248043060303, 6.684352057320731, 6.68073490687779, 6.692234856741769] min patience loss:6.68073490687779 current loss:6.677730355943952 absolute loss difference:0.0030045509338378906\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] Timing: train: 0.92s, val: 0.08s, epoch: 1.00s\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] #progress_metric: host=algo-2, completed 58.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194356.7661815, \"EndTime\": 1623194357.765515, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 57, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 224634.0, \"count\": 1, \"min\": 224634, \"max\": 224634}, \"Total Batches Seen\": {\"sum\": 1798.0, \"count\": 1, \"min\": 1798, \"max\": 1798}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 116.0, \"count\": 1, \"min\": 116, \"max\": 116}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3874.794064062216 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:17 INFO 140403258652480] # Starting training for epoch 59\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:19.018] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 182, \"duration\": 1058, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] # Finished training epoch 61 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] Loss (name: value) total: 6.449966669082642\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] 
Loss (name: value) kld: 0.1536012811045493\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] Loss (name: value) recons: 6.296365372596249\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] Loss (name: value) logppx: 6.449966669082642\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] #quality_metric: host=algo-1, epoch=61, train total_loss =6.449966669082642\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:18.874] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 176, \"duration\": 1107, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] # Finished training epoch 59 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] Loss (name: value) total: 6.466070190552743\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] Loss (name: value) kld: 0.15420611587262922\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] Loss (name: value) recons: 6.311864110731309\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] Loss (name: value) logppx: 6.466070190552743\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] #quality_metric: host=algo-2, epoch=59, train total_loss =6.466070190552743\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:18.957] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 176, \"duration\": 77, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] Loss (name: 
value) total: 6.669989858354841\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] Loss (name: value) kld: 0.15275737004620688\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] Loss (name: value) recons: 6.517232486179897\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] Loss (name: value) logppx: 6.669989858354841\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] #validation_score (59): 6.669989858354841\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] patience losses:[6.683248043060303, 6.684352057320731, 6.68073490687779, 6.692234856741769, 6.677730355943952] min patience loss:6.677730355943952 current loss:6.669989858354841 absolute loss difference:0.007740497589111328\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] Timing: train: 1.11s, val: 0.08s, epoch: 1.20s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] #progress_metric: host=algo-2, completed 59.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194357.765859, \"EndTime\": 1623194358.9633799, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 58, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 228507.0, \"count\": 1, \"min\": 228507, \"max\": 228507}, \"Total Batches Seen\": {\"sum\": 1829.0, \"count\": 1, \"min\": 1829, \"max\": 1829}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 118.0, \"count\": 1, \"min\": 118, \"max\": 118}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 
140403258652480] #throughput_metric: host=algo-2, train throughput=3233.780254370057 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:18 INFO 140403258652480] # Starting training for epoch 60\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:19.086] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 182, \"duration\": 66, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] Loss (name: value) total: 6.677890709468296\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] Loss (name: value) kld: 0.16082421583788736\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] Loss (name: value) recons: 6.517066410609654\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] Loss (name: value) logppx: 6.677890709468296\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] #validation_score (61): 6.677890709468296\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] patience losses:[6.722341878073556, 6.675262928009033, 6.681616374424526, 6.680599689483643, 6.677128383091518] min patience loss:6.675262928009033 current loss:6.677890709468296 absolute loss difference:0.002627781459263012\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:4\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] Timing: train: 1.06s, val: 0.07s, epoch: 1.13s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] #progress_metric: host=algo-1, completed 61.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194357.9602466, \"EndTime\": 1623194359.087367, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 60, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 235948.0, \"count\": 1, \"min\": 235948, \"max\": 235948}, \"Total Batches Seen\": {\"sum\": 1891.0, \"count\": 1, \"min\": 1891, \"max\": 1891}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 122.0, \"count\": 1, \"min\": 122, \"max\": 122}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3431.3451218462415 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] # Starting training for epoch 62\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:19.996] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 185, \"duration\": 908, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] # Finished training epoch 62 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:19:19 INFO 139703431448384] Loss (name: value) total: 6.455840110778809\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] Loss (name: value) kld: 0.1567232281930985\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] Loss (name: value) recons: 6.299116896044824\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] Loss (name: value) logppx: 6.455840110778809\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:19 INFO 139703431448384] #quality_metric: host=algo-1, epoch=62, train total_loss =6.455840110778809\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:20.065] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 185, \"duration\": 67, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] Loss (name: value) total: 6.670565264565604\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] Loss (name: value) kld: 0.15777813536780222\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] Loss (name: value) recons: 6.512787137712751\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] Loss (name: value) logppx: 6.670565264565604\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] #validation_score (62): 6.670565264565604\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] patience losses:[6.675262928009033, 6.681616374424526, 6.680599689483643, 6.677128383091518, 6.677890709468296] min patience loss:6.675262928009033 current loss:6.670565264565604 absolute loss difference:0.004697663443429256\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] Timing: train: 0.91s, val: 0.07s, epoch: 0.98s\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] #progress_metric: host=algo-1, completed 62.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194359.0876057, \"EndTime\": 1623194360.0708504, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 61, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 239816.0, \"count\": 1, \"min\": 239816, \"max\": 239816}, \"Total Batches Seen\": {\"sum\": 1922.0, \"count\": 1, \"min\": 1922, \"max\": 1922}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 124.0, \"count\": 1, \"min\": 124, \"max\": 124}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3933.2855572521157 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] # Starting training for epoch 63\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:19.916] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 179, \"duration\": 952, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] # Finished training epoch 60 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] Loss (name: value) total: 6.47000450857224\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] 
Loss (name: value) kld: 0.1559909107463975\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] Loss (name: value) recons: 6.314013623422192\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] Loss (name: value) logppx: 6.47000450857224\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] #quality_metric: host=algo-2, epoch=60, train total_loss =6.47000450857224\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:19.983] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 179, \"duration\": 65, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] Loss (name: value) total: 6.668963772909982\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] Loss (name: value) kld: 0.1575477123260498\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] Loss (name: value) recons: 6.511416094643729\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] Loss (name: value) logppx: 6.668963772909982\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] #validation_score (60): 6.668963772909982\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] patience losses:[6.684352057320731, 6.68073490687779, 6.692234856741769, 6.677730355943952, 6.669989858354841] min patience loss:6.669989858354841 current loss:6.668963772909982 absolute loss difference:0.0010260854448587153\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] Timing: train: 0.95s, val: 0.07s, epoch: 1.03s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] #progress_metric: host=algo-2, completed 60.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics 
{\"StartTime\": 1623194358.9636629, \"EndTime\": 1623194359.9904492, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 59, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 232380.0, \"count\": 1, \"min\": 232380, \"max\": 232380}, \"Total Batches Seen\": {\"sum\": 1860.0, \"count\": 1, \"min\": 1860, \"max\": 1860}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 120.0, \"count\": 1, \"min\": 120, \"max\": 120}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3771.4769336709687 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:19 INFO 140403258652480] # Starting training for epoch 61\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:20.983] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 188, \"duration\": 911, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] # Finished training epoch 63 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] Loss (name: value) total: 6.453765619185663\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] Loss (name: value) kld: 0.15727893239067448\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] Loss (name: value) recons: 
6.296486719962089\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] Loss (name: value) logppx: 6.453765619185663\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:20 INFO 139703431448384] #quality_metric: host=algo-1, epoch=63, train total_loss =6.453765619185663\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:21.041] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 188, \"duration\": 56, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] Loss (name: value) total: 6.674215384892055\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] Loss (name: value) kld: 0.16436691795076644\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] Loss (name: value) recons: 6.509848526545933\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] Loss (name: value) logppx: 6.674215384892055\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] #validation_score (63): 6.674215384892055\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] patience losses:[6.681616374424526, 6.680599689483643, 6.677128383091518, 6.677890709468296, 6.670565264565604] min patience loss:6.670565264565604 current loss:6.674215384892055 absolute loss difference:0.0036501203264514004\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] Timing: train: 0.91s, val: 0.06s, epoch: 0.97s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] #progress_metric: host=algo-1, completed 63.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194360.0711694, \"EndTime\": 1623194361.0420473, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 62, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 243684.0, \"count\": 1, \"min\": 243684, \"max\": 243684}, \"Total Batches Seen\": {\"sum\": 1953.0, \"count\": 1, \"min\": 1953, \"max\": 1953}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 126.0, \"count\": 1, \"min\": 126, \"max\": 126}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3983.474395231882 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] # Starting training for epoch 64\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:20.958] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 182, \"duration\": 965, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:20 INFO 140403258652480] # Finished training epoch 61 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:20 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:19:20 INFO 140403258652480] Loss (name: value) total: 6.468216746084152\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:20 INFO 140403258652480] Loss (name: value) kld: 0.15796625025330052\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:20 INFO 140403258652480] Loss (name: value) recons: 6.310250543778943\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:20 INFO 140403258652480] Loss (name: value) logppx: 6.468216746084152\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:20 INFO 140403258652480] #quality_metric: host=algo-2, epoch=61, train total_loss =6.468216746084152\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:21.018] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 182, \"duration\": 59, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] Loss (name: value) total: 6.6740593910217285\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] Loss (name: value) kld: 0.16605010202952794\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] Loss (name: value) recons: 6.5080092293875555\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] Loss (name: value) logppx: 6.6740593910217285\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] #validation_score (61): 6.6740593910217285\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] patience losses:[6.68073490687779, 6.692234856741769, 6.677730355943952, 6.669989858354841, 6.668963772909982] min patience loss:6.668963772909982 current loss:6.6740593910217285 absolute loss difference:0.005095618111746525\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] Timing: train: 0.97s, val: 0.06s, epoch: 1.03s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] #progress_metric: host=algo-2, completed 61.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194359.9907277, \"EndTime\": 1623194361.0203905, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 60, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 236253.0, \"count\": 1, \"min\": 236253, \"max\": 236253}, \"Total Batches Seen\": {\"sum\": 1891.0, \"count\": 1, \"min\": 1891, \"max\": 1891}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 122.0, \"count\": 1, \"min\": 122, \"max\": 122}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3760.9490575482037 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] # Starting training for epoch 62\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:21.968] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 191, \"duration\": 926, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] # Finished training epoch 64 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:19:21 INFO 139703431448384] Loss (name: value) total: 6.447790922657136\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] Loss (name: value) kld: 0.15753942187274655\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] Loss (name: value) recons: 6.2902514280811435\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] Loss (name: value) logppx: 6.447790922657136\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:21 INFO 139703431448384] #quality_metric: host=algo-1, epoch=64, train total_loss =6.447790922657136\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:22.030] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 191, \"duration\": 59, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] Loss (name: value) total: 6.663431167602539\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] Loss (name: value) kld: 0.15929278518472398\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] Loss (name: value) recons: 6.504138333456857\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] Loss (name: value) logppx: 6.663431167602539\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] #validation_score (64): 6.663431167602539\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] patience losses:[6.680599689483643, 6.677128383091518, 6.677890709468296, 6.670565264565604, 6.674215384892055] min patience loss:6.670565264565604 current loss:6.663431167602539 absolute loss difference:0.007134096963064884\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] Timing: train: 0.93s, val: 0.07s, epoch: 0.99s\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] #progress_metric: host=algo-1, completed 64.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194361.0422769, \"EndTime\": 1623194362.0362244, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 63, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 247552.0, \"count\": 1, \"min\": 247552, \"max\": 247552}, \"Total Batches Seen\": {\"sum\": 1984.0, \"count\": 1, \"min\": 1984, \"max\": 1984}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 128.0, \"count\": 1, \"min\": 128, \"max\": 128}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3891.0691951934996 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] # Starting training for epoch 65\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:21.958] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 185, \"duration\": 937, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] # Finished training epoch 62 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] Loss (name: value) total: 6.467087887948559\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] 
Loss (name: value) kld: 0.15845014323149959\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] Loss (name: value) recons: 6.308637730536923\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] Loss (name: value) logppx: 6.467087887948559\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:21 INFO 140403258652480] #quality_metric: host=algo-2, epoch=62, train total_loss =6.467087887948559\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:22.019] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 185, \"duration\": 59, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] Loss (name: value) total: 6.672698361533029\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] Loss (name: value) kld: 0.15731676135744369\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] Loss (name: value) recons: 6.515381608690534\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] Loss (name: value) logppx: 6.672698361533029\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] #validation_score (62): 6.672698361533029\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] patience losses:[6.692234856741769, 6.677730355943952, 6.669989858354841, 6.668963772909982, 6.6740593910217285] min patience loss:6.668963772909982 current loss:6.672698361533029 absolute loss difference:0.003734588623046875\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:2\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] Timing: train: 0.94s, val: 0.06s, epoch: 1.00s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] #progress_metric: host=algo-2, completed 62.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194361.020678, \"EndTime\": 1623194362.0207744, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 61, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 240126.0, \"count\": 1, \"min\": 240126, \"max\": 240126}, \"Total Batches Seen\": {\"sum\": 1922.0, \"count\": 1, \"min\": 1922, \"max\": 1922}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 124.0, \"count\": 1, \"min\": 124, \"max\": 124}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3872.0722101756987 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] # Starting training for epoch 63\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:22.956] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 194, \"duration\": 920, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] # Finished training epoch 65 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:19:22 INFO 139703431448384] Loss (name: value) total: 6.446324286922332\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] Loss (name: value) kld: 0.1574169864817973\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] Loss (name: value) recons: 6.288907293350466\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] Loss (name: value) logppx: 6.446324286922332\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:22 INFO 139703431448384] #quality_metric: host=algo-1, epoch=65, train total_loss =6.446324286922332\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:23.028] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 194, \"duration\": 70, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] Loss (name: value) total: 6.666570118495396\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] Loss (name: value) kld: 0.16145168031964982\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] Loss (name: value) recons: 6.505118301936558\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] Loss (name: value) logppx: 6.666570118495396\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] #validation_score (65): 6.666570118495396\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] patience losses:[6.677128383091518, 6.677890709468296, 6.670565264565604, 6.674215384892055, 6.663431167602539] min patience loss:6.663431167602539 current loss:6.666570118495396 absolute loss difference:0.003138950892856762\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] Timing: train: 0.92s, val: 0.07s, epoch: 0.99s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] #progress_metric: host=algo-1, completed 65.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194362.0364838, \"EndTime\": 1623194363.0299838, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 64, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 251420.0, \"count\": 1, \"min\": 251420, \"max\": 251420}, \"Total Batches Seen\": {\"sum\": 2015.0, \"count\": 1, \"min\": 2015, \"max\": 2015}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 130.0, \"count\": 1, \"min\": 130, \"max\": 130}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3892.78990995764 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] # Starting training for epoch 66\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:22.956] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 188, \"duration\": 935, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] # Finished training epoch 63 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:19:22 INFO 140403258652480] Loss (name: value) total: 6.461982780887235\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] Loss (name: value) kld: 0.15825096613937809\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] Loss (name: value) recons: 6.303731837580281\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] Loss (name: value) logppx: 6.461982780887235\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:22 INFO 140403258652480] #quality_metric: host=algo-2, epoch=63, train total_loss =6.461982780887235\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:23.018] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 188, \"duration\": 60, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] Loss (name: value) total: 6.671676022665841\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] Loss (name: value) kld: 0.1637382890496935\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] Loss (name: value) recons: 6.507937840053013\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] Loss (name: value) logppx: 6.671676022665841\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] #validation_score (63): 6.671676022665841\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] patience losses:[6.677730355943952, 6.669989858354841, 6.668963772909982, 6.6740593910217285, 6.672698361533029] min patience loss:6.668963772909982 current loss:6.671676022665841 absolute loss difference:0.002712249755859375\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:3\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] Timing: train: 0.94s, val: 0.06s, epoch: 1.00s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] #progress_metric: host=algo-2, completed 63.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194362.0211422, \"EndTime\": 1623194363.019877, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 62, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 243999.0, \"count\": 1, \"min\": 243999, \"max\": 243999}, \"Total Batches Seen\": {\"sum\": 1953.0, \"count\": 1, \"min\": 1953, \"max\": 1953}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 126.0, \"count\": 1, \"min\": 126, \"max\": 126}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3877.37814304482 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] # Starting training for epoch 64\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:23.986] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 197, \"duration\": 955, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] # Finished training epoch 66 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:19:23 INFO 139703431448384] Loss (name: value) total: 6.446424361198179\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] Loss (name: value) kld: 0.16072159296562594\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] Loss (name: value) recons: 6.285702797674364\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] Loss (name: value) logppx: 6.446424361198179\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:23 INFO 139703431448384] #quality_metric: host=algo-1, epoch=66, train total_loss =6.446424361198179\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:24.046] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 197, \"duration\": 59, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:24 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:24 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:24 INFO 139703431448384] Loss (name: value) total: 6.662403719765799\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:24 INFO 139703431448384] Loss (name: value) kld: 0.16190606781414577\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:24 INFO 139703431448384] Loss (name: value) recons: 6.500497681753976\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:24 INFO 139703431448384] Loss (name: value) logppx: 6.662403719765799\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:24 INFO 139703431448384] #validation_score (66): 6.662403719765799\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:24 INFO 139703431448384] patience losses:[6.677890709468296, 6.670565264565604, 6.674215384892055, 6.663431167602539, 6.666570118495396] min patience loss:6.663431167602539 current loss:6.662403719765799 absolute loss difference:0.0010274478367398032\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:24 INFO 139703431448384] Timing: train: 0.96s, val: 0.06s, epoch: 1.02s\u001b[0m\n", - 
"\u001b[34m[06/08/2021 23:19:24 INFO 139703431448384] #progress_metric: host=algo-1, completed 66.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194363.030255, \"EndTime\": 1623194364.0515509, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 65, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 255288.0, \"count\": 1, \"min\": 255288, \"max\": 255288}, \"Total Batches Seen\": {\"sum\": 2046.0, \"count\": 1, \"min\": 2046, \"max\": 2046}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 132.0, \"count\": 1, \"min\": 132, \"max\": 132}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:24 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3786.825695174353 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:24 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:24 INFO 139703431448384] # Starting training for epoch 67\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:23.989] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 191, \"duration\": 968, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] # Finished training epoch 64 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] Loss (name: value) total: 6.460592865943909\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] Loss 
(name: value) kld: 0.16141018223377965\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] Loss (name: value) recons: 6.299182672654429\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] Loss (name: value) logppx: 6.460592865943909\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:23 INFO 140403258652480] #quality_metric: host=algo-2, epoch=64, train total_loss =6.460592865943909\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:24.046] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 191, \"duration\": 55, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] Loss (name: value) total: 6.6569061279296875\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] Loss (name: value) kld: 0.1613906068461282\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] Loss (name: value) recons: 6.495515414646694\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] Loss (name: value) logppx: 6.6569061279296875\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] #validation_score (64): 6.6569061279296875\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] patience losses:[6.669989858354841, 6.668963772909982, 6.6740593910217285, 6.672698361533029, 6.671676022665841] min patience loss:6.668963772909982 current loss:6.6569061279296875 absolute loss difference:0.01205764498029449\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] Timing: train: 0.97s, val: 0.06s, epoch: 1.03s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] #progress_metric: host=algo-2, completed 64.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics 
{\"StartTime\": 1623194363.0201707, \"EndTime\": 1623194364.05214, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 63, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 247872.0, \"count\": 1, \"min\": 247872, \"max\": 247872}, \"Total Batches Seen\": {\"sum\": 1984.0, \"count\": 1, \"min\": 1984, \"max\": 1984}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 128.0, \"count\": 1, \"min\": 128, \"max\": 128}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3752.5122891620813 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] # Starting training for epoch 65\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:25.082] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 200, \"duration\": 1030, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] # Finished training epoch 67 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] Loss (name: value) total: 6.4369154514804965\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] Loss (name: value) kld: 0.15995240728220633\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] Loss (name: value) recons: 
6.276963030138323\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] Loss (name: value) logppx: 6.4369154514804965\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] #quality_metric: host=algo-1, epoch=67, train total_loss =6.4369154514804965\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:24.982] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 194, \"duration\": 929, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] # Finished training epoch 65 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] Loss (name: value) total: 6.453128537824077\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] Loss (name: value) kld: 0.16108794342125615\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] Loss (name: value) recons: 6.292040617235245\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] Loss (name: value) logppx: 6.453128537824077\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:24 INFO 140403258652480] #quality_metric: host=algo-2, epoch=65, train total_loss =6.453128537824077\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:25.061] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 194, \"duration\": 77, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:25 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:25 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:25 INFO 140403258652480] Loss (name: value) total: 6.655554635184152\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:25 INFO 140403258652480] Loss (name: value) kld: 
0.16526753987584794\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:25 INFO 140403258652480] Loss (name: value) recons: 6.490287167685373\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:25 INFO 140403258652480] Loss (name: value) logppx: 6.655554635184152\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:25 INFO 140403258652480] #validation_score (65): 6.655554635184152\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:25 INFO 140403258652480] patience losses:[6.668963772909982, 6.6740593910217285, 6.672698361533029, 6.671676022665841, 6.6569061279296875] min patience loss:6.6569061279296875 current loss:6.655554635184152 absolute loss difference:0.0013514927455355874\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:25 INFO 140403258652480] Timing: train: 0.93s, val: 0.08s, epoch: 1.01s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:25 INFO 140403258652480] #progress_metric: host=algo-2, completed 65.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194364.0524375, \"EndTime\": 1623194365.065849, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 64, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 251745.0, \"count\": 1, \"min\": 251745, \"max\": 251745}, \"Total Batches Seen\": {\"sum\": 2015.0, \"count\": 1, \"min\": 2015, \"max\": 2015}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 130.0, \"count\": 1, \"min\": 130, \"max\": 130}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:25 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3821.198004123106 records/second\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:19:25 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:25 INFO 140403258652480] # Starting training for epoch 66\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:25.149] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 200, \"duration\": 65, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] Loss (name: value) total: 6.656591074807303\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] Loss (name: value) kld: 0.16315475744860514\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] Loss (name: value) recons: 6.493436472756522\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] Loss (name: value) logppx: 6.656591074807303\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] #validation_score (67): 6.656591074807303\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] patience losses:[6.670565264565604, 6.674215384892055, 6.663431167602539, 6.666570118495396, 6.662403719765799] min patience loss:6.662403719765799 current loss:6.656591074807303 absolute loss difference:0.005812644958496094\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] Timing: train: 1.03s, val: 0.07s, epoch: 1.10s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] #progress_metric: host=algo-1, completed 67.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194364.0518334, \"EndTime\": 1623194365.154302, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 66, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 
259156.0, \"count\": 1, \"min\": 259156, \"max\": 259156}, \"Total Batches Seen\": {\"sum\": 2077.0, \"count\": 1, \"min\": 2077, \"max\": 2077}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 134.0, \"count\": 1, \"min\": 134, \"max\": 134}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3508.063175361773 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:25 INFO 139703431448384] # Starting training for epoch 68\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:26.082] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 203, \"duration\": 927, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] # Finished training epoch 68 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] Loss (name: value) total: 6.443859607942643\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] Loss (name: value) kld: 0.1633695773780346\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] Loss (name: value) recons: 6.280490071542801\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] Loss (name: value) logppx: 6.443859607942643\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] #quality_metric: host=algo-1, epoch=68, train total_loss 
=6.443859607942643\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:26.031] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 197, \"duration\": 964, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] # Finished training epoch 66 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] Loss (name: value) total: 6.459586285775708\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] Loss (name: value) kld: 0.1650759122304378\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] Loss (name: value) recons: 6.2945104106780025\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] Loss (name: value) logppx: 6.459586285775708\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] #quality_metric: host=algo-2, epoch=66, train total_loss =6.459586285775708\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:26.105] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 197, \"duration\": 73, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] Loss (name: value) total: 6.666655472346714\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] Loss (name: value) kld: 0.16357360993112838\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] Loss (name: value) recons: 6.503081934792655\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] Loss (name: value) logppx: 6.666655472346714\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] #validation_score (66): 6.666655472346714\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] patience losses:[6.6740593910217285, 6.672698361533029, 6.671676022665841, 6.6569061279296875, 6.655554635184152] min patience loss:6.655554635184152 current loss:6.666655472346714 absolute loss difference:0.011100837162562271\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] Bad epoch: loss has not improved (enough). Bad count:1\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] Timing: train: 0.97s, val: 0.07s, epoch: 1.04s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] #progress_metric: host=algo-2, completed 66.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194365.0661278, \"EndTime\": 1623194366.1068618, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 65, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 255618.0, \"count\": 1, \"min\": 255618, \"max\": 255618}, \"Total Batches Seen\": {\"sum\": 2046.0, \"count\": 1, \"min\": 2046, \"max\": 2046}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 132.0, \"count\": 1, \"min\": 132, \"max\": 132}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3720.894403313786 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:26 INFO 140403258652480] # Starting training for 
epoch 67\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:26.140] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 203, \"duration\": 55, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] Loss (name: value) total: 6.658583641052246\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] Loss (name: value) kld: 0.1679336620228631\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] Loss (name: value) recons: 6.490650040762765\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] Loss (name: value) logppx: 6.658583641052246\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] #validation_score (68): 6.658583641052246\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] patience losses:[6.674215384892055, 6.663431167602539, 6.666570118495396, 6.662403719765799, 6.656591074807303] min patience loss:6.656591074807303 current loss:6.658583641052246 absolute loss difference:0.0019925662449429282\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] Timing: train: 0.93s, val: 0.06s, epoch: 0.99s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] #progress_metric: host=algo-1, completed 68.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194365.1545813, \"EndTime\": 1623194366.1422343, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 67, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 263024.0, \"count\": 1, \"min\": 263024, \"max\": 263024}, \"Total Batches Seen\": {\"sum\": 2108.0, \"count\": 1, \"min\": 2108, \"max\": 2108}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 136.0, \"count\": 1, \"min\": 136, \"max\": 136}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3915.721851801524 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:26 INFO 139703431448384] # Starting training for epoch 69\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:27.048] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 206, \"duration\": 905, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] # Finished training epoch 69 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 
23:19:27 INFO 139703431448384] Loss (name: value) total: 6.429637662826046\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] Loss (name: value) kld: 0.160144591163243\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] Loss (name: value) recons: 6.26949304919089\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] Loss (name: value) logppx: 6.429637662826046\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] #quality_metric: host=algo-1, epoch=69, train total_loss =6.429637662826046\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:27.055] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 200, \"duration\": 948, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] # Finished training epoch 67 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] Loss (name: value) total: 6.446748495101929\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] Loss (name: value) kld: 0.16171543985124556\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] Loss (name: value) recons: 6.28503304912198\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] Loss (name: value) logppx: 6.446748495101929\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] #quality_metric: host=algo-2, epoch=67, train total_loss =6.446748495101929\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:27.113] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 200, \"duration\": 56, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:19:27 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] Loss (name: value) total: 6.662278175354004\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] Loss (name: value) kld: 0.17767012545040675\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] Loss (name: value) recons: 6.484608037131173\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] Loss (name: value) logppx: 6.662278175354004\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] #validation_score (67): 6.662278175354004\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] patience losses:[6.672698361533029, 6.671676022665841, 6.6569061279296875, 6.655554635184152, 6.666655472346714] min patience loss:6.655554635184152 current loss:6.662278175354004 absolute loss difference:0.006723540169851994\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:2\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] Timing: train: 0.95s, val: 0.06s, epoch: 1.01s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] #progress_metric: host=algo-2, completed 67.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194366.107135, \"EndTime\": 1623194367.114215, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 66, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 259491.0, \"count\": 1, \"min\": 259491, \"max\": 259491}, \"Total Batches Seen\": {\"sum\": 2077.0, \"count\": 1, \"min\": 2077, \"max\": 2077}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 134.0, \"count\": 1, \"min\": 134, \"max\": 134}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3845.23818239231 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:27 INFO 140403258652480] # Starting training for epoch 68\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:27.107] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 206, \"duration\": 57, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 
139703431448384] Loss (name: value) total: 6.664823191506522\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] Loss (name: value) kld: 0.18049034689153945\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] Loss (name: value) recons: 6.48433290209089\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] Loss (name: value) logppx: 6.664823191506522\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] #validation_score (69): 6.664823191506522\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] patience losses:[6.663431167602539, 6.666570118495396, 6.662403719765799, 6.656591074807303, 6.658583641052246] min patience loss:6.656591074807303 current loss:6.664823191506522 absolute loss difference:0.00823211669921875\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] Bad epoch: loss has not improved (enough). Bad count:2\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] Timing: train: 0.91s, val: 0.06s, epoch: 0.97s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] #progress_metric: host=algo-1, completed 69.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194366.1425526, \"EndTime\": 1623194367.108762, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 68, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 266892.0, \"count\": 1, \"min\": 266892, \"max\": 266892}, \"Total Batches Seen\": {\"sum\": 2139.0, \"count\": 1, \"min\": 2139, \"max\": 2139}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 138.0, \"count\": 1, \"min\": 138, \"max\": 138}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number 
of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=4002.6013338412376 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:27 INFO 139703431448384] # Starting training for epoch 70\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:28.050] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 209, \"duration\": 940, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] # Finished training epoch 70 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] Loss (name: value) total: 6.435725873516452\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] Loss (name: value) kld: 0.16338768829741784\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] Loss (name: value) recons: 6.272338186540911\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] Loss (name: value) logppx: 6.435725873516452\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] #quality_metric: host=algo-1, epoch=70, train total_loss =6.435725873516452\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:28.042] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 203, \"duration\": 927, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] # Finished training epoch 68 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] 
Loss (name: value) total: 6.452178808950609\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] Loss (name: value) kld: 0.16514470091750544\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] Loss (name: value) recons: 6.287034046265386\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] Loss (name: value) logppx: 6.452178808950609\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] #quality_metric: host=algo-2, epoch=68, train total_loss =6.452178808950609\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:28.118] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 203, \"duration\": 73, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] Loss (name: value) total: 6.656955787113735\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] Loss (name: value) kld: 0.16866027883120946\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] Loss (name: value) recons: 6.488295486995152\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] Loss (name: value) logppx: 6.656955787113735\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] #validation_score (68): 6.656955787113735\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] patience losses:[6.671676022665841, 6.6569061279296875, 6.655554635184152, 6.666655472346714, 6.662278175354004] min patience loss:6.655554635184152 current loss:6.656955787113735 absolute loss difference:0.001401151929583122\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:3\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] Timing: train: 0.93s, val: 0.08s, epoch: 1.00s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] #progress_metric: host=algo-2, completed 68.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194367.1144717, \"EndTime\": 1623194368.119349, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 67, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 263364.0, \"count\": 1, \"min\": 263364, \"max\": 263364}, \"Total Batches Seen\": {\"sum\": 2108.0, \"count\": 1, \"min\": 2108, \"max\": 2108}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 136.0, \"count\": 1, \"min\": 136, \"max\": 136}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3853.62851343696 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:28 INFO 140403258652480] # Starting training for epoch 69\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:28.124] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 209, \"duration\": 72, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 
139703431448384] Loss (name: value) total: 6.661276204245431\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] Loss (name: value) kld: 0.16866027883120946\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] Loss (name: value) recons: 6.492615904126849\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] Loss (name: value) logppx: 6.661276204245431\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] #validation_score (70): 6.661276204245431\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] patience losses:[6.666570118495396, 6.662403719765799, 6.656591074807303, 6.658583641052246, 6.664823191506522] min patience loss:6.656591074807303 current loss:6.661276204245431 absolute loss difference:0.004685129438128044\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] Bad epoch: loss has not improved (enough). Bad count:3\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] Timing: train: 0.94s, val: 0.07s, epoch: 1.02s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] #progress_metric: host=algo-1, completed 70.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194367.1090524, \"EndTime\": 1623194368.1255898, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 69, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 270760.0, \"count\": 1, \"min\": 270760, \"max\": 270760}, \"Total Batches Seen\": {\"sum\": 2170.0, \"count\": 1, \"min\": 2170, \"max\": 2170}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 140.0, \"count\": 1, \"min\": 140, \"max\": 140}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, 
\"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3804.550074174756 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:28 INFO 139703431448384] # Starting training for epoch 71\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:29.044] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 212, \"duration\": 918, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] # Finished training epoch 71 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] Loss (name: value) total: 6.430116249668982\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] Loss (name: value) kld: 0.16282161084874983\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] Loss (name: value) recons: 6.267294622236682\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] Loss (name: value) logppx: 6.430116249668982\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] #quality_metric: host=algo-1, epoch=71, train total_loss =6.430116249668982\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:29.080] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 206, \"duration\": 960, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] # Finished training epoch 69 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 
140403258652480] Loss (name: value) total: 6.444919105499022\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] Loss (name: value) kld: 0.16481420337673156\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] Loss (name: value) recons: 6.280104860182731\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] Loss (name: value) logppx: 6.444919105499022\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] #quality_metric: host=algo-2, epoch=69, train total_loss =6.444919105499022\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:29.151] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 206, \"duration\": 67, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] Loss (name: value) total: 6.654708930424282\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] Loss (name: value) kld: 0.17239811164992197\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] Loss (name: value) recons: 6.482310771942139\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] Loss (name: value) logppx: 6.654708930424282\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] #validation_score (69): 6.654708930424282\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] patience losses:[6.6569061279296875, 6.655554635184152, 6.666655472346714, 6.662278175354004, 6.656955787113735] min patience loss:6.655554635184152 current loss:6.654708930424282 absolute loss difference:0.0008457047598700029\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:4\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] Timing: train: 0.96s, val: 0.08s, epoch: 1.04s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] #progress_metric: host=algo-2, completed 69.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194368.119665, \"EndTime\": 1623194369.1600218, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 68, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 267237.0, \"count\": 1, \"min\": 267237, \"max\": 267237}, \"Total Batches Seen\": {\"sum\": 2139.0, \"count\": 1, \"min\": 2139, \"max\": 2139}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 138.0, \"count\": 1, \"min\": 138, \"max\": 138}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3721.43227591566 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:29 INFO 140403258652480] # Starting training for epoch 70\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:29.110] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 212, \"duration\": 65, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 
139703431448384] Loss (name: value) total: 6.670550210135324\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] Loss (name: value) kld: 0.1764322498014995\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] Loss (name: value) recons: 6.494117941175189\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] Loss (name: value) logppx: 6.670550210135324\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] #validation_score (71): 6.670550210135324\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] patience losses:[6.662403719765799, 6.656591074807303, 6.658583641052246, 6.664823191506522, 6.661276204245431] min patience loss:6.656591074807303 current loss:6.670550210135324 absolute loss difference:0.013959135328020622\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] Bad epoch: loss has not improved (enough). Bad count:4\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] Timing: train: 0.92s, val: 0.07s, epoch: 0.99s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] #progress_metric: host=algo-1, completed 71.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194368.12583, \"EndTime\": 1623194369.1119826, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 70, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 274628.0, \"count\": 1, \"min\": 274628, \"max\": 274628}, \"Total Batches Seen\": {\"sum\": 2201.0, \"count\": 1, \"min\": 2201, \"max\": 2201}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 142.0, \"count\": 1, \"min\": 142, \"max\": 142}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number 
of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3921.775103026693 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:29 INFO 139703431448384] # Starting training for epoch 72\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:30.057] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 215, \"duration\": 945, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] # Finished training epoch 72 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] Loss (name: value) total: 6.429529524618579\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] Loss (name: value) kld: 0.16501144728352946\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] Loss (name: value) recons: 6.264518064837302\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] Loss (name: value) logppx: 6.429529524618579\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] #quality_metric: host=algo-1, epoch=72, train total_loss =6.429529524618579\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:30.113] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 209, \"duration\": 952, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] # Finished training epoch 70 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] 
Loss (name: value) total: 6.44409659601027\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] Loss (name: value) kld: 0.16728067013525194\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] Loss (name: value) recons: 6.276815925875018\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] Loss (name: value) logppx: 6.44409659601027\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] #quality_metric: host=algo-2, epoch=70, train total_loss =6.44409659601027\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:30.174] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 209, \"duration\": 59, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] Loss (name: value) total: 6.658991064344134\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] Loss (name: value) kld: 0.1658152320555278\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] Loss (name: value) recons: 6.493175779070173\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] Loss (name: value) logppx: 6.658991064344134\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] #validation_score (70): 6.658991064344134\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] patience losses:[6.655554635184152, 6.666655472346714, 6.662278175354004, 6.656955787113735, 6.654708930424282] min patience loss:6.654708930424282 current loss:6.658991064344134 absolute loss difference:0.004282133919851994\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:5\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] Timing: train: 0.95s, val: 0.06s, epoch: 1.01s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] #progress_metric: host=algo-2, completed 70.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194369.160657, \"EndTime\": 1623194370.1758206, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 69, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 271110.0, \"count\": 1, \"min\": 271110, \"max\": 271110}, \"Total Batches Seen\": {\"sum\": 2170.0, \"count\": 1, \"min\": 2170, \"max\": 2170}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 140.0, \"count\": 1, \"min\": 140, \"max\": 140}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3814.4514142048142 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:30 INFO 140403258652480] # Starting training for epoch 71\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:30.140] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 215, \"duration\": 81, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 
139703431448384] Loss (name: value) total: 6.661265577588763\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] Loss (name: value) kld: 0.1696037713970457\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] Loss (name: value) recons: 6.491661752973284\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] Loss (name: value) logppx: 6.661265577588763\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] #validation_score (72): 6.661265577588763\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] patience losses:[6.656591074807303, 6.658583641052246, 6.664823191506522, 6.661276204245431, 6.670550210135324] min patience loss:6.656591074807303 current loss:6.661265577588763 absolute loss difference:0.004674502781459644\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] Bad epoch: loss has not improved (enough). Bad count:5\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] Timing: train: 0.95s, val: 0.08s, epoch: 1.03s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] #progress_metric: host=algo-1, completed 72.0 % of epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194369.1122649, \"EndTime\": 1623194370.1418765, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 71, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 278496.0, \"count\": 1, \"min\": 278496, \"max\": 278496}, \"Total Batches Seen\": {\"sum\": 2232.0, \"count\": 1, \"min\": 2232, \"max\": 2232}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 144.0, \"count\": 1, \"min\": 144, \"max\": 144}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number 
of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=3756.151109464716 records/second\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] \u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:30 INFO 139703431448384] # Starting training for epoch 73\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:31.012] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 218, \"duration\": 870, \"num_examples\": 31, \"num_bytes\": 1452508}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] # Finished training epoch 73 on 3868 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] Metrics for Training:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] Loss (name: value) total: 6.437948799902393\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] Loss (name: value) kld: 0.16708097811187467\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] Loss (name: value) recons: 6.2708677784089115\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] Loss (name: value) logppx: 6.437948799902393\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] #quality_metric: host=algo-1, epoch=73, train total_loss =6.437948799902393\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:31.084] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 218, \"duration\": 70, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] Loss (name: 
value) total: 6.669911929539272\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] Loss (name: value) kld: 0.16847902536392212\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] Loss (name: value) recons: 6.5014328956604\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] Loss (name: value) logppx: 6.669911929539272\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] #validation_score (73): 6.669911929539272\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] patience losses:[6.658583641052246, 6.664823191506522, 6.661276204245431, 6.670550210135324, 6.661265577588763] min patience loss:6.658583641052246 current loss:6.669911929539272 absolute loss difference:0.01132828848702605\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] Bad epoch: loss has not improved (enough). Bad count:6\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] Bad epochs exceeded patience. Stopping training early!\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] Timing: train: 0.87s, val: 0.07s, epoch: 0.94s\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] Early stop condition met. 
Stopping training.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] #progress_metric: host=algo-1, completed 100 % epochs\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194370.1421127, \"EndTime\": 1623194371.0864322, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\", \"epoch\": 72, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 282364.0, \"count\": 1, \"min\": 282364, \"max\": 282364}, \"Total Batches Seen\": {\"sum\": 2263.0, \"count\": 1, \"min\": 2263, \"max\": 2263}, \"Max Records Seen Between Resets\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 146.0, \"count\": 1, \"min\": 146, \"max\": 146}, \"Number of Records Since Last Reset\": {\"sum\": 3868.0, \"count\": 1, \"min\": 3868, \"max\": 3868}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:31 INFO 139703431448384] #throughput_metric: host=algo-1, train throughput=4095.363077339397 records/second\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:31.105] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 212, \"duration\": 928, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] # Finished training epoch 71 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Loss (name: value) total: 6.450904342436021\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Loss (name: value) kld: 0.16861538973546797\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Loss (name: value) recons: 
6.28228888973113\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Loss (name: value) logppx: 6.450904342436021\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] #quality_metric: host=algo-2, epoch=71, train total_loss =6.450904342436021\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:31.170] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 212, \"duration\": 64, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Loss (name: value) total: 6.652386733463833\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Loss (name: value) kld: 0.169691801071167\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Loss (name: value) recons: 6.4826949664524625\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Loss (name: value) logppx: 6.652386733463833\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] #validation_score (71): 6.652386733463833\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] patience losses:[6.666655472346714, 6.662278175354004, 6.656955787113735, 6.654708930424282, 6.658991064344134] min patience loss:6.654708930424282 current loss:6.652386733463833 absolute loss difference:0.0023221969604492188\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Timing: train: 0.93s, val: 0.07s, epoch: 1.00s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] #progress_metric: host=algo-2, completed 71.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194370.1762898, \"EndTime\": 1623194371.1759996, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", 
\"Operation\": \"training\", \"epoch\": 70, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 274983.0, \"count\": 1, \"min\": 274983, \"max\": 274983}, \"Total Batches Seen\": {\"sum\": 2201.0, \"count\": 1, \"min\": 2201, \"max\": 2201}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 142.0, \"count\": 1, \"min\": 142, \"max\": 142}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=3873.625239440704 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] # Starting training for epoch 72\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:31.876] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 215, \"duration\": 699, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] # Finished training epoch 72 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Loss (name: value) total: 6.424685385919386\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Loss (name: value) kld: 0.16629477482168906\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Loss (name: value) recons: 6.258390634290634\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Loss (name: value) logppx: 6.424685385919386\u001b[0m\n", - 
"\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] #quality_metric: host=algo-2, epoch=72, train total_loss =6.424685385919386\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:31.952] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 215, \"duration\": 75, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Loss (name: value) total: 6.658300399780273\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Loss (name: value) kld: 0.16995217118944442\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Loss (name: value) recons: 6.488348211560931\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Loss (name: value) logppx: 6.658300399780273\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] #validation_score (72): 6.658300399780273\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] patience losses:[6.662278175354004, 6.656955787113735, 6.654708930424282, 6.658991064344134, 6.652386733463833] min patience loss:6.652386733463833 current loss:6.658300399780273 absolute loss difference:0.005913666316440747\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:1\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] Timing: train: 0.70s, val: 0.08s, epoch: 0.78s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] #progress_metric: host=algo-2, completed 72.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194371.1762474, \"EndTime\": 1623194371.9535005, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 71, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 278856.0, \"count\": 1, \"min\": 278856, \"max\": 278856}, \"Total Batches Seen\": {\"sum\": 2232.0, \"count\": 1, \"min\": 2232, \"max\": 2232}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 144.0, \"count\": 1, \"min\": 144, \"max\": 144}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=4982.072126011202 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:31 INFO 140403258652480] # Starting training for epoch 73\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:32.665] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 218, \"duration\": 711, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] # Finished training epoch 73 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:19:32 INFO 140403258652480] Loss (name: value) total: 6.4154576793793705\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] Loss (name: value) kld: 0.16860893825369497\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] Loss (name: value) recons: 6.246848767803561\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] Loss (name: value) logppx: 6.4154576793793705\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] #quality_metric: host=algo-2, epoch=73, train total_loss =6.4154576793793705\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:32.725] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 218, \"duration\": 57, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] Loss (name: value) total: 6.656081404004778\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] Loss (name: value) kld: 0.17290431899683817\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] Loss (name: value) recons: 6.483177116938999\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] Loss (name: value) logppx: 6.656081404004778\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] #validation_score (73): 6.656081404004778\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] patience losses:[6.656955787113735, 6.654708930424282, 6.658991064344134, 6.652386733463833, 6.658300399780273] min patience loss:6.652386733463833 current loss:6.656081404004778 absolute loss difference:0.0036946705409457437\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:2\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] Timing: train: 0.71s, val: 0.06s, epoch: 0.77s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] #progress_metric: host=algo-2, completed 73.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194371.953774, \"EndTime\": 1623194372.7263746, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 72, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 282729.0, \"count\": 1, \"min\": 282729, \"max\": 282729}, \"Total Batches Seen\": {\"sum\": 2263.0, \"count\": 1, \"min\": 2263, \"max\": 2263}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 146.0, \"count\": 1, \"min\": 146, \"max\": 146}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=5012.055930514799 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:32 INFO 140403258652480] # Starting training for epoch 74\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:33.446] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 221, \"duration\": 719, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] # Finished training epoch 74 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:19:33 INFO 140403258652480] Loss (name: value) total: 6.40968152399986\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] Loss (name: value) kld: 0.16973936726008693\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] Loss (name: value) recons: 6.239942150731241\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] Loss (name: value) logppx: 6.40968152399986\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] #quality_metric: host=algo-2, epoch=74, train total_loss =6.40968152399986\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:33.503] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 221, \"duration\": 55, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] Loss (name: value) total: 6.666548592703683\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] Loss (name: value) kld: 0.17217808536120824\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] Loss (name: value) recons: 6.494370664869036\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] Loss (name: value) logppx: 6.666548592703683\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] #validation_score (74): 6.666548592703683\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] patience losses:[6.654708930424282, 6.658991064344134, 6.652386733463833, 6.658300399780273, 6.656081404004778] min patience loss:6.652386733463833 current loss:6.666548592703683 absolute loss difference:0.014161859239850472\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:3\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] Timing: train: 0.72s, val: 0.06s, epoch: 0.78s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] #progress_metric: host=algo-2, completed 74.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194372.7267487, \"EndTime\": 1623194373.505339, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 73, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 286602.0, \"count\": 1, \"min\": 286602, \"max\": 286602}, \"Total Batches Seen\": {\"sum\": 2294.0, \"count\": 1, \"min\": 2294, \"max\": 2294}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 148.0, \"count\": 1, \"min\": 148, \"max\": 148}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=4973.408392853152 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:33 INFO 140403258652480] # Starting training for epoch 75\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:34.266] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 224, \"duration\": 760, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] # Finished training epoch 75 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:19:34 INFO 140403258652480] Loss (name: value) total: 6.408468634851517\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] Loss (name: value) kld: 0.17197414030951838\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] Loss (name: value) recons: 6.236494521940908\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] Loss (name: value) logppx: 6.408468634851517\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] #quality_metric: host=algo-2, epoch=75, train total_loss =6.408468634851517\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:34.324] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 224, \"duration\": 56, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] Loss (name: value) total: 6.659877027784075\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] Loss (name: value) kld: 0.17903506330081395\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] Loss (name: value) recons: 6.480841909136091\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] Loss (name: value) logppx: 6.659877027784075\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] #validation_score (75): 6.659877027784075\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] patience losses:[6.658991064344134, 6.652386733463833, 6.658300399780273, 6.656081404004778, 6.666548592703683] min patience loss:6.652386733463833 current loss:6.659877027784075 absolute loss difference:0.007490294320242619\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:4\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] Timing: train: 0.76s, val: 0.06s, epoch: 0.82s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] #progress_metric: host=algo-2, completed 75.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194373.505637, \"EndTime\": 1623194374.325788, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 74, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 290475.0, \"count\": 1, \"min\": 290475, \"max\": 290475}, \"Total Batches Seen\": {\"sum\": 2325.0, \"count\": 1, \"min\": 2325, \"max\": 2325}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 150.0, \"count\": 1, \"min\": 150, \"max\": 150}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=4721.588251878661 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:34 INFO 140403258652480] # Starting training for epoch 76\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:35.058] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 227, \"duration\": 732, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] # Finished training epoch 76 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:19:35 INFO 140403258652480] Loss (name: value) total: 6.40226448351337\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) kld: 0.17277954434675555\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) recons: 6.229484973415252\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) logppx: 6.40226448351337\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] #quality_metric: host=algo-2, epoch=76, train total_loss =6.40226448351337\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:35.118] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 227, \"duration\": 58, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) total: 6.660318647112165\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) kld: 0.17280201401029313\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) recons: 6.48751653943743\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) logppx: 6.660318647112165\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] #validation_score (76): 6.660318647112165\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] patience losses:[6.652386733463833, 6.658300399780273, 6.656081404004778, 6.666548592703683, 6.659877027784075] min patience loss:6.652386733463833 current loss:6.660318647112165 absolute loss difference:0.007931913648332234\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Bad epoch: loss has not improved (enough). 
Bad count:5\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Timing: train: 0.73s, val: 0.06s, epoch: 0.79s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] #progress_metric: host=algo-2, completed 76.0 % of epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194374.3260279, \"EndTime\": 1623194375.1196294, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 75, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 294348.0, \"count\": 1, \"min\": 294348, \"max\": 294348}, \"Total Batches Seen\": {\"sum\": 2356.0, \"count\": 1, \"min\": 2356, \"max\": 2356}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 152.0, \"count\": 1, \"min\": 152, \"max\": 152}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=4879.280384706649 records/second\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] \u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] # Starting training for epoch 77\u001b[0m\n", - "\u001b[34m[2021-06-08 23:19:35.982] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 221, \"duration\": 65, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:35 INFO 139703431448384] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:35 INFO 139703431448384] Metrics for Inference:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:35 INFO 
139703431448384] Loss (name: value) total: 6.664144515991211\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:35 INFO 139703431448384] Loss (name: value) kld: 0.1597729985203062\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:35 INFO 139703431448384] Loss (name: value) recons: 6.504371438707624\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:35 INFO 139703431448384] Loss (name: value) logppx: 6.664144515991211\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:35 INFO 139703431448384] #quality_metric: host=algo-1, epoch=73, validation total_loss =6.664144515991211\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:35 INFO 139703431448384] Loss of server-side model: 6.664144515991211\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:35 INFO 139703431448384] Best model based on early stopping at epoch 67. Best loss: 6.656591074807303\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] Topics from epoch:final (num_topics:20) [, tu 0.72]:\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 773 679 1931 419 418 3 157 58 1129 779 118 11 1103 376 884 542 111 1150 846 979\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 592 571 1960 1370 1541 422 320 1584 93 1583 570 1103 934 295 1191 51 902 1200 303 575\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 841 594 1572 791 389 1134 152 538 1573 1641 452 898 1296 928 231 687 1651 1096 1085 1654\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 749 1681 1696 448 1183 907 908 1488 612 148 1618 361 1244 1119 1959 935 856 1152 1125 1416\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 524 1780 668 1524 949 47 1635 709 1975 1213 179 843 732 730 1522 847 1111 684 353 525\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 1366 1328 697 1902 504 1607 1988 706 1872 1955 673 1973 1990 742 1987 890 1269 512 1299 1119\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 1234 518 1512 36 884 1175 673 1400 510 179 659 1264 1807 
127 920 1041 1081 257 1563 1671\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 899 912 1571 751 120 1483 300 176 1900 301 1644 825 646 299 177 169 900 519 914 824\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 1554 1128 1227 1031 1662 1114 546 122 1634 1348 1899 1679 964 1157 1063 230 1289 180 1306 1532\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 1148 608 318 1264 868 1800 250 495 510 1296 1915 1297 1870 1201 1564 121 1441 698 1081 1200\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 1148 1780 659 1915 868 949 413 1689 691 353 1634 742 1772 1031 1524 1522 730 1374 180 732\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 524 179 1148 1524 733 1780 668 1522 1111 913 807 1138 189 1523 698 1027 667 1441 77 1366\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 524 179 1780 1309 1138 1993 1522 1524 1644 189 1148 1855 1635 135 936 1125 789 1523 1111 1213\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 446 140 142 1499 0 746 1300 659 118 141 157 1601 162 1070 96 433 139 1780 949 102\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 426 1179 1313 817 1127 972 1779 1568 1437 1315 1576 729 1966 1826 486 1295 689 293 1197 1235\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 62 484 1429 1917 1959 205 158 1450 1815 1837 304 179 255 1309 767 1936 576 1183 874 733\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 1234 413 1635 1522 667 150 1780 1052 524 1245 179 1148 789 1955 1652 1006 1843 1993 1655 518\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 1234 318 1080 1081 455 1268 172 413 794 1098 3 1264 353 504 1122 1175 973 884 1685 949\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] 824 732 1128 179 1125 733 1031 676 964 684 1780 1522 174 949 37 108 1524 1227 807 237\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 
139703431448384] 1264 413 824 518 38 1245 673 973 1745 1522 1234 353 250 218 1157 1175 682 1386 659 319\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] Serializing model to /opt/ml/model/model_algo-1\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] Saved checkpoint to \"/tmp/tmpkmfh515x/state-0001.params\"\u001b[0m\n", - "\u001b[34m[06/08/2021 23:19:36 INFO 139703431448384] Test data is not provided.\u001b[0m\n", - "\u001b[34m#metrics {\"StartTime\": 1623194295.5061681, \"EndTime\": 1623194376.0876753, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-1\", \"Operation\": \"training\"}, \"Metrics\": {\"initialize.time\": {\"sum\": 743.8404560089111, \"count\": 1, \"min\": 743.8404560089111, \"max\": 743.8404560089111}, \"epochs\": {\"sum\": 100.0, \"count\": 1, \"min\": 100, \"max\": 100}, \"model.score.time\": {\"sum\": 5502.745628356934, \"count\": 74, \"min\": 57.010650634765625, \"max\": 160.45522689819336}, \"early_stop.time\": {\"sum\": 5669.4016456604, \"count\": 73, \"min\": 57.80625343322754, \"max\": 164.24846649169922}, \"update.time\": {\"sum\": 74785.13598442078, \"count\": 73, \"min\": 920.3245639801025, \"max\": 1277.8825759887695}, \"finalize.time\": {\"sum\": 165.69781303405762, \"count\": 1, \"min\": 165.69781303405762, \"max\": 165.69781303405762}, \"model.serialize.time\": {\"sum\": 4.55474853515625, \"count\": 1, \"min\": 4.55474853515625, \"max\": 4.55474853515625}, \"setuptime\": {\"sum\": 1060.2984428405762, \"count\": 1, \"min\": 1060.2984428405762, \"max\": 1060.2984428405762}, \"totaltime\": {\"sum\": 81677.8953075409, \"count\": 1, \"min\": 81677.8953075409, \"max\": 81677.8953075409}}}\n", - "\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:35.855] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 230, \"duration\": 734, \"num_examples\": 31, \"num_bytes\": 1413316}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] # Finished 
training epoch 77 on 3873 examples from 31 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Metrics for Training:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) total: 6.402825109420284\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) kld: 0.17584915338985382\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) recons: 6.226975960116232\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) logppx: 6.402825109420284\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] #quality_metric: host=algo-2, epoch=77, train total_loss =6.402825109420284\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:35.911] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 230, \"duration\": 55, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) total: 6.667810644422259\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) kld: 0.17939407910619462\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) recons: 6.488416603633335\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) logppx: 6.667810644422259\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] #validation_score (77): 6.667810644422259\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] patience losses:[6.658300399780273, 6.656081404004778, 6.666548592703683, 6.659877027784075, 6.660318647112165] min patience loss:6.656081404004778 current 
loss:6.667810644422259 absolute loss difference:0.011729240417480469\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Bad epoch: loss has not improved (enough). Bad count:6\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Bad epochs exceeded patience. Stopping training early!\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Timing: train: 0.74s, val: 0.06s, epoch: 0.79s\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Early stop condition met. Stopping training.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] #progress_metric: host=algo-2, completed 100 % epochs\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194375.1199713, \"EndTime\": 1623194375.9129415, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\", \"epoch\": 76, \"Meta\": \"training_data_iter\"}, \"Metrics\": {\"Total Records Seen\": {\"sum\": 298221.0, \"count\": 1, \"min\": 298221, \"max\": 298221}, \"Total Batches Seen\": {\"sum\": 2387.0, \"count\": 1, \"min\": 2387, \"max\": 2387}, \"Max Records Seen Between Resets\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Max Batches Seen Between Resets\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}, \"Reset Count\": {\"sum\": 154.0, \"count\": 1, \"min\": 154, \"max\": 154}, \"Number of Records Since Last Reset\": {\"sum\": 3873.0, \"count\": 1, \"min\": 3873, \"max\": 3873}, \"Number of Batches Since Last Reset\": {\"sum\": 31.0, \"count\": 1, \"min\": 31, \"max\": 31}}}\n", - "\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] #throughput_metric: host=algo-2, train throughput=4883.278780146722 records/second\u001b[0m\n", - "\u001b[35m[2021-06-08 23:19:35.986] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/validation\", \"epoch\": 233, \"duration\": 71, \"num_examples\": 8, \"num_bytes\": 346580}\u001b[0m\n", - "\u001b[35m[06/08/2021 
23:19:35 INFO 140403258652480] Finished scoring on 896 examples from 7 batches, each of size 128.\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Metrics for Inference:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) total: 6.662471362522671\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) kld: 0.1597729985203062\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) recons: 6.5026982852390836\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss (name: value) logppx: 6.662471362522671\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] #quality_metric: host=algo-2, epoch=77, validation total_loss =6.662471362522671\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Loss of server-side model: 6.662471362522671\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:35 INFO 140403258652480] Best model based on early stopping at epoch 71. 
Best loss: 6.652386733463833\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] Topics from epoch:final (num_topics:20) [, tu 0.74]:\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 773 679 419 3 1931 418 157 58 1129 779 11 1150 118 884 846 111 1103 376 979 542\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 592 571 1541 1960 93 320 1370 422 1584 1583 1247 902 295 1103 51 934 1191 570 303 1200\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 841 594 1572 791 1134 389 1641 1573 152 538 452 687 928 1651 898 1296 231 1096 1085 1654\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 1681 749 1696 448 1183 148 1244 1488 1246 859 1125 908 361 907 1959 1618 1119 1926 1152 185\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 1780 730 668 77 1365 732 684 237 1635 936 1227 1213 843 222 147 1031 47 6 763 1914\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 1366 697 1328 1902 1988 1607 504 706 1872 1955 1987 1973 673 742 1990 890 1299 1220 1269 512\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 518 1175 120 1460 1098 1264 1807 1234 673 413 884 868 1593 318 682 1041 605 36 973 510\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 899 912 1571 300 1900 1483 120 751 1644 176 301 825 646 299 177 169 900 519 914 824\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 1114 1227 1128 1554 1031 1662 1634 546 1899 122 1679 964 1057 1063 1348 1532 1603 1157 180 230\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 1148 608 318 868 1297 1296 1800 250 1915 510 1264 495 121 1870 1200 271 1564 1201 1348 331\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 1524 1148 659 949 797 413 1915 179 1031 1018 1128 973 668 688 1780 1690 824 936 770 77\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 524 1148 1524 179 1523 1780 1111 668 1522 698 1855 
397 1138 77 1833 733 1655 824 789 807\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 524 1148 1522 1138 189 1524 1780 397 179 1309 1125 1365 522 1993 1245 200 1855 789 732 668\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 446 140 142 0 1300 746 1499 659 162 157 118 141 1601 433 96 1070 139 949 1780 1963\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 1179 426 817 1313 972 1127 1779 1437 1315 1568 1576 1966 729 1826 1295 486 1197 689 293 1825\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 62 484 1429 1917 1959 205 1450 158 1815 1837 179 1309 304 1936 733 767 255 1294 1183 1138\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 524 1234 1652 1175 143 1148 179 1765 1245 495 667 518 682 763 1635 1339 413 1717 1522 1253\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 1234 1621 719 455 413 1268 1175 3 1081 973 884 1776 1336 1968 218 673 37 1130 1686 1555\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 733 824 179 732 1570 524 522 1143 31 228 1780 604 1522 807 1138 1128 659 1672 834 757\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] 1264 1234 36 1081 37 162 876 518 218 824 1689 795 1175 1459 1460 402 413 250 1776 32\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] Serializing model to /opt/ml/model/model_algo-2\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] Saved checkpoint to \"/tmp/tmpg_6fo0vm/state-0001.params\"\u001b[0m\n", - "\u001b[35m[06/08/2021 23:19:36 INFO 140403258652480] Test data is not provided.\u001b[0m\n", - "\u001b[35m#metrics {\"StartTime\": 1623194295.5121074, \"EndTime\": 1623194376.086828, \"Dimensions\": {\"Algorithm\": \"AWS/NTM\", \"Host\": \"algo-2\", \"Operation\": \"training\"}, \"Metrics\": {\"initialize.time\": {\"sum\": 697.8249549865723, \"count\": 1, \"min\": 697.8249549865723, \"max\": 697.8249549865723}, \"epochs\": {\"sum\": 
100.0, \"count\": 1, \"min\": 100, \"max\": 100}, \"model.score.time\": {\"sum\": 6394.08540725708, \"count\": 78, \"min\": 56.20837211608887, \"max\": 180.50503730773926}, \"early_stop.time\": {\"sum\": 6582.762241363525, \"count\": 77, \"min\": 56.58698081970215, \"max\": 184.61942672729492}, \"update.time\": {\"sum\": 79649.55043792725, \"count\": 77, \"min\": 772.4390029907227, \"max\": 1444.2715644836426}, \"finalize.time\": {\"sum\": 168.27869415283203, \"count\": 1, \"min\": 168.27869415283203, \"max\": 168.27869415283203}, \"model.serialize.time\": {\"sum\": 4.034280776977539, \"count\": 1, \"min\": 4.034280776977539, \"max\": 4.034280776977539}, \"setuptime\": {\"sum\": 1071.57301902771, \"count\": 1, \"min\": 1071.57301902771, \"max\": 1071.57301902771}, \"totaltime\": {\"sum\": 81681.95343017578, \"count\": 1, \"min\": 81681.95343017578, \"max\": 81681.95343017578}}}\n", - "\u001b[0m\n", - "\n", - "2021-06-08 23:19:46 Uploading - Uploading generated training model\n", - "2021-06-08 23:19:46 Completed - Training job completed\n", - "Training seconds: 304\n", - "Billable seconds: 304\n" - ] - } - ], + "outputs": [], "source": [ "ntm.fit({\"train\": s3_train, \"validation\": s3_val_data})" ] }, { "cell_type": "markdown", - "id": "freelance-shannon", + "id": "ca5cc1d9", "metadata": { "papermill": { "duration": 0.081212, @@ -5284,8 +1230,8 @@ }, { "cell_type": "code", - "execution_count": 46, - "id": "superb-resolution", + "execution_count": null, + "id": "2251d7a9", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:23:49.296263Z", @@ -5302,22 +1248,14 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training job name: ntm-2021-06-08-23-15-03-276\n" - ] - } - ], + "outputs": [], "source": [ "print(\"Training job name: {}\".format(ntm.latest_training_job.job_name))" ] }, { "cell_type": "markdown", - "id": "static-ghost", + "id": "2a8db4fc", "metadata": { "papermill": { "duration": 0.081343, 
@@ -5338,8 +1276,8 @@ }, { "cell_type": "code", - "execution_count": 31, - "id": "sunset-swaziland", + "execution_count": null, + "id": "00a96680", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:23:49.630626Z", @@ -5356,30 +1294,22 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-------!" - ] - } - ], + "outputs": [], "source": [ "from sagemaker.serializers import CSVSerializer\n", "from sagemaker.deserializers import JSONDeserializer\n", "\n", "ntm_predictor = ntm.deploy(\n", - " initial_instance_count=1, \n", - " instance_type='ml.m4.xlarge',\n", + " initial_instance_count=1,\n", + " instance_type=\"ml.m4.xlarge\",\n", " serializer=CSVSerializer(),\n", - " deserializer=JSONDeserializer()\n", + " deserializer=JSONDeserializer(),\n", ")" ] }, { "cell_type": "markdown", - "id": "passive-terrain", + "id": "540cc2e8", "metadata": { "papermill": { "duration": 0.086504, @@ -5396,8 +1326,8 @@ }, { "cell_type": "code", - "execution_count": 32, - "id": "computational-stage", + "execution_count": null, + "id": "792f590c", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:31:22.275437Z", @@ -5414,22 +1344,14 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Endpoint name: ntm-2021-06-08-23-20-17-554\n" - ] - } - ], + "outputs": [], "source": [ "print(\"Endpoint name: {}\".format(ntm_predictor.endpoint_name))" ] }, { "cell_type": "markdown", - "id": "designing-stereo", + "id": "5e6ce600", "metadata": { "papermill": { "duration": 0.085727, @@ -5446,8 +1368,8 @@ }, { "cell_type": "code", - "execution_count": 33, - "id": "alien-martin", + "execution_count": null, + "id": "28794e0f", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:31:22.626693Z", @@ -5464,15 +1386,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'predictions': [{'topic_weights': [0.1291756481, 
0.0507885031, 0.0417106971, 0.0400005728, 0.0478653982, 0.0206772666, 0.0430370495, 0.0218653288, 0.0410777815, 0.022001991, 0.0393451117, 0.0522925183, 0.051600188, 0.066249229, 0.0703478307, 0.0968779847, 0.0428318121, 0.0364326015, 0.0460041463, 0.0398182683]}, {'topic_weights': [0.0180158913, 0.0247942526, 0.0276389122, 0.0678321198, 0.0382873453, 0.0367063396, 0.0410614833, 0.2918568254, 0.033076521, 0.0316482373, 0.0379365794, 0.0369765386, 0.0379251391, 0.0251364261, 0.0677403361, 0.0291158687, 0.0380523577, 0.0396333672, 0.0351459794, 0.0414195508]}, {'topic_weights': [0.0278755464, 0.0196884535, 0.0671157613, 0.0308432747, 0.0388110615, 0.0237644296, 0.0446243994, 0.0353006199, 0.0337769724, 0.2674693763, 0.0463326573, 0.0491070077, 0.0476619303, 0.0291159041, 0.0177176483, 0.0222446267, 0.055981189, 0.0463135466, 0.0446933433, 0.051562313]}, {'topic_weights': [0.187104553, 0.1906852573, 0.0279728584, 0.0273396056, 0.0360631421, 0.0487223566, 0.0339291841, 0.0196604691, 0.0355255529, 0.0164047647, 0.0321371406, 0.037225455, 0.0391473696, 0.0257860348, 0.0156444404, 0.0884368941, 0.03244159, 0.0337445736, 0.0363439955, 0.0356848612]}, {'topic_weights': [0.0392541029, 0.0159985349, 0.0748593733, 0.0280441064, 0.0462445579, 0.0165806375, 0.0557285696, 0.0207282994, 0.0196479764, 0.1528619379, 0.0533503629, 0.0816884562, 0.0800268874, 0.0211151857, 0.0102875112, 0.0374294519, 0.0905672461, 0.0448295996, 0.0546249263, 0.0561322533]}]}\n" - ] - } - ], + "outputs": [], "source": [ "test_data = np.array(test_vectors.todense())\n", "results = ntm_predictor.predict(test_data[:5])\n", @@ -5481,7 +1395,7 @@ }, { "cell_type": "markdown", - "id": "professional-criterion", + "id": "534e5f3f", "metadata": { "papermill": { "duration": 0.086139, @@ -5511,8 +1425,8 @@ }, { "cell_type": "code", - "execution_count": 34, - "id": "pursuant-friend", + "execution_count": null, + "id": "9d640006", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:31:23.201537Z", @@ 
-5529,34 +1443,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[0.12917565 0.0507885 0.0417107 0.04000057 0.0478654 0.02067727\n", - " 0.04303705 0.02186533 0.04107778 0.02200199 0.03934511 0.05229252\n", - " 0.05160019 0.06624923 0.07034783 0.09687798 0.04283181 0.0364326\n", - " 0.04600415 0.03981827]\n", - " [0.01801589 0.02479425 0.02763891 0.06783212 0.03828735 0.03670634\n", - " 0.04106148 0.29185683 0.03307652 0.03164824 0.03793658 0.03697654\n", - " 0.03792514 0.02513643 0.06774034 0.02911587 0.03805236 0.03963337\n", - " 0.03514598 0.04141955]\n", - " [0.02787555 0.01968845 0.06711576 0.03084327 0.03881106 0.02376443\n", - " 0.0446244 0.03530062 0.03377697 0.26746938 0.04633266 0.04910701\n", - " 0.04766193 0.0291159 0.01771765 0.02224463 0.05598119 0.04631355\n", - " 0.04469334 0.05156231]\n", - " [0.18710455 0.19068526 0.02797286 0.02733961 0.03606314 0.04872236\n", - " 0.03392918 0.01966047 0.03552555 0.01640476 0.03213714 0.03722545\n", - " 0.03914737 0.02578603 0.01564444 0.08843689 0.03244159 0.03374457\n", - " 0.036344 0.03568486]\n", - " [0.0392541 0.01599853 0.07485937 0.02804411 0.04624456 0.01658064\n", - " 0.05572857 0.0207283 0.01964798 0.15286194 0.05335036 0.08168846\n", - " 0.08002689 0.02111519 0.01028751 0.03742945 0.09056725 0.0448296\n", - " 0.05462493 0.05613225]]\n" - ] - } - ], + "outputs": [], "source": [ "predictions = np.array([prediction[\"topic_weights\"] for prediction in results[\"predictions\"]])\n", "print(predictions)" @@ -5564,7 +1451,7 @@ }, { "cell_type": "markdown", - "id": "extensive-record", + "id": "ad89bb0d", "metadata": { "papermill": { "duration": 0.086825, @@ -5585,8 +1472,8 @@ }, { "cell_type": "code", - "execution_count": 35, - "id": "executed-intersection", + "execution_count": null, + "id": "3d9b0a22", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:31:23.556121Z", @@ -5617,7 +1504,7 @@ }, { "cell_type": "markdown", - "id": 
"southeast-pillow", + "id": "92e608ec", "metadata": { "papermill": { "duration": 0.086612, @@ -5634,7 +1521,7 @@ }, { "cell_type": "markdown", - "id": "sized-victory", + "id": "43c83800", "metadata": { "papermill": { "duration": 0.086791, @@ -5652,8 +1539,8 @@ }, { "cell_type": "code", - "execution_count": 36, - "id": "fabulous-struggle", + "execution_count": null, + "id": "73ec118f", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:31:24.085365Z", @@ -5670,30 +1557,7 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "Text(0.5, 0, 'Topic ID')" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", @@ -5709,7 +1573,7 @@ }, { "cell_type": "markdown", - "id": "potential-miniature", + "id": "9aee381c", "metadata": { "papermill": { "duration": 0.087339, @@ -5730,8 +1594,8 @@ }, { "cell_type": "code", - "execution_count": 37, - "id": "hungry-steam", + "execution_count": null, + "id": "a56ac4b4", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:31:25.100455Z", @@ -5755,7 +1619,7 @@ }, { "cell_type": "markdown", - "id": "patient-metadata", + "id": "7843440a", "metadata": { "papermill": { "duration": 0.088459, @@ -5772,7 +1636,7 @@ }, { "cell_type": "markdown", - "id": "adequate-handbook", + "id": "043668a4", "metadata": { "papermill": { "duration": 0.088629, @@ -5792,8 +1656,8 @@ }, { "cell_type": "code", - "execution_count": 38, - "id": "literary-attitude", + "execution_count": null, + "id": "c1099b93", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:31:25.704815Z", @@ -5819,7 +1683,7 @@ }, { "cell_type": "markdown", - "id": "radical-premiere", + "id": "849f6492", "metadata": { "papermill": { "duration": 0.089891, @@ -5836,8 +1700,8 @@ }, { "cell_type": "code", - "execution_count": 39, - "id": "looking-teaching", + "execution_count": null, + "id": "64f2c973", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:31:26.978568Z", @@ -5854,18 +1718,7 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "'20newsgroups/output/ntm-2021-06-08-23-15-03-276/output/model.tar.gz'" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "model_path = os.path.join(output_prefix, ntm._current_job_name, \"output/model.tar.gz\")\n", "model_path" @@ -5873,8 +1726,8 @@ }, { "cell_type": "code", - "execution_count": 40, - "id": "imposed-booth", + 
"execution_count": null, + "id": "536fb075", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:31:27.175129Z", @@ -5898,48 +1751,42 @@ }, { "cell_type": "code", - "execution_count": 50, - "id": "informative-walter", + "execution_count": null, + "id": "b91176b2", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/ubuntu/SageMaker/batch_fix/batchfix15/introduction_to_applying_machine_learning/ntm_20newsgroups_topic_modeling\n" - ] - } - ], + "outputs": [], "source": [ "!pwd" ] }, { "cell_type": "code", - "execution_count": 55, - "id": "immediate-colon", + "execution_count": null, + "id": "a93f3f21", "metadata": {}, "outputs": [], "source": [ "import tarfile\n", - "tarfile.open('downloaded_model.tar.gz').extractall()" + "\n", + "tarfile.open(\"downloaded_model.tar.gz\").extractall()" ] }, { "cell_type": "code", - "execution_count": 56, - "id": "meaningful-cannon", + "execution_count": null, + "id": "5a946a9b", "metadata": {}, "outputs": [], "source": [ "import zipfile\n", - "with zipfile.ZipFile('model_algo-1', 'r') as zip_ref:\n", - " zip_ref.extractall('./')" + "\n", + "with zipfile.ZipFile(\"model_algo-1\", \"r\") as zip_ref:\n", + " zip_ref.extractall(\"./\")" ] }, { "cell_type": "markdown", - "id": "soviet-alpha", + "id": "d5039948", "metadata": { "papermill": { "duration": 0.089504, @@ -5956,8 +1803,8 @@ }, { "cell_type": "code", - "execution_count": 58, - "id": "widespread-finnish", + "execution_count": null, + "id": "796ed7b8", "metadata": { "execution": { "iopub.execute_input": "2021-06-08T21:31:28.387482Z", @@ -5982,7 +1829,7 @@ }, { "cell_type": "markdown", - "id": "posted-quantum", + "id": "85d44999", "metadata": { "papermill": { "duration": null, @@ -5999,8 +1846,8 @@ }, { "cell_type": "code", - "execution_count": 44, - "id": "black-sydney", + "execution_count": null, + "id": "4004ffeb", "metadata": { "papermill": { "duration": null, @@ -6011,33 +1858,19 @@ }, "tags": [] }, - "outputs": 
[ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: wordcloud in /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (1.8.1)\n", - "Requirement already satisfied: pillow in /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from wordcloud) (7.0.0)\n", - "Requirement already satisfied: matplotlib in /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from wordcloud) (3.1.3)\n", - "Requirement already satisfied: numpy>=1.6.1 in /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from wordcloud) (1.18.1)\n", - "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from matplotlib->wordcloud) (2.4.6)\n", - "Requirement already satisfied: cycler>=0.10 in /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from matplotlib->wordcloud) (0.10.0)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from matplotlib->wordcloud) (1.1.0)\n", - "Requirement already satisfied: python-dateutil>=2.1 in /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from matplotlib->wordcloud) (2.8.1)\n", - "Requirement already satisfied: six in /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from cycler>=0.10->matplotlib->wordcloud) (1.14.0)\n", - "Requirement already satisfied: setuptools in /home/ubuntu/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from kiwisolver>=1.0.1->matplotlib->wordcloud) (45.2.0.post20200210)\n" - ] - } - ], + "outputs": [], "source": [ - "!pip install wordcloud\n", + "import sys\n", + "\n", + "!{sys.executable} -m pip install wordcloud\n", + "\n", "import wordcloud as wc" ] }, { "cell_type": "code", - "execution_count": 59, - "id": "scenic-aside", + "execution_count": null, + "id": "f4684f18", "metadata": { "papermill": { "duration": null, @@ 
-6048,22 +1881,10 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "word_to_id = dict()\n", "for i, v in enumerate(vocab_list):\n", " word_to_id[v] = i\n", @@ -6098,14 +1919,23 @@ "\n", " counter += 1" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b38e42ec", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "celltoolbar": "Tags", + "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Environment (conda_mxnet_p36)", + "display_name": "Python 3 (MXNet 1.6 Python 3.6 CPU Optimized)", "language": "python", - "name": "conda_mxnet_p36" + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/mxnet-1.6-cpu-py36" }, "language_info": { "codemirror_mode": { @@ -6117,7 +1947,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.6.13" }, "notice": "Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License.", "papermill": { diff --git a/ml-lifecycle/feature_store/FS_demo.ipynb b/ml-lifecycle/feature_store/FS_demo.ipynb index 86528475c8..a0fa895bd3 100644 --- a/ml-lifecycle/feature_store/FS_demo.ipynb +++ b/ml-lifecycle/feature_store/FS_demo.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Detect Heart Failure from Clinical Record with SageMaker FeatureStore\n", + "# Detect Heart Failure from Clinical Record with SageMaker Feature Store\n", "This notebook runs with Kernel `Python 3 (Data Science)`.\n", "\n", "Note:\n", @@ -24,14 +24,14 @@ "source": [ "## Contents\n", "* [Background](#1)\n", - "* [Setup SageMaker FeatureStore](#2)\n", + "* [Setup SageMaker Feature Store](#2)\n", "* [Inspect Dataset](#3)\n", - "* [Prepare Data for FeatureStore](#4)\n", + "* [Prepare Data for Feature Store](#4)\n", "* [Create Features](#5)\n", "* [Work with FeatureGroup](#10)\n", "* [Build Training Dataset](#6)\n", "* [Train and Deploy the Model](#7)\n", - "* [SageMaker FeatureStore At Inference](#8)\n", + "* [SageMaker Feature Store At Inference](#8)\n", "* [Cleanup Resources](#9)" ] }, @@ -42,11 +42,11 @@ "\n", "## Background\n", "\n", - "SageMaker FeatureStore is a new SageMaker capability that makes it easy for customers to create and manage curated features for machine learning (ML) development. It erves as the single source of truth to store, retrieve, remove, track, share, discover, and control access to features.\n", - "SageMaker FeatureStore enables data ingestion via a high TPS API and data consumption via the online and offline stores.\n", + "SageMaker Feature Store is a SageMaker capability that makes it easy for customers to create and manage curated features for machine learning (ML) development. 
It serves as the single source of truth to store, retrieve, remove, track, share, discover, and control access to features.\n", + "SageMaker Feature Store enables data ingestion via a high TPS API and data consumption via the online and offline stores.\n", "\n", "\n", - "This notebook provides an example for the APIs provided by SageMaker FeatureStore by walking through the process of training a heart failure detection model with clinical records data. The notebook demonstrates how the dataset can be ingested into the FeatureStore, queried to create a training dataset, and quickly accessed during inference.\n", + "This notebook provides an example for the APIs provided by SageMaker Feature Store by walking through the process of training a heart failure detection model with clinical records data. The notebook demonstrates how the dataset can be ingested into the Feature Store, queried to create a training dataset, and quickly accessed during inference.\n", "\n", "### Terminology\n", "* `Feature group` – A FeatureGroup is the main Feature Store resource that contains the metadata for all the data stored in Amazon SageMaker Feature Store. A feature group is a logical grouping of features, defined in the feature store, to describe records. A feature group’s definition is composed of a list of feature definitions, a record identifier name, and configurations for its online and offline store. \n", @@ -62,21 +62,12 @@ "* `Offline store` – the OfflineStore, stores historical data in your S3 bucket. It is used when low (sub-second) latency reads are not needed. For example, when you want to store and serve features for exploration, model training, and batch inference. A feature group contains an OfflineStoreConfig controlling where the data is stored."
] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Architecture Di" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", - "## Setup SageMaker FeatureStore\n", + "## Setup SageMaker Feature Store\n", "Let's start by setting up the SageMaker Python SDK and boto client. " ] }, @@ -112,7 +103,7 @@ "metadata": {}, "source": [ "#### Set Up S3 Bucket For The OfflineStore\n", - "SageMaker FeatureStore writes the data in the `OfflineStore` of a `FeatureGroup` to a S3 bucket owned by you. To be able to write to your S3 bucket, SageMaker FeatureStore assumes an IAM role which has access to it. The role is also owned by you. Note that the same bucket can be re-used across FeatureGroups. Data in the bucket is partitioned by FeatureGroup." + "SageMaker Feature Store writes the data in the `OfflineStore` of a `FeatureGroup` to a S3 bucket owned by you. To be able to write to your S3 bucket, SageMaker Feature Store assumes an IAM role which has access to it. The role is also owned by you. Note that the same bucket can be re-used across FeatureGroups. Data in the bucket is partitioned by FeatureGroup." ] }, { @@ -233,7 +224,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The dataset contains no missing value, and all columns are either numerical or binary, therefore no processing or feature engineering is needed in this case. Depending on your data and use case, you should examine your data and decide if any pre-processing and feature engineering steps are needed before you ingest your data into FeatureStore." + "The dataset contains no missing value, and all columns are either numerical or binary, therefore no processing or feature engineering is needed in this case. Depending on your data and use case, you should examine your data and decide if any pre-processing and feature engineering steps are needed before you ingest your data into Feature Store." 
] }, { @@ -242,14 +233,14 @@ "source": [ "\n", "\n", - "## Prepare data For FeatureStore\n", + "## Prepare data For Feature Store\n", "In the Amazon SageMaker Feature Store API, a feature is an attribute of a record. You can define a name and type for every feature stored in Feature Store. Name uniquely identifies a feature within a feature group. Type identifies the datatype for the values of the feature. Supported datatypes are: String, Integral and Fractional. \n", "\n", - "Take a look at the data types and making sure they are all correct and readable by Feature store. SageMaker FeatureStore Python SDK will map the string dtype to String feature type.\n", + "Take a look at the data types and make sure they are all correct and readable by Feature Store. SageMaker Feature Store Python SDK will map the string dtype to String feature type.\n", "\n", "In SageMaker Feature Store, a `record` is a collection of values for features for a single record identifier value. Specific features are flagged with record identifier and event time, and a combination of record identifier name and a timestamp uniquely identify a record within a feature group. we will need to specify a record identifier and an event time in this case, and since the raw data does not contain the two columns, we will need to create them.\n", "\n", - "* For record identifier name: a record is a collection of values for features for a single record identifier value. In this case, we will create an unique ID for each patient in the previous step as the record indentifier. Making sure the identifier is the unique identifier for each instance.\n", + "* For record identifier name: a record is a collection of values for features for a single record identifier value. In this case, we will create a unique ID for each patient in the previous step as the record identifier.
Making sure the identifier is the unique identifier for each instance.\n", "* For event time feature name: it refers to a point in time when a new event occurs that corresponds to the creation or update of a record in a feature group. It can be used to track changes to a record over time. For example, in this use case, EventTime can be appended to your data when no timestamp is available. In the following code, you can see how EventTime is appended to the clinical data." ] }, @@ -257,7 +248,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Create an unique ID for each patient" + "#### Create a unique ID for each patient" ] }, { @@ -328,7 +319,7 @@ " if data_frame.dtypes[label] == 'object':\n", " data_frame[label] = data_frame[label].astype(\"str\").astype(\"string\")\n", "\n", - "# cast object dtype to string. The SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.\n", + "# cast object dtype to string. The SageMaker Feature Store Python SDK will then map the string dtype to String feature type.\n", "cast_object_to_string(clinical)" ] }, @@ -405,7 +396,7 @@ "metadata": {}, "source": [ "#### Load feature definitions to the feature group\n", - "We can now load the feature definitions by passing a data frame containing the feature data. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data. For developers using a schema rather than automatic detection, see the [Export Feature Groups from Data Wrangler example](https://docs.aws.amazon.com/sagemaker/latest/dg/data-wrangler-data-export.html#data-wrangler-data-export-feature-store) for code that shows how to load the schema, map it, and add it as a FeatureDefinition that you can use to create the FeatureGroup. " + "We can now load the feature definitions by passing a data frame containing the feature data. SageMaker Feature Store Python SDK will auto-detect the data schema based on input data. 
For developers using a schema rather than automatic detection, see the [Export Feature Groups from Data Wrangler example](https://docs.aws.amazon.com/sagemaker/latest/dg/data-wrangler-data-export.html#data-wrangler-data-export-feature-store) for code that shows how to load the schema, map it, and add it as a FeatureDefinition that you can use to create the FeatureGroup. " ] }, { @@ -521,7 +512,7 @@ "metadata": {}, "source": [ "#### Generate Hive DDL Commands\n", - "The SageMaker Python SDK’s FeatureStore class also provides the functionality to generate Hive DDL commands. The schema of the table is generated based on the feature definitions. Columns are named after feature name and data-type are inferred based on feature type." + "The SageMaker Python SDK’s Feature Store class also provides the functionality to generate Hive DDL commands. The schema of the table is generated based on the feature definitions. Columns are named after feature name and data-type are inferred based on feature type." ] }, { @@ -537,7 +528,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's wait for the data to appear in our offline store before moving forward to creating a dataset. This will take approximately 5 minutes. SageMaker FeatureStore adds metadata for each record that's ingested into the offline store." + "Now let's wait for the data to appear in our offline store before moving forward to creating a dataset. This will take approximately 5 minutes. SageMaker Feature Store adds metadata for each record that's ingested into the offline store." ] }, { @@ -575,7 +566,7 @@ "metadata": {}, "source": [ "## Build a Training Dataset\n", - " SageMaker FeatureStore automatically builds an AWS Glue data catalog when you create feature groups and you can turn this off if you want. In this example, we will create a training dataset with FeatureValues from the clinical FeatureGroup. This is done by utilizing the auto-built Catalog. 
We run an Athena query that does a simple `select all` in the offline store in S3 from the FeatureGroup.\n", " \n", "For testing purpose, we left out 9 records when creating the training dataset, so that we can use the left-out 9 records as test data for the reference. You can also do a train/test split. " ] @@ -703,7 +694,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Due to cost consideration, the goal of this example is to showcase FeatureStore capabilities, not necessarily to achieve the best result. In this example, we will skip hyperparamter tuning and go with default hyperparameters." + "Due to cost consideration, the goal of this example is to showcase Feature Store capabilities, not necessarily to achieve the best result. In this example, we will skip hyperparameter tuning and go with default hyperparameters." ] }, { @@ -765,8 +756,8 @@ "metadata": {}, "source": [ "\n", - "## SageMaker FeatureStore During Inference\n", - "SageMaker FeatureStore can be useful in supplementing data for inference requests because of the low-latency GetRecord functionality. For this demo, we will be given a patientID and query our online FeatureGroup to build our inference request.\n", + "## SageMaker Feature Store During Inference\n", + "SageMaker Feature Store can be useful in supplementing data for inference requests because of the low-latency GetRecord functionality.
For this demo, we will be given a patientID and query our online FeatureGroup to build our inference request.\n", "\n", "From the patient ID we left out in training set, we can choose one patient ID to test the real-time reference. In this example we choose patient `194`, but you can choose either one from the left out id list for testing." ] @@ -879,7 +870,7 @@ "* [SageMaker Feature Store Documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store.html)\n", "* [Store, Discover, and Share Machine Learning Features with Amazon SageMaker Feature Store](https://aws.amazon.com/blogs/aws/new-store-discover-and-share-machine-learning-features-with-amazon-sagemaker-feature-store/?sc_icampaign=launch_sagemaker-feature-store_reinvent20&sc_ichannel=ha&sc_icontent=awssm-6216&sc_iplace=ribbon&trk=ha_awssm-6216) \n", "* [Using streaming ingestion with Amazon SageMaker Feature Store to make ML-backed decisions in near-real time](https://aws.amazon.com/blogs/machine-learning/using-streaming-ingestion-with-amazon-sagemaker-feature-store-to-make-ml-backed-decisions-in-near-real-time/)\n", - "* [Fraud Detection using SageMaker FeatureStore](https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-featurestore/sagemaker_featurestore_fraud_detection_python_sdk.ipynb)\n" + "* [Fraud Detection using SageMaker Feature Store](https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-featurestore/sagemaker_featurestore_fraud_detection_python_sdk.ipynb)\n" ] } ], @@ -888,7 +879,7 @@ "kernelspec": { "display_name": "Python 3 (Data Science)", "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0" + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" }, "language_info": { "codemirror_mode": { diff --git a/ml-lifecycle/feature_store/README.md b/ml-lifecycle/feature_store/README.md index 505ea1aaaf..37cf0cd9d8 100644 --- 
a/ml-lifecycle/feature_store/README.md +++ b/ml-lifecycle/feature_store/README.md @@ -13,14 +13,14 @@ # Background -SageMaker FeatureStore makes it easy to create and manage curated features for machine learning (ML) development. It serves as the single source of truth to store, retrieve, remove, track, share, discover, and control access to features. SageMaker Feature Store enables data ingestion via a high TPS API and data consumption via the online and offline stores. +SageMaker Feature Store makes it easy to create and manage curated features for machine learning (ML) development. It serves as the single source of truth to store, retrieve, remove, track, share, discover, and control access to features. SageMaker Feature Store enables data ingestion via a high TPS API and data consumption via the online and offline stores. In this notebook we use SageMaker Feature Store to prepare and store features to train a heart failure detection model using medical record data. This notebook demonstrates how the dataset can be ingested into the Feature Store, queried to create a training dataset, and quickly accessed during inference. We also see how to integrate SageMaker Feature Store with SageMaker Data Wrangler and SageMaker Pipelines to process, store and use features in machine learning development. # Prereqs -The following policies need to be attached to the SageMaker execution role that you use to run this notebook: +The following IAM policies need to be attached to the SageMaker execution role that you use to run this notebook: - AmazonSageMakerFullAccess - AmazonSageMakerFeatureStoreAccess @@ -31,7 +31,7 @@ Note that the AmazonS3FullAccess policy is not attached to your role by default # Data -This notebook uses the publically available [Heart failure clinical records Data Set](https://archive.ics.uci.edu/ml/datasets/Heart+failure+clinical+records) that can be downloaded from the UCI machine learning Repository, as described in the notebook. 
The data set contains medical record information for a small sample of heart failure patients, including demographic, diagnostic and laboratory test data. **heart_failure_clinical_records_dataset.csv** @@ -56,11 +56,11 @@ The dataset contains one table with thirteen (13) columns: ![architecture diagram](/fs1.PNG) -First, we'll prepare the data for feature store, create a Feature Group and then ingest our data in to the Feature Group. Our features will be available in the offline feature store within minutes. We then use the feature store to build a training dataset, fit a simple model and return predictions. +First, we'll prepare the data for Feature Store, create a Feature Group and then ingest our data into the Feature Group. Our features will be available in the offline feature store within minutes. We then use the feature store to build a training dataset, fit a simple model and return predictions. # Clean Up -In order to prevent ongoing charges to your AWS account, clean up any resources we spun up during this tutorial. An easy way to track and delete related training resources is to [tag them](https://docs.aws.amazon.com/general/latest/gr/aws_tagging.html) or create the resources all under the same [resource group](https://us-east-2.console.aws.amazon.com/resource-groups/home) so that later you can easily search and delete unwanted resources. +In order to prevent ongoing charges to your AWS account, clean up any resources we spun up during this tutorial.
# Other Resources diff --git a/patterns/ml_gateway/README.md b/patterns/ml_gateway/README.md new file mode 100644 index 0000000000..3d5c5924ec --- /dev/null +++ b/patterns/ml_gateway/README.md @@ -0,0 +1,17 @@ +# Enterprise-Grade ML: Part 1 Prepare and Use the ML Gateway Pattern for Inference + +## Data Prep and Inference using SageMaker Feature Store + +Very often Data Science projects start in an experimental phase in which transformations on features are experimented with, algorithms are selected and tried for determining if they can fit the data distribution well enough for reliable predictions, tuning is done with various hyper-parameters and so on. + +As an organization matures in their Machine Learning (ML) Journey, they will find that they will then transition to an automated ML or MLOps phase where the pipelines for data preparation, training, deployment, monitoring will all need to be automated. + +In order to raise the maturity of projects to an Enterprise Scale that can fulfill business needs, sustain business-level continuity, scale, security and performance, the need for integrating data science experiments with machine learning deployment patterns and best-practices will grow in importance and will save you time and money. + +In this blog series on ML Patterns, we will start by focusing on Deployment Patterns and Best-Practices within the ML lifecycle : exploring the considerations and options that present themselves, post-training; on the serving/inference/prediction phases of the ML lifecycle. + +There are many ways in which we can expose an endpoint that was deployed as a hosted SageMaker endpoint: these variations are summarized in the ML Gateway Pattern with mandatory and optional components. Through this series of blogs we will outline options and their context, pros and cons for helping you decide what components to use for your specific workload and use-case. 
+ + +![arch](./images/ml-gateway-pattern.png) + diff --git a/patterns/ml_gateway/code/train_deploy.py b/patterns/ml_gateway/code/train_deploy.py new file mode 100644 index 0000000000..eff34e3e4f --- /dev/null +++ b/patterns/ml_gateway/code/train_deploy.py @@ -0,0 +1,82 @@ +import os +import sys +import pickle +import xgboost as xgb +import argparse +import pandas as pd +import json +import logging + +logger = logging.getLogger(__name__) +# logger.setLevel(logging.DEBUG) +logger.addHandler(logging.StreamHandler(sys.stdout)) + + +def model_fn(model_dir): + """Deserialize and return fitted model. + Note that this should have the same name as the serialized model in the _xgb_train method + """ + model_file = "xgboost-model" + model = pickle.load(open(os.path.join(model_dir, model_file), "rb")) + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + # Hyperparameters are described here + parser.add_argument("--num_round", type=int, default=999) + parser.add_argument("--max_depth", type=int, default=3) + parser.add_argument("--eta", type=float, default=0.2) + parser.add_argument("--objective", type=str, default="binary:logistic") + parser.add_argument("--nfold", type=int, default=5) + parser.add_argument("--early_stopping_rounds", type=int, default=10) + parser.add_argument("--train_data_path", type=str, default=os.environ.get("SM_CHANNEL_TRAIN")) + + # SageMaker specific arguments. Defaults are set in the environment variables. 
+ parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR")) + parser.add_argument("--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR")) + + args = parser.parse_args() + + data = pd.read_csv(f"{args.train_data_path}/train.csv") + train = data.drop("fraud", axis=1) + label = pd.DataFrame(data["fraud"]) + dtrain = xgb.DMatrix(train, label=label) + + params = {"max_depth": args.max_depth, "eta": args.eta, "objective": args.objective} + num_boost_round = args.num_round + nfold = args.nfold + early_stopping_rounds = args.early_stopping_rounds + + cv_results = xgb.cv( + params=params, + dtrain=dtrain, + num_boost_round=num_boost_round, + nfold=nfold, + early_stopping_rounds=early_stopping_rounds, + metrics=("auc"), + seed=0, + ) + + print(f"[0]#011train-auc:{cv_results.iloc[-1]['train-auc-mean']}") + print(f"[1]#011validation-auc:{cv_results.iloc[-1]['test-auc-mean']}") + + metrics_data = { + "classification_metrics": { + "validation:auc": {"value": cv_results.iloc[-1]["test-auc-mean"]}, + "train:auc": {"value": cv_results.iloc[-1]["train-auc-mean"]}, + } + } + + model = xgb.train(params=params, dtrain=dtrain, num_boost_round=len(cv_results)) + + # Save the model to the location specified by ``model_dir`` + metrics_location = args.output_data_dir + "/metrics.json" + model_location = args.model_dir + "/xgboost-model" + + with open(metrics_location, "w") as f: + json.dump(metrics_data, f) + + with open(model_location, "wb") as f: + pickle.dump(model, f) diff --git a/patterns/ml_gateway/helpers/__init__.py b/patterns/ml_gateway/helpers/__init__.py new file mode 100644 index 0000000000..caf5fb0747 --- /dev/null +++ b/patterns/ml_gateway/helpers/__init__.py @@ -0,0 +1,2 @@ +from .ml_gateway import * +from .feature_store import * diff --git a/patterns/ml_gateway/helpers/feature_store.py b/patterns/ml_gateway/helpers/feature_store.py new file mode 100644 index 0000000000..798697a675 --- /dev/null +++ 
b/patterns/ml_gateway/helpers/feature_store.py @@ -0,0 +1,213 @@ +import sys +import time +from sagemaker.feature_store.feature_definition import FeatureDefinition +from sagemaker.feature_store.feature_group import FeatureGroup + + +class StatusIndicator: + def __init__(self): + self.previous_status = None + self.need_newline = False + + def update(self, status): + if self.previous_status != status: + if self.need_newline: + sys.stdout.write("\n") + sys.stdout.write(status + " ") + self.need_newline = True + self.previous_status = status + else: + sys.stdout.write(".") + self.need_newline = True + sys.stdout.flush() + + def end(self): + if self.need_newline: + sys.stdout.write("\n") + + +def get_feature_definitions(df, feature_group): + """ + Get datatypes from pandas DataFrame and map them + to Feature Store datatypes. + + :param df: pandas.DataFrame + :param feature_group: FeatureGroup + :return: list + """ + # Dtype int_, int8, int16, int32, int64, uint8, uint16, uint32 + # and uint64 are mapped to Integral feature type. + + # Dtype float_, float16, float32 and float64 + # are mapped to Fractional feature type. + + # string dtype is mapped to String feature type. + + # Our schema of our data that we expect + # _after_ SageMaker Processing + feature_definitions = [] + for column in df.columns: + feature_type = feature_group._DTYPE_TO_FEATURE_DEFINITION_CLS_MAP.get( + str(df[column].dtype), None + ) + feature_definitions.append( + FeatureDefinition(column, feature_type) + ) # you can alternatively define your own schema + return feature_definitions + + +def wait_for_feature_group_creation_complete(feature_group): + """ + Wait for a FeatureGroup to finish creating. 
+ + :param feature_group: FeatureGroup + :return: None + """ + status_indicator = StatusIndicator() + status = feature_group.describe().get("FeatureGroupStatus") + while status == "Creating": + status_indicator.update(status) + time.sleep(5) + status = feature_group.describe().get("FeatureGroupStatus") + status_indicator.end() + if status != "Created": + raise RuntimeError(f"Failed to create feature group {feature_group.name}") + print(f"FeatureGroup {feature_group.name} successfully created.") + + +def ingest_df_to_feature_group(df, feature_group_name, feature_store_client): + """ + Take a pandas DataFrame and put it in a FeatureGroup. + + :param df: pandas.DataFrame + :param feature_group_name: str + :param feature_store_client: boto3.client('sagemaker-featurestore-runtime') + :return: None + """ + success, fail = 0, 0 + for row_num, row_series in df.astype(str).iterrows(): + record = [] + for key, value in row_series.to_dict().items(): + record.append({"FeatureName": key, "ValueAsString": str(value)}) + response = feature_store_client.put_record( + FeatureGroupName=feature_group_name, Record=record + ) + if response["ResponseMetadata"]["HTTPStatusCode"] == 200: + success += 1 + else: + fail += 1 + print(f"Success = {success}") + print(f"Fail = {fail}") + + +def get_datatypes(): + """ + Get pandas DataFrame datatypes. 
+ + :return: tuple(dict, dict) + """ + claims_dtypes = { + "policy_id": int, + "incident_severity": int, + "num_vehicles_involved": int, + "num_injuries": int, + "num_witnesses": int, + "police_report_available": int, + "injury_claim": float, + "vehicle_claim": float, + "total_claim_amount": float, + "incident_month": int, + "incident_day": int, + "incident_dow": int, + "incident_hour": int, + "fraud": int, + "driver_relationship_self": int, + "driver_relationship_na": int, + "driver_relationship_spouse": int, + "driver_relationship_child": int, + "driver_relationship_other": int, + "incident_type_collision": int, + "incident_type_breakin": int, + "incident_type_theft": int, + "collision_type_front": int, + "collision_type_rear": int, + "collision_type_side": int, + "collision_type_na": int, + "authorities_contacted_police": int, + "authorities_contacted_none": int, + "authorities_contacted_fire": int, + "authorities_contacted_ambulance": int, + "event_time": float, + } + + customers_dtypes = { + "policy_id": int, + "customer_age": int, + "customer_education": int, + "months_as_customer": int, + "policy_deductable": int, + "policy_annual_premium": int, + "policy_liability": int, + "auto_year": int, + "num_claims_past_year": int, + "num_insurers_past_5_years": int, + "customer_gender_male": int, + "customer_gender_female": int, + "policy_state_ca": int, + "policy_state_wa": int, + "policy_state_az": int, + "policy_state_or": int, + "policy_state_nv": int, + "policy_state_id": int, + "event_time": float, + } + + return claims_dtypes, customers_dtypes + + +def create_feature_group( + feature_group_name, + feature_group_description, + df, + id_name, + event_time_name, + offline_feature_group_bucket, + sagemaker_session, + role, +): + """ + Create a new FeatureGroup. 
+ + :param feature_group_name: str + :param feature_group_description: str + :param df: pandas.DataFrame + :param id_name: str + :param event_time_name: str + :param offline_feature_group_bucket: str + :param sagemaker_session: sagemaker.Session() + :param role: str + :return: tuple(FeatureGroup, bool) + """ + feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=sagemaker_session) + feature_definitions = get_feature_definitions(df, feature_group) + feature_group.feature_definitions = feature_definitions + feature_group_already_exists = False + try: + print(f"Trying to create feature group {feature_group_description} \n") + feature_group.create( + description=feature_group_description, + record_identifier_name=id_name, + event_time_feature_name=event_time_name, + role_arn=role, + s3_uri=offline_feature_group_bucket, + enable_online_store=True, + ) + wait_for_feature_group_creation_complete(feature_group) + except Exception as e: + code = e.response.get("Error").get("Code") + if code == "ResourceInUse": + print(f"Using existing feature group: {feature_group_name}") + feature_group_already_exists = True + else: + raise (e) + return feature_group, feature_group_already_exists diff --git a/patterns/ml_gateway/helpers/ml_gateway.py b/patterns/ml_gateway/helpers/ml_gateway.py new file mode 100644 index 0000000000..f3908c9990 --- /dev/null +++ b/patterns/ml_gateway/helpers/ml_gateway.py @@ -0,0 +1,253 @@ +import boto3 +import logging +from time import strftime, gmtime + + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def deploy_ml_gateway_pattern( + sagemaker_endpoint_name: str, region: str, s3_bucket_name: str +) -> str: + """ + Create an API Gateway HTTP endpoint that points to a Lambda function + which points to a SageMaker endpoint. 
+ + :param sagemaker_endpoint_name: str + :param s3_bucket_name: str + :return: str + """ + + cloudformation = boto3.client("cloudformation", region_name=region) + timestamp: int = strftime("%d%H%M%S", gmtime()) + stack_name: str = f"ml-gateway-{timestamp}" + lambda_name: str = f"serverless-artillery-{timestamp}-dev-loadGenerator" + cloudformation.create_stack( + StackName=stack_name, + TemplateBody=ml_gateway_cf_body(region), + Parameters=[ + {"ParameterKey": "SageMakerEndPointName", "ParameterValue": sagemaker_endpoint_name}, + { + "ParameterKey": "LambdaName", + "ParameterValue": f"invoke-sagemaker-endpoint-{timestamp}", + }, + {"ParameterKey": "S3BucketName", "ParameterValue": s3_bucket_name}, + ], + Capabilities=["CAPABILITY_IAM"], + ) + waiter = cloudformation.get_waiter("stack_create_complete") + logger.info("Creating ML Gateway...") + waiter.wait(StackName=stack_name) + logger.info("ML Gateway created!") + response = cloudformation.describe_stacks(StackName=stack_name) + api_gateway_endpoint_url = response["Stacks"][0]["Outputs"][0]["OutputValue"] + return f"{api_gateway_endpoint_url}/TestStage/Model" + + +def ml_gateway_cf_body(region: str) -> str: + """ + Return a JSON CloudFormation template represented as a string + that will create an API Gateway HTTP endpoint with a Lambda + function behind it which calls SageMaker Feature Store + and a SageMaker Production Variant. 
+ """ + + template_body = """ + { + "AWSTemplateFormatVersion":"2010-09-09", + "Description":"Call SageMaker Endpoint with API Gateway and Lambda", + "Parameters":{ + "SageMakerEndPointName": { + "Type" : "String", + "Description" : "Name of your SageMaker Endpoint" + }, + "LambdaName": { + "Type" : "String", + "Description" : "Name of your Lambda function" + }, + "S3BucketName": { + "Type" : "String", + "Description" : "Name of your S3 bucket" + } + }, + "Resources":{ + "lambdafunctionRole": { + "Type": "AWS::IAM::Role", + "Properties": { + "AssumeRolePolicyDocument": { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": ["lambda.amazonaws.com"] + }, + "Action": ["sts:AssumeRole"] + } + ] + }, + "Path": "/" + } + }, + "lambdafunctionRolePolicy": { + "Type": "AWS::IAM::Policy", + "Properties": { + "PolicyName": "lambda_sm_Function_Policy", + "PolicyDocument": { + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents", + "sagemaker:InvokeEndpoint", + "sagemaker:GetRecord", + "sagemaker:PutRecord" + ], + "Effect": "Allow", + "Resource": ["arn:aws:logs:*:*:*", + "arn:aws:sagemaker:*:*:*" ] + } + ] + }, + "Roles": [{ "Ref": "lambdafunctionRole"}] + } + }, + "invokeSMEndpoint": { + "Type": "AWS::Lambda::Function", + "Properties": { + "Code": { + "S3Bucket": {"Ref": "S3BucketName"}, + "S3Key": "function.zip" + }, + "Handler": "lambda_function.lambda_handler", + "FunctionName": {"Ref": "LambdaName"}, + "Layers": [ + "arn:aws:lambda:us-west-1:770693421928:layer:Klayers-python38-pandas:37" + ], + "Runtime": "python3.8", + "Timeout": 30, + "Role": {"Fn::GetAtt": ["lambdafunctionRole", "Arn"]} + } + }, + "ModelAPI": { + "Type": "AWS::ApiGateway::RestApi", + "Properties": { + "Name": "ModelAPI", + "Description": "API fronting Lambda function calling SageMaker endpoint", + "FailOnWarnings" : true + } + }, + "LambdaPermission": { + "Type": 
"AWS::Lambda::Permission", + "Properties": { + "Action": "lambda:invokeFunction", + "FunctionName": {"Fn::GetAtt": ["invokeSMEndpoint", "Arn"]}, + "Principal": "apigateway.amazonaws.com", + "SourceArn": {"Fn::Join": ["", + ["arn:aws:execute-api:", {"Ref": "AWS::Region"}, ":", {"Ref": "AWS::AccountId"}, ":", {"Ref": "ModelAPI"}, "/*"] + ]} + } + }, + "ModelApiStage": { + "DependsOn" : ["ApiGatewayAccount"], + "Type": "AWS::ApiGateway::Stage", + "Properties": { + "DeploymentId": {"Ref": "ApiDeployment"}, + "MethodSettings": [{ + "DataTraceEnabled": true, + "HttpMethod": "*", + "LoggingLevel": "INFO", + "ResourcePath": "/*" + }], + "RestApiId": {"Ref": "ModelAPI"}, + "StageName": "LATEST" + } + }, + "ApiGatewayCloudWatchLogsRole": { + "Type": "AWS::IAM::Role", + "Properties": { + "AssumeRolePolicyDocument": { + "Version": "2012-10-17", + "Statement": [{ + "Effect": "Allow", + "Principal": { "Service": ["apigateway.amazonaws.com"] }, + "Action": ["sts:AssumeRole"] + }] + }, + "Policies": [{ + "PolicyName": "ApiGatewayLogsPolicy", + "PolicyDocument": { + "Version": "2012-10-17", + "Statement": [{ + "Effect": "Allow", + "Action": [ + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:DescribeLogGroups", + "logs:DescribeLogStreams", + "logs:PutLogEvents", + "logs:GetLogEvents", + "logs:FilterLogEvents" + ], + "Resource": "*" + }] + } + }] + } + }, + "ApiGatewayAccount": { + "Type" : "AWS::ApiGateway::Account", + "Properties" : { + "CloudWatchRoleArn" : {"Fn::GetAtt" : ["ApiGatewayCloudWatchLogsRole", "Arn"] } + } + }, + + "ApiDeployment": { + "Type": "AWS::ApiGateway::Deployment", + "DependsOn": ["ModelRequest"], + "Properties": { + "RestApiId": {"Ref": "ModelAPI"}, + "StageName": "TestStage" + } + }, + "Model": { + "Type": "AWS::ApiGateway::Resource", + "Properties": { + "RestApiId": {"Ref": "ModelAPI"}, + "ParentId": {"Fn::GetAtt": ["ModelAPI", "RootResourceId"]}, + "PathPart": "Model" + } + }, + "ModelRequest": { + "DependsOn": "LambdaPermission", + "Type": 
"AWS::ApiGateway::Method", + "Properties": { + "AuthorizationType": "NONE", + "HttpMethod": "POST", + "Integration": { + "Type": "AWS_PROXY", + "IntegrationHttpMethod": "POST", + "Uri": {"Fn::Join" : ["", + ["arn:aws:apigateway:", {"Ref": "AWS::Region"}, ":lambda:path/2015-03-31/functions/", {"Fn::GetAtt": ["invokeSMEndpoint", "Arn"]}, "/invocations"] + ]} + }, + "MethodResponses": [{ + "StatusCode": "200" + }], + "ResourceId": {"Ref": "Model"}, + "RestApiId": {"Ref": "ModelAPI"} + } + } + }, + "Outputs":{ + "APIGatewayEndPointURL":{ + "Value": {"Fn::Join": ["", ["https://", {"Ref": "ModelAPI"}, ".execute-api.", {"Ref": "AWS::Region"}, ".amazonaws.com"]]} + } + } +} + """ + + return template_body.replace("lambda:us-west-1", f"lambda:{region}") diff --git a/patterns/ml_gateway/images/ml-gateway-pattern.png b/patterns/ml_gateway/images/ml-gateway-pattern.png new file mode 100644 index 0000000000..ce1887a1bc Binary files /dev/null and b/patterns/ml_gateway/images/ml-gateway-pattern.png differ diff --git a/patterns/ml_gateway/index.rst b/patterns/ml_gateway/index.rst new file mode 100644 index 0000000000..294c0cf70a --- /dev/null +++ b/patterns/ml_gateway/index.rst @@ -0,0 +1,13 @@ +The ML Gateway Pattern +============================== + +The following are examples of using the ML Gateway Pattern for inference. + + +Data Preparation and Inference Using SageMaker Feature Store +------------------------------------------------------------ + +.. 
toctree:: + :maxdepth: 2 + + ml-gateway diff --git a/patterns/ml_gateway/ml-gateway.ipynb b/patterns/ml_gateway/ml-gateway.ipynb new file mode 100644 index 0000000000..c7413ef08a --- /dev/null +++ b/patterns/ml_gateway/ml-gateway.ipynb @@ -0,0 +1,1263 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Enterprise-Grade ML : Part 1 - Prepare and Use the ML Gateway Pattern for Inference\n", + "\n", + "## Data Preparation and Inference Using SageMaker Feature Store " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " \n", + "\n", + "### Contents\n", + "\n", + "----\n", + "\n", + "- [Motivation](#motivation)\n", + "- [Architecture](#arch)\n", + "- [Import Libraries and SageMaker Session Variables](#imports)\n", + "- [Data and Features](#data)\n", + "- [Clean Up](#clean-up)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " \n", + "\n", + "## Motivation\n", + "\n", + "----\n", + "\n", + "Data Science projects often start in an experimental phase in which transformations on features are experimented with, algorithms are selected and tried for determining if they can fit the data distribution well enough for reliable predictions, tuning is done with various hyper-parameters and so on. 
\n", + "\n", + "As an organization matures in their Machine Learning (ML) Journey, they will find that they will then transition to an automated ML or MLOps phase where the pipelines for data preparation, training, deployment, monitoring will all need to be automated.\n", + "\n", + "In order to raise the maturity of projects to an Enterprise Scale that can fulfill business needs, sustain business-level continuity, scale, security and performance, the need for integrating data science experiments with machine learning deployment patterns and best-practices will grow in importance and will save you time and money.\n", + "\n", + "In this blog series on ML Patterns, we will start by focusing on Deployment Patterns and Best-Practices within the ML lifecycle : exploring the considerations and options that present themselves, post-training; on the serving/inference/prediction phases of the ML lifecycle.\n", + "\n", + "There are many ways in which we can expose an endpoint that was deployed as a hosted SageMaker endpoint: these variations are summarized in the ML Gateway Pattern with mandatory and optional components. Through this series of blogs we will outline options and their context, pros and cons for helping you decide what components to use for your specific workload and use-case.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " \n", + "\n", + "## Architecture\n", + "\n", + "----\n", + "\n", + "Here we break down the example in this blog into four parts:\n", + "\n", + "1. Data prep\n", + " 1. For preparation we will load the CSV into s3\n", + " 2. Then create and populate a Feature Store that can be used for training our model\n", + " 3. Later we will use Athena to load the data from the feature store into a dataframe\n", + "2. Training and deployment\n", + "3. Inference\n", + "4. 
MLOps — deployment of a CloudFormation template \n", + "\n", + "\n", + "\n", + "![image](./images/ml-gateway-pattern.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " \n", + "\n", + "## Import Libraries and SageMaker Session Variables\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pip\n", + "\n", + "def import_or_install(package):\n", + " try:\n", + " __import__(package)\n", + " except ImportError:\n", + " pip.main(['install', package]) \n", + " \n", + "import_or_install('sagemaker')\n", + "import_or_install('boto3')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import sagemaker\n", + "import boto3\n", + "import os\n", + "from sagemaker.inputs import TrainingInput\n", + "from sagemaker.xgboost import XGBoost\n", + "from sagemaker.predictor import Predictor\n", + "from sagemaker.serializers import CSVSerializer\n", + "from sagemaker.deserializers import CSVDeserializer\n", + "from sagemaker.session import production_variant\n", + "from sagemaker.model_monitor import DataCaptureConfig, CronExpressionGenerator, DefaultModelMonitor\n", + "from sagemaker.model_monitor.dataset_format import DatasetFormat\n", + "import datetime as datetime\n", + "import statistics\n", + "import numpy as np\n", + "import requests\n", + "import shutil\n", + "import time\n", + "import helpers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Session variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "role = sagemaker.get_execution_role()\n", + "\n", + "# Session variables\n", + "sess = sagemaker.Session()\n", + "bucket = sess.default_bucket()\n", + "prefix = \"ml-gateway\"\n", + "region = sess.boto_region_name\n", + "\n", + "print(f\"Region: {region}\\nBucket: 
{bucket}\\nPrefix: {prefix}\\n\")\n", + "\n", + "# Data source location\n", + "claims_url = \"https://raw.githubusercontent.com/aws/amazon-sagemaker-examples/master/end_to_end/fraud_detection/data/claims_preprocessed.csv\"\n", + "customers_url = \"https://raw.githubusercontent.com/aws/amazon-sagemaker-examples/master/end_to_end/fraud_detection/data/customers_preprocessed.csv\"\n", + "\n", + "# Feature Store parameters\n", + "claims_feature_group_name = \"claims-feature-group\"\n", + "customers_feature_group_name = \"customers-feature-group\"\n", + "claims_feature_group_description = \"Claims feature group\"\n", + "customers_feature_group_description = \"Customers feature group\"\n", + "id_name = \"policy_id\"\n", + "event_time_name = \"event_time\"\n", + "claims_offline_feature_group_bucket = f\"s3://{bucket}/claims-feature-group\"\n", + "customers_offline_feature_group_bucket = f\"s3://{bucket}/customers-feature-group\"\n", + "\n", + "# SageMaker training\n", + "s3_input_train_uri = f\"s3://{bucket}/{prefix}/data/train/train.csv\"\n", + "s3_input_test_uri = f\"s3://{bucket}/{prefix}/data/test/test.csv\"\n", + "train_instance_type = \"ml.m4.xlarge\"\n", + "train_base_job_name = \"xgboost-model\"\n", + "\n", + "# Model names\n", + "model1_name = \"xgboost-model-1\"\n", + "model2_name = \"xgboost-model-2\"\n", + "\n", + "# SageMaker endpoint\n", + "endpoint_name = \"xgboost-claims-fraud\"\n", + "deploy_instance_type = \"ml.m4.xlarge\"\n", + "\n", + "# SageMaker Model Monitor\n", + "monitor_schedule_name = f\"{prefix}-monitor-schedule\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " \n", + "\n", + "## Data and Features\n", + "\n", + "----\n", + "\n", + "The data we are using is the same synthetic data that was created in this blog post for the [End-to-end ML Lifecycle with Amazon SageMaker](https://aws.amazon.com/blogs/machine-learning/architect-and-build-the-full-machine-learning-lifecycle-with-amazon-sagemaker/). 
\n", + "The use-case in the above link/blog is Autoclaim Fraud Detection. We will be using the same datasets to demonstrate the ML Gateway Pattern in this example." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get claims and customer data from existing aws-samples location\n", + "claims_df = pd.read_csv(claims_url)\n", + "customers_df = pd.read_csv(customers_url)\n", + "\n", + "# If your DataFrame doesn't have a timestamp, you can just create one\n", + "timestamp = pd.to_datetime(\"now\").timestamp()\n", + "claims_df[event_time_name] = timestamp\n", + "customers_df[event_time_name] = timestamp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "claims_dtypes, customers_dtypes = helpers.get_datatypes()\n", + "claims_df = claims_df.astype(claims_dtypes)\n", + "customers_df = customers_df.astype(customers_dtypes)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add data to Feature Store" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "claims_feature_group, claims_feature_group_exists = helpers.create_feature_group(\n", + " claims_feature_group_name,\n", + " claims_feature_group_description,\n", + " claims_df,\n", + " id_name,\n", + " event_time_name,\n", + " claims_offline_feature_group_bucket,\n", + " sess,\n", + " role,\n", + ")\n", + "\n", + "customers_feature_group, customers_feature_group_exists = helpers.create_feature_group(\n", + " customers_feature_group_name,\n", + " customers_feature_group_description,\n", + " customers_df,\n", + " id_name,\n", + " event_time_name,\n", + " customers_offline_feature_group_bucket,\n", + " sess,\n", + " role,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Add data to 
Feature Groups" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Ingest data to Feature Store\n", + "feature_store_client = boto3.client(\"sagemaker-featurestore-runtime\", region_name=region)\n", + "if not claims_feature_group_exists:\n", + " helpers.ingest_df_to_feature_group(claims_df, claims_feature_group_name, feature_store_client)\n", + "if not customers_feature_group_exists:\n", + " helpers.ingest_df_to_feature_group(\n", + " customers_df, customers_feature_group_name, feature_store_client\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get training and test data from Feature Store" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Wait for data to be synchronized with offline Feature Store." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "time.sleep(900)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then query feature store to get training and test data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "claims_query = claims_feature_group.athena_query()\n", + "customers_query = customers_feature_group.athena_query()\n", + "\n", + "claims_table = claims_query.table_name\n", + "customers_table = customers_query.table_name\n", + "database_name = customers_query.database\n", + "\n", + "feature_columns = list(set(claims_df.columns) ^ set(customers_df.columns))\n", + "feature_columns_string = \", \".join(f'\"{c}\"' for c in feature_columns)\n", + "feature_columns_string = f'\"{claims_table}\".{id_name} as {id_name}, ' + feature_columns_string\n", + "\n", + "query_string = f\"\"\"\n", + "SELECT {feature_columns_string}\n", + "FROM \"{claims_table}\" LEFT JOIN \"{customers_table}\" \n", + "ON \"{claims_table}\".{id_name} = \"{customers_table}\".{id_name}\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "claims_query.run(query_string=query_string, output_location=f\"s3://{bucket}/{prefix}/query_results\")\n", + "claims_query.wait()\n", + "dataset = claims_query.as_dataframe()\n", + "\n", + "# Create data directory to store local data\n", + "data_dir = os.path.join(os.getcwd(), \"data\")\n", + "os.makedirs(data_dir, exist_ok=True)\n", + "\n", + "dataset.to_csv(\"data/claims_customer.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save training and test sets locally." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "col_order = [\"fraud\"] + list(dataset.drop([\"fraud\", \"policy_id\"], axis=1).columns)\n", + "train = dataset.sample(frac=0.80, random_state=0)[col_order]\n", + "test = dataset.drop(train.index)[col_order]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train.to_csv(\"data/train.csv\", index=False)\n", + "test.to_csv(\"data/test.csv\", index=False)\n", + "\n", + "test = test.reset_index(drop=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Upload datasets to S3." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s3_client = boto3.client(\"s3\", region_name=region)\n", + "s3_client.upload_file(\n", + " Filename=\"data/train.csv\", Bucket=bucket, Key=f\"{prefix}/data/train/train.csv\"\n", + ")\n", + "s3_client.upload_file(Filename=\"data/test.csv\", Bucket=bucket, Key=f\"{prefix}/data/test/test.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train and deploy an XGBoost model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s3_input_train = TrainingInput(s3_input_train_uri, content_type=\"csv\")\n", + "s3_input_test = TrainingInput(s3_input_test_uri, content_type=\"csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "hyperparameters = {\n", + " \"max_depth\": \"3\",\n", + " \"eta\": \"0.2\",\n", + " \"objective\": \"binary:logistic\",\n", + " \"num_round\": \"100\",\n", + "}\n", + "\n", + "estimator_parameters = {\n", + " \"entry_point\": \"code/train_deploy.py\",\n", + " \"instance_type\": train_instance_type,\n", + " \"instance_count\": 1,\n", + " \"hyperparameters\": hyperparameters,\n", + " 
\"role\": role,\n", + " \"base_job_name\": train_base_job_name,\n", + " \"framework_version\": \"1.0-1\",\n", + " \"py_version\": \"py3\",\n", + "}\n", + "\n", + "estimator = XGBoost(**estimator_parameters)\n", + "inputs = {\"train\": s3_input_train, \"test\": s3_input_test}\n", + "\n", + "# Train the model if it already hasn't been trained\n", + "existing_training_jobs = sess.sagemaker_client.list_training_jobs(\n", + " NameContains=train_base_job_name, MaxResults=30\n", + ")[\"TrainingJobSummaries\"]\n", + "if not existing_training_jobs:\n", + " estimator.fit(inputs)\n", + "# Else fetch the latest training job\n", + "else:\n", + " latest_training_job_name = existing_training_jobs[0][\"TrainingJobName\"]\n", + " estimator = XGBoost.attach(latest_training_job_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create two SageMaker models to deploy behind a single endpoint using SageMaker Production Variants." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model1 = estimator.create_model(entry_point=\"code/train_deploy.py\", role=role, name=model1_name)\n", + "model1._create_sagemaker_model(instance_type=deploy_instance_type)\n", + "\n", + "model2 = estimator.create_model(entry_point=\"code/train_deploy.py\", role=role, name=model2_name)\n", + "model2._create_sagemaker_model(instance_type=deploy_instance_type)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "variant_1 = production_variant(\n", + " model_name=model1_name,\n", + " instance_type=deploy_instance_type,\n", + " initial_instance_count=1,\n", + " variant_name=\"Variant1\",\n", + " initial_weight=1,\n", + ")\n", + "\n", + "\n", + "variant_2 = production_variant(\n", + " model_name=model2_name,\n", + " instance_type=deploy_instance_type,\n", + " initial_instance_count=1,\n", + " variant_name=\"Variant2\",\n", + " initial_weight=1,\n", + ")" + ] + 
}, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Setup Model Monitor's Data Capture for Production Variants." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s3_capture_upload_path = f\"s3://{bucket}/{prefix}/model_monitor\"\n", + "\n", + "data_capture_config = DataCaptureConfig(\n", + " enable_capture=True, sampling_percentage=100, destination_s3_uri=s3_capture_upload_path\n", + ")\n", + "\n", + "data_capture_config_dict = data_capture_config._to_request_dict()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now create the Production Variant endpoint." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If not already deployed, deploy the model\n", + "existing_endpoints = sess.sagemaker_client.list_endpoints(\n", + " NameContains=endpoint_name, MaxResults=30\n", + ")[\"Endpoints\"]\n", + "if not existing_endpoints:\n", + " sess.endpoint_from_production_variants(\n", + " name=endpoint_name,\n", + " production_variants=[variant_1, variant_2],\n", + " data_capture_config_dict=data_capture_config_dict,\n", + " )\n", + " predictor = Predictor(\n", + " endpoint_name=endpoint_name,\n", + " sagemaker_session=sess,\n", + " serializer=CSVSerializer(),\n", + " deserializer=CSVDeserializer(),\n", + " )\n", + "else:\n", + " predictor = Predictor(\n", + " endpoint_name=endpoint_name,\n", + " sagemaker_session=sess,\n", + " serializer=CSVSerializer(),\n", + " deserializer=CSVDeserializer(),\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a baseline." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Baseline data is the training data that we saved as CSV\n", + "baseline_data_uri = s3_input_train_uri\n", + "baseline_results_uri = f\"s3://{bucket}/{prefix}/model_monitor/baseline_output\"\n", + "\n", + "my_default_monitor = DefaultModelMonitor(\n", + " role=role,\n", + " instance_count=1,\n", + " instance_type=\"ml.m5.large\",\n", + " volume_size_in_gb=20,\n", + " max_runtime_in_seconds=3600,\n", + ")\n", + "\n", + "my_default_monitor.suggest_baseline(\n", + " baseline_dataset=baseline_data_uri,\n", + " dataset_format=DatasetFormat.csv(header=False),\n", + " output_s3_uri=baseline_results_uri,\n", + " wait=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the monitoring job." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "baseline_violations_uri = f\"s3://{bucket}/{prefix}/model_monitor/violations\"\n", + "\n", + "my_default_monitor.create_monitoring_schedule(\n", + " monitor_schedule_name=monitor_schedule_name,\n", + " endpoint_input=endpoint_name,\n", + " output_s3_uri=baseline_violations_uri,\n", + " statistics=my_default_monitor.baseline_statistics(),\n", + " constraints=my_default_monitor.suggested_constraints(),\n", + " schedule_cron_expression=CronExpressionGenerator.hourly(),\n", + " enable_cloudwatch_metrics=True,\n", + ")\n", + "\n", + "desc_schedule_result = my_default_monitor.describe_schedule()\n", + "print(\"Schedule status: {}\".format(desc_schedule_result[\"MonitoringScheduleStatus\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test Feature Store in Real-Time Inference" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See how you can aggregate data from multiple Feature Groups and use those features as input to a SageMaker endpoint in a low-latency 
fashion." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "def get_prediction(policy_id, featurestore_runtime):\n", + " t0 = datetime.datetime.now()\n", + " customer_record_response = featurestore_runtime.get_record(\n", + " FeatureGroupName=\"customers-feature-group\", RecordIdentifierValueAsString=str(policy_id)\n", + " )\n", + "\n", + " claims_record_response = featurestore_runtime.get_record(\n", + " FeatureGroupName=\"claims-feature-group\", RecordIdentifierValueAsString=str(policy_id)\n", + " )\n", + "\n", + " t1 = datetime.datetime.now()\n", + "\n", + " customer_record = customer_record_response[\"Record\"]\n", + " customer_df = pd.DataFrame(customer_record).set_index(\"FeatureName\")\n", + " claims_record = claims_record_response[\"Record\"]\n", + " claims_df = pd.DataFrame(claims_record).set_index(\"FeatureName\")\n", + "\n", + " joined_df = pd.concat([claims_df, customer_df]).loc[col_order].drop(\"fraud\")\n", + " payload = \",\".join(joined_df[\"ValueAsString\"])\n", + " prediction = float(\n", + " predictor.predict(\n", + " payload, initial_args={\"ContentType\": \"text/csv\"}, target_variant=\"Variant1\"\n", + " )[0][0]\n", + " )\n", + "\n", + " diff = t1 - t0\n", + " minutes, seconds = divmod(diff.total_seconds(), 60)\n", + " timer.append(seconds)\n", + "\n", + " return prediction\n", + "\n", + "\n", + "# Instantiate Feature Store Runtime client\n", + "boto_session = boto3.Session(region_name=region)\n", + "featurestore_runtime = boto_session.client(\n", + " service_name=\"sagemaker-featurestore-runtime\", region_name=region\n", + ")\n", + "\n", + "MAX_POLICY_IDS = 100\n", + "timer = []\n", + "for policy_id in range(1, MAX_POLICY_IDS + 1):\n", + " prediction = get_prediction(policy_id, featurestore_runtime)\n", + " print(f\"Probablitity the claim from policy {int(policy_id)} is fraudulent:\", prediction)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "Get latencies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "timer_array = np.array(timer)\n", + "print(\n", + " f\"p95: {np.percentile(timer_array,95)}, p99: {np.percentile(timer_array,99)}, mean: {np.mean(timer_array)} for {MAX_POLICY_IDS} distinct Feature Store gets across two Feature Groups\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create ML Gateway with Feature Store" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, write out a Lambda function script. Make sure to replace the `ENDPOINT_NAME` variable with the name of your deployed SageMaker endpoint." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Lambda function will check if the policy ID from a user request already exists in Feature Store. If so, it will fetch the features associated with the policy ID from both Feature Groups and feed them as inputs into the SageMaker endpoint.\n", + "\n", + "If there are no features in Feature Store for the given policy ID, then take the raw data from the request, transform it, store it in Feature Store, and return a prediction back to the user." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile lambda_function.py\n", + "\n", + "import os\n", + "import io\n", + "import boto3\n", + "import json\n", + "import pandas as pd\n", + "import datetime as datetime\n", + "import re\n", + "\n", + "ENDPOINT_NAME = \"xgboost-claims-fraud\" # REPLACE WITH SAGEMAKER ENDPOINT NAME\n", + "ENDPOINT_NAME = ENDPOINT_NAME.strip()\n", + "runtime = boto3.client(\"runtime.sagemaker\")\n", + "\n", + "# Instantiate Feature Store Runtime client\n", + "# get current region\n", + "region = boto3.Session().region_name\n", + "print(f\"region : {region}\\n\")\n", + "\n", + "boto_session = boto3.Session(region_name=region)\n", + "featurestore_runtime = boto_session.client(\n", + " service_name=\"sagemaker-featurestore-runtime\", region_name=region\n", + ")\n", + "\n", + "\n", + "def get_payload(policy_id):\n", + " \"\"\"Get records associated with the policy id from both\n", + " Feature Groups\n", + "\n", + " Args:\n", + " policy_id: int or str\n", + "\n", + " Returns:\n", + " str\n", + " \"\"\"\n", + "\n", + " col_order = [\n", + " \"fraud\",\n", + " \"driver_relationship_child\",\n", + " \"num_insurers_past_5_years\",\n", + " \"incident_severity\",\n", + " \"driver_relationship_self\",\n", + " \"authorities_contacted_none\",\n", + " \"months_as_customer\",\n", + " \"driver_relationship_na\",\n", + " \"policy_liability\",\n", + " \"collision_type_side\",\n", + " \"collision_type_front\",\n", + " \"incident_month\",\n", + " \"num_claims_past_year\",\n", + " \"customer_gender_male\",\n", + " \"num_vehicles_involved\",\n", + " \"customer_education\",\n", + " \"authorities_contacted_ambulance\",\n", + " \"police_report_available\",\n", + " \"incident_dow\",\n", + " \"vehicle_claim\",\n", + " \"collision_type_rear\",\n", + " \"customer_gender_female\",\n", + " \"incident_day\",\n", + " \"policy_state_or\",\n", + " \"customer_age\",\n", + " \"policy_state_wa\",\n", + " 
\"injury_claim\",\n", + " \"policy_state_id\",\n", + " \"driver_relationship_spouse\",\n", + " \"policy_deductable\",\n", + " \"num_injuries\",\n", + " \"collision_type_na\",\n", + " \"driver_relationship_other\",\n", + " \"incident_hour\",\n", + " \"incident_type_theft\",\n", + " \"incident_type_breakin\",\n", + " \"num_witnesses\",\n", + " \"policy_state_ca\",\n", + " \"policy_state_nv\",\n", + " \"incident_type_collision\",\n", + " \"auto_year\",\n", + " \"authorities_contacted_police\",\n", + " \"policy_state_az\",\n", + " \"policy_annual_premium\",\n", + " \"total_claim_amount\",\n", + " \"authorities_contacted_fire\",\n", + " ]\n", + " t0 = datetime.datetime.now()\n", + " customer_record_response = featurestore_runtime.get_record(\n", + " FeatureGroupName=\"customers-feature-group\", RecordIdentifierValueAsString=str(policy_id)\n", + " )\n", + " claims_record_response = featurestore_runtime.get_record(\n", + " FeatureGroupName=\"claims-feature-group\", RecordIdentifierValueAsString=str(policy_id)\n", + " )\n", + " t1 = datetime.datetime.now()\n", + " customer_record = customer_record_response[\"Record\"]\n", + " customer_df = pd.DataFrame(customer_record).set_index(\"FeatureName\")\n", + " claims_record = claims_record_response[\"Record\"]\n", + " claims_df = pd.DataFrame(claims_record).set_index(\"FeatureName\")\n", + " joined_df = pd.concat([claims_df, customer_df]).loc[col_order].drop(\"fraud\")\n", + " payload = \",\".join(joined_df[\"ValueAsString\"])\n", + " return payload\n", + "\n", + "\n", + "def response(message, status_code):\n", + " return {\n", + " \"statusCode\": str(status_code),\n", + " \"body\": json.dumps(message),\n", + " \"headers\": {\"Content-Type\": \"application/json\", \"Access-Control-Allow-Origin\": \"*\"},\n", + " }\n", + "\n", + "\n", + "def one_hot_encoder(df: pd.DataFrame, input_column: str, categories: list) -> None:\n", + " \"\"\"A one hot encoder similiar to the one in Data Wrangler.\n", + "\n", + " Args:\n", + " df: A Pandas 
DataFrame.\n", + " input_column: The name of the column which contains the categorical values.\n", + " categories: The list of categorical values which was available during training.\n", + "\n", + " Returns:\n", + " None: The DataFrame is updated in place with the encoded features.\n", + "\n", + " \"\"\"\n", + "\n", + " # NaN types are converted to literal `na` in Data Wrangler during one-hot encoding\n", + " if \"na\" in categories:\n", + " df[input_column].fillna(\"na\", inplace=True)\n", + " for c in categories:\n", + " df[f\"{input_column}_{c}\"] = 0\n", + " for idx, val in df[input_column].iteritems():\n", + " df.at[idx, f\"{input_column}_{val}\"] = 1\n", + " df.drop(input_column, axis=1, inplace=True)\n", + "\n", + "\n", + "def transform_claims_data(claims_data: dict) -> pd.DataFrame:\n", + " \"\"\"Transforms the inbound claims data to the feature store format.\n", + "\n", + " Args:\n", + " claims_data: A dictionary containing the claims data.\n", + "\n", + " Returns:\n", + " pd.DataFrame: A Pandas DataFrame containing the processed claims data.\n", + " \"\"\"\n", + "\n", + " claims_df = pd.DataFrame.from_dict(claims_data)\n", + "\n", + " # (3) convert cat columns to lowercase\n", + " claims_df = claims_df.applymap(lambda s: s.lower() if type(s) == str else s)\n", + "\n", + " # (4-6) format string\n", + " invalid_char = re.compile(\"[-@#$%^&*()_+=/\\`~{}|<>?]\")\n", + " claims_df[\"driver_relationship\"].replace(invalid_char, \" \", regex=True, inplace=True)\n", + " claims_df[\"collision_type\"].replace(invalid_char, \" \", regex=True, inplace=True)\n", + " claims_df[\"incident_type\"].replace(invalid_char, \" \", regex=True, inplace=True)\n", + "\n", + " # (7-10) one hot encode\n", + " one_hot_encoder(claims_df, \"driver_relationship\", [\"spouse\", \"self\", \"child\", \"na\", \"other\"])\n", + " one_hot_encoder(claims_df, \"incident_type\", [\"collision\", \"breakin\", \"theft\"])\n", + " one_hot_encoder(claims_df, \"collision_type\", [\"front\", \"rear\", 
\"side\", \"na\"])\n", + " one_hot_encoder(claims_df, \"authorities_contacted\", [\"none\", \"police\", \"ambulance\", \"fire\"])\n", + "\n", + " # (11-12) ordinal encode\n", + " claims_df[\"incident_severity\"] = claims_df[\"incident_severity\"].replace(\n", + " {\"minor\": 0, \"major\": 1, \"totaled\": 2, \"na\": 3}\n", + " )\n", + " claims_df[\"police_report_available\"] = claims_df[\"police_report_available\"].replace(\n", + " {\"no\": 0, \"yes\": 1, \"na\": 2}\n", + " )\n", + "\n", + " # (13) create event_time\n", + " claims_df[\"event_time\"] = pd.to_datetime(\"now\").timestamp()\n", + "\n", + " # NOTE: remaining steps in Flow file involve casting encoded columns from Float to Long, which is not\n", + " # necessary here.\n", + "\n", + " return claims_df\n", + "\n", + "\n", + "def transform_customers_data(customers_data: dict) -> pd.DataFrame:\n", + " \"\"\"Transforms the inbound customers data to the feature store format.\n", + "\n", + " Args:\n", + " customers_data: A dictionary containing the customers data.\n", + "\n", + " Returns:\n", + " pd.DataFrame: A Pandas DataFrame containing the processed customers data.\n", + " \"\"\"\n", + " customers_df = pd.DataFrame.from_dict(customers_data)\n", + "\n", + " # (3) convert cat columns to lowercase\n", + " customers_df = customers_df.applymap(lambda s: s.lower() if type(s) == str else s)\n", + "\n", + " # (4) drop customer_zip\n", + " customers_df.drop(\"customer_zip\", axis=1, inplace=True)\n", + "\n", + " # (5-6) one hot encode\n", + " one_hot_encoder(customers_df, \"customer_gender\", [\"unkown\", \"male\", \"female\", \"other\"])\n", + " one_hot_encoder(customers_df, \"policy_state\", [\"wa\", \"ca\", \"az\", \"or\", \"nv\", \"id\"])\n", + "\n", + " # (7-8) ordinal encode\n", + " customers_df[\"customer_education\"] = customers_df[\"customer_education\"].replace(\n", + " {\n", + " \"below high school\": 0,\n", + " \"high school\": 1,\n", + " \"associate\": 2,\n", + " \"bachelor\": 3,\n", + " \"advanced 
degree\": 4,\n", + " }\n", + " )\n", + " customers_df[\"policy_liability\"] = customers_df[\"policy_liability\"].replace(\n", + " {\"15/30\": 0, \"25/50\": 1, \"30/60\": 2, \"100/200\": 3}\n", + " )\n", + "\n", + " # NOTE: steps 9-18 in Flow file involve casting encoded columns from Float to Long, which is not\n", + " # necessary here.\n", + "\n", + " # (19) create event_time\n", + " customers_df[\"event_time\"] = pd.to_datetime(\"now\").timestamp()\n", + "\n", + " # (20-21) drop unused columns\n", + " customers_df.drop(\"customer_gender_unkown\", axis=1, inplace=True)\n", + " customers_df.drop(\"customer_gender_other\", axis=1, inplace=True)\n", + "\n", + " return customers_df\n", + "\n", + "\n", + "def ingest_df_to_feature_group(df, feature_group_name):\n", + " \"\"\"Ingests data from a DataFrame into a Feature Groups\n", + "\n", + " Args:\n", + " df: pd.DataFrame\n", + " feature_group_name: str\n", + "\n", + " Returns:\n", + " None: Data is already ingested into Feature Group\n", + " \"\"\"\n", + " success, fail = 0, 0\n", + " for row_num, row_series in df.astype(str).iterrows():\n", + " record = []\n", + " for key, value in row_series.to_dict().items():\n", + " record.append({\"FeatureName\": key, \"ValueAsString\": str(value)})\n", + " print(record)\n", + " response = featurestore_runtime.put_record(\n", + " FeatureGroupName=feature_group_name, Record=record\n", + " )\n", + " if response[\"ResponseMetadata\"][\"HTTPStatusCode\"] == 200:\n", + " success += 1\n", + " else:\n", + " fail += 1\n", + " print(f\"Success = {success}\")\n", + " print(f\"Fail = {fail}\")\n", + "\n", + "\n", + "def get_prediction(policy, target_variant):\n", + " \"\"\"Get records from Feature Groups and invoke SageMaker endpoint\n", + "\n", + " Args:\n", + " policy: int or str\n", + "\n", + " Returns:\n", + " dict to be used as a json response\n", + " \"\"\"\n", + " feature_record = get_payload(policy)\n", + " sm_response = runtime.invoke_endpoint(\n", + " EndpointName=ENDPOINT_NAME,\n", 
+ " ContentType=\"text/csv\",\n", + " Accept=\"application/json\",\n", + " Body=feature_record,\n", + " TargetVariant=target_variant,\n", + " )\n", + " result = json.loads(sm_response[\"Body\"].read().decode())\n", + " pred = result[0]\n", + " return response({\"prediction\": pred}, 200)\n", + "\n", + "\n", + "def lambda_handler(event, context):\n", + " print(\"Received event: \" + json.dumps(event, indent=2))\n", + "\n", + " # If request came from API Gateway\n", + " try:\n", + " data = json.loads(event[\"body\"])\n", + "\n", + " # Otherwise it's just a test case\n", + " except:\n", + " data = json.loads(json.dumps(event))\n", + "\n", + " policy = data[\"claim\"][\"policy_id\"][\"0\"]\n", + " target_variant = data[\"variant\"]\n", + "\n", + " try:\n", + " return get_prediction(policy, target_variant)\n", + " except:\n", + " # Get raw data from request\n", + " claim = data[\"claim\"]\n", + " customer = data[\"customer\"]\n", + " # Transform data\n", + " processed_claims_df = transform_claims_data(claim)\n", + " processed_customers_df = transform_customers_data(customer)\n", + " # Ingest newly processed records into Feature Groups\n", + " ingest_df_to_feature_group(processed_claims_df, \"claims-feature-group\")\n", + " ingest_df_to_feature_group(processed_customers_df, \"customers-feature-group\")\n", + " # Return prediction\n", + " return get_prediction(policy, target_variant)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Upload the Lambda code to S3." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "shutil.make_archive(\"function\", \"zip\", \".\", \"lambda_function.py\")\n", + "s3_bucket_uri = f\"s3://{bucket}\"\n", + "\n", + "!aws s3 cp function.zip {s3_bucket_uri}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the `helpers` library to deploy what we call an ML Gateway pattern. 
This will spin up an API Gateway endpoint that's attached to a Lambda function with code you've seen above. This is the gateway that ties together the SageMaker Feature Store and a model deployed as a SageMaker endpoint." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To deploy this ML Gateway pattern, you need to add the following permissions to your SageMaker execution role:\n", + "\n", + "```json\n", + "{\n", + " \"Effect\": \"Allow\",\n", + " \"Action\": [\n", + " \"apigateway:*\"\n", + " ],\n", + " \"Resource\": [\n", + " \"*\"\n", + " ]\n", + "},\n", + "{\n", + " \"Effect\": \"Allow\",\n", + " \"Action\": [\n", + " \"lambda:GetLayerVersion\"\n", + " ],\n", + " \"Resource\": [\n", + " \"*\"\n", + " ]\n", + "}\n", + "```\n", + "\n", + "Alternatively, you can add the managed `AWSLambdaFullAccess` and `AmazonAPIGatewayAdministrator` policies to your SageMaker execution role but keep in mind these particular managed policies are overly permissive and should be reviewed for least privileges before production." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "api_gateway_url = helpers.deploy_ml_gateway_pattern(endpoint_name, region, bucket)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the above API Gateway URL, we can call our endpoint with Feature Store." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "api_gateway_url" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Endpoint\n", + "url = api_gateway_url\n", + "\n", + "# User request data\n", + "input_data = {\n", + " \"variant\": \"Variant1\",\n", + " \"claim\": {\n", + " \"policy_id\": {\"0\": \"999999999\"},\n", + " \"driver_relationship\": {\"0\": \"Spouse\"},\n", + " \"incident_type\": {\"0\": \"Collision\"},\n", + " \"collision_type\": {\"0\": \"Front\"},\n", + " \"incident_severity\": {\"0\": \"Minor\"},\n", + " \"authorities_contacted\": {\"0\": \"None\"},\n", + " \"num_vehicles_involved\": {\"0\": 2},\n", + " \"num_injuries\": {\"0\": 0},\n", + " \"num_witnesses\": {\"0\": 0},\n", + " \"police_report_available\": {\"0\": \"No\"},\n", + " \"injury_claim\": {\"0\": 71600},\n", + " \"vehicle_claim\": {\"0\": 8913.6687631788},\n", + " \"total_claim_amount\": {\"0\": 80513.6687631788},\n", + " \"incident_month\": {\"0\": 3},\n", + " \"incident_day\": {\"0\": 17},\n", + " \"incident_dow\": {\"0\": 6},\n", + " \"incident_hour\": {\"0\": 8},\n", + " \"fraud\": {\"0\": 0},\n", + " },\n", + " \"customer\": {\n", + " \"policy_id\": {\"0\": \"999999999\"},\n", + " \"customer_age\": {\"0\": 54},\n", + " \"months_as_customer\": {\"0\": 94},\n", + " \"num_claims_past_year\": {\"0\": 0},\n", + " \"num_insurers_past_5_years\": {\"0\": 1},\n", + " \"policy_state\": {\"0\": \"WA\"},\n", + " \"policy_deductable\": {\"0\": 750},\n", + " \"policy_annual_premium\": {\"0\": 3000},\n", + " \"policy_liability\": {\"0\": \"25/50\"},\n", + " \"customer_zip\": {\"0\": 99207},\n", + " \"customer_gender\": {\"0\": \"Unkown\"},\n", + " \"customer_education\": {\"0\": \"Associate\"},\n", + " \"auto_year\": {\"0\": 2006},\n", + " },\n", + "}\n", + "\n", + "# Hit endpoint\n", + "r = requests.post(url, json=input_data)\n", + "\n", + "# Print response\n", + 
"print(r.json())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clean Up" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def clean_up():\n", + " # Delete the online Feature Groups\n", + " claims_feature_group.delete()\n", + " customers_feature_group.delete()\n", + "\n", + " # Delete the offline Feature Groups\n", + " !aws s3 rm {claims_offline_feature_group_bucket} --recursive\n", + " !aws s3 rm {customers_offline_feature_group_bucket} --recursive\n", + " !aws s3 rm {prefix} --recursive\n", + "\n", + " # Delete training and test data\n", + " s3_prefix_uri = f\"s3://{bucket}/{prefix}\"\n", + " !aws s3 rm {s3_prefix_uri} --recursive\n", + "\n", + " # Delete model monitor\n", + " !aws sagemaker delete-monitoring-schedule --monitoring-schedule-name {monitor_schedule_name}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Uncomment the following cell to clean up the Feature Groups, the offline Feature Group S3 buckets, and the Model Monitor schedule." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#clean_up()" + ] + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "Python 3 (Data Science)", + "language": "python", + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/r_examples/r_api_serving_examples/API Serving Examples.ipynb b/r_examples/r_api_serving_examples/API Serving Examples.ipynb new file mode 100644 index 0000000000..cb85c5bac6 --- /dev/null +++ b/r_examples/r_api_serving_examples/API Serving Examples.ipynb @@ -0,0 +1,610 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# R API Serving Examples\n", + "\n", + "In this example, we demonstrate how to quickly compare the runtimes of three methods for serving a model from an R hosted REST API. The following SageMaker examples discuss each method in detail:\n", + "\n", + "* **Plumber**\n", + " * Website: [https://www.rplumber.io/](https://www.rplumber.io)\n", + " * SageMaker Example: [r_serving_with_plumber](../r_serving_with_plumber)\n", + "* **RestRServe**\n", + " * Website: [https://restrserve.org](https://restrserve.org)\n", + " * SageMaker Example: [r_serving_with_restrserve](../r_serving_with_restrserve)\n", + "* **FastAPI** (reticulated from Python)\n", + " * Website: [https://fastapi.tiangolo.com](https://fastapi.tiangolo.com)\n", + " * SageMaker Example: [r_serving_with_fastapi](../r_serving_with_fastapi)\n", + " \n", + "We will reuse the docker images from each of these examples. 
Each one is configured to serve a small XGBoost model which has already been trained on the classical Iris dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Building Docker Images for Serving\n", + "\n", + "First, we will build each docker image from the provided SageMaker Examples." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plumber Serving Image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!cd .. && docker build -t r-plumber -f r_serving_with_plumber/Dockerfile r_serving_with_plumber" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### RestRServe Serving Image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!cd .. && docker build -t r-restrserve -f r_serving_with_restrserve/Dockerfile r_serving_with_restrserve" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### FastAPI Serving Image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!cd .. && docker build -t r-fastapi -f r_serving_with_fastapi/Dockerfile r_serving_with_fastapi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Launch Serving Containers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we will launch each serving container. 
The containers will be launched on the following ports to avoid port collisions on your local machine or SageMaker Notebook instance:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ports = {\n", + " \"plumber\": 5000,\n", + " \"restrserve\": 5001,\n", + " \"fastapi\": 5002,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!bash launch.sh" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!docker container list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Simple Client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from tqdm import tqdm\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_predictions(examples, instance=requests, port=5000):\n", + " payload = {\"features\": examples}\n", + " return instance.post(f\"http://127.0.0.1:{port}/invocations\", json=payload)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_health(instance=requests, port=5000):\n", + " instance.get(f\"http://127.0.0.1:{port}/ping\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Example Inputs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we define example inputs from the classical [Iris](https://archive.ics.uci.edu/ml/datasets/iris) dataset.\n", + "* Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "column_names = [\"Sepal.Length\", \"Sepal.Width\", \"Petal.Length\", \"Petal.Width\", \"Label\"]\n", + "iris = pd.read_csv(\n", + " \"s3://sagemaker-sample-files/datasets/tabular/iris/iris.data\", names=column_names\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "iris_features = iris[[\"Sepal.Length\", \"Sepal.Width\", \"Petal.Length\", \"Petal.Width\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "example = iris_features.values[:1].tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "many_examples = iris_features.values[:100].tolist()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Testing\n", + "\n", + "Now it's time to test how each API server performs under stress." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will test two use cases:\n", + "* **New Requests**: In this scenario, we test how quickly the server can respond with predictions when each client request establishes a new connection with the server. This simulates the server's ability to handle real-time requests. We could make this more realistic by creating an asynchronous environment that tests the server's ability to fulfill concurrent rather than sequential requests.\n", + "* **Keep Alive / Reuse Session**: In this scenario, we test how quickly the server can respond with predictions when each client request uses a session to keep its connection to the server alive between requests. This simulates the server's ability to handle sequential batch requests from the same client." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For each of the two use cases, we will test the performance on following situations:\n", + "\n", + "* 1000 requests of a single example\n", + "* 1000 requests of 100 examples\n", + "* 1000 pings for health status" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## New Requests" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plumber" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# verify the prediction output\n", + "get_predictions(example, port=ports[\"plumber\"]).json()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " _ = get_predictions(example, port=ports[\"plumber\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " _ = get_predictions(many_examples, port=ports[\"plumber\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " get_health(port=ports[\"plumber\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### RestRserve" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# verify the prediction output\n", + "get_predictions(example, port=ports[\"restrserve\"]).json()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " _ = get_predictions(example, port=ports[\"restrserve\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " _ = get_predictions(many_examples, 
port=ports[\"restrserve\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " get_health(port=ports[\"restrserve\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### FastAPI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# verify the prediction output\n", + "get_predictions(example, port=ports[\"fastapi\"]).json()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " _ = get_predictions(example, port=ports[\"fastapi\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " _ = get_predictions(many_examples, port=ports[\"fastapi\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " get_health(port=ports[\"fastapi\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Keep Alive (Reuse Session)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's test how each one performs when each request reuses a session connection. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# reuse the session for each post and get request\n", + "instance = requests.Session()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plumber" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " _ = get_predictions(example, instance=instance, port=ports[\"plumber\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " _ = get_predictions(many_examples, instance=instance, port=ports[\"plumber\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " get_health(instance=instance, port=ports[\"plumber\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### RestRserve" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " _ = get_predictions(example, instance=instance, port=ports[\"restrserve\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " _ = get_predictions(many_examples, instance=instance, port=ports[\"restrserve\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " get_health(instance=instance, port=ports[\"restrserve\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### FastAPI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " _ = get_predictions(example, 
instance=instance, port=ports[\"fastapi\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " _ = get_predictions(many_examples, instance=instance, port=ports[\"fastapi\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in tqdm(range(1000)):\n", + " get_health(instance=instance, port=ports[\"fastapi\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Stop All Serving Containers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we will shut down the serving containers we launched for the tests." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!docker kill $(docker ps -q)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we demonstrated how to conduct a simple performance benchmark across three R model serving solutions. We leave the choice of serving solution up to the reader since in some cases it might be appropriate to customize the benchmark in the following ways:\n", + "\n", + "* Update the serving example to serve a specific model\n", + "* Perform the tests across multiple instance types\n", + "* Modify the serving example and client to test asynchronous requests.\n", + "* Deploy the serving examples to SageMaker Endpoints to test within an autoscaling environment.\n", + "\n", + "For more information on serving your models in custom containers on SageMaker, please see our [support documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-main.html) for the latest updates and best practices." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/r_examples/r_byo_r_algo_hpo/iris.csv b/r_examples/r_api_serving_examples/iris.csv similarity index 100% rename from r_examples/r_byo_r_algo_hpo/iris.csv rename to r_examples/r_api_serving_examples/iris.csv diff --git a/r_examples/r_api_serving_examples/launch.sh b/r_examples/r_api_serving_examples/launch.sh new file mode 100644 index 0000000000..e456602d35 --- /dev/null +++ b/r_examples/r_api_serving_examples/launch.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +echo "Launching Plumber" +docker run -d --rm -p 5000:8080 r-plumber + +echo "Launching RestRServer" +docker run -d --rm -p 5001:8080 r-restrserve + +echo "Launching FastAPI" +docker run -d --rm -p 5002:8080 r-fastapi + diff --git a/r_examples/r_byo_r_algo_hpo/tune_r_bring_your_own.ipynb b/r_examples/r_byo_r_algo_hpo/tune_r_bring_your_own.ipynb index b6e0171ed5..8dafc1a894 100644 --- a/r_examples/r_byo_r_algo_hpo/tune_r_bring_your_own.ipynb +++ b/r_examples/r_byo_r_algo_hpo/tune_r_bring_your_own.ipynb @@ -151,6 +151,9 @@ "\n", "fullname=\"${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest\"\n", "\n", + "# Get the login command from ECR and execute it directly\n", + "aws ecr get-login-password --region ${region} | docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com\n", + "\n", "# If the repository doesn't exist in ECR, create it.\n", "aws ecr describe-repositories --repository-names \"${algorithm_name}\" > /dev/null 
2>&1\n", "\n", @@ -159,9 +162,6 @@ " aws ecr create-repository --repository-name \"${algorithm_name}\" > /dev/null\n", "fi\n", "\n", - "# Get the login command from ECR and execute it directly\n", - "$(aws ecr get-login-password --region ${region} --no-include-email)\n", - "\n", "# Build the docker image locally with the image name and then push it to ECR\n", "# with the full name.\n", "docker build -t ${algorithm_name} .\n", @@ -176,7 +176,8 @@ "source": [ "---\n", "## Data\n", - "For this illustrative example, we'll simply use `iris`. This a classic, but small, dataset used to test supervised learning algorithms. Typically the goal is to predict one of three flower species based on various measurements of the flowers' attributes. Further detail can be found [here](https://en.wikipedia.org/wiki/Iris_flower_data_set).\n", + "For this illustrative example, we'll simply use `iris`. This a classic, but small, dataset used to test supervised learning algorithms. Typically, the goal is to predict one of three flower species based on various measurements of the flowers' attributes. Further details can be found [here](https://archive.ics.uci.edu/ml/datasets/iris).\n", + "* **Source:** Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.\n", "\n", "Let's split the data to train and test datasets (70% / 30%) and then copy the data to S3 so that SageMaker training can access it." ] @@ -187,7 +188,11 @@ "metadata": {}, "outputs": [], "source": [ - "data = pd.read_csv(\"iris.csv\")" + "column_names = [\"Sepal.Length\", \"Sepal.Width\", \"Petal.Length\", \"Petal.Width\", \"Species\"]\n", + "data = pd.read_csv(\n", + " \"s3://sagemaker-sample-files/datasets/tabular/iris/iris.data\",\n", + " names=column_names,\n", + ")" ] }, { @@ -697,5 +702,5 @@ "notice": "Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/r_examples/r_sagemaker_binary_classification_algorithms/R_binary_classification_algorithms_comparison.ipynb b/r_examples/r_sagemaker_binary_classification_algorithms/R_binary_classification_algorithms_comparison.ipynb new file mode 100644 index 0000000000..727cff9f7a --- /dev/null +++ b/r_examples/r_sagemaker_binary_classification_algorithms/R_binary_classification_algorithms_comparison.ipynb @@ -0,0 +1,1159 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "cf4dc0b3", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "e4fd44cc", + "metadata": {}, + "source": [ + "## Compare built-in Sagemaker classification algorithms for a binary classification problem using Iris dataset" + ] + }, + { + "cell_type": "markdown", + "id": "69f342df", + "metadata": {}, + "source": [ + "In the notebook tutorial, we build 3 classification models using HPO and then compare the AUC on test dataset on 3 deployed models\n", + "\n", + "IRIS is perhaps the best known database to be found in the pattern recognition literature. Fisher's paper is a classic in the field and is referenced frequently to this day. (See Duda & Hart, for example.) The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. 
The dataset is built-in by default into R or can also be downloaded from https://archive.ics.uci.edu/ml/datasets/iris\n", + "\n", + "The iris dataset, besides its historical importance, is also a fun dataset to play with since it can educate us about various ML techniques such as clustering, classification and regression, all in one dataset.\n", + "\n", + "The dataset is built into any base R installation, so no download is required.\n", + "\n", + "Attribute Information:\n", + "\n", + "1. sepal length in cm\n", + "2. sepal width in cm\n", + "3. petal length in cm\n", + "4. petal width in cm\n", + "5. Species of flowers: Iris setosa, Iris versicolor, Iris virginica\n", + "\n", + "The prediction we will perform is `Species ~ f(sepal.length,sepal.width,petal.width,petal.length)`\n", + "\n", + "Predicted attribute: Species of iris plant.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "2b7bc401", + "metadata": {}, + "source": [ + "### Load required libraries and initialize variables." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae96595e", + "metadata": {}, + "outputs": [], + "source": [ + "rm(list=ls())\n", + "library(reticulate) # be careful not to install reticulate again. since it can cause problems.\n", + "library(tidyverse)\n", + "library(pROC)\n", + "set.seed(1324)" + ] + }, + { + "cell_type": "markdown", + "id": "e7ef8e97", + "metadata": {}, + "source": [ + "SageMaker needs to be imported using the reticulate library. If this was performed in a local computer, we would have to make sure that Python and appropriate SageMaker libraries are installed, but inside a SageMaker notebook R kernels, these are all pre-loaded and the R user does not have to worry about installing reticulate or Python. \n", + "\n", + "Session is the unique session ID associated with each SageMaker call. 
It remains the same throughout the execution of the program and can be recalled later to close a session or open a new session.\n", + "\n", + "The bucket is the Amazon S3 bucket where we will be storing our data output. The Amazon S3 bucket and prefix that you want to use for training and model data. This should be within the same region as the Notebook Instance, training, and hosting.\n", + "\n", + "The role is the role of the SageMaker notebook as when it was initially deployed. The IAM role arn used to give training and hosting access to your data. See the documentation for how to create these. Note, if more than one role is required for notebook instances, training, and/or hosting, please replace the boto regexp with appropriate full IAM role arn string(s).\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "107a91cb", + "metadata": {}, + "outputs": [], + "source": [ + "sagemaker <- import('sagemaker')\n", + "session <- sagemaker$Session()\n", + "bucket <- session$default_bucket() # you may replace with name of your personal S3 bucket\n", + "role_arn <- sagemaker$get_execution_role()" + ] + }, + { + "cell_type": "markdown", + "id": "78ac95c7", + "metadata": {}, + "source": [ + "### Input the data and basic pre-processing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5be74fd", + "metadata": {}, + "outputs": [], + "source": [ + "head(iris)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "190824ed", + "metadata": {}, + "outputs": [], + "source": [ + "summary(iris)" + ] + }, + { + "cell_type": "markdown", + "id": "d0391e7e", + "metadata": {}, + "source": [ + "In above, we see that there are 50 flowers of the setosa species, 50 flowers of the versicolor species, and 50 flowers of the virginica species." + ] + }, + { + "cell_type": "markdown", + "id": "32a67ad2", + "metadata": {}, + "source": [ + "In this case, the target variable is the Species prediction. 
We are trying to predict the species of the flower given its numerical measurements of Sepal length, sepal width, petal length, and petal width. Since we are trying to do binary classification, we will only take the flower species setosa and versicolor for simplicity. Also, we will encode the categorical variable Species as a numeric 0/1 label (setosa = 0, versicolor = 1)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c699c5b2", + "metadata": {}, + "outputs": [], + "source": [ + "iris1 <- iris %>% \n", + " dplyr::select(Species,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width) %>% # change order of columns such that the label column is the first column.\n", + " dplyr::filter(Species %in% c(\"setosa\",\"versicolor\")) %>% #only select two flower for binary classification.\n", + " dplyr::mutate(Species = as.numeric(Species) -1) # label encoding, starting with 0 as setosa and 1 as versicolor." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e338575f", + "metadata": {}, + "outputs": [], + "source": [ + "head(iris1)" + ] + }, + { + "cell_type": "markdown", + "id": "82954ca2", + "metadata": {}, + "source": [ + "We now obtain some basic descriptive statistics of the features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c47022b", + "metadata": {}, + "outputs": [], + "source": [ + "iris1 %>% group_by(Species) %>% summarize(mean_sepal_length = mean(Sepal.Length),\n", + " mean_petal_length = mean(Petal.Length),\n", + " mean_sepal_width = mean(Sepal.Width),\n", + " mean_petal_width = mean(Petal.Width),\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "c3e492f0", + "metadata": {}, + "source": [ + "In the summary statistics, we observe that mean sepal length is longer than mean petal length for both flowers. 
" + ] + }, + { + "cell_type": "markdown", + "id": "d7833b26", + "metadata": {}, + "source": [ + "### Prepare for modelling" + ] + }, + { + "cell_type": "markdown", + "id": "dd7b7a46", + "metadata": {}, + "source": [ + "##### We split the train and test and validate into 70%, 15%, and 15%, using random sampling." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4f6d609", + "metadata": {}, + "outputs": [], + "source": [ + "iris_train <- iris1 %>%\n", + " sample_frac(size = 0.7)\n", + "iris_test <- anti_join(iris1, iris_train) %>% \n", + " sample_frac(size = 0.5)\n", + "iris_validate <- anti_join(iris1, iris_train) %>%\n", + " anti_join(., iris_test)" + ] + }, + { + "cell_type": "markdown", + "id": "55124158", + "metadata": {}, + "source": [ + "##### We do a check of the summary statistics to make sure train, test, validate datasets are appropriately split and have proper class balance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e4a2cc9", + "metadata": {}, + "outputs": [], + "source": [ + "table(iris_train$Species)\n", + "nrow(iris_train)" + ] + }, + { + "cell_type": "markdown", + "id": "0b970cf0", + "metadata": {}, + "source": [ + "We see that the class balance between 0 and 1 is almost 50% each for the binary classification. We also see that there are 70 rows in the train dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "207d6b36", + "metadata": {}, + "outputs": [], + "source": [ + "table(iris_validate$Species)\n", + "nrow(iris_validate)" + ] + }, + { + "cell_type": "markdown", + "id": "5ba950db", + "metadata": {}, + "source": [ + "We see that the class balance in validation dataset between 0 and 1 is almost 50% each for the binary classification. We also see that there are 15 rows in the validation dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d76514ee", + "metadata": {}, + "outputs": [], + "source": [ + "table(iris_test$Species)\n", + "nrow(iris_test)" + ] + }, + { + "cell_type": "markdown", + "id": "c2271689", + "metadata": {}, + "source": [ + "We see that the class balance in test dataset between 0 and 1 is almost 50% each for the binary classification. We also see that there are 15 rows in the test dataset." + ] + }, + { + "cell_type": "markdown", + "id": "2307a959", + "metadata": {}, + "source": [ + "### Write the data to Amazon S3" + ] + }, + { + "cell_type": "markdown", + "id": "55026467", + "metadata": {}, + "source": [ + "Different algorithms in SageMaker will have different data formats required for training and for testing. These formats are created to make model production easier. csv is the most well known of these formats and has been used here as input in all algorithms to make it consistent.\n", + "\n", + "SageMaker algorithms take in data from an Amazon S3 object and output data to an Amazon S3 object, so data has to be stored in Amazon S3 as csv,json, proto-buf or any format that is supported by the algorithm that you are going to use." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f25eabdb", + "metadata": {}, + "outputs": [], + "source": [ + "write_csv(iris_train, 'iris_train.csv', col_names = FALSE)\n", + "write_csv(iris_validate, 'iris_valid.csv', col_names = FALSE)\n", + "write_csv(iris_test, 'iris_test.csv', col_names = FALSE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d63db1d8", + "metadata": {}, + "outputs": [], + "source": [ + "s3_train <- session$upload_data(path = 'iris_train.csv', \n", + " bucket = bucket, \n", + " key_prefix = 'data')\n", + "s3_valid <- session$upload_data(path = 'iris_valid.csv', \n", + " bucket = bucket, \n", + " key_prefix = 'data')\n", + "\n", + "s3_test <- session$upload_data(path = 'iris_test.csv', \n", + " bucket = bucket, \n", + " key_prefix = 'data')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54df2b27", + "metadata": {}, + "outputs": [], + "source": [ + "s3_train_input <- sagemaker$inputs$TrainingInput(s3_data = s3_train,\n", + " content_type = 'text/csv')\n", + "s3_valid_input <- sagemaker$inputs$TrainingInput(s3_data = s3_valid,\n", + " content_type = 'text/csv')\n", + "s3_test_input <- sagemaker$inputs$TrainingInput(s3_data = s3_test,\n", + " content_type = 'text/csv')\n" + ] + }, + { + "cell_type": "markdown", + "id": "e2a2542c", + "metadata": {}, + "source": [ + "To perform binary classification on tabular data, SageMaker provides the following algorithms:\n", + "\n", + "- XGBoost Algorithm\n", + "- Linear Learner Algorithm\n", + "- K-Nearest Neighbors (k-NN) Algorithm\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "c01f88a2", + "metadata": {}, + "source": [ + "## Create model 1: XGBoost model in SageMaker" + ] + }, + { + "cell_type": "markdown", + "id": "d99ff2bc", + "metadata": {}, + "source": [ + "Use the XGBoost built-in algorithm to build an XGBoost training container as shown in the following code example. 
You can automatically spot the XGBoost built-in algorithm image URI using the SageMaker image_uris.retrieve API (or the get_image_uri API if using Amazon SageMaker Python SDK version 1). If you want to ensure if the image_uris.retrieve API finds the correct URI, see Common parameters for built-in algorithms and look up XGBoost from the full list of built-in algorithm image URIs and available regions.\n", + "\n", + "After specifying the XGBoost image URI, you can use the XGBoost container to construct an estimator using the SageMaker Estimator API and initiate a training job. This XGBoost built-in algorithm mode does not incorporate your own XGBoost training script and runs directly on the input datasets.\n", + "\n", + "See https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html for more information." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29145cfe", + "metadata": {}, + "outputs": [], + "source": [ + "container <- sagemaker$image_uris$retrieve(framework='xgboost', region= session$boto_region_name, version='latest')\n", + "cat('XGBoost Container Image URL: ', container)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07325371", + "metadata": {}, + "outputs": [], + "source": [ + "s3_output <- paste0('s3://', bucket, '/output_xgboost')\n", + "estimator1 <- sagemaker$estimator$Estimator(image_uri = container,\n", + " role = role_arn,\n", + " train_instance_count = 1L,\n", + " train_instance_type = 'ml.m5.4xlarge',\n", + " input_mode = 'File',\n", + " output_path = s3_output,\n", + " output_kms_key = NULL,\n", + " base_job_name = NULL,\n", + " sagemaker_session = NULL)" + ] + }, + { + "cell_type": "markdown", + "id": "9a7b6207", + "metadata": {}, + "source": [ + "How would an untuned model perform compared to a tuned model? Is it worth the effort? Before going deeper into XGBoost model tuning, let’s highlight the reasons why you have to tune your model. 
The main reason to perform hyper-parameter tuning is to improve the predictive performance of our models by choosing our hyperparameters in a well-thought-out manner. There are 3 ways to perform hyperparameter tuning: grid search, random search, and Bayesian search. Popular packages like scikit-learn use grid search and random search techniques. SageMaker uses Bayesian search techniques.\n", + "\n", + "We need to choose \n", + "\n", + "- a learning objective function to optimize during model training\n", + "- an eval_metric to use to evaluate model performance during validation\n", + "- a set of hyperparameters and a range of values for each to use when tuning the model automatically\n", + "\n", + "SageMaker XGBoost model can be tuned with many hyperparameters. The hyperparameters that have the greatest effect on optimizing the XGBoost evaluation metrics are: \n", + "\n", + "- alpha, \n", + "- min_child_weight, \n", + "- subsample, \n", + "- eta, \n", + "- num_round.\n", + "\n", + "\n", + "The hyperparameters that are required are num_class (the number of classes if it is a multi-class classification problem) and num_round (the number of rounds to run the training on). All other hyperparameters are optional and will be set to default values if they are not specified by the user." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c33f178", + "metadata": {}, + "outputs": [], + "source": [ + "# check to make sure which are required and which are optional\n", + "estimator1$set_hyperparameters(eval_metric='auc',\n", + " objective='binary:logistic',\n", + " num_round = 6L\n", + " )\n", + "\n", + "# Set Hyperparameter Ranges, check to make sure which are integer and which are continuous parameters. 
\n", + "hyperparameter_ranges = list('eta' = sagemaker$parameter$ContinuousParameter(0,1),\n", + " 'min_child_weight'= sagemaker$parameter$ContinuousParameter(0,10),\n", + " 'alpha'= sagemaker$parameter$ContinuousParameter(0,2),\n", + " 'max_depth'= sagemaker$parameter$IntegerParameter(0L,10L))\n" + ] + }, + { + "cell_type": "markdown", + "id": "42771dbc", + "metadata": {}, + "source": [ + "The evaluation metric that we will use for our binary classification purpose is validation:auc, but you could use any other metric that is right for your problem. You do have to be careful to change your objective_type to point to the right direction of Maximize or Minimize according to the objective metric you have chosen." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "056f2add", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a hyperparamter tuner\n", + "objective_metric_name = 'validation:auc'\n", + "tuner1 <- sagemaker$tuner$HyperparameterTuner(estimator1,\n", + " objective_metric_name,\n", + " hyperparameter_ranges,\n", + " objective_type='Maximize',\n", + " max_jobs=4L,\n", + " max_parallel_jobs=2L)\n", + "\n", + "# Define the data channels for train and validation datasets\n", + "input_data <- list('train' = s3_train_input,\n", + " 'validation' = s3_valid_input)\n", + "\n", + "# train the tuner\n", + "tuner1$fit(inputs = input_data, \n", + " job_name = paste('sagemaker-tune-xgboost', format(Sys.time(), '%H-%M-%S'), sep = '-'), \n", + " wait=TRUE)" + ] + }, + { + "cell_type": "markdown", + "id": "79451cba", + "metadata": {}, + "source": [ + "The output of the tuning job can be checked in SageMaker if needed." 
+ ] + }, + { + "cell_type": "markdown", + "id": "278ea863", + "metadata": {}, + "source": [ + "### Calculate AUC for the test data on model 1" + ] + }, + { + "cell_type": "markdown", + "id": "1d1951de", + "metadata": {}, + "source": [ + "SageMaker will automatically recognize the training job with the best evaluation metric and load the hyperparameters associated with that training job when we deploy the model. One of the benefits of SageMaker is that we can easily deploy models in a different instance than the instance in which the notebook is running. So we can deploy into a more powerful instance or a less powerful instance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8596325", + "metadata": {}, + "outputs": [], + "source": [ + "model_endpoint1 <- tuner1$deploy(initial_instance_count = 1L,\n", + " instance_type = 'ml.t2.medium')\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "181de542", + "metadata": {}, + "source": [ + "The serializer tells SageMaker what format the model expects data to be input in." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05d97e07", + "metadata": {}, + "outputs": [], + "source": [ + "model_endpoint1$serializer <- sagemaker$serializers$CSVSerializer(content_type='text/csv')" + ] + }, + { + "cell_type": "markdown", + "id": "3ec87668", + "metadata": {}, + "source": [ + "We input the `iris_test` dataset without the labels into the model using the `predict` function and check its AUC value." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc3ce0c5", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the test sample for input into the model\n", + "test_sample <- as.matrix(iris_test[-1])\n", + "dimnames(test_sample)[[2]] <- NULL\n", + "\n", + "# Predict using the deployed model\n", + "predictions_ep <- model_endpoint1$predict(test_sample)\n", + "predictions_ep <- stringr::str_split(predictions_ep, pattern = ',', simplify = TRUE)\n", + "predictions_ep <- as.numeric(predictions_ep > 0.5)\n", + "\n", + "# Add the predictions to the test dataset.\n", + "iris_predictions_ep1 <- dplyr::bind_cols(predicted_flower = predictions_ep, \n", + " iris_test)\n", + "iris_predictions_ep1\n", + "\n", + "# Get the AUC\n", + "auc(roc(iris_predictions_ep1$predicted_flower,iris_test$Species))" + ] + }, + { + "cell_type": "markdown", + "id": "d29c25b7", + "metadata": {}, + "source": [ + "## Create model 2: Linear Learner in SageMaker" + ] + }, + { + "cell_type": "markdown", + "id": "42aa8a6a", + "metadata": {}, + "source": [ + "Linear models are supervised learning algorithms used for solving either classification or regression problems. For input, you give the model labeled examples (x, y). x is a high-dimensional vector and y is a numeric label. For binary classification problems, the label must be either 0 or 1.\n", + "\n", + "The linear learner algorithm requires a data matrix, with rows representing the observations, and columns representing the dimensions of the features. It also requires an additional column that contains the labels that match the data points. At a minimum, Amazon SageMaker linear learner requires you to specify input and output data locations, and objective type (classification or regression) as arguments. The feature dimension is also required. You can specify additional parameters in the HyperParameters string map of the request body. 
These parameters control the optimization procedure, or specifics of the objective function that you train on. For example, the number of epochs, regularization, and loss type.\n", + "\n", + "See https://docs.aws.amazon.com/sagemaker/latest/dg/linear-learner.html for more information." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c34b9b3d", + "metadata": {}, + "outputs": [], + "source": [ + "container <- sagemaker$image_uris$retrieve(framework='linear-learner', region= session$boto_region_name, version='latest')\n", + "cat('Linear Learner Container Image URL: ', container)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ed66e62", + "metadata": {}, + "outputs": [], + "source": [ + "s3_output <- paste0('s3://', bucket, '/output_glm')\n", + "estimator2 <- sagemaker$estimator$Estimator(image_uri = container,\n", + " role = role_arn,\n", + " train_instance_count = 1L,\n", + " train_instance_type = 'ml.m5.4xlarge',\n", + " input_mode = 'File',\n", + " output_path = s3_output,\n", + " output_kms_key = NULL,\n", + " base_job_name = NULL,\n", + " sagemaker_session = NULL)" + ] + }, + { + "cell_type": "markdown", + "id": "f92dbbdd", + "metadata": {}, + "source": [ + "For the text/csv input type, the first column is assumed to be the label, which is the target variable for prediction." + ] + }, + { + "cell_type": "markdown", + "id": "6b2391b3", + "metadata": {}, + "source": [ + "predictor_type is the only hyperparameter that is required to be pre-defined for tuning. The rest are optional." + ] + }, + { + "cell_type": "markdown", + "id": "9f1a07f5", + "metadata": {}, + "source": [ + "Normalization, or feature scaling, is an important preprocessing step for certain loss functions that ensures the model being trained on a dataset does not become dominated by the weight of a single feature. 
Decision trees do not require normalization of their inputs; and since XGBoost is essentially an ensemble algorithm comprised of decision trees, it does not require normalization for the inputs either.\n", + "\n", + "However, Generalized Linear Models require a normalization of their input. The Amazon SageMaker Linear Learner algorithm has a normalization option to assist with this preprocessing step. If normalization is turned on, the algorithm first goes over a small sample of the data to learn the mean value and standard deviation for each feature and for the label. Each of the features in the full dataset is then shifted to have mean of zero and scaled to have a unit standard deviation.\n", + "\n", + "To make our job easier, we do not have to go back to our previous steps to do normalization. Normalization is built in as a hyper-parameter in SageMaker Linear learner algorithm. So no need to worry about normalization for the training portions.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23e49f71", + "metadata": {}, + "outputs": [], + "source": [ + "estimator2$set_hyperparameters(predictor_type=\"binary_classifier\",\n", + " normalize_data = TRUE)" + ] + }, + { + "cell_type": "markdown", + "id": "4eb28cfd", + "metadata": {}, + "source": [ + "The tunable hyperparameters for linear learner are:\n", + "\n", + "- wd\n", + "- l1\n", + "- learning_rate\n", + "- mini_batch_size\n", + "- use_bias\n", + "- positive_example_weight_mult\n", + "\n", + "Be careful to check which parameters are integers and which parameters are continuous because that is one of the common sources of errors. Also be careful to give a proper range for hyperparameters that makes sense for your problem. Training jobs can sometimes fail if the mini-batch size is too big compared to the training data available." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b79b90bc", + "metadata": {}, + "outputs": [], + "source": [ + "# Set Hyperparameter Ranges\n", + "hyperparameter_ranges = list('wd' = sagemaker$parameter$ContinuousParameter(0.00001,1),\n", + " 'l1' = sagemaker$parameter$ContinuousParameter(0.00001,1),\n", + " 'learning_rate' = sagemaker$parameter$ContinuousParameter(0.00001,1),\n", + " 'mini_batch_size' = sagemaker$parameter$IntegerParameter(10L, 50L) \n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "3f777e12", + "metadata": {}, + "source": [ + "The evaluation metric we will be using in our case to compare the models will be the objective loss and is based on the validation dataset. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ab03023", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a hyperparameter tuner\n", + "objective_metric_name = 'validation:objective_loss'\n", + "tuner2 <- sagemaker$tuner$HyperparameterTuner(estimator2,\n", + " objective_metric_name,\n", + " hyperparameter_ranges,\n", + " objective_type='Minimize',\n", + " max_jobs=4L,\n", + " max_parallel_jobs=2L)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5803b03b", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a tuning job name\n", + "job_name <- paste('tune-linear-learner', format(Sys.time(), '%H-%M-%S'), sep = '-')\n", + "\n", + "# Define the data channels for train and validation datasets\n", + "input_data <- list('train' = s3_train_input,\n", + " 'validation' = s3_valid_input)\n", + "\n", + "# Train the tuner\n", + "tuner2$fit(inputs = input_data, job_name = job_name, wait=TRUE, content_type='csv') # since we are using csv files as input into the model, we need to specify content type as csv."
+ ] + }, + { + "cell_type": "markdown", + "id": "2b7bf6a1", + "metadata": {}, + "source": [ + "### Calculate AUC for the test data on model 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "672ca373", + "metadata": {}, + "outputs": [], + "source": [ + "# Deploy the model into an instance of your choosing.\n", + "model_endpoint2 <- tuner2$deploy(initial_instance_count = 1L,\n", + " instance_type = 'ml.t2.medium')" + ] + }, + { + "cell_type": "markdown", + "id": "949973bd", + "metadata": {}, + "source": [ + "For inference, the linear learner algorithm supports the application/json, application/x-recordio-protobuf, and text/csv formats. For more information, https://docs.aws.amazon.com/sagemaker/latest/dg/LL-in-formats.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da8d7f55", + "metadata": {}, + "outputs": [], + "source": [ + "# Specify what data formats you want the input and output of your model to look like.\n", + "model_endpoint2$serializer <- sagemaker$serializers$CSVSerializer(content_type='text/csv')\n", + "model_endpoint2$deserializer <- sagemaker$deserializers$JSONDeserializer()" + ] + }, + { + "cell_type": "markdown", + "id": "5aed6097", + "metadata": {}, + "source": [ + "In Linear Learner the output inference files are in JSON or RecordIO formats. https://docs.aws.amazon.com/sagemaker/latest/dg/LL-in-formats.html \n", + "\n", + "When you make predictions on new data, the contents of the response data depends on the type of model you choose within Linear Learner. For regression (predictor_type='regressor'), the score is the prediction produced by the model. For classification (predictor_type='binary_classifier' or predictor_type='multiclass_classifier'), the model returns a score and also a predicted_label. The predicted_label is the class predicted by the model and the score measures the strength of that prediction. 
So, for binary classification, predicted_label is 0 or 1, and score is a single floating point number that indicates how strongly the algorithm believes that the label should be 1.\n", + "\n", + "To interpret the score in classification problems, you have to consider the loss function used. If the loss hyperparameter value is logistic for binary classification or softmax_loss for multiclass classification, then the score can be interpreted as the probability of the corresponding class. These are the loss values used by the linear learner when the `loss` hyperparameter is set to auto as default value. But if the `loss` is set to `hinge_loss`, then the score cannot be interpreted as a probability. This is because hinge loss corresponds to a Support Vector Classifier, which does not produce probability estimates. In the current example, since our loss hyperparameter is logistic for binary classification, we can interpret it as probability of the corresponding class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd578e7d", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the test data for input into the model\n", + "test_sample <- as.matrix(iris_test[-1])\n", + "dimnames(test_sample)[[2]] <- NULL\n", + "\n", + "# Predict using the test data on the deployed model\n", + "predictions_ep <- model_endpoint2$predict(test_sample)\n", + "\n", + "# Add the predictions to the test dataset.\n", + "df <- data.frame(matrix(unlist(predictions_ep$predictions), nrow=length(predictions_ep$predictions), byrow=TRUE))\n", + "df <- df %>% dplyr::rename(score = X1, predicted_label = X2)\n", + "iris_predictions_ep2 <- dplyr::bind_cols(predicted_flower = df$predicted_label, \n", + " iris_test)\n", + "iris_predictions_ep2\n", + "\n", + "# Get the AUC\n", + "auc(roc(iris_predictions_ep2$predicted_flower,iris_test$Species))\n" + ] + }, + { + "cell_type": "markdown", + "id": "f8aff600", + "metadata": {}, + "source": [ + "## Create model 3: KNN in SageMaker" 
+ ] + }, + { + "cell_type": "markdown", + "id": "48ef375f", + "metadata": {}, + "source": [ + "Amazon SageMaker k-nearest neighbors (k-NN) algorithm is an index-based algorithm. It uses a non-parametric method for classification or regression. For classification problems, the algorithm queries the k points that are closest to the sample point and returns the most frequently used label of their class as the predicted label. For regression problems, the algorithm queries the k closest points to the sample point and returns the average of their feature values as the predicted value.\n", + "\n", + "Training with the k-NN algorithm has three steps: sampling, dimension reduction, and index building. Sampling reduces the size of the initial dataset so that it fits into memory. For dimension reduction, the algorithm decreases the feature dimension of the data to reduce the footprint of the k-NN model in memory and inference latency. We provide two dimension reduction methods: random projection and the fast Johnson-Lindenstrauss transform. Typically, you use dimension reduction for high-dimensional (d > 1000) datasets to avoid the “curse of dimensionality” that troubles the statistical analysis of data that becomes sparse as dimensionality increases. The main objective of k-NN's training is to construct the index. The index enables efficient lookups of distances between points whose values or class labels have not yet been determined and the k nearest points to use for inference.\n", + "\n", + "See https://docs.aws.amazon.com/sagemaker/latest/dg/k-nearest-neighbors.html for more information."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc3befe5", + "metadata": {}, + "outputs": [], + "source": [ + "container <- sagemaker$image_uris$retrieve(framework='knn', region= session$boto_region_name, version='latest')\n", + "cat('KNN Container Image URL: ', container)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8952078c", + "metadata": {}, + "outputs": [], + "source": [ + "s3_output <- paste0('s3://', bucket, '/output_knn')\n", + "estimator3 <- sagemaker$estimator$Estimator(image_uri = container,\n", + " role = role_arn,\n", + " train_instance_count = 1L,\n", + " train_instance_type = 'ml.m5.4xlarge',\n", + " input_mode = 'File',\n", + " output_path = s3_output,\n", + " output_kms_key = NULL,\n", + " base_job_name = NULL,\n", + " sagemaker_session = NULL)" + ] + }, + { + "cell_type": "markdown", + "id": "952cee1b", + "metadata": {}, + "source": [ + "Hyperparameter `dimension_reduction_target` should not be set when `dimension_reduction_type` is set to its default value, which is `None`. If 'dimension_reduction_target' is set to a certain number without setting `dimension_reduction_type`, then SageMaker will ask us to remove 'dimension_reduction_target' from the specified hyperparameters and try again. In this tutorial, we are not performing dimensionality reduction, since we only have 4 features; so `dimension_reduction_type` is set to its default value of `None`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37ef7aa7", + "metadata": {}, + "outputs": [], + "source": [ + "estimator3$set_hyperparameters( \n", + " feature_dim = 4L, \n", + " sample_size = 10L, \n", + " predictor_type = \"classifier\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "484f987c", + "metadata": {}, + "source": [ + "Amazon SageMaker k-nearest neighbor model can be tuned with the following hyperparameters:\n", + "- k \n", + "- sample_size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a91bd395", + "metadata": {}, + "outputs": [], + "source": [ + "# Set Hyperparameter Ranges\n", + "hyperparameter_ranges = list('k' = sagemaker$parameter$IntegerParameter(1L,10L)\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59ad0005", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a hyperparameter tuner\n", + "objective_metric_name = 'test:accuracy'\n", + "tuner3 <- sagemaker$tuner$HyperparameterTuner(estimator3,\n", + " objective_metric_name,\n", + " hyperparameter_ranges,\n", + " objective_type='Maximize',\n", + " max_jobs=2L,\n", + " max_parallel_jobs=2L)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6d49b48", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a tuning job name\n", + "job_name <- paste('tune-knn', format(Sys.time(), '%H-%M-%S'), sep = '-')\n", + "\n", + "# Define the data channels for train and validation datasets\n", + "input_data <- list('train' = s3_train_input,\n", + " 'test' = s3_valid_input # KNN needs a test channel; tuning does not work without it.\n", + " ) \n", + "\n", + "# train the tuner\n", + "tuner3$fit(inputs = input_data, job_name = job_name, wait=TRUE, content_type='text/csv;label_size=0')" + ] + }, + { + "cell_type": "markdown", + "id": "0887c62d", + "metadata": {}, + "source": [ + "### Calculate AUC for the test data on model 3" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, +
"id": "c204889d", + "metadata": {}, + "outputs": [], + "source": [ + "# Deploy the model into an instance of your choosing.\n", + "model_endpoint3 <- tuner3$deploy(initial_instance_count = 1L,\n", + " instance_type = 'ml.t2.medium')" + ] + }, + { + "cell_type": "markdown", + "id": "d4243dba", + "metadata": {}, + "source": [ + "For inference, the k-NN algorithm supports the application/json, application/x-recordio-protobuf, and text/csv input formats. For more information, see https://docs.aws.amazon.com/sagemaker/latest/dg/kNN-inference-formats.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "167ced33", + "metadata": {}, + "outputs": [], + "source": [ + "# Specify what data formats you want the input and output of your model to look like.\n", + "model_endpoint3$serializer <- sagemaker$serializers$CSVSerializer(content_type='text/csv')\n", + "model_endpoint3$deserializer <- sagemaker$deserializers$JSONDeserializer()" + ] + }, + { + "cell_type": "markdown", + "id": "01ce50fd", + "metadata": {}, + "source": [ + "In KNN, the input formats for inference are:\n", + "- CSV\n", + "- JSON\n", + "- JSONLINES\n", + "- RECORDIO\n", + "\n", + "The output formats for inference are:\n", + "- JSON\n", + "- JSONLINES\n", + "- Verbose JSON\n", + "- Verbose RecordIO-ProtoBuf\n", + "\n", + "Notice that there is no CSV output format for inference. \n", + "\n", + "See https://docs.aws.amazon.com/sagemaker/latest/dg/kNN-inference-formats.html for more details.\n", + "\n", + "When you make predictions on new data, the contents of the response data depends on the type of model you choose within Linear Learner. For regression (predictor_type='regressor'), the score is the prediction produced by the model. For classification (predictor_type='binary_classifier' or predictor_type='multiclass_classifier'), the model returns a score and also a predicted_label. The predicted_label is the class predicted by the model and the score measures the strength of 
that prediction.
So, for binary classification, predicted_label is 0 or 1, and score is a single floating point number that indicates how strongly the algorithm believes that the label should be 1.\n", + "\n", + "To interpret the score in classification problems, you have to consider the loss function used. If the loss hyperparameter value is logistic for binary classification or softmax_loss for multiclass classification, then the score can be interpreted as the probability of the corresponding class. These are the loss values used by the linear learner when the loss hyperparameter is set to auto as default value. But if the loss is set to hinge_loss, then the score cannot be interpreted as a probability. This is because hinge loss corresponds to a Support Vector Classifier, which does not produce probability estimates. In the current example, since our loss hyperparameter is logistic for binary classification, we can interpret it as probability of the corresponding class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "017e7253", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the test data for input into the model\n", + "test_sample <- as.matrix(iris_test[-1])\n", + "dimnames(test_sample)[[2]] <- NULL\n", + "\n", + "# Predict using the test data on the deployed model\n", + "predictions_ep <- model_endpoint3$predict(test_sample)" + ] + }, + { + "cell_type": "markdown", + "id": "66a9203b", + "metadata": {}, + "source": [ + "We see that the output is of a deserialized JSON format." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d2380e5", + "metadata": {}, + "outputs": [], + "source": [ + "predictions_ep" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9c12fab", + "metadata": {}, + "outputs": [], + "source": [ + "typeof(predictions_ep)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a755d464", + "metadata": {}, + "outputs": [], + "source": [ + "# Add the predictions to the test dataset.\n", + "df = data.frame(predicted_flower = unlist(predictions_ep$predictions))\n", + "iris_predictions_ep2 <- dplyr::bind_cols(predicted_flower = df$predicted_flower, \n", + " iris_test)\n", + "iris_predictions_ep2\n", + "\n", + "# Get the AUC\n", + "auc(roc(iris_predictions_ep2$predicted_flower,iris_test$Species))" + ] + }, + { + "cell_type": "markdown", + "id": "d36155c6", + "metadata": {}, + "source": [ + "## Compare the AUC of 3 models for the test data" + ] + }, + { + "cell_type": "markdown", + "id": "af1cca8d", + "metadata": {}, + "source": [ + "\n", + "- AUC of SageMaker XGBoost = 1 \n", + "\n", + "- AUC of SageMaker Linear Learner = 0.83\n", + "\n", + "- AUC of SageMaker KNN = 1\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "4e81c277", + "metadata": {}, + "source": [ + "Based on the AUC metric (the higher the better), both XGBoost and KNN perform equally well and are better than the Linear Learner. We can also explore the 3 models with other binary classification metrics such as accuracy, F1 score, and misclassification error. Comparing only the AUC, in this example, we could choose either the XGBoost model or the KNN model to move onto production and close the other two. The deployed model of our choosing can be passed onto production to generate predictions of flower species given that the user only has its sepal and petal measurements. The performance of the deployed model can also be tracked in Amazon CloudWatch."
+ ] + }, + { + "cell_type": "markdown", + "id": "3a9dd325", + "metadata": {}, + "source": [ + "## Clean up " + ] + }, + { + "cell_type": "markdown", + "id": "e4b7ef60", + "metadata": {}, + "source": [ + "##### We close the endpoints which we created to free up resources." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f27da27", + "metadata": {}, + "outputs": [], + "source": [ + "session$delete_endpoint(model_endpoint1$endpoint)\n", + "session$delete_endpoint(model_endpoint2$endpoint)\n", + "session$delete_endpoint(model_endpoint3$endpoint)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.1.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/r_examples/r_serving_with_fastapi/Dockerfile b/r_examples/r_serving_with_fastapi/Dockerfile new file mode 100644 index 0000000000..68315a476f --- /dev/null +++ b/r_examples/r_serving_with_fastapi/Dockerfile @@ -0,0 +1,23 @@ +FROM r-base:3.6.3 + +MAINTAINER Amazon SageMaker Examples + +RUN apt-get -y update && apt-get install -y --no-install-recommends \ + wget \ + r-base \ + r-base-dev \ + apt-transport-https \ + ca-certificates \ + python3 python3-dev pip + +RUN pip install fastapi uvicorn numpy + +RUN R -e "install.packages(c('reticulate','xgboost'), repos='https://cloud.r-project.org')" + +COPY endpoints.py /opt/ml/endpoints.py +COPY deploy.R /opt/ml/deploy.R +COPY xgb.model /opt/ml/xgb.model + +WORKDIR /opt/ml + +ENTRYPOINT ["/usr/bin/Rscript", "/opt/ml/deploy.R", "--no-save"] diff --git a/r_examples/r_serving_with_fastapi/FastAPI Example.ipynb b/r_examples/r_serving_with_fastapi/FastAPI Example.ipynb new file mode 100644 index 0000000000..1fc513f644 --- /dev/null +++ b/r_examples/r_serving_with_fastapi/FastAPI Example.ipynb @@ -0,0 +1,297 @@ +{ + "cells": [ + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "# R Serving with FastAPI" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dockerfile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* The Dockerfile defines the environment in which our server will be executed. \n", + "* Below, you can see that the entrypoint for our container will be [deploy.R](deploy.R)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pycat Dockerfile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Code: deploy.R\n", + "\n", + "**deploy.R** handles the following steps:\n", + "* Loads the R libraries used by the server.\n", + "* Loads a pretrained `xgboost` model that has been trained on the classical [Iris](https://archive.ics.uci.edu/ml/datasets/iris) dataset.\n", + " * Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.\n", + "* Defines an inference function that takes a matrix of iris features and returns predictions for those iris examples.\n", + "* Wraps the inference function to make it thread-safe for passing to Python through reticulate.\n", + "* Finally, it loads the endpoint definitions from [endpoints.py](endpoints.py) through reticulate and launches the FastAPI server app using those endpoint definitions."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pycat deploy.R" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Code: endpoints.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**endpoints.py** defines two routes:\n", + "* `/ping` returns a status of 'Alive' to indicate that the application is healthy\n", + "* `/invocations` applies the previously defined inference function to the input features from the request body\n", + "\n", + "Note that FastAPI is typed. The `Example` class defines the type of the input that we expect to receive from the request.\n", + "\n", + "For more information about the requirements for building your own inference container, see:\n", + "[Use Your Own Inference Code with Hosting Services](https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pycat endpoints.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build the Serving Image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!docker build -t r-fastapi ."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Launch the Serving Container" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!echo \"Launching FastAPI\"\n", + "!docker run -d --rm -p 5000:8080 r-fastapi\n", + "!echo \"Waiting for the server to start..\" && sleep 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!docker container list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Simple Python Client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from tqdm import tqdm\n", + "import pandas as pd\n", + "\n", + "pd.set_option(\"display.max_rows\", 500)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_predictions(examples, instance=requests, port=5000):\n", + " payload = {\"features\": examples}\n", + " return instance.post(f\"http://127.0.0.1:{port}/invocations\", json=payload)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_health(instance=requests, port=5000):\n", + " instance.get(f\"http://127.0.0.1:{port}/ping\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Example Inputs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define example inputs from the Iris dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "column_names = [\"Sepal.Length\", \"Sepal.Width\", \"Petal.Length\", \"Petal.Width\", \"Label\"]\n", + "iris = pd.read_csv(\n", + " \"s3://sagemaker-sample-files/datasets/tabular/iris/iris.data\", names=column_names\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "iris_features = iris[[\"Sepal.Length\", \"Sepal.Width\", \"Petal.Length\", \"Petal.Width\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "example_inputs = iris_features.values.tolist()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### FastAPI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "predicted = get_predictions(example_inputs).json()[\"output\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "iris[\"predicted\"] = predicted" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "iris" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Stop All Serving Containers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we will shut down the serving container we launched for the test."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!docker kill $(docker ps -q)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/r_examples/r_serving_with_fastapi/deploy.R b/r_examples/r_serving_with_fastapi/deploy.R new file mode 100644 index 0000000000..82550d0edd --- /dev/null +++ b/r_examples/r_serving_with_fastapi/deploy.R @@ -0,0 +1,26 @@ +library(reticulate) +library(xgboost) + +# explicitly tell reticulate to use the system python +use_python("/usr/bin/python3") + +# load our FastAPI endpoints with reticulate +source_python('endpoints.py') + +# load a pretrained xgboost model +bst <- xgb.load("xgb.model") + +# create a closure around our xgboost model and input data processing +inference <- function(x){ + ds <- xgb.DMatrix(data = x ) + predict(bst, ds) +} + +# make our inference closure safe to send to python as a callback +safe_inference <- py_main_thread_func(inference) + +# create a new FastAPI application instance +app <- make_endpoints(safe_inference) + +# run our FastAPI application +run_app(app) \ No newline at end of file diff --git a/r_examples/r_serving_with_fastapi/endpoints.py b/r_examples/r_serving_with_fastapi/endpoints.py new file mode 100644 index 0000000000..7b62009e85 --- /dev/null +++ b/r_examples/r_serving_with_fastapi/endpoints.py @@ -0,0 +1,32 @@ +from typing import Optional, List +from fastapi import FastAPI +from pydantic import BaseModel +import uvicorn +import numpy as np + + +# Define our expected input types +class Example(BaseModel): + features: List[List[float]] + + +#
Create a function that we can use to pass our inference function +# to the endpoints during initialization. +def make_endpoints(r_inference_func): + app = FastAPI() + + @app.get("/ping") + async def check_health(): + return {"Status": "Alive"} + + @app.post("/invocations") + async def read_item(input: Example): + output = r_inference_func(np.array(input.features)) + return {"output": output} + + return app + + +# A function we can call from R to launch the FastAPI application +def run_app(app): + uvicorn.run(app, host="0.0.0.0", port=8080, log_level="info") diff --git a/r_examples/r_serving_with_fastapi/xgb.model b/r_examples/r_serving_with_fastapi/xgb.model new file mode 100644 index 0000000000..1898fcdf6e Binary files /dev/null and b/r_examples/r_serving_with_fastapi/xgb.model differ diff --git a/r_examples/r_serving_with_plumber/Dockerfile b/r_examples/r_serving_with_plumber/Dockerfile new file mode 100644 index 0000000000..a00c6aa3ea --- /dev/null +++ b/r_examples/r_serving_with_plumber/Dockerfile @@ -0,0 +1,20 @@ +FROM r-base:3.6.3 + +MAINTAINER Amazon SageMaker Examples + +RUN apt-get -y update && apt-get install -y --no-install-recommends \ + wget \ + apt-transport-https \ + ca-certificates \ + libcurl4-openssl-dev \ + libsodium-dev + +RUN R -e "install.packages(c('xgboost','plumber'), repos='https://cloud.r-project.org')" + +COPY xgb.model /opt/ml/xgb.model +COPY endpoints.R /opt/ml/endpoints.R +COPY deploy.R /opt/ml/deploy.R + +WORKDIR /opt/ml + +ENTRYPOINT ["/usr/bin/Rscript", "/opt/ml/deploy.R", "--no-save"] diff --git a/r_examples/r_serving_with_plumber/Plumber Example.ipynb b/r_examples/r_serving_with_plumber/Plumber Example.ipynb new file mode 100644 index 0000000000..1030638dcf --- /dev/null +++ b/r_examples/r_serving_with_plumber/Plumber Example.ipynb @@ -0,0 +1,301 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# R Serving with Plumber" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"## Dockerfile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* The Dockerfile defines the environment in which our server will be executed.\n", + "* Below, you can see that the entrypoint for our container will be [deploy.R](deploy.R)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pycat Dockerfile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Code: deploy.R\n", + "\n", + "The **deploy.R** script handles the following steps:\n", + "* Loads the R libraries used by the server.\n", + "* Loads a pretrained `xgboost` model that has been trained on the classical [Iris](https://archive.ics.uci.edu/ml/datasets/iris) dataset.\n", + " * Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.\n", + "* Defines an inference function that takes a matrix of iris features and returns predictions for those iris examples.\n", + "* Finally, it imports the [endpoints.R](endpoints.R) script and launches the Plumber server app using those endpoint definitions.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pycat deploy.R" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Code: endpoints.R" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**endpoints.R** defines two routes:\n", + "* `/ping` returns a string 'Alive' to indicate that the application is healthy\n", + "* `/invocations` applies the previously defined inference function to the input features from the request body\n", + "\n", + "For more information about the requirements for building your own inference container, see:\n", + "[Use Your Own Inference Code with Hosting 
Services](https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pycat endpoints.R" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build the Serving Image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!docker build -t r-plumber ." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Launch the Serving Container" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!echo \"Launching Plumber\"\n", + "!docker run -d --rm -p 5000:8080 r-plumber\n", + "!echo \"Waiting for the server to start..\" && sleep 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!docker container list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Simple Python Client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from tqdm import tqdm\n", + "import pandas as pd\n", + "\n", + "pd.set_option(\"display.max_rows\", 500)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_predictions(examples, instance=requests, port=5000):\n", + " payload = {\"features\": examples}\n", + " return instance.post(f\"http://127.0.0.1:{port}/invocations\", json=payload)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_health(instance=requests, port=5000):\n", + " instance.get(f\"http://127.0.0.1:{port}/ping\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Example Inputs" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's define example inputs from the Iris dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "column_names = [\"Sepal.Length\", \"Sepal.Width\", \"Petal.Length\", \"Petal.Width\", \"Label\"]\n", + "iris = pd.read_csv(\n", + " \"s3://sagemaker-sample-files/datasets/tabular/iris/iris.data\", names=column_names\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "iris_features = iris[[\"Sepal.Length\", \"Sepal.Width\", \"Petal.Length\", \"Petal.Width\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "example_inputs = iris_features.values.tolist()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plumber" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "predicted = get_predictions(example_inputs).json()[\"output\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "iris[\"predicted\"] = predicted" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "iris" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Stop All Serving Containers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we will shut down the serving container we launched for the test." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!docker kill $(docker ps -q)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/r_examples/r_serving_with_plumber/deploy.R b/r_examples/r_serving_with_plumber/deploy.R new file mode 100644 index 0000000000..b15b24ef72 --- /dev/null +++ b/r_examples/r_serving_with_plumber/deploy.R @@ -0,0 +1,16 @@ +library(xgboost) +library(plumber) +library(jsonlite) + +# load a pretrained xgboost model +bst <- xgb.load("xgb.model") + +# create a closure around our xgboost model and input data processing +inference <- function(x){ + ds <- xgb.DMatrix(data = x ) + output <- predict(bst, ds) + list(output=output) +} + +app <- plumb('endpoints.R') +app$run(host='0.0.0.0', port=8080) diff --git a/r_examples/r_serving_with_plumber/endpoints.R b/r_examples/r_serving_with_plumber/endpoints.R new file mode 100644 index 0000000000..529763866b --- /dev/null +++ b/r_examples/r_serving_with_plumber/endpoints.R @@ -0,0 +1,20 @@ + +#' Ping to show server is there +#' @get /ping +function() { + return('Alive') +} + + +#' Parse input and return prediction from model +#' @param req The http request sent +#' @post /invocations +function(req) { + + # Read in data + input_json <- fromJSON(req$postBody) + output <- inference(input_json$features) + # Return prediction + return(output) + +} diff --git a/r_examples/r_serving_with_plumber/xgb.model 
b/r_examples/r_serving_with_plumber/xgb.model new file mode 100644 index 0000000000..1898fcdf6e Binary files /dev/null and b/r_examples/r_serving_with_plumber/xgb.model differ diff --git a/r_examples/r_serving_with_restrserve/Dockerfile b/r_examples/r_serving_with_restrserve/Dockerfile new file mode 100644 index 0000000000..69dc88b8c2 --- /dev/null +++ b/r_examples/r_serving_with_restrserve/Dockerfile @@ -0,0 +1,12 @@ +FROM r-base:3.6.3 + +MAINTAINER Amazon SageMaker Examples + +RUN R -e "install.packages(c('RestRserve','xgboost','dplyr'), repos='https://cloud.r-project.org')" + +COPY xgb.model /opt/ml/xgb.model +COPY restrserve.R /opt/ml/restrserve.R + +WORKDIR /opt/ml + +ENTRYPOINT ["/usr/bin/Rscript", "/opt/ml/restrserve.R", "--no-save"] diff --git a/r_examples/r_serving_with_restrserve/RestRServe Example.ipynb b/r_examples/r_serving_with_restrserve/RestRServe Example.ipynb new file mode 100644 index 0000000000..bfe1fcfa6e --- /dev/null +++ b/r_examples/r_serving_with_restrserve/RestRServe Example.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# R Serving with RestRserve" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dockerfile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* The Dockerfile defines the environment in which our server will be executed.\n", + "* Below, you can see that the entry point for our container will be `restrserve.R`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pycat Dockerfile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Code: restrserve.R\n", + "\n", + "The `restrserve.R` script handles the following steps:\n", + "* Loads the R libraries used by the server.\n", + "* Loads a pretrained `xgboost` model that has been trained on the classical [Iris](https://archive.ics.uci.edu/ml/datasets/iris) dataset.\n", + " * Dua, D.
and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.\n", + "* Defines an inference function that takes a matrix of iris features and returns predictions for those iris examples.\n", + "* Defines two routes:\n", + " * `/ping` returns a string 'Alive' to indicate that the application is healthy\n", + " * `/invocations` applies the previously defined inference function to the input features from the request body\n", + "* Launches the RestRserve serving application." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pycat restrserve.R" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build the Serving Image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!docker build -t r-restrserve ." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Launch the Serving Container" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!echo \"Launching RestRServer\"\n", + "!docker run -d --rm -p 5000:8080 r-restrserve\n", + "!echo \"Waiting for the server to start..\" && sleep 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!docker container list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Simple Python Client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from tqdm import tqdm\n", + "import pandas as pd\n", + "\n", + "pd.set_option(\"display.max_rows\", 500)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_predictions(examples, 
instance=requests, port=5000):\n", + " payload = {\"features\": examples}\n", + " return instance.post(f\"http://127.0.0.1:{port}/invocations\", json=payload)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_health(instance=requests, port=5000):\n", + " instance.get(f\"http://127.0.0.1:{port}/ping\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Example Inputs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's define example inputs from the Iris dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "column_names = [\"Sepal.Length\", \"Sepal.Width\", \"Petal.Length\", \"Petal.Width\", \"Label\"]\n", + "iris = pd.read_csv(\n", + " \"s3://sagemaker-sample-files/datasets/tabular/iris/iris.data\", names=column_names\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "iris_features = iris[[\"Sepal.Length\", \"Sepal.Width\", \"Petal.Length\", \"Petal.Width\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "example_inputs = iris_features.values.tolist()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### RestRserve" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "predicted = get_predictions(example_inputs).json()[\"output\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "iris[\"predicted\"] = predicted" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "iris" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Stop All Serving Containers" + ] + }, + { + "cell_type":
"markdown", + "metadata": {}, + "source": [ + "Finally, we will shut down the serving container we launched for the test." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!docker kill $(docker ps -q)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/r_examples/r_serving_with_restrserve/restrserve.R b/r_examples/r_serving_with_restrserve/restrserve.R new file mode 100644 index 0000000000..f3ec5cb482 --- /dev/null +++ b/r_examples/r_serving_with_restrserve/restrserve.R @@ -0,0 +1,31 @@ +library(xgboost) +library(RestRserve) + +# load a pretrained xgboost model +bst <- xgb.load("xgb.model") + +# create a closure around our xgboost model and input data processing +inference <- function(x){ + ds <- xgb.DMatrix(data = x ) + output <- predict(bst, ds) + list(output=output) +} +app = Application$new() + +app$add_get( + path = "/ping", + FUN = function(request, response) { + response$set_body(list(Status = "Alive")) + }) + +app$add_post( + path = "/invocations", + FUN = function(request, response) { + result = inference(do.call(rbind,request$body$features)) + response$set_content_type("application/json") + response$set_body(result) + }) + + +backend = BackendRserve$new() +backend$start(app, http_port = 8080) diff --git a/r_examples/r_serving_with_restrserve/xgb.model b/r_examples/r_serving_with_restrserve/xgb.model new file mode 100644 index 0000000000..1898fcdf6e Binary files /dev/null and b/r_examples/r_serving_with_restrserve/xgb.model differ diff --git a/reinforcement_learning/common/markdown_helper.py 
b/reinforcement_learning/common/markdown_helper.py index 3d0cb3051f..8a1302a1bc 100644 --- a/reinforcement_learning/common/markdown_helper.py +++ b/reinforcement_learning/common/markdown_helper.py @@ -89,8 +89,8 @@ def generate_help_for_robomaker_all_permissions(role): text += ( "3. Go to JSON tab, add the following JSON blob to the `Statement` list and save policy:\n" ) - text += """```json - { + text += f"""```json + {{ "Effect": "Allow", "Action": [ "robomaker:CreateSimulationApplication", @@ -104,31 +104,29 @@ def generate_help_for_robomaker_all_permissions(role): "Resource": [ "*" ] - }, - { + }}, + {{ "Effect": "Allow", "Action": "iam:CreateServiceLinkedRole", "Resource": "*", - "Condition": { - "StringEquals": { + "Condition": {{ + "StringEquals": {{ "iam:AWSServiceName": "robomaker.amazonaws.com" - } - } - }, - { + }} + }} + }}, + {{ "Effect": "Allow", - "Action": [ - "iam:PassRole" - ], - "Resource": "*", - "Condition": { - "StringEquals": { + "Action": "iam:PassRole", + "Resource": \"{role}\", + "Condition": {{ + "StringEquals": {{ "iam:PassedToService": [ "robomaker.amazonaws.com" ] - } - } - },```\n""" + }} + }} + }},```\n""" text += ( "4. Next, go to the `Trust relationships tab` and click on `Edit Trust Relationship.` \n" ) @@ -212,8 +210,8 @@ def generate_help_for_experiment_manager_permissions(role): text += ( "3. 
Go to JSON tab, add the following JSON blob to the `Statement` list and save policy:\n" ) - text += """```json - { + text += f"""```json + {{ "Effect": "Allow", "Action": [ "cloudformation:DescribeStacks", @@ -232,7 +230,6 @@ def generate_help_for_experiment_manager_permissions(role): "iam:PutRolePolicy", "iam:DeleteRolePolicy", "iam:DeleteRole", - "iam:PassRole", "cloudwatch:PutDashboard", "firehose:ListDeliveryStreams", "firehose:DeleteDeliveryStream", @@ -250,6 +247,11 @@ def generate_help_for_experiment_manager_permissions(role): "Resource": [ "*" ] - },```\n""" + }}, + {{ + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": \"{role}\" + }}```\n""" text += "4. Now wait for a few minutes before executing this cell again!" return text diff --git a/sagemaker-debugger/debugger_interactive_analysis_profiling/interactive_analysis_profiling_data.ipynb b/sagemaker-debugger/debugger_interactive_analysis_profiling/interactive_analysis_profiling_data.ipynb index 4daacc52fa..52c309e3c8 100644 --- a/sagemaker-debugger/debugger_interactive_analysis_profiling/interactive_analysis_profiling_data.ipynb +++ b/sagemaker-debugger/debugger_interactive_analysis_profiling/interactive_analysis_profiling_data.ipynb @@ -171,17 +171,10 @@ "metadata": {}, "outputs": [], "source": [ - "import pip\n", + "import sys\n", "\n", - "\n", - "def import_or_install(package):\n", - " try:\n", - " __import__(package)\n", - " except ImportError:\n", - " pip.main([\"install\", package])\n", - "\n", - "\n", - "import_or_install(\"smdebug\")" + "!{sys.executable} -m pip install \"smdebug\"\n", + "!{sys.executable} -m pip install \"bokeh==2.3.0\"" ] }, { @@ -194,7 +187,8 @@ "\n", "tj = TrainingJob(training_job_name, region)\n", "\n", - "tj.wait_for_sys_profiling_data_to_be_available()" + "tj.wait_for_sys_profiling_data_to_be_available()\n", + "tj.wait_for_framework_profiling_data_to_be_available()" ] }, { @@ -504,83 +498,9 @@ }, { "cell_type": "code", - "execution_count": 41, - "metadata": {}, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2021-04-29 07:02:03.203 ip-172-16-71-4:29196 INFO metrics_reader_base.py:134] Getting 9 event files\n", - "Found 76397 system metrics events from timestamp_in_us:0 to timestamp_in_us:1619676600000000\n", - "select events:['.*']\n", - "select dimensions:['CPU', 'GPU']\n", - "filtered_events:{'cpu23', 'cpu20', 'cpu19', 'cpu26', 'cpu18', 'cpu2', 'WriteThroughputInBytesPerSecond', 'cpu8', 'gpu3', 'cpu10', 'cpu16', 'ReceiveBytesPerSecond', 'TransmitBytesPerSecond', 'cpu17', 'gpu1', 'cpu13', 'cpu29', 'cpu3', 'IOPS', 'cpu22', 'MemoryUsedPercent', 'cpu11', 'cpu12', 'cpu25', 'cpu24', 'cpu1', 'cpu28', 'cpu9', 'gpu2', 'gpu0', 'cpu21', 'cpu27', 'ReadThroughputInBytesPerSecond', 'total', 'cpu5', 'cpu15', 'cpu4', 'cpu30', 'cpu7', 'cpu31', 'cpu14', 'cpu6', 'cpu0'}\n", - "filtered_dimensions:{'CPUUtilization-nodeid:algo-1', 'GPUUtilization-nodeid:algo-1', 'GPUMemoryUtilization-nodeid:algo-1'}\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "(function(root) {\n", - " function embed_document(root) {\n", - " \n", - " var docs_json = {\"d9af2108-bcb5-4ca5-a49f-df3aa06ee385\":{\"roots\":{\"references\":[{\"attributes\":{\"children\":[{\"id\":\"3332\"},{\"id\":\"3330\"}]},\"id\":\"3333\",\"type\":\"Column\"},{\"attributes\":{\"below\":[{\"id\":\"2502\"}],\"center\":[{\"id\":\"2505\"},{\"id\":\"2509\"}],\"left\":[{\"id\":\"2506\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2528\"}],\"title\":{\"id\":\"3221\"},\"toolbar\":{\"id\":\"2517\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2494\"},\"x_scale\":{\"id\":\"2498\"},\"y_range\":{\"id\":\"2496\"},\"y_scale\":{\"id\":\"2500\"}},\"id\":\"2493\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"2350\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3263\"},\"ticker\":{\"id\":\"2655\"}},\"id\":\"2654\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis\":{\"id\":\"2728\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2731\",\"type\":\"Grid\"},{\"attributes\":{\"below\":[{\"id\":\"2798\"}],\"center\":[{\"id\":\"2801\"},{\"id\":\"2805\"}],\"left\":[{\"id\":\"2802\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2824\"}],\"title\":{\"id\":\"3301\"},\"toolbar\":{\"id\":\"2813\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2790\"},\"x_scale\":{\"id\":\"2794\"},\"y_range\":{\"id\":\"2792\"},\"y_scale\":{\"id\":\"2796\"}},\"id\":\"2789\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"axis\":{\"id\":\"1429\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1432\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2927\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"2352\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2531\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2655\"
,\"type\":\"BasicTicker\"},{\"attributes\":{\"source\":{\"id\":\"2783\"}},\"id\":\"2788\",\"type\":\"CDSView\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"2933\"},\"ticker\":{\"id\":\"1434\"}},\"id\":\"1433\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2494\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2886\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"overlay\":{\"id\":\"2775\"}},\"id\":\"2771\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu7\",\"formatter\":{\"id\":\"3185\"},\"ticker\":{\"id\":\"2355\"}},\"id\":\"2354\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis\":{\"id\":\"2654\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2657\",\"type\":\"Grid\"},{\"attributes\":{\"start\":0},\"id\":\"2496\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1434\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2774\",\"type\":\"HelpTool\"},{\"attributes\":{\"text\":\"\"},\"id\":\"2901\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"2790\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2355\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2732\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"2769\",\"type\":\"PanTool\"},{\"attributes\":{\"axis\":{\"id\":\"1433\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1436\",\"type\":\"Grid\"},{\"attributes\":{\"axis\":{\"id\":\"2354\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2357\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2498\",\"type\":\"LinearScale\"},{\"attributes\":{\"start\":0},\"id\":\"2792\",\"type\":\"DataRange1d\"},{\"attributes\":{\"source\":{\"id\":\"2894\"}},\"id\":\"2
899\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2733\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"data_source\":{\"id\":\"1451\"},\"glyph\":{\"id\":\"1453\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1454\"},\"selection_glyph\":null,\"view\":{\"id\":\"1456\"}},\"id\":\"1455\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2500\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3183\"},\"ticker\":{\"id\":\"2359\"}},\"id\":\"2358\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2794\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2658\",\"type\":\"PanTool\"},{\"attributes\":{\"overlay\":{\"id\":\"2738\"}},\"id\":\"2734\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu0\",\"formatter\":{\"id\":\"3225\"},\"ticker\":{\"id\":\"2503\"}},\"id\":\"2502\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2796\",\"type\":\"LinearScale\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1517\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[588,111,55,41,32,28,15,15,13,8,6,7,7,4,2,2,4,1,3,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"2987\"},\"selection_policy\":{\"id\":\"2988\"}},\"id\":\"1636\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2359\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2659\",\"type\":
\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"2735\",\"type\":\"SaveTool\"},{\"attributes\":{\"data_source\":{\"id\":\"2894\"},\"glyph\":{\"id\":\"2896\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2897\"},\"selection_glyph\":null,\"view\":{\"id\":\"2899\"}},\"id\":\"2898\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"children\":[[{\"id\":\"1309\"},0,0],[{\"id\":\"1346\"},0,1],[{\"id\":\"1383\"},0,2],[{\"id\":\"1420\"},0,3],[{\"id\":\"1457\"},1,0],[{\"id\":\"1494\"},1,1],[{\"id\":\"1531\"},1,2],[{\"id\":\"1568\"},1,3],[{\"id\":\"1605\"},2,0],[{\"id\":\"1642\"},2,1],[{\"id\":\"1679\"},2,2],[{\"id\":\"1716\"},2,3],[{\"id\":\"1753\"},3,0],[{\"id\":\"1790\"},3,1],[{\"id\":\"1827\"},3,2],[{\"id\":\"1864\"},3,3],[{\"id\":\"1901\"},4,0],[{\"id\":\"1938\"},4,1],[{\"id\":\"1975\"},4,2],[{\"id\":\"2012\"},4,3],[{\"id\":\"2049\"},5,0],[{\"id\":\"2086\"},5,1],[{\"id\":\"2123\"},5,2],[{\"id\":\"2160\"},5,3],[{\"id\":\"2197\"},6,0],[{\"id\":\"2234\"},6,1],[{\"id\":\"2271\"},6,2],[{\"id\":\"2308\"},6,3],[{\"id\":\"2345\"},7,0],[{\"id\":\"2382\"},7,1],[{\"id\":\"2419\"},7,2],[{\"id\":\"2456\"},7,3],[{\"id\":\"2493\"},8,0],[{\"id\":\"2530\"},8,1],[{\"id\":\"2567\"},8,2],[{\"id\":\"2604\"},8,3],[{\"id\":\"2641\"},9,0],[{\"id\":\"2678\"},9,1],[{\"id\":\"2715\"},9,2],[{\"id\":\"2752\"},9,3],[{\"id\":\"2789\"},10,0],[{\"id\":\"2826\"},10,1],[{\"id\":\"2863\"},10,2]]},\"id\":\"3330\",\"type\":\"GridBox\"},{\"attributes\":{},\"id\":\"1437\",\"type\":\"PanTool\"},{\"attributes\":{\"axis\":{\"id\":\"2358\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2361\",\"type\":\"Grid\"},{\"attributes\":{\"axis_label\":\"GPUMemoryUtilization-nodeid:algo-1_gpu2\",\"formatter\":{\"id\":\"3305\"},\"ticker\":{\"id\":\"2799\"}},\"id\":\"2798\",\"type\":\"LinearAxis\"},{\"attributes\":{\"overlay\":{\"id\":\"2664\"}},\"id\":\"2660\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"2736\",\"type\":\"ResetTool\"},{\"attributes\":{\"axis\":{\"id
\":\"2502\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2505\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1438\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"2903\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2799\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2661\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"2737\",\"type\":\"HelpTool\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3223\"},\"ticker\":{\"id\":\"2507\"}},\"id\":\"2506\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis\":{\"id\":\"2798\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2801\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2905\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"source\":{\"id\":\"2376\"}},\"id\":\"2381\",\"type\":\"CDSView\"},{\"attributes\":{\"below\":[{\"id\":\"2391\"}],\"center\":[{\"id\":\"2394\"},{\"id\":\"2398\"}],\"left\":[{\"id\":\"2395\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2417\"}],\"title\":{\"id\":\"3191\"},\"toolbar\":{\"id\":\"2406\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2383\"},\"x_scale\":{\"id\":\"2387\"},\"y_range\":{\"id\":\"2385\"},\"y_scale\":{\"id\":\"2389\"}},\"id\":\"2382\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"2662\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"2507\",\"type\":\"BasicTicker\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2732\"},{\"id\":\"2733\"},{\"id\":\"2734\"},{\"id\":\"2735\"},{\"id\":\"2736\"},{\"id\":\"2737\"}]},\"id\":\"2739\",\"type\":\"Toolbar\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3303\"},\"ticker\":{\"id\":\"2803\"}},\"id\":\"2802\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left
\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1454\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2907\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"2362\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"2663\",\"type\":\"HelpTool\"},{\"attributes\":{\"axis\":{\"id\":\"2506\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2509\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2803\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2908\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2363\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"2773\",\"type\":\"ResetTool\"},{\"attributes\":{\"source\":{\"id\":\"1451\"}},\"id\":\"1456\",\"type\":\"CDSView\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2658\"},{\"id\":\"2659\"},{\"id\":\"2660\"},{\"id\":\"2661\"},{\"id\":\"2662\"},{\"id\":\"2663\"}]},\"id\":\"2665\",\"type\":\"Toolbar\"},{\"attributes\":{\"axis\":{\"id\":\"2802\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2805\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2928\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"below\":[{\"id\":\"2539\"}],\"center\":[{\"id\":\"2542\"},{\"id\":\"2546\"}],\"left\":[{\"id\":\"2543\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2565\"}],\"title\":{\"id\":\"3231\"},\"toolbar\":{\"id\":\"2554\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2531\"},\"x_scale\":{\"id\":\"2535\"},\"y_range\":{\"id\":\"2533\"},\"y_scale\":{\"id\":\"2537\"}},\"id\":\"2530\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"overlay\":{\"id\":\"2368\"}},\"id\":\"2364\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"2772\",\"type\":\"SaveTool\"},{\"attributes\":{\"below\":[{\"id\":\"1466\"}],\"center\":[{\"id\":
\"1469\"},{\"id\":\"1473\"}],\"left\":[{\"id\":\"1470\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1492\"}],\"title\":{\"id\":\"2941\"},\"toolbar\":{\"id\":\"1481\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1458\"},\"x_scale\":{\"id\":\"1462\"},\"y_range\":{\"id\":\"1460\"},\"y_scale\":{\"id\":\"1464\"}},\"id\":\"1457\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"text\":\"\"},\"id\":\"2931\",\"type\":\"Title\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1490\",\"type\":\"Quad\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[829,2,1,1,10,102,8,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3287\"},\"selection_policy\":{\"id\":\"3288\"}},\"id\":\"2746\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2695\"},{\"id\":\"2696\"},{\"id\":\"2697\"},{\"id\":\"2698\"},{\"id\":\"2699\"},{\"id\":\"2700\"}]},\"id\":\"2702\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"2365\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"2848\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"2510\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1458\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"l
ine_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2738\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"2933\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2366\",\"type\":\"ResetTool\"},{\"attributes\":{\"data_source\":{\"id\":\"2746\"},\"glyph\":{\"id\":\"2748\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2749\"},\"selection_glyph\":null,\"view\":{\"id\":\"2751\"}},\"id\":\"2750\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2701\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"2511\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"start\":0},\"id\":\"1460\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2923\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2935\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2367\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"2806\",\"type\":\"PanTool\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2664\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"overlay\":{\"id\":\"2516\"}},\"id\":\"2512\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1462\",\"type\":\"LinearScale\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2749\"
,\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2937\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"2807\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"data_source\":{\"id\":\"2672\"},\"glyph\":{\"id\":\"2674\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2675\"},\"selection_glyph\":null,\"view\":{\"id\":\"2677\"}},\"id\":\"2676\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2918\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2513\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1464\",\"type\":\"LinearScale\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[569,79,40,27,21,21,22,18,13,5,11,8,8,1,3,2,3,4,1,0,3,1,0,1,1,0,0,0,0,1,0,0,2,0,0,0,0,1,1,0,0,0,2,0,1,0,1,0,4]},\"selected\":{\"id\":\"3207\"},\"selection_policy\":{\"id\":\"3208\"}},\"id\":\"2450\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"2746\"}},\"id\":\"2751\",\"type\":\"CDSView\"},{\"attributes\":{\"overlay\":{\"id\":\"2812\"}},\"id\":\"2808\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2675\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2938\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2514\",\"type\":\"ResetTool\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu18\",\"formatter\":{\"id\":\"2945\"},\"ticker\":{\"id\":\"1467\"}},\"id\":\"1466\",\"type\":\"LinearAxis\"},{\"attributes\":{\"text\":\"\"},\"id\":\"2941\",\"type\":\"Title\"},
{\"attributes\":{\"axis\":{\"id\":\"2765\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2768\",\"type\":\"Grid\"},{\"attributes\":{\"below\":[{\"id\":\"2761\"}],\"center\":[{\"id\":\"2764\"},{\"id\":\"2768\"}],\"left\":[{\"id\":\"2765\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2787\"}],\"title\":{\"id\":\"3291\"},\"toolbar\":{\"id\":\"2776\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2753\"},\"x_scale\":{\"id\":\"2757\"},\"y_range\":{\"id\":\"2755\"},\"y_scale\":{\"id\":\"2759\"}},\"id\":\"2752\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"2809\",\"type\":\"SaveTool\"},{\"attributes\":{\"data_source\":{\"id\":\"2783\"},\"glyph\":{\"id\":\"2785\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2786\"},\"selection_glyph\":null,\"view\":{\"id\":\"2788\"}},\"id\":\"2787\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2515\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"2943\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1467\",\"type\":\"BasicTicker\"},{\"attributes\":{\"source\":{\"id\":\"2672\"}},\"id\":\"2677\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2810\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"2753\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2748\",\"type\":\"Quad\"},{\"attributes\":{\"axis\":{\"id\":\"1466\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1469\",\"type\":\"Grid\"},{\"attributes\":{\"data_source\":{\"id\":\"2709\"},\"glyph\":{\"id\":\"2711\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2712\"},\"selection_glyph\":null,\"view\":{\"id\":\"2714\"}},\"id\":\"2713\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"294
5\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2540\",\"type\":\"BasicTicker\"},{\"attributes\":{\"below\":[{\"id\":\"2687\"}],\"center\":[{\"id\":\"2690\"},{\"id\":\"2694\"}],\"left\":[{\"id\":\"2691\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2713\"}],\"title\":{\"id\":\"3271\"},\"toolbar\":{\"id\":\"2702\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2679\"},\"x_scale\":{\"id\":\"2683\"},\"y_range\":{\"id\":\"2681\"},\"y_scale\":{\"id\":\"2685\"}},\"id\":\"2678\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2510\"},{\"id\":\"2511\"},{\"id\":\"2512\"},{\"id\":\"2513\"},{\"id\":\"2514\"},{\"id\":\"2515\"}]},\"id\":\"2517\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"2811\",\"type\":\"HelpTool\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"2943\"},\"ticker\":{\"id\":\"1471\"}},\"id\":\"1470\",\"type\":\"LinearAxis\"},{\"attributes\":{\"start\":0},\"id\":\"2755\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2674\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2947\",\"type\":\"Selection\"},{\"attributes\":{\"text\":\"\"},\"id\":\"2921\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"2679\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2757\",\"type\":\"LinearScale\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2806\"},{\"id\":\"2807\"},{\"id\":\"2808\"},{\"id\":\"2809\"},{\"id\":\"2810\"},{\"id\":\"2811\"}]},\"id\":\"2813\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1471\",\
"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2948\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2537\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2759\",\"type\":\"LinearScale\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2843\"},{\"id\":\"2844\"},{\"id\":\"2845\"},{\"id\":\"2846\"},{\"id\":\"2847\"},{\"id\":\"2848\"}]},\"id\":\"2850\",\"type\":\"Toolbar\"},{\"attributes\":{\"axis\":{\"id\":\"1470\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1473\",\"type\":\"Grid\"},{\"attributes\":{\"text\":\"\"},\"id\":\"2951\",\"type\":\"Title\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2849\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"axis\":{\"id\":\"2691\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2694\",\"type\":\"Grid\"},{\"attributes\":{\"axis_label\":\"GPUMemoryUtilization-nodeid:algo-1_gpu1\",\"formatter\":{\"id\":\"3295\"},\"ticker\":{\"id\":\"2762\"}},\"id\":\"2761\",\"type\":\"LinearAxis\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[829,2,1,2,8,93,19,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3297\"},\"selection_policy\":{\"id\":\"3298\"}},\"id\":\"2783\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1606\",\"type\":\"DataRange1d\"},{\"attribu
tes\":{},\"id\":\"2925\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2953\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[826,3,1,0,0,0,1,0,1,0,0,0,2,0,0,1,7,19,33,54,6,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3277\"},\"selection_policy\":{\"id\":\"3278\"}},\"id\":\"2709\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2368\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2712\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2762\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2955\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"below\":[{\"id\":\"2724\"}],\"center\":[{\"id\":\"2727\"},{\"id\":\"2731\"}],\"left\":[{\"id\":\"2728\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2750\"}],\"title\":{\"id\":\"3281\"},\"toolbar\":{\"id\":\"2739\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2716\"},\"x_scale\":{\"id\":\"2720\"},\"y_range\":{\"id\":\"2718\"},\"y_scale\":{\"id\":\"2722\"}},\"id\":\"2715\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"2383\",\"type\":\"DataRange1d\"},{\"attributes\":{\"axis\":{\"id\":
\"2761\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2764\",\"type\":\"Grid\"},{\"attributes\":{\"below\":[{\"id\":\"1614\"}],\"center\":[{\"id\":\"1617\"},{\"id\":\"1621\"}],\"left\":[{\"id\":\"1618\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1640\"}],\"title\":{\"id\":\"2981\"},\"toolbar\":{\"id\":\"1629\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1606\"},\"x_scale\":{\"id\":\"1610\"},\"y_range\":{\"id\":\"1608\"},\"y_scale\":{\"id\":\"1612\"}},\"id\":\"1605\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"start\":0},\"id\":\"2385\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2812\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[600,97,57,46,29,23,12,14,14,9,6,10,8,1,2,2,1,1,0,2,0,0,1,0,1,2,1,0,0,0,0,0,0,0,0,0,3,0,1,0,1,2,1,1,1,2,0,1,0]},\"selected\":{\"id\":\"2947\"},\"selection_policy\":{\"id\":\"2948\"}},\"id\":\"1488\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1474\",\"type\":\"PanTool\"},{\"attributes\":{\"axis_label\":\"GPUUtilization-nodeid:algo-1_gpu3\",\"formatter\":{\"id\":\"3235\"},\"ticker\":{\"id\":\"2540\"}},\"id\":\"2539\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3293\"},\"ticker\":{\"id\":\"2766\"}},\"id\":\"2765\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2387\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2716\",\"type\
":\"DataRange1d\"},{\"attributes\":{\"data_source\":{\"id\":\"2820\"},\"glyph\":{\"id\":\"2822\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2823\"},\"selection_glyph\":null,\"view\":{\"id\":\"2825\"}},\"id\":\"2824\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2958\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1475\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"2535\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2766\",\"type\":\"BasicTicker\"},{\"attributes\":{\"text\":\"\"},\"id\":\"2961\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"2389\",\"type\":\"LinearScale\"},{\"attributes\":{\"start\":0},\"id\":\"2718\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2823\",\"type\":\"Quad\"},{\"attributes\":{\"overlay\":{\"id\":\"1480\"}},\"id\":\"1476\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"2770\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2516\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu31\",\"formatter\":{\"id\":\"3195\"},\"ticker\":{\"id\":\"2392\"}},\"id\":\"2391\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2720\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2963\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"source\":{\"id\":\"2820\"}},\"id\":\"2825\",\"type\":\"CDSView\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\"
:\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2769\"},{\"id\":\"2770\"},{\"id\":\"2771\"},{\"id\":\"2772\"},{\"id\":\"2773\"},{\"id\":\"2774\"}]},\"id\":\"2776\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1477\",\"type\":\"SaveTool\"},{\"attributes\":{\"start\":0},\"id\":\"2533\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2722\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2965\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2392\",\"type\":\"BasicTicker\"},{\"attributes\":{\"below\":[{\"id\":\"2835\"}],\"center\":[{\"id\":\"2838\"},{\"id\":\"2842\"}],\"left\":[{\"id\":\"2839\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2861\"}],\"title\":{\"id\":\"3311\"},\"toolbar\":{\"id\":\"2850\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2827\"},\"x_scale\":{\"id\":\"2831\"},\"y_range\":{\"id\":\"2829\"},\"y_scale\":{\"id\":\"2833\"}},\"id\":\"2826\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1478\",\"type\":\"ResetTool\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[829,2,1,1,10,101,9,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3317\"},\"selection_policy\":{\"id\":\"3318\"}},\"id\":\"2857\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"axis\":{\"id\":\"2391\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2394\",\"type\":\"Grid\"},{\"attributes\":{\"axis_label\":\"GPUMemoryUtilization-nodeid:algo-1_gpu3\",\"formatter\":{\"id\":\"3285\"},\"ticker\":{\"id\":\"2725\"}},\"id\":\"2724\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"valu
e\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2822\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2917\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1479\",\"type\":\"HelpTool\"},{\"attributes\":{\"axis\":{\"id\":\"2539\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2542\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2827\",\"type\":\"DataRange1d\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3193\"},\"ticker\":{\"id\":\"2396\"}},\"id\":\"2395\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2725\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2967\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"2968\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3233\"},\"ticker\":{\"id\":\"2544\"}},\"id\":\"2543\",\"type\":\"LinearAxis\"},{\"attributes\":{\"start\":0},\"id\":\"2829\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2859\",\"type\":\"Quad\"},{\"attributes\":{\"text\":\"\"},\"id\":\"2971\",\"type\":\"Title\"},{\"attributes\":{\"axis\":{\"id\":\"2724\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2727\",\"type\":\"Grid\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2775\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"2396\",\"type\":\"BasicTicker\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\
",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1474\"},{\"id\":\"1475\"},{\"id\":\"1476\"},{\"id\":\"1477\"},{\"id\":\"1478\"},{\"id\":\"1479\"}]},\"id\":\"1481\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"2831\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2544\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis\":{\"id\":\"2395\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2398\",\"type\":\"Grid\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3283\"},\"ticker\":{\"id\":\"2729\"}},\"id\":\"2728\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2973\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"source\":{\"id\":\"1599\"}},\"id\":\"1604\",\"type\":\"CDSView\"},{\"attributes\":{\"axis\":{\"id\":\"2543\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2546\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2833\",\"type\":\"LinearScale\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2785\",\"type\":\"Quad\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1511\"},{\"id\":\"1512\"},{\"id\":\"1513\"},{\"id\":\"1514\"},{\"id\":\"1515\"},{\"id\":\"1516\"}]},\"id\":\"1518\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"2975\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2729\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis_label\":\"GPUMemoryUtilization-nodeid:algo-1_gpu0\",\"formatter\":{\"id\":\"3315\"},\"ticker\":{\"id\":\"2836\"}},\"id\":\"2835\",\"type\":\"LinearAxis\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86
,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[675,89,32,24,15,20,21,11,15,10,7,7,6,3,2,0,4,0,0,0,0,0,2,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"2957\"},\"selection_policy\":{\"id\":\"2958\"}},\"id\":\"1525\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[680,78,46,23,19,24,15,18,8,8,7,8,6,7,3,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3197\"},\"selection_policy\":{\"id\":\"3198\"}},\"id\":\"2413\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[824,3,2,1,0,0,2,0,0,0,0,1,0,1,2,4,7,13,21,58,14,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3237\"},\"selection_policy\":{\"id\":\"3238\"}},\"id\":\"2561\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2786\",\"type\":\"Quad\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3121\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"2977\",\"type\":\"Selection\"},{\"attributes\":{\"be
low\":[{\"id\":\"2280\"}],\"center\":[{\"id\":\"2283\"},{\"id\":\"2287\"}],\"left\":[{\"id\":\"2284\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2306\"}],\"title\":{\"id\":\"3161\"},\"toolbar\":{\"id\":\"2295\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2272\"},\"x_scale\":{\"id\":\"2276\"},\"y_range\":{\"id\":\"2274\"},\"y_scale\":{\"id\":\"2278\"}},\"id\":\"2271\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu17\",\"formatter\":{\"id\":\"2995\"},\"ticker\":{\"id\":\"1652\"}},\"id\":\"1651\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"3123\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2978\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2294\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"1652\",\"type\":\"BasicTicker\"},{\"attributes\":{\"text\":\"\"},\"id\":\"2981\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"3125\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"axis\":{\"id\":\"1651\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1654\",\"type\":\"Grid\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2305\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2251\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"2983\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3127\",\"type\":\"Selection\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"2993\"},\"ticker\":{\"id\":\"1656\"}},\"id\":\"1655\",
\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2252\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"2985\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3128\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1656\",\"type\":\"BasicTicker\"},{\"attributes\":{\"overlay\":{\"id\":\"2257\"}},\"id\":\"2253\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3131\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"2987\",\"type\":\"Selection\"},{\"attributes\":{\"axis\":{\"id\":\"1655\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1658\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2254\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"3133\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2988\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2255\",\"type\":\"ResetTool\"},{\"attributes\":{\"text\":\"\"},\"id\":\"2991\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"3135\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"source\":{\"id\":\"1673\"}},\"id\":\"1678\",\"type\":\"CDSView\"},{\"attributes\":{\"below\":[{\"id\":\"1688\"}],\"center\":[{\"id\":\"1691\"},{\"id\":\"1695\"}],\"left\":[{\"id\":\"1692\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1714\"}],\"title\":{\"id\":\"3001\"},\"toolbar\":{\"id\":\"1703\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1680\"},\"x_scale\":{\"id\":\"1684\"},\"y_range\":{\"id\":\"1682\"},\"y_scale\":{\"id\":\"1686\"}},\"id\":\"1679\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"2256\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"2993\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2503\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"3137\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1659\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"2995\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3138\",\"type\":\"UnionRendere
rs\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2251\"},{\"id\":\"2252\"},{\"id\":\"2253\"},{\"id\":\"2254\"},{\"id\":\"2255\"},{\"id\":\"2256\"}]},\"id\":\"2258\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1310\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1660\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3141\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"2997\",\"type\":\"Selection\"},{\"attributes\":{\"overlay\":{\"id\":\"1665\"}},\"id\":\"1661\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"start\":0},\"id\":\"1312\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"3143\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2998\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2288\"},{\"id\":\"2289\"},{\"id\":\"2290\"},{\"id\":\"2291\"},{\"id\":\"2292\"},{\"id\":\"2293\"}]},\"id\":\"2295\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1662\",\"type\":\"SaveTool\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3001\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"3145\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"data_source\":{\"id\":\"2302\"},\"glyph\":{\"id\":\"2304\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2305\"},\"selection_glyph\":null,\"view\":{\"id\":\"2307\"}},\"id\":\"2306\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1663\",\"type\":\"ResetTool\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2257\",\"type\":\"BoxAnnotation\"},{\"attributes\
":{},\"id\":\"3003\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3147\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1664\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"3005\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3148\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"2265\"},\"glyph\":{\"id\":\"2267\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2268\"},\"selection_glyph\":null,\"view\":{\"id\":\"2270\"}},\"id\":\"2269\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3151\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"3007\",\"type\":\"Selection\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1712\",\"type\":\"Quad\"},{\"attributes\":{\"source\":{\"id\":\"2265\"}},\"id\":\"2270\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1680\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"3153\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3008\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"start\":0},\"id\":\"1756\",\"type\":\"DataRange1d\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3011\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"3155\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2304\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2272\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"
field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1343\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"3013\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3157\",\"type\":\"Selection\"},{\"attributes\":{\"below\":[{\"id\":\"1318\"}],\"center\":[{\"id\":\"1321\"},{\"id\":\"1325\"}],\"left\":[{\"id\":\"1322\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1344\"}],\"title\":{\"id\":\"2901\"},\"toolbar\":{\"id\":\"1333\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1310\"},\"x_scale\":{\"id\":\"1314\"},\"y_range\":{\"id\":\"1312\"},\"y_scale\":{\"id\":\"1316\"}},\"id\":\"1309\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2267\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"3015\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3158\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1754\",\"type\":\"DataRange1d\"},{\"attributes\":{\"start\":0},\"id\":\"2274\",\"type\":\"DataRange1d\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[667,98,31,26,15,21,20,18,14,9,13,6,3,4,0,1,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,1]},\"selected\":{\"id\":\"2907\"},\"selection_policy\":{\"id\":\"2908\"}},\"id\":\"1340\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3161\",\"type\":\"Title\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color
\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1342\",\"type\":\"Quad\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[327,37,20,15,12,4,1,6,2,2,2,2,4,3,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3027\"},\"selection_policy\":{\"id\":\"3028\"}},\"id\":\"1784\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3017\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1758\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2276\",\"type\":\"LinearScale\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1665\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"3163\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3018\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2278\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1314\",\"type\":\"LinearScale\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3021\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"3165\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"start\":0},\"id\":\"1682\",\"type\":\"DataRange1d\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu4\",\"formatter\":{\"id\":\"3165\"},\"ticker\":{\"id\":\"2281\"}},\"id\":\"2280\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1316\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"3023\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3167\",\"type\":\"S
election\"},{\"attributes\":{\"overlay\":{\"id\":\"1332\"}},\"id\":\"1328\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1684\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2281\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu23\",\"formatter\":{\"id\":\"2905\"},\"ticker\":{\"id\":\"1319\"}},\"id\":\"1318\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"3025\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3168\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1686\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis\":{\"id\":\"2280\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2283\",\"type\":\"Grid\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3171\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1319\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"3027\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1329\",\"type\":\"SaveTool\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu13\",\"formatter\":{\"id\":\"3005\"},\"ticker\":{\"id\":\"1689\"}},\"id\":\"1688\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3163\"},\"ticker\":{\"id\":\"2285\"}},\"id\":\"2284\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis\":{\"id\":\"1318\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1321\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"3173\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3028\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1689\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2285\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"2903\"},\"ticker\":{\"id\":\"1323\"}},\"id\":\"1322\",\"type\":\"LinearAxis\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3031\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"3175\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1330\",\"type\":\"R
esetTool\"},{\"attributes\":{\"axis\":{\"id\":\"1688\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1691\",\"type\":\"Grid\"},{\"attributes\":{\"axis\":{\"id\":\"2284\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2287\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1323\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"3033\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3177\",\"type\":\"Selection\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3003\"},\"ticker\":{\"id\":\"1693\"}},\"id\":\"1692\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis\":{\"id\":\"1322\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1325\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"3035\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2378\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"3178\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1693\",\"type\":\"BasicTicker\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3181\",\"type\":\"Title\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1380\",\"type\":\"Quad\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[670,80,41,32,28,20,20,18,8,8,9,7,5,2,0,0,1,1,1,1,0,0,1,0,1,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3167\"},\"selection_policy\":{\"id\":\"3168\"}},\"id\":\"2302\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3037\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"2695\",\"type\":\"PanTool\"},{\"attributes\":{\"axis\":{\"id\":\"1692\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1695\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2288\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"3183\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3038\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"below\":[{\"id\":\"1762\"}],\"center\":[{\"id\":\"1765\"},{\"id\":\"1769\"}],\"left\":[{\"id\":\"1766\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1788\"}],\"title\":{\"id\":\"3021\"},\"toolbar\":{\"id\":\"1777\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1754\"},\"x_scale\":{\"id\":\"1758\"},\"y_range\":{\"id\":\"1756\"},\"y_scale\":{\"id\":\"1760\"}},\"id\":\"1753\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"2289\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3041\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"3185\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"overlay\":{\"id\":\"1369\"}},\"id\":\"1365\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1331\",\"type\":\"HelpTool\"},{\"attributes\":{\"overlay\":{\"id\":\"2294\"}},\"id\":\"2290\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[552,82,41,32,27,21,17,23,15,13,11,7,6,3,3,3,2,1,2,0,2,0,1,1,1,1,0,0,0,1,0,1,1,0,0,0,1,0,1,2,0,1,0,0,0,0,2,1,3]},\"selected\":{\"id\":\"3007\"},\
"selection_policy\":{\"id\":\"3008\"}},\"id\":\"1710\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3043\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1327\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"3187\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1326\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1696\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"2291\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"3045\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3188\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1697\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"2292\",\"type\":\"ResetTool\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3191\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"3047\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"2293\",\"type\":\"HelpTool\"},{\"attributes\":{\"axis\":{\"id\":\"2132\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2135\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2836\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2698\",\"type\":\"SaveTool\"},{\"attributes\":{\"below\":[{\"id\":\"2317\"}],\"center\":[{\"id\":\"2320\"},{\"id\":\"2324\"}],\"left\":[{\"id\":\"2321\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2343\"}],\"title\":{\"id\":\"3171\"},\"toolbar\":{\"id\":\"2332\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2309\"},\"x_scale\":{\"id\":\"2313\"},\"y_range\":{\"id\":\"2311\"},\"y_scale\":{\"id\":\"2315\"}},\"id\":\"2308\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3123\"},\"ticker\":{\"id\":\"2137\"}},\"id\":\"2136\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis\":{\"id\":\"2835\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2838\",\"type\":\"Grid\"},{\"attributes\":{\"overlay\":{\"id\":\"2701\"}},\"id\":\"2697\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"axis_label\":\"Occuren
ces\",\"formatter\":{\"id\":\"3313\"},\"ticker\":{\"id\":\"2840\"}},\"id\":\"2839\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2137\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1368\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"2696\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"axis\":{\"id\":\"2136\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2139\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2840\",\"type\":\"BasicTicker\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1443\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"data_source\":{\"id\":\"2154\"},\"glyph\":{\"id\":\"2156\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2157\"},\"selection_glyph\":null,\"view\":{\"id\":\"2159\"}},\"id\":\"2158\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis\":{\"id\":\"2839\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2842\",\"type\":\"Grid\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1332\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"2309\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2220\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":
null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2214\"},{\"id\":\"2215\"},{\"id\":\"2216\"},{\"id\":\"2217\"},{\"id\":\"2218\"},{\"id\":\"2219\"}]},\"id\":\"2221\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1347\",\"type\":\"DataRange1d\"},{\"attributes\":{\"source\":{\"id\":\"2857\"}},\"id\":\"2862\",\"type\":\"CDSView\"},{\"attributes\":{\"start\":0},\"id\":\"2311\",\"type\":\"DataRange1d\"},{\"attributes\":{\"source\":{\"id\":\"1340\"}},\"id\":\"1345\",\"type\":\"CDSView\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2896\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2140\",\"type\":\"PanTool\"},{\"attributes\":{\"below\":[{\"id\":\"1355\"}],\"center\":[{\"id\":\"1358\"},{\"id\":\"1362\"}],\"left\":[{\"id\":\"1359\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1381\"}],\"title\":{\"id\":\"2911\"},\"toolbar\":{\"id\":\"1370\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1347\"},\"x_scale\":{\"id\":\"1351\"},\"y_range\":{\"id\":\"1349\"},\"y_scale\":{\"id\":\"1353\"}},\"id\":\"1346\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"2313\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2141\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"2843\",\"type\":\"PanTool\"},{\"attributes\":{\"data_source\":{\"id\":\"1340\"},\"glyph\":{\"id\":\"1342\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1343\"},\"selection_glyph\":null,\"view\":{\"id\":\"1345\"}},\"id\":\"1344\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"below\":[{\"id\":\"1392\"}],\"center\":[{\"id\":\"1395\"},{\"id\":\"1399\"}],\"left\":[{\"id\":\"1396\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1418\"}],\"title\":{\"id\":\"2921\"},\"toolbar\":{\"id\":\"1407
\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1384\"},\"x_scale\":{\"id\":\"1388\"},\"y_range\":{\"id\":\"1386\"},\"y_scale\":{\"id\":\"1390\"}},\"id\":\"1383\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"2315\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2844\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu30\",\"formatter\":{\"id\":\"3175\"},\"ticker\":{\"id\":\"2318\"}},\"id\":\"2317\",\"type\":\"LinearAxis\"},{\"attributes\":{\"overlay\":{\"id\":\"2849\"}},\"id\":\"2845\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2157\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2318\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2846\",\"type\":\"SaveTool\"},{\"attributes\":{\"start\":0},\"id\":\"1349\",\"type\":\"DataRange1d\"},{\"attributes\":{\"axis\":{\"id\":\"2317\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2320\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"2154\"}},\"id\":\"2159\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2847\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1351\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3173\"},\"ticker\":{\"id\":\"2322\"}},\"id\":\"2321\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2193\",\"type\":\"Quad\"},{\"attributes\":{\"below\":[{\"id\":\"2169\"}],\"center\":[{\"id\":\"2172\"},{\"id\":\"2176\"}],\"left\":[{\"id\":\"2173\"}],\"plot_height\":250,\"plot_widt
h\":250,\"renderers\":[{\"id\":\"2195\"}],\"title\":{\"id\":\"3131\"},\"toolbar\":{\"id\":\"2184\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2161\"},\"x_scale\":{\"id\":\"2165\"},\"y_range\":{\"id\":\"2163\"},\"y_scale\":{\"id\":\"2167\"}},\"id\":\"2160\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1353\",\"type\":\"LinearScale\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[642,93,33,28,27,17,14,9,13,9,9,11,4,5,1,2,2,3,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1]},\"selected\":{\"id\":\"2977\"},\"selection_policy\":{\"id\":\"2978\"}},\"id\":\"1599\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2156\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2322\",\"type\":\"BasicTicker\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2860\",\"type\":\"Quad\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu20\",\"formatter\":{\"id\":\"2915\"},\"ticker\":{\"id\":\"1356\"}},\"id\":\"1355\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2161\",\"type\":\"DataRange1d\"},{\"attributes\":{\"data_source\":{\"id\":\"2857\"},\"glyph\":{\"id\":\"2859\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2860\"},\"selection_glyph\":null,\"vi
ew\":{\"id\":\"2862\"}},\"id\":\"2861\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"below\":[{\"id\":\"2872\"}],\"center\":[{\"id\":\"2875\"},{\"id\":\"2879\"}],\"left\":[{\"id\":\"2876\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2898\"}],\"title\":{\"id\":\"3321\"},\"toolbar\":{\"id\":\"2887\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2864\"},\"x_scale\":{\"id\":\"2868\"},\"y_range\":{\"id\":\"2866\"},\"y_scale\":{\"id\":\"2870\"}},\"id\":\"2863\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"axis\":{\"id\":\"2321\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2324\",\"type\":\"Grid\"},{\"attributes\":{\"start\":0},\"id\":\"2163\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2864\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1356\",\"type\":\"BasicTicker\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2362\"},{\"id\":\"2363\"},{\"id\":\"2364\"},{\"id\":\"2365\"},{\"id\":\"2366\"},{\"id\":\"2367\"}]},\"id\":\"2369\",\"type\":\"Toolbar\"},{\"attributes\":{\"axis\":{\"id\":\"1355\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1358\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2165\",\"type\":\"LinearScale\"},{\"attributes\":{\"start\":0},\"id\":\"2866\",\"type\":\"DataRange1d\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"2913\"},\"ticker\":{\"id\":\"1360\"}},\"id\":\"1359\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2167\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2868\",\"type\":\"LinearScale\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,7
8,80,82,84,86,88,90,92,94,96,98],\"top\":[688,75,44,22,26,21,15,13,16,7,11,3,6,1,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3177\"},\"selection_policy\":{\"id\":\"3178\"}},\"id\":\"2339\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2325\",\"type\":\"PanTool\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_total\",\"formatter\":{\"id\":\"3135\"},\"ticker\":{\"id\":\"2170\"}},\"id\":\"2169\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2870\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1360\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2326\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"axis\":{\"id\":\"1359\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1362\",\"type\":\"Grid\"},{\"attributes\":{\"axis_label\":\"GPUMemoryUtilization-nodeid:algo-1_total\",\"formatter\":{\"id\":\"3325\"},\"ticker\":{\"id\":\"2873\"}},\"id\":\"2872\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2170\",\"type\":\"BasicTicker\"},{\"attributes\":{\"overlay\":{\"id\":\"2331\"}},\"id\":\"2327\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"axis\":{\"id\":\"2169\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2172\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2873\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2328\",\"type\":\"SaveTool\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3133\"},\"ticker\":{\"id\":\"2174\"}},\"id\":\"2173\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis\":{\"id\":\"2872\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2875\",\"type\":\"Grid\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,
90,92,94,96,98],\"top\":[642,89,67,26,29,31,18,10,8,8,6,4,7,4,3,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"2917\"},\"selection_policy\":{\"id\":\"2918\"}},\"id\":\"1377\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2329\",\"type\":\"ResetTool\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3323\"},\"ticker\":{\"id\":\"2877\"}},\"id\":\"2876\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2174\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1363\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"2330\",\"type\":\"HelpTool\"},{\"attributes\":{\"axis\":{\"id\":\"2173\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2176\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2877\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1364\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"axis\":{\"id\":\"2876\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2879\",\"type\":\"Grid\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2379\",\"type\":\"Quad\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2711\",\"type\":\"Quad\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2325\"},{\"id\":\"2326\"},{\"id\":\"2327\"},{\"id\":\"2328\"},{\"id\":\"2329\"},{\"id\":\"2330\"}]},\"id\":\"2332\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1367\",\"type\":\"ResetTool\"},{\"attributes\":{\"
active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1363\"},{\"id\":\"1364\"},{\"id\":\"1365\"},{\"id\":\"1366\"},{\"id\":\"1367\"},{\"id\":\"1368\"}]},\"id\":\"1370\",\"type\":\"Toolbar\"},{\"attributes\":{\"text\":\"\"},\"id\":\"2911\",\"type\":\"Title\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[125,37,76,116,337,76,26,38,99,15,3,3,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3137\"},\"selection_policy\":{\"id\":\"3138\"}},\"id\":\"2191\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2452\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2177\",\"type\":\"PanTool\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[830,2,0,2,10,104,6,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3327\"},\"selection_policy\":{\"id\":\"3328\"}},\"id\":\"2894\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data_source\":{\"id\":\"1599\"},\"glyph\":{\"id\":\"1601\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1602\"},\"selection_glyp
h\":null,\"view\":{\"id\":\"1604\"}},\"id\":\"1603\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2915\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1437\"},{\"id\":\"1438\"},{\"id\":\"1439\"},{\"id\":\"1440\"},{\"id\":\"1441\"},{\"id\":\"1442\"}]},\"id\":\"1444\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"2178\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"2880\",\"type\":\"PanTool\"},{\"attributes\":{\"data_source\":{\"id\":\"2376\"},\"glyph\":{\"id\":\"2378\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2379\"},\"selection_glyph\":null,\"view\":{\"id\":\"2381\"}},\"id\":\"2380\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2331\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1527\",\"type\":\"Quad\"},{\"attributes\":{\"overlay\":{\"id\":\"2183\"}},\"id\":\"2179\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"2881\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1369\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"data_source\":{\"id\":\"2339\"},\"glyph\":{\"id\":\"2341\"},\"hover_glyph\":null,\"muted
_glyph\":null,\"nonselection_glyph\":{\"id\":\"2342\"},\"selection_glyph\":null,\"view\":{\"id\":\"2344\"}},\"id\":\"2343\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2180\",\"type\":\"SaveTool\"},{\"attributes\":{\"overlay\":{\"id\":\"2886\"}},\"id\":\"2882\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"data_source\":{\"id\":\"1377\"},\"glyph\":{\"id\":\"1379\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1380\"},\"selection_glyph\":null,\"view\":{\"id\":\"1382\"}},\"id\":\"1381\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2342\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2181\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"2883\",\"type\":\"SaveTool\"},{\"attributes\":{\"source\":{\"id\":\"1377\"}},\"id\":\"1382\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2182\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"2884\",\"type\":\"ResetTool\"},{\"attributes\":{\"source\":{\"id\":\"2339\"}},\"id\":\"2344\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2885\",\"type\":\"HelpTool\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[705,69,40,31,18,18,19,13,8,12,7,4,3,2,1,0,2,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"2927\"},\"selection_policy\":{\"id\":\"2928\"}},\"id\":\"1414\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"nav
y\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2231\",\"type\":\"Quad\"},{\"attributes\":{\"below\":[{\"id\":\"2354\"}],\"center\":[{\"id\":\"2357\"},{\"id\":\"2361\"}],\"left\":[{\"id\":\"2358\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2380\"}],\"title\":{\"id\":\"3181\"},\"toolbar\":{\"id\":\"2369\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2346\"},\"x_scale\":{\"id\":\"2350\"},\"y_range\":{\"id\":\"2348\"},\"y_scale\":{\"id\":\"2352\"}},\"id\":\"2345\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1384\",\"type\":\"DataRange1d\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[517,143,57,40,31,29,15,8,11,9,8,10,3,3,7,1,0,5,0,1,0,1,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1,0,1,0,8]},\"selected\":{\"id\":\"3187\"},\"selection_policy\":{\"id\":\"3188\"}},\"id\":\"2376\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2177\"},{\"id\":\"2178\"},{\"id\":\"2179\"},{\"id\":\"2180\"},{\"id\":\"2181\"},{\"id\":\"2182\"}]},\"id\":\"2184\",\"type\":\"Toolbar\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2341\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2346\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5
},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1379\",\"type\":\"Quad\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2880\"},{\"id\":\"2881\"},{\"id\":\"2882\"},{\"id\":\"2883\"},{\"id\":\"2884\"},{\"id\":\"2885\"}]},\"id\":\"2887\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"2699\",\"type\":\"ResetTool\"},{\"attributes\":{\"start\":0},\"id\":\"1386\",\"type\":\"DataRange1d\"},{\"attributes\":{\"start\":0},\"id\":\"2348\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2700\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"2913\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1590\",\"type\":\"HelpTool\"},{\"attributes\":{\"overlay\":{\"id\":\"1702\"}},\"id\":\"1698\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"overlay\":{\"id\":\"1924\"}},\"id\":\"1920\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"data_source\":{\"id\":\"1525\"},\"glyph\":{\"id\":\"1527\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1528\"},\"selection_glyph\":null,\"view\":{\"id\":\"1530\"}},\"id\":\"1529\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1480\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"3193\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3048\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1699\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1921\",\"type\":\"SaveTool\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3023\"},\"ticker\":{\"id\":\"1767
\"}},\"id\":\"1766\",\"type\":\"LinearAxis\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3051\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"3195\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1700\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1922\",\"type\":\"ResetTool\"},{\"attributes\":{\"start\":0},\"id\":\"1608\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"3053\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3197\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1701\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"1923\",\"type\":\"HelpTool\"},{\"attributes\":{\"source\":{\"id\":\"1488\"}},\"id\":\"1493\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3055\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"axis\":{\"id\":\"1762\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1765\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"3198\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3201\",\"type\":\"Title\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[650,87,22,24,26,26,18,16,14,12,11,7,6,7,4,4,1,0,2,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,1,0,0,0,1,1,0]},\"selected\":{\"id\":\"3087\"},\"selection_policy\":{\"id\":\"3088\"}},\"id\":\"2006\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3057\",\"type\":\"Selection\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1696\"},{\"id\":\"1697\"},{\"id\":\"1698\"},{\"id\":\"1699\"},{\"id\":\"1700\"},{\"id\":\"1701\"}]},\"id\":\"1703\",\"type\":\"Toolbar\"},{\"attributes\":{\"active_drag\":\"auto
\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1918\"},{\"id\":\"1919\"},{\"id\":\"1920\"},{\"id\":\"1921\"},{\"id\":\"1922\"},{\"id\":\"1923\"}]},\"id\":\"1925\",\"type\":\"Toolbar\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1491\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"3203\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3058\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"1488\"},\"glyph\":{\"id\":\"1490\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1491\"},\"selection_glyph\":null,\"view\":{\"id\":\"1493\"}},\"id\":\"1492\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data_source\":{\"id\":\"1747\"},\"glyph\":{\"id\":\"1749\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1750\"},\"selection_glyph\":null,\"view\":{\"id\":\"1752\"}},\"id\":\"1751\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis\":{\"id\":\"1988\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1991\",\"type\":\"Grid\"},{\"attributes\":{\"below\":[{\"id\":\"1503\"}],\"center\":[{\"id\":\"1506\"},{\"id\":\"1510\"}],\"left\":[{\"id\":\"1507\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1529\"}],\"title\":{\"id\":\"2951\"},\"toolbar\":{\"id\":\"1518\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1495\"},\"x_scale\":{\"id\":\"1499\"},\"y_range\":{\"id\":\"1497\"},\"y_scale\":{\"id\":\"1501\"}},\"id\":\"1494\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1610\",\"type\":\"LinearScale\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3061\",\"type\":\"Title\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alp
ha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1750\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"1495\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1749\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"1989\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1612\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"3063\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3205\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3207\",\"type\":\"Selection\"},{\"attributes\":{\"start\":0},\"id\":\"1497\",\"type\":\"DataRange1d\"},{\"attributes\":{\"source\":{\"id\":\"1747\"}},\"id\":\"1752\",\"type\":\"CDSView\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu16\",\"formatter\":{\"id\":\"2985\"},\"ticker\":{\"id\":\"1615\"}},\"id\":\"1614\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1702\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1924\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"3065\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1499\",\"type\":\"LinearScale\"},{
\"attributes\":{},\"id\":\"3208\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"1710\"},\"glyph\":{\"id\":\"1712\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1713\"},\"selection_glyph\":null,\"view\":{\"id\":\"1715\"}},\"id\":\"1714\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data_source\":{\"id\":\"1932\"},\"glyph\":{\"id\":\"1934\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1935\"},\"selection_glyph\":null,\"view\":{\"id\":\"1937\"}},\"id\":\"1936\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3211\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1615\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1501\",\"type\":\"LinearScale\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1971\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"3067\",\"type\":\"Selection\"},{\"attributes\":{\"axis\":{\"id\":\"1614\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1617\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"3213\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3068\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu2\",\"formatter\":{\"id\":\"2955\"},\"ticker\":{\"id\":\"1504\"}},\"id\":\"1503\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"2983\"},\"ticker\":{\"id\":\"1619\"}},\"id\":\"1618\",\"type\":\"LinearAxis\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3071\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"3215\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1504\",\"type\":\"BasicTicker\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\
"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1713\",\"type\":\"Quad\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1935\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"1619\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"3073\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3217\",\"type\":\"Selection\"},{\"attributes\":{\"below\":[{\"id\":\"1725\"}],\"center\":[{\"id\":\"1728\"},{\"id\":\"1732\"}],\"left\":[{\"id\":\"1729\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1751\"}],\"title\":{\"id\":\"3011\"},\"toolbar\":{\"id\":\"1740\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1717\"},\"x_scale\":{\"id\":\"1721\"},\"y_range\":{\"id\":\"1719\"},\"y_scale\":{\"id\":\"1723\"}},\"id\":\"1716\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"below\":[{\"id\":\"1947\"}],\"center\":[{\"id\":\"1950\"},{\"id\":\"1954\"}],\"left\":[{\"id\":\"1951\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1973\"}],\"title\":{\"id\":\"3071\"},\"toolbar\":{\"id\":\"1962\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1939\"},\"x_scale\":{\"id\":\"1943\"},\"y_range\":{\"id\":\"1941\"},\"y_scale\":{\"id\":\"1945\"}},\"id\":\"1938\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"axis\":{\"id\":\"1503\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1506\",\"type\":\"Grid\"},{\"attributes\":{\"axis\":{\"id\":\"1618\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1621\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"1710\"}},\"id\":\"1715\",\"type\":\"CDSView\"},{\"attributes\":{\"source\":{\"id\":\"1932\"}},\"id
\":\"1937\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3075\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"2953\"},\"ticker\":{\"id\":\"1508\"}},\"id\":\"1507\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"3218\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1717\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1939\",\"type\":\"DataRange1d\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3221\",\"type\":\"Title\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2897\",\"type\":\"Quad\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1659\"},{\"id\":\"1660\"},{\"id\":\"1661\"},{\"id\":\"1662\"},{\"id\":\"1663\"},{\"id\":\"1664\"}]},\"id\":\"1666\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1508\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"3077\",\"type\":\"Selection\"},{\"attributes\":{\"start\":0},\"id\":\"1719\",\"type\":\"DataRange1d\"},{\"attributes\":{\"start\":0},\"id\":\"1941\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"3223\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3078\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"axis\":{\"id\":\"1507\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1510\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1721\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1943\",\"type\":\"LinearScale\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3081\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"3225\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1622\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1723\",\
"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1945\",\"type\":\"LinearScale\"},{\"attributes\":{\"source\":{\"id\":\"1525\"}},\"id\":\"1530\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3083\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1623\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[532,142,52,44,22,28,27,24,13,11,8,9,10,6,1,4,1,5,0,1,1,1,2,0,0,0,0,0,2,0,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0]},\"selected\":{\"id\":\"2967\"},\"selection_policy\":{\"id\":\"2968\"}},\"id\":\"1562\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3227\",\"type\":\"Selection\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu29\",\"formatter\":{\"id\":\"3015\"},\"ticker\":{\"id\":\"1726\"}},\"id\":\"1725\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu24\",\"formatter\":{\"id\":\"3075\"},\"ticker\":{\"id\":\"1948\"}},\"id\":\"1947\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"3085\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3228\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"overlay\":{\"id\":\"1628\"}},\"id\":\"1624\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1511\",\"type\":\"PanTool\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3231\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1726\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1948\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1625\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"3087\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1512\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"axis\":{\"id\":\"1725\"},\"grid_line_color\":\"w
hite\",\"ticker\":null},\"id\":\"1728\",\"type\":\"Grid\"},{\"attributes\":{\"axis\":{\"id\":\"1947\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1950\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"3233\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1626\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"3088\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"overlay\":{\"id\":\"1517\"}},\"id\":\"1513\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3013\"},\"ticker\":{\"id\":\"1730\"}},\"id\":\"1729\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3073\"},\"ticker\":{\"id\":\"1952\"}},\"id\":\"1951\",\"type\":\"LinearAxis\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3091\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1627\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"3235\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1514\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1730\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1952\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"3093\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1676\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"3237\",\"type\":\"Selection\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1622\"},{\"id\":\"1623\"},{\"id\":\"1624\"},{\"id\":\"1625\"},{\"id\":\"1626\"},{\"id\":\"1627\"}]},\"id\":\"1629\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1515\",\"type\":\"ResetTool\"},{\"attributes\":{\"axis\":{\"id\":\"1729\"},\"dimension\":1,\"grid_li
ne_color\":\"white\",\"ticker\":null},\"id\":\"1732\",\"type\":\"Grid\"},{\"attributes\":{\"axis\":{\"id\":\"1951\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1954\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"3095\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3238\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1516\",\"type\":\"HelpTool\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu1\",\"formatter\":{\"id\":\"3085\"},\"ticker\":{\"id\":\"1985\"}},\"id\":\"1984\",\"type\":\"LinearAxis\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3241\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"3097\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1763\",\"type\":\"BasicTicker\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1823\",\"type\":\"Quad\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1528\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"3243\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3098\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"below\":[{\"id\":\"1540\"}],\"center\":[{\"id\":\"1543\"},{\"id\":\"1547\"}],\"left\":[{\"id\":\"1544\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1566\"}],\"title\":{\"id\":\"2961\"},\"toolbar\":{\"id\":\"1555\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1532\"},\"x_scale\":{\"id\":\"1536\"},\"y_range\":{\"id\":\"1534\"},\"y_scale\":{\"id\":\"1538\"}},\"id\":\"1531\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,2
2,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[638,95,48,26,29,24,16,9,15,11,10,5,7,4,2,2,1,0,3,0,1,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1]},\"selected\":{\"id\":\"3017\"},\"selection_policy\":{\"id\":\"3018\"}},\"id\":\"1747\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[680,85,36,24,24,19,16,20,12,4,9,5,7,5,2,2,1,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3077\"},\"selection_policy\":{\"id\":\"3078\"}},\"id\":\"1969\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1532\",\"type\":\"DataRange1d\"},{\"attributes\":{\"data_source\":{\"id\":\"1673\"},\"glyph\":{\"id\":\"1675\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1676\"},\"selection_glyph\":null,\"view\":{\"id\":\"1678\"}},\"id\":\"1677\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3101\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"3245\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1733\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1955\",\"type\":\"PanTool\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1628\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"start\":0},\"
id\":\"1534\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"3103\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3247\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1734\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1956\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"data_source\":{\"id\":\"1636\"},\"glyph\":{\"id\":\"1638\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1639\"},\"selection_glyph\":null,\"view\":{\"id\":\"1641\"}},\"id\":\"1640\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1536\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"3105\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3248\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"overlay\":{\"id\":\"1739\"}},\"id\":\"1735\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"overlay\":{\"id\":\"1961\"}},\"id\":\"1957\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1639\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"1538\",\"type\":\"LinearScale\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3251\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"3107\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1736\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1958\",\"type\":\"SaveTool\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu8\",\"formatter\":{\"id\":\"2965\"},\"ticker\":{\"id\":\"1541\"}},\"id\":\"1540\",\"type\":\"LinearAxis\"},{\"attributes\":{\"source\":{\"id\":\"1636\"}},\"id\":\"1641\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3253\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3108\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1737\",\"type\":\"ResetTool\"},{\"attributes\":{},
\"id\":\"1959\",\"type\":\"ResetTool\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3111\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1541\",\"type\":\"BasicTicker\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[595,104,58,39,26,22,15,20,20,11,9,7,2,3,1,0,2,2,2,1,0,0,0,0,0,0,1,1,0,1,1,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"2997\"},\"selection_policy\":{\"id\":\"2998\"}},\"id\":\"1673\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3255\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"below\":[{\"id\":\"1651\"}],\"center\":[{\"id\":\"1654\"},{\"id\":\"1658\"}],\"left\":[{\"id\":\"1655\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1677\"}],\"title\":{\"id\":\"2991\"},\"toolbar\":{\"id\":\"1666\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1643\"},\"x_scale\":{\"id\":\"1647\"},\"y_range\":{\"id\":\"1645\"},\"y_scale\":{\"id\":\"1649\"}},\"id\":\"1642\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1738\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"1960\",\"type\":\"HelpTool\"},{\"attributes\":{\"axis\":{\"id\":\"1540\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1543\",\"type\":\"Grid\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1638\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"3113\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3257\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1643\",\"type\":\"DataRange1d\"},{\"attributes\":{\"axis_la
bel\":\"Occurences\",\"formatter\":{\"id\":\"3083\"},\"ticker\":{\"id\":\"1989\"}},\"id\":\"1988\",\"type\":\"LinearAxis\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1733\"},{\"id\":\"1734\"},{\"id\":\"1735\"},{\"id\":\"1736\"},{\"id\":\"1737\"},{\"id\":\"1738\"}]},\"id\":\"1740\",\"type\":\"Toolbar\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"2963\"},\"ticker\":{\"id\":\"1545\"}},\"id\":\"1544\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"3115\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3258\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1955\"},{\"id\":\"1956\"},{\"id\":\"1957\"},{\"id\":\"1958\"},{\"id\":\"1959\"},{\"id\":\"1960\"}]},\"id\":\"1962\",\"type\":\"Toolbar\"},{\"attributes\":{\"start\":0},\"id\":\"1645\",\"type\":\"DataRange1d\"},{\"attributes\":{\"axis\":{\"id\":\"1984\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1987\",\"type\":\"Grid\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3261\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1545\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"3117\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1647\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis\":{\"id\":\"1544\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1547\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1985\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"3263\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3118\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1649\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis\":{\"id\":\"1803\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1806\",\"type\":\"Grid\"},{\"at
tributes\":{\"data_source\":{\"id\":\"1858\"},\"glyph\":{\"id\":\"1860\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1861\"},\"selection_glyph\":null,\"view\":{\"id\":\"1863\"}},\"id\":\"1862\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"overlay\":{\"id\":\"2072\"}},\"id\":\"2068\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"2029\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1832\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1807\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"2030\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1808\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"overlay\":{\"id\":\"2035\"}},\"id\":\"2031\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1739\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"overlay\":{\"id\":\"1813\"}},\"id\":\"1809\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"2032\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1810\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"2033\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1767\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1811\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"2034\",\"type\":\"HelpTool\"},{\"attributes\":{\"axis\":{\"id\":\"1766\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1769\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1812\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"2069\",\"type\":\"SaveTool\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2029\"},{\"id\":\"2030\"},{\"id\":\"2031\"},{\"id\":\"2032\"},{\"id\":\"2033\"},{
\"id\":\"2034\"}]},\"id\":\"2036\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1834\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1906\",\"type\":\"LinearScale\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1807\"},{\"id\":\"1808\"},{\"id\":\"1809\"},{\"id\":\"1810\"},{\"id\":\"1811\"},{\"id\":\"1812\"}]},\"id\":\"1814\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"2067\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"source\":{\"id\":\"1821\"}},\"id\":\"1826\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1770\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"2066\",\"type\":\"PanTool\"},{\"attributes\":{\"start\":0},\"id\":\"1830\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1771\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1828\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2035\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"overlay\":{\"id\":\"1776\"}},\"id\":\"1772\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"data_source\":{\"id\":\"2043\"},\"glyph\":{\"id\":\"2045\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2046\"},\"selection_glyph\":null,\"view\":{\"id\":\"2048\"}},\"id\":\"2047\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1813\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"1773\",\"type\":\"SaveTool\"},{\"attributes\":
{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu11\",\"formatter\":{\"id\":\"3045\"},\"ticker\":{\"id\":\"1837\"}},\"id\":\"1836\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2046\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"1774\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1837\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1775\",\"type\":\"HelpTool\"},{\"attributes\":{\"source\":{\"id\":\"2043\"}},\"id\":\"2048\",\"type\":\"CDSView\"},{\"attributes\":{\"axis\":{\"id\":\"1836\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1839\",\"type\":\"Grid\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1770\"},{\"id\":\"1771\"},{\"id\":\"1772\"},{\"id\":\"1773\"},{\"id\":\"1774\"},{\"id\":\"1775\"}]},\"id\":\"1777\",\"type\":\"Toolbar\"},{\"attributes\":{\"below\":[{\"id\":\"2058\"}],\"center\":[{\"id\":\"2061\"},{\"id\":\"2065\"}],\"left\":[{\"id\":\"2062\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2084\"}],\"title\":{\"id\":\"3101\"},\"toolbar\":{\"id\":\"2073\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2050\"},\"x_scale\":{\"id\":\"2054\"},\"y_range\":{\"id\":\"2052\"},\"y_scale\":{\"id\":\"2056\"}},\"id\":\"2049\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3043\"},\"ticker\":{\"id\":\"1841\"}},\"id\":\"1840\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},
\"id\":\"2045\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2050\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1824\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"1841\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis\":{\"id\":\"1840\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1843\",\"type\":\"Grid\"},{\"attributes\":{\"start\":0},\"id\":\"2052\",\"type\":\"DataRange1d\"},{\"attributes\":{\"below\":[{\"id\":\"1836\"}],\"center\":[{\"id\":\"1839\"},{\"id\":\"1843\"}],\"left\":[{\"id\":\"1840\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1862\"}],\"title\":{\"id\":\"3041\"},\"toolbar\":{\"id\":\"1851\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1828\"},\"x_scale\":{\"id\":\"1832\"},\"y_range\":{\"id\":\"1830\"},\"y_scale\":{\"id\":\"1834\"}},\"id\":\"1827\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"data_source\":{\"id\":\"1821\"},\"glyph\":{\"id\":\"1823\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1824\"},\"selection_glyph\":null,\"view\":{\"id\":\"1826\"}},\"id\":\"1825\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2054\",\"type\":\"LinearScale\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1776\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"source\":{\"id\":\"1895\"}},\"id\":\"1900\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2056\",\"type\":\"LinearScale\"},{\"attributes\":{\"data_source\":{\"id\":\"1784\"},\"glyph\":{\"id\":\"1786\"},\
"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1787\"},\"selection_glyph\":null,\"view\":{\"id\":\"1789\"}},\"id\":\"1788\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[345,57,33,37,22,18,20,12,24,12,9,12,14,8,4,6,5,6,6,4,0,2,1,1,2,3,0,2,2,3,3,2,1,1,3,0,2,3,2,1,0,2,2,4,1,0,5,6,13]},\"selected\":{\"id\":\"3047\"},\"selection_policy\":{\"id\":\"3048\"}},\"id\":\"1858\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu9\",\"formatter\":{\"id\":\"3105\"},\"ticker\":{\"id\":\"2059\"}},\"id\":\"2058\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1844\",\"type\":\"PanTool\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1787\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2059\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1845\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"axis\":{\"id\":\"2058\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2061\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"1784\"}},\"id\":\"1789\",\"type\":\"CDSView\"},{\"attributes\":{\"overlay\":{\"id\":\"1850\"}},\"id\":\"1846\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1326\"},{\"id\":\"1327\"},{\"id\":\"1328\"},{\"id\":\"1329\"},{\"id\":\"1330\"},{\"id\":\"1331\"}]},\"id\":\"1333\",\"type\
":\"Toolbar\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[694,66,36,39,22,18,17,11,9,6,6,12,5,6,3,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3037\"},\"selection_policy\":{\"id\":\"3038\"}},\"id\":\"1821\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"below\":[{\"id\":\"1799\"}],\"center\":[{\"id\":\"1802\"},{\"id\":\"1806\"}],\"left\":[{\"id\":\"1803\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1825\"}],\"title\":{\"id\":\"3031\"},\"toolbar\":{\"id\":\"1814\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1791\"},\"x_scale\":{\"id\":\"1795\"},\"y_range\":{\"id\":\"1793\"},\"y_scale\":{\"id\":\"1797\"}},\"id\":\"1790\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1847\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"2070\",\"type\":\"ResetTool\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1786\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"1848\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"2071\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"1791\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1849\",\"type\":\"HelpTool\"},{\"attributes\":{\"start\":0},\"id\":\"1793\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_
units\":\"screen\"},\"id\":\"2146\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2066\"},{\"id\":\"2067\"},{\"id\":\"2068\"},{\"id\":\"2069\"},{\"id\":\"2070\"},{\"id\":\"2071\"}]},\"id\":\"2073\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1795\",\"type\":\"LinearScale\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1844\"},{\"id\":\"1845\"},{\"id\":\"1846\"},{\"id\":\"1847\"},{\"id\":\"1848\"},{\"id\":\"1849\"}]},\"id\":\"1851\",\"type\":\"Toolbar\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[635,89,45,26,22,19,20,16,13,6,10,10,11,2,1,1,0,0,3,1,0,0,1,0,1,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3147\"},\"selection_policy\":{\"id\":\"3148\"}},\"id\":\"2228\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1797\",\"type\":\"LinearScale\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[575,85,55,41,28,26,25,22,13,12,8,6,6,4,4,4,1,3,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0]},\"selected\":{\"id\":\"3157\"},\"selection_policy\":{\"id\":\"3158\"}},\"id\":\"2265\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1902\",\"type\":\"DataRange1d\"},{
\"attributes\":{\"source\":{\"id\":\"2302\"}},\"id\":\"2307\",\"type\":\"CDSView\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2140\"},{\"id\":\"2141\"},{\"id\":\"2142\"},{\"id\":\"2143\"},{\"id\":\"2144\"},{\"id\":\"2145\"}]},\"id\":\"2147\",\"type\":\"Toolbar\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu22\",\"formatter\":{\"id\":\"3035\"},\"ticker\":{\"id\":\"1800\"}},\"id\":\"1799\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1934\",\"type\":\"Quad\"},{\"attributes\":{\"below\":[{\"id\":\"1910\"}],\"center\":[{\"id\":\"1913\"},{\"id\":\"1917\"}],\"left\":[{\"id\":\"1914\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1936\"}],\"title\":{\"id\":\"3061\"},\"toolbar\":{\"id\":\"1925\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1902\"},\"x_scale\":{\"id\":\"1906\"},\"y_range\":{\"id\":\"1904\"},\"y_scale\":{\"id\":\"1908\"}},\"id\":\"1901\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2072\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"1800\",\"type\":\"BasicTicker\"},{\"attributes\":{\"start\":0},\"id\":\"1904\",\"type\":\"DataRange1d\"},{\"attributes\":{\"axis\":{\"id\":\"1799\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1802\",\"type\":\"Grid\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":
\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1850\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2082\",\"type\":\"Quad\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3033\"},\"ticker\":{\"id\":\"1804\"}},\"id\":\"1803\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1860\",\"type\":\"Quad\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2083\",\"type\":\"Quad\"},{\"attributes\":{\"data_source\":{\"id\":\"2080\"},\"glyph\":{\"id\":\"2082\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2083\"},\"selection_glyph\":null,\"view\":{\"id\":\"2085\"}},\"id\":\"2084\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1804\",\"type\":\"BasicTicker\"},{\"attributes\":{\"source\":{\"id\":\"2080\"}},\"id\":\"2085\",\"type\":\"CDSView\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1861\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"1388\",\"type\":\"LinearScale\"},{\"attributes\":{\"bottom\":{\"value\":0},\"
fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2268\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"3265\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1961\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2183\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2489\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"1390\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"3267\",\"type\":\"Selection\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3273\"},\"ticker\":{\"id\":\"2692\"}},\"id\":\"2691\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu19\",\"formatter\":{\"id\":\"2925\"},\"ticker\":{\"id\":\"1393\"}},\"id\":\"1392\",\"type\":\"LinearAxis\"},{\"attributes\":{\"source\":{\"id\":\"2228\"}},\"id\":\"2233\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3268\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3271\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1393\",\"type\":\"BasicTicker\"},{\"attributes\":{\"source\":{\"id\":\"196
9\"}},\"id\":\"1974\",\"type\":\"CDSView\"},{\"attributes\":{\"source\":{\"id\":\"2191\"}},\"id\":\"2196\",\"type\":\"CDSView\"},{\"attributes\":{\"axis\":{\"id\":\"1392\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1395\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"3273\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"2923\"},\"ticker\":{\"id\":\"1397\"}},\"id\":\"1396\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1972\",\"type\":\"Quad\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2194\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"3275\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"data_source\":{\"id\":\"1969\"},\"glyph\":{\"id\":\"1971\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1972\"},\"selection_glyph\":null,\"view\":{\"id\":\"1974\"}},\"id\":\"1973\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data_source\":{\"id\":\"2191\"},\"glyph\":{\"id\":\"2193\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2194\"},\"selection_glyph\":null,\"view\":{\"id\":\"2196\"}},\"id\":\"2195\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"below\":[{\"id\":\"1984\"}],\"center\":[{\"id\":\"1987\"},{\"id\":\"1991\"}],\"left\":[{\"id\":\"1988\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2010\"}],\"title\":{\"id\":\"3081\"},\"toolbar\":{\"id\":\"1999\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1976\"},\"x_scale\":{\"id\":\"1
980\"},\"y_range\":{\"id\":\"1978\"},\"y_scale\":{\"id\":\"1982\"}},\"id\":\"1975\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"below\":[{\"id\":\"2206\"}],\"center\":[{\"id\":\"2209\"},{\"id\":\"2213\"}],\"left\":[{\"id\":\"2210\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2232\"}],\"title\":{\"id\":\"3141\"},\"toolbar\":{\"id\":\"2221\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2198\"},\"x_scale\":{\"id\":\"2202\"},\"y_range\":{\"id\":\"2200\"},\"y_scale\":{\"id\":\"2204\"}},\"id\":\"2197\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1397\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"3277\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1976\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2198\",\"type\":\"DataRange1d\"},{\"attributes\":{\"axis\":{\"id\":\"1396\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1399\",\"type\":\"Grid\"},{\"attributes\":{\"axis\":{\"id\":\"2687\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2690\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"3278\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"start\":0},\"id\":\"1978\",\"type\":\"DataRange1d\"},{\"attributes\":{\"start\":0},\"id\":\"2200\",\"type\":\"DataRange1d\"},{\"attributes\":{\"data_source\":{\"id\":\"1414\"},\"glyph\":{\"id\":\"1416\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1417\"},\"selection_glyph\":null,\"view\":{\"id\":\"1419\"}},\"id\":\"1418\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3281\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"2692\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2202\",\"type\":\"LinearScale\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen
\",\"top_units\":\"screen\"},\"id\":\"2442\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3103\"},\"ticker\":{\"id\":\"2063\"}},\"id\":\"2062\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"3283\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"overlay\":{\"id\":\"1443\"}},\"id\":\"1439\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"2204\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2457\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1992\",\"type\":\"PanTool\"},{\"attributes\":{\"start\":0},\"id\":\"2459\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"3285\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1400\",\"type\":\"PanTool\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu5\",\"formatter\":{\"id\":\"3145\"},\"ticker\":{\"id\":\"2207\"}},\"id\":\"2206\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1993\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"2461\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"3287\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1401\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"2207\",\"type\":\"BasicTicker\"},{\"attributes\":{\"overlay\":{\"id\":\"1998\"}},\"id\":\"1994\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"2463\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"3288\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"overlay\":{\"id\":\"1406\"}},\"id\":\"1402\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"axis\":{\"id\":\"2206\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2209\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1995\",\"type\":\"SaveTool\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu6\",\"formatter\":{\"id\":\"3215\"},\"ticker\":{\"id\":\"2466\"}},\"id\":\"2465\",\"type\":\"LinearAxis\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3291\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1403\
",\"type\":\"SaveTool\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3143\"},\"ticker\":{\"id\":\"2211\"}},\"id\":\"2210\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1996\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"2466\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"3293\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1404\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"2211\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1997\",\"type\":\"HelpTool\"},{\"attributes\":{\"axis\":{\"id\":\"2465\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2468\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"3295\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1405\",\"type\":\"HelpTool\"},{\"attributes\":{\"axis\":{\"id\":\"2210\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2213\",\"type\":\"Grid\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3213\"},\"ticker\":{\"id\":\"2470\"}},\"id\":\"2469\",\"type\":\"LinearAxis\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[643,81,44,34,24,13,28,14,10,9,9,9,9,3,5,2,0,2,1,1,1,1,0,0,0,2,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0]},\"selected\":{\"id\":\"3107\"},\"selection_policy\":{\"id\":\"3108\"}},\"id\":\"2080\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3297\",\"type\":\"Selection\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1992\"},{\"id\":\"1993\"},{\"id\":\"1994\"},{\"id\":\"1995\"},{\"id\":\"1996\"},{\"id\":\"1997\"}]},\"id\":\"1999\",\"ty
pe\":\"Toolbar\"},{\"attributes\":{\"data_source\":{\"id\":\"2228\"},\"glyph\":{\"id\":\"2230\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2231\"},\"selection_glyph\":null,\"view\":{\"id\":\"2233\"}},\"id\":\"2232\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2470\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"3298\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1406\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"below\":[{\"id\":\"2243\"}],\"center\":[{\"id\":\"2246\"},{\"id\":\"2250\"}],\"left\":[{\"id\":\"2247\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2269\"}],\"title\":{\"id\":\"3151\"},\"toolbar\":{\"id\":\"2258\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2235\"},\"x_scale\":{\"id\":\"2239\"},\"y_range\":{\"id\":\"2237\"},\"y_scale\":{\"id\":\"2241\"}},\"id\":\"2234\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"axis\":{\"id\":\"2062\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2065\",\"type\":\"Grid\"},{\"attributes\":{\"axis\":{\"id\":\"2469\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2472\",\"type\":\"Grid\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3301\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"2235\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1441\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"2214\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"2063\",\"type\":\"BasicTicker\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28
,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[649,100,35,33,22,20,22,14,10,11,5,10,6,3,3,0,1,1,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0]},\"selected\":{\"id\":\"3227\"},\"selection_policy\":{\"id\":\"3228\"}},\"id\":\"2524\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3303\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2215\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1998\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"3305\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[649,85,45,31,16,21,27,14,9,11,10,9,7,2,1,2,0,2,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3217\"},\"selection_policy\":{\"id\":\"3218\"}},\"id\":\"2487\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"overlay\":{\"id\":\"2220\"}},\"id\":\"2216\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"data_source\":{\"id\":\"2006\"},\"glyph\":{\"id\":\"2008\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2009\"},\"selection_glyph\":null,\"view\":{\"id\":\"2011\"}},\"id\":\"2010\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"source\":{\"id\":\"2524\"}},\"id\":\"2529\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3307\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"2473\",\"type\":\"PanTool\"},{\"attribute
s\":{},\"id\":\"2217\",\"type\":\"SaveTool\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2009\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"3308\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2474\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1442\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"2218\",\"type\":\"ResetTool\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3311\",\"type\":\"Title\"},{\"attributes\":{\"source\":{\"id\":\"2709\"}},\"id\":\"2714\",\"type\":\"CDSView\"},{\"attributes\":{\"overlay\":{\"id\":\"2479\"}},\"id\":\"2475\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"source\":{\"id\":\"2006\"}},\"id\":\"2011\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1440\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"2219\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"3313\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2476\",\"type\":\"SaveTool\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[281,61,30,25,17,6,3,1,7,4,7,2,1,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1]},\"selected\":{\"id\":\"3097\"},\"selection_policy\":{\"id\":\"3098\"}},\"id\":\"2043\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"below\":[{\"id\":\"2021\"}],\"center\":[{\"id\":\"2024\"},{\"id\":\"2028\"}],\"left\":[{\"id\":\"2025\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2047\"}],\"title\":{\"id\":\"3091\"},\"toolbar\":{\"id\":\"2036\"},\"toolbar_loc
ation\":null,\"x_range\":{\"id\":\"2013\"},\"x_scale\":{\"id\":\"2017\"},\"y_range\":{\"id\":\"2015\"},\"y_scale\":{\"id\":\"2019\"}},\"id\":\"2012\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"3315\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1400\"},{\"id\":\"1401\"},{\"id\":\"1402\"},{\"id\":\"1403\"},{\"id\":\"1404\"},{\"id\":\"1405\"}]},\"id\":\"1407\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"2477\",\"type\":\"ResetTool\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2008\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2013\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"3317\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"2478\",\"type\":\"HelpTool\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1417\",\"type\":\"Quad\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2230\",\"type\":\"Quad\"},{\"attributes\":{\"start\":0},\"id\":\"2015\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"3318\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"start\":0},\"id\":\"2237\",\"type\":\"DataRange1d\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"t
ools\":[{\"id\":\"2473\"},{\"id\":\"2474\"},{\"id\":\"2475\"},{\"id\":\"2476\"},{\"id\":\"2477\"},{\"id\":\"2478\"}]},\"id\":\"2480\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"2017\",\"type\":\"LinearScale\"},{\"attributes\":{\"text\":\"\"},\"id\":\"3321\",\"type\":\"Title\"},{\"attributes\":{\"source\":{\"id\":\"1414\"}},\"id\":\"1419\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2239\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2019\",\"type\":\"LinearScale\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[614,100,55,34,36,14,15,12,10,7,4,13,4,4,9,3,0,2,1,0,0,0,0,0,1,0,1,0,0,2,0,1,0,0,0,0,0,1,0,2,0,0,0,0,2,2,0,0,0]},\"selected\":{\"id\":\"2937\"},\"selection_policy\":{\"id\":\"2938\"}},\"id\":\"1451\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3323\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"below\":[{\"id\":\"1429\"}],\"center\":[{\"id\":\"1432\"},{\"id\":\"1436\"}],\"left\":[{\"id\":\"1433\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1455\"}],\"title\":{\"id\":\"2931\"},\"toolbar\":{\"id\":\"1444\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1421\"},\"x_scale\":{\"id\":\"1425\"},\"y_range\":{\"id\":\"1423\"},\"y_scale\":{\"id\":\"1427\"}},\"id\":\"1420\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"2241\",\"type\":\"LinearScale\"},{\"attributes\":{\"data_source\":{\"id\":\"2524\"},\"glyph\":{\"id\":\"2526\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2527\"},\"selection_glyph\":null,\"view\":{\"id\":\"2529\"}},\"id\":\"2528\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu28\",\"formatter\":
{\"id\":\"3095\"},\"ticker\":{\"id\":\"2022\"}},\"id\":\"2021\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2611\",\"type\":\"LinearScale\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1416\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"3325\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1421\",\"type\":\"DataRange1d\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu15\",\"formatter\":{\"id\":\"3155\"},\"ticker\":{\"id\":\"2244\"}},\"id\":\"2243\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2022\",\"type\":\"BasicTicker\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2527\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"3327\",\"type\":\"Selection\"},{\"attributes\":{\"start\":0},\"id\":\"1423\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2479\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"2244\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis\":{\"id\":\"2021\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2024\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"3328\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1425\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis\":{\"id\":\"2243\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2246\",\"type\":\"Grid\"},{\"attributes\":{\"tool
bars\":[{\"id\":\"1333\"},{\"id\":\"1370\"},{\"id\":\"1407\"},{\"id\":\"1444\"},{\"id\":\"1481\"},{\"id\":\"1518\"},{\"id\":\"1555\"},{\"id\":\"1592\"},{\"id\":\"1629\"},{\"id\":\"1666\"},{\"id\":\"1703\"},{\"id\":\"1740\"},{\"id\":\"1777\"},{\"id\":\"1814\"},{\"id\":\"1851\"},{\"id\":\"1888\"},{\"id\":\"1925\"},{\"id\":\"1962\"},{\"id\":\"1999\"},{\"id\":\"2036\"},{\"id\":\"2073\"},{\"id\":\"2110\"},{\"id\":\"2147\"},{\"id\":\"2184\"},{\"id\":\"2221\"},{\"id\":\"2258\"},{\"id\":\"2295\"},{\"id\":\"2332\"},{\"id\":\"2369\"},{\"id\":\"2406\"},{\"id\":\"2443\"},{\"id\":\"2480\"},{\"id\":\"2517\"},{\"id\":\"2554\"},{\"id\":\"2591\"},{\"id\":\"2628\"},{\"id\":\"2665\"},{\"id\":\"2702\"},{\"id\":\"2739\"},{\"id\":\"2776\"},{\"id\":\"2813\"},{\"id\":\"2850\"},{\"id\":\"2887\"}],\"tools\":[{\"id\":\"1326\"},{\"id\":\"1327\"},{\"id\":\"1328\"},{\"id\":\"1329\"},{\"id\":\"1330\"},{\"id\":\"1331\"},{\"id\":\"1363\"},{\"id\":\"1364\"},{\"id\":\"1365\"},{\"id\":\"1366\"},{\"id\":\"1367\"},{\"id\":\"1368\"},{\"id\":\"1400\"},{\"id\":\"1401\"},{\"id\":\"1402\"},{\"id\":\"1403\"},{\"id\":\"1404\"},{\"id\":\"1405\"},{\"id\":\"1437\"},{\"id\":\"1438\"},{\"id\":\"1439\"},{\"id\":\"1440\"},{\"id\":\"1441\"},{\"id\":\"1442\"},{\"id\":\"1474\"},{\"id\":\"1475\"},{\"id\":\"1476\"},{\"id\":\"1477\"},{\"id\":\"1478\"},{\"id\":\"1479\"},{\"id\":\"1511\"},{\"id\":\"1512\"},{\"id\":\"1513\"},{\"id\":\"1514\"},{\"id\":\"1515\"},{\"id\":\"1516\"},{\"id\":\"1548\"},{\"id\":\"1549\"},{\"id\":\"1550\"},{\"id\":\"1551\"},{\"id\":\"1552\"},{\"id\":\"1553\"},{\"id\":\"1585\"},{\"id\":\"1586\"},{\"id\":\"1587\"},{\"id\":\"1588\"},{\"id\":\"1589\"},{\"id\":\"1590\"},{\"id\":\"1622\"},{\"id\":\"1623\"},{\"id\":\"1624\"},{\"id\":\"1625\"},{\"id\":\"1626\"},{\"id\":\"1627\"},{\"id\":\"1659\"},{\"id\":\"1660\"},{\"id\":\"1661\"},{\"id\":\"1662\"},{\"id\":\"1663\"},{\"id\":\"1664\"},{\"id\":\"1696\"},{\"id\":\"1697\"},{\"id\":\"1698\"},{\"id\":\"1699\"},{\"id\":\"1700\"},{\"id\":\"1701\"},{\"id\":\"1733\"},
{\"id\":\"1734\"},{\"id\":\"1735\"},{\"id\":\"1736\"},{\"id\":\"1737\"},{\"id\":\"1738\"},{\"id\":\"1770\"},{\"id\":\"1771\"},{\"id\":\"1772\"},{\"id\":\"1773\"},{\"id\":\"1774\"},{\"id\":\"1775\"},{\"id\":\"1807\"},{\"id\":\"1808\"},{\"id\":\"1809\"},{\"id\":\"1810\"},{\"id\":\"1811\"},{\"id\":\"1812\"},{\"id\":\"1844\"},{\"id\":\"1845\"},{\"id\":\"1846\"},{\"id\":\"1847\"},{\"id\":\"1848\"},{\"id\":\"1849\"},{\"id\":\"1881\"},{\"id\":\"1882\"},{\"id\":\"1883\"},{\"id\":\"1884\"},{\"id\":\"1885\"},{\"id\":\"1886\"},{\"id\":\"1918\"},{\"id\":\"1919\"},{\"id\":\"1920\"},{\"id\":\"1921\"},{\"id\":\"1922\"},{\"id\":\"1923\"},{\"id\":\"1955\"},{\"id\":\"1956\"},{\"id\":\"1957\"},{\"id\":\"1958\"},{\"id\":\"1959\"},{\"id\":\"1960\"},{\"id\":\"1992\"},{\"id\":\"1993\"},{\"id\":\"1994\"},{\"id\":\"1995\"},{\"id\":\"1996\"},{\"id\":\"1997\"},{\"id\":\"2029\"},{\"id\":\"2030\"},{\"id\":\"2031\"},{\"id\":\"2032\"},{\"id\":\"2033\"},{\"id\":\"2034\"},{\"id\":\"2066\"},{\"id\":\"2067\"},{\"id\":\"2068\"},{\"id\":\"2069\"},{\"id\":\"2070\"},{\"id\":\"2071\"},{\"id\":\"2103\"},{\"id\":\"2104\"},{\"id\":\"2105\"},{\"id\":\"2106\"},{\"id\":\"2107\"},{\"id\":\"2108\"},{\"id\":\"2140\"},{\"id\":\"2141\"},{\"id\":\"2142\"},{\"id\":\"2143\"},{\"id\":\"2144\"},{\"id\":\"2145\"},{\"id\":\"2177\"},{\"id\":\"2178\"},{\"id\":\"2179\"},{\"id\":\"2180\"},{\"id\":\"2181\"},{\"id\":\"2182\"},{\"id\":\"2214\"},{\"id\":\"2215\"},{\"id\":\"2216\"},{\"id\":\"2217\"},{\"id\":\"2218\"},{\"id\":\"2219\"},{\"id\":\"2251\"},{\"id\":\"2252\"},{\"id\":\"2253\"},{\"id\":\"2254\"},{\"id\":\"2255\"},{\"id\":\"2256\"},{\"id\":\"2288\"},{\"id\":\"2289\"},{\"id\":\"2290\"},{\"id\":\"2291\"},{\"id\":\"2292\"},{\"id\":\"2293\"},{\"id\":\"2325\"},{\"id\":\"2326\"},{\"id\":\"2327\"},{\"id\":\"2328\"},{\"id\":\"2329\"},{\"id\":\"2330\"},{\"id\":\"2362\"},{\"id\":\"2363\"},{\"id\":\"2364\"},{\"id\":\"2365\"},{\"id\":\"2366\"},{\"id\":\"2367\"},{\"id\":\"2399\"},{\"id\":\"2400\"},{\"id\":\"2401\"},{\"id\":\"2402\"},{\
"id\":\"2403\"},{\"id\":\"2404\"},{\"id\":\"2436\"},{\"id\":\"2437\"},{\"id\":\"2438\"},{\"id\":\"2439\"},{\"id\":\"2440\"},{\"id\":\"2441\"},{\"id\":\"2473\"},{\"id\":\"2474\"},{\"id\":\"2475\"},{\"id\":\"2476\"},{\"id\":\"2477\"},{\"id\":\"2478\"},{\"id\":\"2510\"},{\"id\":\"2511\"},{\"id\":\"2512\"},{\"id\":\"2513\"},{\"id\":\"2514\"},{\"id\":\"2515\"},{\"id\":\"2547\"},{\"id\":\"2548\"},{\"id\":\"2549\"},{\"id\":\"2550\"},{\"id\":\"2551\"},{\"id\":\"2552\"},{\"id\":\"2584\"},{\"id\":\"2585\"},{\"id\":\"2586\"},{\"id\":\"2587\"},{\"id\":\"2588\"},{\"id\":\"2589\"},{\"id\":\"2621\"},{\"id\":\"2622\"},{\"id\":\"2623\"},{\"id\":\"2624\"},{\"id\":\"2625\"},{\"id\":\"2626\"},{\"id\":\"2658\"},{\"id\":\"2659\"},{\"id\":\"2660\"},{\"id\":\"2661\"},{\"id\":\"2662\"},{\"id\":\"2663\"},{\"id\":\"2695\"},{\"id\":\"2696\"},{\"id\":\"2697\"},{\"id\":\"2698\"},{\"id\":\"2699\"},{\"id\":\"2700\"},{\"id\":\"2732\"},{\"id\":\"2733\"},{\"id\":\"2734\"},{\"id\":\"2735\"},{\"id\":\"2736\"},{\"id\":\"2737\"},{\"id\":\"2769\"},{\"id\":\"2770\"},{\"id\":\"2771\"},{\"id\":\"2772\"},{\"id\":\"2773\"},{\"id\":\"2774\"},{\"id\":\"2806\"},{\"id\":\"2807\"},{\"id\":\"2808\"},{\"id\":\"2809\"},{\"id\":\"2810\"},{\"id\":\"2811\"},{\"id\":\"2843\"},{\"id\":\"2844\"},{\"id\":\"2845\"},{\"id\":\"2846\"},{\"id\":\"2847\"},{\"id\":\"2848\"},{\"id\":\"2880\"},{\"id\":\"2881\"},{\"id\":\"2882\"},{\"id\":\"2883\"},{\"id\":\"2884\"},{\"id\":\"2885\"}]},\"id\":\"3331\",\"type\":\"ProxyToolbar\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3093\"},\"ticker\":{\"id\":\"2026\"}},\"id\":\"2025\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1427\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3153\"},\"ticker\":{\"id\":\"2248\"}},\"id\":\"2247\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\
"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2490\",\"type\":\"Quad\"},{\"attributes\":{\"toolbar\":{\"id\":\"3331\"},\"toolbar_location\":\"above\"},\"id\":\"3332\",\"type\":\"ToolbarBox\"},{\"attributes\":{\"data_source\":{\"id\":\"2487\"},\"glyph\":{\"id\":\"2489\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2490\"},\"selection_glyph\":null,\"view\":{\"id\":\"2492\"}},\"id\":\"2491\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2026\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu26\",\"formatter\":{\"id\":\"2935\"},\"ticker\":{\"id\":\"1430\"}},\"id\":\"1429\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2248\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis\":{\"id\":\"2025\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2028\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"2487\"}},\"id\":\"2492\",\"type\":\"CDSView\"},{\"attributes\":{\"axis\":{\"id\":\"2247\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2250\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1430\",\"type\":\"BasicTicker\"},{\"attributes\":{\"below\":[{\"id\":\"2613\"}],\"center\":[{\"id\":\"2616\"},{\"id\":\"2620\"}],\"left\":[{\"id\":\"2617\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2639\"}],\"title\":{\"id\":\"3251\"},\"toolbar\":{\"id\":\"2628\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2605\"},\"x_scale\":{\"id\":\"2609\"},\"y_range\":{\"id\":\"2607\"},\"y_scale\":{\"id\":\"2611\"}},\"id\":\"2604\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2584\"},{\"id\":\"2585\"},{\"id\":\"2586\"},{\"id\":\"2587\"},{\"id\":\"2588\"},{\"id\":\"2589\"}]},\"id\":\"2591\",\"type\
":\"Toolbar\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu3\",\"formatter\":{\"id\":\"3025\"},\"ticker\":{\"id\":\"1763\"}},\"id\":\"1762\",\"type\":\"LinearAxis\"},{\"attributes\":{\"start\":0},\"id\":\"2607\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2547\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1548\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"2614\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2548\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1549\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"axis_label\":\"GPUUtilization-nodeid:algo-1_gpu2\",\"formatter\":{\"id\":\"3255\"},\"ticker\":{\"id\":\"2614\"}},\"id\":\"2613\",\"type\":\"LinearAxis\"},{\"attributes\":{\"overlay\":{\"id\":\"2553\"}},\"id\":\"2549\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"axis\":{\"id\":\"2613\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2616\",\"type\":\"Grid\"},{\"attributes\":{\"overlay\":{\"id\":\"1554\"}},\"id\":\"1550\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2590\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"2550\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"2957\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1551\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"2551\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1552\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"2618\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2552\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"1553\",\"type\":\"HelpTool\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3253\"},\"ticker\":{\"id\":\"2618\"}},\"id\":\"2617\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"
id\":\"2609\",\"type\":\"LinearScale\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2601\",\"type\":\"Quad\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2547\"},{\"id\":\"2548\"},{\"id\":\"2549\"},{\"id\":\"2550\"},{\"id\":\"2551\"},{\"id\":\"2552\"}]},\"id\":\"2554\",\"type\":\"Toolbar\"},{\"attributes\":{\"axis\":{\"id\":\"2617\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2620\",\"type\":\"Grid\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1602\",\"type\":\"Quad\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1548\"},{\"id\":\"1549\"},{\"id\":\"1550\"},{\"id\":\"1551\"},{\"id\":\"1552\"},{\"id\":\"1553\"}]},\"id\":\"1555\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"2605\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1591\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28
,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[824,3,1,1,1,1,0,0,0,0,1,0,1,1,0,4,3,14,26,47,25,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3257\"},\"selection_policy\":{\"id\":\"3258\"}},\"id\":\"2635\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1760\",\"type\":\"LinearScale\"},{\"attributes\":{\"start\":0},\"id\":\"2681\",\"type\":\"DataRange1d\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1585\"},{\"id\":\"1586\"},{\"id\":\"1587\"},{\"id\":\"1588\"},{\"id\":\"1589\"},{\"id\":\"1590\"}]},\"id\":\"1592\",\"type\":\"Toolbar\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2553\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"axis_label\":\"GPUUtilization-nodeid:algo-1_total\",\"formatter\":{\"id\":\"3275\"},\"ticker\":{\"id\":\"2688\"}},\"id\":\"2687\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1554\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"2621\",\"type\":\"PanTool\"},{\"attributes\":{\"data_source\":{\"id\":\"2561\"},\"glyph\":{\"id\":\"2563\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2564\"},\"selection_glyph\":null,\"view\":{\"id\":\"2566\"}},\"id\":\"2565\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data_source\":{\"id\":\"1562\"},\"glyph\":{\"id\":\"1564\"},\"hover_glyph\":nu
ll,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1565\"},\"selection_glyph\":null,\"view\":{\"id\":\"1567\"}},\"id\":\"1566\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2622\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2564\",\"type\":\"Quad\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1565\",\"type\":\"Quad\"},{\"attributes\":{\"overlay\":{\"id\":\"2627\"}},\"id\":\"2623\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"source\":{\"id\":\"2561\"}},\"id\":\"2566\",\"type\":\"CDSView\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1453\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2624\",\"type\":\"SaveTool\"},{\"attributes\":{\"source\":{\"id\":\"1562\"}},\"id\":\"1567\",\"type\":\"CDSView\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2600\",\"type\":\"Quad\"},{\"attributes\":{\"below\":[{\"id\":\"2576\"}],\"center\":[{\"id\":\"2579\"},{\"id\":\"2583\"}],\"left\":[{\"id\":\"2580\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2602\"}],\"title\":{\"id\":\"3241\"},\"toolbar\":{\"id\":\"2591\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"256
8\"},\"x_scale\":{\"id\":\"2572\"},\"y_range\":{\"id\":\"2570\"},\"y_scale\":{\"id\":\"2574\"}},\"id\":\"2567\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"2625\",\"type\":\"ResetTool\"},{\"attributes\":{\"below\":[{\"id\":\"1577\"}],\"center\":[{\"id\":\"1580\"},{\"id\":\"1584\"}],\"left\":[{\"id\":\"1581\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1603\"}],\"title\":{\"id\":\"2971\"},\"toolbar\":{\"id\":\"1592\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1569\"},\"x_scale\":{\"id\":\"1573\"},\"y_range\":{\"id\":\"1571\"},\"y_scale\":{\"id\":\"1575\"}},\"id\":\"1568\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2563\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2626\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"2568\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1564\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"1569\",\"type\":\"DataRange1d\"},{\"attributes\":{\"start\":0},\"id\":\"2570\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2688\",\"type\":\"BasicTicker\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2621\"},{\"id\":\"2622\"},{\"id\":\"2623\"},{\"id\":\"2624\"},{\"id\":\"2625\"},{\"id\":\"2626\"}]},\"id\":\"2628\",\"type\":\"Toolbar\"},{\"attributes\":{\"start\":0},\"id\":\"1571\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2572\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1573\"
,\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2574\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2685\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1575\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis_label\":\"GPUUtilization-nodeid:algo-1_gpu1\",\"formatter\":{\"id\":\"3245\"},\"ticker\":{\"id\":\"2577\"}},\"id\":\"2576\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2683\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu10\",\"formatter\":{\"id\":\"2975\"},\"ticker\":{\"id\":\"1578\"}},\"id\":\"1577\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2627\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"2577\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1578\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis\":{\"id\":\"2576\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2579\",\"type\":\"Grid\"},{\"attributes\":{\"data_source\":{\"id\":\"2635\"},\"glyph\":{\"id\":\"2637\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2638\"},\"selection_glyph\":null,\"view\":{\"id\":\"2640\"}},\"id\":\"2639\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis\":{\"id\":\"1577\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1580\",\"type\":\"Grid\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3243\"},\"ticker\":{\"id\":\"2581\"}},\"id\":\"2580\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"263
8\",\"type\":\"Quad\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"2973\"},\"ticker\":{\"id\":\"1582\"}},\"id\":\"1581\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2581\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1582\",\"type\":\"BasicTicker\"},{\"attributes\":{\"source\":{\"id\":\"2635\"}},\"id\":\"2640\",\"type\":\"CDSView\"},{\"attributes\":{\"axis\":{\"id\":\"2580\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2583\",\"type\":\"Grid\"},{\"attributes\":{\"axis\":{\"id\":\"1581\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1584\",\"type\":\"Grid\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[820,9,0,1,0,0,1,1,0,0,0,1,0,3,0,3,5,18,19,54,16,2,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3267\"},\"selection_policy\":{\"id\":\"3268\"}},\"id\":\"2672\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"below\":[{\"id\":\"2650\"}],\"center\":[{\"id\":\"2653\"},{\"id\":\"2657\"}],\"left\":[{\"id\":\"2654\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2676\"}],\"title\":{\"id\":\"3261\"},\"toolbar\":{\"id\":\"2665\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2642\"},\"x_scale\":{\"id\":\"2646\"},\"y_range\":{\"id\":\"2644\"},\"y_scale\":{\"id\":\"2648\"}},\"id\":\"2641\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"data_source\":{\"id\":\"2598\"},\"glyph\":{\"id\":\"2600\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2601\"},\"selection_glyph\":null,\"view\":{\"id\":\"2603\"}},\"id\":\"2602\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill
_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1601\",\"type\":\"Quad\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2637\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2642\",\"type\":\"DataRange1d\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1675\",\"type\":\"Quad\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[824,3,3,0,0,1,0,0,0,1,1,0,1,0,0,4,5,13,27,57,14,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3247\"},\"selection_policy\":{\"id\":\"3248\"}},\"id\":\"2598\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"start\":0},\"id\":\"2644\",\"type\":\"DataRange1d\"},{\"attributes\":{\"source\":{\"id\":\"2598\"}},\"id\":\"2603\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2584\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1585\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"2441\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"2646\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2585\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1586\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"2648\",\"type\":\"LinearScale\"},{\"attributes\":{\"overlay
\":{\"id\":\"2590\"}},\"id\":\"2586\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"overlay\":{\"id\":\"1591\"}},\"id\":\"1587\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"axis_label\":\"GPUUtilization-nodeid:algo-1_gpu0\",\"formatter\":{\"id\":\"3265\"},\"ticker\":{\"id\":\"2651\"}},\"id\":\"2650\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2587\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1588\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"2651\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2588\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1589\",\"type\":\"ResetTool\"},{\"attributes\":{\"axis\":{\"id\":\"2650\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2653\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2589\",\"type\":\"HelpTool\"},{\"attributes\":{\"below\":[{\"id\":\"2095\"}],\"center\":[{\"id\":\"2098\"},{\"id\":\"2102\"}],\"left\":[{\"id\":\"2099\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2121\"}],\"title\":{\"id\":\"3111\"},\"toolbar\":{\"id\":\"2110\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2087\"},\"x_scale\":{\"id\":\"2091\"},\"y_range\":{\"id\":\"2089\"},\"y_scale\":{\"id\":\"2093\"}},\"id\":\"2086\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2436\"},{\"id\":\"2437\"},{\"id\":\"2438\"},{\"id\":\"2439\"},{\"id\":\"2440\"},{\"id\":\"2441\"}]},\"id\":\"2443\",\"type\":\"Toolbar\"},{\"attributes\":{\"source\":{\"id\":\"1858\"}},\"id\":\"1863\",\"type\":\"CDSView\"},{\"attributes\":{\"below\":[{\"id\":\"1873\"}],\"center\":[{\"id\":\"1876\"},{\"id\":\"1880\"}],\"left\":[{\"id\":\"1877\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"1899\"}],\"title\":{\"id\":\"3051\"},\"toolbar\":{\"id\":\"1888\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1865\"},\"x_scale\":{\"id\":\"1869\"},\"y_
range\":{\"id\":\"1867\"},\"y_scale\":{\"id\":\"1871\"}},\"id\":\"1864\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"2399\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"2087\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2400\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1915\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1865\",\"type\":\"DataRange1d\"},{\"attributes\":{\"start\":0},\"id\":\"2089\",\"type\":\"DataRange1d\"},{\"attributes\":{\"overlay\":{\"id\":\"2405\"}},\"id\":\"2401\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"start\":0},\"id\":\"1867\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2091\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2402\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1869\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2093\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2403\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1871\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu21\",\"formatter\":{\"id\":\"3115\"},\"ticker\":{\"id\":\"2096\"}},\"id\":\"2095\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2404\",\"type\":\"HelpTool\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu12\",\"formatter\":{\"id\":\"3055\"},\"ticker\":{\"id\":\"1874\"}},\"id\":\"1873\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2096\",\"type\":\"BasicTicker\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2453\",\"type\":\"Quad\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2399\"},{\"id\":\"2400\"},{\"id\":\"2401\"},{\"id\":\
"2402\"},{\"id\":\"2403\"},{\"id\":\"2404\"}]},\"id\":\"2406\",\"type\":\"Toolbar\"},{\"attributes\":{\"axis\":{\"id\":\"2095\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2098\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1874\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis\":{\"id\":\"1873\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1876\",\"type\":\"Grid\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3113\"},\"ticker\":{\"id\":\"2100\"}},\"id\":\"2099\",\"type\":\"LinearAxis\"},{\"attributes\":{\"below\":[{\"id\":\"2465\"}],\"center\":[{\"id\":\"2468\"},{\"id\":\"2472\"}],\"left\":[{\"id\":\"2469\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2491\"}],\"title\":{\"id\":\"3211\"},\"toolbar\":{\"id\":\"2480\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2457\"},\"x_scale\":{\"id\":\"2461\"},\"y_range\":{\"id\":\"2459\"},\"y_scale\":{\"id\":\"2463\"}},\"id\":\"2456\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3053\"},\"ticker\":{\"id\":\"1878\"}},\"id\":\"1877\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2100\",\"type\":\"BasicTicker\"},{\"attributes\":{\"source\":{\"id\":\"2450\"}},\"id\":\"2455\",\"type\":\"CDSView\"},{\"attributes\":{\"axis\":{\"id\":\"2099\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2102\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1878\",\"type\":\"BasicTicker\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2405\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,
94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[829,2,1,1,6,63,52,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3307\"},\"selection_policy\":{\"id\":\"3308\"}},\"id\":\"2820\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"axis\":{\"id\":\"1877\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1880\",\"type\":\"Grid\"},{\"attributes\":{\"overlay\":{\"id\":\"2146\"}},\"id\":\"2142\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"data_source\":{\"id\":\"2413\"},\"glyph\":{\"id\":\"2415\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2416\"},\"selection_glyph\":null,\"view\":{\"id\":\"2418\"}},\"id\":\"2417\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu25\",\"formatter\":{\"id\":\"3065\"},\"ticker\":{\"id\":\"1911\"}},\"id\":\"1910\",\"type\":\"LinearAxis\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[663,92,37,22,22,29,19,18,9,9,7,7,4,5,3,3,3,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3117\"},\"selection_policy\":{\"id\":\"3118\"}},\"id\":\"2117\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2416\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2145\",\"type\":\"HelpTool\"},{\"attribu
tes\":{},\"id\":\"2103\",\"type\":\"PanTool\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[664,81,33,21,6,11,13,8,10,13,0,6,6,1,3,1,2,0,1,0,0,0,1,1,1,1,1,0,0,1,0,0,0,0,1,1,0,1,0,0,2,2,0,1,0,0,0,0,2]},\"selected\":{\"id\":\"3057\"},\"selection_policy\":{\"id\":\"3058\"}},\"id\":\"1895\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"2413\"}},\"id\":\"2418\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1881\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"2104\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"data_source\":{\"id\":\"2450\"},\"glyph\":{\"id\":\"2452\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2453\"},\"selection_glyph\":null,\"view\":{\"id\":\"2455\"}},\"id\":\"2454\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"below\":[{\"id\":\"2428\"}],\"center\":[{\"id\":\"2431\"},{\"id\":\"2435\"}],\"left\":[{\"id\":\"2432\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2454\"}],\"title\":{\"id\":\"3201\"},\"toolbar\":{\"id\":\"2443\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2420\"},\"x_scale\":{\"id\":\"2424\"},\"y_range\":{\"id\":\"2422\"},\"y_scale\":{\"id\":\"2426\"}},\"id\":\"2419\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1882\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"overlay\":{\"id\":\"2109\"}},\"id\":\"2105\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2415\",\"type\":\"Quad\"},{\"attributes\":{\"overlay\":{\"
id\":\"1887\"}},\"id\":\"1883\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"2106\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"2420\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1884\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"2107\",\"type\":\"ResetTool\"},{\"attributes\":{\"start\":0},\"id\":\"2422\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1885\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"2108\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"2424\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1886\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"2426\",\"type\":\"LinearScale\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2103\"},{\"id\":\"2104\"},{\"id\":\"2105\"},{\"id\":\"2106\"},{\"id\":\"2107\"},{\"id\":\"2108\"}]},\"id\":\"2110\",\"type\":\"Toolbar\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu14\",\"formatter\":{\"id\":\"3205\"},\"ticker\":{\"id\":\"2429\"}},\"id\":\"2428\",\"type\":\"LinearAxis\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1881\"},{\"id\":\"1882\"},{\"id\":\"1883\"},{\"id\":\"1884\"},{\"id\":\"1885\"},{\"id\":\"1886\"}]},\"id\":\"1888\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"2144\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"2429\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis\":{\"id\":\"1910\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1913\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2143\",\"type\":\"SaveTool\"},{\"attributes\":{\"axis\":{\"id\":\"2428\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2431\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1911\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\"
:\"3203\"},\"ticker\":{\"id\":\"2433\"}},\"id\":\"2432\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2109\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"axis_label\":\"Occurences\",\"formatter\":{\"id\":\"3063\"},\"ticker\":{\"id\":\"1915\"}},\"id\":\"1914\",\"type\":\"LinearAxis\"},{\"attributes\":{\"data_source\":{\"id\":\"2117\"},\"glyph\":{\"id\":\"2119\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2120\"},\"selection_glyph\":null,\"view\":{\"id\":\"2122\"}},\"id\":\"2121\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1887\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"2433\",\"type\":\"BasicTicker\"},{\"attributes\":{\"data_source\":{\"id\":\"1895\"},\"glyph\":{\"id\":\"1897\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1898\"},\"selection_glyph\":null,\"view\":{\"id\":\"1900\"}},\"id\":\"1899\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis\":{\"id\":\"2432\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2435\",\"type\":\"Grid\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2120\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"1366\",\"type\":\"SaveTool\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill
_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1898\",\"type\":\"Quad\"},{\"attributes\":{\"source\":{\"id\":\"2117\"}},\"id\":\"2122\",\"type\":\"CDSView\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2526\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"1908\",\"type\":\"LinearScale\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[602,85,61,42,36,28,19,16,12,7,11,4,4,4,3,5,1,0,1,2,2,2,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1]},\"selected\":{\"id\":\"3127\"},\"selection_policy\":{\"id\":\"3128\"}},\"id\":\"2154\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"below\":[{\"id\":\"2132\"}],\"center\":[{\"id\":\"2135\"},{\"id\":\"2139\"}],\"left\":[{\"id\":\"2136\"}],\"plot_height\":250,\"plot_width\":250,\"renderers\":[{\"id\":\"2158\"}],\"title\":{\"id\":\"3121\"},\"toolbar\":{\"id\":\"2147\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"2124\"},\"x_scale\":{\"id\":\"2128\"},\"y_range\":{\"id\":\"2126\"},\"y_scale\":{\"id\":\"2130\"}},\"id\":\"2123\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"axis\":{\"id\":\"1914\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1917\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2436\",\"type\":\"PanTool\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"val
ue\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2119\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2124\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2437\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"navy\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1897\",\"type\":\"Quad\"},{\"attributes\":{\"start\":0},\"id\":\"2126\",\"type\":\"DataRange1d\"},{\"attributes\":{\"overlay\":{\"id\":\"2442\"}},\"id\":\"2438\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1980\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2128\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2439\",\"type\":\"SaveTool\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[608,110,52,27,27,28,14,11,12,14,6,14,9,6,3,3,3,1,2,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"3067\"},\"selection_policy\":{\"id\":\"3068\"}},\"id\":\"1932\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1982\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2130\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1918\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"2440\",\"type\":\"ResetTool\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_cpu27\",\"formatter\":{\"id\":\"3125\"},\"ticker\":{\"id\":\"2133\"}},\"id\":\"2132\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1919\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"2133\"
,\"type\":\"BasicTicker\"}],\"root_ids\":[\"3333\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", - " var render_items = [{\"docid\":\"d9af2108-bcb5-4ca5-a49f-df3aa06ee385\",\"notebook_comms_target\":\"3748\",\"root_ids\":[\"3333\"],\"roots\":{\"3333\":\"1c6ade4d-a094-44b4-b36f-95757b7218e7\"}}];\n", - " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", - "\n", - " }\n", - " if (root.Bokeh !== undefined) {\n", - " embed_document(root);\n", - " } else {\n", - " var attempts = 0;\n", - " var timer = setInterval(function(root) {\n", - " if (root.Bokeh !== undefined) {\n", - " clearInterval(timer);\n", - " embed_document(root);\n", - " } else {\n", - " attempts++;\n", - " if (attempts > 100) {\n", - " clearInterval(timer);\n", - " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", - " }\n", - " }\n", - " }, 10, root)\n", - " }\n", - "})(window);" - ], - "application/vnd.bokehjs_exec.v0+json": "" - }, - "metadata": { - "application/vnd.bokehjs_exec.v0+json": { - "id": "3333" - } - }, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "filtered_dimensions:{'CPUUtilization-nodeid:algo-1', 'GPUUtilization-nodeid:algo-1', 'GPUMemoryUtilization-nodeid:algo-1'}\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from smdebug.profiler.analysis.notebook_utils.metrics_histogram import MetricsHistogram\n", "\n", @@ -611,13 +531,19 @@ "metadata": {}, "outputs": [], "source": [ + "import time\n", + "from smdebug.profiler.analysis.notebook_utils.step_timeline_chart import StepTimelineChart\n", + "\n", + "tj.wait_for_sys_profiling_data_to_be_available()\n", "tj.wait_for_framework_profiling_data_to_be_available()\n", "\n", - "from smdebug.profiler.analysis.notebook_utils.step_timeline_chart import StepTimelineChart\n", + "time.sleep(30)\n", "\n", "framework_metrics_reader = tj.get_framework_metrics_reader()\n", 
"framework_metrics_reader.refresh_event_file_list()\n", "\n", + "time.sleep(30)\n", + "\n", "view_step_timeline_chart = StepTimelineChart(framework_metrics_reader)" ] }, @@ -637,20 +563,9 @@ }, { "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "9" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "python_profiling_config_tj[\"StartStep\"]" ] @@ -664,37 +579,9 @@ }, { "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Thu Apr 29 07:02:49 2021 python_stats/python_stats/81-algo-1/train-9-stepstart-1619676498401482.5_train-9-stepend-1619676498506921.8/python_stats\n", - "Thu Apr 29 07:02:49 2021 python_stats/python_stats/81-algo-1/train-9-stepend-1619676498512565.0_train-10-stepstart-1619676498517752.8/python_stats\n", - "\n", - " 3853 function calls (3708 primitive calls) in 0.110 seconds\n", - "\n", - " Ordered by: cumulative time\n", - " List reduced from 292 to 10 due to restriction <10>\n", - "\n", - " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 0.000 0.000 0.080 0.080 /usr/local/lib/python3.7/site-packages/tensorflow/python/eager/function.py:2827(__call__)\n", - " 1 0.000 0.000 0.080 0.080 /usr/local/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py:757(__call__)\n", - " 1 0.000 0.000 0.080 0.080 /usr/local/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py:798(_call)\n", - " 1 0.000 0.000 0.070 0.070 /usr/local/lib/python3.7/site-packages/tensorflow/python/eager/function.py:1829(_filtered_call)\n", - " 1 0.000 0.000 0.070 0.070 /usr/local/lib/python3.7/site-packages/tensorflow/python/eager/function.py:1852(_call_flat)\n", - " 1 0.060 0.060 0.060 0.060 {built-in method tensorflow.python._pywrap_tfe.TFE_Py_Execute}\n", - " 1 
0.000 0.000 0.060 0.060 /usr/local/lib/python3.7/site-packages/tensorflow/python/eager/function.py:511(call)\n", - " 1 0.000 0.000 0.060 0.060 /usr/local/lib/python3.7/site-packages/tensorflow/python/eager/execute.py:33(quick_execute)\n", - " 673 0.030 0.000 0.030 0.000 {built-in method builtins.isinstance}\n", - " 1 0.000 0.000 0.020 0.020 /usr/local/lib/python3.7/site-packages/smdebug/tensorflow/keras.py:605(_save_layer_input_and_outputs)\n", - "\n", - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "if python_profiling_config_tj == {}:\n", " step = 9\n", @@ -740,378 +627,9 @@ }, { "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "data": { - "application/javascript": [ - "\n", - "(function(root) {\n", - " function now() {\n", - " return new Date();\n", - " }\n", - "\n", - " var force = true;\n", - "\n", - " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", - " root._bokeh_onload_callbacks = [];\n", - " root._bokeh_is_loading = undefined;\n", - " }\n", - "\n", - " var JS_MIME_TYPE = 'application/javascript';\n", - " var HTML_MIME_TYPE = 'text/html';\n", - " var EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", - " var CLASS_NAME = 'output_bokeh rendered_html';\n", - "\n", - " /**\n", - " * Render data to the DOM node\n", - " */\n", - " function render(props, node) {\n", - " var script = document.createElement(\"script\");\n", - " node.appendChild(script);\n", - " }\n", - "\n", - " /**\n", - " * Handle when an output is cleared or removed\n", - " */\n", - " function handleClearOutput(event, handle) {\n", - " var cell = handle.cell;\n", - "\n", - " var id = cell.output_area._bokeh_element_id;\n", - " var server_id = cell.output_area._bokeh_server_id;\n", - " // Clean up Bokeh references\n", - " if (id != null && id in Bokeh.index) {\n", - " Bokeh.index[id].model.document.clear();\n", - " delete Bokeh.index[id];\n", - " }\n", - "\n", - " if 
(server_id !== undefined) {\n", - " // Clean up Bokeh references\n", - " var cmd = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", - " cell.notebook.kernel.execute(cmd, {\n", - " iopub: {\n", - " output: function(msg) {\n", - " var id = msg.content.text.trim();\n", - " if (id in Bokeh.index) {\n", - " Bokeh.index[id].model.document.clear();\n", - " delete Bokeh.index[id];\n", - " }\n", - " }\n", - " }\n", - " });\n", - " // Destroy server and session\n", - " var cmd = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", - " cell.notebook.kernel.execute(cmd);\n", - " }\n", - " }\n", - "\n", - " /**\n", - " * Handle when a new output is added\n", - " */\n", - " function handleAddOutput(event, handle) {\n", - " var output_area = handle.output_area;\n", - " var output = handle.output;\n", - "\n", - " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", - " if ((output.output_type != \"display_data\") || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n", - " return\n", - " }\n", - "\n", - " var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", - "\n", - " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", - " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", - " // store reference to embed id on output_area\n", - " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", - " }\n", - " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", - " var bk_div = document.createElement(\"div\");\n", - " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", - " var script_attrs = bk_div.children[0].attributes;\n", - " for (var i = 0; i < script_attrs.length; i++) {\n", - " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", - " toinsert[toinsert.length - 
1].firstChild.textContent = bk_div.children[0].textContent\n", - " }\n", - " // store reference to server id on output_area\n", - " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", - " }\n", - " }\n", - "\n", - " function register_renderer(events, OutputArea) {\n", - "\n", - " function append_mime(data, metadata, element) {\n", - " // create a DOM node to render to\n", - " var toinsert = this.create_output_subarea(\n", - " metadata,\n", - " CLASS_NAME,\n", - " EXEC_MIME_TYPE\n", - " );\n", - " this.keyboard_manager.register_events(toinsert);\n", - " // Render to node\n", - " var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", - " render(props, toinsert[toinsert.length - 1]);\n", - " element.append(toinsert);\n", - " return toinsert\n", - " }\n", - "\n", - " /* Handle when an output is cleared or removed */\n", - " events.on('clear_output.CodeCell', handleClearOutput);\n", - " events.on('delete.Cell', handleClearOutput);\n", - "\n", - " /* Handle when a new output is added */\n", - " events.on('output_added.OutputArea', handleAddOutput);\n", - "\n", - " /**\n", - " * Register the mime type and append_mime function with output_area\n", - " */\n", - " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", - " /* Is output safe? 
*/\n", - " safe: true,\n", - " /* Index of renderer in `output_area.display_order` */\n", - " index: 0\n", - " });\n", - " }\n", - "\n", - " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", - " if (root.Jupyter !== undefined) {\n", - " var events = require('base/js/events');\n", - " var OutputArea = require('notebook/js/outputarea').OutputArea;\n", - "\n", - " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", - " register_renderer(events, OutputArea);\n", - " }\n", - " }\n", - "\n", - " \n", - " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", - " root._bokeh_timeout = Date.now() + 5000;\n", - " root._bokeh_failed_load = false;\n", - " }\n", - "\n", - " var NB_LOAD_WARNING = {'data': {'text/html':\n", - " \"
\\n\"+\n", - " \"

\\n\"+\n", - " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", - " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", - " \"

\\n\"+\n", - " \"
    \\n\"+\n", - " \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n", - " \"
  • use INLINE resources instead, as so:
  • \\n\"+\n", - " \"
\\n\"+\n", - " \"\\n\"+\n", - " \"from bokeh.resources import INLINE\\n\"+\n", - " \"output_notebook(resources=INLINE)\\n\"+\n", - " \"\\n\"+\n", - " \"
\"}};\n", - "\n", - " function display_loaded() {\n", - " var el = document.getElementById(null);\n", - " if (el != null) {\n", - " el.textContent = \"BokehJS is loading...\";\n", - " }\n", - " if (root.Bokeh !== undefined) {\n", - " if (el != null) {\n", - " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", - " }\n", - " } else if (Date.now() < root._bokeh_timeout) {\n", - " setTimeout(display_loaded, 100)\n", - " }\n", - " }\n", - "\n", - "\n", - " function run_callbacks() {\n", - " try {\n", - " root._bokeh_onload_callbacks.forEach(function(callback) {\n", - " if (callback != null)\n", - " callback();\n", - " });\n", - " } finally {\n", - " delete root._bokeh_onload_callbacks\n", - " }\n", - " console.debug(\"Bokeh: all callbacks have finished\");\n", - " }\n", - "\n", - " function load_libs(css_urls, js_urls, callback) {\n", - " if (css_urls == null) css_urls = [];\n", - " if (js_urls == null) js_urls = [];\n", - "\n", - " root._bokeh_onload_callbacks.push(callback);\n", - " if (root._bokeh_is_loading > 0) {\n", - " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", - " return null;\n", - " }\n", - " if (js_urls == null || js_urls.length === 0) {\n", - " run_callbacks();\n", - " return null;\n", - " }\n", - " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", - " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", - "\n", - " function on_load() {\n", - " root._bokeh_is_loading--;\n", - " if (root._bokeh_is_loading === 0) {\n", - " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", - " run_callbacks()\n", - " }\n", - " }\n", - "\n", - " function on_error() {\n", - " console.error(\"failed to load \" + url);\n", - " }\n", - "\n", - " for (var i = 0; i < css_urls.length; i++) {\n", - " var url = css_urls[i];\n", - " const element = document.createElement(\"link\");\n", - " element.onload = on_load;\n", - " element.onerror = 
on_error;\n", - " element.rel = \"stylesheet\";\n", - " element.type = \"text/css\";\n", - " element.href = url;\n", - " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", - " document.body.appendChild(element);\n", - " }\n", - "\n", - " const hashes = {\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js\": \"T2yuo9Oe71Cz/I4X9Ac5+gpEa5a8PpJCDlqKYO0CfAuEszu1JrXLl8YugMqYe3sM\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.3.min.js\": \"98GDGJ0kOMCUMUePhksaQ/GYgB3+NH9h996V88sh3aOiUNX3N+fLXAtry6xctSZ6\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.3.min.js\": \"89bArO+nlbP3sgakeHjCo1JYxYR5wufVgA3IbUvDY+K7w4zyxJqssu7wVnfeKCq8\"};\n", - "\n", - " for (var i = 0; i < js_urls.length; i++) {\n", - " var url = js_urls[i];\n", - " var element = document.createElement('script');\n", - " element.onload = on_load;\n", - " element.onerror = on_error;\n", - " element.async = false;\n", - " element.src = url;\n", - " if (url in hashes) {\n", - " element.crossOrigin = \"anonymous\";\n", - " element.integrity = \"sha384-\" + hashes[url];\n", - " }\n", - " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", - " document.head.appendChild(element);\n", - " }\n", - " };\n", - "\n", - " function inject_raw_css(css) {\n", - " const element = document.createElement(\"style\");\n", - " element.appendChild(document.createTextNode(css));\n", - " document.body.appendChild(element);\n", - " }\n", - "\n", - " \n", - " var js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.3.min.js\"];\n", - " var css_urls = [];\n", - " \n", - "\n", - " var inline_js = [\n", - " function(Bokeh) {\n", - " Bokeh.set_log_level(\"info\");\n", - " },\n", - " function(Bokeh) {\n", - " \n", - " \n", - " }\n", - " ];\n", - "\n", - " function run_inline_js() {\n", - " \n", - " if (root.Bokeh !== 
undefined || force === true) {\n", - " \n", - " for (var i = 0; i < inline_js.length; i++) {\n", - " inline_js[i].call(root, root.Bokeh);\n", - " }\n", - " } else if (Date.now() < root._bokeh_timeout) {\n", - " setTimeout(run_inline_js, 100);\n", - " } else if (!root._bokeh_failed_load) {\n", - " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", - " root._bokeh_failed_load = true;\n", - " } else if (force !== true) {\n", - " var cell = $(document.getElementById(null)).parents('.cell').data().cell;\n", - " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", - " }\n", - "\n", - " }\n", - "\n", - " if (root._bokeh_is_loading === 0) {\n", - " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", - " run_inline_js();\n", - " } else {\n", - " load_libs(css_urls, js_urls, function() {\n", - " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", - " run_inline_js();\n", - " });\n", - " }\n", - "}(window));" - ], - "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n function now() {\n return new Date();\n }\n\n var force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n \n\n \n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n var NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"
    \\n\"+\n \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n \"
  • use INLINE resources instead, as so:
  • \\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n var el = document.getElementById(null);\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error() {\n console.error(\"failed to load \" + url);\n }\n\n for (var i = 0; i < css_urls.length; i++) {\n var url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error;\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n const hashes = {\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js\": \"T2yuo9Oe71Cz/I4X9Ac5+gpEa5a8PpJCDlqKYO0CfAuEszu1JrXLl8YugMqYe3sM\", 
\"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.3.min.js\": \"98GDGJ0kOMCUMUePhksaQ/GYgB3+NH9h996V88sh3aOiUNX3N+fLXAtry6xctSZ6\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.3.min.js\": \"89bArO+nlbP3sgakeHjCo1JYxYR5wufVgA3IbUvDY+K7w4zyxJqssu7wVnfeKCq8\"};\n\n for (var i = 0; i < js_urls.length; i++) {\n var url = js_urls[i];\n var element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n if (url in hashes) {\n element.crossOrigin = \"anonymous\";\n element.integrity = \"sha384-\" + hashes[url];\n }\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n \n var js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.3.min.js\"];\n var css_urls = [];\n \n\n var inline_js = [\n function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\n function(Bokeh) {\n \n \n }\n ];\n\n function run_inline_js() {\n \n if (root.Bokeh !== undefined || force === true) {\n \n for (var i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n var cell = $(document.getElementById(null)).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n 
run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2021-04-29 07:02:58.055 ip-172-16-71-4:29196 INFO metrics_reader_base.py:134] Getting 9 event files\n", - "select events:['total']\n", - "select dimensions:['CPU', 'GPU']\n", - "filtered_events:{'total'}\n", - "filtered_dimensions:{'CPUUtilization-nodeid:algo-1', 'GPUUtilization-nodeid:algo-1', 'GPUMemoryUtilization-nodeid:algo-1'}\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "(function(root) {\n", - " function embed_document(root) {\n", - " \n", - " var docs_json = {\"2676cafa-7ab3-4f33-a5bc-59946ca4cafb\":{\"roots\":{\"references\":[{\"attributes\":{\"children\":[{\"id\":\"6303\"},{\"id\":\"6181\"}]},\"id\":\"6304\",\"type\":\"Row\"},{\"attributes\":{\"source\":{\"id\":\"6213\"}},\"id\":\"6220\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"6749\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"callback\":null,\"renderers\":[{\"id\":\"6219\"}],\"tooltips\":[[\"index\",\"$index\"],[\"(x,y)\",\"($x, $y)\"]]},\"id\":\"6221\",\"type\":\"HoverTool\"},{\"attributes\":{},\"id\":\"6751\",\"type\":\"Selection\"},{\"attributes\":{\"below\":[{\"id\":\"6231\"}],\"center\":[{\"id\":\"6234\"},{\"id\":\"6238\"}],\"js_event_callbacks\":{\"selectiongeometry\":[{\"id\":\"6254\"}]},\"left\":[{\"id\":\"6235\"}],\"plot_height\":200,\"plot_width\":1000,\"renderers\":[{\"id\":\"6257\"},{\"id\":\"6259\"}],\"title\":{\"id\":\"6730\"},\"toolbar\":{\"id\":\"6246\"},\"x_range\":{\"id\":\"6183\"},\"x_scale\":{\"id\":\"6227\"},\"y_range\":{\"id\":\"6225\"},\"y_scale\":{\"id\":\"6229\"}},\"id\":\"6223\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"6752\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"end\":1619676599.7611256,\"start\":1619676123.2610095},\"id\":\"6183\",\"type\":\"Range1d\"},{\"attributes\":{\"line_color\":\"blue\",\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"6255\",\"type\":\"Line\"},{\"attributes\":{\"end\":102},\"id\":\"6225\",\"type\":\"Range1d\"},{\"attributes\":{},\"id\":\"6187\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"6227\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis_label\":\"Time in 
ms\",\"formatter\":{\"id\":\"6736\"},\"ticker\":{\"id\":\"6192\"}},\"id\":\"6191\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"6229\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis_label\":\"Time in ms\",\"formatter\":{\"id\":\"6740\"},\"ticker\":{\"id\":\"6232\"}},\"id\":\"6231\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis\":{\"id\":\"6191\"},\"ticker\":null,\"visible\":false},\"id\":\"6194\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"6232\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis\":{\"id\":\"6231\"},\"ticker\":null,\"visible\":false},\"id\":\"6234\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"6196\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis_label\":\"GPUUtilization-nodeid:algo-1_total\",\"formatter\":{\"id\":\"6738\"},\"ticker\":{\"id\":\"6236\"}},\"id\":\"6235\",\"type\":\"LinearAxis\"},{\"attributes\":{\"source\":{\"id\":\"6213\"}},\"id\":\"6218\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_alpha\":{\"value\":0},\"line_width\":{\"value\":0},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"6216\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"6236\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis\":{\"id\":\"6235\"},\"dimension\":1,\"ticker\":null,\"visible\":false},\"id\":\"6238\",\"type\":\"Grid\"},{\"attributes\":{\"height\":100,\"height_policy\":\"fixed\",\"width\":250},\"id\":\"6181\",\"type\":\"Div\"},{\"attributes\":{\"args\":{\"div\":{\"id\":\"6181\"},\"s1\":{\"id\":\"6253\"}},\"code\":\"\\n console.log('Running CustomJS callback now.');\\n var inds = s1.selected.indices;\\n console.log(inds);\\n var line = \\\" Selected index range: [\\\" + Math.min.apply(Math,inds) + \\\",\\\" + Math.max.apply(Math,inds) + \\\"]\\\\n\\\";\\n console.log(line)\\n var text = div.text.concat(line);\\n var lines = text.split(\\\"\\\\n\\\")\\n if (lines.length > 35)\\n lines.shift();\\n div.text = 
lines.join(\\\"\\\\n\\\");\"},\"id\":\"6254\",\"type\":\"CustomJS\"},{\"attributes\":{\"below\":[{\"id\":\"6191\"}],\"center\":[{\"id\":\"6194\"},{\"id\":\"6198\"}],\"js_event_callbacks\":{\"selectiongeometry\":[{\"id\":\"6214\"}]},\"left\":[{\"id\":\"6195\"}],\"plot_height\":200,\"plot_width\":1000,\"renderers\":[{\"id\":\"6217\"},{\"id\":\"6219\"}],\"title\":{\"id\":\"6728\"},\"toolbar\":{\"id\":\"6206\"},\"x_range\":{\"id\":\"6183\"},\"x_scale\":{\"id\":\"6187\"},\"y_range\":{\"id\":\"6185\"},\"y_scale\":{\"id\":\"6189\"}},\"id\":\"6182\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"6239\",\"type\":\"CrosshairTool\"},{\"attributes\":{\"dimensions\":\"width\",\"overlay\":{\"id\":\"6205\"}},\"id\":\"6200\",\"type\":\"BoxSelectTool\"},{\"attributes\":{\"dimensions\":\"width\",\"overlay\":{\"id\":\"6245\"}},\"id\":\"6240\",\"type\":\"BoxSelectTool\"},{\"attributes\":{\"end\":102},\"id\":\"6185\",\"type\":\"Range1d\"},{\"attributes\":{},\"id\":\"6241\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"6202\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"6242\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"6189\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"6243\",\"type\":\"SaveTool\"},{\"attributes\":{\"dimensions\":\"width\"},\"id\":\"6204\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"6734\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"6192\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"6736\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"6199\"},{\"id\":\"6200\"},{\"id\":\"6201\"},{\"id\":\"6202\"},{\"id\":\"6203\"},{\"id\":\"6204\"},{\"id\":\"6221\"}]},\"id\":\"6206\",\"type\":\"Toolbar\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line
_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"6245\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"axis_label\":\"CPUUtilization-nodeid:algo-1_total\",\"formatter\":{\"id\":\"6734\"},\"ticker\":{\"id\":\"6196\"}},\"id\":\"6195\",\"type\":\"LinearAxis\"},{\"attributes\":{\"fill_alpha\":{\"value\":0},\"line_width\":{\"value\":0},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"6256\",\"type\":\"Circle\"},{\"attributes\":{\"args\":{\"div\":{\"id\":\"6181\"},\"s1\":{\"id\":\"6213\"}},\"code\":\"\\n console.log('Running CustomJS callback now.');\\n var inds = s1.selected.indices;\\n console.log(inds);\\n var line = \\\" Selected index range: [\\\" + Math.min.apply(Math,inds) + \\\",\\\" + Math.max.apply(Math,inds) + \\\"]\\\\n\\\";\\n console.log(line)\\n var text = div.text.concat(line);\\n var lines = text.split(\\\"\\\\n\\\")\\n if (lines.length > 35)\\n lines.shift();\\n div.text = lines.join(\\\"\\\\n\\\");\"},\"id\":\"6214\",\"type\":\"CustomJS\"},{\"attributes\":{\"axis\":{\"id\":\"6195\"},\"dimension\":1,\"ticker\":null,\"visible\":false},\"id\":\"6198\",\"type\":\"Grid\"},{\"attributes\":{\"data_source\":{\"id\":\"6253\"},\"glyph\":{\"id\":\"6255\"},\"hover_glyph\":null,\"muted_glyph\":null,\"view\":{\"id\":\"6258\"}},\"id\":\"6257\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{\"x\":{\"__ndarray__\":\"Ibaw9pIi2EFhtND2kiLYQeqz8PaSIthBNrUQ95Ii2EFDtDD3kiLYQZ+zUPeSIthBv7Nw95Ii2EHftJD3kiLYQfazsPeSIthBWLTQ95Ii2EEDtPD3kiLYQV+0EPiSIthBHbQw+JIi2EH2s1D4kiLYQXS2cPiSIthBCbaQ+JIi2EE9tLD4kiLYQWO00PiSIthB1rbw+JIi2EG2tBD5kiLYQT24MPmSIthB17RQ+ZIi2EEKt3D5kiLYQWq0kPmSIthBirOw+ZIi2EHDs9D5kiLYQamz8PmSIthBdLYQ+pIi2EFDtDD6kiLYQeqzUPqSIthBH7Rw+pIi2EG/s5D6kiLYQXS0sPqSIthB6rPQ+pIi2EEWtPD6kiLYQcyzEPuSIthBv7Yw+5Ii2EEKtFD7kiLYQYq0cPuSIthBIbaQ+5Ii2EGdtLD7kiLYQfa00PuSIthBNrPw+5Ii2EGqtxD8kiLYQSG0MPySIthBXrNQ/JIi2EHKtXD8kiLYQSC0kPySIthBSLOw/JIi2EFYs9D8kiLYQf+18PySIthB6rMQ/ZIi2EEotDD9k
iLYQYK0UP2SIthBwbZw/ZIi2EGCtJD9kiLYQTa0sP2SIthBH7TQ/ZIi2EHfs/D9kiLYQeqyEP6SIthBnbMw/pIi2EHhtVD+kiLYQfazcP6SIthBnbWQ/pIi2EFftLD+kiLYQeKz0P6SIthBn7Pw/pIi2EHXsxD/kiLYQZi0MP+SIthBobNQ/5Ii2EH4t3D/kiLYQcCzkP+SIthBSLOw/5Ii2EHBtdD/kiLYQf+z8P+SIthBobMQAJMi2EFWszAAkyLYQeGzUACTIthBY7RwAJMi2EHDs5AAkyLYQcqzsACTIthBdLPQAJMi2EH4tfAAkyLYQZ20EAGTIthB/bUwAZMi2EGqtFABkyLYQeK0cAGTIthBILSQAZMi2EGftLABkyLYQV600AGTIthB37TwAZMi2EG2tBACkyLYQT+2MAKTIthB9rVQApMi2EE2tXACkyLYQeGzkAKTIthB37OwApMi2EGCtNACkyLYQTe08AKTIthBnbQQA5Mi2EGCtDADkyLYQX21UAOTIthBAbVwA5Mi2EGBtJADkyLYQfi0sAOTIthBwLTQA5Mi2EHDs/ADkyLYQcG0EASTIthBNrQwBJMi2EHDtVAEkyLYQYG3cASTIthBtrOQBJMi2EG4s7AEkyLYQb+z0ASTIthBwbPwBJMi2EGqsxAFkyLYQZizMAWTIthBXrNQBZMi2EF0s3AFkyLYQQmzkAWTIthBCrSwBZMi2EG2tNAFkyLYQViz8AWTIthB4bMQBpMi2EH/szAGkyLYQcG0UAaTIthBo7VwBpMi2EH/tJAGkyLYQQm1sAaTIthB6LTQBpMi2EFBtPAGkyLYQTa0EAeTIthBuLMwB5Mi2EEWtFAHkyLYQWO0cAeTIthBirOQB5Mi2EGjt7AHkyLYQWG30AeTIthBuLTwB5Mi2EFBtBAIkyLYQUO2MAiTIthBw7RQCJMi2EFYtHAIkyLYQR+0kAiTIthBiLSwCJMi2EHXtNAIkyLYQf2z8AiTIthBo7UQCZMi2EG2szAJkyLYQVazUAmTIthBX7RwCZMi2EEKtpAJkyLYQTe0sAmTIthBtrPQCZMi2EEDtPAJkyLYQVi0EAqTIthBgrMwCpMi2EHftFAKkyLYQUi1cAqTIthBfbOQCpMi2EHqs7AKkyLYQYiz0AqTIthB/7PwCpMi2EFItxALkyLYQTa0MAuTIthB4bNQC5Mi2EEqtHALkyLYQf21kAuTIthBKrSwC5Mi2EHfs9ALkyLYQYC28AuTIthB4rQQDJMi2EHhtDAMkyLYQX+1UAyTIthBzLZwDJMi2EGWtZAMkyLYQf20sAyTIthBdrTQDJMi2EEDtvAMkyLYQaOzEA2TIthB6rMwDZMi2EHdtFANkyLYQUq0cA2TIthBdLSQDZMi2EF0tLANkyLYQcC00A2TIthBKLTwDZMi2EFftBAOkyLYQXS0MA6TIthB3bNQDpMi2EHis3AOkyLYQTezkA6TIthB17OwDpMi2EHKtNAOkyLYQYK08A6TIthBCrUQD5Mi2EEgtzAPkyLYQeGzUA+TIthBN7RwD5Mi2EEgtJAPkyLYQfizsA+TIthBiLPQD5Mi2EHhtPAPkyLYQcO2EBCTIthBgbQwEJMi2EGItFAQkyLYQZ20cBCTIthB4rOQEJMi2EHhs7AQkyLYQT+00BCTIthByrTwEJMi2EHAtBARkyLYQWG1MBGTIthBIbVQEZMi2EHdtHARkyLYQYGzkBGTIthBFrSwEZMi2EEBtdARkyLYQZi08BGTIthBA7QQEpMi2EFYtDASkyLYQaGzUBKTIthBILRwEpMi2EEBtJASkyLYQdezsBKTIthB9rXQEpMi2EFetPASkyLYQWq0EBOTIthB3bMwE5Mi2EG2tFATkyLYQZa0cBOTIthBwbSQE5Mi2EEotLATkyLYQVa00BOTIthB9rPwE5Mi2EHhsxAUkyLYQaO0MBSTIthB/7NQFJMi2EEWtXAUkyLYQSqzkBSTIthBu
LOwFJMi2EGBs9AUkyLYQai38BSTIthBf7QQFZMi2EHDszAVkyLYQQq2UBWTIthBw7NwFZMi2EGhtJAVkyLYQT+2sBWTIthBAbTQFZMi2EHds/AVkyLYQeqzEBaTIthBSrYwFpMi2EH9tlAWkyLYQXazcBaTIthBv7OQFpMi2EFjtrAWkyLYQUi00BaTIthByrPwFpMi2EFKsxAXkyLYQbi0MBeTIthBqLZQF5Mi2EHotXAXkyLYQYC3kBeTIthB6rOwF5Mi2EG4s9AXkyLYQX2z8BeTIthBILYQGJMi2EF0tDAYkyLYQeq0UBiTIthB4LlwGJMi2EH4tZAYkyLYQeG1sBiTIthB37TQGJMi2EFKs/AYkyLYQT2zEBmTIthBwbMwGZMi2EHfs1AZkyLYQWGzcBmTIthB17OQGZMi2EFDtLAZkyLYQde00BmTIthBIbfwGZMi2EGjsxAakyLYQZ20MBqTIthB6rRQGpMi2EHqs3AakyLYQd2zkBqTIthBzLSwGpMi2EFjtNAakyLYQeq08BqTIthBwbMQG5Mi2EFetjAbkyLYQYGzUBuTIthBH7VwG5Mi2EEqtJAbkyLYQZi0sBuTIthBlrPQG5Mi2EE/s/AbkyLYQR2zEByTIthBgrYwHJMi2EE9tVAckyLYQX22cByTIthB+LOQHJMi2EHqtrAckyLYQWO00ByTIthBvrXwHJMi2EFYtRAdkyLYQda1MB2TIthBH7RQHZMi2EHdtHAdkyLYQVi0kB2TIthBHbSwHZMi2EFItdAdkyLYQWO08B2TIthBX7MQHpMi2EFfszAekyLYQeizUB6TIthB4rNwHpMi2EF/s5AekyLYQUG1sB6TIthBlrTQHpMi2EGYtPAekyLYQTe0EB+TIthB3bMwH5Mi2EG2tFAfkyLYQcO0cB+TIthBqbOQH5Mi2EHds7AfkyLYQei10B+TIthBPbTwH5Mi2EGAtRAgkyLYQWO0MCCTIthBPbNQIJMi2EFftHAgkyLYQQO1kCCTIthB6LawIJMi2EGBs9AgkyLYQd+z8CCTIthBzLMQIZMi2EEWtDAhkyLYQYqzUCGTIthBYbNwIZMi2EF9tJAhkyLYQR+3sCGTIthB37TQIZMi2EFItvAhkyLYQeK1ECKTIthB6LMwIpMi2EH2s1AikyLYQX21cCKTIthB/7OQIpMi2EE/t7AikyLYQSC00CKTIthBXrbwIpMi2EHAsxAjkyLYQUO0MCOTIthB6LRQI5Mi2EEgtHAjkyLYQeiykCOTIthBPbSwI5Mi2EH2s9AjkyLYQcGz8COTIthBY7MQJJMi2EFItjAkkyLYQSC0UCSTIthB17NwJJMi2EHXs5AkkyLYQQq0sCSTIthBNrXQJJMi2EEBtfAkkyLYQX21ECWTIthBVrQwJZMi2EGjtVAlkyLYQaGzcCWTIthBqrOQJZMi2EFIs7AlkyLYQV230CWTIthBwLPwJZMi2EHdsxAmkyLYQYizMCaTIthBzLNQJpMi2EFItXAmkyLYQaO2kCaTIthBNrSwJpMi2EEgtNAmkyLYQbbO8CaTIthBKv8QJ5Mi2EFItDAnkyLYQfizUCeTIthBdrRwJ5Mi2EE9tJAnkyLYQWHAsCeTIthBA7TQJ5Mi2EHAs/AnkyLYQQq3ECiTIthBHbUwKJMi2EF9s1AokyLYQcyzcCiTIthB6LKQKJMi2EFCt7AokyLYQSC30CiTIthBI7bwKJMi2EEBtRApkyLYQZ2zMCmTIthBQ7NQKZMi2EEqs3ApkyLYQWO3kCmTIthBP7OwKZMi2EHKs9ApkyLYQcO08CmTIthBnbMQKpMi2EHiszAqkyLYQT+zUCqTIthBw7NwKpMi2EEds5AqkyLYQf+ysCqTIthByrTQKpMi2EFWs/AqkyLYQeizECuTIthBobQwK5Mi2EEjt1ArkyLYQfiycCuTIthBo7OQK5Mi2EH2srArkyLYQaGz0CuTIthB6LLwK5Mi2EGCthAsk
yLYQf2zMCyTIthBIrNQLJMi2EEJs3AskyLYQYqzkCyTIthBVrOwLJMi2EGqs9AskyLYQXaz8CyTIthBgbMQLZMi2EFjszAtkyLYQUOzUC2TIthBKLNwLZMi2EEdtJAtkyLYQVizsC2TIthBNrPQLZMi2EEDs/AtkyLYQeqzEC6TIthB3bYwLpMi2EHBtVAukyLYQf+ycC6TIthBSrOQLpMi2EEis7AukyLYQfa00C6TIthB9rPwLpMi2EEBsxAvkyLYQeGzMC+TIthB/rJQL5Mi2EGKs3AvkyLYQeOykC+TIthBCrawL5Mi2EF2tNAvkyLYQSqz8C+TIthB/7IQMJMi2EF9tjAwkyLYQV+0UDCTIthB+LJwMJMi2EG2s5AwkyLYQSq2sDCTIthBgrPQMJMi2EEBs/AwkyLYQcq1EDGTIthBYbQwMZMi2EHos1AxkyLYQSC0cDGTIthBuLOQMZMi2EFYs7AxkyLYQcy10DGTIthBCbPwMZMi2EH2shAykyLYQQmzMDKTIthBiLRQMpMi2EGCtnAykyLYQSCzkDKTIthBgrOwMpMi2EGBs9AykyLYQfa18DKTIthBtrMQM5Mi2EGKtDAzkyLYQYKzUDOTIthBarRwM5Mi2EFjs5AzkyLYQV+0sDOTIthBdrTQM5Mi2EGCtPAzkyLYQQm0EDSTIthBPbQwNJMi2EFYtFA0kyLYQd2zcDSTIthBCbeQNJMi2EG2s7A0kyLYQYKz0DSTIthBqrPwNJMi2EFfsxA1kyLYQYGzMDWTIthBAbRQNZMi2EF0s3A1kyLYQSq0kDWTIthBgrOwNZMi2EFhs9A1kyLYQcqz8DWTIthBarMQNpMi2EGfszA2kyLYQcCzUDaTIthB37JwNpMi2EFjs5A2kyLYQeKzsDaTIthBobPQNpMi2EHqs/A2kyLYQV+zEDeTIthBNrQwN5Mi2EGfs1A3kyLYQYKzcDeTIthB17KQN5Mi2EFItLA3kyLYQV+z0DeTIthBWLPwN5Mi2EFfsxA4kyLYQUq0MDiTIthBuLNQOJMi2EF9s3A4kyLYQVizkDiTIthBQbOwOJMi2EGfs9A4kyLYQX+08DiTIthBuLMQOZMi2EGKszA5kyLYQd2zUDmTIthBSLZwOZMi2EHDs5A5kyLYQZa0sDmTIthBFrTQOZMi2EF0tPA5kyLYQeqyEDqTIthBY7QwOpMi2EGds1A6kyLYQaGzcDqTIthBwbOQOpMi2EGjtbA6kyLYQaqz0DqTIthBmLPwOpMi2EEosxA7kyLYQRazMDuTIthB3bNQO5Mi2EEqs3A7kyLYQba0kDuTIthBuLawO5Mi2EGIs9A7kyLYQeiz8DuTIthBQ7UQPJMi2EHdtjA8kyLYQV+0UDyTIthBH7VwPJMi2EF9tZA8kyLYQSC1sDyTIthBSrXQPJMi2EFftPA8kyLYQQG1ED2TIthBarMwPZMi2EH4s1A9kyLYQSi0cD2TIthBn7OQPZMi2EGKs7A9kyLYQXSz0D2TIthBiLTwPZMi2EFesxA+kyLYQbazMD6TIthBX7NQPpMi2EE9tHA+kyLYQeGzkD6TIthBPbSwPpMi2EHos9A+kyLYQf+z8D6TIthBWLMQP5Mi2EF/tDA/kyLYQcq1UD+TIthBYbRwP5Mi2EFfs5A/kyLYQeKzsD+TIthBf7TQP5Mi2EEKtPA/kyLYQeqzEECTIthBdLMwQJMi2EGYtFBAkyLYQcqzcECTIthBNrSQQJMi2EE2tLBAkyLYQcGz0ECTIthBdLPwQJMi2EGCtBBBkyLYQba1MEGTIthBdrRQQZMi2EGIs3BBkyLYQf+zkEGTIthB17OwQZMi2EFhs9BBkyLYQQq08EGTIthBobUQQpMi2EEqtTBCkyLYQSG0UEKTIthBqLZwQpMi2EG2tJBCkyLYQaO0sEKTIthBwbPQQpMi2EHDs/BCkyLYQd+zEEOTIthBN7MwQ5Mi2EGdtFBDkyLYQXS0cEOTIthB9
rOQQ5Mi2EHotLBDkyLYQX+00EOTIthBNrTwQ5Mi2EGhtRBEkyLYQWO2MESTIthBobRQRJMi2EHDs3BEkyLYQYGzkESTIthBGLawRJMi2EHhs9BEkyLYQSqz8ESTIthB4rMQRZMi2EFjszBFkyLYQZ2zUEWTIthBSrNwRZMi2EGItJBFkyLYQeGzsEWTIthBiLTQRZMi2EEKtvBFkyLYQZizEEaTIthBuLIwRpMi2EGKtFBGkyLYQeqzcEaTIthBqrKQRpMi2EEqtLBGkyLYQWqz0EaTIthB+LPwRpMi2EGpsxBHkyLYQWOzMEeTIthBH7RQR5Mi2EGKs3BHkyLYQZ+zkEeTIthBw7SwR5Mi2EG2s9BHkyLYQcGz8EeTIthB17MQSJMi2EGjtDBIkyLYQZ2zUEiTIthBgrRwSJMi2EHBs5BIkyLYQYq0sEiTIthBCbTQSJMi2EG/s/BIkyLYQWGzEEmTIthBHbQwSZMi2EGBs1BJkyLYQd2zcEmTIthBgrOQSZMi2EF/s7BJkyLYQSq00EmTIthBw7PwSZMi2EEKthBKkyLYQQm1MEqTIthBgbNQSpMi2EF2s3BKkyLYQX2zkEqTIthB6LOwSpMi2EHXs9BKkyLYQZ+z8EqTIthB4rMQS5Mi2EFetDBLkyLYQV6zUEuTIthBIrNwS5Mi2EEis5BLkyLYQcG0sEuTIthBFrXQS5Mi2EHBs/BLkyLYQUO0EEyTIthBgbMwTJMi2EG/s1BMkyLYQbazcEyTIthB17OQTJMi2EFItLBMkyLYQV+00EyTIthBf7PwTJMi2EF/sxBNkyLYQT20ME2TIthBQbRQTZMi2EHds3BNkyLYQcq0kE2TIthBNrSwTZMi2EHhs9BNkyLYQWGz8E2TIthBdLMQTpMi2EHhtTBOkyLYQVi0UE6TIthBzLNwTpMi2EH2s5BOkyLYQd+zsE6TIthBKrTQTpMi2EHBs/BOkyLYQZ+0EE+TIthBP7QwT5Mi2EH9tlBPkyLYQd+0cE+TIthB4bOQT5Mi2EGqs7BPkyLYQTa00E+TIthBX7PwT5Mi2EG/sxBQkyLYQb+zMFCTIthBtrNQUJMi2EGCtHBQkyLYQeOykFCTIthBzLOwUJMi2EHdtNBQkyLYQd+z8FCTIthBPbQQUZMi2EFDtTBRkyLYQUq2UFGTIthBNrRwUZMi2EEKtJBRkyLYQZazsFGTIthBA7TQUZMi2EHAs/BRkyLYQSC2EFKTIthB6LUwUpMi2EH/tFBSkyLYQT20cFKTIthB37aQUpMi2EHhs7BSkyLYQd+z0FKTIthBvrTwUpMi2EEJsxBTkyLYQR20MFOTIthBdLNQU5Mi2EF0s3BTkyLYQSG0kFOTIthBdrSwU5Mi2EHKtNBTkyLYQcGz8FOTIthBwbMQVJMi2EGftDBUkyLYQeKzUFSTIthBqbNwVJMi2EFfs5BUkyLYQRa0sFSTIthBXrbQVJMi2EFjtPBUkyLYQSq2EFWTIthBmLQwVZMi2EG4s1BVkyLYQWOzcFWTIthB+LSQVZMi2EFhs7BVkyLYQX2z0FWTIthBWLTwVZMi2EHAtBBWkyLYQcq0MFaTIthBX7RQVpMi2EGds3BWkyLYQb+1kFaTIthBarWwVpMi2EG2tdBWkyLYQYiz8FaTIthBobQQV5Mi2EF9tDBXkyLYQeK0UFeTIthB/bNwV5Mi2EG4s5BXkyLYQQO0sFeTIthBobTQV5Mi2EG2s/BXkyLYQeGyEFiTIthBdLMwWJMi2EHAs1BYkyLYQYq0cFiTIthBirSQWJMi2EHMs7BYkyLYQaO00FiTIthBwbPwWJMi2EGCtBBZkyLYQaO0MFmTIthBWLRQWZMi2EHKtHBZkyLYQcGzkFmTIthBiLOwWZMi2EHfs9BZkyLYQb+28FmTIthBobQQWpMi2EEqtDBakyLYQVazUFqTIthB4rRwWpMi2EHMs5BakyLYQYK2sFqTIthBf7PQWpMi2EEKtPBak
yLYQUi1EFuTIthBP7UwW5Mi2EFhtVBbkyLYQd20cFuTIthBqrOQW5Mi2EGps7BbkyLYQamz0FuTIthBCrTwW5Mi2EG/sxBckyLYQUq2MFyTIthBnbRQXJMi2EH4s3BckyLYQTe0kFyTIthB4rOwXJMi2EHhs9BckyLYQT208FyTIthBtrUQXZMi2EHBszBdkyLYQQq2UF2TIthB9rNwXZMi2EHds5BdkyLYQeKzsF2TIthBH7TQXZMi2EHMs/BdkyLYQbi0EF6TIthBqbQwXpMi2EEgtVBekyLYQde0cF6TIthB37OQXpMi2EGWs7BekyLYQd2z0F6TIthBY7bwXpMi2EGjtRBfkyLYQX22MF+TIthB17NQX5Mi2EF/tXBfkyLYQVi2kF+TIthBfbSwX5Mi2EHMtdBfkyLYQVa08F+TIthBgrMQYJMi2EEDtTBgkyLYQfi0UGCTIthBIbRwYJMi2EHhs5BgkyLYQWm2sGCTIthBnbPQYJMi2EHBs/BgkyLYQde0EGGTIthBSrMwYZMi2EEBtlBhkyLYQd+zcGGTIthBSLWQYZMi2EGItLBhkyLYQSq10GGTIthB9rjwYZMi2EH4shBikyLYQcG2MGKTIthBY7ZQYpMi2EHBtHBikyLYQeKzkGKTIthBwbSwYpMi2EFYs9BikyLYQXS28GKTIthB9rMQY5Mi2EHdtDBjkyLYQfizUGOTIthBH7VwY5Mi2EE9tZBjkyLYQd+0sGOTIthBzLTQY5Mi2EFqs/BjkyLYQd20EGSTIthB37MwZJMi2EHhs1BkkyLYQd+zcGSTIthBCrSQZJMi2EEdtbBkkyLYQWO00GSTIthBiLXwZJMi2EG2txBlkyLYQYK0MGWTIthBCrRQZZMi2EHds3BlkyLYQcyzkGWTIthB+LSwZZMi2EHotNBlkyLYQV6z8GWTIthBPbYQZpMi2EHdszBmkyLYQSq1UGaTIthBILRwZpMi2EF/tZBmkyLYQeGzsGaTIthBSrfQZpMi2EEJtfBmkyLYQUOzEGeTIthB4rMwZ5Mi2EFYtVBnkyLYQam0cGeTIthBwbSQZ5Mi2EGCs7BnkyLYQXS00GeTIthBvrXwZ5Mi2EHXtBBokyLYQcG0MGiTIthBiLNQaJMi2EHftHBokyLYQYGzkGiTIthBXrawaJMi2EGBs9BokyLYQUG28GiTIthBY7cQaZMi2EEqtjBpkyLYQeq1UGmTIthBAbZwaZMi2EH9s5BpkyLYQcy1sGmTIthB9rPQaZMi2EH4tvBpkyLYQTa3EGqTIthB+LMwapMi2EEqtFBqkyLYQfa1cGqTIthBWLaQapMi2EG/trBqkyLYQai10GqTIthBgLbwapMi2EFqsxBrkyLYQVi3MGuTIthBl7VQa5Mi2EHMs3BrkyLYQWqzkGuTIthBo7Owa5Mi2EGds9BrkyLYQaqz8GuTIthB3bIQbJMi2EFKtjBskyLYQd2zUGyTIthBNrRwbJMi2EF2tZBskyLYQei1sGyTIthBirbQbJMi2EHMtfBskyLYQei1EG2TIthBCrYwbZMi2EH2tlBtkyLYQTa3cG2TIthBgrWQbZMi2EH2trBtkyLYQeKz0G2TIthBSLbwbZMi2EE=\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[955]},\"y\":{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[955]}},\"selected\":{\"id\":\"6745\"},\"selection_policy\":{\"id\":\"6746\"}},\"id\":\"6213\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"6199\",\"type\":\"CrosshairTool\"},{\"attributes\":{\"source\":{\"id\":\"62
53\"}},\"id\":\"6258\",\"type\":\"CDSView\"},{\"attributes\":{\"data_source\":{\"id\":\"6253\"},\"glyph\":{\"id\":\"6256\"},\"hover_glyph\":null,\"muted_glyph\":null,\"view\":{\"id\":\"6260\"}},\"id\":\"6259\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"source\":{\"id\":\"6253\"}},\"id\":\"6260\",\"type\":\"CDSView\"},{\"attributes\":{\"data_source\":{\"id\":\"6213\"},\"glyph\":{\"id\":\"6215\"},\"hover_glyph\":null,\"muted_glyph\":null,\"view\":{\"id\":\"6218\"}},\"id\":\"6217\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"callback\":null,\"renderers\":[{\"id\":\"6259\"}],\"tooltips\":[[\"index\",\"$index\"],[\"(x,y)\",\"($x, $y)\"]]},\"id\":\"6261\",\"type\":\"HoverTool\"},{\"attributes\":{\"below\":[{\"id\":\"6271\"}],\"center\":[{\"id\":\"6274\"},{\"id\":\"6278\"}],\"js_event_callbacks\":{\"selectiongeometry\":[{\"id\":\"6294\"}]},\"left\":[{\"id\":\"6275\"}],\"plot_height\":200,\"plot_width\":1000,\"renderers\":[{\"id\":\"6297\"},{\"id\":\"6299\"}],\"title\":{\"id\":\"6732\"},\"toolbar\":{\"id\":\"6286\"},\"x_range\":{\"id\":\"6183\"},\"x_scale\":{\"id\":\"6267\"},\"y_range\":{\"id\":\"6265\"},\"y_scale\":{\"id\":\"6269\"}},\"id\":\"6263\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"6201\",\"type\":\"PanTool\"},{\"attributes\":{\"data\":{\"x\":{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[956]},\"y\":{\"__ndarraydtype\":\"float64\",\"order\":\"little\",\"shape\":[956]}},\"selected\":{\"id\":\"6751\"},\"selection_policy\":{\"id\":\"6752\"}},\"id\":\"6293\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"end\":102},\"id\":\"6265\",\"type\":\"Range1d\"},{\"attributes\":{},\"id\":\"6203\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"6267\",\"type\":\"LinearScale\"},{\"attributes\":{\"line_color\":\"blue\",\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"6215\",\"type\":\"Line\"},{\"attributes\":{},\"id\":\"6269\",\"type\":\"LinearScale\"},{\"attributes\":{\"data\":{\"x\":{\"__ndarray__\":
\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[956]},\"y\":{\"__ndarraywAAAAAAAABAAAAAAAAA/D8AAAAAAADoPwwAAAAAAAOA/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANA/AAAAAAAA4D8AAAAAAAAAAAAAAAAAANA/AAAAAAAA4D8AAAAAAADQPwAAAAAAANA/AAAAAAAA4D8AAAAAAADoPwAAAAAAANA/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA0D8AAAAAAADgPwAAAAAAANA/AAAAAAAA0D8AAAAAAADQPwAAAAAAAPg/AAAAAAAA0D8AAAAAACBKQAAAAAAAAAAAAAAAAABAQEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAhAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAPkAAAAAAAIBDQAAAAAAA4ENAAAAAAADgQ0AAAAAAAKBCQAAAAAAAAEJAAAAAAACAQ0AAAAAAAIBDQAAAAAAA4EFAAAAAAADgQ0AAAAAAAKBCQAAAAAAAgEFAAAAAAACgQ0AAAAAAAOBCQAAAAAAAAEJAAAAAAABAQ0AAAAAAAABEQAAAAAAAgENAAAAAAADAQ0AAAAAAAEBCQAAAAAAAYEFAAAAAAAAAQEAAAAAAAOBDQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGBAQAAAAAAAwDhAAAAAAACgQUAAAAAAA
EBBQAAAAAAAIEJAAAAAAABgQ0AAAAAAAGBCQAAAAAAAQENAAAAAAADgQ0AAAAAAAMBDQAAAAAAAgENAAAAAAACAQ0AAAAAAAKBBQAAAAAAAAEJAAAAAAACgQkAAAAAAAOBDQAAAAAAAoENAAAAAAADAQkAAAAAAAABCQAAAAAAAoEBAAAAAAACgQ0AAAAAAAKBDQAAAAAAAoEJAAAAAAADAQ0AAAAAAAABCQAAAAAAAAERAAAAAAAAAMEAAAAAAAGBDQAAAAAAAoEFAAAAAAADgQUAAAAAAAIBDQAAAAAAAAERAAAAAAACgQ0AAAAAAAGBCQAAAAAAAYENAAAAAAABAQ0AAAAAAAOBBQAAAAAAAoEFAAAAAAABgQ0AAAAAAAGBCQAAAAAAAQENAAAAAAAAAQ0AAAAAAAEBCQAAAAAAA4EJAAAAAAADAQkAAAAAAAABDQAAAAAAA4EBAAAAAAABgQ0AAAAAAAIBCQAAAAAAAoEJAAAAAAAAA9D8AAAAAAOBCQAAAAAAAYEFAAAAAAAAgQkAAAAAAACBCQAAAAAAAAEBAAAAAAAAgREAAAAAAAIBAQAAAAAAAoEFAAAAAAABgQUAAAAAAAABCQAAAAAAAQENAAAAAAACgQUAAAAAAAABBQAAAAAAAQENAAAAAAACAQ0AAAAAAAMBCQAAAAAAAwEFAAAAAAACAQ0AAAAAAAKBCQAAAAAAAYENAAAAAAABAQ0AAAAAAAABDQAAAAAAAAEFAAAAAAADgQ0AAAAAAAEA4QAAAAAAAgENAAAAAAAAAKEAAAAAAAKBDQAAAAAAAgEJAAAAAAADAQkAAAAAAAIBBQAAAAAAAIERAAAAAAABAQ0AAAAAAAIBDQAAAAAAAwEJAAAAAAACAQkAAAAAAAMBDQAAAAAAAAEJAAAAAAADAQUAAAAAAAEBDQAAAAAAAoENAAAAAAAAAQ0AAAAAAAGBDQAAAAAAAoENAAAAAAACAQ0AAAAAAAOBDQAAAAAAAAENAAAAAAABgQ0AAAAAAAABCQAAAAAAAABNAAAAAAADgREAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA4D8AAAAAAADQPwAAAAAAAOA/AAAAAAAA4D8AAAAAAADQPwAAAAAAAAAAAAAAAAAAWUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[956]}},\"selected\":{\"id\":\"6748\"},\"selection_policy\":{\"id\":\"6749\"}},\"id\":\"6253\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"axis_label\":\"Time in 
ms\",\"formatter\":{\"id\":\"6744\"},\"ticker\":{\"id\":\"6272\"}},\"id\":\"6271\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"6738\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"6239\"},{\"id\":\"6240\"},{\"id\":\"6241\"},{\"id\":\"6242\"},{\"id\":\"6243\"},{\"id\":\"6244\"},{\"id\":\"6261\"}]},\"id\":\"6246\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"6272\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis\":{\"id\":\"6271\"},\"ticker\":null,\"visible\":false},\"id\":\"6274\",\"type\":\"Grid\"},{\"attributes\":{\"axis_label\":\"GPUMemoryUtilization-nodeid:algo-1_total\",\"formatter\":{\"id\":\"6742\"},\"ticker\":{\"id\":\"6276\"}},\"id\":\"6275\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"6276\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis\":{\"id\":\"6275\"},\"dimension\":1,\"ticker\":null,\"visible\":false},\"id\":\"6278\",\"type\":\"Grid\"},{\"attributes\":{\"args\":{\"div\":{\"id\":\"6181\"},\"s1\":{\"id\":\"6293\"}},\"code\":\"\\n console.log('Running CustomJS callback now.');\\n var inds = s1.selected.indices;\\n console.log(inds);\\n var line = \\\" Selected index range: [\\\" + Math.min.apply(Math,inds) + \\\",\\\" + Math.max.apply(Math,inds) + \\\"]\\\\n\\\";\\n console.log(line)\\n var text = div.text.concat(line);\\n var lines = text.split(\\\"\\\\n\\\")\\n if (lines.length > 35)\\n lines.shift();\\n div.text = 
lines.join(\\\"\\\\n\\\");\"},\"id\":\"6294\",\"type\":\"CustomJS\"},{\"attributes\":{},\"id\":\"6279\",\"type\":\"CrosshairTool\"},{\"attributes\":{\"dimensions\":\"width\",\"overlay\":{\"id\":\"6285\"}},\"id\":\"6280\",\"type\":\"BoxSelectTool\"},{\"attributes\":{},\"id\":\"6281\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"6282\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"6283\",\"type\":\"SaveTool\"},{\"attributes\":{\"dimensions\":\"width\"},\"id\":\"6284\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"6745\",\"type\":\"Selection\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"6279\"},{\"id\":\"6280\"},{\"id\":\"6281\"},{\"id\":\"6282\"},{\"id\":\"6283\"},{\"id\":\"6284\"},{\"id\":\"6301\"}]},\"id\":\"6286\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"6742\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"6744\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"6740\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"6285\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"dimensions\":\"width\"},\"id\":\"6244\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"line_color\":\"blue\",\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"6295\",\"type\":\"Line\"},{\"attributes\":{\"fill_alpha\":{\"value\":0},\"line_width\":{\"value\":0},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"6296\",\"type\":\"Circle\"},{\"attributes\":{\"data_source\":{\"id\":\"6293\"},\"glyph\":{\"id\":\"6295\"},\"hover_glyph\":null,\"muted_glyph\":null,\"view\":{\"id\":\"6298\"}},\"id\":\"6297\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"sou
rce\":{\"id\":\"6293\"}},\"id\":\"6298\",\"type\":\"CDSView\"},{\"attributes\":{\"data_source\":{\"id\":\"6293\"},\"glyph\":{\"id\":\"6296\"},\"hover_glyph\":null,\"muted_glyph\":null,\"view\":{\"id\":\"6300\"}},\"id\":\"6299\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"source\":{\"id\":\"6293\"}},\"id\":\"6300\",\"type\":\"CDSView\"},{\"attributes\":{\"callback\":null,\"renderers\":[{\"id\":\"6299\"}],\"tooltips\":[[\"index\",\"$index\"],[\"(x,y)\",\"($x, $y)\"]]},\"id\":\"6301\",\"type\":\"HoverTool\"},{\"attributes\":{\"children\":[{\"id\":\"6182\"},{\"id\":\"6223\"},{\"id\":\"6263\"}]},\"id\":\"6303\",\"type\":\"Column\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"6205\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"data_source\":{\"id\":\"6213\"},\"glyph\":{\"id\":\"6216\"},\"hover_glyph\":null,\"muted_glyph\":null,\"view\":{\"id\":\"6220\"}},\"id\":\"6219\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"text\":\"\"},\"id\":\"6728\",\"type\":\"Title\"},{\"attributes\":{\"text\":\"\"},\"id\":\"6730\",\"type\":\"Title\"},{\"attributes\":{\"text\":\"\"},\"id\":\"6732\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"6746\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"6748\",\"type\":\"Selection\"}],\"root_ids\":[\"6304\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", - " var render_items = [{\"docid\":\"2676cafa-7ab3-4f33-a5bc-59946ca4cafb\",\"notebook_comms_target\":\"6754\",\"root_ids\":[\"6304\"],\"roots\":{\"6304\":\"b9ffb79e-9caa-497c-938a-3932cd4888bb\"}}];\n", - " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", - "\n", - " }\n", - " if (root.Bokeh !== undefined) {\n", - " embed_document(root);\n", - " } else {\n", - " var attempts = 0;\n", - " var timer = 
setInterval(function(root) {\n", - " if (root.Bokeh !== undefined) {\n", - " clearInterval(timer);\n", - " embed_document(root);\n", - " } else {\n", - " attempts++;\n", - " if (attempts > 100) {\n", - " clearInterval(timer);\n", - " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", - " }\n", - " }\n", - " }, 10, root)\n", - " }\n", - "})(window);" - ], - "application/vnd.bokehjs_exec.v0+json": "" - }, - "metadata": { - "application/vnd.bokehjs_exec.v0+json": { - "id": "6304" - } - }, - "output_type": "display_data" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from smdebug.profiler.analysis.notebook_utils.timeline_charts import TimelineCharts\n", "\n", @@ -1158,7 +676,7 @@ "outputs": [], "source": [ "# Note change index range below with selected index range from above cell\n", - "view_timeline_charts.find_time_annotations([808, 831])" + "view_timeline_charts.find_time_annotations([400, 440])" ] }, { @@ -1175,7 +693,7 @@ "outputs": [], "source": [ "# Note change index range below with selected index range from above cell\n", - "view_timeline_charts.plot_detailed_profiler_data([808, 831])" + "view_timeline_charts.plot_detailed_profiler_data([400, 440])" ] }, { @@ -1191,385 +709,9 @@ }, { "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "data": { - "application/javascript": [ - "\n", - "(function(root) {\n", - " function now() {\n", - " return new Date();\n", - " }\n", - "\n", - " var force = true;\n", - "\n", - " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", - " root._bokeh_onload_callbacks = [];\n", - " root._bokeh_is_loading = undefined;\n", - " }\n", - "\n", - " var JS_MIME_TYPE = 'application/javascript';\n", - " var HTML_MIME_TYPE = 'text/html';\n", - " var EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", - " var CLASS_NAME = 'output_bokeh rendered_html';\n", - "\n", - " /**\n", - " * Render 
data to the DOM node\n", - " */\n", - " function render(props, node) {\n", - " var script = document.createElement(\"script\");\n", - " node.appendChild(script);\n", - " }\n", - "\n", - " /**\n", - " * Handle when an output is cleared or removed\n", - " */\n", - " function handleClearOutput(event, handle) {\n", - " var cell = handle.cell;\n", - "\n", - " var id = cell.output_area._bokeh_element_id;\n", - " var server_id = cell.output_area._bokeh_server_id;\n", - " // Clean up Bokeh references\n", - " if (id != null && id in Bokeh.index) {\n", - " Bokeh.index[id].model.document.clear();\n", - " delete Bokeh.index[id];\n", - " }\n", - "\n", - " if (server_id !== undefined) {\n", - " // Clean up Bokeh references\n", - " var cmd = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", - " cell.notebook.kernel.execute(cmd, {\n", - " iopub: {\n", - " output: function(msg) {\n", - " var id = msg.content.text.trim();\n", - " if (id in Bokeh.index) {\n", - " Bokeh.index[id].model.document.clear();\n", - " delete Bokeh.index[id];\n", - " }\n", - " }\n", - " }\n", - " });\n", - " // Destroy server and session\n", - " var cmd = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", - " cell.notebook.kernel.execute(cmd);\n", - " }\n", - " }\n", - "\n", - " /**\n", - " * Handle when a new output is added\n", - " */\n", - " function handleAddOutput(event, handle) {\n", - " var output_area = handle.output_area;\n", - " var output = handle.output;\n", - "\n", - " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", - " if ((output.output_type != \"display_data\") || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n", - " return\n", - " }\n", - "\n", - " var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", - "\n", - " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", - " toinsert[toinsert.length - 
1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", - " // store reference to embed id on output_area\n", - " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", - " }\n", - " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", - " var bk_div = document.createElement(\"div\");\n", - " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", - " var script_attrs = bk_div.children[0].attributes;\n", - " for (var i = 0; i < script_attrs.length; i++) {\n", - " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", - " toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n", - " }\n", - " // store reference to server id on output_area\n", - " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", - " }\n", - " }\n", - "\n", - " function register_renderer(events, OutputArea) {\n", - "\n", - " function append_mime(data, metadata, element) {\n", - " // create a DOM node to render to\n", - " var toinsert = this.create_output_subarea(\n", - " metadata,\n", - " CLASS_NAME,\n", - " EXEC_MIME_TYPE\n", - " );\n", - " this.keyboard_manager.register_events(toinsert);\n", - " // Render to node\n", - " var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", - " render(props, toinsert[toinsert.length - 1]);\n", - " element.append(toinsert);\n", - " return toinsert\n", - " }\n", - "\n", - " /* Handle when an output is cleared or removed */\n", - " events.on('clear_output.CodeCell', handleClearOutput);\n", - " events.on('delete.Cell', handleClearOutput);\n", - "\n", - " /* Handle when a new output is added */\n", - " events.on('output_added.OutputArea', handleAddOutput);\n", - "\n", - " /**\n", - " * Register the mime type and append_mime function with output_area\n", - " */\n", - " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", - " /* Is output safe? 
*/\n", - " safe: true,\n", - " /* Index of renderer in `output_area.display_order` */\n", - " index: 0\n", - " });\n", - " }\n", - "\n", - " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", - " if (root.Jupyter !== undefined) {\n", - " var events = require('base/js/events');\n", - " var OutputArea = require('notebook/js/outputarea').OutputArea;\n", - "\n", - " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", - " register_renderer(events, OutputArea);\n", - " }\n", - " }\n", - "\n", - " \n", - " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", - " root._bokeh_timeout = Date.now() + 5000;\n", - " root._bokeh_failed_load = false;\n", - " }\n", - "\n", - " var NB_LOAD_WARNING = {'data': {'text/html':\n", - " \"
\\n\"+\n", - " \"

\\n\"+\n", - " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", - " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", - " \"

\\n\"+\n", - " \"
    \\n\"+\n", - " \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n", - " \"
  • use INLINE resources instead, as so:
  • \\n\"+\n", - " \"
\\n\"+\n", - " \"\\n\"+\n", - " \"from bokeh.resources import INLINE\\n\"+\n", - " \"output_notebook(resources=INLINE)\\n\"+\n", - " \"\\n\"+\n", - " \"
\"}};\n", - "\n", - " function display_loaded() {\n", - " var el = document.getElementById(null);\n", - " if (el != null) {\n", - " el.textContent = \"BokehJS is loading...\";\n", - " }\n", - " if (root.Bokeh !== undefined) {\n", - " if (el != null) {\n", - " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", - " }\n", - " } else if (Date.now() < root._bokeh_timeout) {\n", - " setTimeout(display_loaded, 100)\n", - " }\n", - " }\n", - "\n", - "\n", - " function run_callbacks() {\n", - " try {\n", - " root._bokeh_onload_callbacks.forEach(function(callback) {\n", - " if (callback != null)\n", - " callback();\n", - " });\n", - " } finally {\n", - " delete root._bokeh_onload_callbacks\n", - " }\n", - " console.debug(\"Bokeh: all callbacks have finished\");\n", - " }\n", - "\n", - " function load_libs(css_urls, js_urls, callback) {\n", - " if (css_urls == null) css_urls = [];\n", - " if (js_urls == null) js_urls = [];\n", - "\n", - " root._bokeh_onload_callbacks.push(callback);\n", - " if (root._bokeh_is_loading > 0) {\n", - " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", - " return null;\n", - " }\n", - " if (js_urls == null || js_urls.length === 0) {\n", - " run_callbacks();\n", - " return null;\n", - " }\n", - " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", - " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", - "\n", - " function on_load() {\n", - " root._bokeh_is_loading--;\n", - " if (root._bokeh_is_loading === 0) {\n", - " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", - " run_callbacks()\n", - " }\n", - " }\n", - "\n", - " function on_error() {\n", - " console.error(\"failed to load \" + url);\n", - " }\n", - "\n", - " for (var i = 0; i < css_urls.length; i++) {\n", - " var url = css_urls[i];\n", - " const element = document.createElement(\"link\");\n", - " element.onload = on_load;\n", - " element.onerror = 
on_error;\n", - " element.rel = \"stylesheet\";\n", - " element.type = \"text/css\";\n", - " element.href = url;\n", - " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", - " document.body.appendChild(element);\n", - " }\n", - "\n", - " const hashes = {\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js\": \"T2yuo9Oe71Cz/I4X9Ac5+gpEa5a8PpJCDlqKYO0CfAuEszu1JrXLl8YugMqYe3sM\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.3.min.js\": \"98GDGJ0kOMCUMUePhksaQ/GYgB3+NH9h996V88sh3aOiUNX3N+fLXAtry6xctSZ6\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.3.min.js\": \"89bArO+nlbP3sgakeHjCo1JYxYR5wufVgA3IbUvDY+K7w4zyxJqssu7wVnfeKCq8\"};\n", - "\n", - " for (var i = 0; i < js_urls.length; i++) {\n", - " var url = js_urls[i];\n", - " var element = document.createElement('script');\n", - " element.onload = on_load;\n", - " element.onerror = on_error;\n", - " element.async = false;\n", - " element.src = url;\n", - " if (url in hashes) {\n", - " element.crossOrigin = \"anonymous\";\n", - " element.integrity = \"sha384-\" + hashes[url];\n", - " }\n", - " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", - " document.head.appendChild(element);\n", - " }\n", - " };\n", - "\n", - " function inject_raw_css(css) {\n", - " const element = document.createElement(\"style\");\n", - " element.appendChild(document.createTextNode(css));\n", - " document.body.appendChild(element);\n", - " }\n", - "\n", - " \n", - " var js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.3.min.js\"];\n", - " var css_urls = [];\n", - " \n", - "\n", - " var inline_js = [\n", - " function(Bokeh) {\n", - " Bokeh.set_log_level(\"info\");\n", - " },\n", - " function(Bokeh) {\n", - " \n", - " \n", - " }\n", - " ];\n", - "\n", - " function run_inline_js() {\n", - " \n", - " if (root.Bokeh !== 
undefined || force === true) {\n", - " \n", - " for (var i = 0; i < inline_js.length; i++) {\n", - " inline_js[i].call(root, root.Bokeh);\n", - " }\n", - " } else if (Date.now() < root._bokeh_timeout) {\n", - " setTimeout(run_inline_js, 100);\n", - " } else if (!root._bokeh_failed_load) {\n", - " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", - " root._bokeh_failed_load = true;\n", - " } else if (force !== true) {\n", - " var cell = $(document.getElementById(null)).parents('.cell').data().cell;\n", - " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", - " }\n", - "\n", - " }\n", - "\n", - " if (root._bokeh_is_loading === 0) {\n", - " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", - " run_inline_js();\n", - " } else {\n", - " load_libs(css_urls, js_urls, function() {\n", - " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", - " run_inline_js();\n", - " });\n", - " }\n", - "}(window));" - ], - "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n function now() {\n return new Date();\n }\n\n var force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n \n\n \n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n var NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"
    \\n\"+\n \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n \"
  • use INLINE resources instead, as so:
  • \\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n var el = document.getElementById(null);\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error() {\n console.error(\"failed to load \" + url);\n }\n\n for (var i = 0; i < css_urls.length; i++) {\n var url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error;\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n const hashes = {\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js\": \"T2yuo9Oe71Cz/I4X9Ac5+gpEa5a8PpJCDlqKYO0CfAuEszu1JrXLl8YugMqYe3sM\", 
\"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.3.min.js\": \"98GDGJ0kOMCUMUePhksaQ/GYgB3+NH9h996V88sh3aOiUNX3N+fLXAtry6xctSZ6\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.3.min.js\": \"89bArO+nlbP3sgakeHjCo1JYxYR5wufVgA3IbUvDY+K7w4zyxJqssu7wVnfeKCq8\"};\n\n for (var i = 0; i < js_urls.length; i++) {\n var url = js_urls[i];\n var element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n if (url in hashes) {\n element.crossOrigin = \"anonymous\";\n element.integrity = \"sha384-\" + hashes[url];\n }\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n \n var js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.3.min.js\"];\n var css_urls = [];\n \n\n var inline_js = [\n function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\n function(Bokeh) {\n \n \n }\n ];\n\n function run_inline_js() {\n \n if (root.Bokeh !== undefined || force === true) {\n \n for (var i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n var cell = $(document.getElementById(null)).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n 
run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2021-04-29 07:03:34.927 ip-172-16-71-4:29196 INFO metrics_reader_base.py:134] Getting 9 event files\n", - "select events:['.*']\n", - "select dimensions:['.*CPU', '.*GPU', '.*Memory']\n", - "filtered_events:{'cpu23', 'cpu20', 'cpu19', 'cpu26', 'cpu18', 'cpu2', 'WriteThroughputInBytesPerSecond', 'cpu8', 'gpu3', 'cpu10', 'cpu16', 'ReceiveBytesPerSecond', 'TransmitBytesPerSecond', 'cpu17', 'gpu1', 'cpu13', 'cpu29', 'cpu3', 'IOPS', 'cpu22', 'MemoryUsedPercent', 'cpu11', 'cpu12', 'cpu25', 'cpu24', 'cpu1', 'cpu28', 'cpu9', 'gpu2', 'gpu0', 'cpu21', 'cpu27', 'ReadThroughputInBytesPerSecond', 'total', 'cpu5', 'cpu15', 'cpu4', 'cpu30', 'cpu7', 'cpu31', 'cpu14', 'cpu6', 'cpu0'}\n", - "filtered_dimensions:{'CPUUtilization-nodeid:algo-1', 'GPUUtilization-nodeid:algo-1', 'GPUMemoryUtilization-nodeid:algo-1'}\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "BokehUserWarning: ColumnDataSource's columns must be of the same length. Current lengths: ('dh', 44), ('dw', 44), ('image', 43), ('metric', 43), ('x', 44), ('y', 44)\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "(function(root) {\n", - " function embed_document(root) {\n", - " \n", - " var docs_json = {\"b4a0e15d-9c39-4537-8e81-b2747bae2095\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"11036\"}],\"center\":[{\"id\":\"11039\"},{\"id\":\"11043\"}],\"left\":[{\"id\":\"11040\"}],\"plot_height\":450,\"plot_width\":1000,\"renderers\":[{\"id\":\"11057\"}],\"title\":{\"id\":\"11968\"},\"toolbar\":{\"id\":\"11048\"},\"x_range\":{\"id\":\"11028\"},\"x_scale\":{\"id\":\"11032\"},\"y_range\":{\"id\":\"11030\"},\"y_scale\":{\"id\":\"11034\"}},\"id\":\"11027\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"11032\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"11973\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"end\":44},\"id\":\"11030\",\"type\":\"Range1d\"},{\"attributes\":{\"data\":{\"dh\":[1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3],\"dw\":[955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955,955],\"image\":[{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"_
_ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"AAAAAAAAAAAAAAAAAAAAQAAAAAAAABBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAArkfhehSuEEAAAAAAAAAkQAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAEAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAexSuR+F6GEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAEBSuB6F61EQQAAAAAAAABhAUrgehetRAEBSuB6F61EQQAAAAAAAAABAUrgehetREECkcD0K16MAQAAAAAAAABBAXI/C9ShcH0BSuB6F61EAQAAAAAAAABBAAAAAAAAAAEBcj8L1KFwfQAAAAAAAAAAAUrgehetRAEAAAAAAAAAAAAAAAAAAAABAAAAAAAAAEEAAAAAAAAAAAAAAAAAAAAAAXI/C9ShcD0BSuB6F61EAQFK4HoXrUQBAAAAAAAAAAACkcD0K16MAQAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAFyPwvUoXA9AFK5H4XoUO0CamZmZmZkjQAAAAAAAABBAAAAAAAAAGUBSuB6F61EAQD0K16NwfTlAUrgehetRAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEEAAAAAAAAAQQAAAAAAAABhAAAAAAAAAAEBSuB6F61EAQAAAAAAAABhAAAAAAAAAGEBSuB6F61EQQAAAAAAAAABAZmZmZmZmJEBSuB6F61EgQAAAAAAAABhAAAAAAAAAAABcj8L1KFwPQIXrUbgehRdAUrgehetREEAAAAAAAAAAQAAAAAAAABBAAAAAAAAAEEAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAYQAAAAAAAABBAFK5H4XqULEApXI/C9SgtQAAAAAAAACBAAAAAAAAAAABI4XoUrsctQAAAAAAAAAAAAAAAAAAAAABcj8L1KFwBQFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAArkfhehSuEEAAAAAAAAAAAEjhehSuxzFAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAQAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABSuB6F61EQQAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAUrgehetRAEAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAK16NwPQoDQD0K16NwPURAAAAAAADAQkAK16NwPYopQMP1KFyPwhFAUrgehetRAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABSuB6F61EAQAAAAAAAAAAAmpmZmZmZMkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEApXI/C9aggQK5H4XoUrhBAAAAAAAAAAAA9CtejcH1IQAAAAAAAAFlAhetRuB4FWECuR+F6FG4+QClcj8L1KC1A7FG4HoWrMEBI4XoUrschQLgehetROCZAAAAAAAAAGUDsUbgehaswQClcj8L1qCBAAAAAAAAAAAAAAAAAAAAAAKRwPQrXowBASOF6FK5HHUBmZmZmZmY3QGZmZmZmJjNAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACtejcD2KKUA9CtejcL01QAAAAAAAACRAAAAAAAAAAAAAAAAAAAAAAFK4HoXrUQBAhetRuB6FF0BSuB6F61EAQFK4HoXrUQBAmpmZmZmZIkAAAAAAAAA4QAAAAAAAAABAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPQrXo3BdQkAAAAAAAABZQAAAAAAAAFlAAAAAAAAARkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBAXI/C9Shc/z8AAAAAAAAAAEjhehSuRyVA16NwPQrXJEBmZmZmZmYhQK5H4XoUrhBAhetRuB6FF0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAgQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAABBApHA9CtejAEApXI/C9aggQDMzMzMzczZAw/UoXI9CPEDD9Shcj4JIQAAAAAAAAAAAUrgehetRAEAAAAAAAAAAAIXrUbgeBSFAAAAAAAAAAAAAAAAAAAAAAPYoXI/CdStAUrgehetRAEAK16NwPQoRQAAAAAAAABBAAAAAAAAALEAAAAAAAAAAAAAAAAAAAAAAXI/C9ShcD0AAAAAAAAAAAFK4HoXrURBAAAAAAAAAAEB7FK5H4XooQAAAAAAAABlAAAAAAAAAAEAAAAAAAAAAQKRwPQrXowBArkfhehQuIkCuR+F6FK4QQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEEAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAEBSuB6F61EAQIXrUbgehRdAAAAAAAAAAABSuB6F61EAQAAAAAAAAAAAAAAAAAAAGEAfhetRuF4yQArXo3A9CgFAAAAAAAAAAEBcj8L1KFwPQAAAAAAAAClAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEjhehSuhzdAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBAAAAAAAAANEAAAAAAAAAAQAAAAAAAADhAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADhAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAEBcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFwPQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAgQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAUrgehetREEAAAAAAAAAAAFK4HoXrUQBAhetRuB6FJ0BSuB6F61EQQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAMzMzMzNzRkApXI/C9Wg0QAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9ShcD0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAJEAAAAAAAAAAAFK4HoXrUQBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXA9AAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAFK5H4XoUF0AAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAADD9Shcj8IeQGZmZmZmZiRAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/P1K4HoXrUQBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAQFyPwvUoXP8/AAAAAAAAAEAAAAAAAAAAAK5H4XoUrhBAAAAAAAAAMkBI4XoUrkclQPYoXI/CdStAAAAAAAAAAAAUrkfhehQnQAAAAAAAACxArkfhehSuEEBSuB6F61EQQAAAAAAAABBAAAAAAAAAAEBSuB6F65E1QK5H4XoUrhBAZmZmZmamMUAAAAAAAAAAAEjhehSuRyVAexSuR+F6GEAAAAAAAAA0QGZmZmZmZiRAFK5H4XpUMEDXo3A9Chc6QGZmZmZmZjdAXI/C9Shc/z8AAAAAAAAAABSuR+F6lCxAAAAAAAAAWUAAAAAAAAA+QAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAhetRuB6FF0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/pHA9CtejAEBSuB6F61EAQAAAAAAAACRAexSuR+F6KEDsUbgehaswQArXo3A9CgFAAAAAAAAAAAAAAAAAAAAAQFK4HoXrUQBAUrgehetREEBSuB6F61EgQFK4HoXrUQBAXI/C9ShcH0DXo3A9CtckQAAAAAAAADpACtejcD2qQEBSuB6F65E1QKRwPQrXYzFAAAAAAAAAKEAAAAAAAAA4QFK4HoXrUSBAAAAAAAAAAAAAAAAAAAAwQJqZmZmZmSNAUrgehetREEAAAAAAAAAwQNejcD0K1yRAAAAAAAAAMkAUrkfhetQ0QFK4HoXrUSBAAAAAAAAAIEB7FK5H4XoYQAAAAAAAABBAFK5H4XpUMEAAAAAAAAAAAHsUrkfheihAhetRuB4FIUAAAAAAAAAkQBSuR+F6FDtAAAAAAAAAMkDXo3A9CtckQClcj8L1aDRAFK5H4XpUMEApXI/C9SgtQArXo3A9yj1AAAAAAAAAJEB7FK5H4XooQK5H4XoUrhBAAAAAAAAAAEAK16NwPQoBQHsUrkfhehhAFK5H4XpUMEBSuB6F61EgQAAAAAAAABlAKVyPwvWoIEAAAAAAAAAwQJqZmZmZmSNAhetRuB6FJ0AUrkfhelQwQAAAAAAAAChAUrgehetRIEAAAAAAAAAAAAAAAAAAADRAexSuR+F6GEAAAAAAAMAyQAAAAAAAwDJAH4XrUbheMkAUrkfhepQsQAAAAAAAAEJAXI/C9ShcAUBSuB6F61EgQHsUrkfhehhAUrgehetRIEAfhetRuF4yQAAAAAAAAClAKVyPwvWoIEAAAAAAAAAZQAAAAAAAABBAKVyPwvWoIECF61G4HoUnQAAAAAAAwDJAAAAAAAAANECF61G4HoUZQArXo3A9CgFAexSuR+F6GEAAAAAAAAAoQAAAAAAAAAAAPQrXo3B9OUAAAAAAAAApQArXo3A9iilA7FG4HoWrMEBmZmZmZmYkQAAAAAAAACRACtejcD0KAUCkcD0K16MAQAAAAAAAACBAMzMzMzNzNkB7FK5H4XooQFK4HoXrUSBASOF6FK6HOUAAAAAAAAAZQNejcD0K1yRAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAADBAexSuR+F6GEBSuB6F61EQQAAAAAAAAAAAAAAAAAAAAAA9CtejcD0jQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAyQAAAAAAAAAAAAAAAAAAAAABcj8L1KFwPQAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHsUrkfhehhAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQQFyPwvUoXB9AAAAAAAAAAAAAAAAAAAAAQFyPwvUoXA9AexSuR+F6G
EAAAAAAAAAAQAAAAAAAABBAAAAAAAAAIEAAAAAAAAAYQAAAAAAAAAAArkfhehSuEEAAAAAAAAAgQFK4HoXrURBAUrgehetRAEBSuB6F61EQQFyPwvUoXA9AAAAAAAAAAEBcj8L1KJwzQAAAAAAAAAAAXI/C9ShcD0AAAAAAAAAAAFyPwvUoXB9AAAAAAAAAAAAAAAAAAAAAQFyPwvUoXA9AAAAAAAAAAAAAAAAAAAAAAAAAAAAAABlAAAAAAAAAAEBSuB6F61EQQAAAAAAAAABAXI/C9Shc/z+F61G4HoUXQAAAAAAAAAAAUrgehetRAEBSuB6F61EAQFK4HoXrURBAAAAAAAAAEEBSuB6F61EQQFK4HoXrUQBAAAAAAAAAEECkcD0K16MAQJqZmZmZmSNAUrgehetREEAAAAAAAAAAAAAAAAAAABhAXI/C9ShcH0BSuB6F61EQQGZmZmZmJjVAAAAAAAAAGECkcD0K16MAQAAAAAAAACBAAAAAAAAAAEAAAAAAAAAgQAAAAAAAAABAAAAAAAAAAAAAAAAAAAAYQFyPwvUoXB9AUrgehetRAECF61G4HoUXQFK4HoXrUQBAUrgehetRAEAAAAAAAAAAAAAAAAAAAABAexSuR+F6GECF61G4HgUxQAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAFK4HoXrURBAAAAAAAAAAAAK16NwPQoDQAAAAAAAAAAAAAAAAAAAAACkcD0K16MAQBSuR+F6FBpApHA9CtejAkDD9Shcj8IRQKRwPQrXowJAZmZmZmZmIUDsUbgehaswQD0K16NwvTVArkfhehSuGkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEjhehSuRyVAXI/C9ShcH0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQFK4HoXrkTNAFK5H4XrUNECPwvUoXG9OQIXrUbgehRlAexSuR+F6GEApXI/C9agqQLgehetRODZAZmZmZmZmN0BI4XoUrkcbQAAAAAAAAAAASOF6FK7HIUCPwvUoXI8cQFK4HoXrURBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAwQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFK5H4XoUKkB7FK5H4XoYQKRwPQrXowBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAEEBI4XoUrkclQAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQQAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAABBAAAAAAAAAAAAAAAAAAAAAAIXrUbgehSdAAAAAAAAAIEBSuB6F61EQQHsUrkfhehhAAAAAAAAAEEAK16NwPQoBQClcj8L1qCBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXA9AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKRwPQrXowBAXI/C9Shc/z8K16NwPQoTQGZmZmZmZiRAhetRuB4FMUCF61G4HgUhQFK4HoXrUQBAUrgehetRAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIECkcD0K16MAQAAAAAAAAAAAAAAAAAAAAACuR+F6FK4QQAAAAAAAAABAZmZmZmYmNUBI4XoUrkclQKRwPQrXowBAKVyPwvWoIEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAIXrUbgeBTFAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAUrgehetRIEBSuB6F61EAQAAAAAAAABBApHA9CtejAEAAAAAAAAAAAAAAAAAAwDJAAAAAAAAAEEBSuB6F61EAQEjhehSuRytASOF6FK5HG0BI4XoUrschQBSuR+F6lCxASOF6FK5HNUAAAAAAAAAQQIXrUbgepUFAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAK16NwPQoTQK5H4XoUrhpAcT0K16NwOECkcD0K16MAQKRwPQrXowJAXI/C9ShcAUAAAAAAAAAAAHsUrkfhehhAAAAAAAAAAABSuB6F61EAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXA9AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAApXI/C9aggQIXrUbgeBSFAAAAAAAAAAAAAAAAAAABFQAAAAAAAAAAAAAAAAAAAAADsUbgehaswQEjhehSuRyVAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAABAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAClcj8L1aDRAAAAAAAAAEEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXA9AUrgeheuRNUAAAAAAAAAAAAAAAAAAADpAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAW
UAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAA
AAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAW
UAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAACAWEAAAAAAAAAAAKRwPQrX40JAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAEAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAA=\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"flo
at64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"rkfhehSuEEAAAAAAAAAkQDMzMzMzczZA4XoUrkdhL0CkcD0K1xNXQIXrUbgeJVBAKVyPwvUoLUApXI/C9WhNQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAYQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFwPQAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAzczMzMzMDkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEEAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAYQAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXA9AAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAXI/C9ShcD0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEEBSuB6F61EQQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEB7FK5H4XoYQClcj8L1qCBAAAAAAAAAAACkcD0K16MAQAAAAAAAACBAAAAAAAAAGEApXI/C9aggQAAAAAAAAAAAAAAAAAAAAEAAAAAAAAA0QAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAhetRuB6FF0AAAAAAAAAAAKRwPQrXowBAAAAAAAAAAABSuB6F61EAQAAAAAAAAAAAexSuR+F6GEAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAUrgehetRIEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACtejcD2KKUAK16NwPQoTQDMzMzMzMxJACtejcD0KA0BSuB6F61EQQKRwPQrXowBACtejcD0KAUAAAAAAAAAAAIXrUbgehRlAAAAAAAAAAAAAAAAAAAAYQAAAAAAAAAAAAAAAAAAAAAAfhetRuB4vQD0K16NwvSVArkfhehTuNEC4HoXrUTgmQMP1KFyPwgFAAAAAAAAAAAAAAAAAAAAAAArXo3A9ChFAXI/C9Shc/z+uR+F6FK4QQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFK4HoXrUQBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADD9Shcj8IBQAAAAAAAAAAApHA9CtejAkAK16NwPQoRQKRwPQrXYzFAH4XrUbgeL0AAAAAAAAAAAAAAAAAAAAAAUrgehetRAEAAAAAAAAAQQAAAAAAAAAAACtejcD0KAUAAAAAAAAAAAIXrUbgeBSFAFK5H4XqULEBSuB6F61EQQNejcD0K1yRAhetRuB6FF0BSuB6F61EAQAAAAAAAAABAhetRuB6FGUAAAAAAAAAYQAAAAAAAAAAAUrgehetREEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAApXI/C9SgCQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACkcD0K11NQQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAPQrXo3B9OUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9ShcD0AK16NwPQoRQAAAAAAAABhAKVyPwvWoIEDsUbgehaswQFK4HoXrUQBASOF6FK5HJUAAAAAAAAAAAAAAAAAAAChAAAAAAAAAEEBSuB6F61EAQAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAACBAAAAAAAAAAEAK16NwPQoRQIXrUbgehRlAKVyPwvWoIEAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAQAAAAAAAAABAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAGEAAAAAAAAAkQD0K16NwvSVAAAAAAAAAAEBSuB6F61EAQAAAAAAAABBAexSuR+F6GEAK16NwPYopQNejcD0KFzpAFK5H4Xp0REAAAAAAAAAAAMP1KFyPwgFAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAACkcD0K16MAQAAAAAAAAAAAUrgehetREEAAAAAAAAAAAJqZmZmZmSNAAAAAAAAAAACkcD0K16MAQHsUrkfheihAhetRuB4FIUApXI/C9WhEQK5H4XoUrhBAAAAAAADAMkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAABhAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAYQAAAAAAAABBAUrgehetRAEDXo3A9CtckQArXo3A9CgFAAAAAAAAANEAUrkfhehQqQHsUrkfhehhAAAAAAAAAAABcj8L1KFz/P1K4HoXrUQBAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAABAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGZmZmZmJjNAXI/C9ShcD0AAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAQAAAAAAAABBAAAAAAAAANEAAAAAAAAAAAEjhehSuhzdAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADhAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAzczMzMzMDkAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAADMzMzMzU0BAAAAAAAAAQUAAAAAAAAAAAJqZmZmZmSNAXI/C9Shc/z8AAAAAAAAAAFyPwvUoXP8/AAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABZQMP1KFyPglhAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAgFhAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAw/UoXI+CWEAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAw/UoXI+CWEAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAw/UoXI+CWEAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAw/UoXI+CWEAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAIBYQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAIBYQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUDD9Shcj4JYQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUDD9Shcj4JYQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQMP1KFyPglhAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAgFhAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9S
hcD0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAQFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAQFyPwvUoXB9AAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAApQEjhehSuJ1VAPQrXo3ANVUC4HoXrUUhWQAAAAAAAADhAAAAAAAAAAAAAAAAAAAAAAFK4HoXrURBAAAAAAAAAEEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAADZAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFlAAAAAAAAAWUAAAAAAAABZQAAAAAAAAFNAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z9cj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFwPQAAAAAAAAAAAXI/C9ShcD0AAAAAAAAAAAAAAAAAAAAAApHA9CtejAEBSuB6F61EAQAAAAAAAAABAAAAAAAAAAABSuB6F61EAQP
YoXI/CtUFAKVyPwvVoNEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z97FK5H4XoYQDMzMzMzczZAAAAAAACAUkAAAAAAAAAYQBSuR+F61DRAAAAAAAAAR0DsUbgehSs9QAAAAAAAAEVAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFK4HoXrURBAhetRuB6FF0B7FK5H4XooQClcj8L1aDRA7FG4HoUrPUAAAAAAAAAAAAAAAAAAACxAAAAAAABAP0BSuB6F61EgQEjhehSuRyVAAAAAAAAAJEAAAAAAAAAZQD0K16NwfThAAAAAAAAAQEC4HoXrUVhBQAAAAAAAAClA9ihcj8J1K0B7FK5H4XooQFK4HoXrUQBAAAAAAAAAKEAAAAAAAAAAAHsUrkfhehhASOF6FK5HJUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAexSuR+F6KEBmZmZmZmYkQFK4HoXrUSBAAAAAAAAAAEAAAAAAAAAQQHsUrkfhehhAuB6F61FYQUCuR+F6FK4aQBSuR+F6lCxAAAAAAAAAQEDsUbgehaswQEjhehSuRyVA16NwPQrXJEAUrkfhetQ0QOxRuB6F6zZAexSuR+F6KECuR+F6FK4QQAAAAAAAAAAAAAAAAAAAMkCuR+F6FK4QQEjhehSuRyVAAAAAAAAAGUB7FK5H4XoYQBSuR+F6lCxAFK5H4XpUMEDD9Shcj4JIQClcj8L1KC1AUrgehetRAEBI4XoUrsctQOxRuB6FqzBAhetRuB6FJ0DsUbgehes2QAAAAAAAADlAMzMzMzNzNkAK16NwPYopQAAAAAAAACxAAAAAAADAMkAAAAAAAAAAQAAAAAAAABlAAAAAAAAAAAAUrkfhelQwQBSuR+F6VDBAAAAAAAAAOEApXI/C9aggQAAAAAAAADhAZmZmZmZmJEBmZmZmZiYzQAAAAAAAACBAAAAAAAAAKUDsUbgehaswQGZmZmZmZiRAFK5H4XpUMEB7FK5H4XooQAAAAAAAAAAAAAAAAAAAIEAAAAAAAAA0QFK4HoXrURBA16NwPQrXJECF61G4HoUnQPYoXI/CdStAAAAAAAAAMEBSuB6F61EQQBSuR+F6VDBAFK5H4XqULEBcj8L1KJw+QAAAAAAAACxArkfhehSuEECkcD0K16MAQFK4HoXrUSBAUrgehetRIEBSuB6F61EQQFK4HoXrUQBAmpmZmZmZI0CuR+F6FK4QQGZmZmZmZiRAAAAAAAAALEBSuB6F61EQQAAAAAAAABBAZmZmZmamREAAAAAAAAApQClcj8L1KC1AUrgehetREECF61G4HoUZQBSuR+F6VDBA9ihcj8IVQkAAAAAAAAA8QClcj8L1KC1AMzMzMzNzNkBSuB6F61EgQOxRuB6FqzBAFK5H4XqULEAAAAAAAAApQBSuR+F61DRAAAAAAAAAAAAK16NwPQoBQAAAAAAAACBAAAAAAAAANEAAAAAAAAAAAAAAAAAAAChAAAAAAAAAOUCuR+F6FK4QQAAAAAAAAChAAAAAAAAALEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z9cj8L1KFz/P4XrUbgehRdAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAXI/C9ShcD0AAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAAAAAAAAAAAAFyPwvUoXP8/AAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFz/PwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAXI/C9Shc/z8AAAAAAAAAAFyPwvUoXP8/AAAAAAAAAAA=\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarraydtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarraywAAAAAAAABAAAAAAAAA/D8AAAAAAADoPwwwAAAAAAANA/AAAAAAAA4D8AAAAAAADoPwAAAAAAANA/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA0D8AAAAAAADgPwAAAAAAANA/AAAAAAAA0D8AAAAAAADQPwAAAAAAAPg/AAAAAAAA0D8AAAAAACBKQAAAAAAAAAAAAAAAAABAQEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAhAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAPkAAAAAAAIBDQAAAAAAA4ENAAAAAAADgQ0AAAAAAAKBCQAAAAAAAAEJAAAAAAACAQ0AAAAAAAIBDQAAAAAAA4EFAAAAAAADgQ0AAAAAAAKBCQAAAAAAAgEFAAAAAAACgQ0AAAAAAAOBCQAAAAAAAAEJAAAAAAABAQ0AAAAAAAABEQAAAAAAAgENAAAAAAADAQ0AAAAAAAEBCQAAAAAAAYEFAAAAAAAAAQEAAAAAAAOBDQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGBAQAAAAAAAwDhAAAAAAACgQUAAAAAAAEBBQAAAAAAAIEJAAAAAAABgQ0AAAAAAAGBCQAAAAAAAQENAAAAAAADgQ0AAAAAAAMBDQAAAAAAAgENAAAAAAACAQ0AAAAAAAKBBQAAAAAAAAEJAAAAAAACgQkAAAAAAAOBDQAAAAAAAoENAAAAAAADAQkAAAAAAAABCQAAAAAAAoEBAAAAAAACgQ0AAAAAAAKBDQAAAAAAAoEJAAAAAAADAQ0AAAAAAAABCQAAAAAAAAERAAAAAAAAAMEAAAAAAAG
BDQAAAAAAAoEFAAAAAAADgQUAAAAAAAIBDQAAAAAAAAERAAAAAAACgQ0AAAAAAAGBCQAAAAAAAYENAAAAAAABAQ0AAAAAAAOBBQAAAAAAAoEFAAAAAAABgQ0AAAAAAAGBCQAAAAAAAQENAAAAAAAAAQ0AAAAAAAEBCQAAAAAAA4EJAAAAAAADAQkAAAAAAAABDQAAAAAAA4EBAAAAAAABgQ0AAAAAAAIBCQAAAAAAAoEJAAAAAAAAA9D8AAAAAAOBCQAAAAAAAYEFAAAAAAAAgQkAAAAAAACBCQAAAAAAAAEBAAAAAAAAgREAAAAAAAIBAQAAAAAAAoEFAAAAAAABgQUAAAAAAAABCQAAAAAAAQENAAAAAAACgQUAAAAAAAABBQAAAAAAAQENAAAAAAACAQ0AAAAAAAMBCQAAAAAAAwEFAAAAAAACAQ0AAAAAAAKBCQAAAAAAAYENAAAAAAABAQ0AAAAAAAABDQAAAAAAAAEFAAAAAAADgQ0AAAAAAAEA4QAAAAAAAgENAAAAAAAAAKEAAAAAAAKBDQAAAAAAAgEJAAAAAAADAQkAAAAAAAIBBQAAAAAAAIERAAAAAAABAQ0AAAAAAAIBDQAAAAAAAwEJAAAAAAACAQkAAAAAAAMBDQAAAAAAAAEJAAAAAAADAQUAAAAAAAEBDQAAAAAAAoENAAAAAAAAAQ0AAAAAAAGBDQAAAAAAAoENAAAAAAACAQ0AAAAAAAOBDQAAAAAAAAENAAAAAAABgQ0AAAAAAAABCQAAAAAAAABNAAAAAAADgREAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA4D8AAAAAAADQPwAAAAAAAOA/AAAAAAAA4D8AAAAAAADQPwdtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarraydtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarraydtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarraydtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]},{\"__ndarray__\":\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[1,955]}],\"metric\":[\"CPUUtilization-nodeid:algo-1_cpu23\",\"CPUUtilization-nodeid:algo-1_cpu20\",\"CPUUtilization-nodeid:algo-1_cpu19\",\"CPUUtilization-nodeid:algo-1_cpu26\",\"CPUUtilization-nodeid:algo-1_cpu18\",\"CPUUtilization-nodeid:algo-1_cpu2\",\"CPUUtilization-nodeid:algo-1_cpu8\",\"CPUUtilization-nodeid:algo-1_cpu10\",\"CPUUtilization-nodeid:algo-1_cpu16\",\"CPUUtilization-nodeid:algo-1_cpu17\",\"CPUUtilization-nodeid:algo-1_cpu13\",\"CPUUtilization-nodeid:algo-1_cpu29\",\"CPUUtilization-nodeid:algo-1_cpu3\",\"CPUUtilization-nodeid:algo-1_cpu22\",\"CPUUtilization-nodeid:algo-1_cpu11\",\"CPUUtilization-nodeid:algo-1_cpu12\",\"CPUUtilization-nodeid:algo-1_cpu
25\",\"CPUUtilization-nodeid:algo-1_cpu24\",\"CPUUtilization-nodeid:algo-1_cpu1\",\"CPUUtilization-nodeid:algo-1_cpu28\",\"CPUUtilization-nodeid:algo-1_cpu9\",\"CPUUtilization-nodeid:algo-1_cpu21\",\"CPUUtilization-nodeid:algo-1_cpu27\",\"CPUUtilization-nodeid:algo-1_total\",\"CPUUtilization-nodeid:algo-1_cpu5\",\"CPUUtilization-nodeid:algo-1_cpu15\",\"CPUUtilization-nodeid:algo-1_cpu4\",\"CPUUtilization-nodeid:algo-1_cpu30\",\"CPUUtilization-nodeid:algo-1_cpu7\",\"CPUUtilization-nodeid:algo-1_cpu31\",\"CPUUtilization-nodeid:algo-1_cpu14\",\"CPUUtilization-nodeid:algo-1_cpu6\",\"CPUUtilization-nodeid:algo-1_cpu0\",\"GPUUtilization-nodeid:algo-1_gpu3\",\"GPUUtilization-nodeid:algo-1_gpu1\",\"GPUUtilization-nodeid:algo-1_gpu2\",\"GPUUtilization-nodeid:algo-1_gpu0\",\"GPUUtilization-nodeid:algo-1_total\",\"GPUMemoryUtilization-nodeid:algo-1_gpu3\",\"GPUMemoryUtilization-nodeid:algo-1_gpu1\",\"GPUMemoryUtilization-nodeid:algo-1_gpu2\",\"GPUMemoryUtilization-nodeid:algo-1_gpu0\",\"GPUMemoryUtilization-nodeid:algo-1_total\"],\"x\":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],\"y\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43]},\"selected\":{\"id\":\"11972\"},\"selection_policy\":{\"id\":\"11973\"}},\"id\":\"11055\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"formatter\":{\"id\":\"11969\"},\"major_label_overrides\":{\"1\":\"CPUUtilization-nodeid:algo-1_cpu23\",\"10\":\"CPUUtilization-nodeid:algo-1_cpu17\",\"11\":\"CPUUtilization-nodeid:algo-1_cpu13\",\"12\":\"CPUUtilization-nodeid:algo-1_cpu29\",\"13\":\"CPUUtilization-nodeid:algo-1_cpu3\",\"14\":\"CPUUtilization-nodeid:algo-1_cpu22\",\"15\":\"CPUUtilization-nodeid:algo-1_cpu11\",\"16\":\"CPUUtilization-nodeid:algo-1_cpu12\",\"17\":\"CPUUtilization-nodeid:algo-1_cpu25\",\"18\":\"CPUUtilization-nodeid:algo-1_cpu24\",\"19\":\"CPUUtilization-nodeid:algo-1_cpu1\",\"2\":\"CPUUtilization-nodeid:algo-1_
cpu20\",\"20\":\"CPUUtilization-nodeid:algo-1_cpu28\",\"21\":\"CPUUtilization-nodeid:algo-1_cpu9\",\"22\":\"CPUUtilization-nodeid:algo-1_cpu21\",\"23\":\"CPUUtilization-nodeid:algo-1_cpu27\",\"24\":\"CPUUtilization-nodeid:algo-1_total\",\"25\":\"CPUUtilization-nodeid:algo-1_cpu5\",\"26\":\"CPUUtilization-nodeid:algo-1_cpu15\",\"27\":\"CPUUtilization-nodeid:algo-1_cpu4\",\"28\":\"CPUUtilization-nodeid:algo-1_cpu30\",\"29\":\"CPUUtilization-nodeid:algo-1_cpu7\",\"3\":\"CPUUtilization-nodeid:algo-1_cpu19\",\"30\":\"CPUUtilization-nodeid:algo-1_cpu31\",\"31\":\"CPUUtilization-nodeid:algo-1_cpu14\",\"32\":\"CPUUtilization-nodeid:algo-1_cpu6\",\"33\":\"CPUUtilization-nodeid:algo-1_cpu0\",\"34\":\"GPUUtilization-nodeid:algo-1_gpu3\",\"35\":\"GPUUtilization-nodeid:algo-1_gpu1\",\"36\":\"GPUUtilization-nodeid:algo-1_gpu2\",\"37\":\"GPUUtilization-nodeid:algo-1_gpu0\",\"38\":\"GPUUtilization-nodeid:algo-1_total\",\"39\":\"GPUMemoryUtilization-nodeid:algo-1_gpu3\",\"4\":\"CPUUtilization-nodeid:algo-1_cpu26\",\"40\":\"GPUMemoryUtilization-nodeid:algo-1_gpu1\",\"41\":\"GPUMemoryUtilization-nodeid:algo-1_gpu2\",\"42\":\"GPUMemoryUtilization-nodeid:algo-1_gpu0\",\"43\":\"GPUMemoryUtilization-nodeid:algo-1_total\",\"44\":\"\",\"5\":\"CPUUtilization-nodeid:algo-1_cpu18\",\"6\":\"CPUUtilization-nodeid:algo-1_cpu2\",\"7\":\"CPUUtilization-nodeid:algo-1_cpu8\",\"8\":\"CPUUtilization-nodeid:algo-1_cpu10\",\"9\":\"CPUUtilization-nodeid:algo-1_cpu16\"},\"major_label_text_font_size\":\"5pt\",\"ticker\":{\"id\":\"11060\"}},\"id\":\"11040\",\"type\":\"LinearAxis\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"11044\"},{\"id\":\"11045\"},{\"id\":\"11046\"},{\"id\":\"11047\"},{\"id\":\"11053\"}]},\"id\":\"11048\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"11971\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"axis\":{\"id\":\"11040\"},\"dimension\":1,\"ticker\":nul
l,\"visible\":false},\"id\":\"11043\",\"type\":\"Grid\"},{\"attributes\":{\"end\":955},\"id\":\"11028\",\"type\":\"Range1d\"},{\"attributes\":{\"ticks\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43]},\"id\":\"11060\",\"type\":\"FixedTicker\"},{\"attributes\":{\"axis_label\":\"Indices\",\"formatter\":{\"id\":\"11971\"},\"ticker\":{\"id\":\"11037\"}},\"id\":\"11036\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"11044\",\"type\":\"CrosshairTool\"},{\"attributes\":{\"callback\":null,\"tooltips\":[[\"usage\",\"@image\"],[\"metric\",\"@metric\"],[\"index\",\"$x{10}\"]]},\"id\":\"11053\",\"type\":\"HoverTool\"},{\"attributes\":{\"source\":{\"id\":\"11055\"}},\"id\":\"11058\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"11972\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"11034\",\"type\":\"LinearScale\"},{\"attributes\":{\"empty_value\":null},\"id\":\"11047\",\"type\":\"BoxEditTool\"},{\"attributes\":{\"text\":\"\"},\"id\":\"11968\",\"type\":\"Title\"},{\"attributes\":{\"dimensions\":\"width\"},\"id\":\"11046\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"color_mapper\":{\"id\":\"11054\"},\"dh\":{\"field\":\"dh\",\"units\":\"data\"},\"dw\":{\"field\":\"dw\",\"units\":\"data\"},\"image\":{\"field\":\"image\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"11056\",\"type\":\"Image\"},{\"attributes\":{},\"id\":\"11969\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"11037\",\"type\":\"BasicTicker\"},{\"attributes\":{\"high\":100,\"low\":0,\"palette\":[\"#440154\",\"#440357\",\"#45085B\",\"#460B5E\",\"#470F62\",\"#471265\",\"#471669\",\"#481A6C\",\"#481D6F\",\"#482172\",\"#482374\",\"#472777\",\"#472A79\",\"#462D7C\",\"#46317E\",\"#45347F\",\"#443781\",\"#433A83\",\"#423D84\",\"#424085\",\"#404387\",\"#3F4788\",\"#3E4989\",\"#3D4C89\",\"#3C4E8A\",\"#3A528B\",\"#39548B\",\"#38578C\",\"#365A8C\",\"#355C8C\",\"#345F8D\",\"#33618D\",\"#31648D\",\"#30678D\",\
"#2F698D\",\"#2E6C8E\",\"#2D6E8E\",\"#2C718E\",\"#2B738E\",\"#2A768E\",\"#29798E\",\"#287A8E\",\"#277D8E\",\"#267F8E\",\"#25828E\",\"#24848D\",\"#23878D\",\"#22898D\",\"#228B8D\",\"#218E8C\",\"#20908C\",\"#1F938B\",\"#1F958B\",\"#1E988A\",\"#1E9A89\",\"#1E9C89\",\"#1E9F88\",\"#1FA187\",\"#20A485\",\"#21A685\",\"#23A883\",\"#25AB81\",\"#27AD80\",\"#2AB07E\",\"#2CB17D\",\"#30B47A\",\"#35B778\",\"#38B976\",\"#3DBB74\",\"#40BD72\",\"#45BF6F\",\"#49C16D\",\"#4FC369\",\"#55C666\",\"#59C764\",\"#60C960\",\"#64CB5D\",\"#6BCD59\",\"#70CE56\",\"#77D052\",\"#7ED24E\",\"#83D34B\",\"#8BD546\",\"#90D643\",\"#97D83E\",\"#9DD93A\",\"#A5DA35\",\"#ADDC30\",\"#B2DD2C\",\"#BADE27\",\"#BFDF24\",\"#C7E01F\",\"#CDE01D\",\"#D4E11A\",\"#DCE218\",\"#E1E318\",\"#E9E419\",\"#EEE51B\",\"#F6E61F\",\"#FDE724\"]},\"id\":\"11054\",\"type\":\"LinearColorMapper\"},{\"attributes\":{\"axis\":{\"id\":\"11036\"},\"ticker\":null,\"visible\":false},\"id\":\"11039\",\"type\":\"Grid\"},{\"attributes\":{\"data_source\":{\"id\":\"11055\"},\"glyph\":{\"id\":\"11056\"},\"hover_glyph\":null,\"muted_glyph\":null,\"view\":{\"id\":\"11058\"}},\"id\":\"11057\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"11045\",\"type\":\"ResetTool\"}],\"root_ids\":[\"11027\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", - " var render_items = [{\"docid\":\"b4a0e15d-9c39-4537-8e81-b2747bae2095\",\"notebook_comms_target\":\"11974\",\"root_ids\":[\"11027\"],\"roots\":{\"11027\":\"1c9cd143-168a-45c5-b29c-c455fc72851c\"}}];\n", - " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", - "\n", - " }\n", - " if (root.Bokeh !== undefined) {\n", - " embed_document(root);\n", - " } else {\n", - " var attempts = 0;\n", - " var timer = setInterval(function(root) {\n", - " if (root.Bokeh !== undefined) {\n", - " clearInterval(timer);\n", - " embed_document(root);\n", - " } else {\n", - " attempts++;\n", - " if (attempts > 100) {\n", - " clearInterval(timer);\n", - " console.log(\"Bokeh: ERROR: Unable 
to run BokehJS code because BokehJS library is missing\");\n", - " }\n", - " }\n", - " }, 10, root)\n", - " }\n", - "})(window);" - ], - "application/vnd.bokehjs_exec.v0+json": "" - }, - "metadata": { - "application/vnd.bokehjs_exec.v0+json": { - "id": "11027" - } - }, - "output_type": "display_data" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from smdebug.profiler.analysis.notebook_utils.heatmap import Heatmap\n", "\n", @@ -1735,36 +877,9 @@ }, { "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Thu Apr 29 07:04:12 2021 python_stats/python_stats/81-algo-1/train-979-stepend-1619676567816781.8_posthookclose-*-end-1619676567830144.5/python_stats\n", - "\n", - " 518 function calls (516 primitive calls) in 0.010 seconds\n", - "\n", - " Ordered by: cumulative time\n", - " List reduced from 113 to 10 due to restriction <10>\n", - "\n", - " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 4 0.000 0.000 0.010 0.002 /usr/local/lib/python3.7/site-packages/tensorflow/python/data/ops/iterator_ops.py:531(__del__)\n", - " 5 0.000 0.000 0.010 0.002 /usr/local/lib/python3.7/contextlib.py:107(__enter__)\n", - " 12/11 0.010 0.001 0.010 0.001 {built-in method builtins.next}\n", - " 13 0.000 0.000 0.000 0.000 {built-in method tensorflow.python._pywrap_tfe.TFE_ContextRemoveFunction}\n", - " 5 0.000 0.000 0.000 0.000 {built-in method tensorflow.python._pywrap_tfe.TFE_Py_FastPathExecute}\n", - " 13 0.000 0.000 0.000 0.000 {built-in method tensorflow.python._pywrap_tf_session.TF_DeleteFunction}\n", - " 7 0.000 0.000 0.000 0.000 {built-in method tensorflow.python._pywrap_utils.IsSequence}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method tensorflow.python._pywrap_utils.IsMutableMapping}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method tensorflow.python._pywrap_utils.Flatten}\n", - " 1 0.000 0.000 0.000 0.000 {method 'disable' of 
'_lsprof.Profiler' objects}\n", - "\n", - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "stats = python_analysis.fetch_post_hook_close_profile_stats()\n", "display_python_profile_stats(stats)" diff --git a/sagemaker-debugger/tensorflow_builtin_rule/tf-mnist-builtin-rule.ipynb b/sagemaker-debugger/tensorflow_builtin_rule/tf-mnist-builtin-rule.ipynb index 02e936dc1e..1fe56ec7fd 100644 --- a/sagemaker-debugger/tensorflow_builtin_rule/tf-mnist-builtin-rule.ipynb +++ b/sagemaker-debugger/tensorflow_builtin_rule/tf-mnist-builtin-rule.ipynb @@ -2,38 +2,15 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, - "id": "returning-commodity", + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: smdebug in /home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages (1.0.9)\n", - "Requirement already satisfied: pyinstrument>=3.1.3 in /home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages (from smdebug) (3.4.2)\n", - "Requirement already satisfied: packaging in /home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages (from smdebug) (20.1)\n", - "Requirement already satisfied: numpy>=1.16.0 in /home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages (from smdebug) (1.18.1)\n", - "Requirement already satisfied: protobuf>=3.6.0 in /home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages (from smdebug) (3.15.6)\n", - "Requirement already satisfied: boto3>=1.10.32 in /home/ubuntu/.local/lib/python3.6/site-packages (from smdebug) (1.16.36)\n", - "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages (from boto3>=1.10.32->smdebug) (0.10.0)\n", - "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages (from 
boto3>=1.10.32->smdebug) (0.3.3)\n", - "Requirement already satisfied: botocore<1.20.0,>=1.19.36 in /home/ubuntu/.local/lib/python3.6/site-packages (from boto3>=1.10.32->smdebug) (1.19.36)\n", - "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages (from botocore<1.20.0,>=1.19.36->boto3>=1.10.32->smdebug) (2.8.1)\n", - "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages (from botocore<1.20.0,>=1.19.36->boto3>=1.10.32->smdebug) (1.25.10)\n", - "Requirement already satisfied: six>=1.9 in /home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages (from protobuf>=3.6.0->smdebug) (1.14.0)\n", - "Requirement already satisfied: pyinstrument-cext>=0.2.2 in /home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages (from pyinstrument>=3.1.3->smdebug) (0.2.4)\n", - "Requirement already satisfied: pyparsing>=2.0.2 in /home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages (from packaging->smdebug) (2.4.6)\n" - ] - } - ], + "outputs": [], "source": [ - "!pip install smdebug" + "!pip install smdebug matplotlib" ] }, { "cell_type": "markdown", - "id": "passing-exclusive", "metadata": { "papermill": { "duration": 0.015745, @@ -50,7 +27,6 @@ }, { "cell_type": "markdown", - "id": "diagnostic-discrimination", "metadata": { "papermill": { "duration": 0.015761, @@ -79,7 +55,6 @@ }, { "cell_type": "markdown", - "id": "aware-proposition", "metadata": { "papermill": { "duration": 0.015708, @@ -97,7 +72,6 @@ }, { "cell_type": "markdown", - "id": "absolute-arabic", "metadata": { "papermill": { "duration": 0.015697, @@ -114,8 +88,7 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "worse-affiliation", + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2021-06-01T00:12:47.954649Z", @@ -146,7 +119,6 @@ }, { "cell_type": "markdown", - "id": 
"academic-mathematics", "metadata": { "papermill": { "duration": 0.015886, @@ -163,8 +135,7 @@ }, { "cell_type": "code", - "execution_count": 3, - "id": "federal-forward", + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2021-06-01T00:12:48.024143Z", @@ -181,18 +152,7 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "'2.42.1'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import sagemaker\n", "\n", @@ -201,8 +161,7 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "oriented-texture", + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2021-06-01T00:12:48.510517Z", @@ -220,27 +179,7 @@ "scrolled": true, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Extension horovod.torch has not been built: /home/ubuntu/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/horovod/torch/mpi_lib/_mpi_lib.cpython-36m-x86_64-linux-gnu.so not found\n", - "If this is not expected, reinstall Horovod with HOROVOD_WITH_PYTORCH=1 to debug the build error.\n", - "Warning! 
MPI libs are missing, but python applications are still avaiable.\n" - ] - }, - { - "data": { - "text/plain": [ - "'1.0.9'" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import smdebug\n", "\n", @@ -249,7 +188,6 @@ }, { "cell_type": "markdown", - "id": "lightweight-initial", "metadata": { "papermill": { "duration": null, @@ -267,8 +205,7 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "noted-tennis", + "execution_count": null, "metadata": { "papermill": { "duration": null, @@ -286,7 +223,6 @@ }, { "cell_type": "markdown", - "id": "neutral-spanking", "metadata": { "papermill": { "duration": null, @@ -303,8 +239,7 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "crazy-casting", + "execution_count": null, "metadata": { "papermill": { "duration": null, @@ -325,7 +260,6 @@ }, { "cell_type": "markdown", - "id": "related-dating", "metadata": { "papermill": { "duration": null, @@ -349,8 +283,7 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "prescribed-operations", + "execution_count": null, "metadata": { "papermill": { "duration": null, @@ -386,7 +319,6 @@ }, { "cell_type": "markdown", - "id": "first-supplier", "metadata": { "papermill": { "duration": null, @@ -405,8 +337,7 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "affiliated-latex", + "execution_count": null, "metadata": { "papermill": { "duration": null, @@ -424,7 +355,6 @@ }, { "cell_type": "markdown", - "id": "automotive-second", "metadata": { "papermill": { "duration": null, @@ -445,7 +375,6 @@ }, { "cell_type": "markdown", - "id": "fluid-product", "metadata": { "papermill": { "duration": null, @@ -470,7 +399,6 @@ }, { "cell_type": "markdown", - "id": "blank-agriculture", "metadata": { "papermill": { "duration": null, @@ -489,7 +417,6 @@ }, { "cell_type": "markdown", - "id": "tough-refrigerator", "metadata": { "papermill": { "duration": null, @@ -506,8 +433,7 @@ }, { "cell_type": 
"code", - "execution_count": 9, - "id": "incorporated-estate", + "execution_count": null, "metadata": { "papermill": { "duration": null, @@ -518,15 +444,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training job name: tensorflow-training-2021-06-02-19-16-51-791\n" - ] - } - ], + "outputs": [], "source": [ "job_name = estimator.latest_training_job.name\n", "print(\"Training job name: {}\".format(job_name))" @@ -534,7 +452,6 @@ }, { "cell_type": "markdown", - "id": "coastal-olive", "metadata": { "papermill": { "duration": null, @@ -553,8 +470,7 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "asian-study", + "execution_count": null, "metadata": { "papermill": { "duration": null, @@ -566,28 +482,7 @@ "scrolled": true, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Current job status: [PrimaryStatus: InProgress, SecondaryStatus: Starting] | Overfit Rule Evaluation Status: InProgress\n", - "Current job status: [PrimaryStatus: InProgress, SecondaryStatus: Starting] | Overfit Rule Evaluation Status: InProgress\n", - "Current job status: [PrimaryStatus: InProgress, SecondaryStatus: Starting] | Overfit Rule Evaluation Status: InProgress\n", - "Current job status: [PrimaryStatus: InProgress, SecondaryStatus: Starting] | Overfit Rule Evaluation Status: InProgress\n", - "Current job status: [PrimaryStatus: InProgress, SecondaryStatus: Starting] | Overfit Rule Evaluation Status: InProgress\n", - "Current job status: [PrimaryStatus: InProgress, SecondaryStatus: Starting] | Overfit Rule Evaluation Status: InProgress\n", - "Current job status: [PrimaryStatus: InProgress, SecondaryStatus: Starting] | Overfit Rule Evaluation Status: InProgress\n", - "Current job status: [PrimaryStatus: InProgress, SecondaryStatus: Starting] | Overfit Rule Evaluation Status: InProgress\n", - "Current job status: [PrimaryStatus: InProgress, SecondaryStatus: Starting] | Overfit Rule 
Evaluation Status: InProgress\n", - "Current job status: [PrimaryStatus: InProgress, SecondaryStatus: Starting] | Overfit Rule Evaluation Status: InProgress\n", - "Current job status: [PrimaryStatus: InProgress, SecondaryStatus: Starting] | Overfit Rule Evaluation Status: InProgress\n", - "Current job status: [PrimaryStatus: InProgress, SecondaryStatus: Starting] | Overfit Rule Evaluation Status: InProgress\n", - "Current job status: [PrimaryStatus: InProgress, SecondaryStatus: Downloading] | Overfit Rule Evaluation Status: InProgress\n", - "Current job status: [PrimaryStatus: InProgress, SecondaryStatus: Training] | Overfit Rule Evaluation Status: InProgress\n" - ] - } - ], + "outputs": [], "source": [ "import time\n", "\n", @@ -611,107 +506,6 @@ }, { "cell_type": "markdown", - "id": "german-italian", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "source": [ - "##### Get a direct Amazon CloudWatch URL to find the current rule processing job log" - ] - }, - { - "cell_type": "markdown", - "id": "unknown-thunder", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "source": [ - "If you want do see logs of the Debugger rule evaluations, run the following code cell to get an autogenerated URL to Amazon CloudWatch. Paste the URL into an internet browser to directly open the logs in your CloudWatch console." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "mental-popularity", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The direct CloudWatch URL to the current rule job: https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logStream:group=/aws/sagemaker/ProcessingJobs;prefix=tensorflow-training-2021-0-Overfit-4d9a4470;streamFilter=typeLogStreamPrefix\n" - ] - } - ], - "source": [ - "import boto3\n", - "\n", - "\n", - "def _get_rule_job_name(training_job_name, rule_configuration_name, rule_job_arn):\n", - " \"\"\"Helper function to get the rule job name\"\"\"\n", - " return \"{}-{}-{}\".format(\n", - " training_job_name[:26], rule_configuration_name[:26], rule_job_arn[-8:]\n", - " )\n", - "\n", - "\n", - "def _get_cw_url_for_rule_job(rule_job_name, region):\n", - " return \"https://{}.console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/ProcessingJobs;prefix={};streamFilter=typeLogStreamPrefix\".format(\n", - " region, region, rule_job_name\n", - " )\n", - "\n", - "\n", - "def get_rule_jobs_cw_urls(estimator):\n", - " region = boto3.Session().region_name\n", - " training_job = estimator.latest_training_job\n", - " training_job_name = training_job.describe()[\"TrainingJobName\"]\n", - " rule_eval_statuses = training_job.describe()[\"DebugRuleEvaluationStatuses\"]\n", - "\n", - " result = {}\n", - " for status in rule_eval_statuses:\n", - " if status.get(\"RuleEvaluationJobArn\", None) is not None:\n", - " rule_job_name = _get_rule_job_name(\n", - " training_job_name, status[\"RuleConfigurationName\"], status[\"RuleEvaluationJobArn\"]\n", - " )\n", - " result[status[\"RuleConfigurationName\"]] = _get_cw_url_for_rule_job(\n", - " rule_job_name, region\n", - " )\n", - " return result\n", - "\n", - "\n", - 
"print(\n", - " \"The direct CloudWatch URL to the current rule job:\",\n", - " get_rule_jobs_cw_urls(estimator)[\n", - " estimator.latest_training_job.rule_job_summary()[0][\"RuleConfigurationName\"]\n", - " ],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "south-contractor", "metadata": { "papermill": { "duration": null, @@ -731,8 +525,7 @@ }, { "cell_type": "code", - "execution_count": 12, - "id": "monetary-northwest", + "execution_count": null, "metadata": { "papermill": { "duration": null, @@ -744,19 +537,7 @@ "scrolled": true, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2021-06-02 19:20:24.477 ip-172-31-33-148:30930 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\n", - "[2021-06-02 19:20:24.498 ip-172-31-33-148:30930 INFO s3_trial.py:42] Loading trial debug-output at path s3://sagemaker-us-west-2-688520471316/tensorflow-training-2021-06-02-19-16-51-791/debug-output\n", - "[2021-06-02 19:20:44.997 ip-172-31-33-148:30930 WARNING trial.py:148] Waiting to read collections files generated by the training job,from s3://sagemaker-us-west-2-688520471316/tensorflow-training-2021-06-02-19-16-51-791/debug-output/. If this has been a while, you might want to check that the trial is pointed at the right path.\n", - "[2021-06-02 19:20:47.035 ip-172-31-33-148:30930 WARNING trial.py:148] Waiting to read collections files generated by the training job,from s3://sagemaker-us-west-2-688520471316/tensorflow-training-2021-06-02-19-16-51-791/debug-output/. If this has been a while, you might want to check that the trial is pointed at the right path.\n", - "[2021-06-02 19:20:49.072 ip-172-31-33-148:30930 WARNING trial.py:148] Waiting to read collections files generated by the training job,from s3://sagemaker-us-west-2-688520471316/tensorflow-training-2021-06-02-19-16-51-791/debug-output/. 
If this has been a while, you might want to check that the trial is pointed at the right path.\n" - ] - } - ], + "outputs": [], "source": [ "from smdebug.trials import create_trial\n", "\n", @@ -765,7 +546,6 @@ }, { "cell_type": "markdown", - "id": "marine-canon", "metadata": { "papermill": { "duration": null, @@ -782,7 +562,6 @@ }, { "cell_type": "markdown", - "id": "endless-satin", "metadata": { "papermill": { "duration": null, @@ -799,8 +578,7 @@ }, { "cell_type": "code", - "execution_count": 13, - "id": "fixed-barbados", + "execution_count": null, "metadata": { "papermill": { "duration": null, @@ -811,25 +589,13 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "'s3://sagemaker-us-west-2-688520471316/tensorflow-training-2021-06-02-19-16-51-791/debug-output/'" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "tutorial_trial.path" ] }, { "cell_type": "markdown", - "id": "hybrid-corner", "metadata": { "papermill": { "duration": null, @@ -846,8 +612,7 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "sporting-departure", + "execution_count": null, "metadata": { "papermill": { "duration": null, @@ -858,25 +623,13 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "['accuracy', 'batch', 'loss']" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "tutorial_trial.tensor_names()" ] }, { "cell_type": "markdown", - "id": "external-offset", "metadata": { "papermill": { "duration": null, @@ -893,7 +646,6 @@ }, { "cell_type": "markdown", - "id": "composed-signal", "metadata": { "papermill": { "duration": null, @@ -910,8 +662,7 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "fluid-source", + "execution_count": null, "metadata": { "papermill": { "duration": null, @@ -929,8 +680,7 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "expressed-foundation", 
+ "execution_count": null, "metadata": { "papermill": { "duration": null, @@ -941,26 +691,14 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "[0]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "tutorial_trial.steps(mode=ModeKeys.TRAIN)" ] }, { "cell_type": "code", - "execution_count": 17, - "id": "afraid-emperor", + "execution_count": null, "metadata": { "papermill": { "duration": null, @@ -971,25 +709,13 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "tutorial_trial.steps(mode=ModeKeys.EVAL)" ] }, { "cell_type": "markdown", - "id": "modified-budget", "metadata": { "papermill": { "duration": null, @@ -1008,28 +734,7 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "biblical-leadership", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# Uncomment the following line if `matplotlib` is not installed.\n", - "#! pip install -q matplotlib" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "single-thirty", + "execution_count": null, "metadata": { "papermill": { "duration": null, @@ -1054,8 +759,7 @@ }, { "cell_type": "code", - "execution_count": 20, - "id": "dirty-portland", + "execution_count": null, "metadata": { "papermill": { "duration": null, @@ -1066,32 +770,7 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "from mpl_toolkits.axes_grid1 import host_subplot\n", @@ -1134,7 +813,13 @@ }, { "cell_type": "markdown", - "id": "elementary-crystal", + "metadata": {}, + "source": [ + "> ## Note : Rerun the above cell if you don't see any plots! " + ] + }, + { + "cell_type": "markdown", "metadata": { "papermill": { "duration": null, @@ -1154,7 +839,6 @@ { "cell_type": "code", "execution_count": null, - "id": "amber-defeat", "metadata": { "papermill": { "duration": null, @@ -1172,9 +856,9 @@ "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Environment (conda_tensorflow2_p36)", + "display_name": "Python 3 (TensorFlow 2.1 Python 3.6 CPU Optimized)", "language": "python", - "name": "conda_tensorflow2_p36" + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/tensorflow-2.1-cpu-py36" }, "language_info": { "codemirror_mode": { @@ -1186,7 +870,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.6.13" }, "papermill": { "default_parameters": {}, diff --git a/sagemaker-featurestore/README.md b/sagemaker-featurestore/README.md index 9f64103b1e..4fbaf0b2fb 100644 --- a/sagemaker-featurestore/README.md +++ b/sagemaker-featurestore/README.md @@ -34,7 +34,7 @@ This notebook requires this synthetic data set in `./data/`: In `feature_store_securely_store_images.ipynb` we demonstrate how to securely store a dataset of images into your Feature Store using KMS key. 
## Securely store the output of an image or text classification labelling job from Amazon Ground Truth directly into Feature Store using a KMS key -In `feature_store_object_detection_ground_truth.ipynb`, we demonstrate how to pipe the output of an image or text classification labelling job from Amazon Ground Truth directly into Feature Store. +In `feature_store_classification_job_to_ground_truth.ipynb`, we demonstrate how to pipe the output of an image or text classification labelling job from Amazon Ground Truth directly into Feature Store. ## Fraud Detection with Feature Store For an advanced example on how to use Feature Store for a Fraud Detection use-case, see [Fraud Detection with Feature Store](https://sagemaker-examples.readthedocs.io/en/latest/sagemaker-featurestore/sagemaker_featurestore_fraud_detection_python_sdk.html), and it's associated notebook, `sagemaker_featurestore_fraud_detection_python_sdk.ipynb`. diff --git a/sagemaker-featurestore/feature_store_introduction.ipynb b/sagemaker-featurestore/feature_store_introduction.ipynb index 61652a82fb..f5adad9754 100644 --- a/sagemaker-featurestore/feature_store_introduction.ipynb +++ b/sagemaker-featurestore/feature_store_introduction.ipynb @@ -366,6 +366,44 @@ "sample_record" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use `batch_get_record` to check that all data has been ingested into two feature groups by providing customer ids." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_records = sagemaker_session.boto_session.client(\n", + " \"sagemaker-featurestore-runtime\", region_name=region\n", + ").batch_get_record(\n", + " Identifiers=[\n", + " {\n", + " \"FeatureGroupName\": customers_feature_group_name,\n", + " \"RecordIdentifiersValueAsString\": [\"573291\", \"109382\", \"828400\", \"124013\"],\n", + " },\n", + " {\n", + " \"FeatureGroupName\": orders_feature_group_name,\n", + " \"RecordIdentifiersValueAsString\": [\"573291\", \"109382\", \"828400\", \"124013\"],\n", + " },\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_records" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -390,8 +428,11 @@ "metadata": {}, "outputs": [], "source": [ + "%%bash -s \"$original_version\"\n", + "\n", "# preserve original sagemaker version\n", - "%pip install 'sagemaker=={}'.format(original_version)" + "\n", + "pip install sagemaker==$1" ] }, { @@ -436,7 +477,8 @@ "\n", "#### Boto3 API Calls\n", "* `list_feature_groups()`\n", - "* `get_record()`\n" + "* `get_record()`\n", + "* `batch_get_record()`\n" ] } ], diff --git a/sagemaker-featurestore/feature_store_kms_key_encryption.ipynb b/sagemaker-featurestore/feature_store_kms_key_encryption.ipynb index d036b2300c..847e70f26f 100644 --- a/sagemaker-featurestore/feature_store_kms_key_encryption.ipynb +++ b/sagemaker-featurestore/feature_store_kms_key_encryption.ipynb @@ -13,6 +13,10 @@ "source": [ "This notebook demonstrates how to enable encyption for your data in your online or offline Feature Store using KMS key. We start by showing how to programmatically create a KMS key, and how to apply it to the feature store creation process for data encryption. 
The last portion of this notebook demonstrates how to verify that your KMS key is being used to encerypt your data in your feature store.\n", "\n", + "### Important\n", + "If you **do not** specify a KMS encryption key, by default we encrypt all data at rest using an AWS KMS key. By defining your [bucket-level key](https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucket-key.html) for SSE, you can reduce AWS KMS requests costs by up to 99 percent. \n", + "\n", + "\n", "### Overview\n", "1. Create a KMS key.\n", " - How to create a KMS key programmatically using the KMS client from boto3?\n", diff --git a/sagemaker-featurestore/sagemaker_featurestore_fraud_detection_python_sdk.ipynb b/sagemaker-featurestore/sagemaker_featurestore_fraud_detection_python_sdk.ipynb index b3ecd2158b..c2ebe08afc 100644 --- a/sagemaker-featurestore/sagemaker_featurestore_fraud_detection_python_sdk.ipynb +++ b/sagemaker-featurestore/sagemaker_featurestore_fraud_detection_python_sdk.ipynb @@ -431,6 +431,33 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also retrieve a record of each feature group from the online store:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "featurestore_runtime.batch_get_record(\n", + " Identifiers=[\n", + " {\n", + " \"FeatureGroupName\": identity_feature_group_name,\n", + " \"RecordIdentifiersValueAsString\": [\"2990130\"],\n", + " },\n", + " {\n", + " \"FeatureGroupName\": transaction_feature_group_name,\n", + " \"RecordIdentifiersValueAsString\": [\"2990130\"],\n", + " },\n", + " ]\n", + ")" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/sagemaker-fundamentals/create-endpoint/create_endpoint.ipynb b/sagemaker-fundamentals/create-endpoint/create_endpoint.ipynb index c2993c0f22..a77fd55987 100644 --- a/sagemaker-fundamentals/create-endpoint/create_endpoint.ipynb +++ b/sagemaker-fundamentals/create-endpoint/create_endpoint.ipynb @@ -815,7 
+815,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'EndpointArn': 'arn:aws:sagemaker:us-west-2:688520471316:endpoint/exmaple-endpoint',\n", + "{'EndpointArn': 'arn:aws:sagemaker:us-west-2:688520471316:endpoint/example-endpoint',\n", " 'ResponseMetadata': {'HTTPHeaders': {'content-length': '84',\n", " 'content-type': 'application/x-amz-json-1.1',\n", " 'date': 'Wed, 10 Mar 2021 23:47:37 GMT',\n", @@ -827,7 +827,7 @@ } ], "source": [ - "endpoint_name = \"exmaple-endpoint\"\n", + "endpoint_name = \"example-endpoint\"\n", "ep_res = sm_boto3.create_endpoint(\n", " EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name\n", ")\n", @@ -852,9 +852,9 @@ "output_type": "stream", "text": [ "{'CreationTime': datetime.datetime(2021, 3, 10, 23, 47, 38, 119000, tzinfo=tzlocal()),\n", - " 'EndpointArn': 'arn:aws:sagemaker:us-west-2:688520471316:endpoint/exmaple-endpoint',\n", + " 'EndpointArn': 'arn:aws:sagemaker:us-west-2:688520471316:endpoint/example-endpoint',\n", " 'EndpointConfigName': 'ExampleServeConfig',\n", - " 'EndpointName': 'exmaple-endpoint',\n", + " 'EndpointName': 'example-endpoint',\n", " 'EndpointStatus': 'Creating',\n", " 'LastModifiedTime': datetime.datetime(2021, 3, 10, 23, 47, 38, 119000, tzinfo=tzlocal()),\n", " 'ResponseMetadata': {'HTTPHeaders': {'content-length': '256',\n", @@ -912,9 +912,9 @@ "output_type": "stream", "text": [ "{'CreationTime': datetime.datetime(2021, 3, 10, 23, 47, 38, 119000, tzinfo=tzlocal()),\n", - " 'EndpointArn': 'arn:aws:sagemaker:us-west-2:688520471316:endpoint/exmaple-endpoint',\n", + " 'EndpointArn': 'arn:aws:sagemaker:us-west-2:688520471316:endpoint/example-endpoint',\n", " 'EndpointConfigName': 'ExampleServeConfig',\n", - " 'EndpointName': 'exmaple-endpoint',\n", + " 'EndpointName': 'example-endpoint',\n", " 'EndpointStatus': 'InService',\n", " 'LastModifiedTime': datetime.datetime(2021, 3, 10, 23, 56, 2, 741000, tzinfo=tzlocal()),\n", " 'ProductionVariants': [{'CurrentInstanceCount': 1,\n", diff 
--git a/sagemaker-pipeline-compare-model-versions/notebook.ipynb b/sagemaker-pipeline-compare-model-versions/notebook.ipynb index 91afe7a214..5343777f87 100644 --- a/sagemaker-pipeline-compare-model-versions/notebook.ipynb +++ b/sagemaker-pipeline-compare-model-versions/notebook.ipynb @@ -86,13 +86,6 @@ "In the last section, the SageMaker pipeline is created and all steps orchestrated before executing the pipeline.\n" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -219,6 +212,8 @@ "from sagemaker.sklearn.processing import SKLearnProcessor\n", "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", "from sagemaker.workflow.steps import ProcessingStep\n", + "from sagemaker.workflow.functions import Join\n", + "from sagemaker.workflow.execution_variables import ExecutionVariables\n", "\n", "# Create SKlearn processor object,\n", "# The object contains information about what instance type to use, the IAM role to use etc.\n", @@ -239,9 +234,45 @@ " ProcessingInput(source=input_data, destination=\"/opt/ml/processing/input\"),\n", " ],\n", " outputs=[\n", - " ProcessingOutput(output_name=\"train\", source=\"/opt/ml/processing/train\"),\n", - " ProcessingOutput(output_name=\"validation\", source=\"/opt/ml/processing/validation\"),\n", - " ProcessingOutput(output_name=\"test\", source=\"/opt/ml/processing/test\"),\n", + " ProcessingOutput(\n", + " output_name=\"train\",\n", + " source=\"/opt/ml/processing/train\",\n", + " destination=Join(\n", + " on=\"/\",\n", + " values=[\n", + " \"s3://{}\".format(bucket),\n", + " prefix,\n", + " ExecutionVariables.PIPELINE_EXECUTION_ID,\n", + " \"train\",\n", + " ],\n", + " ),\n", + " ),\n", + " ProcessingOutput(\n", + " output_name=\"validation\",\n", + " source=\"/opt/ml/processing/validation\",\n", + " destination=Join(\n", + " on=\"/\",\n", + " values=[\n", + " \"s3://{}\".format(bucket),\n", + " 
prefix,\n", + " ExecutionVariables.PIPELINE_EXECUTION_ID,\n", + " \"validation\",\n", + " ],\n", + " ),\n", + " ),\n", + " ProcessingOutput(\n", + " output_name=\"test\",\n", + " source=\"/opt/ml/processing/test\",\n", + " destination=Join(\n", + " on=\"/\",\n", + " values=[\n", + " \"s3://{}\".format(bucket),\n", + " prefix,\n", + " ExecutionVariables.PIPELINE_EXECUTION_ID,\n", + " \"test\",\n", + " ],\n", + " ),\n", + " ),\n", " ],\n", " code=\"preprocess.py\",\n", ")" @@ -267,9 +298,6 @@ "from sagemaker.workflow.steps import TrainingStep\n", "from sagemaker.estimator import Estimator\n", "\n", - "# Where to store the trained model\n", - "model_path = f\"s3://{bucket}/{prefix}/churn\"\n", - "\n", "# Fetch container to use for training\n", "image_uri = sagemaker.image_uris.retrieve(\n", " framework=\"xgboost\",\n", @@ -285,7 +313,6 @@ " image_uri=image_uri,\n", " instance_type=training_instance_type,\n", " instance_count=1,\n", - " output_path=model_path,\n", " role=role,\n", " disable_profiler=True,\n", ")\n", @@ -376,7 +403,19 @@ " ),\n", " ],\n", " outputs=[\n", - " ProcessingOutput(output_name=\"evaluation\", source=\"/opt/ml/processing/evaluation\"),\n", + " ProcessingOutput(\n", + " output_name=\"evaluation\",\n", + " source=\"/opt/ml/processing/evaluation\",\n", + " destination=Join(\n", + " on=\"/\",\n", + " values=[\n", + " \"s3://{}\".format(bucket),\n", + " prefix,\n", + " ExecutionVariables.PIPELINE_EXECUTION_ID,\n", + " \"evaluation-report\",\n", + " ],\n", + " ),\n", + " ),\n", " ],\n", " code=\"evaluate.py\",\n", " property_files=[evaluation_report],\n", @@ -406,10 +445,14 @@ "# A ModelMetrics object contains metrics captured from a model.\n", "model_metrics = ModelMetrics(\n", " model_statistics=MetricsSource(\n", - " s3_uri=\"{}/evaluation.json\".format(\n", - " step_evaluate_model.arguments[\"ProcessingOutputConfig\"][\"Outputs\"][0][\"S3Output\"][\n", - " \"S3Uri\"\n", - " ]\n", + " s3_uri=Join(\n", + " on=\"/\",\n", + " values=[\n", + " 
step_evaluate_model.arguments[\"ProcessingOutputConfig\"][\"Outputs\"][0][\"S3Output\"][\n", + " \"S3Uri\"\n", + " ],\n", + " \"evaluation.json\",\n", + " ],\n", " ),\n", " content_type=\"application/json\",\n", " )\n", diff --git a/sagemaker-pipelines/index.rst b/sagemaker-pipelines/index.rst index 402c1f2b2e..581616a017 100644 --- a/sagemaker-pipelines/index.rst +++ b/sagemaker-pipelines/index.rst @@ -8,3 +8,4 @@ Amazon SageMaker Model Building Pipelines is a tool for building machine learnin tabular/abalone_build_train_deploy/sagemaker-pipelines-preprocess-train-evaluate-batch-transform tabular/customizing_build_train_deploy_project/modelbuild/sagemaker-pipelines-customized-project + tabular/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint \ No newline at end of file diff --git a/sagemaker-pipelines/tabular/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint/code/train.py b/sagemaker-pipelines/tabular/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint/code/train.py new file mode 100644 index 0000000000..151002db9c --- /dev/null +++ b/sagemaker-pipelines/tabular/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint/code/train.py @@ -0,0 +1,82 @@ +import argparse +import numpy as np +import os +import tensorflow as tf + +def parse_args(): + + parser = argparse.ArgumentParser() + + # hyperparameters sent by the client are passed as command-line arguments to the script + parser.add_argument('--epochs', type=int, default=1) + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--learning_rate', type=float, default=0.1) + + # data directories + parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN')) + parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST')) + + # model directory + parser.add_argument('--sm-model-dir', type=str, 
default=os.environ.get('SM_MODEL_DIR')) + + return parser.parse_known_args() + + +def get_train_data(train_dir): + + x_train = np.load(os.path.join(train_dir, 'x_train.npy')) + y_train = np.load(os.path.join(train_dir, 'y_train.npy')) + print('x train', x_train.shape,'y train', y_train.shape) + + return x_train, y_train + + +def get_test_data(test_dir): + + x_test = np.load(os.path.join(test_dir, 'x_test.npy')) + y_test = np.load(os.path.join(test_dir, 'y_test.npy')) + print('x test', x_test.shape,'y test', y_test.shape) + + return x_test, y_test + + +def get_model(): + + inputs = tf.keras.Input(shape=(8,)) + hidden_1 = tf.keras.layers.Dense(8, activation='tanh')(inputs) + hidden_2 = tf.keras.layers.Dense(4, activation='sigmoid')(hidden_1) + outputs = tf.keras.layers.Dense(1)(hidden_2) + return tf.keras.Model(inputs=inputs, outputs=outputs) + + +if __name__ == "__main__": + + args, _ = parse_args() + + print('Training data location: {}'.format(args.train)) + print('Test data location: {}'.format(args.test)) + x_train, y_train = get_train_data(args.train) + x_test, y_test = get_test_data(args.test) + + batch_size = args.batch_size + epochs = args.epochs + learning_rate = args.learning_rate + print('batch_size = {}, epochs = {}, learning rate = {}'.format(batch_size, epochs, learning_rate)) + + + model = get_model() + optimizer = tf.keras.optimizers.SGD(learning_rate) + model.compile(optimizer=optimizer, loss='mse') + model.fit(x_train, + y_train, + batch_size=batch_size, + epochs=epochs, + validation_data=(x_test, y_test)) + + # evaluate on test set + scores = model.evaluate(x_test, y_test, batch_size, verbose=2) + print("\nTest MSE :", scores) + + # save model + model.save(args.sm_model_dir + '/1') + diff --git a/sagemaker-pipelines/tabular/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint/iam_helper.py b/sagemaker-pipelines/tabular/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint/iam_helper.py new file mode 100644 index 
0000000000..5bdb19a4c4 --- /dev/null +++ b/sagemaker-pipelines/tabular/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint/iam_helper.py @@ -0,0 +1,87 @@ +import boto3 +import time +import json + + +iam = boto3.client('iam') + +def create_s3_lambda_role(role_name): + try: + response = iam.create_role( + RoleName = role_name, + AssumeRolePolicyDocument = json.dumps({ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "lambda.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] + }), + Description='Role for Lambda to provide S3 read only access' + ) + + role_arn = response['Role']['Arn'] + + response = iam.attach_role_policy( + RoleName=role_name, + PolicyArn='arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole' + ) + + response = iam.attach_role_policy( + PolicyArn='arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess', + RoleName=role_name + ) + + print('Waiting 30 seconds for the IAM role to propagate') + time.sleep(30) + return role_arn + + except iam.exceptions.EntityAlreadyExistsException: + print(f'Using ARN from existing role: {role_name}') + response = iam.get_role(RoleName=role_name) + return response['Role']['Arn'] + + +def create_sagemaker_lambda_role(role_name): + try: + response = iam.create_role( + RoleName = role_name, + AssumeRolePolicyDocument = json.dumps({ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "lambda.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] + }), + Description='Role for Lambda to call SageMaker functions' + ) + + role_arn = response['Role']['Arn'] + + response = iam.attach_role_policy( + RoleName=role_name, + PolicyArn='arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole' + ) + + response = iam.attach_role_policy( + PolicyArn='arn:aws:iam::aws:policy/AmazonSageMakerFullAccess', + RoleName=role_name + ) + + print('Waiting 30 seconds for the IAM role to propagate') + time.sleep(30) + 
return role_arn + + except iam.exceptions.EntityAlreadyExistsException: + print(f'Using ARN from existing role: {role_name}') + response = iam.get_role(RoleName=role_name) + return response['Role']['Arn'] \ No newline at end of file diff --git a/sagemaker-pipelines/tabular/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint/images/pipeline_run_1_mse_higher_than_threshold.png b/sagemaker-pipelines/tabular/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint/images/pipeline_run_1_mse_higher_than_threshold.png new file mode 100644 index 0000000000..2c35b1e56c Binary files /dev/null and b/sagemaker-pipelines/tabular/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint/images/pipeline_run_1_mse_higher_than_threshold.png differ diff --git a/sagemaker-pipelines/tabular/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint/images/pipeline_run_1_mse_lower_than_threshold.png b/sagemaker-pipelines/tabular/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint/images/pipeline_run_1_mse_lower_than_threshold.png new file mode 100644 index 0000000000..3fc0972f0b Binary files /dev/null and b/sagemaker-pipelines/tabular/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint/images/pipeline_run_1_mse_lower_than_threshold.png differ diff --git a/sagemaker-pipelines/tabular/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint.ipynb b/sagemaker-pipelines/tabular/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint.ipynb new file mode 100644 index 0000000000..3d6389f3ca --- /dev/null +++ b/sagemaker-pipelines/tabular/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint/tensorflow2-california-housing-sagemaker-pipelines-deploy-endpoint.ipynb @@ -0,0 +1,1302 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "19215575", + 
"metadata": {}, + "source": [ + "# SageMaker Pipelines California Housing - Taking different steps based on model performance" + ] + }, + { + "cell_type": "markdown", + "id": "30bd1bfa", + "metadata": {}, + "source": [ + "This notebook illustrates how to take different actions based on model performance in a SageMaker Pipeline.\n", + "\n", + "The steps in this pipeline include:\n", + "* Preprocessing the California Housing dataset.\n", + "* Train a TensorFlow2 Artificial Neural Network (ANN) Model.\n", + "* Evaluate the model performance - mean square error (MSE).\n", + "* If MSE is higher than threshold, use a Lambda step to send an E-Mail to the Data Science team.\n", + "* If MSE is lower than threshold, register the model into the Model Registry, and use a Lambda step to deploy the model to SageMaker Endpoint." + ] + }, + { + "cell_type": "markdown", + "id": "75907ff2", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "#### Add `AmazonSageMakerPipelinesIntegrations` policy\n", + "\n", + "The notebook execution role should have policies which enable the notebook to create a Lambda function. The Amazon managed policy `AmazonSageMakerPipelinesIntegrations` can be added to the notebook execution role. 
\n", + "\n", + "The policy description is:\n", + "\n", + "```\n", + "\n", + "{\n", + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\n", + " \"Effect\": \"Allow\",\n", + " \"Action\": [\n", + " \"lambda:CreateFunction\",\n", + " \"lambda:DeleteFunction\",\n", + " \"lambda:InvokeFunction\",\n", + " \"lambda:UpdateFunctionCode\"\n", + " ],\n", + " \"Resource\": [\n", + " \"arn:aws:lambda:*:*:function:*sagemaker*\",\n", + " \"arn:aws:lambda:*:*:function:*sageMaker*\",\n", + " \"arn:aws:lambda:*:*:function:*SageMaker*\"\n", + " ]\n", + " },\n", + " {\n", + " \"Effect\": \"Allow\",\n", + " \"Action\": [\n", + " \"sqs:CreateQueue\",\n", + " \"sqs:SendMessage\"\n", + " ],\n", + " \"Resource\": [\n", + " \"arn:aws:sqs:*:*:*sagemaker*\",\n", + " \"arn:aws:sqs:*:*:*sageMaker*\",\n", + " \"arn:aws:sqs:*:*:*SageMaker*\"\n", + " ]\n", + " },\n", + " {\n", + " \"Effect\": \"Allow\",\n", + " \"Action\": [\n", + " \"iam:PassRole\"\n", + " ],\n", + " \"Resource\": \"arn:aws:iam::*:role/*\",\n", + " \"Condition\": {\n", + " \"StringEquals\": {\n", + " \"iam:PassedToService\": [\n", + " \"lambda.amazonaws.com\"\n", + " ]\n", + " }\n", + " }\n", + " }\n", + " ]\n", + "}\n", + " \n", + "```\n", + "\n", + "#### Add inline policy to enable creation of IAM role required for the Lambda Function\n", + "\n", + "The notebook execution role should have an inline policy that enables the notebook to create the IAM role required for the Lambda function. An inline policy can be added to the notebook execution role. 
\n", + "\n", + "The policy description is:\n", + "\n", + "```\n", + "{\n", + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\n", + " \"Effect\": \"Allow\",\n", + " \"Action\": [\n", + " \"iam:GetRole\",\n", + " \"iam:CreateRole\",\n", + " \"iam:AttachRolePolicy\"\n", + " ],\n", + " \"Resource\": \"*\"\n", + " }\n", + " ]\n", + "}\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85d9d259", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "!{sys.executable} -m pip install \"sagemaker>=2.51.0\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ee837d6", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import time\n", + "import boto3\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "import sagemaker\n", + "from sagemaker import get_execution_role" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb2e9cec", + "metadata": {}, + "outputs": [], + "source": [ + "sess = boto3.Session()\n", + "sm = sess.client(\"sagemaker\")\n", + "role = get_execution_role()\n", + "sagemaker_session = sagemaker.Session(boto_session=sess)\n", + "bucket = sagemaker_session.default_bucket()\n", + "region = boto3.Session().region_name\n", + "model_package_group_name = \"TF2-California-Housing\" # Model name in model registry\n", + "prefix = \"tf2-california-housing-pipelines\"\n", + "pipeline_name = \"TF2CaliforniaHousingPipeline\" # SageMaker Pipeline name\n", + "current_time = time.strftime(\"%m-%d-%H-%M-%S\", time.localtime())" + ] + }, + { + "cell_type": "markdown", + "id": "1040cc38", + "metadata": {}, + "source": [ + "## Download California Housing dataset and upload to Amazon S3\n", + "\n", + "We use the California housing dataset.\n", + "\n", + "More info on the dataset:\n", + "\n", + "This dataset was obtained from the `StatLib` repository. 
http://lib.stat.cmu.edu/datasets/\n", + "\n", + "The target variable is the median house value for California districts.\n", + "\n", + "This dataset was derived from the 1990 U.S. census, using one row per census block group. A block group is the smallest geographical unit for which the U.S. Census Bureau publishes sample data (a block group typically has a population of 600 to 3,000 people)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abf6e279", + "metadata": {}, + "outputs": [], + "source": [ + "data_dir = os.path.join(os.getcwd(), \"data\")\n", + "os.makedirs(data_dir, exist_ok=True)\n", + "\n", + "raw_dir = os.path.join(os.getcwd(), \"data/raw\")\n", + "os.makedirs(raw_dir, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc4c1053", + "metadata": {}, + "outputs": [], + "source": [ + "!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/california_housing/cal_housing.tgz ." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6744929", + "metadata": {}, + "outputs": [], + "source": [ + "!tar -zxf cal_housing.tgz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6e28a11", + "metadata": {}, + "outputs": [], + "source": [ + "columns = [\n", + " \"longitude\",\n", + " \"latitude\",\n", + " \"housingMedianAge\",\n", + " \"totalRooms\",\n", + " \"totalBedrooms\",\n", + " \"population\",\n", + " \"households\",\n", + " \"medianIncome\",\n", + " \"medianHouseValue\",\n", + "]\n", + "cal_housing_df = pd.read_csv(\"CaliforniaHousing/cal_housing.data\", names=columns, header=None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "550568f1", + "metadata": {}, + "outputs": [], + "source": [ + "cal_housing_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c29f19ea", + "metadata": {}, + "outputs": [], + "source": [ + "X = cal_housing_df[\n", + " [\n", + " \"longitude\",\n", + " \"latitude\",\n", + " 
\"housingMedianAge\",\n", + " \"totalRooms\",\n", + " \"totalBedrooms\",\n", + " \"population\",\n", + " \"households\",\n", + " \"medianIncome\",\n", + " ]\n", + "]\n", + "Y = cal_housing_df[[\"medianHouseValue\"]] / 100000\n", + "\n", + "x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)\n", + "\n", + "np.save(os.path.join(raw_dir, \"x_train.npy\"), x_train)\n", + "np.save(os.path.join(raw_dir, \"x_test.npy\"), x_test)\n", + "np.save(os.path.join(raw_dir, \"y_train.npy\"), y_train)\n", + "np.save(os.path.join(raw_dir, \"y_test.npy\"), y_test)\n", + "rawdata_s3_prefix = \"{}/data/raw\".format(prefix)\n", + "raw_s3 = sagemaker_session.upload_data(path=\"./data/raw/\", key_prefix=rawdata_s3_prefix)\n", + "print(raw_s3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65a0d944", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.workflow.parameters import ParameterInteger, ParameterString, ParameterFloat\n", + "\n", + "# raw input data\n", + "input_data = ParameterString(name=\"InputData\", default_value=raw_s3)\n", + "\n", + "# processing step parameters\n", + "processing_instance_type = ParameterString(\n", + " name=\"ProcessingInstanceType\", default_value=\"ml.m5.large\"\n", + ")\n", + "\n", + "# training step parameters\n", + "training_instance_type = ParameterString(name=\"TrainingInstanceType\", default_value=\"ml.m5.large\")\n", + "training_epochs = ParameterString(name=\"TrainingEpochs\", default_value=\"100\")\n", + "\n", + "# model performance step parameters\n", + "accuracy_mse_threshold = ParameterFloat(name=\"AccuracyMseThreshold\", default_value=0.75)\n", + "\n", + "# Inference step parameters\n", + "endpoint_instance_type = ParameterString(name=\"EndpointInstanceType\", default_value=\"ml.m5.large\")" + ] + }, + { + "cell_type": "markdown", + "id": "23fa5dd8", + "metadata": {}, + "source": [ + "## Processing Step \n", + "\n", + "The first step in the pipeline will preprocess the data to 
prepare it for training. We create a parameterized `SKLearnProcessor` object, so we can separately track and change the job configuration as needed, for example to increase the instance type size and count to accommodate a growing dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "599ec436", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile preprocess.py\n", + "\n", + "import glob\n", + "import numpy as np\n", + "import os\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "if __name__ == \"__main__\":\n", + "\n", + " input_files = glob.glob(\"{}/*.npy\".format(\"/opt/ml/processing/input\"))\n", + " print(\"\\nINPUT FILE LIST: \\n{}\\n\".format(input_files))\n", + " scaler = StandardScaler()\n", + " x_train = np.load(os.path.join(\"/opt/ml/processing/input\", \"x_train.npy\"))\n", + " scaler.fit(x_train)\n", + " for file in input_files:\n", + " raw = np.load(file)\n", + " # only transform feature columns\n", + " if \"y_\" not in file:\n", + " transformed = scaler.transform(raw)\n", + " if \"train\" in file:\n", + " if \"y_\" in file:\n", + " output_path = os.path.join(\"/opt/ml/processing/train\", \"y_train.npy\")\n", + " np.save(output_path, raw)\n", + " print(\"SAVED LABEL TRAINING DATA FILE\\n\")\n", + " else:\n", + " output_path = os.path.join(\"/opt/ml/processing/train\", \"x_train.npy\")\n", + " np.save(output_path, transformed)\n", + " print(\"SAVED TRANSFORMED TRAINING DATA FILE\\n\")\n", + " else:\n", + " if \"y_\" in file:\n", + " output_path = os.path.join(\"/opt/ml/processing/test\", \"y_test.npy\")\n", + " np.save(output_path, raw)\n", + " print(\"SAVED LABEL TEST DATA FILE\\n\")\n", + " else:\n", + " output_path = os.path.join(\"/opt/ml/processing/test\", \"x_test.npy\")\n", + " np.save(output_path, transformed)\n", + " print(\"SAVED TRANSFORMED TEST DATA FILE\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43e25edd", + 
"metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.sklearn.processing import SKLearnProcessor\n", + "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", + "from sagemaker.workflow.steps import ProcessingStep\n", + "\n", + "framework_version = \"0.23-1\"\n", + "\n", + "# Create SKlearn processor object,\n", + "# The object contains information about what instance type to use, the IAM role to use etc.\n", + "# A managed processor comes with a preconfigured container, so only specifying version is required.\n", + "sklearn_processor = SKLearnProcessor(\n", + " framework_version=framework_version,\n", + " role=role,\n", + " instance_type=processing_instance_type,\n", + " instance_count=1,\n", + " base_job_name=\"tf2-california-housing-processing-job\",\n", + ")\n", + "\n", + "# Use the sklearn_processor in a Sagemaker pipelines ProcessingStep\n", + "step_preprocess_data = ProcessingStep(\n", + " name=\"Preprocess-California-Housing-Data\",\n", + " processor=sklearn_processor,\n", + " inputs=[\n", + " ProcessingInput(source=input_data, destination=\"/opt/ml/processing/input\"),\n", + " ],\n", + " outputs=[\n", + " ProcessingOutput(output_name=\"train\", source=\"/opt/ml/processing/train\"),\n", + " ProcessingOutput(output_name=\"test\", source=\"/opt/ml/processing/test\"),\n", + " ],\n", + " code=\"preprocess.py\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "32f6536d", + "metadata": {}, + "source": [ + "## Train model step\n", + "In the second step, the train and test outputs from the previous processing step are used to train a model. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b86f819", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.tensorflow import TensorFlow\n", + "from sagemaker.inputs import TrainingInput\n", + "from sagemaker.workflow.steps import TrainingStep\n", + "from sagemaker.workflow.step_collections import RegisterModel\n", + "import time\n", + "\n", + "# Where to store the trained model\n", + "model_path = f\"s3://{bucket}/{prefix}/model/\"\n", + "\n", + "hyperparameters = {\"epochs\": training_epochs}\n", + "tensorflow_version = \"2.4.1\"\n", + "python_version = \"py37\"\n", + "\n", + "tf2_estimator = TensorFlow(\n", + " source_dir=\"code\",\n", + " entry_point=\"train.py\",\n", + " instance_type=training_instance_type,\n", + " instance_count=1,\n", + " framework_version=tensorflow_version,\n", + " role=role,\n", + " base_job_name=\"tf2-california-housing-train\",\n", + " output_path=model_path,\n", + " hyperparameters=hyperparameters,\n", + " py_version=python_version,\n", + ")\n", + "\n", + "# Use the tf2_estimator in a Sagemaker pipelines ProcessingStep.\n", + "# NOTE how the input to the training job directly references the output of the previous step.\n", + "step_train_model = TrainingStep(\n", + " name=\"Train-California-Housing-Model\",\n", + " estimator=tf2_estimator,\n", + " inputs={\n", + " \"train\": TrainingInput(\n", + " s3_data=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[\n", + " \"train\"\n", + " ].S3Output.S3Uri,\n", + " content_type=\"text/csv\",\n", + " ),\n", + " \"test\": TrainingInput(\n", + " s3_data=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[\n", + " \"test\"\n", + " ].S3Output.S3Uri,\n", + " content_type=\"text/csv\",\n", + " ),\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "064c282c", + "metadata": {}, + "source": [ + "## Evaluate model step\n", + "When a model is trained, it's common to evaluate the model on unseen data before registering it 
with the model registry. This ensures the model registry isn't cluttered with poorly performing model versions. To evaluate the model, create a ScriptProcessor object and use it in a ProcessingStep.\n", + "\n", + "**Note** that a separate preprocessed test dataset is used to evaluate the model, and not the output of the processing step. This is only for demo purposes, to ensure the second run of the pipeline creates a model with better performance. In a real-world scenario, the test output of the processing step would be used." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "afe9f0d5", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile evaluate.py\n", + "\n", + "import os\n", + "import json\n", + "import subprocess\n", + "import sys\n", + "import numpy as np\n", + "import pathlib\n", + "import tarfile\n", + "\n", + "\n", + "def install(package):\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", package])\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + "\n", + " install(\"tensorflow==2.4.1\")\n", + " model_path = f\"/opt/ml/processing/model/model.tar.gz\"\n", + " with tarfile.open(model_path, \"r:gz\") as tar:\n", + " tar.extractall(\"./model\")\n", + " import tensorflow as tf\n", + "\n", + " model = tf.keras.models.load_model(\"./model/1\")\n", + " test_path = \"/opt/ml/processing/test/\"\n", + " x_test = np.load(os.path.join(test_path, \"x_test.npy\"))\n", + " y_test = np.load(os.path.join(test_path, \"y_test.npy\"))\n", + " scores = model.evaluate(x_test, y_test, verbose=2)\n", + " print(\"\\nTest MSE :\", scores)\n", + "\n", + " # Available metrics to add to model: https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor-model-quality-metrics.html\n", + " report_dict = {\n", + " \"regression_metrics\": {\n", + " \"mse\": {\"value\": scores, \"standard_deviation\": \"NaN\"},\n", + " },\n", + " }\n", + "\n", + " output_dir = \"/opt/ml/processing/evaluation\"\n", + " 
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)\n", + "\n", + " evaluation_path = f\"{output_dir}/evaluation.json\"\n", + " with open(evaluation_path, \"w\") as f:\n", + " f.write(json.dumps(report_dict))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6328380", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.workflow.properties import PropertyFile\n", + "\n", + "# Create SKLearnProcessor object.\n", + "# The object contains information about what container to use, what instance type etc.\n", + "evaluate_model_processor = SKLearnProcessor(\n", + " framework_version=framework_version,\n", + " instance_type=processing_instance_type,\n", + " instance_count=1,\n", + " base_job_name=\"tf2-california-housing-evaluate\",\n", + " role=role,\n", + ")\n", + "\n", + "# Create a PropertyFile\n", + "# A PropertyFile is used to be able to reference outputs from a processing step, for instance to use in a condition step.\n", + "# For more information, visit https://docs.aws.amazon.com/sagemaker/latest/dg/build-and-manage-propertyfile.html\n", + "evaluation_report = PropertyFile(\n", + " name=\"EvaluationReport\", output_name=\"evaluation\", path=\"evaluation.json\"\n", + ")\n", + "\n", + "# Use the evaluate_model_processor in a Sagemaker pipelines ProcessingStep.\n", + "step_evaluate_model = ProcessingStep(\n", + " name=\"Evaluate-California-Housing-Model\",\n", + " processor=evaluate_model_processor,\n", + " inputs=[\n", + " ProcessingInput(\n", + " source=step_train_model.properties.ModelArtifacts.S3ModelArtifacts,\n", + " destination=\"/opt/ml/processing/model\",\n", + " ),\n", + " ProcessingInput(\n", + " source=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[\n", + " \"test\"\n", + " ].S3Output.S3Uri,\n", + " destination=\"/opt/ml/processing/test\",\n", + " ),\n", + " ],\n", + " outputs=[\n", + " ProcessingOutput(output_name=\"evaluation\", source=\"/opt/ml/processing/evaluation\"),\n", + " ],\n", + " 
code=\"evaluate.py\",\n", + " property_files=[evaluation_report],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "4a3b941b", + "metadata": {}, + "source": [ + "## Send E-Mail Lambda Step\n", + "\n", + "When defining the `LambdaStep`, the SageMaker Lambda helper class provides helper functions for creating the Lambda function. Users can either use the `lambda_func` argument to provide the function ARN to an already deployed Lambda function OR use the `Lambda` class to create a Lambda function by providing a script, function name and role for the Lambda function.\n", + "\n", + "When passing inputs to the Lambda, the `inputs` argument can be used and within the Lambda function's handler, the `event` argument can be used to retrieve the inputs.\n", + "\n", + "The dictionary response from the Lambda function is parsed through the `LambdaOutput` objects provided to the `outputs` argument. The `output_name` in `LambdaOutput` corresponds to the dictionary key in the Lambda's return dictionary." + ] + }, + { + "cell_type": "markdown", + "id": "bc99b6b7", + "metadata": {}, + "source": [ + "#### Define the Lambda function\n", + "\n", + "Users can choose to leverage the Lambda helper class to create a Lambda function and provide that function object to the `LambdaStep`. Alternatively, users can use a pre-deployed Lambda function and provide the function ARN to the `Lambda` helper class in the lambda step.\n", + "\n", + "Here, if the MSE is higher than the threshold, an E-Mail will be sent to the Data Science team.\n", + "\n", + "Note that the E-Mail sending part is left for you to implement by the framework you choose." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4305f405", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile send_email_lambda.py\n", + "\n", + "\"\"\"\n", + "This Lambda function sends an E-Mail to the Data Science team with the MSE from model evaluation step. 
\n", + "The evaluation.json location in S3 is provided via the `event` argument\n", + "\"\"\"\n", + "\n", + "import json\n", + "import boto3\n", + "\n", + "\n", + "s3_client = client = boto3.client(\"s3\")\n", + "\n", + "\n", + "def lambda_handler(event, context):\n", + "\n", + " print(f\"Received Event: {event}\")\n", + "\n", + " evaluation_s3_uri = event[\"evaluation_s3_uri\"]\n", + " path_parts = evaluation_s3_uri.replace(\"s3://\", \"\").split(\"/\")\n", + " bucket = path_parts.pop(0)\n", + " key = \"/\".join(path_parts)\n", + "\n", + " content = s3_client.get_object(Bucket=bucket, Key=key)\n", + " text = content[\"Body\"].read().decode()\n", + " evaluation_json = json.loads(text)\n", + " mse = evaluation_json[\"regression_metrics\"][\"mse\"][\"value\"]\n", + "\n", + " subject_line = \"Please check high MSE ({}) detected on model evaluation\".format(mse)\n", + " print(f\"Sending E-Mail to Data Science Team with subject line: {subject_line}\")\n", + "\n", + " # TODO - ADD YOUR CODE TO SEND EMAIL...\n", + "\n", + " return {\"statusCode\": 200, \"body\": json.dumps(\"E-Mail Sent Successfully\")}" + ] + }, + { + "cell_type": "markdown", + "id": "2dabb9f4", + "metadata": {}, + "source": [ + "#### IAM Role\n", + "\n", + "The Lambda function needs an IAM role that will allow it to read the `evaluation.json` from S3. The role ARN must be provided in the `LambdaStep`.\n", + "\n", + "A helper function in `iam_helper.py` is available to create the Lambda function role. Please note that the role uses the Amazon managed policy - `AmazonS3ReadOnlyAccess`. This should be replaced with an IAM policy with the least privileges as per AWS IAM best practices." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25598115", + "metadata": {}, + "outputs": [], + "source": [ + "from iam_helper import create_s3_lambda_role\n", + "\n", + "lambda_role = create_s3_lambda_role(\"send-email-to-ds-team-lambda-role\")" + ] + }, + { + "cell_type": "markdown", + "id": "edcfb5a3", + "metadata": {}, + "source": [ + "#### Create the Lambda Function step" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eeb988d9", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.workflow.lambda_step import LambdaStep\n", + "from sagemaker.lambda_helper import Lambda\n", + "\n", + "evaluation_s3_uri = \"{}/evaluation.json\".format(\n", + " step_evaluate_model.arguments[\"ProcessingOutputConfig\"][\"Outputs\"][0][\"S3Output\"][\"S3Uri\"]\n", + ")\n", + "\n", + "send_email_lambda_function_name = \"sagemaker-send-email-to-ds-team-lambda-\" + current_time\n", + "\n", + "send_email_lambda_function = Lambda(\n", + " function_name=send_email_lambda_function_name,\n", + " execution_role_arn=lambda_role,\n", + " script=\"send_email_lambda.py\",\n", + " handler=\"send_email_lambda.lambda_handler\",\n", + ")\n", + "\n", + "step_higher_mse_send_email_lambda = LambdaStep(\n", + " name=\"Send-Email-To-DS-Team\",\n", + " lambda_func=send_email_lambda_function,\n", + " inputs={\"evaluation_s3_uri\": evaluation_s3_uri},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0d5935ba", + "metadata": {}, + "source": [ + "## Register model step\n", + "If the trained model meets the model performance requirements a new model version is registered with the model registry for further analysis. To attach model metrics to the model version, create a [`ModelMetrics`](https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor-model-quality-metrics.html) object using the evaluation report created in the evaluation step. Then, create the RegisterModel step." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68bb7f5d", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.model_metrics import MetricsSource, ModelMetrics\n", + "from sagemaker.workflow.step_collections import RegisterModel\n", + "\n", + "# Create ModelMetrics object using the evaluation report from the evaluation step\n", + "# A ModelMetrics object contains metrics captured from a model.\n", + "model_metrics = ModelMetrics(\n", + " model_statistics=MetricsSource(\n", + " s3_uri=evaluation_s3_uri,\n", + " content_type=\"application/json\",\n", + " )\n", + ")\n", + "\n", + "# Create a RegisterModel step, which registers the model with Sagemaker Model Registry.\n", + "step_register_model = RegisterModel(\n", + " name=\"Register-California-Housing-Model\",\n", + " estimator=tf2_estimator,\n", + " model_data=step_train_model.properties.ModelArtifacts.S3ModelArtifacts,\n", + " content_types=[\"text/csv\"],\n", + " response_types=[\"text/csv\"],\n", + " inference_instances=[\"ml.m5.large\", \"ml.m5.xlarge\"],\n", + " transform_instances=[\"ml.m5.xlarge\"],\n", + " model_package_group_name=model_package_group_name,\n", + " model_metrics=model_metrics,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1f8e48d8", + "metadata": {}, + "source": [ + "## Create the model\n", + "\n", + "The model is created and the name of the model is provided to the Lambda function for deployment. The `CreateModelStep` dynamically assigns a name to the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5988ca19", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.workflow.step_collections import CreateModelStep\n", + "from sagemaker.tensorflow.model import TensorFlowModel\n", + "\n", + "model = TensorFlowModel(\n", + " role=role,\n", + " model_data=step_train_model.properties.ModelArtifacts.S3ModelArtifacts,\n", + " framework_version=tensorflow_version,\n", + " sagemaker_session=sagemaker_session,\n", + ")\n", + "\n", + "step_create_model = CreateModelStep(\n", + " name=\"Create-California-Housing-Model\",\n", + " model=model,\n", + " inputs=sagemaker.inputs.CreateModelInput(instance_type=endpoint_instance_type),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "430c26ff", + "metadata": {}, + "source": [ + "## Deploy model to SageMaker Endpoint Lambda Step\n", + "\n", + "When defining the `LambdaStep`, the SageMaker Lambda helper class provides helper functions for creating the Lambda function. Users can either use the `lambda_func` argument to provide the function ARN to an already deployed Lambda function OR use the `Lambda` class to create a Lambda function by providing a script, function name and role for the Lambda function.\n", + "\n", + "When passing inputs to the Lambda, the `inputs` argument can be used and within the Lambda function's handler, the `event` argument can be used to retrieve the inputs.\n", + "\n", + "The dictionary response from the Lambda function is parsed through the `LambdaOutput` objects provided to the `outputs` argument. The `output_name` in `LambdaOutput` corresponds to the dictionary key in the Lambda's return dictionary." + ] + }, + { + "cell_type": "markdown", + "id": "85e25e59", + "metadata": {}, + "source": [ + "### Define the Lambda function\n", + "\n", + "Here, the Lambda Function will deploy the model to SageMaker Endpoint." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d56cf4b4", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile deploy_model_lambda.py\n", + "\n", + "\n", + "\"\"\"\n", + "This Lambda function deploys the model to SageMaker Endpoint. \n", + "If Endpoint exists, then Endpoint will be updated with new Endpoint Config.\n", + "\"\"\"\n", + "\n", + "import json\n", + "import boto3\n", + "import time\n", + "\n", + "\n", + "sm_client = boto3.client(\"sagemaker\")\n", + "\n", + "\n", + "def lambda_handler(event, context):\n", + "\n", + " print(f\"Received Event: {event}\")\n", + "\n", + " current_time = time.strftime(\"%m-%d-%H-%M-%S\", time.localtime())\n", + " endpoint_instance_type = event[\"endpoint_instance_type\"]\n", + " model_name = event[\"model_name\"]\n", + " endpoint_config_name = \"{}-{}\".format(event[\"endpoint_config_name\"], current_time)\n", + " endpoint_name = event[\"endpoint_name\"]\n", + "\n", + " # Create Endpoint Configuration\n", + " create_endpoint_config_response = sm_client.create_endpoint_config(\n", + " EndpointConfigName=endpoint_config_name,\n", + " ProductionVariants=[\n", + " {\n", + " \"InstanceType\": endpoint_instance_type,\n", + " \"InitialVariantWeight\": 1,\n", + " \"InitialInstanceCount\": 1,\n", + " \"ModelName\": model_name,\n", + " \"VariantName\": \"AllTraffic\",\n", + " }\n", + " ],\n", + " )\n", + " print(f\"create_endpoint_config_response: {create_endpoint_config_response}\")\n", + "\n", + " # Check if an endpoint exists. 
If no - Create new endpoint, if yes - Update existing endpoint\n", + " list_endpoints_response = sm_client.list_endpoints(\n", + " SortBy=\"CreationTime\",\n", + " SortOrder=\"Descending\",\n", + " NameContains=endpoint_name,\n", + " )\n", + " print(f\"list_endpoints_response: {list_endpoints_response}\")\n", + "\n", + " if len(list_endpoints_response[\"Endpoints\"]) > 0:\n", + " print(\"Updating Endpoint with new Endpoint Configuration\")\n", + " update_endpoint_response = sm_client.update_endpoint(\n", + " EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name\n", + " )\n", + " print(f\"update_endpoint_response: {update_endpoint_response}\")\n", + " else:\n", + " print(\"Creating Endpoint\")\n", + " create_endpoint_response = sm_client.create_endpoint(\n", + " EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name\n", + " )\n", + " print(f\"create_endpoint_response: {create_endpoint_response}\")\n", + "\n", + " return {\"statusCode\": 200, \"body\": json.dumps(\"Endpoint Created Successfully\")}" + ] + }, + { + "cell_type": "markdown", + "id": "d58d4485", + "metadata": {}, + "source": [ + "#### IAM Role\n", + "\n", + "The Lambda function needs an IAM role that will allow it to deploy a SageMaker Endpoint. The role ARN must be provided in the `LambdaStep`.\n", + "\n", + "A helper function in `iam_helper.py` is available to create the Lambda function role. Please note that the role uses the Amazon managed policy - `AmazonSageMakerFullAccess`. This should be replaced with an IAM policy with the least privileges as per AWS IAM best practices." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e1f2a35", + "metadata": {}, + "outputs": [], + "source": [ + "from iam_helper import create_sagemaker_lambda_role\n", + "\n", + "lambda_role = create_sagemaker_lambda_role(\"deploy-model-lambda-role\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37a56740", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.workflow.lambda_step import LambdaStep\n", + "from sagemaker.lambda_helper import Lambda\n", + "\n", + "endpoint_config_name = \"tf2-california-housing-endpoint-config\"\n", + "endpoint_name = \"tf2-california-housing-endpoint-\" + current_time\n", + "\n", + "deploy_model_lambda_function_name = \"sagemaker-deploy-model-lambda-\" + current_time\n", + "\n", + "deploy_model_lambda_function = Lambda(\n", + " function_name=deploy_model_lambda_function_name,\n", + " execution_role_arn=lambda_role,\n", + " script=\"deploy_model_lambda.py\",\n", + " handler=\"deploy_model_lambda.lambda_handler\",\n", + ")\n", + "\n", + "step_lower_mse_deploy_model_lambda = LambdaStep(\n", + " name=\"Deploy-California-Housing-Model-To-Endpoint\",\n", + " lambda_func=deploy_model_lambda_function,\n", + " inputs={\n", + " \"model_name\": step_create_model.properties.ModelName,\n", + " \"endpoint_config_name\": endpoint_config_name,\n", + " \"endpoint_name\": endpoint_name,\n", + " \"endpoint_instance_type\": endpoint_instance_type,\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "787921fb", + "metadata": {}, + "source": [ + "## Accuracy condition step\n", + "Adding conditions to the pipeline is done with a ConditionStep.\n", + "In this case, we only want to register the new model version with the model registry if the new model meets an accuracy condition." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b27277dd", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo\n", + "from sagemaker.workflow.condition_step import (\n", + " ConditionStep,\n", + " JsonGet,\n", + ")\n", + "\n", + "# Create accuracy condition to ensure the model meets performance requirements.\n", + "# Models with a test accuracy lower than the condition will not be registered with the model registry.\n", + "cond_lte = ConditionLessThanOrEqualTo(\n", + " left=JsonGet(\n", + " step=step_evaluate_model,\n", + " property_file=evaluation_report,\n", + " json_path=\"regression_metrics.mse.value\",\n", + " ),\n", + " right=accuracy_mse_threshold,\n", + ")\n", + "\n", + "# Create a Sagemaker Pipelines ConditionStep, using the condition above.\n", + "# Enter the steps to perform if the condition returns True / False.\n", + "step_cond = ConditionStep(\n", + " name=\"MSE-Lower-Than-Threshold-Condition\",\n", + " conditions=[cond_lte],\n", + " if_steps=[step_register_model, step_create_model, step_lower_mse_deploy_model_lambda],\n", + " else_steps=[step_higher_mse_send_email_lambda],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3d243e6e", + "metadata": {}, + "source": [ + "## Pipeline Creation: Orchestrate all steps\n", + "\n", + "Now that all pipeline steps are created, a pipeline is created." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6bd9d5e", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.workflow.pipeline import Pipeline\n", + "\n", + "# Create a Sagemaker Pipeline.\n", + "# Each parameter for the pipeline must be set as a parameter explicitly when the pipeline is created.\n", + "# Also pass in each of the steps created above.\n", + "# Note that the order of execution is determined from each step's dependencies on other steps,\n", + "# not on the order they are passed in below.\n", + "pipeline = Pipeline(\n", + " name=pipeline_name,\n", + " parameters=[\n", + " processing_instance_type,\n", + " training_instance_type,\n", + " input_data,\n", + " training_epochs,\n", + " accuracy_mse_threshold,\n", + " endpoint_instance_type,\n", + " ],\n", + " steps=[step_preprocess_data, step_train_model, step_evaluate_model, step_cond],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ca4af181", + "metadata": {}, + "source": [ + "## Execute the Pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "86f6cb54", + "metadata": {}, + "source": [ + "### List the execution steps to check out the status and artifacts:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "198c583a", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "definition = json.loads(pipeline.definition())\n", + "definition" + ] + }, + { + "cell_type": "markdown", + "id": "079623b1", + "metadata": {}, + "source": [ + "### Submit pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "438e8401", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.upsert(role_arn=role)" + ] + }, + { + "cell_type": "markdown", + "id": "fb6a7e29", + "metadata": {}, + "source": [ + "### Execute pipeline using the default parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11c27cd7", + "metadata": {}, + "outputs": [], + "source": [ + "execution = 
pipeline.start()" + ] + }, + { + "cell_type": "markdown", + "id": "72eb39b1", + "metadata": {}, + "source": [ + "### Wait for pipeline to complete" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95e32f80", + "metadata": {}, + "outputs": [], + "source": [ + "execution.wait()" + ] + }, + { + "cell_type": "markdown", + "id": "e93d8920", + "metadata": {}, + "source": [ + "## Visualize SageMaker Pipeline - MSE lower than the threshold\n", + "In SageMaker Studio, choose `SageMaker Components and registries` in the left pane and under `Pipelines`, click the pipeline that was created. Then all pipeline executions are shown, and the one just created should have a status of `Succeeded`. Selecting that execution, the different pipeline steps can be tracked as they execute.\n", + "\n", + "You can see that the `Register-California-Housing-Model` step was executed.\n", + "\n", + "![](images/pipeline_run_1_mse_lower_than_threshold.png \"Pipeline - MSE lower than the threshold\")" + ] + }, + { + "cell_type": "markdown", + "id": "60731a94", + "metadata": {}, + "source": [ + "## Start a pipeline with 2 epochs to trigger the `send-email-to-ds-team-lambda` Lambda Function\n", + "\n", + "\n", + "Run the pipeline again, but this time, with only 2 epochs and a lower MSE Threshold of 0.2. This will result in a higher MSE value on model evaluation, and will cause the `send-email-to-ds-team-lambda` Lambda Function to be triggered. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e15ba8ba", + "metadata": {}, + "outputs": [], + "source": [ + "# Execute pipeline with explicit parameters\n", + "execution = pipeline.start(parameters=dict(TrainingEpochs=2, AccuracyMseThreshold=0.2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1d274be", + "metadata": {}, + "outputs": [], + "source": [ + "execution.wait()" + ] + }, + { + "cell_type": "markdown", + "id": "26176917", + "metadata": {}, + "source": [ + "## Visualize SageMaker Pipeline - MSE higher than the threshold\n", + "In SageMaker Studio, choose `SageMaker Components and registries` in the left pane and under `Pipelines`, click the pipeline that was created. Then all pipeline executions are shown, and the one just created should have a status of `Succeeded`. Selecting that execution, the different pipeline steps can be tracked as they execute.\n", + "\n", + "You can see that the `Send-Email-To-DS-Team` step was executed.\n", + "\n", + "![](images/pipeline_run_1_mse_higher_than_threshold.png \"Pipeline - MSE higher than the threshold\")" + ] + }, + { + "cell_type": "markdown", + "id": "9d042db6", + "metadata": {}, + "source": [ + "## Clean up (optional)" + ] + }, + { + "cell_type": "markdown", + "id": "76beb434", + "metadata": {}, + "source": [ + "#### Stop / Close the Endpoint\n", + "You should delete the endpoint before you close the notebook if you don't need to keep the endpoint running for serving real-time predictions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7fd2ea6", + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "\n", + "client = boto3.client(\"sagemaker\")\n", + "client.delete_endpoint(EndpointName=endpoint_name)" + ] + }, + { + "cell_type": "markdown", + "id": "530a3c09", + "metadata": {}, + "source": [ + "#### Delete the model registry and the pipeline to keep the studio environment tidy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00f02f22", + "metadata": {}, + "outputs": [], + "source": [ + "def delete_model_package_group(sm_client, package_group_name):\n", + " try:\n", + " model_versions = sm_client.list_model_packages(ModelPackageGroupName=package_group_name)\n", + "\n", + " except Exception as e:\n", + " print(\"{} \\n\".format(e))\n", + " return\n", + "\n", + " for model_version in model_versions[\"ModelPackageSummaryList\"]:\n", + " try:\n", + " sm_client.delete_model_package(ModelPackageName=model_version[\"ModelPackageArn\"])\n", + " except Exception as e:\n", + " print(\"{} \\n\".format(e))\n", + " time.sleep(0.5) # Ensure requests aren't throttled\n", + "\n", + " try:\n", + " sm_client.delete_model_package_group(ModelPackageGroupName=package_group_name)\n", + " print(\"{} model package group deleted\".format(package_group_name))\n", + " except Exception as e:\n", + " print(\"{} \\n\".format(e))\n", + " return\n", + "\n", + "\n", + "def delete_sagemaker_pipeline(sm_client, pipeline_name):\n", + " try:\n", + " sm_client.delete_pipeline(\n", + " PipelineName=pipeline_name,\n", + " )\n", + " print(\"{} pipeline deleted\".format(pipeline_name))\n", + " except Exception as e:\n", + " print(\"{} \\n\".format(e))\n", + " return" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66fc04ba", + "metadata": {}, + "outputs": [], + "source": [ + "delete_model_package_group(client, model_package_group_name)\n", + "delete_sagemaker_pipeline(client, pipeline_name)" + ] + }, + { + "cell_type": "markdown", + "id": "fba7e83e", + "metadata": {}, + "source": [ + "#### Delete the Lambda functions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e12be1f1", + "metadata": {}, + "outputs": [], + "source": [ + "send_email_lambda_function.delete()\n", + "deploy_model_lambda_function.delete()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef56efc7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "Python 3 (Data Science)", + "language": "python", + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/sagemaker-pipelines/tabular/train-register-deploy-pipeline-model/train register and deploy a pipeline model.ipynb b/sagemaker-pipelines/tabular/train-register-deploy-pipeline-model/train register and deploy a pipeline model.ipynb new file mode 100644 index 0000000000..8a6083dca4 --- /dev/null +++ b/sagemaker-pipelines/tabular/train-register-deploy-pipeline-model/train register and deploy a pipeline model.ipynb @@ -0,0 +1,1115 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SageMaker Pipelines\n", + "\n", + "The following notebook shows how to create an Amazon SageMaker Pipeline that builds and trains a **PipelineModel** consisting of a preprocessing SKLearn script followed by a TensorFlow model. The pipeline model is then registered to the Model Registry and deployed from there into a real-time endpoint. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import time\n", + "import boto3\n", + "import numpy as np\n", + "import pandas as pd\n", + "import sagemaker\n", + "from sagemaker import get_execution_role" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sess = boto3.Session()\n", + "sm = sess.client(\"sagemaker\")\n", + "role = get_execution_role()\n", + "sagemaker_session = sagemaker.Session(boto_session=sess)\n", + "bucket = sagemaker_session.default_bucket()\n", + "region = boto3.Session().region_name\n", + "\n", + "model_package_group_name = \"PipelineModelPackageGroup\"\n", + "prefix = \"pipeline-model-example\"\n", + "pipeline_name = \"TrainingPipelineForModel\" # SageMaker Pipeline name\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download California Housing dataset and upload to Amazon S3\n", + "\n", + "We use the California housing dataset.\n", + "\n", + "More info on the dataset:\n", + "\n", + "This dataset was obtained from the StatLib repository. http://lib.stat.cmu.edu/datasets/\n", + "\n", + "The target variable is the median house value for California districts.\n", + "\n", + "This dataset was derived from the 1990 U.S. census, using one row per census block group. A block group is the smallest geographical unit for which the U.S. Census Bureau publishes sample data (a block group typically has a population of 600 to 3,000 people)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_dir = os.path.join(os.getcwd(), \"data\")\n", + "os.makedirs(data_dir, exist_ok=True)\n", + "\n", + "raw_dir = os.path.join(os.getcwd(), \"data/raw\")\n", + "os.makedirs(raw_dir, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/california_housing/cal_housing.tgz .\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!tar -zxf cal_housing.tgz\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "columns = [\n", + " \"longitude\",\n", + " \"latitude\",\n", + " \"housingMedianAge\",\n", + " \"totalRooms\",\n", + " \"totalBedrooms\",\n", + " \"population\",\n", + " \"households\",\n", + " \"medianIncome\",\n", + " \"medianHouseValue\",\n", + "]\n", + "cal_housing_df = pd.read_csv(\"CaliforniaHousing/cal_housing.data\", names=columns, header=None)\n", + "cal_housing_df['medianHouseValue'] /= 500000 # Scaling target down to avoid overcomplicating the example\n", + "cal_housing_df.to_csv(f\"./data/raw/raw_data_all.csv\", header=True, index=False)\n", + "rawdata_s3_prefix = \"{}/data/raw\".format(prefix)\n", + "raw_s3 = sagemaker_session.upload_data(path=\"./data/raw/\", key_prefix=rawdata_s3_prefix)\n", + "print(raw_s3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Parameters to Parametrize Pipeline Execution\n", + "\n", + "Define Pipeline parameters that you can use to parametrize the pipeline. 
Parameters enable custom pipeline executions and schedules without having to modify the Pipeline definition.\n", + "\n", + "The supported parameter types include:\n", + "\n", + "- ParameterString - represents a str Python type\n", + "- ParameterInteger - represents an int Python type\n", + "- ParameterFloat - represents a float Python type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.workflow.parameters import ParameterInteger, ParameterString, ParameterFloat\n", + "\n", + "# raw input data\n", + "input_data = ParameterString(name=\"InputData\", default_value=raw_s3)\n", + "\n", + "# status of newly trained model in registry\n", + "model_approval_status = ParameterString(name=\"ModelApprovalStatus\", default_value=\"Approved\") # PendingManualApproval | Rejected\n", + "\n", + "# processing step parameters\n", + "processing_instance_count = ParameterInteger(name=\"ProcessingInstanceCount\", default_value=1)\n", + "processing_instance_type = ParameterString(\n", + " name=\"ProcessingInstanceType\", default_value=\"ml.m5.large\"\n", + ")\n", + "\n", + "# training step parameters\n", + "training_instance_type = ParameterString(name=\"TrainingInstanceType\", default_value=\"ml.m5.xlarge\")\n", + "training_epochs = ParameterString(name=\"TrainingEpochs\", default_value=\"100\")\n", + "\n", + "# model performance step parameters\n", + "accuracy_mse_threshold = ParameterFloat(name=\"AccuracyMseThreshold\", default_value=0.75)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define a Processing Step for Feature Engineering\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The below preprocessing script, in addition to creating a scaler, contains the necessary functions for it to be deployed as part of a pipeline model. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile code/preprocess.py\n", + "\n", + "import glob\n", + "import numpy as np\n", + "import pandas as pd\n", + "import os\n", + "import json\n", + "import joblib\n", + "from io import StringIO\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split\n", + "import tarfile\n", + "try:\n", + " from sagemaker_containers.beta.framework import (\n", + " content_types, encoders, env, modules, transformer, worker, server)\n", + "except ImportError:\n", + " pass\n", + "\n", + "feature_columns = [\n", + " \"longitude\",\n", + " \"latitude\",\n", + " \"housingMedianAge\",\n", + " \"totalRooms\",\n", + " \"totalBedrooms\",\n", + " \"population\",\n", + " \"households\",\n", + " \"medianIncome\",\n", + " ]\n", + "label_column = 'medianHouseValue'\n", + "\n", + "base_dir = \"/opt/ml/processing\"\n", + "base_output_dir = \"/opt/ml/output/\"\n", + " \n", + "if __name__ == \"__main__\": \n", + " df = pd.read_csv(f\"{base_dir}/input/raw_data_all.csv\")\n", + " feature_data = df.drop(label_column, axis=1, inplace=False)\n", + " label_data = df[label_column]\n", + " x_train, x_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.33)\n", + " \n", + " scaler = StandardScaler()\n", + "\n", + " scaler.fit(x_train)\n", + " x_train = scaler.transform(x_train)\n", + " x_test = scaler.transform(x_test)\n", + " \n", + " train_dataset = pd.concat([pd.DataFrame(x_train), y_train.reset_index(drop=True)], axis=1)\n", + " test_dataset = pd.concat([pd.DataFrame(x_test), y_test.reset_index(drop=True)], axis=1)\n", + "\n", + " train_dataset.columns=feature_columns+[label_column]\n", + " test_dataset.columns=feature_columns+[label_column]\n", + " \n", + " 
train_dataset.to_csv(f\"{base_dir}/train/train.csv\", header=True, index=False)\n", + " test_dataset.to_csv(f\"{base_dir}/test/test.csv\", header=True, index=False)\n", + " joblib.dump(scaler, \"model.joblib\")\n", + " with tarfile.open(f\"{base_dir}/scaler_model/model.tar.gz\", \"w:gz\") as tar_handle:\n", + " tar_handle.add(f\"model.joblib\")\n", + " \n", + "\n", + "def input_fn(input_data, content_type):\n", + " \"\"\"Parse input data payload\n", + "\n", + " We currently only take csv input. Since we need to process both labelled\n", + " and unlabelled data we first determine whether the label column is present\n", + " by looking at how many columns were provided.\n", + " \"\"\"\n", + " if content_type == 'text/csv':\n", + " # Read the raw input data as CSV.\n", + " df = pd.read_csv(StringIO(input_data), header=None)\n", + "\n", + " if len(df.columns) == len(feature_columns) + 1:\n", + " # This is a labelled example, includes the ring label\n", + " df.columns = feature_columns + [label_column]\n", + " elif len(df.columns) == len(feature_columns):\n", + " # This is an unlabelled example.\n", + " df.columns = feature_columns\n", + "\n", + " return df\n", + " else:\n", + " raise ValueError(\"{} not supported by script!\".format(content_type))\n", + "\n", + "\n", + "def output_fn(prediction, accept):\n", + " \"\"\"Format prediction output\n", + "\n", + " The default accept/content-type between containers for serial inference is JSON.\n", + " We also want to set the ContentType or mimetype as the same value as accept so the next\n", + " container can read the response payload correctly.\n", + " \"\"\"\n", + " if accept == \"application/json\":\n", + " instances = []\n", + " for row in prediction.tolist():\n", + " instances.append(row)\n", + " json_output = {\"instances\": instances}\n", + "\n", + " return worker.Response(json.dumps(json_output), mimetype=accept)\n", + " elif accept == 'text/csv':\n", + " return worker.Response(encoders.encode(prediction, accept), 
mimetype=accept)\n", + " else:\n", + " raise RuntimeException(\"{} accept type is not supported by this script.\".format(accept))\n", + "\n", + "\n", + "def predict_fn(input_data, model):\n", + " \"\"\"Preprocess input data\n", + "\n", + " We implement this because the default predict_fn uses .predict(), but our model is a preprocessor\n", + " so we want to use .transform().\n", + "\n", + " The output is returned in the following order:\n", + "\n", + " rest of features either one hot encoded or standardized\n", + " \"\"\"\n", + " features = model.transform(input_data)\n", + "\n", + " if label_column in input_data:\n", + " # Return the label (as the first column) and the set of features.\n", + " return np.insert(features, 0, input_data[label_column], axis=1)\n", + " else:\n", + " # Return only the set of features\n", + " return features\n", + "\n", + "\n", + "def model_fn(model_dir):\n", + " \"\"\"Deserialize fitted model\n", + " \"\"\"\n", + " preprocessor = joblib.load(os.path.join(model_dir, \"model.joblib\"))\n", + " return preprocessor\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.sklearn.processing import SKLearnProcessor\n", + "\n", + "\n", + "sklearn_framework_version = \"0.23-1\"\n", + "\n", + "sklearn_processor = SKLearnProcessor(\n", + " framework_version=sklearn_framework_version,\n", + " instance_type=processing_instance_type,\n", + " instance_count=processing_instance_count,\n", + " base_job_name=\"sklearn-housing-data-process\",\n", + " role=role,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", + "from sagemaker.workflow.steps import ProcessingStep\n", + "\n", + "\n", + "step_process = ProcessingStep(\n", + " name=\"PreprocessData\",\n", + " processor=sklearn_processor,\n", + " inputs=[\n", + " 
ProcessingInput(source=input_data, destination=\"/opt/ml/processing/input\"),\n", + " ],\n", + " outputs=[\n", + " ProcessingOutput(output_name=\"scaler_model\", source=\"/opt/ml/processing/scaler_model\"),\n", + " ProcessingOutput(output_name=\"train\", source=\"/opt/ml/processing/train\"),\n", + " ProcessingOutput(output_name=\"test\", source=\"/opt/ml/processing/test\"),\n", + " ],\n", + " code=\"code/preprocess.py\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define a Training Step to Train a Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile code/train.py\n", + "\n", + "import argparse\n", + "import numpy as np\n", + "import os\n", + "import tensorflow as tf\n", + "import pandas as pd\n", + "\n", + "feature_columns = [\n", + " \"longitude\",\n", + " \"latitude\",\n", + " \"housingMedianAge\",\n", + " \"totalRooms\",\n", + " \"totalBedrooms\",\n", + " \"population\",\n", + " \"households\",\n", + " \"medianIncome\",\n", + " ]\n", + "label_column = 'medianHouseValue'\n", + "\n", + "\n", + "def parse_args():\n", + "\n", + " parser = argparse.ArgumentParser()\n", + "\n", + " # hyperparameters sent by the client are passed as command-line arguments to the script\n", + " parser.add_argument('--epochs', type=int, default=1)\n", + " parser.add_argument('--batch_size', type=int, default=64)\n", + " parser.add_argument('--learning_rate', type=float, default=0.1)\n", + "\n", + " # data directories\n", + " parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))\n", + " parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))\n", + "\n", + " # model directory\n", + " parser.add_argument('--sm-model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))\n", + "\n", + " return parser.parse_known_args()\n", + "\n", + "\n", + "def get_train_data(train_dir):\n", + " train_data = 
pd.read_csv(os.path.join(train_dir, 'train.csv'))\n", + " x_train = train_data[feature_columns].to_numpy()\n", + " y_train = train_data[label_column].to_numpy()\n", + " print('x train', x_train.shape,'y train', y_train.shape)\n", + "\n", + " return x_train, y_train\n", + "\n", + "\n", + "def get_test_data(test_dir):\n", + "\n", + " test_data = pd.read_csv(os.path.join(test_dir, 'test.csv'))\n", + " x_test = test_data[feature_columns].to_numpy()\n", + " y_test = test_data[label_column].to_numpy()\n", + " print('x test', x_test.shape,'y test', y_test.shape)\n", + "\n", + " return x_test, y_test\n", + "\n", + "\n", + "def get_model():\n", + "\n", + " inputs = tf.keras.Input(shape=(8,))\n", + " hidden_1 = tf.keras.layers.Dense(8, activation='tanh')(inputs)\n", + " hidden_2 = tf.keras.layers.Dense(4, activation='sigmoid')(hidden_1)\n", + " outputs = tf.keras.layers.Dense(1)(hidden_2)\n", + " return tf.keras.Model(inputs=inputs, outputs=outputs)\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + "\n", + " args, _ = parse_args()\n", + "\n", + " print('Training data location: {}'.format(args.train))\n", + " print('Test data location: {}'.format(args.test))\n", + " x_train, y_train = get_train_data(args.train)\n", + " x_test, y_test = get_test_data(args.test)\n", + "\n", + " batch_size = args.batch_size\n", + " epochs = args.epochs\n", + " learning_rate = args.learning_rate\n", + " print('batch_size = {}, epochs = {}, learning rate = {}'.format(batch_size, epochs, learning_rate))\n", + "\n", + "\n", + " model = get_model()\n", + " optimizer = tf.keras.optimizers.SGD(learning_rate)\n", + " model.compile(optimizer=optimizer, loss='mse')\n", + " model.fit(x_train,\n", + " y_train,\n", + " batch_size=batch_size,\n", + " epochs=epochs,\n", + " validation_data=(x_test, y_test))\n", + "\n", + " # evaluate on test set\n", + " scores = model.evaluate(x_test, y_test, batch_size, verbose=2)\n", + " print(\"\\nTest MSE :\", scores)\n", + "\n", + " # save model\n", + " 
model.save(args.sm_model_dir + '/1')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.tensorflow import TensorFlow\n", + "from sagemaker.inputs import TrainingInput\n", + "from sagemaker.workflow.steps import TrainingStep\n", + "from sagemaker.workflow.step_collections import RegisterModel\n", + "import time\n", + "\n", + "# Where to store the trained model\n", + "model_path = f\"s3://{bucket}/{prefix}/model/\"\n", + "\n", + "hyperparameters = {\"epochs\": training_epochs}\n", + "tensorflow_version = \"2.4.1\"\n", + "python_version = \"py37\"\n", + "\n", + "tf2_estimator = TensorFlow(\n", + " source_dir=\"code\",\n", + " entry_point=\"train.py\",\n", + " instance_type=training_instance_type,\n", + " instance_count=1,\n", + " framework_version=tensorflow_version,\n", + " role=role,\n", + " base_job_name=\"tensorflow-train-model\",\n", + " output_path=model_path,\n", + " hyperparameters=hyperparameters,\n", + " py_version=python_version,\n", + ")\n", + "\n", + "# Use the tf2_estimator in a Sagemaker pipelines ProcessingStep.\n", + "# NOTE how the input to the training job directly references the output of the previous step.\n", + "step_train_model = TrainingStep(\n", + " name=\"TrainTensorflowModel\",\n", + " estimator=tf2_estimator,\n", + " inputs={\n", + " \"train\": TrainingInput(\n", + " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\n", + " \"train\"\n", + " ].S3Output.S3Uri,\n", + " content_type=\"text/csv\",\n", + " ),\n", + " \"test\": TrainingInput(\n", + " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\n", + " \"test\"\n", + " ].S3Output.S3Uri,\n", + " content_type=\"text/csv\",\n", + " ),\n", + " },\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define a Model Evaluation Step to Evaluate the 
Trained Model\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile code/evaluate.py\n", + "\n", + "import os\n", + "import json\n", + "import sys\n", + "import numpy as np\n", + "import pandas as pd\n", + "import pathlib\n", + "import tarfile\n", + "\n", + "\n", + "feature_columns = [\n", + " \"longitude\",\n", + " \"latitude\",\n", + " \"housingMedianAge\",\n", + " \"totalRooms\",\n", + " \"totalBedrooms\",\n", + " \"population\",\n", + " \"households\",\n", + " \"medianIncome\",\n", + " ]\n", + "label_column = 'medianHouseValue'\n", + "\n", + "if __name__ == \"__main__\":\n", + "\n", + " model_path = f\"/opt/ml/processing/model/model.tar.gz\"\n", + " with tarfile.open(model_path, \"r:gz\") as tar:\n", + " tar.extractall(\"./model\")\n", + " import tensorflow as tf\n", + "\n", + " model = tf.keras.models.load_model(\"./model/1\")\n", + " test_path = \"/opt/ml/processing/test/\"\n", + " df = pd.read_csv(test_path+\"/test.csv\")\n", + " x_test = df[feature_columns].to_numpy()\n", + " y_test = df[label_column].to_numpy()\n", + " scores = model.evaluate(x_test, y_test, verbose=2)\n", + " print(\"\\nTest MSE :\", scores)\n", + "\n", + " # Available metrics to add to model: https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor-model-quality-metrics.html\n", + " report_dict = {\n", + " \"regression_metrics\": {\n", + " \"mse\": {\"value\": scores, \"standard_deviation\": \"NaN\"},\n", + " },\n", + " }\n", + "\n", + " output_dir = \"/opt/ml/processing/evaluation\"\n", + " pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)\n", + "\n", + " evaluation_path = f\"{output_dir}/evaluation.json\"\n", + " with open(evaluation_path, \"w\") as f:\n", + " f.write(json.dumps(report_dict))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], 
+ "source": [ + "from sagemaker.workflow.properties import PropertyFile\n", + "from sagemaker.sklearn.processing import ScriptProcessor\n", + "\n", + "tf_eval_image_uri = sagemaker.image_uris.retrieve(\n", + " framework=\"tensorflow\",\n", + " region=region,\n", + " version=tensorflow_version,\n", + " image_scope=\"training\",\n", + " py_version=\"py37\",\n", + " instance_type=training_instance_type,\n", + ")\n", + "\n", + "evaluate_model_processor = ScriptProcessor(\n", + " role=role, \n", + " image_uri=tf_eval_image_uri, \n", + " command=['python3'], \n", + " instance_count=1, \n", + " instance_type=training_instance_type, \n", + ")\n", + "\n", + "# Create a PropertyFile\n", + "# A PropertyFile is used to be able to reference outputs from a processing step, for instance to use in a condition step.\n", + "# For more information, visit https://docs.aws.amazon.com/sagemaker/latest/dg/build-and-manage-propertyfile.html\n", + "evaluation_report = PropertyFile(\n", + " name=\"EvaluationReport\", output_name=\"evaluation\", path=\"evaluation.json\"\n", + ")\n", + "\n", + "# Use the evaluate_model_processor in a Sagemaker pipelines ProcessingStep.\n", + "step_evaluate_model = ProcessingStep(\n", + " name=\"EvaluateModelPerformance\",\n", + " processor=evaluate_model_processor,\n", + " inputs=[\n", + " ProcessingInput(\n", + " source=step_train_model.properties.ModelArtifacts.S3ModelArtifacts,\n", + " destination=\"/opt/ml/processing/model\",\n", + " ),\n", + " ProcessingInput(\n", + " source=step_process.properties.ProcessingOutputConfig.Outputs[\n", + " \"test\"\n", + " ].S3Output.S3Uri,\n", + " destination=\"/opt/ml/processing/test\",\n", + " ),\n", + " ],\n", + " outputs=[\n", + " ProcessingOutput(output_name=\"evaluation\", source=\"/opt/ml/processing/evaluation\"),\n", + " ],\n", + " code=\"code/evaluate.py\",\n", + " property_files=[evaluation_report],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define a Register Model Step to Create a Model Package for the PipelineModel\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.model import Model\n", + "from sagemaker.sklearn.model import SKLearnModel\n", + "from sagemaker import PipelineModel\n", + "\n", + "\n", + "scaler_model_s3 = \"{}/model.tar.gz\".format(\n", + " step_process.arguments[\"ProcessingOutputConfig\"][\"Outputs\"][0][\"S3Output\"][\"S3Uri\"]\n", + ")\n", + "\n", + "scaler_model = SKLearnModel(model_data=scaler_model_s3,\n", + " role=role,\n", + " sagemaker_session=sagemaker_session,\n", + " entry_point=\"code/preprocess.py\",\n", + " framework_version=sklearn_framework_version)\n", + "\n", + "\n", + "tf_model_image_uri = sagemaker.image_uris.retrieve(\n", + " framework=\"tensorflow\",\n", + " region=region,\n", + " version=tensorflow_version,\n", + " image_scope=\"inference\",\n", + " py_version=\"py37\",\n", + " instance_type=training_instance_type,\n", + ")\n", + "\n", + "tf_model = Model(\n", + " image_uri=tf_model_image_uri,\n", + " model_data=step_train_model.properties.ModelArtifacts.S3ModelArtifacts,\n", + " sagemaker_session=sagemaker_session,\n", + " role=role,\n", + ")\n", + "\n", + "pipeline_model = PipelineModel(\n", + " models=[scaler_model, tf_model],\n", + " role=role,\n", + " sagemaker_session=sagemaker_session\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.model_metrics import MetricsSource, ModelMetrics\n", + "from sagemaker.workflow.step_collections import RegisterModel\n", + "\n", + "\n", + "evaluation_s3_uri = \"{}/evaluation.json\".format(\n", + " step_evaluate_model.arguments[\"ProcessingOutputConfig\"][\"Outputs\"][0][\"S3Output\"][\"S3Uri\"]\n", + ")\n", + "\n", + "model_metrics = ModelMetrics(\n", + " 
model_statistics=MetricsSource(\n", + " s3_uri=evaluation_s3_uri,\n", + " content_type=\"application/json\",\n", + " )\n", + ")\n", + "\n", + "step_register_pipeline_model = RegisterModel(\n", + " name=\"PipelineModel\",\n", + " model=pipeline_model,\n", + " content_types=[\"text/csv\"],\n", + " response_types=[\"text/csv\"],\n", + " inference_instances=[\"ml.m5.large\", \"ml.m5.xlarge\"],\n", + " transform_instances=[\"ml.m5.xlarge\"],\n", + " model_package_group_name=model_package_group_name,\n", + " model_metrics=model_metrics,\n", + " approval_status=model_approval_status,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define a Condition Step to Check Accuracy and Conditionally Register a Model in the Model Registry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo\n", + "from sagemaker.workflow.condition_step import (\n", + " ConditionStep,\n", + " JsonGet,\n", + ")\n", + "\n", + "# Create accuracy condition to ensure the model meets performance requirements.\n", + "# Models with a test MSE higher than the threshold will not be registered with the model registry.\n", + "cond_lte = ConditionLessThanOrEqualTo(\n", + " left=JsonGet(\n", + " step=step_evaluate_model,\n", + " property_file=evaluation_report,\n", + " json_path=\"regression_metrics.mse.value\",\n", + " ),\n", + " right=accuracy_mse_threshold,\n", + ")\n", + "\n", + "# Create a Sagemaker Pipelines ConditionStep, using the condition above.\n", + "# Enter the steps to perform if the condition returns True / False.\n", + "step_cond = ConditionStep(\n", + " name=\"MSE-Lower-Than-Threshold-Condition\",\n", + " conditions=[cond_lte],\n", + " if_steps=[step_register_pipeline_model], #step_register_model, step_register_scaler,\n", + " else_steps=[],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +
"## Define a Pipeline of Parameters, Steps, and Conditions\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.workflow.pipeline import Pipeline\n", + "\n", + "# Create a Sagemaker Pipeline.\n", + "# Each parameter for the pipeline must be set as a parameter explicitly when the pipeline is created.\n", + "# Also pass in each of the steps created above.\n", + "# Note that the order of execution is determined from each step's dependencies on other steps,\n", + "# not on the order they are passed in below.\n", + "pipeline = Pipeline(\n", + " name=pipeline_name,\n", + " parameters=[\n", + " processing_instance_type,\n", + " processing_instance_count,\n", + " training_instance_type,\n", + " input_data,\n", + " model_approval_status,\n", + " training_epochs,\n", + " accuracy_mse_threshold,\n", + " ],\n", + " steps=[step_process, step_train_model, step_evaluate_model, step_cond],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "definition = json.loads(pipeline.definition())\n", + "definition" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submit the pipeline to SageMaker and start execution\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.upsert(role_arn=role)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "execution = pipeline.start()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "execution.wait()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type":
"markdown", + "metadata": {}, + "source": [ + "## Deploy latest approved model to a real-time endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile utils.py\n", + "import argparse\n", + "import boto3\n", + "import logging\n", + "import os\n", + "from botocore.exceptions import ClientError\n", + "import tarfile\n", + "import zipfile\n", + "\n", + "logger = logging.getLogger(__name__)\n", + "sm_client = boto3.client(\"sagemaker\")\n", + "\n", + "\n", + "def get_approved_package(model_package_group_name):\n", + " \"\"\"Gets the latest approved model package for a model package group.\n", + "\n", + " Args:\n", + " model_package_group_name: The model package group name.\n", + "\n", + " Returns:\n", + " The SageMaker Model Package ARN.\n", + " \"\"\"\n", + " try:\n", + " # Get the latest approved model package\n", + " response = sm_client.list_model_packages(\n", + " ModelPackageGroupName=model_package_group_name,\n", + " ModelApprovalStatus=\"Approved\",\n", + " SortBy=\"CreationTime\",\n", + " MaxResults=100,\n", + " )\n", + " approved_packages = response[\"ModelPackageSummaryList\"]\n", + "\n", + " # Fetch more packages if none returned with continuation token\n", + " while len(approved_packages) == 0 and \"NextToken\" in response:\n", + " logger.debug(\"Getting more packages for token: {}\".format(response[\"NextToken\"]))\n", + " response = sm_client.list_model_packages(\n", + " ModelPackageGroupName=model_package_group_name,\n", + " ModelApprovalStatus=\"Approved\",\n", + " SortBy=\"CreationTime\",\n", + " MaxResults=100,\n", + " NextToken=response[\"NextToken\"],\n", + " )\n", + " approved_packages.extend(response[\"ModelPackageSummaryList\"])\n", + "\n", + " # Return error if no packages found\n", + " if len(approved_packages) == 0:\n", + " error_message = (\n", + " f\"No approved ModelPackage found for ModelPackageGroup: {model_package_group_name}\"\n", + " )\n", + " 
logger.error(error_message)\n", + " raise Exception(error_message)\n", + "\n", + " # Return the model package arn\n", + " model_package_arn = approved_packages[0][\"ModelPackageArn\"]\n", + " logger.info(f\"Identified the latest approved model package: {model_package_arn}\")\n", + " return approved_packages[0]\n", + " # return model_package_arn\n", + " except ClientError as e:\n", + " error_message = e.response[\"Error\"][\"Message\"]\n", + " logger.error(error_message)\n", + " raise Exception(error_message)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from utils import get_approved_package\n", + "sm_client = boto3.client(\"sagemaker\")\n", + "\n", + "pck = get_approved_package(model_package_group_name) # Reminder: model_package_group_name was defined as \"NominetAbaloneModelPackageGroupName\" at the beginning of the pipeline definition\n", + "model_description = sm_client.describe_model_package(ModelPackageName=pck['ModelPackageArn'])\n", + "\n", + "model_description" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker import ModelPackage\n", + "\n", + "model_package_arn = model_description['ModelPackageArn']\n", + "model = ModelPackage(role=role, \n", + " model_package_arn=model_package_arn, \n", + " sagemaker_session=sagemaker_session)\n", + "\n", + "endpoint_name = 'DEMO-endpoint-' + time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.gmtime())\n", + "print(\"EndpointName= {}\".format(endpoint_name))\n", + "model.deploy(initial_instance_count=1, instance_type='ml.m5.xlarge', endpoint_name=endpoint_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.predictor import Predictor\n", + "predictor = Predictor(endpoint_name=endpoint_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], +
"source": [ + "data = pd.read_csv('data/raw/raw_data_all.csv')\n", + "house_values = data['medianHouseValue']\n", + "data = data.drop('medianHouseValue', axis=1)\n", + "\n", + "pred_count = 10\n", + "payload = data.iloc[:pred_count].to_string(header=False, index=False).replace(' ',',')\n", + "p = predictor.predict(payload, initial_args = {\"ContentType\": \"text/csv\"})\n", + "print(p.decode(\"utf-8\") )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "blue, stop = '\\033[94m', '\\033[0m'\n", + "predictions = json.loads(p.decode(\"utf-8\"))['predictions']\n", + "for i in range(pred_count):\n", + " print(f\"Predicted: {blue}{predictions[i][0]}{stop} and Actual is: {blue}{house_values.iloc[i]}{stop}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clean-up\n", + "Delete the resources created for this example to avoid any unintended charges" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sm_client = boto3.client(\"sagemaker\")\n", + "\n", + "for d in sm_client.list_model_packages(ModelPackageGroupName=model_package_group_name)['ModelPackageSummaryList']:\n", + " print(d['ModelPackageArn'])\n", + " sm_client.delete_model_package(ModelPackageName=d['ModelPackageArn'])\n", + " \n", + "sm_client.delete_model_package_group(ModelPackageGroupName=model_package_group_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "predictor.delete_endpoint()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.delete()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "Python 3 (Data Science)", + "language": "python", + "name": 
"python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:eu-west-1:470317259841:image/datascience-1.0" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/sagemaker-pipelines/tabular/tuning-step/sagemaker-pipelines-tuning-step.ipynb b/sagemaker-pipelines/tabular/tuning-step/sagemaker-pipelines-tuning-step.ipynb index d97fc11ede..fac31fa81b 100644 --- a/sagemaker-pipelines/tabular/tuning-step/sagemaker-pipelines-tuning-step.ipynb +++ b/sagemaker-pipelines/tabular/tuning-step/sagemaker-pipelines-tuning-step.ipynb @@ -41,7 +41,12 @@ "from sagemaker.estimator import Estimator\n", "from sagemaker.inputs import TrainingInput\n", "\n", - "from sagemaker.processing import ProcessingInput, ProcessingOutput, Processor, ScriptProcessor\n", + "from sagemaker.processing import (\n", + " ProcessingInput,\n", + " ProcessingOutput,\n", + " Processor,\n", + " ScriptProcessor,\n", + ")\n", "\n", "from sagemaker import Model\n", "from sagemaker.xgboost import XGBoostPredictor\n", @@ -56,16 +61,22 @@ ")\n", "from sagemaker.workflow.pipeline import Pipeline\n", "from sagemaker.workflow.properties import PropertyFile\n", - "from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CacheConfig, TuningStep\n", + "from sagemaker.workflow.steps import (\n", + " ProcessingStep,\n", + " CacheConfig,\n", + " TuningStep,\n", + ")\n", "from sagemaker.workflow.step_collections import RegisterModel, CreateModelStep\n", "from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo\n", "from sagemaker.workflow.condition_step import (\n", " ConditionStep,\n", " JsonGet,\n", ")\n", + "\n", + "from sagemaker.workflow.functions import Join\n", + "from sagemaker.workflow.execution_variables import 
ExecutionVariables\n", + "\n", "from sagemaker.tuner import (\n", - " IntegerParameter,\n", - " CategoricalParameter,\n", " ContinuousParameter,\n", " HyperparameterTuner,\n", " WarmStartConfig,\n", @@ -84,7 +95,9 @@ "region = sagemaker.Session().boto_region_name\n", "sm_client = boto3.client(\"sagemaker\")\n", "boto_session = boto3.Session(region_name=region)\n", - "sagemaker_session = sagemaker.session.Session(boto_session=boto_session, sagemaker_client=sm_client)" + "sagemaker_session = sagemaker.session.Session(\n", + " boto_session=boto_session, sagemaker_client=sm_client\n", + ")" ] }, { @@ -100,11 +113,15 @@ "base_job_prefix = \"tuning-step-example\"\n", "model_package_group_name = \"tuning-job-model-packages\"\n", "\n", - "processing_instance_count = ParameterInteger(name=\"ProcessingInstanceCount\", default_value=1)\n", + "processing_instance_count = ParameterInteger(\n", + " name=\"ProcessingInstanceCount\", default_value=1\n", + ")\n", "processing_instance_type = ParameterString(\n", " name=\"ProcessingInstanceType\", default_value=\"ml.m5.xlarge\"\n", ")\n", - "training_instance_type = ParameterString(name=\"TrainingInstanceType\", default_value=\"ml.m5.xlarge\")\n", + "training_instance_type = ParameterString(\n", + " name=\"TrainingInstanceType\", default_value=\"ml.m5.xlarge\"\n", + ")\n", "model_approval_status = ParameterString(\n", " name=\"ModelApprovalStatus\", default_value=\"PendingManualApproval\"\n", ")\n", @@ -224,7 +241,10 @@ " numeric_features = list(feature_columns_names)\n", " numeric_features.remove(\"sex\")\n", " numeric_transformer = Pipeline(\n", - " steps=[(\"imputer\", SimpleImputer(strategy=\"median\")), (\"scaler\", StandardScaler())]\n", + " steps=[\n", + " (\"imputer\", SimpleImputer(strategy=\"median\")),\n", + " (\"scaler\", StandardScaler()),\n", + " ]\n", " )\n", "\n", " categorical_features = [\"sex\"]\n", @@ -249,7 +269,9 @@ "\n", " X = np.concatenate((y_pre, X_pre), axis=1)\n", "\n", - " logger.info(\"Splitting %d rows 
of data into train, validation, test datasets.\", len(X))\n", + " logger.info(\n", + " \"Splitting %d rows of data into train, validation, test datasets.\", len(X)\n", + " )\n", " np.random.shuffle(X)\n", " train, validation, test = np.split(X, [int(0.7 * len(X)), int(0.85 * len(X))])\n", "\n", @@ -269,6 +291,10 @@ "source": [ "# Process the training data step using a python script.\n", "# Split the training data set into train, test, and validation datasets\n", + "# When defining the ProcessingOutput destination as a dynamic value using the \n", + "# Pipeline Execution ID, caching will not be in effect as each time the step runs, \n", + "# the step definition changes resulting in new execution. If caching is required, \n", + "# the ProcessingOutput definition should be static\n", "\n", "sklearn_processor = SKLearnProcessor(\n", " framework_version=\"0.23-1\",\n", @@ -282,13 +308,51 @@ " name=\"PreprocessAbaloneDataForHPO\",\n", " processor=sklearn_processor,\n", " outputs=[\n", - " ProcessingOutput(output_name=\"train\", source=\"/opt/ml/processing/train\"),\n", - " ProcessingOutput(output_name=\"validation\", source=\"/opt/ml/processing/validation\"),\n", - " ProcessingOutput(output_name=\"test\", source=\"/opt/ml/processing/test\"),\n", + " ProcessingOutput(\n", + " output_name=\"train\",\n", + " source=\"/opt/ml/processing/train\",\n", + " destination=Join(\n", + " on=\"/\",\n", + " values=[\n", + " \"s3:/\",\n", + " default_bucket,\n", + " base_job_prefix,\n", + " ExecutionVariables.PIPELINE_EXECUTION_ID,\n", + " \"PreprocessAbaloneDataForHPO\",\n", + " ],\n", + " ),\n", + " ),\n", + " ProcessingOutput(\n", + " output_name=\"validation\",\n", + " source=\"/opt/ml/processing/validation\",\n", + " destination=Join(\n", + " on=\"/\",\n", + " values=[\n", + " \"s3:/\",\n", + " default_bucket,\n", + " base_job_prefix,\n", + " ExecutionVariables.PIPELINE_EXECUTION_ID,\n", + " \"PreprocessAbaloneDataForHPO\",\n", + " ],\n", + " ),\n", + " ),\n", + "
ProcessingOutput(\n", + " output_name=\"test\",\n", + " source=\"/opt/ml/processing/test\",\n", + " destination=Join(\n", + " on=\"/\",\n", + " values=[\n", + " \"s3:/\",\n", + " default_bucket,\n", + " base_job_prefix,\n", + " ExecutionVariables.PIPELINE_EXECUTION_ID,\n", + " \"PreprocessAbaloneDataForHPO\",\n", + " ],\n", + " ),\n", + " ),\n", " ],\n", " code=\"preprocess.py\",\n", " job_arguments=[\"--input-data\", input_data],\n", - " cache_config=cache_config,\n", ")" ] }, @@ -366,7 +430,9 @@ " tuner=tuner_log,\n", " inputs={\n", " \"train\": TrainingInput(\n", - " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\"train\"].S3Output.S3Uri,\n", + " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\n", + " \"train\"\n", + " ].S3Output.S3Uri,\n", " content_type=\"text/csv\",\n", " ),\n", " \"validation\": TrainingInput(\n", @@ -429,7 +495,9 @@ " tuner=tuner_log_warm_start,\n", " inputs={\n", " \"train\": TrainingInput(\n", - " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\"train\"].S3Output.S3Uri,\n", + " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\n", + " \"train\"\n", + " ].S3Output.S3Uri,\n", " content_type=\"text/csv\",\n", " ),\n", " \"validation\": TrainingInput(\n", @@ -601,16 +669,22 @@ " processor=script_eval,\n", " inputs=[\n", " ProcessingInput(\n", - " source=step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),\n", + " source=step_tuning.get_top_model_s3_uri(\n", + " top_k=0, s3_bucket=model_bucket_key\n", + " ),\n", " destination=\"/opt/ml/processing/model\",\n", " ),\n", " ProcessingInput(\n", - " source=step_process.properties.ProcessingOutputConfig.Outputs[\"test\"].S3Output.S3Uri,\n", + " source=step_process.properties.ProcessingOutputConfig.Outputs[\n", + " \"test\"\n", + " ].S3Output.S3Uri,\n", " destination=\"/opt/ml/processing/test\",\n", " ),\n", " ],\n", " outputs=[\n", - " ProcessingOutput(output_name=\"evaluation\", 
source=\"/opt/ml/processing/evaluation\"),\n", + " ProcessingOutput(\n", + " output_name=\"evaluation\", source=\"/opt/ml/processing/evaluation\"\n", + " ),\n", " ],\n", " code=\"evaluate.py\",\n", " property_files=[evaluation_report],\n", @@ -620,7 +694,9 @@ "model_metrics = ModelMetrics(\n", " model_statistics=MetricsSource(\n", " s3_uri=\"{}/evaluation.json\".format(\n", - " step_eval.arguments[\"ProcessingOutputConfig\"][\"Outputs\"][0][\"S3Output\"][\"S3Uri\"]\n", + " step_eval.arguments[\"ProcessingOutputConfig\"][\"Outputs\"][0][\"S3Output\"][\n", + " \"S3Uri\"\n", + " ]\n", " ),\n", " content_type=\"application/json\",\n", " )\n", @@ -660,7 +736,9 @@ "\n", "cond_lte = ConditionLessThanOrEqualTo(\n", " left=JsonGet(\n", - " step=step_eval, property_file=evaluation_report, json_path=\"regression_metrics.mse.value\"\n", + " step=step_eval,\n", + " property_file=evaluation_report,\n", + " json_path=\"regression_metrics.mse.value\",\n", " ),\n", " right=6.0,\n", ")\n", @@ -687,7 +765,14 @@ " input_data,\n", " model_approval_status,\n", " ],\n", - " steps=[step_process, step_tuning, step_create_first, step_create_second, step_eval, step_cond],\n", + " steps=[\n", + " step_process,\n", + " step_tuning,\n", + " step_create_first,\n", + " step_create_second,\n", + " step_eval,\n", + " step_cond,\n", + " ],\n", " sagemaker_session=sagemaker_session,\n", ")" ] @@ -764,21 +849,12 @@ "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Python 3 (Data Science)", - "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" + "display_name": "Python 3.9.4 64-bit ('python@3.9')", + "name": "python394jvsc74a57bd0ac2eaa0ea0ebeafcc7822e65e46aa9d4f966f30b695406963e145ea4a91cd4fc" }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - 
"pygments_lexer": "ipython3", - "version": "3.7.10" + "version": "" }, "metadata": { "interpreter": { @@ -788,4 +864,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/sagemaker-python-sdk/1P_kmeans_highlevel/kmeans_mnist.ipynb b/sagemaker-python-sdk/1P_kmeans_highlevel/kmeans_mnist.ipynb index 3accd5f49c..953a43d555 100644 --- a/sagemaker-python-sdk/1P_kmeans_highlevel/kmeans_mnist.ipynb +++ b/sagemaker-python-sdk/1P_kmeans_highlevel/kmeans_mnist.ipynb @@ -242,18 +242,24 @@ "source": [ "for cluster in range(10):\n", " print(f\"\\n\\n\\nCluster {int(cluster)}:\")\n", - " digits = [img for l, img in zip(clusters, valid_set[0]) if int(l) == cluster]\n", - " height = ((len(digits) - 1) // 5) + 1\n", - " width = 5\n", - " plt.rcParams[\"figure.figsize\"] = (width, height)\n", - " _, subplots = plt.subplots(height, width)\n", - " subplots = numpy.ndarray.flatten(subplots)\n", - " for subplot, image in zip(subplots, digits):\n", - " show_digit(image, subplot=subplot)\n", - " for subplot in subplots[len(digits) :]:\n", - " subplot.axis(\"off\")\n", - "\n", - " plt.show()" + " digits = [img for l, img in zip(clusters, valid_set[0]) if int(l) == cluster] \n", + " '''\n", + " The KMeans algorithm as an optimization problem is an NP Complete problem, and internal implementations\n", + " can produce different results for each run, depending upon the locations of the initial cluster centroid.\n", + " In some cases, there might be no data points in a cluster. 
We plot below the data points for clusters which\n", + " have datapoints.\n", + " '''\n", + " if digits: \n", + " height = ((len(digits) - 1) // 5) + 1\n", + " width = 5\n", + " plt.rcParams[\"figure.figsize\"] = (width, height)\n", + " _, subplots = plt.subplots(height, width)\n", + " subplots = numpy.ndarray.flatten(subplots)\n", + " for subplot, image in zip(subplots, digits):\n", + " show_digit(image, subplot=subplot)\n", + " for subplot in subplots[len(digits) :]:\n", + " subplot.axis(\"off\")\n", + " plt.show()" ] }, { diff --git a/sagemaker-python-sdk/mxnet_gluon_mnist/mxnet_mnist_with_gluon.ipynb b/sagemaker-python-sdk/mxnet_gluon_mnist/mxnet_mnist_with_gluon.ipynb index 066f4705dd..6fdd89259e 100644 --- a/sagemaker-python-sdk/mxnet_gluon_mnist/mxnet_mnist_with_gluon.ipynb +++ b/sagemaker-python-sdk/mxnet_gluon_mnist/mxnet_mnist_with_gluon.ipynb @@ -41,8 +41,34 @@ "metadata": {}, "outputs": [], "source": [ - "gluon.data.vision.MNIST(\"./data/train\", train=True)\n", - "gluon.data.vision.MNIST(\"./data/test\", train=False)" + "import os\n", + "\n", + "for inner_dir in [\"train\", \"test\"]:\n", + " data_dir = \"./data/{}/\".format(inner_dir)\n", + " if not os.path.exists(data_dir):\n", + " os.makedirs(data_dir)\n", + "\n", + "s3 = boto3.client(\"s3\")\n", + "s3.download_file(\n", + " \"sagemaker-sample-files\",\n", + " \"datasets/image/MNIST/train/train-images-idx3-ubyte.gz\",\n", + " \"./data/train/train-images-idx3-ubyte.gz\",\n", + ")\n", + "s3.download_file(\n", + " \"sagemaker-sample-files\",\n", + " \"datasets/image/MNIST/train/train-labels-idx1-ubyte.gz\",\n", + " \"./data/train/train-labels-idx1-ubyte.gz\",\n", + ")\n", + "s3.download_file(\n", + " \"sagemaker-sample-files\",\n", + " \"datasets/image/MNIST/test/t10k-images-idx3-ubyte.gz\",\n", + " \"./data/test/t10k-images-idx3-ubyte.gz\",\n", + ")\n", + "s3.download_file(\n", + " \"sagemaker-sample-files\",\n", + " \"datasets/image/MNIST/test/t10k-labels-idx1-ubyte.gz\",\n", + " 
\"./data/test/t10k-labels-idx1-ubyte.gz\",\n", + ")" ] }, { @@ -157,7 +183,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can now use this predictor to classify hand-written digits. Drawing into the image box loads the pixel data into a 'data' variable in this notebook, which we can then pass to the mxnet predictor. " + "We can now use this predictor to classify hand-written digits. Manually drawing into the image box loads the pixel data into a 'data' variable in this notebook, which we can then pass to the MXNet predictor. " ] }, { @@ -171,6 +197,37 @@ "HTML(open(\"input.html\").read())" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Fetch the first image from the test dataset and display it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gzip\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "f = gzip.open(\"data/train/train-images-idx3-ubyte.gz\", \"r\")\n", + "\n", + "image_size = 28\n", + "\n", + "f.read(16)\n", + "buf = f.read(image_size * image_size)\n", + "data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)\n", + "data = data.reshape(1, image_size, image_size, 1)\n", + "\n", + "image = np.asarray(data).squeeze()\n", + "plt.imshow(image)\n", + "plt.show()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -225,7 +282,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.13" }, "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License." }, diff --git a/sagemaker-python-sdk/mxnet_gluon_mnist/mxnet_mnist_with_gluon_local_mode.ipynb b/sagemaker-python-sdk/mxnet_gluon_mnist/mxnet_mnist_with_gluon_local_mode.ipynb index 5390a4f984..99ca279469 100644 --- a/sagemaker-python-sdk/mxnet_gluon_mnist/mxnet_mnist_with_gluon_local_mode.ipynb +++ b/sagemaker-python-sdk/mxnet_gluon_mnist/mxnet_mnist_with_gluon_local_mode.ipynb @@ -8,7 +8,7 @@ "\n", "### Pre-requisites\n", "\n", - "This notebook shows how to use the SageMaker Python SDK to run your code in a local container before deploying to SageMaker's managed training or hosting environments. This can speed up iterative testing and debugging while using the same familiar Python SDK interface. Just change your estimator's `train_instance_type` to `local`. You could also use `local_gpu` if you're using an ml.p2 or ml.p3 notebook instance, but then you'll need to set `train_instance_count=1` since distributed, local, GPU training is not yet supported.\n", + "This notebook shows how to use the SageMaker Python SDK to run your code in a local container before deploying to SageMaker's managed training or hosting environments. This can speed up iterative testing and debugging while using the same familiar Python SDK interface. Just change your estimator's `instance_type` to `local`. You could also use `local_gpu` if you're using an ml.p2 or ml.p3 notebook instance, but then you'll need to set `instance_count=1` since distributed, local, GPU training is not yet supported.\n", "\n", "In order to use this feature you'll need to install docker-compose (and nvidia-docker if training with a GPU). 
Running the setup.sh script below will handle this for you.\n", "\n", @@ -73,8 +73,34 @@ "metadata": {}, "outputs": [], "source": [ - "gluon.data.vision.MNIST(\"./data/train\", train=True)\n", - "gluon.data.vision.MNIST(\"./data/test\", train=False)" + "import os\n", + "\n", + "for inner_dir in [\"train\", \"test\"]:\n", + " data_dir = \"./data/{}/\".format(inner_dir)\n", + " if not os.path.exists(data_dir):\n", + " os.makedirs(data_dir)\n", + "\n", + "s3 = boto3.client(\"s3\")\n", + "s3.download_file(\n", + " \"sagemaker-sample-files\",\n", + " \"datasets/image/MNIST/train/train-images-idx3-ubyte.gz\",\n", + " \"./data/train/train-images-idx3-ubyte.gz\",\n", + ")\n", + "s3.download_file(\n", + " \"sagemaker-sample-files\",\n", + " \"datasets/image/MNIST/train/train-labels-idx1-ubyte.gz\",\n", + " \"./data/train/train-labels-idx1-ubyte.gz\",\n", + ")\n", + "s3.download_file(\n", + " \"sagemaker-sample-files\",\n", + " \"datasets/image/MNIST/test/t10k-images-idx3-ubyte.gz\",\n", + " \"./data/test/t10k-images-idx3-ubyte.gz\",\n", + ")\n", + "s3.download_file(\n", + " \"sagemaker-sample-files\",\n", + " \"datasets/image/MNIST/test/t10k-labels-idx1-ubyte.gz\",\n", + " \"./data/test/t10k-labels-idx1-ubyte.gz\",\n", + ")" ] }, { @@ -121,7 +147,7 @@ "source": [ "## Run the training script on SageMaker\n", "\n", - "The ```MXNet``` class allows us to run our training function on SageMaker local mode. We need to configure it with our training script, an IAM role, the number of training instances, and the training instance type. This is the the only difference from [mnist_with_gluon.ipynb](./mnist_with_gluon.ipynb). Instead of ``train_instance_type='ml.c4.xlarge'``, we set it to ``train_instance_type='local'``. For local training with GPU, we could set this to \"local_gpu\". In this case, `instance_type` was set above based on your whether you're running a GPU instance." + "The ```MXNet``` class allows us to run our training function on SageMaker local mode. 
We need to configure it with our training script, an IAM role, the number of training instances, and the training instance type. This is the only difference from [mxnet_mnist_with_gluon.ipynb](./mxnet_mnist_with_gluon.ipynb). Instead of ``instance_type='ml.c4.xlarge'``, we set it to ``instance_type='local'``. For local training with GPU, we could set this to \"local_gpu\". In this case, `instance_type` was set above based on whether you're running a GPU instance." ] }, { @@ -189,7 +215,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can now use this predictor to classify hand-written digits. Drawing into the image box loads the pixel data into a 'data' variable in this notebook, which we can then pass to the mxnet predictor." + "We can now use this predictor to classify hand-written digits. Manually drawing into the image box loads the pixel data into a 'data' variable in this notebook, which we can then pass to the MXNet predictor." ] }, { @@ -203,6 +229,37 @@ "HTML(open(\"input.html\").read())" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Fetch the first image from the test dataset and display it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gzip\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "f = gzip.open(\"data/train/train-images-idx3-ubyte.gz\", \"r\")\n", + "\n", + "image_size = 28\n", + "\n", + "f.read(16)\n", + "buf = f.read(image_size * image_size)\n", + "data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)\n", + "data = data.reshape(1, image_size, image_size, 1)\n", + "\n", + "image = np.asarray(data).squeeze()\n", + "plt.imshow(image)\n", + "plt.show()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -257,7 +314,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.13" }, "notice": "Copyright 2017 Amazon.com, Inc. 
or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." }, diff --git a/sagemaker-python-sdk/scikit_learn_randomforest/Sklearn_on_SageMaker_end2end.ipynb b/sagemaker-python-sdk/scikit_learn_randomforest/Sklearn_on_SageMaker_end2end.ipynb index db69b82d12..ee665386b3 100644 --- a/sagemaker-python-sdk/scikit_learn_randomforest/Sklearn_on_SageMaker_end2end.ipynb +++ b/sagemaker-python-sdk/scikit_learn_randomforest/Sklearn_on_SageMaker_end2end.ipynb @@ -10,21 +10,9 @@ "* SDK https://sagemaker.readthedocs.io/en/stable/sagemaker.sklearn.html\n", "* boto3 https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#client\n", "\n", - "In this notebook we show how to use Amazon SageMaker to develop, train, tune and deploy a Scikit-Learn based ML model (Random Forest). More info on Scikit-Learn can be found here https://scikit-learn.org/stable/index.html. We use the Boston Housing dataset, present in Scikit-Learn: https://scikit-learn.org/stable/datasets/index.html#boston-dataset\n", + "In this notebook we show how to use Amazon SageMaker to develop, train, tune and deploy a Scikit-Learn based ML model (Random Forest). More info on Scikit-Learn can be found here https://scikit-learn.org/stable/index.html. We use the California Housing dataset, present in Scikit-Learn: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html. 
The California Housing dataset was originally published in:\n", "\n", - "\n", - "More info on the dataset:\n", - "\n", - "The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics ...', Wiley, 1980. N.B. Various transformations are used in the table on pages 244-261 of the latter.\n", - "\n", - "The Boston house-price data has been used in many machine learning papers that address regression problems.\n", - "References\n", - "\n", - " * Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n", - " * Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n", - " \n", - " \n", - " \n", + "> Pace, R. Kelley, and Ronald Barry. 
\"Sparse spatial autoregressions.\" Statistics & Probability Letters 33.3 (1997): 291-297.\n", " \n", "**This sample is provided for demonstration purposes, make sure to conduct appropriate testing if derivating this code for your own use-cases!**" ] @@ -45,7 +33,7 @@ "from sagemaker import get_execution_role\n", "import sagemaker\n", "from sklearn.model_selection import train_test_split\n", - "from sklearn.datasets import load_boston\n", + "from sklearn.datasets import fetch_california_housing\n", "\n", "\n", "sm_boto3 = boto3.client(\"sagemaker\")\n", @@ -73,8 +61,8 @@ "metadata": {}, "outputs": [], "source": [ - "# we use the Boston housing dataset\n", - "data = load_boston()" + "# we use the California housing dataset\n", + "data = fetch_california_housing()" ] }, { @@ -109,8 +97,8 @@ "metadata": {}, "outputs": [], "source": [ - "trainX.to_csv(\"boston_train.csv\")\n", - "testX.to_csv(\"boston_test.csv\")" + "trainX.to_csv(\"california_housing_train.csv\")\n", + "testX.to_csv(\"california_housing_test.csv\")" ] }, { @@ -121,11 +109,11 @@ "source": [ "# send data to S3. 
SageMaker will take training data from s3\n", "trainpath = sess.upload_data(\n", - " path=\"boston_train.csv\", bucket=bucket, key_prefix=\"sagemaker/sklearncontainer\"\n", + " path=\"california_housing_train.csv\", bucket=bucket, key_prefix=\"sagemaker/sklearncontainer\"\n", ")\n", "\n", "testpath = sess.upload_data(\n", - " path=\"boston_test.csv\", bucket=bucket, key_prefix=\"sagemaker/sklearncontainer\"\n", + " path=\"california_housing_test.csv\", bucket=bucket, key_prefix=\"sagemaker/sklearncontainer\"\n", ")" ] }, @@ -174,8 +162,8 @@ " parser.add_argument(\"--model-dir\", type=str, default=os.environ.get(\"SM_MODEL_DIR\"))\n", " parser.add_argument(\"--train\", type=str, default=os.environ.get(\"SM_CHANNEL_TRAIN\"))\n", " parser.add_argument(\"--test\", type=str, default=os.environ.get(\"SM_CHANNEL_TEST\"))\n", - " parser.add_argument(\"--train-file\", type=str, default=\"boston_train.csv\")\n", - " parser.add_argument(\"--test-file\", type=str, default=\"boston_test.csv\")\n", + " parser.add_argument(\"--train-file\", type=str, default=\"california_housing_train.csv\")\n", + " parser.add_argument(\"--test-file\", type=str, default=\"california_housing_test.csv\")\n", " parser.add_argument(\n", " \"--features\", type=str\n", " ) # in this script we ask user to explicitly name features\n", @@ -237,7 +225,7 @@ " --model-dir ./ \\\n", " --train ./ \\\n", " --test ./ \\\n", - " --features 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT' \\\n", + " --features 'MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude' \\\n", " --target target" ] }, @@ -277,7 +265,7 @@ " hyperparameters={\n", " \"n-estimators\": 100,\n", " \"min-samples-leaf\": 3,\n", - " \"features\": \"CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT\",\n", + " \"features\": \"MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude\",\n", " \"target\": \"target\",\n", " },\n", ")" @@ -360,7 +348,7 @@ " \"n_estimators\": \"300\",\n", " 
\"min_samples_leaf\": \"3\",\n", " \"sagemaker_program\": \"script.py\",\n", - " \"features\": \"CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT\",\n", + " \"features\": \"MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude\",\n", " \"target\": \"target\",\n", " \"sagemaker_submit_directory\": \"s3://\" + bucket + \"/\" + project + \"/\" + source,\n", " },\n", @@ -641,7 +629,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/sagemaker-python-sdk/tensorboard_keras/tensorboard_keras.ipynb b/sagemaker-python-sdk/tensorboard_keras/tensorboard_keras.ipynb index fa18d7d1a8..aff842f707 100644 --- a/sagemaker-python-sdk/tensorboard_keras/tensorboard_keras.ipynb +++ b/sagemaker-python-sdk/tensorboard_keras/tensorboard_keras.ipynb @@ -134,45 +134,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Create a training job using the sagemaker.TensorFlow estimator, running locally\n", - "To test that the code will work in SageMaker, we'll first use SageMaker local mode." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.tensorflow import TensorFlow\n", - "\n", - "hyperparameters = {\"epochs\": 2, \"batch-size\": 128, \"tf-logs-path\": tensorflow_logs_path}\n", - "\n", - "inputs = {\n", - " \"train\": \"file://\" + os.getcwd() + \"/data/train\",\n", - " \"validation\": \"file://\" + os.getcwd() + \"/data/validation\",\n", - "}\n", - "\n", - "estimator = TensorFlow(\n", - " base_job_name=\"tensorboard-example-local\",\n", - " entry_point=\"tensorboard_keras_cifar10.py\",\n", - " source_dir=\"source_dir\",\n", - " role=role,\n", - " framework_version=\"2.2.0\",\n", - " py_version=\"py37\",\n", - " hyperparameters=hyperparameters,\n", - " instance_count=1,\n", - " instance_type=\"local\",\n", - ")\n", - "\n", - "estimator.fit(inputs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Run on SageMaker cloud" + "## Run on SageMaker Training on the cloud" ] }, { @@ -242,9 +204,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ + "from sagemaker.tensorflow import TensorFlow\n", + "\n", "hyperparameters = {\"epochs\": 2, \"batch-size\": 256, \"tf-logs-path\": tensorflow_logs_path}\n", "\n", "inputs = {\"train\": dataset_location + \"/train\", \"validation\": dataset_location + \"/validation\"}\n", @@ -326,7 +292,7 @@ "\n", "hyperparameter_ranges = {\n", " \"learning-rate\": ContinuousParameter(0.00001, 0.001),\n", - " \"batch-size\": CategoricalParameter([64, 128, 256, 512]),\n", + " \"batch-size\": CategoricalParameter([64, 128]),\n", " \"optimizer\": CategoricalParameter([\"sgd\", \"adam\", \"rmsprop\"]),\n", "}\n", "\n", @@ -340,7 +306,7 @@ " hyperparameter_ranges,\n", " metric_definitions=keras_metric_definition,\n", " objective_type=\"Maximize\",\n", - " max_jobs=10,\n", + " max_jobs=4,\n", " max_parallel_jobs=2,\n", " 
early_stopping_type=\"Auto\",\n", " base_tuning_job_name=\"remote-hpo\",\n", @@ -360,46 +326,68 @@ "Now we can use TensorBoard to compare all training jobs, including local, cloud, tuning training. The following cell will run TensorBoard inside of the SageMaker Notebook Instance." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Start Tensorboard from the System terminal and view the Tensorboard UI. To do this, follow steps 1-3 below:\n", + "\n", + "1. From SageMaker Studio's Jupyter Server, launch the System terminal (under Utilities and files) (see image below)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](https://github.com/anoop-ml/smstudio_tensorboard_sample/blob/main/images/launch_system_terminal.png?raw=true)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. Run the following command to install tensorboard onto the system terminal\n", + "\n", + "```pip install tensorboard```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3. 
Paste the command that is the output of the next cell to start your tensorboard instance on Studio:\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "aws_region = sagemaker_session.boto_region_name\n", - "\n", - "!AWS_REGION={aws_region} tensorboard --logdir {tensorflow_logs_path}" + "!AWS_REGION={aws_region}\n", + "!echo tensorboard --logdir {tensorflow_logs_path}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "[**Click here to access TensorBoard instance**](/proxy/6006/)\n", - "\n", - "Instance of TensorBoard will be available at `https:///proxy/6006/`.\n", - "By default TensorBoard assigns port 6006, but if it's already in use TensorBoard will increase the port by 1, so 6007, 6008 and so on until it finds an avilable port.\n", - "\n", - "You should something similar to this:\n", - "![TensorBoard screenshot]()\n" + "4. Finally click the link below!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Run TensorBoard locally\n", + "[**Click here to access TensorBoard instance**](/jupyter/default/proxy/6006/)\n", "\n", - "The same command can be used to run TensorBoard in any environment:\n", - "- install TensorFlow locally using `pip install tensorflow`\n", - "- profiling the model also requires Profiler plugin `pip install tensorboard_plugin_profile`\n", - "- configure AWS CLI and make sure you have read access to S3 bucket containing TensorFlow log files\n", - "- configure default AWS region. You can do this by setting an environment variable named `AWS_REGION` with region name where the S3 bucket is located, e.g. 
`export AWS_REGION=eu-west-1`\n", - "- start TensorBoard with a following command `tensorboard --logdir `\n", + "Instance of TensorBoard will be available at `https:///proxy/6006/`.\n", + "By default TensorBoard assigns port 6006, but if it's already in use TensorBoard will increase the port by 1, so 6007, 6008 and so on until it finds an available port.\n", "\n", - "Now you can access the local instance of TensorBoard at http://localhost:6006" + "You should see something similar to this:\n", + "![TensorBoard screenshot]()\n" ] }, { @@ -412,10 +400,11 @@ } ], "metadata": { + "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "conda_tensorflow2_p36", + "display_name": "Python 3 (TensorFlow 2.1 Python 3.6 CPU Optimized)", "language": "python", - "name": "conda_tensorflow2_p36" + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/tensorflow-2.1-cpu-py36" }, "language_info": { "codemirror_mode": { @@ -427,7 +416,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.6.13" }, "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License.", "pycharm": { @@ -441,5 +430,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/sagemaker-python-sdk/tensorflow_script_mode_training_and_serving/tensorflow_script_mode_training_and_serving.ipynb b/sagemaker-python-sdk/tensorflow_script_mode_training_and_serving/tensorflow_script_mode_training_and_serving.ipynb index 537e96760b..f28600f687 100644 --- a/sagemaker-python-sdk/tensorflow_script_mode_training_and_serving/tensorflow_script_mode_training_and_serving.ipynb +++ b/sagemaker-python-sdk/tensorflow_script_mode_training_and_serving/tensorflow_script_mode_training_and_serving.ipynb @@ -64,7 +64,7 @@ "source": [ "# Construct a script for distributed training\n", "\n", - "This tutorial's training script was adapted from TensorFlow's official [CNN MNIST example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/layers/cnn_mnist.py). We have modified it to handle the ``model_dir`` parameter passed in by SageMaker. This is an S3 path which can be used for data sharing during distributed training and checkpointing and/or model persistence. We have also added an argument-parsing function to handle processing training-related variables.\n", + "This tutorial's training script was adapted from an earlier version of TensorFlow's official [CNN MNIST example](https://github.com/tensorflow/tensorflow/blob/95e808ba44075dfe0b7db57bb49d2e64a1977a95/tensorflow/examples/tutorials/layers/cnn_mnist.py). An updated version is available at [Convolutional Neural Network (CNN)](https://github.com/tensorflow/docs/blob/master/site/en/tutorials/images/cnn.ipynb). We have modified the example to handle the ``model_dir`` parameter passed in by SageMaker. This is an S3 path which can be used for data sharing during distributed training and checkpointing and/or model persistence. 
We have also added an argument-parsing function to handle processing training-related variables.\n", "\n", "At the end of the training job we have added a step to export the trained model to the path stored in the environment variable ``SM_MODEL_DIR``, which always points to ``/opt/ml/model``. This is critical because SageMaker uploads all the model artifacts in this folder to S3 at end of training.\n", "\n", diff --git a/sagemaker-python-sdk/tensorflow_serving_container/sample_utils.py b/sagemaker-python-sdk/tensorflow_serving_container/sample_utils.py index d13d9db8d2..be527331c6 100644 --- a/sagemaker-python-sdk/tensorflow_serving_container/sample_utils.py +++ b/sagemaker-python-sdk/tensorflow_serving_container/sample_utils.py @@ -29,14 +29,14 @@ def tfhub_to_savedmodel( model_path = "{}/{}/00000001".format(export_path, model_name) tfhub_uri = uri_pattern.format(model_name) - with tf.Session(graph=tf.Graph()) as sess: + with tf.compat.v1.Session(graph=tf.Graph()) as sess: module = hub.Module(tfhub_uri) input_params = module.get_input_info_dict() dtype = input_params["images"].dtype shape = input_params["images"].get_shape() # define the model inputs - inputs = {"images": tf.placeholder(dtype, shape, "images")} + inputs = {"images": tf.compat.v1.placeholder(dtype, shape, "images")} # define the model outputs # we want the class ids and probabilities for the top 3 classes @@ -46,8 +46,8 @@ def tfhub_to_savedmodel( outputs = {"classes": classes, "probabilities": probs} # export the model - sess.run([tf.global_variables_initializer(), tf.tables_initializer()]) - tf.saved_model.simple_save(sess, model_path, inputs=inputs, outputs=outputs) + sess.run([tf.compat.v1.global_variables_initializer(), tf.compat.v1.tables_initializer()]) + tf.compat.v1.saved_model.simple_save(sess, model_path, inputs=inputs, outputs=outputs) return model_path diff --git a/sagemaker-python-sdk/tensorflow_serving_container/tensorflow_serving_container.ipynb 
b/sagemaker-python-sdk/tensorflow_serving_container/tensorflow_serving_container.ipynb index 673f5698a7..984f819cb0 100644 --- a/sagemaker-python-sdk/tensorflow_serving_container/tensorflow_serving_container.ipynb +++ b/sagemaker-python-sdk/tensorflow_serving_container/tensorflow_serving_container.ipynb @@ -10,6 +10,7 @@ "# install dependencies\n", "!pip install opencv-python\n", "!pip install tensorflow-hub\n", + "!apt-get update\n", "!apt-get install ffmpeg libsm6 libxext6 -y" ] }, @@ -352,7 +353,7 @@ }, "outputs": [], "source": [ - "from sagemaker.tensorflow.serving import Model\n", + "from sagemaker.tensorflow.model import TensorFlowModel\n", "\n", "# Use an env argument to set the name of the default model.\n", "# This is optional, but recommended when you deploy multiple models\n", @@ -360,7 +361,7 @@ "# predictable model.\n", "env = {\"SAGEMAKER_TFS_DEFAULT_MODEL_NAME\": \"mobilenet_v2_140_224\"}\n", "\n", - "model = Model(model_data=model_data, role=sagemaker_role, framework_version=\"1.15.2\", env=env)\n", + "model = TensorFlowModel(model_data=model_data, role=sagemaker_role, framework_version=\"1.15.2\", env=env)\n", "predictor = model.deploy(initial_instance_count=1, instance_type=\"ml.c5.xlarge\")" ] }, @@ -479,7 +480,7 @@ "Note: if you are using local mode (by changing the instance type to `local` or `local_gpu`), you'll need to create the new predictor this way instead:\n", "\n", "```\n", - "predictor2 = Predictor(predictor.endpoint, model_name='mobilenet_v2_035_224', \n", + "predictor2 = TensorFlowPredictor(predictor.endpoint_name, model_name='mobilenet_v2_035_224', \n", " sagemaker_session=predictor.sagemaker_session)\n", "```" ] @@ -500,10 +501,10 @@ }, "outputs": [], "source": [ - "from sagemaker.tensorflow.serving import Predictor\n", + "from sagemaker.tensorflow.model import TensorFlowPredictor\n", "\n", "# use values from the default predictor to set up the new one\n", - "predictor2 = Predictor(predictor.endpoint, 
model_name=\"mobilenet_v2_035_224\")\n", + "predictor2 = TensorFlowPredictor(predictor.endpoint_name, model_name=\"mobilenet_v2_035_224\")\n", "\n", "# make a new prediction\n", "bee_image = sample_utils.image_file_to_tensor(\"bee.jpg\")\n", diff --git a/sagemaker-script-mode/sagemaker-script-mode.ipynb b/sagemaker-script-mode/sagemaker-script-mode.ipynb index ea42d2ec7e..1f906523a2 100644 --- a/sagemaker-script-mode/sagemaker-script-mode.ipynb +++ b/sagemaker-script-mode/sagemaker-script-mode.ipynb @@ -65,7 +65,6 @@ "import os\n", "import boto3\n", "import numpy as np\n", - "from sklearn.datasets import load_boston\n", "from sklearn.preprocessing import StandardScaler\n", "from sagemaker.pytorch import PyTorch\n", "from sagemaker.xgboost import XGBoost\n", diff --git a/sagemaker_edge_manager/README.md b/sagemaker_edge_manager/README.md index 93bfe947ca..74fd368f5f 100644 --- a/sagemaker_edge_manager/README.md +++ b/sagemaker_edge_manager/README.md @@ -2,9 +2,9 @@ SageMaker Edge Manager is a new service from Amazon SageMaker that lets you: -+ prepares custom models for edge device hardware -+ includes a runtime for running machine learning inference efficiently on edge devices -+ enables the device to send samples of data from each model securely to SageMaker for relabeling and retraining. ++ prepare custom models for edge device hardware ++ include a runtime for running machine learning inference efficiently on edge devices ++ enable the device to send samples of data from each model securely to SageMaker for relabeling and retraining. There are two main components to this service: @@ -12,6 +12,7 @@ There are two main components to this service: + SageMaker Edge Agent on the Edge device -This notebook walks the user through steps for compiling a pre-trained model using AWS SageMaker Neo service. We show how to package this compiled model and then load it to the Agent on the Edge Device to make predictions with. 
Finally, we show how to capture model's input and output to S3 via the Agent. +These notebooks walk the user through steps for compiling a pre-trained model using AWS SageMaker Neo service. We show how to package this compiled model and then how to use it on devices. In the first notebook we show you how to manually install the agent on the device, load the model and make predictions. In the second notebook we show you how to use the provided SageMaker EdgeManager Greengrass component for an automatic installation. Finally, we show how to capture model's input and output to S3 via the Agent. -- [SageMaker Edge Example](sagemaker_edge_example) +- [SageMaker Edge Example](sagemaker_edge_example/sagemaker_edge_example.ipynb) +- [SageMaker Edge Manager Greengrass Example](sagemaker_edge_example/sagemaker_edge_greengrass_example.ipynb) diff --git a/sagemaker_edge_manager/sagemaker_edge_example/darknet.bmp b/sagemaker_edge_manager/sagemaker_edge_example/darknet.bmp deleted file mode 100644 index 2ee279cefd..0000000000 Binary files a/sagemaker_edge_manager/sagemaker_edge_example/darknet.bmp and /dev/null differ diff --git a/sagemaker_edge_manager/sagemaker_edge_example/inference.py b/sagemaker_edge_manager/sagemaker_edge_example/inference.py new file mode 100644 index 0000000000..deaeb5b7a4 --- /dev/null +++ b/sagemaker_edge_manager/sagemaker_edge_example/inference.py @@ -0,0 +1,123 @@ +import grpc +from PIL import Image +import agent_pb2 +import agent_pb2_grpc +import os +import numpy as np +import json + + +model_path = os.environ['MODEL_PATH'] +ml_model = os.environ['ML_MODEL'] +image_path = os.environ['IMAGE_PATH'] +capture_data = os.environ['CAPTURE_DATA'].lower() == "true" + +agent_socket = 'unix:///tmp/aws.greengrass.SageMakerEdgeManager.sock' + +agent_channel = grpc.insecure_channel(agent_socket, options=(('grpc.enable_http_proxy', 0),)) + +agent_client = agent_pb2_grpc.AgentStub(agent_channel) + + +def list_models(): + return 
agent_client.ListModels(agent_pb2.ListModelsRequest()) + + +def list_model_tensors(models): + return { + model.name: { + 'inputs': model.input_tensor_metadatas, + 'outputs': model.output_tensor_metadatas + } + for model in list_models().models + } + + +def load_model(model_name, model_path): + load_request = agent_pb2.LoadModelRequest() + load_request.url = model_path + load_request.name = model_name + return agent_client.LoadModel(load_request) + + +def unload_model(name): + unload_request = agent_pb2.UnLoadModelRequest() + unload_request.name = name + return agent_client.UnLoadModel(unload_request) + + +def predict_image(model_name, image_path): + image_tensor = agent_pb2.Tensor() + im = Image.open(image_path) + img = np.asarray(im) + # Neo compiled model requires the array to be of shape (3, 244, 244) + img = img.transpose(2,0,1) + # normalization according to https://github.com/tensorflow/tensorflow/blob/a4dfb8d1a71385bd6d122e4f27f86dcebb96712d/tensorflow/python/keras/applications/imagenet_utils.py#L259 + img = (img/127.5).astype(np.float32) + img -= 1. 
+ image_tensor.byte_data = img.tobytes() + image_tensor_metadata = list_model_tensors(list_models())[model_name]['inputs'][0] + image_tensor.tensor_metadata.name = image_tensor_metadata.name + image_tensor.tensor_metadata.data_type = image_tensor_metadata.data_type + for shape in image_tensor_metadata.shape: + image_tensor.tensor_metadata.shape.append(shape) + predict_request = agent_pb2.PredictRequest() + predict_request.name = model_name + predict_request.tensors.append(image_tensor) + predict_response = agent_client.Predict(predict_request) + return predict_response + +def main(): + try: + unload_model(ml_model) + except: + pass + + print('LoadModel...', end='') + try: + load_model(ml_model, model_path=model_path) + print('done.') + except Exception as e: + print() + print(e) + print('Model already loaded!') + + print('ListModels...', end='') + try: + print(list_models()) + print('done.') + + except Exception as e: + print() + print(e) + print('List model failed!') + + print('Predict') + try: + prediction = predict_image(ml_model, image_path=image_path) + #print(prediction) # uncomment to print the prediction object + pred = np.frombuffer(prediction.tensors[0].byte_data, dtype = np.float32) + # the returned array has shape (1000,), while mobilenet v2 returns a shape (1, 1000) + # decoding results https://github.com/tensorflow/tensorflow/blob/a4dfb8d1a71385bd6d122e4f27f86dcebb96712d/tensorflow/python/keras/applications/imagenet_utils.py#L159 + + top_indexes = pred.argsort()[-5:][::-1] + with open(os.environ["IMAGENET_CLASS_INDEX_PATH"]) as f: + classes = json.load(f) + result = [tuple(classes[str(i)]) + (pred[i],) for i in top_indexes] + print(result) + except Exception as e: + print() + print(e) + print('Predict failed!') + + print('Unload model...', end='') + try: + unload_model(ml_model) + print('done.') + except Exception as e: + print() + print(e) + print('unload model failed!') + +if __name__ == '__main__': + main() diff --git 
a/sagemaker_edge_manager/sagemaker_edge_example/sagemaker_edge_example.ipynb b/sagemaker_edge_manager/sagemaker_edge_example/sagemaker_edge_example.ipynb index a411fbf3c3..906ca2d126 100644 --- a/sagemaker_edge_manager/sagemaker_edge_example/sagemaker_edge_example.ipynb +++ b/sagemaker_edge_manager/sagemaker_edge_example/sagemaker_edge_example.ipynb @@ -4,28 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# SageMaker Edge Manager Example\n", - "\n", - "1. [Introduction](#Introduction)\n", - "2. [Demo Setup](#Demo-Setup)\n", - " 1. [Launch EC2 Instance](#Launch-EC2-Instance)\n", - "3. [Compile Model using SageMaker Neo](#Compile-Model-using-SageMaker-Neo)\n", - " 1. [Load pretrained model](#Load-pretrained-model)\n", - "6. [Deploy Model using Sagemaker Edge Manager](#Deploy-Model-using-Sagemaker-Edge-Manager)\n", - " 1. [Package Model](#Package-Model)\n", - " 2. [Create AWS IoT thing](#Create-AWS-IoT-thing)\n", - " 3. [Create Device Fleet](#Create-Device-Fleet)\n", - " 4. [Create and register client certificate with AWS IoT](#Create-and-register-client-certificate-with-AWS-IoT)\n", - "7. [Inference on Edge](#Inference-on-Edge)\n", - " 1. [Setup Sagemaker Edge Manager Agent](#Setup-Sagemaker-Edge-Manager-Agent) \n", - " 2. [Load Model](#Load-Model)\n", - " 3. [List Models](#List-Models)\n", - " 4. [Run Predict](#Run-Predict)\n", - " 5. [Capture Data](#Capture-Data)\n", - " 6. [Unload Model](#Unload-Model)\n", - "8. [Clean Up](#Clean-Up)\n", - "9. [Appendix](#Appendix)\n", - " 1. 
[(Optional)Install CloudWatch Agent](#(Optional)Install-CloudWatch-Agent )" + "# SageMaker Edge Manager Example" ] }, { @@ -36,9 +15,9 @@ "\n", "SageMaker Edge Manager is a service from Amazon SageMaker that lets you:\n", "\n", - "+ prepares custom models for edge device hardware\n", - "+ includes a runtime for running machine learning inference efficiently on edge devices\n", - "+ enables devices to send samples of data from each model securely to SageMaker for relabeling and retraining.\n", + "+ prepare custom models for edge device hardware\n", + "+ include a runtime for running machine learning inference efficiently on edge devices\n", + "+ enable the device to send samples of data from each model securely to SageMaker for relabeling and retraining.\n", "\n", "There are two main components to this service:\n", "+ SageMaker Edge Manager in the Cloud \n", @@ -55,7 +34,7 @@ "**Note**:\n", "Typically, the SageMaker Edge Agent is run on an edge device. For the sake of this notebook, we will run the Agent on an EC2 instance. We show how to package the compiled model and then load it to the Agent on the edge Device to make predictions with. Finally, we show how to capture model's input and output to S3 via the Agent.\n", "\n", - "This notebook is intented only for notebook instances. When you run this notebook, choose the kernel: `conda_tensorflow_p36`" + "This notebook is intended only for notebook instances. When you run this notebook, choose the kernel: `conda_tensorflow_p36`" ] }, { @@ -81,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -113,6 +92,7 @@ "\n", "- AmazonEC2FullAccess \n", "- AmazonEC2RoleforSSM \n", + "- AmazonS3FullAccess \n", "- AmazonSSMManagedInstanceCore \n", "- AmazonSSMFullAccess \n", "- AWSIoTFullAccess \n", @@ -150,7 +130,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Finally we upload the test image to S3 bucket. 
This image will be used in inference later." + "Finally, we upload the test image to S3 bucket. This image will be used in inference later." ] }, { @@ -159,7 +139,6 @@ "metadata": {}, "outputs": [], "source": [ - "darknet_img_path = sess.upload_data(\"darknet.bmp\", bucket, iot_folder)\n", "keras_img_path = sess.upload_data(\"keras.bmp\", bucket, iot_folder)" ] }, @@ -179,18 +158,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "ec2_client = boto3.client(\"ec2\", region_name=region)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2021 Amazon.com.\n", + "# SPDX-License-Identifier: MIT\n", + "\n" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Generate a key pair for the EC2 instance, and save the key pem file. We can use this key with SSH to connect to the instance. But in this notebook example, we will not use SSH, instead, we will use AWS Systems Manager to send commands to the instance instead." + "Generate a key pair for the EC2 instance, and save the key PEM file. We can use this key with SSH to connect to the instance. But in this notebook example, we will not use SSH, instead, we will use AWS Systems Manager to send commands to the instance instead." 
] }, { @@ -254,18 +244,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'ami-006ff58f5247c50eb'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "ami_map = {\n", - " \"us-east-1\": \"ami-063585f0e06d22308\",\n", - " \"us-east-2\": \"ami-01bd6a1621a6968d7\",\n", - " \"us-west-2\": \"ami-0bc87a16c757a7f07\",\n", - " \"eu-central-1\": \"ami-01227276a4e5a4a31\",\n", - " \"ap-northeast-1\": \"ami-03b8cfea5460e4881\",\n", - " \"eu-west-1\": \"ami-006ff58f5247c50eb\",\n", - "}" + "ami = ec2_client.describe_images(Filters=[{'Name': 'name', 'Values': ['Deep Learning AMI (Ubuntu 18.04) Version 36.0']}])['Images'][0]['ImageId']\n", + "ami" ] }, { @@ -277,7 +272,7 @@ "ec2_profile_name = \"\" # the name of the role created for EC2\n", "\n", "ec2_instance = ec2_client.run_instances(\n", - " ImageId=ami_map[region],\n", + " ImageId=ami,\n", " MinCount=1,\n", " MaxCount=1,\n", " InstanceType=\"c5.large\",\n", @@ -322,180 +317,6 @@ "sagemaker_client = boto3.client(\"sagemaker\", region_name=region)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Download pretrained Darknet model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!wget -O yolov3-tiny.cfg https://github.com/pjreddie/darknet/blob/master/cfg/yolov3-tiny.cfg?raw=true\n", - "!wget https://pjreddie.com/media/files/yolov3-tiny.weights" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tarfile\n", - "\n", - "with tarfile.open(\"yolov3-tiny.tar.gz\", mode=\"w:gz\") as archive:\n", - " archive.add(\"yolov3-tiny.cfg\")\n", - " archive.add(\"yolov3-tiny.weights\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - 
"darknet_model_path = sess.upload_data(\"yolov3-tiny.tar.gz\", bucket, folder)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note**: When calling ``create_compilation_job()`` the user is expected to provide all the correct input shapes required by the model for successful compilation. If using a different model, you will need to specify the framework and data shape correctly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "darknet_model_data_shape = '{\"data\":[1,3,416,416]}'\n", - "darknet_model_framework = \"darknet\"\n", - "target_os = \"LINUX\"\n", - "target_arch = \"X86_64\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "\n", - "darknet_compilation_job_name = \"Sagemaker-Edge-\" + str(time.time()).split(\".\")[0]\n", - "print(\"Compilation job for %s started\" % darknet_compilation_job_name)\n", - "\n", - "response = sagemaker_client.create_compilation_job(\n", - " CompilationJobName=darknet_compilation_job_name,\n", - " RoleArn=role,\n", - " InputConfig={\n", - " \"S3Uri\": darknet_model_path,\n", - " \"DataInputConfig\": darknet_model_data_shape,\n", - " \"Framework\": darknet_model_framework.upper(),\n", - " },\n", - " OutputConfig={\n", - " \"S3OutputLocation\": s3_compilation_output_location,\n", - " \"TargetPlatform\": {\"Arch\": target_arch, \"Os\": target_os},\n", - " },\n", - " StoppingCondition={\"MaxRuntimeInSeconds\": 900},\n", - ")\n", - "\n", - "print(response)\n", - "\n", - "# Poll every 30 sec\n", - "while True:\n", - " response = sagemaker_client.describe_compilation_job(\n", - " CompilationJobName=darknet_compilation_job_name\n", - " )\n", - " if response[\"CompilationJobStatus\"] == \"COMPLETED\":\n", - " break\n", - " elif response[\"CompilationJobStatus\"] == \"FAILED\":\n", - " print(response)\n", - " raise RuntimeError(\"Compilation failed\")\n", - " 
print(\"Compiling ...\")\n", - " time.sleep(30)\n", - "print(\"Done!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Package Model using Sagemaker Edge Manager\n", - "\n", - "In this section, we will walk through packaging two models that achieve different goals. One is a Keras based Image Classification model and the other is a Darknet based Object Detection Model. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Package Darknet Model\n", - "\n", - "Before we can deploy the compiled model to edge devices, we need to package the model using SageMaker Edge Manager cloud service." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "darknet_packaged_model_name = \"darknet-model\"\n", - "darknet_model_version = \"1.0\"\n", - "darknet_model_package = \"{}-{}.tar.gz\".format(darknet_packaged_model_name, darknet_model_version)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "darknet_packaging_job_name = darknet_compilation_job_name + \"-packaging\"\n", - "response = sagemaker_client.create_edge_packaging_job(\n", - " RoleArn=role,\n", - " OutputConfig={\n", - " \"S3OutputLocation\": s3_compilation_output_location,\n", - " },\n", - " ModelName=darknet_packaged_model_name,\n", - " ModelVersion=darknet_model_version,\n", - " EdgePackagingJobName=darknet_packaging_job_name,\n", - " CompilationJobName=darknet_compilation_job_name,\n", - ")\n", - "\n", - "print(response)\n", - "\n", - "# Poll every 30 sec\n", - "while True:\n", - " job_status = sagemaker_client.describe_edge_packaging_job(\n", - " EdgePackagingJobName=darknet_packaging_job_name\n", - " )\n", - " if job_status[\"EdgePackagingJobStatus\"] == \"COMPLETED\":\n", - " break\n", - " elif job_status[\"EdgePackagingJobStatus\"] == \"FAILED\":\n", - " print(job_status)\n", - " raise RuntimeError(\"Edge Packaging failed\")\n", - 
" print(\"Packaging ...\")\n", - " time.sleep(30)\n", - "print(\"Done!\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "darknet_model_data = job_status[\"ModelArtifact\"]" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -666,7 +487,7 @@ "source": [ "### Create AWS IoT thing\n", "\n", - "SageMaker Edge Manager uses AWS IoT Core to authenticate the device so we can make calls to SageMaker Edge Manager endpoints in AWS Cloud. \n", + "SageMaker Edge Manager uses AWS IoT Core to authenticate the device in order to make calls to SageMaker Edge Manager endpoints in AWS Cloud. \n", "\n", "In order for an edge device to use AWS services, it is necessary for it to first authenticate. We recommend doing this via AWS IoT based authentication, for more details refer [here](https://docs.aws.amazon.com/iot/latest/developerguide/authorizing-direct-aws.html) and [here](https://aws.amazon.com/blogs/security/how-to-eliminate-the-need-for-hardcoded-aws-credentials-in-devices-by-using-the-aws-iot-credentials-provider/)." ] @@ -726,7 +547,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Configure an IAM role in your AWS account that will be assumed by the credentials provider on behalf of the devices in your device fleet. \n", + "Configure an IAM role in your AWS account that will be assumed by the credentials provider on behalf of the devices in your device fleet. \n", "\n", "**Notice**: The name of the role must start with `SageMaker`.\n", "\n", @@ -933,10 +754,14 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "Obtain your AWS account-specific endpoint for the credentials provider." + "# Copyright 2021 Amazon.com.\n", + "# SPDX-License-Identifier: MIT\n", + "\n" ] }, { @@ -963,7 +788,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Get offical Amazon Root CA file and upload to S3 bucket. 
" + "Get official Amazon Root CA file and upload to S3 bucket. " ] }, { @@ -979,7 +804,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Use the endpoint to make an HTTPS request to the credentials provider to return a security token. The following example command uses curl, but you can use any HTTP client.\n", + "Use the endpoint to make an HTTPS request to the credentials provider to return a security token. The following example command uses curl, but you can use any HTTP client.\n", "\n", "**Optional: verify the credentials.**\n" ] @@ -999,7 +824,7 @@ "source": [ "If the certificate can be verified with the endpoint without error, upload certificate files to S3 bucket.\n", "\n", - "These files will be used in the [Setup Sagemaker Edge Manager Agent](#Setup-Sagemaker-Edge-Manager-Agent) section on EC2/device as Credential Provider." + "These files will be used in the [Setup SageMaker Edge Manager Agent](#Setup-Sagemaker-Edge-Manager-Agent) section on EC2/device as Credential Provider." ] }, { @@ -1024,7 +849,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In our example, we will use [AWS Systems Manager](https://docs.aws.amazon.com/systems-manager/latest/userguide/ssm-agent.html) to remotely perform actions on the EC2 instance. To see the SSM logs from CloudWatch, refer to [Install CouldWatch Agent section](#(Optional)Install-CloudWatch-Agent). \n", + "In our example, we will use [AWS Systems Manager](https://docs.aws.amazon.com/systems-manager/latest/userguide/ssm-agent.html) to remotely perform actions on the EC2 instance. To see the SSM logs from CloudWatch, refer to [Install CloudWatch Agent section](#(Optional)Install-CloudWatch-Agent). \n", "\n", "Execution status of send_command is available in [AWS Systems Manager console](https://console.aws.amazon.com/systems-manager/run-command/complete-commands) command history." 
] @@ -1223,10 +1048,6 @@ " \"aws s3 cp \" + device_cert_path + \" .\",\n", " \"aws s3 cp \" + device_key_path + \" .\",\n", " \"cd /demo\",\n", - " \"aws s3 cp \" + darknet_img_path + \" .\",\n", - " \"aws s3 cp \" + darknet_model_data + \" .\",\n", - " \"mkdir darknet_model\",\n", - " \"tar -xf \" + darknet_model_package + \" -C darknet_model\",\n", " \"aws s3 cp \" + keras_img_path + \" .\",\n", " \"aws s3 cp \" + keras_model_data + \" .\",\n", " \"mkdir keras_model\",\n", @@ -1277,6 +1098,7 @@ " \"sagemaker_edge_provider_aws_cert_pk_file\": \"/demo/iot-credentials/iot_key.pem.key\",\n", " \"sagemaker_edge_provider_aws_iot_cred_endpoint\": endpoint,\n", " \"sagemaker_edge_provider_provider\": \"Aws\",\n", + " \"sagemaker_edge_provider_provider_path\": \"/demo/lib/libprovider_aws.so\",\n", " \"sagemaker_edge_provider_s3_bucket_name\": bucket,\n", " \"sagemaker_edge_core_capture_data_destination\": \"Cloud\",\n", "}" @@ -1385,7 +1207,7 @@ "source": [ "### Load Model\n", "\n", - "In this section, we show the model management capabilities offered by SageMaker Edge Manager. We will load the two compiled and packaged models using the Agent. This keeps both models ready to run inference with. As you will see, once the models are loaded you can run multiple inferences as many times as necessary until the models are unloaded. This relieves the client applications from the logic and operational burden of managing them separately. These models are now simply an API away from running inference with.\n", + "In this section, we show the model management capabilities offered by SageMaker Edge Manager. We will load the two compiled and packaged models using the Agent. This keeps both models ready to run inference with. As you will see, once the models are loaded you can run multiple inferences as many times as necessary until the models are unloaded. This relieves the client applications from the logic and operational burden of managing them separately. 
These models are now simply an API away from running inference with.\n", "\n", "When loading the model with the SageMaker Edge Agent, the argument to the API points the Agent to a directory containing the packaged model (without any extraneous files within the directory). " ] @@ -1394,48 +1216,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Load darknet model\n", - "\n", - "`darknet_model` is the path containing the packaged model in this notebook. `demo-darknet` is the name given to this model. This name will be used later to refer to this model for, making predictions, capturing data, unload." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "load_darknet_model_out = ssm_client.send_command(\n", - " InstanceIds=[instance_id],\n", - " DocumentName=\"AWS-RunShellScript\",\n", - " OutputS3BucketName=bucket,\n", - " OutputS3KeyPrefix=folder,\n", - " Parameters={\n", - " \"commands\": [\n", - " \"cd /demo\",\n", - " \"./bin/sagemaker_edge_agent_client_example LoadModel darknet_model demo-darknet\",\n", - " ]\n", - " },\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ssm_client.get_command_invocation(\n", - " CommandId=load_darknet_model_out[\"Command\"][\"CommandId\"],\n", - " InstanceId=instance_id,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Load keras model\n", + "#### Load Keras model\n", "\n", "`keras_model` is the path containing the packaged model in this notebook. `demo-keras` is the name given to this model. This name will be used later to refer to this model for, making predictions, capturing data, unload." 
] @@ -1514,53 +1295,14 @@ "source": [ "### Run Predict\n", "\n", - "In this API, we pass the model name, input data file that will be directly fed into the neural network, input tensor name that was passed earlier during the compilation phase, along with it's size and shape." + "In this API, we pass the model name, input data file that will be directly fed into the neural network, input tensor name that was passed earlier during the compilation phase, along with its size and shape." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Run prediction on darknet model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "darknet_predict_out = ssm_client.send_command(\n", - " InstanceIds=[instance_id],\n", - " DocumentName=\"AWS-RunShellScript\",\n", - " OutputS3BucketName=bucket,\n", - " OutputS3KeyPrefix=folder,\n", - " Parameters={\n", - " \"commands\": [\n", - " \"cd /demo\",\n", - " \"./bin/sagemaker_edge_agent_client_example Predict demo-darknet darknet.bmp data 416 416 3\",\n", - " ]\n", - " },\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ssm_client.get_command_invocation(\n", - " CommandId=darknet_predict_out[\"Command\"][\"CommandId\"],\n", - " InstanceId=instance_id,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Run prediction on keras model" + "#### Run prediction on Keras model" ] }, { @@ -1604,38 +1346,6 @@ "Capture the inputs and outputs of an inference call to cloud or disk. The specific parameters were configured earlier in the config file. 
" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "darknet_capture_out = ssm_client.send_command(\n", - " InstanceIds=[instance_id],\n", - " DocumentName=\"AWS-RunShellScript\",\n", - " OutputS3BucketName=bucket,\n", - " OutputS3KeyPrefix=folder,\n", - " Parameters={\n", - " \"commands\": [\n", - " \"cd /demo\",\n", - " \"./bin/sagemaker_edge_agent_client_example PredictAndCapture demo-darknet darknet.bmp data 416 416 3\",\n", - " ]\n", - " },\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ssm_client.get_command_invocation(\n", - " CommandId=darknet_capture_out[\"Command\"][\"CommandId\"],\n", - " InstanceId=instance_id,\n", - ")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -1691,7 +1401,6 @@ " Parameters={\n", " \"commands\": [\n", " \"cd /demo\",\n", - " \"./bin/sagemaker_edge_agent_client_example UnloadModel demo-darknet\",\n", " \"./bin/sagemaker_edge_agent_client_example UnloadModel demo-keras\",\n", " ]\n", " },\n", @@ -1961,10 +1670,11 @@ } ], "metadata": { + "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "conda_tensorflow_p36", + "display_name": "Python 3", "language": "python", - "name": "conda_tensorflow_p36" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1976,7 +1686,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/sagemaker_edge_manager/sagemaker_edge_example/sagemaker_edge_greengrass_example.ipynb b/sagemaker_edge_manager/sagemaker_edge_example/sagemaker_edge_greengrass_example.ipynb new file mode 100644 index 0000000000..aafd0cbd22 --- /dev/null +++ b/sagemaker_edge_manager/sagemaker_edge_example/sagemaker_edge_greengrass_example.ipynb @@ -0,0 +1,1241 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# SageMaker Edge Manager 
Example" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Introduction\n", + "\n", + "SageMaker Edge Manager is a service from Amazon SageMaker that lets you:\n", + "\n", + "+ prepare custom models for edge device hardware\n", + "+ include a runtime for running machine learning inference efficiently on edge devices\n", + "+ enable the device to send samples of data from each model securely to SageMaker for relabeling and retraining.\n", + "\n", + "There are two main components to this service:\n", + "+ SageMaker Edge Manager in the Cloud \n", + "+ SageMaker Edge Agent on the Edge device\n", + "\n", + "This notebook demonstrates the end-to-end workflow for getting a running SageMaker Edge on the edge device. This will involve the following steps:\n", + "\n", + "+ Compile the model using SageMaker Neo\n", + "+ Package the compiled model with SageMaker Edge Manager\n", + "+ Deploy with SageMaker Edge Manager Agent\n", + "+ Run inference with the model\n", + "+ Capture model's input and output data to S3\n", + "\n", + "**Note**:\n", + "Typically, the SageMaker Edge Agent is run on an Edge device. For the sake of this notebook, we will run the Agent on an EC2 instance. We show how to package the compiled model and then load it to the Agent on the Edge Device to make predictions with. Finally, we show how to capture model's input and output to S3 via the Agent.\n", + "\n", + " When you run this notebook, choose the kernel: `conda_tensorflow_p36` if you are using a notebook instance or `Python 3 (TensorFlow 1.15 Python 3.6 CPU Optimized)` if you are using SageMaker Studio." + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "**Please note**: There are pricing implications to the use of this notebook. Please refer to [Edge Manager](https://aws.amazon.com/sagemaker/edge-manager/pricing) for more information." 
+ ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Demo Setup" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "We need an AWS account role with SageMaker access. This role is used to give SageMaker access to S3, launch an EC2 instance and create components and deployments in Greengrass." + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "import sagemaker\n", + "from sagemaker import get_execution_role\n", + "import boto3\n", + "import botocore\n", + "import json\n", + "\n", + "role = get_execution_role()\n", + "sess = sagemaker.Session()\n", + "region = boto3.Session().region_name" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "print(role)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Locate the above printed sagemaker role from [IAM console](https://console.aws.amazon.com/iam), find and attach the following policies to role:\n", + "\n", + "- AmazonEC2FullAccess \n", + "- AmazonEC2RoleforSSM \n", + "- AmazonSSMManagedInstanceCore \n", + "- AmazonSSMFullAccess \n", + "- AWSGreengrassFullAccess\n", + "- AWSIoTFullAccess \n", + "\n", + "You can find more information about how to attach policies to role [here](https://docs.aws.amazon.com/IAM/latest/UserGuide/access_policies_manage-attach-detach.html#add-policies-console).\n", + "\n", + "**If you try this example with a real device, only attach AWSIoTFullAccess to create certificates on AWS IoT.**" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "We then need an S3 bucket that would be used for storing the model artifacts generated after compilation and packaged artifacts generated after edge packaging job." 
+ ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "# S3 bucket and folders for saving model artifacts.\n", + "# Feel free to specify different bucket/folders here if you wish.\n", + "bucket = sess.default_bucket()\n", + "folder = \"DEMO-Sagemaker-Edge\"\n", + "compilation_output_sub_folder = folder + \"/compilation-output\"\n", + "iot_folder = folder + \"/iot\"\n", + "\n", + "# S3 Location to save the model artifact after compilation\n", + "s3_compilation_output_location = \"s3://{}/{}\".format(bucket, compilation_output_sub_folder)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Finally, we upload the test image to S3 bucket. This image will be used in inference later." + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "keras_img_path = sess.upload_data(\"keras.bmp\", bucket, iot_folder)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Launch EC2 Instance" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "As mentioned earlier, this EC2 instance is used in place of an Edge device for running the agent software." + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "ec2_client = boto3.client(\"ec2\", region_name=region)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Generate key pair for EC2 instance, save the key PEM file. We can use this key with SSH to connect to the instance. But in this notebook example, we will not use SSH, instead, we will use AWS Systems Manager to send commands to the instance." 
+ ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "key_pairs = ec2_client.describe_key_pairs()\n", + "key_names = list(map(lambda x: x[\"KeyName\"], key_pairs[\"KeyPairs\"]))\n", + "\n", + "key_name = \"ec2-key-pair\"\n", + "\n", + "if key_name in key_names:\n", + " ec2_key_pair = ec2_client.delete_key_pair(\n", + " KeyName=key_name,\n", + " )" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "ec2_key_pair = ec2_client.create_key_pair(\n", + " KeyName=key_name,\n", + ")\n", + "\n", + "key_pair = str(ec2_key_pair[\"KeyMaterial\"])\n", + "key_pair_file = open(\"ec2-key-pair.pem\", \"w\")\n", + "key_pair_file.write(key_pair)\n", + "key_pair_file.close()" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "# specify your role name used by GGv2\n", + "iot_device_role_name = 'SageMakerTESRole'" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Create a role for the EC2 instance we are going to use. For detailed information, read about [IAM roles for Amazon EC2](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html).\n", + "\n", + "Follow the steps here to [create an IAM role](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html#create-iam-role). 
Note down the role name and role ARN, role name will be used when we launch the EC2 instance, and role ARN will be needed to create inline policy.\n", + "\n", + "After creation, make sure the following policies are attached to role:\n", + "\n", + "- AmazonS3FullAccess \n", + "- AmazonSSMManagedInstanceCore \n", + "- CloudWatchAgentAdminPolicy\n", + "\n", + "Add an inline policy for this EC2 instance role, choose `Add inline policy` button on the role summary page, choose JSON format and replace the content with below statement ([Minimal IAM policy for installer to provision resources](https://docs.aws.amazon.com/greengrass/v2/developerguide/provision-minimal-iam-policy.html)). Replace account-id with your AWS account ID, and replace SageMakerTESRole with the name of the token exchange role that you specify in the cell above with iot_device_role_name.\n", + "\n", + "```\n", + "{\n", + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\n", + " \"Sid\": \"CreateTokenExchangeRole\",\n", + " \"Effect\": \"Allow\",\n", + " \"Action\": [\n", + " \"iam:AttachRolePolicy\",\n", + " \"iam:CreatePolicy\",\n", + " \"iam:CreateRole\",\n", + " \"iam:GetPolicy\",\n", + " \"iam:GetRole\",\n", + " \"iam:PassRole\"\n", + " ],\n", + " \"Resource\": [\n", + " \"arn:aws:iam:::role/\",\n", + " \"arn:aws:iam:::policy/Access\",\n", + " \"arn:aws:iam::aws:policy/Access\"\n", + " ]\n", + " },\n", + " {\n", + " \"Effect\": \"Allow\",\n", + " \"Action\": [\n", + " \"iot:AddThingToThingGroup\",\n", + " \"iot:AttachPolicy\",\n", + " \"iot:AttachThingPrincipal\",\n", + " \"iot:CreateKeysAndCertificate\",\n", + " \"iot:CreatePolicy\",\n", + " \"iot:CreateRoleAlias\",\n", + " \"iot:CreateThing\",\n", + " \"iot:CreateThingGroup\",\n", + " \"iot:DescribeEndpoint\",\n", + " \"iot:DescribeRoleAlias\",\n", + " \"iot:DescribeThingGroup\",\n", + " \"iot:GetPolicy\",\n", + " \"sts:GetCallerIdentity\"\n", + " ],\n", + " \"Resource\": \"*\"\n", + " },\n", + " {\n", + " \"Sid\": 
\"DeployDevTools\",\n", + " \"Effect\": \"Allow\",\n", + " \"Action\": [\n", + " \"greengrass:CreateDeployment\",\n", + " \"iot:CancelJob\",\n", + " \"iot:CreateJob\",\n", + " \"iot:DeleteThingShadow\",\n", + " \"iot:DescribeJob\",\n", + " \"iot:DescribeThing\",\n", + " \"iot:DescribeThingGroup\",\n", + " \"iot:GetThingShadow\",\n", + " \"iot:UpdateJob\",\n", + " \"iot:UpdateThingShadow\"\n", + " ],\n", + " \"Resource\": \"*\"\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "\n", + "\n", + "Locate the same sagemaker role used for this notebook in [Demo Setup](#Demo-Setup) in [IAM console](https://console.aws.amazon.com/iam), choose `Add inline policy` button on the role summary page, choose JSON format and replace the content with below statement:\n", + "\n", + "Before copying the following content, make sure you use the EC2 role ARN you just created in the `Resource` field for the `iam:PassRole` action.\n", + "\n", + "```\n", + "{\n", + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\n", + " \"Effect\": \"Allow\",\n", + " \"Action\": \"iam:PassRole\",\n", + " \"Resource\": \"arn:aws:iam:::role/\"\n", + " }\n", + " ]\n", + "}\n", + "```" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Launch an EC2 C5 instance. In this example we will use the AWS Deep Learning AMI." 
+ ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "ami = ec2_client.describe_images(\n", + " Filters=[{\"Name\": \"name\", \"Values\": [\"Deep Learning AMI (Ubuntu 18.04) Version 36.0\"]}]\n", + ")[\"Images\"][0][\"ImageId\"]\n", + "ami" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "ec2_profile_name = \"\" # replace with the name of the role created for EC2\n", + "\n", + "ec2_instance = ec2_client.run_instances(\n", + " ImageId=ami,\n", + " MinCount=1,\n", + " MaxCount=1,\n", + " InstanceType=\"c5.large\",\n", + " KeyName=key_name,\n", + " IamInstanceProfile={\"Name\": ec2_profile_name},\n", + ")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "instance_id = ec2_instance[\"Instances\"][0][\"InstanceId\"] # will use for running inference later\n", + "print(instance_id)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Compile Model using SageMaker Neo\n" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "sagemaker_client = boto3.client(\"sagemaker\", region_name=region)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Download pretrained Keras model" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "import tensorflow as tf\n", + "\n", + "model = tf.keras.applications.MobileNetV2()\n", + "model.save(\"mobilenet_v2.h5\")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "import tarfile\n", + "\n", + "with tarfile.open(\"mobilenet_v2.tar.gz\", mode=\"w:gz\") as archive:\n", + " archive.add(\"mobilenet_v2.h5\")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "keras_model_path 
= sess.upload_data(\"mobilenet_v2.tar.gz\", bucket, folder)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "**Note**: When calling ``create_compilation_job()`` user is expected to provide all the correct input shapes required by the model for successful compilation. If we are using a different model, we need to specify the framework and data shape correctly." + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "keras_model_data_shape = '{\"input_1\":[1,3,224,224]}'\n", + "keras_model_framework = \"keras\"\n", + "target_device = \"ml_c5\"" + ], + "outputs": [], + "metadata": { + "scrolled": true + } + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "import time\n", + "\n", + "keras_compilation_job_name = \"Sagemaker-Edge-\" + str(time.time()).split(\".\")[0]\n", + "print(\"Compilation job for %s started\" % keras_compilation_job_name)\n", + "\n", + "response = sagemaker_client.create_compilation_job(\n", + " CompilationJobName=keras_compilation_job_name,\n", + " RoleArn=role,\n", + " InputConfig={\n", + " \"S3Uri\": keras_model_path,\n", + " \"DataInputConfig\": keras_model_data_shape,\n", + " \"Framework\": keras_model_framework.upper(),\n", + " },\n", + " OutputConfig={\n", + " \"S3OutputLocation\": s3_compilation_output_location,\n", + " \"TargetDevice\": target_device,\n", + " },\n", + " StoppingCondition={\"MaxRuntimeInSeconds\": 900},\n", + ")\n", + "\n", + "print(response)\n", + "\n", + "# Poll every 30 sec\n", + "while True:\n", + " response = sagemaker_client.describe_compilation_job(\n", + " CompilationJobName=keras_compilation_job_name\n", + " )\n", + " if response[\"CompilationJobStatus\"] == \"COMPLETED\":\n", + " break\n", + " elif response[\"CompilationJobStatus\"] == \"FAILED\":\n", + " raise RuntimeError(\"Compilation failed\")\n", + " print(\"Compiling ...\")\n", + " time.sleep(30)\n", + "print(\"Done!\")" + ], + "outputs": [], + 
"metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Package Keras Model" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "keras_packaged_model_name = \"keras-model\"\n", + "keras_model_version = \"1.0.1\"\n", + "keras_component_name = \"com.model.keras\"\n", + "keras_model_package = \"{}-{}.tar.gz\".format(keras_packaged_model_name, keras_model_version)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "keras_packaging_job_name = (\n", + " keras_compilation_job_name + \"-packaging-\" + str(time.time()).split(\".\")[0]\n", + ")\n", + "response = sagemaker_client.create_edge_packaging_job(\n", + " RoleArn=role,\n", + " OutputConfig={\n", + " \"PresetDeploymentType\": \"GreengrassV2Component\",\n", + " \"PresetDeploymentConfig\": json.dumps(\n", + " {\"ComponentName\": keras_component_name, \"ComponentVersion\": keras_model_version}\n", + " ),\n", + " \"S3OutputLocation\": s3_compilation_output_location,\n", + " },\n", + " ModelName=keras_packaged_model_name,\n", + " ModelVersion=keras_model_version,\n", + " EdgePackagingJobName=keras_packaging_job_name,\n", + " CompilationJobName=keras_compilation_job_name,\n", + ")\n", + "\n", + "print(response)\n", + "\n", + "# Poll every 30 sec\n", + "while True:\n", + " job_status = sagemaker_client.describe_edge_packaging_job(\n", + " EdgePackagingJobName=keras_packaging_job_name\n", + " )\n", + " if job_status[\"EdgePackagingJobStatus\"] == \"COMPLETED\":\n", + " break\n", + " elif job_status[\"EdgePackagingJobStatus\"] == \"FAILED\":\n", + " raise RuntimeError(\"Edge Packaging failed\")\n", + " print(\"Packaging ...\")\n", + " time.sleep(30)\n", + "\n", + "if job_status[\"PresetDeploymentOutput\"][\"Status\"] != \"COMPLETED\":\n", + " print(\"!!Component packaging failed!!\")\n", + " print(job_status[\"PresetDeploymentOutput\"][\"StatusMessage\"])\n", + "else:\n", + " print(\"Done!\")" + ], + 
"outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "

If you are getting a Component packaging failed error, it is likely that you already have a component named com.model.keras with the same version in your account. To fix this, you can either delete the component from your account or increment the version of the component for the packaging job (see the keras_model_version variable above).

" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "keras_model_data = job_status[\"ModelArtifact\"]" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "### Install Greengrass\n", + "\n", + "SageMaker Edge Manager can use AWS IoT Greengrass to deploy the agent, the model and the inference application to the edge device.\n", + "\n", + "AWS IoT Greengrass provides all the necessary features to manage applications on remote devices in a secure and scalable way. To learn more about Greengrass, head to the [What is AWS IoT Greengrass?](https://docs.aws.amazon.com/greengrass/v2/developerguide/what-is-iot-greengrass.html). \n", + "\n", + "The SageMaker Edge Manager agent leverages the AWS credentials provided by the [Token exchange service](https://docs.aws.amazon.com/greengrass/v2/developerguide/token-exchange-service-component.html) component to securely communicate with the SageMaker Edge Manager backend.\n" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "ssm_client = boto3.client(\"ssm\", region_name=region)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "**Note**: If you are using a real device, connect to the device via SSH, ensure that you have both Java v8 or above and the Unzip command and then run the following commands (replace `` with the correct value). 
To run this command on the device you also need to provide IAM credentials with at least the permissions listed in [Minimal IAM policy for installer to provision resources](https://docs.aws.amazon.com/greengrass/v2/developerguide/provision-minimal-iam-policy.html).\n", + "\n", + "```bash\n", + "curl -s https://d2s8p88vqu9w66.cloudfront.net/releases/greengrass-nucleus-latest.zip > greengrass-nucleus-latest.zip \\\n", + " && unzip greengrass-nucleus-latest.zip -d GreengrassCore,\n", + "sudo -E java -Droot=\"/greengrass/v2\" -Dlog.store=FILE -jar ./GreengrassCore/lib/Greengrass.jar \\\n", + " --thing-name GreengrassSMEdgeManagerDevice -trn SageMakerTESRole -tra SageMakerTESRoleAlias \\ \n", + " --thing-group-name GreengrassSMEdgeManagerGroup \\ \n", + " --component-default-user ggc_user:ggc_group --provision true --setup-system-service true --deploy-dev-tools true \\\n", + " --aws-region \n", + "```\n", + "\n", + "Otherwise, run the following command to install Greengrass on the EC2 instance. You can find further information about the command to run and their effect in [Install AWS IoT Greengrass Core software with automatic resource provisioning](https://docs.aws.amazon.com/greengrass/v2/developerguide/quick-installation.html)." 
+ ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "response = ssm_client.send_command(\n", + " InstanceIds=[instance_id],\n", + " DocumentName=\"AWS-RunShellScript\",\n", + " OutputS3BucketName=bucket,\n", + " OutputS3KeyPrefix=folder,\n", + " Parameters={\n", + " \"commands\": [\n", + " \"#!/bin/bash\",\n", + " \"sudo apt update && apt install python3-venv -y\",\n", + " \"curl -s https://d2s8p88vqu9w66.cloudfront.net/releases/greengrass-nucleus-latest.zip > greengrass-nucleus-latest.zip && unzip greengrass-nucleus-latest.zip -d GreengrassCore\",\n", + " f'sudo -E java -Droot=\"/greengrass/v2\" -Dlog.store=FILE -jar ./GreengrassCore/lib/Greengrass.jar --aws-region {region} --thing-name GreengrassSMEdgeManagerDevice -trn {iot_device_role_name} -tra SageMakerTESRoleAlias --thing-group-name GreengrassSMEdgeManagerGroup --component-default-user ggc_user:ggc_group --provision true --setup-system-service true --deploy-dev-tools true',\n", + " ]\n", + " },\n", + ")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "ssm_client.get_command_invocation(\n", + " CommandId=response[\"Command\"][\"CommandId\"],\n", + " InstanceId=instance_id,\n", + ")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "print(iot_device_role_name)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Create Device Fleet" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "#### Modify the IAM role for device fleet" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Configure an IAM role in your AWS account that will be assumed by the credentials' provider on behalf of the devices in your device fleet. 
\n", + "\n", + "\n", + "Go to [IAM console](https://console.aws.amazon.com/iam/home?#/roles/SageMakerTESRole), and look for the role created for IoT, which is printed in the cell above \n", + "\n", + "1. Attach the following policies:\n", + "\n", + " - AmazonSageMakerEdgeDeviceFleetPolicy\n", + "\n", + "\n", + "2. Add the following permissions to the `SageMakerTESRoleAccess`:\n", + "\n", + "```json\n", + "{\n", + " \"Effect\": \"Allow\",\n", + " \"Action\": [\n", + " \"s3:GetObject\"\n", + " ],\n", + " \"Resource\": [\n", + " \"arn:aws:s3:::*SageMaker*\",\n", + " \"arn:aws:s3:::*Sagemaker*\",\n", + " \"arn:aws:s3:::*sagemaker*\"\n", + " ]\n", + "}\n", + "```\n", + "\n", + "3. Then edit the [trust relationship](https://console.aws.amazon.com/iam/home?#/roles/SageMakerTESRole?section=trust) as follows:\n", + "```\n", + "{\n", + " \"Version\": \"2012-10-17\",\n", + " \"Statement\": [\n", + " {\n", + " \"Effect\": \"Allow\",\n", + " \"Principal\": {\"Service\": \"credentials.iot.amazonaws.com\"},\n", + " \"Action\": \"sts:AssumeRole\"\n", + " },\n", + " {\n", + " \"Effect\": \"Allow\",\n", + " \"Principal\": {\"Service\": \"sagemaker.amazonaws.com\"},\n", + " \"Action\": \"sts:AssumeRole\"\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "\n", + "Note down the role ARN; it will be used later when creating the device fleet." 
+ ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "role_arn = \"\"" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "device_fleet_name = \"demo-device-fleet\" + str(time.time()).split(\".\")[0]\n", + "\n", + "sagemaker_client.create_device_fleet(\n", + " DeviceFleetName=device_fleet_name,\n", + " RoleArn=role_arn,\n", + " OutputConfig={\"S3OutputLocation\": s3_compilation_output_location},\n", + ")\n", + "\n", + "print(device_fleet_name)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "#### Register device to the fleet" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "device_name = \"GreengrassSMEdgeManagerDevice\"\n", + "\n", + "sagemaker_client.register_devices(\n", + " DeviceFleetName=device_fleet_name,\n", + " Devices=[\n", + " {\n", + " \"DeviceName\": device_name,\n", + " \"IotThingName\": device_name,\n", + " \"Description\": \"this is a sample virtual device\",\n", + " }\n", + " ],\n", + ")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Inference on Edge" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "In this example, we will use [AWS IoT Greengrass](https://docs.aws.amazon.com/) to remotely deploy the agent, the model and the inference application.\n", + "\n", + "The [SageMaker Edge Manager component](https://docs.aws.amazon.com/greengrass/v2/developerguide/sagemaker-edge-manager-component.html) is already provided and will be used to deploy and run the agent on the device.\n", + "\n", + "The model component has been created for you by the packaging job you executed previously, and in [your account](https://console.aws.amazon.com/iot/home?#/greengrass/v2/components) you should now have one component called `com.model.keras`. 
\n", + "\n", + "In order to be able to use the model, we also need an application component to load the model and invoke it. In the next section we are going to see how to create such component." + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Create the inference application component" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "We will use a Python application to load the model and perform the inference. The application is provided in the [inference.py](./inference.py) file. In addition to this file, you will also need to generate the Protobuf libraries that can be used with the gRPC API of the agent.\n", + "\n", + "First list the available releases from the S3 bucket. It does not matter which OS we are going to use since we only need the protobuf definitions." + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "!aws s3 ls s3://sagemaker-edge-release-store-us-west-2-linux-x64/Releases/ | sort -r" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Select the archive corresponding to the first item on the list:" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "!aws s3 cp \"s3://sagemaker-edge-release-store-us-west-2-linux-x64/Releases/1.20210512.96da6cc/1.20210512.96da6cc.tgz\" sm_agent.tgz" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Now we extract the protobuf definition:" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "!tar tf sm_agent.tgz" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "!tar xf sm_agent.tgz ./docs/api/agent.proto" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Run the following code to generate the Python libraries to use the API:" + ], + 
"metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "%%bash\n", + "# If you are running this on a personal computer you might want to create a Virtual Environment first. Uncomment the following lines \n", + "# python3 -m venv venv\n", + "# . venv/bin/activate\n", + "pip install pip --upgrade\n", + "pip install wheel\n", + "pip install grpcio==1.38.1\n", + "pip install grpcio-tools==1.38.1\n", + "python3 -m grpc_tools.protoc --proto_path=./docs/api --python_out=. --grpc_python_out=. agent.proto" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "We also need to download the JSON file containing the ImageNet classes that will be used to print the predictions" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "!curl \"https://storage.googleapis.com/download.tensorflow.org/data/imagenet_class_index.json\" -O" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Create the AWS IoT Greengrass component\n", + "\n", + "Next, upload the `inference.py` and the protobuf libraries to an S3 bucket so that they can be referenced by the component recipe and from where they will be downloaded by Greengrass on the device. We use the SageMaker bucket for this." 
+ ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "bucket" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "%%bash\n", + "# Replace with the bucket name printed above\n", + "export BUCKET=''\n", + "aws s3 cp inference.py s3://$BUCKET/com.sagemaker.edgePythonExample/1.0.0/inference.py\n", + "aws s3 cp agent_pb2.py s3://$BUCKET/com.sagemaker.edgePythonExample/1.0.0/agent_pb2.py\n", + "aws s3 cp agent_pb2_grpc.py s3://$BUCKET/com.sagemaker.edgePythonExample/1.0.0/agent_pb2_grpc.py\n", + "aws s3 cp keras.bmp s3://$BUCKET/com.sagemaker.edgePythonExample/1.0.0/keras.bmp\n", + "aws s3 cp imagenet_class_index.json s3://$BUCKET/com.sagemaker.edgePythonExample/1.0.0/imagenet_class_index.json" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "As a final step, create the component. You can use the [AWS Greengrass Console](https://console.aws.amazon.com/iot/home?#/greengrass/v2/components/create) to create a new component. Go to **Component>Create component**, select `Enter recipe as YAML` and copy and paste the following YAML. 
Before choosing `Create component`, make sure you have replaced the `_BUCKET_` placeholder with the name of the bucket to which you have uploaded the artifacts.\n", + "\n", + "```yaml\n", + "---\n", + "RecipeFormatVersion: 2020-01-25\n", + "ComponentName: com.sagemaker.edgePythonExample\n", + "ComponentVersion: 1.0.0\n", + "ComponentDescription: Sagemaker Edge Manager Python example\n", + "ComponentPublisher: Amazon Web Services, Inc.\n", + "ComponentDependencies:\n", + " aws.greengrass.SageMakerEdgeManager:\n", + " VersionRequirement: '>=1.0.0'\n", + " DependencyType: HARD\n", + " com.model.keras:\n", + " VersionRequirement: '~1.0.0'\n", + " DependencyType: HARD\n", + "ComponentConfiguration:\n", + " DefaultConfiguration:\n", + " Demo: \"true\"\n", + " MLModel: keras\n", + " ImagePath: /absolute/path\n", + " CaptureData: \"true\"\n", + "Manifests:\n", + " - Platform:\n", + " os: linux\n", + " architecture: \"/amd64|x86/\"\n", + " Lifecycle:\n", + " Install: |-\n", + " python3 -m venv venv\n", + " . venv/bin/activate\n", + " pip install pip --upgrade\n", + " pip install wheel \n", + " pip3 install grpcio==1.38.1\n", + " pip3 install grpcio-tools==1.38.1\n", + " pip3 install protobuf\n", + " pip3 install Pillow\n", + " pip3 install numpy\n", + " Run:\n", + " Setenv:\n", + " DEMO: \"{configuration:/Demo}\"\n", + " ML_MODEL: \"{configuration:/MLModel}\"\n", + " CAPTURE_DATA: \"{configuration:/CaptureData}\"\n", + " Script: |- \n", + " export MODEL_PATH=\"{com.model.keras:work:path}\"\n", + " export IMAGE_PATH=\"{artifacts:path}/keras.bmp\"\n", + " export IMAGENET_CLASS_INDEX_PATH=\"{artifacts:path}/imagenet_class_index.json\"\n", + " if [ $DEMO != 'true' ]; then\n", + " export IMAGE_PATH=\"{configuration:/ImagePath}\"\n", + " fi\n", + " . 
venv/bin/activate \n", + " python3 -u {artifacts:path}/inference.py \n", + " Artifacts:\n", + " - URI: s3://_BUCKET_/com.sagemaker.edgePythonExample/1.0.0/inference.py\n", + " - URI: s3://_BUCKET_/com.sagemaker.edgePythonExample/1.0.0/agent_pb2.py\n", + " - URI: s3://_BUCKET_/com.sagemaker.edgePythonExample/1.0.0/agent_pb2_grpc.py\n", + " - URI: s3://_BUCKET_/com.sagemaker.edgePythonExample/1.0.0/keras.bmp \n", + " - URI: s3://_BUCKET_/com.sagemaker.edgePythonExample/1.0.0/imagenet_class_index.json\n", + "```\n", + "\n", + "> Note: instead of uploading each single file and specifying them as artifacts in the component recipe, you can also create a ZIP archive and modify the recipe as follows:\n", + "\n", + "```yaml\n", + " Lifecycle:\n", + " Run:\n", + " ...\n", + " Script:\n", + " ...\n", + " python3 -u {artifacts:decompressedPath}/app/inference.py\n", + " Artifacts:\n", + " - URI: s3://_BUCKET_/com.sagemaker.edgePythonExample/1.0.0/app.zip\n", + " Unarchive: ZIP\n", + "```" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "# Deploy the application\n", + "\n", + "Once the application component has been created, it can be deployed to the device. \n", + "\n", + "1. In the [AWS Console](https://console.aws.amazon.com/iot/home#/greengrass/v2/components/private) select the `com.sagemaker.edgePythonExample` component, and choose **Deploy**.\n", + "2. Select the deployment named `Deployment for GreengrassSMEdgeManagerGroup` and choose `Next`\n", + "3. Toggle the selector on the `Public components` pane, search for `SageMakerEdgeManager`, and select it\n", + "4. Choose **Next**\n", + "5. Select the `aws.greengrass.SageMakerEdgeManager` component and choose **Configure component**\n", + "6. Replace the **Configuration to merge** content with the following JSON. Don't forget to replace the placeholders with the actual values.\n", + "```json\n", + "{\n", + "\t\"DeviceFleetName\": \"<device-fleet-name>\",\n", + "\t\"BucketName\": \"<bucket-name>\"\n", + "}\n", + "```\n", + "7. 
Choose **Confirm**\n", + "8. Choose **Next** until you reach the last screen. \n", + "9. Choose **Deploy**.\n", + "\n", + "You can check the status of the deployment by clicking on the device name `GreengrassSMEdgeManagerDevice` in the Target core devices pane and then selecting the Deployments tab.\n", + "\n", + "After a few seconds the components will have been deployed to the EC2 instance and you should be able to see the results of the inference in the Greengrass logs by executing:\n", + "```bash\n", + "sudo cat /greengrass/v2/logs/com.sagemaker.edgePythonExample.log\n", + "```\n", + "\n", + "If you are using another device, it might take longer depending on the network speed.\n", + "\n", + "As you may have noticed, there is no need to install the agent separately or download the model: these tasks are performed by Greengrass based on the fact that the application recipe defined the dependencies on the SageMaker Edge Manager component and the model component created by the packaging job.\n", + "\n" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "cat_log_out = ssm_client.send_command(\n", + " InstanceIds=[instance_id],\n", + " DocumentName=\"AWS-RunShellScript\",\n", + " OutputS3BucketName=bucket,\n", + " OutputS3KeyPrefix=folder,\n", + " Parameters={\n", + " \"commands\": [\n", + " \"sudo tail -30 /greengrass/v2/logs/com.sagemaker.edgePythonExample.log\",\n", + " ]\n", + " },\n", + ")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "output = ssm_client.get_command_invocation(\n", + " CommandId=cat_log_out[\"Command\"][\"CommandId\"],\n", + " InstanceId=instance_id,\n", + ")[\"StandardOutputContent\"]\n", + "print(output)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Customizing the application\n", + "\n", + "The application component is configured to run in demo mode by default, meaning it will use a pre-loaded image 
to perform the prediction. In case you want to provide your own image, you can change the component configuration during the deployment by passing the following values:\n", + "```json\n", + "{\n", + " \"Demo\": \"false\",\n", + " \"ImagePath\": \"/absolute/host/path/to/the/image.bmp\"\n", + "}\n", + "```\n", + "\n", + "Note that the image must be in BMP format and 224x224 pixels. If you want to provide images in other formats and sizes, you need to preprocess the image to obtain the above format and encoding.\n", + "\n", + "The code will:\n", + "\n", + "1. Load the model in the SageMaker Edge Manager agent\n", + "2. List the models\n", + "3. Perform the prediction\n", + "4. Unload the model\n" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Clean Up" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Undeploy the application" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "ssm_client.cancel_command(CommandId=agent_out[\"Command\"][\"CommandId\"], InstanceIds=[instance_id])" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Stop the EC2 instance" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "ec2_client.stop_instances(InstanceIds=[instance_id])" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Detach and delete policy" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "iot_client.detach_policy(policyName=policy_name, target=iot_cert[\"certificateArn\"])\n", + "\n", + "iot_client.delete_policy(policyName=policy_name)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Deregister device and delete device fleet" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + 
"sagemaker_client.deregister_devices(DeviceFleetName=device_fleet_name, DeviceNames=[device_name])\n", + "\n", + "sagemaker_client.delete_device_fleet(DeviceFleetName=device_fleet_name)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Appendix" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### (Optional) Use LogManager component to upload logs to CloudWatch" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "If you want to be able to access the logs generated by the component fom the cloud, you can add `aws.greengrass.LogManager` component to the deployment. \n", + "\n", + "1. Open the AWS Console and navigate to IoT Core > Greengrass > Components\n", + "2. Choose the `aws.greengrass.LogManager` component in the Public components tab \n", + "3. Choose `Deploy`\n", + "4. Select **Add to existing deployment** and select the same deployment you used earlier\n", + "5. Choose **Next**\n", + "6. Choose **Next**\n", + "7. Choose **Next**\n", + "7. On the **Component configuration** screen select the `aws.greengrass.LogManager` component and choose **Configure component**\n", + "8. In the **Configuration to merge** pane enter the following\n", + "```json\n", + "{\n", + " \"logsUploaderConfiguration\": {\n", + " \"componentLogsConfiguration\": [\n", + " {\n", + " \"componentName\": \"com.sagemaker.edgePythonExample\"\n", + " }\n", + " ]\n", + " }\n", + "}\n", + "```\n", + "9. Choose **Confirm**\n", + "10. Choose **Next** until the last page and then **Deploy**\n", + "\n", + "Once the deployment is completed on the device, you will find the logs uploaded to the Amazon CloudWatch console." 
+ ], + "metadata": {} + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/sagemaker_neo_compilation_jobs/README.md b/sagemaker_neo_compilation_jobs/README.md index 94cbc9f13a..2db0ad01bc 100644 --- a/sagemaker_neo_compilation_jobs/README.md +++ b/sagemaker_neo_compilation_jobs/README.md @@ -5,7 +5,6 @@ These examples focus on the Amazon SageMaker Neo which allows you to compile models and host in pre-built containers. -- [GluonCV SSD Mobilenet](gluoncv_ssd_mobilenet) - [Image Classification](imageclassification_caltech) - [MNIST with MXNet](mxnet_mnist) - [Deploying pre-trained PyTorch vision models](pytorch_torchvision) diff --git a/sagemaker_neo_compilation_jobs/deploy_pytorch_model_on_Inf1_instance/pytorch_torchvision_neo_on_Inf1.ipynb b/sagemaker_neo_compilation_jobs/deploy_pytorch_model_on_Inf1_instance/pytorch_torchvision_neo_on_Inf1.ipynb index 9fbeafb500..04027c019e 100644 --- a/sagemaker_neo_compilation_jobs/deploy_pytorch_model_on_Inf1_instance/pytorch_torchvision_neo_on_Inf1.ipynb +++ b/sagemaker_neo_compilation_jobs/deploy_pytorch_model_on_Inf1_instance/pytorch_torchvision_neo_on_Inf1.ipynb @@ -160,7 +160,7 @@ "source": [ "## Deploy model on Inf1 instance for real-time inferences\n", "\n", - "After creating the PyTorch model, we compile the model using Amazon SageMaker Neo to optize performance for our desired deployment target. 
To compile our model for deploying on Inf1 instances, we are using the ``compile()`` method and select ``'ml_inf1'`` as our deployment target. The compiled model will then be deployed on an endpoint using Inf1 instances in Amazon SageMaker. \n", + "After creating the PyTorch model, we compile the model using Amazon SageMaker Neo to optimize performance for our desired deployment target. To compile our model for deploying on Inf1 instances, we are using the ``compile()`` method and select ``'ml_inf1'`` as our deployment target. The compiled model will then be deployed on an endpoint using Inf1 instances in Amazon SageMaker.\n", "\n", "## Compile the model \n", "\n", @@ -276,7 +276,7 @@ "metadata": {}, "source": [ "## Delete the Endpoint\n", - "Having an endpoint running will incur some costs. Therefore as a clean-up job, we should delete the endpoint." + "Having an endpoint running will incur some costs. Therefore, as a clean-up job, we should delete the endpoint." ] }, { @@ -339,7 +339,7 @@ "source": [ "In order to host model compiled for 2 cores, we set environment variables NEURONCORE_GROUP_SIZES and SAGEMAKER_MODEL_SERVER_WORKERS.\n", "### More Information on Environment Variables for Hosting\n", - "NEURONCORE_GROUP_SIZES - If the model is compiled for n inferentia cores , set NEURONCORE_GROUP_SIZES=n . For more information on NEURONCORE_GROUP_SIZES, refer to https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/tensorflow-neuron/tutorials/tutorial-tensorflow-NeuronCore-Group.html\n", + "NEURONCORE_GROUP_SIZES - If the model is compiled for n inferentia cores, set NEURONCORE_GROUP_SIZES=n. For more information on NEURONCORE_GROUP_SIZES, refer to https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/tensorflow-neuron/tutorials/tutorial-tensorflow-NeuronCore-Group.html\n", "\n", "SAGEMAKER_MODEL_SERVER_WORKERS - Number of workers required to utilize all inferentia cores. 
For example, on inf1.2xlarge or inf1.xlarge, if the model is compiled for one core,\n", "we need 4 workers to utilize all inferentia cores which will load the compiled model in different processes. If the model is compiled for 2 cores, we only need 2 workers to utilize \n", @@ -386,7 +386,7 @@ " framework_version=\"1.5.1\",\n", " role=role,\n", " job_name=compilation_job_name,\n", - " compiler_options={\"num-neuroncores\": 2},\n", + " compiler_options='\"--verbose 1 --neuroncore-pipeline-cores 2\"',\n", ")" ] }, @@ -472,7 +472,7 @@ "metadata": {}, "source": [ "## Delete the Endpoint\n", - "Having an endpoint running will incur some costs. Therefore as a clean-up job, we should delete the endpoint." + "Having an endpoint running will incur some costs. Therefore, as a clean-up job, we should delete the endpoint." ] }, { diff --git a/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet/gluoncv_ssd_mobilenet_neo.ipynb b/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet/gluoncv_ssd_mobilenet_neo.ipynb new file mode 100644 index 0000000000..c850a71528 --- /dev/null +++ b/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet/gluoncv_ssd_mobilenet_neo.ipynb @@ -0,0 +1,443 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Deploy pre-trained GluonCV SSD Mobilenet model with SageMaker Neo\n", + "\n", + "1. [Introduction](#Introduction)\n", + "2. [Setup](#Setup)\n", + " 1. [Import SSD Mobilenet model from MXNet GluonCV](#Import-SSD-Mobilenet-model-from-MXNet-GluonCV)\n", + " 2. [Upload model to S3](#Upload-model-to-S3)\n", + " 3. [Use sagemaker MXNetModel to load pretrained MXNet model](#Use-sagemaker-MXNetModel-to-load-pretrained-MXNet-model)\n", + "3. [Compile the pre-trained model using SageMaker Neo](#Compile-the-pre-trained-model-using-SageMaker-Neo)\n", + "4. [Deploy-the-compiled-model-and-request-Inferences](#Deploy-the-compiled-model-and-request-Inferences)\n", + "5. 
[Delete the Endpoint](#Delete-the-Endpoint)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "This example demonstrates how to load a pre-trained MXNet GluonCV SSD model, optimize the trained model using SageMaker Neo, and host the model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "To compile and deploy the ssd mobilenet model on Amazon SageMaker, we need to setup and authenticate the use of AWS services.\n", + "\n", + "To start, we need to upgrade the [SageMaker SDK for Python](https://sagemaker.readthedocs.io/en/stable/v2.html) to v2.33.0 or greater and latest MXNet GluonCV and restart the kernel." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!~/anaconda3/envs/mxnet_p36/bin/pip install --upgrade sagemaker>=2.33.0 gluoncv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we need an AWS account role with SageMaker access. This role is used to give SageMaker access to your data in S3. We also create a session." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sagemaker\n", + "from sagemaker import get_execution_role\n", + "\n", + "role = get_execution_role()\n", + "sess = sagemaker.Session()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then need an S3 bucket that would be used for storing the model artifacts generated after training and compilation, training data and custom code. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# S3 bucket and folders for saving code and model artifacts.\n", + "# Feel free to specify different bucket/folders here if you wish.\n", + "bucket = sess.default_bucket()\n", + "folder = \"DEMO-ObjectDetection-SSD-MobileNet\"\n", + "pretrained_model_sub_folder = folder + \"/pretrained-model\"\n", + "compilation_output_sub_folder = folder + \"/compilation-output\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To easily visualize the detection outputs we also define the following function. The function visualizes the high-confidence predictions with bounding box by filtering out low-confidence detections." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "def visualize_detection(img_file, dets, classes=[], thresh=0.6):\n", + " \"\"\"\n", + " visualize detections in one image\n", + " Parameters:\n", + " ----------\n", + " img_file : numpy.array\n", + " image, in bgr format\n", + " dets : numpy.array\n", + " ssd detections, numpy.array([[id, score, x1, y1, x2, y2]...])\n", + " each row is one object\n", + " classes : tuple or list of str\n", + " class names\n", + " thresh : float\n", + " score threshold\n", + " \"\"\"\n", + " import random\n", + " import matplotlib.pyplot as plt\n", + " import matplotlib.image as mpimg\n", + " from matplotlib.patches import Rectangle\n", + "\n", + " img = mpimg.imread(img_file)\n", + " plt.imshow(img)\n", + " height = img.shape[0]\n", + " width = img.shape[1]\n", + " colors = dict()\n", + " klasses = dets[0][0]\n", + " scores = dets[1][0]\n", + " bbox = dets[2][0]\n", + " for i in range(len(classes)):\n", + " klass = klasses[i][0]\n", + " score = scores[i][0]\n", + " x0, y0, x1, y1 = bbox[i]\n", + " if score < thresh:\n", + " continue\n", + " cls_id = int(klass)\n", + " if cls_id not in colors:\n", + 
" colors[cls_id] = (random.random(), random.random(), random.random())\n", + " xmin = int(x0 * width / 512)\n", + " ymin = int(y0 * height / 512)\n", + " xmax = int(x1 * width / 512)\n", + " ymax = int(y1 * height / 512)\n", + " rect = Rectangle(\n", + " (xmin, ymin),\n", + " xmax - xmin,\n", + " ymax - ymin,\n", + " fill=False,\n", + " edgecolor=colors[cls_id],\n", + " linewidth=3.5,\n", + " )\n", + " plt.gca().add_patch(rect)\n", + " class_name = str(cls_id)\n", + " if classes and len(classes) > cls_id:\n", + " class_name = classes[cls_id]\n", + " plt.gca().text(\n", + " xmin,\n", + " ymin - 2,\n", + " \"{:s} {:.3f}\".format(class_name, score),\n", + " bbox=dict(facecolor=colors[cls_id], alpha=0.5),\n", + " fontsize=12,\n", + " color=\"white\",\n", + " )\n", + " plt.tight_layout(rect=[0, 0, 2, 2])\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing object categories\n", + "object_categories = [\n", + " \"aeroplane\",\n", + " \"bicycle\",\n", + " \"bird\",\n", + " \"boat\",\n", + " \"bottle\",\n", + " \"bus\",\n", + " \"car\",\n", + " \"cat\",\n", + " \"chair\",\n", + " \"cow\",\n", + " \"diningtable\",\n", + " \"dog\",\n", + " \"horse\",\n", + " \"motorbike\",\n", + " \"person\",\n", + " \"pottedplant\",\n", + " \"sheep\",\n", + " \"sofa\",\n", + " \"train\",\n", + " \"tvmonitor\",\n", + "]\n", + "\n", + "# Setting a threshold 0.20 will only plot detection results that have a confidence score greater than 0.20\n", + "threshold = 0.20" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we load the test image into the memory. The test image used in this notebook is from [PEXELS](https://www.pexels.com/) which remains unseen until the time of prediction." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import PIL.Image\n", + "import numpy as np\n", + "\n", + "test_file = \"test.jpg\"\n", + "test_image = PIL.Image.open(test_file)\n", + "test_image = np.asarray(test_image.resize((512, 512)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import SSD Mobilenet model from MXNet GluonCV\n", + "\n", + "This example uses a pre-trained MXNet GluonCV SSD model initially published in:\n", + "> Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector. ECCV 2016." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import mxnet as mx\n", + "import gluoncv as gcv\n", + "import tarfile\n", + "\n", + "net = gcv.model_zoo.get_model(\"ssd_512_mobilenet1.0_voc\", pretrained=True)\n", + "net.hybridize()\n", + "net(mx.nd.ones((1, 3, 512, 512)))\n", + "net.export(\"model\")\n", + "tar = tarfile.open(\"ssd_512_mobilenet1.0_voc.tar.gz\", \"w:gz\")\n", + "\n", + "for name in [\"model-0000.params\", \"model-symbol.json\"]:\n", + " tar.add(name)\n", + "tar.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Upload model to S3\n", + "Upload the pre-trained model to the S3 bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pretrained_model_path = sess.upload_data(\n", + " path=\"ssd_512_mobilenet1.0_voc.tar.gz\", bucket=bucket, key_prefix=pretrained_model_sub_folder\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we need to set up training and compilation output locations in S3, where the respective model artifacts will be dumped. 
We also set up the S3 location for training data and custom code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# S3 location of the uploaded pre-trained model artifact\n", + "s3_pretrained_model_location = \"s3://{}/{}\".format(bucket, pretrained_model_sub_folder)\n", + "\n", + "# S3 Location to save the model artifact after compilation\n", + "s3_compilation_output_location = \"s3://{}/{}\".format(bucket, compilation_output_sub_folder)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use sagemaker MXNetModel to load pretrained MXNet model\n", + "When loading the model, the user is expected to provide the `entry_point` script required by the model. We set the `MMS_DEFAULT_RESPONSE_TIMEOUT` environment variable to `500` for the MXNet model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% H\n" + } + }, + "outputs": [], + "source": [ + "from sagemaker.mxnet.model import MXNetModel\n", + "from sagemaker.mxnet import MXNetPredictor\n", + "\n", + "pre_trained_model = MXNetModel(\n", + " model_data=pretrained_model_path,\n", + " predictor_cls=MXNetPredictor,\n", + " framework_version=\"1.8\",\n", + " role=role,\n", + " sagemaker_session=sess,\n", + " entry_point=\"ssd_entry_point.py\",\n", + " py_version=\"py3\",\n", + " env={\"MMS_DEFAULT_RESPONSE_TIMEOUT\": \"500\"},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compile the pre-trained model using SageMaker Neo\n", + "\n", + "After loading the pretrained model we can use SageMaker Neo's ``compile()`` API to compile the pretrained model. When calling ``compile()``, the user is expected to provide all the correct input shapes required by the model for successful compilation. 
We also specify the target instance family, the name of our IAM execution role, and the S3 bucket in which the compiled model will be stored.\n", + "\n", + "For this example, we will choose `ml_p3` as the target instance family while compiling the trained model. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "import time\n", + "\n", + "compiled_model = pre_trained_model.compile(\n", + " job_name=\"ssd-512-mobilenet-{}\".format(time.strftime(\"%Y%m%d%I%M%S\")),\n", + " target_instance_family=\"ml_p3\",\n", + " input_shape={\"data\": [1, 3, 512, 512]},\n", + " role=role,\n", + " framework=\"mxnet\",\n", + " framework_version=\"1.8\",\n", + " output_path=s3_compilation_output_location,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploy the compiled model and request Inferences\n", + "\n", + "We have to deploy the compiled model within the instance family for which the trained model was compiled. Since we have compiled for `ml_p3` we can deploy to any `ml.p3` instance type. For this example we will choose `ml.p3.2xlarge`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "neo_object_detector = compiled_model.deploy(initial_instance_count=1, instance_type=\"ml.p3.2xlarge\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "response = neo_object_detector.predict(test_image)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize the detections.\n", + "visualize_detection(test_file, response, object_categories, threshold)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Delete the Endpoint\n", + "Having an endpoint running will incur some costs. 
Therefore, as an optional clean-up job, you can delete it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Endpoint name: \" + neo_object_detector.endpoint_name)\n", + "neo_object_detector.delete_endpoint()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_mxnet_p36", + "language": "python", + "name": "conda_mxnet_p36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet/ssd_entry_point.py b/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet/ssd_entry_point.py new file mode 100644 index 0000000000..bfb27dc713 --- /dev/null +++ b/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet/ssd_entry_point.py @@ -0,0 +1,277 @@ +import io +import json +import logging +import os + +import numpy as np +import PIL.Image + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + +# ------------------------------------------------------------ # +# Training methods # +# ------------------------------------------------------------ # + +import argparse +import glob +import time +import warnings + +import mxnet as mx +from mxnet import autograd, gluon, nd + + +def parse_args(): + parser = argparse.ArgumentParser(description="Train SSD networks.") + parser.add_argument( + "--network", type=str, default="ssd_512_mobilenet1.0_voc", help="Network name" + ) + parser.add_argument( + "--data-shape", type=int, default=512, help="Input data shape, use 300, 512." 
+ ) + parser.add_argument("--batch-size", type=int, default=32, help="Training mini-batch size") + parser.add_argument( + "--num-workers", + "-j", + dest="num_workers", + type=int, + default=4, + help="Number of data workers, you can use larger " + "number to accelerate data loading, if you CPU and GPUs are powerful.", + ) + parser.add_argument( + "--gpus", type=str, default="0", help="Training with GPUs, you can specify 1,3 for example." + ) + parser.add_argument("--epochs", type=int, default=240, help="Training epochs.") + parser.add_argument( + "--start-epoch", + type=int, + default=0, + help="Starting epoch for resuming, default is 0 for new training." + "You can specify it to 100 for example to start from 100 epoch.", + ) + parser.add_argument( + "--log-interval", type=int, default=100, help="Logging mini-batch interval. Default is 100." + ) + parser.add_argument("--lr", type=float, default=0.001, help="Learning rate, default is 0.001") + parser.add_argument( + "--lr-decay", type=float, default=0.1, help="decay rate of learning rate. default is 0.1." + ) + parser.add_argument( + "--lr-decay-epoch", + type=str, + default="160,200", + help="epochs at which learning rate decays. 
default is 160,200.", + ) + parser.add_argument("--momentum", type=float, default=0.9, help="SGD momentum, default is 0.9") + parser.add_argument("--wd", type=float, default=0.0005, help="Weight decay, default is 5e-4") + + return parser.parse_args() + + +def get_dataloader(net, data_shape, batch_size, num_workers, ctx): + """Get dataloader.""" + + from gluoncv import data as gdata + from gluoncv.data.batchify import Pad, Stack, Tuple + from gluoncv.data.transforms.presets.ssd import SSDDefaultTrainTransform + + width, height = data_shape, data_shape + # use fake data to generate fixed anchors for target generation + with autograd.train_mode(): + _, _, anchors = net(mx.nd.zeros((1, 3, height, width), ctx)) + anchors = anchors.as_in_context(mx.cpu()) + batchify_fn = Tuple(Stack(), Stack(), Stack()) # stack image, cls_targets, box_targets + train_dataset = gdata.RecordFileDetection( + os.path.join(os.environ["SM_CHANNEL_TRAIN"], "train.rec") + ) + train_loader = gluon.data.DataLoader( + train_dataset.transform(SSDDefaultTrainTransform(width, height, anchors)), + batch_size, + True, + batchify_fn=batchify_fn, + last_batch="rollover", + num_workers=num_workers, + ) + return train_loader + + +def train(net, train_data, ctx, args): + """Training pipeline""" + + import gluoncv as gcv + + net.collect_params().reset_ctx(ctx) + + trainer = gluon.Trainer( + net.collect_params(), + "sgd", + {"learning_rate": args.lr, "wd": args.wd, "momentum": args.momentum}, + update_on_kvstore=None, + ) + + # lr decay policy + lr_decay = float(args.lr_decay) + lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(",") if ls.strip()]) + + mbox_loss = gcv.loss.SSDMultiBoxLoss() + ce_metric = mx.metric.Loss("CrossEntropy") + smoothl1_metric = mx.metric.Loss("SmoothL1") + + # set up logger + logging.basicConfig() + logger = logging.getLogger() + logger.setLevel(logging.INFO) + logger.info(args) + logger.info("Start training from [Epoch {}]".format(args.start_epoch)) + best_map = [0] + 
+ for epoch in range(args.start_epoch, args.epochs): + while lr_steps and epoch >= lr_steps[0]: + new_lr = trainer.learning_rate * lr_decay + lr_steps.pop(0) + trainer.set_learning_rate(new_lr) + logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr)) + ce_metric.reset() + smoothl1_metric.reset() + tic = time.time() + btic = time.time() + net.hybridize(static_alloc=True, static_shape=True) + + for i, batch in enumerate(train_data): + data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0) + cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0) + box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0) + + with autograd.record(): + cls_preds = [] + box_preds = [] + for x in data: + cls_pred, box_pred, _ = net(x) + cls_preds.append(cls_pred) + box_preds.append(box_pred) + sum_loss, cls_loss, box_loss = mbox_loss( + cls_preds, box_preds, cls_targets, box_targets + ) + autograd.backward(sum_loss) + # since we have already normalized the loss, we don't want to normalize + # by batch-size anymore + trainer.step(1) + + local_batch_size = int(args.batch_size) + ce_metric.update(0, [l * local_batch_size for l in cls_loss]) + smoothl1_metric.update(0, [l * local_batch_size for l in box_loss]) + if args.log_interval and not (i + 1) % args.log_interval: + name1, loss1 = ce_metric.get() + name2, loss2 = smoothl1_metric.get() + logger.info( + "[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}".format( + epoch, i, args.batch_size / (time.time() - btic), name1, loss1, name2, loss2 + ) + ) + btic = time.time() + + name1, loss1 = ce_metric.get() + name2, loss2 = smoothl1_metric.get() + logger.info( + "[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}".format( + epoch, (time.time() - tic), name1, loss1, name2, loss2 + ) + ) + current_map = 0.0 + + # save model + net.set_nms(nms_thresh=0.45, nms_topk=400, post_nms=100) + net(mx.nd.ones((1, 3, 512, 512), ctx=ctx[0])) + 
net.export("%s/model" % os.environ["SM_MODEL_DIR"]) + return net + + +if __name__ == "__main__": + + from gluoncv import model_zoo + + args = parse_args() + + ctx = [mx.gpu(int(i)) for i in args.gpus.split(",") if i.strip()] + ctx = ctx if ctx else [mx.cpu()] + + net = model_zoo.get_model(args.network, pretrained=False, ctx=ctx) + net.initialize(ctx=mx.gpu(0)) + train_loader = get_dataloader(net, args.data_shape, args.batch_size, args.num_workers, ctx[0]) + + train(net, train_loader, ctx, args) + +# ------------------------------------------------------------ # +# Hosting methods for Neo compiled model # +# ------------------------------------------------------------ # + + +def model_fn(model_dir): + """ + Load the gluon model. Called once when hosting service starts. + :param: model_dir The directory where model files are stored. + :return: a model (in this case a Gluon network) + """ + logging.info("Invoking user-defined model_fn") + import neomx # noqa: F401 + + # change context to mx.cpu() when optimizing and deploying with Neo for CPU endpoints + ctx = mx.gpu() + net = gluon.SymbolBlock.imports( + "%s/compiled-symbol.json" % model_dir, + ["data"], + "%s/compiled-0000.params" % model_dir, + ctx=ctx, + ) + net.hybridize(static_alloc=True, static_shape=True) + # run warm-up inference on empty data + warmup_data = mx.nd.empty((1, 3, 512, 512), ctx=ctx) + class_IDs, scores, bounding_boxes = net(warmup_data) + + return net + + +def transform_fn(net, data, content_type, output_content_type): + """ + pre-process the incoming payload, perform prediction & convert the prediction output into response payload + """ + logging.info("Invoking user-defined transform_fn") + + import gluoncv as gcv + + # change context to mx.cpu() when optimizing and deploying with Neo for CPU endpoints + ctx = mx.gpu() + + """ + pre-processing + """ + # decode json string into numpy array + data = json.loads(data) + + # preprocess image + x, image = 
gcv.data.transforms.presets.ssd.transform_test(mx.nd.array(data), 512) + + # load image onto right context + x = x.as_in_context(ctx) + + """ + prediction/inference + """ + class_IDs, scores, bounding_boxes = net(x) + + """ + post-processing + """ + # create list of results + result = [ + class_IDs.asnumpy().tolist(), + scores.asnumpy().tolist(), + bounding_boxes.asnumpy().tolist(), + ] + + # decode as json string + response_body = json.dumps(result) + + return response_body, output_content_type diff --git a/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet/test.jpg b/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet/test.jpg new file mode 100644 index 0000000000..2eab5780e7 Binary files /dev/null and b/sagemaker_neo_compilation_jobs/gluoncv_ssd_mobilenet/test.jpg differ diff --git a/sagemaker_neo_compilation_jobs/index.rst b/sagemaker_neo_compilation_jobs/index.rst index 1972fca43b..e91e483e3d 100644 --- a/sagemaker_neo_compilation_jobs/index.rst +++ b/sagemaker_neo_compilation_jobs/index.rst @@ -6,7 +6,6 @@ Get started with model compilation with Neo xgboost_customer_churn/xgboost_customer_churn_neo imageclassification_caltech/Image-classification-fulltraining-highlevel-neo - gluoncv_ssd_mobilenet/gluoncv_ssd_mobilenet_neo Apache MXNet diff --git a/sagemaker_processing/fairness_and_explainability/fairness_and_explainability.ipynb b/sagemaker_processing/fairness_and_explainability/fairness_and_explainability.ipynb index ac3a7de2b5..7f5c0368a9 100644 --- a/sagemaker_processing/fairness_and_explainability/fairness_and_explainability.ipynb +++ b/sagemaker_processing/fairness_and_explainability/fairness_and_explainability.ipynb @@ -2,14 +2,13 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, "source": [ "# Fairness and Explainability with SageMaker Clarify" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "1. [Overview](#Overview)\n", "1. [Prerequisites and Data](#Prerequisites-and-Data)\n", @@ -45,21 +44,20 @@ "1. 
Accessing the reports through SageMaker Studio if you have an instance set up.\n", "\n", "In doing so, the notebook will first train a [SageMaker XGBoost](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) model using training dataset, then use SageMaker Clarify to analyze a testing dataset in CSV format. SageMaker Clarify also supports analyzing dataset in [SageMaker JSONLines dense format](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html#common-in-formats), which is illustrated in [another notebook](https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_jsonlines_format.ipynb)." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## Prerequisites and Data\n", "### Initialize SageMaker" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from sagemaker import Session\n", "\n", @@ -76,11 +74,12 @@ "\n", "role = get_execution_role()\n", "s3_client = boto3.client(\"s3\")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### Download data\n", "Data Source: [https://archive.ics.uci.edu/ml/machine-learning-databases/adult/](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/)\n", @@ -88,13 +87,12 @@ "Let's __download__ the data and save it in the local folder with the name adult.data and adult.test from UCI repository$^{[2]}$.\n", "\n", "$^{[2]}$Dua Dheeru, and Efi Karra Taniskidou. \"[UCI Machine Learning Repository](http://archive.ics.uci.edu/ml)\". Irvine, CA: University of California, School of Information and Computer Science (2017)." 
- ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "adult_columns = [\n", " \"Age\",\n", @@ -128,11 +126,12 @@ " print(\"adult.test saved!\")\n", "else:\n", " print(\"adult.test already on disk.\")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### Loading the data: Adult Dataset\n", "From the UCI repository of machine learning datasets, this database contains 14 features concerning demographic characteristics of 45,222 rows (32,561 for training and 12,661 for testing). The task is to predict whether a person has a yearly income that is more or less than $50,000.\n", @@ -156,15 +155,12 @@ "\n", "Next, we specify our binary prediction task: \n", "15. **Target**: <=50,000, >$50,000." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], "source": [ "training_data = pd.read_csv(\n", " \"adult.data\", names=adult_columns, sep=r\"\\s*,\\s*\", engine=\"python\", na_values=\"?\"\n", @@ -175,54 +171,56 @@ ").dropna()\n", "\n", "training_data.head()" - ] + ], + "outputs": [], + "metadata": { + "scrolled": true + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### Data inspection\n", "Plotting histograms for the distribution of the different features is a good way to visualize the data. Let's plot a few of the features that can be considered _sensitive_. \n", "Let's take a look specifically at the Sex feature of a census respondent. In the first plot we see that there are fewer Female respondents as a whole but especially in the positive outcomes, where they form ~$\\frac{1}{7}$th of respondents." 
- ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], "source": [ "training_data[\"Sex\"].value_counts().sort_values().plot(kind=\"bar\", title=\"Counts of Sex\", rot=0)" - ] + ], + "outputs": [], + "metadata": { + "scrolled": true + } }, { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], "source": [ "training_data[\"Sex\"].where(training_data[\"Target\"] == \">50K\").value_counts().sort_values().plot(\n", " kind=\"bar\", title=\"Counts of Sex earning >$50K\", rot=0\n", ")" - ] + ], + "outputs": [], + "metadata": { + "scrolled": true + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### Encode and Upload the Dataset\n", "Here we encode the training and test data. Encoding input data is not necessary for SageMaker Clarify, but is necessary for the model." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from sklearn import preprocessing\n", "\n", @@ -246,36 +244,36 @@ "test_features = testing_data.drop([\"Target\"], axis=1)\n", "test_target = testing_data[\"Target\"]\n", "test_features.to_csv(\"test_features.csv\", index=False, header=False)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "A quick note about our encoding: the \"Female\" Sex value has been encoded as 0 and \"Male\" as 1." 
- ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "training_data.head()" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Lastly, let's upload the data to S3" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from sagemaker.s3 import S3Uploader\n", "from sagemaker.inputs import TrainingInput\n", @@ -283,22 +281,22 @@ "train_uri = S3Uploader.upload(\"train_data.csv\", \"s3://{}/{}\".format(bucket, prefix))\n", "train_input = TrainingInput(train_uri, content_type=\"csv\")\n", "test_uri = S3Uploader.upload(\"test_features.csv\", \"s3://{}/{}\".format(bucket, prefix))" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### Train XGBoost Model\n", "#### Train Model\n", "Since our focus is on understanding how to use SageMaker Clarify, we keep it simple by using a standard XGBoost model." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from sagemaker.image_uris import retrieve\n", "from sagemaker.estimator import Estimator\n", @@ -324,64 +322,64 @@ ")\n", "\n", "xgb.fit({\"train\": train_input}, logs=False)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### Deploy Model\n", "Here we create the SageMaker model." 
- ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "model_name = \"DEMO-clarify-model\"\n", "model = xgb.create_model(name=model_name)\n", "container_def = model.prepare_container_def()\n", "session.create_model(model_name, role, container_def)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## Amazon SageMaker Clarify\n", "Now that you have your model set up. Let's say hello to SageMaker Clarify!" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from sagemaker import clarify\n", "\n", "clarify_processor = clarify.SageMakerClarifyProcessor(\n", " role=role, instance_count=1, instance_type=\"ml.m5.xlarge\", sagemaker_session=session\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### Detecting Bias\n", "SageMaker Clarify helps you detect possible pre- and post-training biases using a variety of metrics.\n", "#### Writing DataConfig and ModelConfig\n", "A `DataConfig` object communicates some basic information about data I/O to SageMaker Clarify. We specify where to find the input dataset, where to store the output, the target column (`label`), the header names, and the dataset type." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "bias_report_output_path = \"s3://{}/{}/clarify-bias\".format(bucket, prefix)\n", "bias_data_config = clarify.DataConfig(\n", @@ -391,22 +389,22 @@ " headers=training_data.columns.to_list(),\n", " dataset_type=\"text/csv\",\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "A `ModelConfig` object communicates information about your trained model. 
To avoid additional traffic to your production models, SageMaker Clarify sets up and tears down a dedicated endpoint when processing.\n", "* `instance_type` and `instance_count` specify your preferred instance type and instance count used to run your model on during SageMaker Clarify's processing. The testing dataset is small so a single standard instance is good enough to run this example. If your have a large complex dataset, you may want to use a better instance type to speed up, or add more instances to enable Spark parallelization.\n", "* `accept_type` denotes the endpoint response payload format, and `content_type` denotes the payload format of request to the endpoint." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "model_config = clarify.ModelConfig(\n", " model_name=model_name,\n", @@ -415,52 +413,53 @@ " accept_type=\"text/csv\",\n", " content_type=\"text/csv\",\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "A `ModelPredictedLabelConfig` provides information on the format of your predictions. XGBoost model outputs probabilities of samples, so SageMaker Clarify invokes the endpoint then uses `probability_threshold` to convert the probability to binary labels for bias analysis. Prediction above the threshold is interpreted as label value `1` and below or equal as label value `0`." 
- ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "predictions_config = clarify.ModelPredictedLabelConfig(probability_threshold=0.8)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### Writing BiasConfig\n", "SageMaker Clarify also needs information on what the sensitive columns (`facets`) are, what the sensitive features (`facet_values_or_threshold`) may be, and what the desirable outcomes are (`label_values_or_threshold`).\n", "SageMaker Clarify can handle both categorical and continuous data for `facet_values_or_threshold` and for `label_values_or_threshold`. In this case we are using categorical data.\n", "\n", "We specify this information in the `BiasConfig` API. Here that the positive outcome is earning >$50,000, Sex is a sensitive category, and Female respondents are the sensitive group. `group_name` is used to form subgroups for the measurement of Conditional Demographic Disparity in Labels (CDDL) and Conditional Demographic Disparity in Predicted Labels (CDDPL) with regards to Simpson’s paradox." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "bias_config = clarify.BiasConfig(\n", " label_values_or_threshold=[1], facet_name=\"Sex\", facet_values_or_threshold=[0], group_name=\"Age\"\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### Pre-training Bias\n", - "Bias can be present in your data before any model training occurs. Inspecting your data for bias before training begins can help detect any data collection gaps, inform your feature engineering, and hep you understand what societal biases the data may reflect.\n", + "Bias can be present in your data before any model training occurs. 
Inspecting your data for bias before training begins can help detect any data collection gaps, inform your feature engineering, and help you understand what societal biases the data may reflect.\n", "\n", "Computing pre-training bias metrics does not require a trained model.\n", "\n", @@ -471,13 +470,12 @@ "\n", "\n", "You can run these options separately with `run_pre_training_bias()` and `run_post_training_bias()` or at the same time with `run_bias()` as shown below." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "clarify_processor.run_bias(\n", " data_config=bias_data_config,\n", @@ -487,11 +485,12 @@ " pre_training_methods=\"all\",\n", " post_training_methods=\"all\",\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### Viewing the Bias Report\n", "In Studio, you can view the results under the experiments tab.\n", @@ -505,44 +504,43 @@ "You could also summarize the results in a handy table!\n", "\n", "\n" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "If you're not a Studio user yet, you can access the bias report in pdf, html and ipynb formats in the following S3 bucket:" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "bias_report_output_path" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### Explaining Predictions\n", "There are expanding business needs and legislative regulations that require explanations of _why_ a model made the decision it did. SageMaker Clarify uses SHAP to explain the contribution that each input feature makes to the final decision." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ - "Kernel SHAP algorithm requires a baseline (also known as background dataset). 
Baseline dataset type shall be the same as `dataset_type` of `DataConfig`, and baseline samples shall only include features. By definition, `baseline` should either be a S3 URI to the baseline dataset file, or an in-place list of samples. In this case we chose the latter, and put the first sample of the test dataset to the list." - ] + "Kernel SHAP algorithm requires a baseline (also known as background dataset). If not provided, a baseline is calculated automatically by SageMaker Clarify using K-means or K-prototypes in the input dataset. Baseline dataset type shall be the same as `dataset_type` of `DataConfig`, and baseline samples shall only include features. By definition, `baseline` should either be a S3 URI to the baseline dataset file, or an in-place list of samples. In this case we chose the latter, and put the first sample of the test dataset to the list." + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "shap_config = clarify.SHAPConfig(\n", " baseline=[test_features.iloc[0].values.tolist()],\n", @@ -559,24 +557,25 @@ " headers=training_data.columns.to_list(),\n", " dataset_type=\"text/csv\",\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "clarify_processor.run_explainability(\n", " data_config=explainability_data_config,\n", " model_config=model_config,\n", " explainability_config=shap_config,\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### Viewing the Explainability Report\n", "As with the bias report, you can view the explainability report in Studio under the experiments tab\n", @@ -587,32 +586,31 @@ "The Model Insights tab contains direct links to the report and model insights.\n", "\n", "If you're not a Studio user yet, as with the Bias Report, you can access this report at the following S3 bucket." 
- ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "explainability_output_path" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### Analysis of local explanations\n", "It is possible to visualize the the local explanations for single examples in your dataset. You can use the obtained results from running Kernel SHAP algorithm for global explanations.\n", "\n", "You can simply load the local explanations stored in your output path, and visualize the explanation (i.e., the impact that the single features have on the prediction of your model) for any single example." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "local_explanations_out = pd.read_csv(explainability_output_path + \"/explanations_shap/out.csv\")\n", "feature_names = [str.replace(c, \"_label0\", \"\") for c in local_explanations_out.columns.to_series()]\n", @@ -629,24 +627,26 @@ "local_explanations_out.iloc[selected_example].plot(\n", " kind=\"bar\", title=\"Local explanation for the example number \" + str(selected_example), rot=90\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### Clean Up\n", "Finally, don't forget to clean up the resources we set up and used for this demo!" 
- ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "session.delete_model(model_name)" - ] + ], + "outputs": [], + "metadata": {} } ], "metadata": { @@ -671,4 +671,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_byoc.ipynb b/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_byoc.ipynb index 0fb5e33273..2ab981a25c 100644 --- a/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_byoc.ipynb +++ b/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_byoc.ipynb @@ -672,7 +672,7 @@ " {\n", " \"Effect\": \"Allow\",\n", " \"Action\": \"iam:PassRole\",\n", - " \"Resource\": \"arn:$partition:iam::*:role/*\",\n", + " \"Resource\": \"arn:$partition:iam::$account_id:role/$execution_role\",\n", " \"Condition\": {\n", " \"StringLikeIfExists\": {\n", " \"iam:PassedToService\": \"codebuild.amazonaws.com\"\n", @@ -682,7 +682,9 @@ " ]\n", "}\"\"\"\n", ")\n", - "permissions_policy = template.substitute(partition=arn_partition)\n", + "permissions_policy = template.substitute(\n", + " partition=arn_partition, account_id=account_id, execution_role=role\n", + ")\n", "print(permissions_policy)" ] }, diff --git a/training/distributed_training/pytorch/data_parallel/rnnt/RNNT_notebook.ipynb b/training/distributed_training/pytorch/data_parallel/rnnt/RNNT_notebook.ipynb new file mode 100644 index 0000000000..7e41ecb468 --- /dev/null +++ b/training/distributed_training/pytorch/data_parallel/rnnt/RNNT_notebook.ipynb @@ -0,0 +1,372 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "55659189", + "metadata": {}, + "source": [ + "# Distributed data parallel RNN-T training with PyTorch and SageMaker distributed\n", + "\n", + "[Amazon SageMaker's distributed 
library](https://docs.aws.amazon.com/sagemaker/latest/dg/distributed-training.html) can be used to train deep learning models faster and cheaper. The [data parallel](https://docs.aws.amazon.com/sagemaker/latest/dg/data-parallel.html) feature in this library (`smdistributed.dataparallel`) is a distributed data parallel training framework for PyTorch, TensorFlow, and MXNet.\n", + "\n", + "This notebook demonstrates how to use `smdistributed.dataparallel` with PyTorch (version 1.8.1) on [Amazon SageMaker](https://aws.amazon.com/sagemaker/) to train an RNN-T model on [LibriSpeech](http://www.openslr.org/12) (License: [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)) using [Amazon FSx for Lustre file-system](https://aws.amazon.com/fsx/lustre/) as the data source.\n", + "\n", + "The outline of steps is as follows:\n", + "\n", + "1. Stage the LibriSpeech dataset in [Amazon S3](https://aws.amazon.com/s3/)\n", + "2. Create Amazon FSx Lustre file-system and import data into the file-system from S3\n", + "3. Build Docker training image and push it to [Amazon ECR](https://aws.amazon.com/ecr/)\n", + "4. Configure data input channels for SageMaker\n", + "5. Configure hyper-parameters\n", + "6. Define training metrics\n", + "7. Define training job, set distribution strategy to SMDataParallel and start training\n", + "\n", + "**NOTE:** With a large training dataset, we recommend using [Amazon FSx](https://aws.amazon.com/fsx/) as the input file system for the SageMaker training job. FSx file input to SageMaker significantly cuts down training start up time on SageMaker because it avoids downloading the training data each time you start the training job (as done with S3 input for SageMaker training job) and provides good data read throughput.\n", + "\n", + "\n", + "**NOTE:** This example requires SageMaker Python SDK v2.X."
+ ] + }, + { + "cell_type": "markdown", + "id": "1901d71a", + "metadata": {}, + "source": [ + "## Amazon SageMaker Initialization\n", + "\n", + "Initialize the notebook instance. Get the AWS Region and a SageMaker execution role.\n", + "\n", + "### SageMaker role\n", + "\n", + "The following code cell defines `role` which is the IAM role ARN used to create and run SageMaker training and hosting jobs. This is the same IAM role used to create this SageMaker Notebook instance. \n", + "\n", + "`role` must have permission to create a SageMaker training job and host a model. For granular policies you can use to grant these permissions, see [Amazon SageMaker Roles](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html). If you do not require fine-tuned permissions for this demo, you can use the IAM managed policy AmazonSageMakerFullAccess to complete this demo. \n", + "\n", + "As described above, since we will be using FSx, please make sure to attach `FSx Access` permission to this IAM role." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6fb79f1", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "! 
python3 -m pip install --upgrade sagemaker\n", + "import sagemaker\n", + "from sagemaker import get_execution_role\n", + "from sagemaker.estimator import Estimator\n", + "import boto3\n", + "\n", + "sagemaker_session = sagemaker.Session()\n", + "bucket = sagemaker_session.default_bucket()\n", + "\n", + "role = (\n", + " get_execution_role()\n", + ") # provide a pre-existing role ARN as an alternative to creating a new role\n", + "role_name = role.split([\"/\"][-1])\n", + "print(f\"SageMaker Execution Role:{role}\")\n", + "print(f\"The name of the Execution role: {role_name[-1]}\")\n", + "\n", + "client = boto3.client(\"sts\")\n", + "account = client.get_caller_identity()[\"Account\"]\n", + "print(f\"AWS account:{account}\")\n", + "\n", + "session = boto3.session.Session()\n", + "region = session.region_name\n", + "print(f\"AWS region:{region}\")" + ] + }, + { + "cell_type": "markdown", + "id": "099b092e", + "metadata": {}, + "source": [ + "To verify that the role above has required permissions:\n", + "\n", + "1. Go to the IAM console: https://console.aws.amazon.com/iam/home.\n", + "2. Select **Roles**.\n", + "3. Enter the role name in the search box to search for that role. \n", + "4. Select the role.\n", + "5. Use the **Permissions** tab to verify this role has required permissions attached." + ] + }, + { + "cell_type": "markdown", + "id": "ede42360", + "metadata": {}, + "source": [ + "## Prepare SageMaker Training Images\n", + "\n", + "1. SageMaker by default uses the latest [Amazon Deep Learning Container Images (DLC)](https://github.com/aws/deep-learning-containers/blob/master/available_images.md) PyTorch training image. In this step, we use it as a base image and install additional dependencies required for training the RNN-T model.\n", + "2. 
In the Github repository https://github.com/HerringForks/SMDDP-Examples/tree/main/pytorch/rnnt we have forked an RNN-T example from [mlcommons/\n", + "training_results_v1.0](https://github.com/mlcommons/training_results_v1.0/tree/master/NVIDIA/benchmarks/rnnt/implementations/pytorch) and adapted the training script to work with `smdistributed.dataparallel`. We will use the `Dockerfile` provided there." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bfbccfc", + "metadata": {}, + "outputs": [], + "source": [ + "# clone the repo and build the docker image\n", + "! pwd && rm -rf SMDDP-Examples && \\\n", + " aws ecr get-login-password --region {region} | docker login \\\n", + " --username AWS --password-stdin 763104351884.dkr.ecr.{region}.amazonaws.com && \\\n", + " git clone https://github.com/HerringForks/SMDDP-Examples.git && \\\n", + " cd SMDDP-Examples/pytorch/rnnt && \\\n", + " bash scripts/docker/build.sh\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dda17648", + "metadata": {}, + "outputs": [], + "source": [ + "# name the image\n", + "image = \"zhaoqi-dev\" # Example: mask-rcnn-smdataparallel-sagemaker\n", + "tag = \"rnnt_dlc_pt1.8.1_smddp\" # Example: pt1.8" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97bd6481", + "metadata": {}, + "outputs": [], + "source": [ + "# tag the image we just built and push it to ecr\n", + "%%time\n", + "! chmod +x tag_and_push.sh; bash tag_and_push.sh {region} {image} {tag}" + ] + }, + { + "cell_type": "markdown", + "id": "c04fca1f", + "metadata": {}, + "source": [ + "## Preparing FSx Input for SageMaker\n", + "\n", + "1. Download and prepare your training dataset on S3, please refer to the [instructions](https://github.com/mlcommons/training_results_v1.0/tree/master/NVIDIA/benchmarks/rnnt/implementations/pytorch#steps-to-download-data). \n", + "2. 
Follow the [steps](https://docs.aws.amazon.com/fsx/latest/LustreGuide/create-fs-linked-data-repo.html) to create an FSx file system linked to the S3 bucket that holds your training data. Make sure to add an endpoint to your VPC allowing S3 access.\n", + "3. Follow the [steps](https://aws.amazon.com/blogs/machine-learning/speed-up-training-on-amazon-sagemaker-using-amazon-efs-or-amazon-fsx-for-lustre-file-systems/) to configure your SageMaker training job to use FSx.\n", + "\n", + "### Important Caveats\n", + "\n", + "1. You need to use the same `subnet` and `vpc` and `security group` used with FSx when launching the SageMaker notebook instance. The same configurations will be used by your SageMaker training job.\n", + "2. Make sure you set [appropriate inbound/outbound rules](https://docs.aws.amazon.com/fsx/latest/LustreGuide/limit-access-security-groups.html) in the `security group`. Specifically, opening up these ports is necessary for SageMaker to access the FSx file system in the training job. \n", + "3. Make sure the `SageMaker IAM Role` used to launch this SageMaker training job has access to `AmazonFSx`." + ] + }, + { + "cell_type": "markdown", + "id": "8873826a", + "metadata": {}, + "source": [ + "## SageMaker PyTorch Estimator function options\n", + "\n", + "In the following code block, you can update the estimator function to use a different instance type, instance count, and distribution strategy. You're also passing in the training script you reviewed in the previous cell.\n", + "\n", + "**Instance types**\n", + "\n", + "SMDataParallel supports model training on SageMaker with the following instance types only. For best performance, it is recommended you use an instance type that supports Amazon Elastic Fabric Adapter (ml.p3dn.24xlarge and ml.p4d.24xlarge).\n", + "\n", + "1. ml.p3.16xlarge\n", + "1. ml.p3dn.24xlarge [Recommended]\n", + "1. 
ml.p4d.24xlarge [Recommended]\n", + "\n", + "**Instance count**\n", + "\n", + "To get the best performance and the most out of SMDataParallel, you should use at least 2 instances, but you can also use 1 for testing this example.\n", + "\n", + "**Distribution strategy**\n", + "\n", + "Note that to use DDP mode, you need to update the `distribution` strategy, and set it to use `smdistributed dataparallel`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e948387d", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from sagemaker.pytorch import PyTorch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3dcf432f", + "metadata": {}, + "outputs": [], + "source": [ + "instance_type = \"ml.p4d.24xlarge\" # \"ml.p3dn.24xlarge\" # Other supported instance types: ml.p3.16xlarge, ml.p3dn.24xlarge\n", + "instance_count = 2 # You can use 2, 4, 8 etc.\n", + "docker_image = f\"{account}.dkr.ecr.{region}.amazonaws.com/{image}:{tag}\" # YOUR_ECR_IMAGE_BUILT_WITH_ABOVE_DOCKER_FILE\n", + "username = \"AWS\"\n", + "subnets = [\"\"] # Should be same as Subnet used for FSx. Example: subnet-0f9XXXX\n", + "security_group_ids = [\n", + " \"\"\n", + "] # Should be same as Security group used for FSx. Example: sg-03ZZZZZZ\n", + "job_name = \"pytorch-smdataparallel-rnnt\" # This job name is used as prefix to the sagemaker training job. Makes it easy for you to look for your training job in the SageMaker Training job console.\n", + "file_system_id = \"\" # FSx file system ID with your training dataset. 
Example: 'fs-0bYYYYYY'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4dcc47e", + "metadata": {}, + "outputs": [], + "source": [ + "# Configure the RNN-T hyper-parameters\n", + "hyperparameters = {\n", + " \"batch_size\": 48,\n", + " \"beta1\": 0.9,\n", + " \"beta2\": 0.999,\n", + " \"max_duration\": 16.7,\n", + " \"val_batch_size\": 44,\n", + " \"target\": 0.058,\n", + " \"lr\": 0.007,\n", + " \"min_lr\": 1e-5,\n", + " \"lr_exp_gamma\": 0.939,\n", + " \"epochs\": 80,\n", + " \"warmup_epochs\": 6,\n", + " \"hold_epochs\": 33,\n", + " \"epochs_this_job\": 0,\n", + " \"ema\": 0.995,\n", + " \"model_config\": \"/workspace/rnnt/configs/baseline_v3-1023sp.yaml\",\n", + " \"seed\": 28400,\n", + " \"cudnn_benchmark\": \"\",\n", + " \"dali_device\": \"gpu\",\n", + " \"weight_decay\": 1e-3,\n", + " \"log_frequency\": 1, #!!\n", + " \"val_frequency\": 1,\n", + " \"grad_accumulation_steps\": 1,\n", + " \"prediction_frequency\": 100000000,\n", + " \"weights_init_scale\": 0.5,\n", + " \"save_at_the_end\": \"\",\n", + " \"max_symbol_per_sample\": 300,\n", + " \"apex_transducer_loss\": \"fp16\",\n", + " \"fuse_relu_dropout\": \"\",\n", + " \"multi_tensor_ema\": \"\",\n", + " \"batch_eval_mode\": \"cg_unroll_pipeline\",\n", + " \"apex_transducer_joint\": \"pack\",\n", + " \"buffer_pre_alloc\": \"\",\n", + " \"ema_update_type\": \"fp16\",\n", + " \"amp_level\": 2,\n", + " \"data_cpu_threads\": 8,\n", + " \"batch_split_factor\": 1,\n", + " \"min_seq_split_len\": 20,\n", + " \"vectorized_sa\": \"\",\n", + " \"multilayer_lstm\": \"\",\n", + " \"enable_prefetch\": \"\",\n", + " \"tokenized_transcript\": \"\",\n", + " \"vectorized_sampler\": \"\",\n", + " \"dist_sampler\": \"\",\n", + " \"apex_mlp\": \"\",\n", + " \"pre_sort_for_seq_split\": \"\",\n", + " \"jit_tensor_formation\": \"\",\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2e5811e", + "metadata": {}, + "outputs": [], + "source": [ + "estimator = PyTorch(\n", + " 
entry_point=\"entry_point.py\",\n", + " role=role,\n", + " image_uri=docker_image,\n", + " source_dir=\".\",\n", + " instance_count=instance_count,\n", + " instance_type=instance_type,\n", + " framework_version=\"1.8.1\",\n", + " py_version=\"py36\",\n", + " sagemaker_session=sagemaker_session,\n", + " hyperparameters=hyperparameters,\n", + " subnets=subnets,\n", + " security_group_ids=security_group_ids,\n", + " debugger_hook_config=False,\n", + " # Training using SMDataParallel Distributed Training Framework\n", + " distribution={\"smdistributed\": {\"dataparallel\": {\"enabled\": True}}},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "712890a9", + "metadata": {}, + "outputs": [], + "source": [ + "# Configure FSx Input for your SageMaker Training job\n", + "\n", + "from sagemaker.inputs import FileSystemInput\n", + "\n", + "file_system_directory_path = (\n", + " \"/\" # NOTE: '/fsx/' will be the root mount path. Example: '/fsx/mask_rcnn/PyTorch'\n", + ")\n", + "file_system_access_mode = \"ro\"\n", + "file_system_type = \"FSxLustre\"\n", + "train_fs = FileSystemInput(\n", + " file_system_id=file_system_id,\n", + " file_system_type=file_system_type,\n", + " directory_path=file_system_directory_path,\n", + " file_system_access_mode=file_system_access_mode,\n", + ")\n", + "data_channels = {\"train\": train_fs}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82c4ef3e", + "metadata": {}, + "outputs": [], + "source": [ + "# Submit SageMaker training job\n", + "estimator.fit(inputs=data_channels, job_name=job_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8fa5b62", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/training/distributed_training/pytorch/data_parallel/rnnt/entry_point.py b/training/distributed_training/pytorch/data_parallel/rnnt/entry_point.py new file mode 100644 index 0000000000..1c378c93d9 --- /dev/null +++ b/training/distributed_training/pytorch/data_parallel/rnnt/entry_point.py @@ -0,0 +1,31 @@ +# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file +# except in compliance with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" +# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for +# the specific language governing permissions and limitations under the License. 
+ +import subprocess +import sys +import os + +exe = 'python' + +trainer = '/workspace/rnnt/train.py' + +cmd_list = [exe] + [trainer] + sys.argv[1:] +cmd = ' '.join(cmd_list) + +cmd += ' ' +cmd += '--dataset_dir ' + os.environ['SM_CHANNEL_TRAIN'] + '/datasets/LibriSpeech/ ' +cmd += '--output_dir ' + os.environ['SM_OUTPUT_DIR'] + ' ' +cmd += '--val_manifests ' + os.environ['SM_CHANNEL_TRAIN'] + '/tokenized/librispeech-dev-clean-wav-tokenized.pkl ' +cmd += '--train_manifests ' + os.environ['SM_CHANNEL_TRAIN'] + '/tokenized/librispeech-train-clean-100-wav-tokenized.pkl ' + os.environ['SM_CHANNEL_TRAIN'] + '/tokenized/librispeech-train-clean-360-wav-tokenized.pkl ' + os.environ['SM_CHANNEL_TRAIN'] + '/tokenized/librispeech-train-other-500-wav-tokenized.pkl ' + +print('Final command is: ', cmd) + +subprocess.run(cmd, shell=True) \ No newline at end of file diff --git a/training/distributed_training/pytorch/data_parallel/rnnt/tag_and_push.sh b/training/distributed_training/pytorch/data_parallel/rnnt/tag_and_push.sh new file mode 100755 index 0000000000..cad072d883 --- /dev/null +++ b/training/distributed_training/pytorch/data_parallel/rnnt/tag_and_push.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# This script shows how to tag the Docker image and push it to ECR to be ready for use +# by SageMaker. +# The argument to this script is the image name. This will be used as the image on the local +# machine and combined with the account and region to form the repository name for ECR. +# set region + +if [ "$#" -eq 3 ]; then + region=$1 + image=$2 + tag=$3 +else + echo "usage: $0 $1 $2 " + exit 1 +fi + +# Get the account number associated with the current IAM credentials +account=$(aws sts get-caller-identity --query Account --output text) + +if [ $? -ne 0 ] +then + exit 255 +fi + +fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image}:${tag}" + +# If the repository doesn't exist in ECR, create it. 
+aws ecr describe-repositories --region ${region} --repository-names "${image}" > /dev/null 2>&1 +if [ $? -ne 0 ]; then + echo "creating ECR repository : ${fullname} " + aws ecr create-repository --region ${region} --repository-name "${image}" > /dev/null +fi + +$(aws ecr get-login --no-include-email --region ${region} --registry-ids 763104351884) +docker tag mlperf/rnn_speech_recognition_smddp:latest ${fullname} + +# Get the login command from ECR and execute it directly +$(aws ecr get-login --region ${region} --no-include-email) +docker push ${fullname} +if [ $? -eq 0 ]; +then + echo "Amazon ECR URI: ${fullname}" +else + echo "Error: Image build and push failed" + exit 1 +fi \ No newline at end of file diff --git a/training/distributed_training/tensorflow/data_parallel/mnist/tensorflow2_smdataparallel_mnist_demo.ipynb b/training/distributed_training/tensorflow/data_parallel/mnist/tensorflow2_smdataparallel_mnist_demo.ipynb index 019b0b0862..14f8e6bfb2 100644 --- a/training/distributed_training/tensorflow/data_parallel/mnist/tensorflow2_smdataparallel_mnist_demo.ipynb +++ b/training/distributed_training/tensorflow/data_parallel/mnist/tensorflow2_smdataparallel_mnist_demo.ipynb @@ -24,54 +24,9 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing /home/ubuntu/.cache/pip/wheels/36/73/72/147e239a958fa69f277e3077dc08b53cbf466cf443463147bd/sagemaker-2.42.1-py2.py3-none-any.whl\n", - "Requirement already satisfied, skipping upgrade: protobuf3-to-dict>=0.1.5 in /home/ubuntu/.local/lib/python3.6/site-packages (from sagemaker) (0.1.5)\n", - "Requirement already satisfied, skipping upgrade: packaging>=20.0 in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from sagemaker) (20.1)\n", - "Requirement already satisfied, skipping upgrade: attrs in 
/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from sagemaker) (19.3.0)\n", - "Requirement already satisfied, skipping upgrade: protobuf>=3.1 in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from sagemaker) (3.8.0)\n", - "Requirement already satisfied, skipping upgrade: boto3>=1.16.32 in /home/ubuntu/.local/lib/python3.6/site-packages (from sagemaker) (1.16.36)\n", - "Requirement already satisfied, skipping upgrade: importlib-metadata>=1.4.0 in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from sagemaker) (1.5.0)\n", - "Requirement already satisfied, skipping upgrade: smdebug-rulesconfig==1.0.1 in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from sagemaker) (1.0.1)\n", - "Requirement already satisfied, skipping upgrade: google-pasta in /home/ubuntu/.local/lib/python3.6/site-packages (from sagemaker) (0.2.0)\n", - "Requirement already satisfied, skipping upgrade: numpy>=1.9.0 in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from sagemaker) (1.18.1)\n", - "Requirement already satisfied, skipping upgrade: pathos in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from sagemaker) (0.2.7)\n", - "Requirement already satisfied, skipping upgrade: pandas in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from sagemaker) (1.0.1)\n", - "Requirement already satisfied, skipping upgrade: six in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from protobuf3-to-dict>=0.1.5->sagemaker) (1.14.0)\n", - "Requirement already satisfied, skipping upgrade: pyparsing>=2.0.2 in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from packaging>=20.0->sagemaker) (2.4.6)\n", - "Requirement already satisfied, skipping upgrade: setuptools in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from protobuf>=3.1->sagemaker) (45.2.0.post20200210)\n", - 
"Requirement already satisfied, skipping upgrade: botocore<1.20.0,>=1.19.36 in /home/ubuntu/.local/lib/python3.6/site-packages (from boto3>=1.16.32->sagemaker) (1.19.36)\n", - "Requirement already satisfied, skipping upgrade: s3transfer<0.4.0,>=0.3.0 in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from boto3>=1.16.32->sagemaker) (0.3.3)\n", - "Requirement already satisfied, skipping upgrade: jmespath<1.0.0,>=0.7.1 in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from boto3>=1.16.32->sagemaker) (0.10.0)\n", - "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from importlib-metadata>=1.4.0->sagemaker) (2.2.0)\n", - "Requirement already satisfied, skipping upgrade: dill>=0.3.3 in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from pathos->sagemaker) (0.3.3)\n", - "Requirement already satisfied, skipping upgrade: pox>=0.2.9 in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from pathos->sagemaker) (0.2.9)\n", - "Requirement already satisfied, skipping upgrade: multiprocess>=0.70.11 in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from pathos->sagemaker) (0.70.11.1)\n", - "Requirement already satisfied, skipping upgrade: ppft>=1.6.6.3 in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from pathos->sagemaker) (1.6.6.3)\n", - "Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from pandas->sagemaker) (2.8.1)\n", - "Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from pandas->sagemaker) (2019.3)\n", - "Requirement already satisfied, skipping upgrade: urllib3<1.27,>=1.25.4; python_version != \"3.4\" in 
/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (from botocore<1.20.0,>=1.19.36->boto3>=1.16.32->sagemaker) (1.25.10)\n", - "Installing collected packages: sagemaker\n", - " Attempting uninstall: sagemaker\n", - " Found existing installation: sagemaker 2.42.0\n", - " Uninstalling sagemaker-2.42.0:\n", - " Successfully uninstalled sagemaker-2.42.0\n", - "Successfully installed sagemaker-2.42.1\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "pip install sagemaker --upgrade" ] @@ -89,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -163,7 +118,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -188,1063 +143,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } + "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2021-05-28 23:54:33 Starting - Starting the training job...\n", - "2021-05-28 23:54:56 Starting - Launching requested ML instancesProfilerReport-1622246073: InProgress\n", - ".........\n", - "2021-05-28 23:56:33 Starting - Preparing the instances for training.........\n", - "2021-05-28 23:57:57 Downloading - Downloading input data...\n", - "2021-05-28 23:58:17 Training - Downloading the training image...............\n", - "2021-05-29 00:00:57 Training - Training image download completed. Training in progress.\u001b[35m2021-05-29 00:00:54.926771: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[35m2021-05-29 00:00:54.932755: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. 
The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[35m2021-05-29 00:00:55.020912: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\u001b[0m\n", - "\u001b[35m2021-05-29 00:00:55.114217: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[35m2021-05-29 00:00:58,582 sagemaker-training-toolkit INFO Imported framework sagemaker_tensorflow_container.training\u001b[0m\n", - "\u001b[35m2021-05-29 00:00:59,222 sagemaker-training-toolkit INFO Starting MPI run as worker node.\u001b[0m\n", - "\u001b[35m2021-05-29 00:00:59,222 sagemaker-training-toolkit INFO Waiting for MPI Master to create SSH daemon.\u001b[0m\n", - "\u001b[35m2021-05-29 00:00:59,223 sagemaker-training-toolkit INFO Cannot connect to host algo-1\u001b[0m\n", - "\u001b[35m2021-05-29 00:00:59,223 sagemaker-training-toolkit INFO Connection failed with exception: \n", - " [Errno None] Unable to connect to port 22 on 10.0.130.141\u001b[0m\n", - "\u001b[35m2021-05-29 00:01:00,233 paramiko.transport INFO Connected (version 2.0, client OpenSSH_7.6p1)\u001b[0m\n", - "\u001b[35m2021-05-29 00:01:00,308 paramiko.transport INFO Authentication (publickey) successful!\u001b[0m\n", - "\u001b[35m2021-05-29 00:01:00,309 sagemaker-training-toolkit INFO Can connect to host algo-1\u001b[0m\n", - "\u001b[35m2021-05-29 00:01:00,309 sagemaker-training-toolkit INFO MPI Master online, creating SSH daemon.\u001b[0m\n", - "\u001b[35m2021-05-29 00:01:00,309 sagemaker-training-toolkit INFO Writing environment variables to /etc/environment for the MPI process.\u001b[0m\n", - "\u001b[35m2021-05-29 00:01:00,319 sagemaker-training-toolkit INFO Waiting for MPI process to finish.\u001b[0m\n", - "\u001b[35m2021-05-29 00:01:02,331 sagemaker-training-toolkit INFO Process[es]: [psutil.Process(pid=172, name='orted', status='sleeping', 
started='00:01:01')]\u001b[0m\n", - "\u001b[35m2021-05-29 00:01:02,331 sagemaker-training-toolkit INFO Orted process found [psutil.Process(pid=172, name='orted', status='sleeping', started='00:01:01')]\u001b[0m\n", - "\u001b[35m2021-05-29 00:01:02,331 sagemaker-training-toolkit INFO Waiting for orted process [psutil.Process(pid=172, name='orted', status='sleeping', started='00:01:01')]\u001b[0m\n", - "\u001b[34m2021-05-29 00:00:55.031306: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m2021-05-29 00:00:55.037221: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m2021-05-29 00:00:55.134243: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\u001b[0m\n", - "\u001b[34m2021-05-29 00:00:55.237754: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m2021-05-29 00:00:58,838 sagemaker-training-toolkit INFO Imported framework sagemaker_tensorflow_container.training\u001b[0m\n", - "\u001b[34m2021-05-29 00:00:59,672 sagemaker-training-toolkit INFO Starting MPI run as worker node.\u001b[0m\n", - "\u001b[34m2021-05-29 00:00:59,672 sagemaker-training-toolkit INFO Creating SSH daemon.\u001b[0m\n", - "\u001b[34m2021-05-29 00:00:59,680 sagemaker-training-toolkit INFO Waiting for MPI workers to establish their SSH connections\u001b[0m\n", - "\u001b[34m2021-05-29 00:00:59,681 sagemaker-training-toolkit INFO Cannot connect to host algo-2 at port 22. 
Retrying...\u001b[0m\n", - "\u001b[34m2021-05-29 00:00:59,681 sagemaker-training-toolkit INFO Connection closed\u001b[0m\n", - "\u001b[34m2021-05-29 00:01:00,690 paramiko.transport INFO Connected (version 2.0, client OpenSSH_7.6p1)\u001b[0m\n", - "\u001b[34m2021-05-29 00:01:00,761 paramiko.transport INFO Authentication (publickey) successful!\u001b[0m\n", - "\u001b[34m2021-05-29 00:01:00,761 sagemaker-training-toolkit INFO Can connect to host algo-2 at port 22\u001b[0m\n", - "\u001b[34m2021-05-29 00:01:00,761 sagemaker-training-toolkit INFO Connection closed\u001b[0m\n", - "\u001b[34m2021-05-29 00:01:00,761 sagemaker-training-toolkit INFO Worker algo-2 available for communication\u001b[0m\n", - "\u001b[34m2021-05-29 00:01:00,761 sagemaker-training-toolkit INFO Network interface name: eth0\u001b[0m\n", - "\u001b[34m2021-05-29 00:01:00,761 sagemaker-training-toolkit INFO Host: ['algo-1', 'algo-2']\u001b[0m\n", - "\u001b[34m2021-05-29 00:01:00,763 sagemaker-training-toolkit INFO instance type: ml.p3.16xlarge\u001b[0m\n", - "\u001b[34m2021-05-29 00:01:00,851 sagemaker-training-toolkit INFO Invoking user script\n", - "\u001b[0m\n", - "\u001b[34mTraining Env:\n", - "\u001b[0m\n", - "\u001b[34m{\n", - " \"additional_framework_parameters\": {\n", - " \"sagemaker_distributed_dataparallel_enabled\": true,\n", - " \"sagemaker_distributed_dataparallel_custom_mpi_options\": \"\",\n", - " \"sagemaker_instance_type\": \"ml.p3.16xlarge\"\n", - " },\n", - " \"channel_input_dirs\": {},\n", - " \"current_host\": \"algo-1\",\n", - " \"framework_module\": \"sagemaker_tensorflow_container.training:main\",\n", - " \"hosts\": [\n", - " \"algo-1\",\n", - " \"algo-2\"\n", - " ],\n", - " \"hyperparameters\": {\n", - " \"model_dir\": \"s3://sagemaker-us-west-2-688520471316/tensorflow2-smdataparallel-mnist-2021-05-28-23-54-33-230/model\"\n", - " },\n", - " \"input_config_dir\": \"/opt/ml/input/config\",\n", - " \"input_data_config\": {},\n", - " \"input_dir\": \"/opt/ml/input\",\n", - " 
\"is_master\": true,\n", - " \"job_name\": \"tensorflow2-smdataparallel-mnist-2021-05-28-23-54-33-230\",\n", - " \"log_level\": 20,\n", - " \"master_hostname\": \"algo-1\",\n", - " \"model_dir\": \"/opt/ml/model\",\n", - " \"module_dir\": \"s3://sagemaker-us-west-2-688520471316/tensorflow2-smdataparallel-mnist-2021-05-28-23-54-33-230/source/sourcedir.tar.gz\",\n", - " \"module_name\": \"train_tensorflow_smdataparallel_mnist\",\n", - " \"network_interface_name\": \"eth0\",\n", - " \"num_cpus\": 64,\n", - " \"num_gpus\": 8,\n", - " \"output_data_dir\": \"/opt/ml/output/data\",\n", - " \"output_dir\": \"/opt/ml/output\",\n", - " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", - " \"resource_config\": {\n", - " \"current_host\": \"algo-1\",\n", - " \"hosts\": [\n", - " \"algo-1\",\n", - " \"algo-2\"\n", - " ],\n", - " \"network_interface_name\": \"eth0\"\n", - " },\n", - " \"user_entry_point\": \"train_tensorflow_smdataparallel_mnist.py\"\u001b[0m\n", - "\u001b[34m}\n", - "\u001b[0m\n", - "\u001b[34mEnvironment variables:\n", - "\u001b[0m\n", - "\u001b[34mSM_HOSTS=[\"algo-1\",\"algo-2\"]\u001b[0m\n", - "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", - "\u001b[34mSM_HPS={\"model_dir\":\"s3://sagemaker-us-west-2-688520471316/tensorflow2-smdataparallel-mnist-2021-05-28-23-54-33-230/model\"}\u001b[0m\n", - "\u001b[34mSM_USER_ENTRY_POINT=train_tensorflow_smdataparallel_mnist.py\u001b[0m\n", - "\u001b[34mSM_FRAMEWORK_PARAMS={\"sagemaker_distributed_dataparallel_custom_mpi_options\":\"\",\"sagemaker_distributed_dataparallel_enabled\":true,\"sagemaker_instance_type\":\"ml.p3.16xlarge\"}\u001b[0m\n", - "\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\",\"algo-2\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", - "\u001b[34mSM_INPUT_DATA_CONFIG={}\u001b[0m\n", - "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", - "\u001b[34mSM_CHANNELS=[]\u001b[0m\n", - "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", - 
"\u001b[34mSM_MODULE_NAME=train_tensorflow_smdataparallel_mnist\u001b[0m\n", - "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", - "\u001b[34mSM_FRAMEWORK_MODULE=sagemaker_tensorflow_container.training:main\u001b[0m\n", - "\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", - "\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", - "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", - "\u001b[34mSM_NUM_CPUS=64\u001b[0m\n", - "\u001b[34mSM_NUM_GPUS=8\u001b[0m\n", - "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", - "\u001b[34mSM_MODULE_DIR=s3://sagemaker-us-west-2-688520471316/tensorflow2-smdataparallel-mnist-2021-05-28-23-54-33-230/source/sourcedir.tar.gz\u001b[0m\n", - "\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{\"sagemaker_distributed_dataparallel_custom_mpi_options\":\"\",\"sagemaker_distributed_dataparallel_enabled\":true,\"sagemaker_instance_type\":\"ml.p3.16xlarge\"},\"channel_input_dirs\":{},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_tensorflow_container.training:main\",\"hosts\":[\"algo-1\",\"algo-2\"],\"hyperparameters\":{\"model_dir\":\"s3://sagemaker-us-west-2-688520471316/tensorflow2-smdataparallel-mnist-2021-05-28-23-54-33-230/model\"},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"tensorflow2-smdataparallel-mnist-2021-05-28-23-54-33-230\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-us-west-2-688520471316/tensorflow2-smdataparallel-mnist-2021-05-28-23-54-33-230/source/sourcedir.tar.gz\",\"module_name\":\"train_tensorflow_smdataparallel_mnist\",\"network_interface_name\":\"eth0\",\"num_cpus\":64,\"num_gpus\":8,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\",\"algo-2\"],\"network_interface_name\":\"eth0
\"},\"user_entry_point\":\"train_tensorflow_smdataparallel_mnist.py\"}\u001b[0m\n", - "\u001b[34mSM_USER_ARGS=[\"--model_dir\",\"s3://sagemaker-us-west-2-688520471316/tensorflow2-smdataparallel-mnist-2021-05-28-23-54-33-230/model\"]\u001b[0m\n", - "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", - "\u001b[34mSM_HP_MODEL_DIR=s3://sagemaker-us-west-2-688520471316/tensorflow2-smdataparallel-mnist-2021-05-28-23-54-33-230/model\u001b[0m\n", - "\u001b[34mPYTHONPATH=/opt/ml/code:/usr/local/bin:/usr/local/lib/python37.zip:/usr/local/lib/python3.7:/usr/local/lib/python3.7/lib-dynload:/usr/local/lib/python3.7/site-packages\n", - "\u001b[0m\n", - "\u001b[34mInvoking script with the following command:\n", - "\u001b[0m\n", - "\u001b[34mmpirun --host algo-1:8,algo-2:8 -np 16 --allow-run-as-root --tag-output --oversubscribe -mca btl_tcp_if_include eth0 -mca oob_tcp_if_include eth0 -mca plm_rsh_no_tree_spawn 1 -mca pml ob1 -mca btl ^openib -mca orte_abort_on_non_zero_status 1 -mca btl_vader_single_copy_mechanism none -mca plm_rsh_num_concurrent 2 -x NCCL_SOCKET_IFNAME=eth0 -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH -x SMDATAPARALLEL_USE_HOMOGENEOUS=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1 -x LD_PRELOAD=/usr/local/lib/python3.7/site-packages/gethostname.cpython-37m-x86_64-linux-gnu.so -x SMDATAPARALLEL_SERVER_ADDR=algo-1 -x SMDATAPARALLEL_SERVER_PORT=7592 -x SAGEMAKER_INSTANCE_TYPE=ml.p3.16xlarge smddprun /usr/local/bin/python3.7 -m mpi4py train_tensorflow_smdataparallel_mnist.py --model_dir s3://sagemaker-us-west-2-688520471316/tensorflow2-smdataparallel-mnist-2021-05-28-23-54-33-230/model\n", - "\n", - "\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Bootstrap : Using [0]eth0:10.0.175.185<0>\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Bootstrap : Using [0]eth0:10.0.130.141<0>\u001b[0m\n", - "\u001b[34m[1,8]:\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] find_ofi_provider:542 NCCL WARN NET/OFI Couldn't find any 
optimal provider\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO NET/IB : No device found.\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO NET/Socket : Using [0]eth0:10.0.175.185<0>\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Using network Socket\u001b[0m\n", - "\u001b[34m[1,8]:NCCL version 2.7.8+cuda11.0\u001b[0m\n", - "\u001b[34m[1,0]:\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] find_ofi_provider:542 NCCL WARN NET/OFI Couldn't find any optimal provider\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO NET/IB : No device found.\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO NET/Socket : Using [0]eth0:10.0.130.141<0>\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Using network Socket\u001b[0m\n", - "\u001b[34m[1,0]:NCCL version 2.7.8+cuda11.0\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Bootstrap : Using [0]eth0:10.0.175.185<0>\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Bootstrap : Using [0]eth0:10.0.175.185<0>\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Bootstrap : Using [0]eth0:10.0.175.185<0>\u001b[0m\n", - "\u001b[34m[1,10]:\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] find_ofi_provider:542 NCCL WARN NET/OFI Couldn't find any optimal provider\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO NET/IB : No device found.\u001b[0m\n", - "\u001b[34m[1,11]:\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] find_ofi_provider:542 NCCL WARN NET/OFI Couldn't find any optimal provider\u001b[0m\n", - "\u001b[34m[1,9]:\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] find_ofi_provider:542 NCCL WARN NET/OFI Couldn't find any optimal provider\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO NET/Socket : Using [0]eth0:10.0.175.185<0>\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Using network Socket\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO NET/IB : No device 
found.\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO NET/IB : No device found.\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO NET/Socket : Using [0]eth0:10.0.175.185<0>\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Using network Socket\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO NET/Socket : Using [0]eth0:10.0.175.185<0>\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Using network Socket\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Bootstrap : Using [0]eth0:10.0.130.141<0>\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Bootstrap : Using [0]eth0:10.0.130.141<0>\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Bootstrap : Using [0]eth0:10.0.130.141<0>\u001b[0m\n", - "\u001b[34m[1,2]:\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] find_ofi_provider:542 NCCL WARN NET/OFI Couldn't find any optimal provider\u001b[0m\n", - "\u001b[34m[1,3]:\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] find_ofi_provider:542 NCCL WARN NET/OFI Couldn't find any optimal provider\u001b[0m\n", - "\u001b[34m[1,1]:\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] find_ofi_provider:542 NCCL WARN NET/OFI Couldn't find any optimal provider\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO NET/IB : No device found.\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO NET/IB : No device found.\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO NET/IB : No device found.\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO NET/Socket : Using [0]eth0:10.0.130.141<0>\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Using network Socket\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO NET/Socket : Using [0]eth0:10.0.130.141<0>\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Using network Socket\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO NET/Socket : Using 
[0]eth0:10.0.130.141<0>\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Using network Socket\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Bootstrap : Using [0]eth0:10.0.130.141<0>\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Bootstrap : Using [0]eth0:10.0.130.141<0>\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Bootstrap : Using [0]eth0:10.0.130.141<0>\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Bootstrap : Using [0]eth0:10.0.130.141<0>\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Bootstrap : Using [0]eth0:10.0.175.185<0>\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Bootstrap : Using [0]eth0:10.0.175.185<0>\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Bootstrap : Using [0]eth0:10.0.175.185<0>\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Bootstrap : Using [0]eth0:10.0.175.185<0>\u001b[0m\n", - "\u001b[34m[1,4]:\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] find_ofi_provider:542 NCCL WARN NET/OFI Couldn't find any optimal provider\u001b[0m\n", - "\u001b[34m[1,5]:\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] find_ofi_provider:542 NCCL WARN NET/OFI Couldn't find any optimal provider\u001b[0m\n", - "\u001b[34m[1,6]:\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] find_ofi_provider:542 NCCL WARN NET/OFI Couldn't find any optimal provider\u001b[0m\n", - "\u001b[34m[1,7]:\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] find_ofi_provider:542 NCCL WARN NET/OFI Couldn't find any optimal provider\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO NET/IB : No device found.\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO NET/IB : No device found.\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO NET/IB : No device found.\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO NET/IB : No device found.\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO NET/Socket : Using 
[0]eth0:10.0.130.141<0>\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Using network Socket\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO NET/Socket : Using [0]eth0:10.0.130.141<0>\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Using network Socket\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO NET/Socket : Using [0]eth0:10.0.130.141<0>\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Using network Socket\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO NET/Socket : Using [0]eth0:10.0.130.141<0>\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Using network Socket\u001b[0m\n", - "\u001b[34m[1,12]:\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] find_ofi_provider:542 NCCL WARN NET/OFI Couldn't find any optimal provider\u001b[0m\n", - "\u001b[34m[1,13]:\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] find_ofi_provider:542 NCCL WARN NET/OFI Couldn't find any optimal provider\u001b[0m\n", - "\u001b[34m[1,14]:\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] find_ofi_provider:542 NCCL WARN NET/OFI Couldn't find any optimal provider\u001b[0m\n", - "\u001b[34m[1,15]:\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] find_ofi_provider:542 NCCL WARN NET/OFI Couldn't find any optimal provider\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO NET/IB : No device found.\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO NET/IB : No device found.\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO NET/IB : No device found.\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO NET/IB : No device found.\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO NET/Socket : Using [0]eth0:10.0.175.185<0>\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Using network Socket\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO NET/Socket : Using [0]eth0:10.0.175.185<0>\u001b[0m\n", - 
"\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Using network Socket\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO NET/Socket : Using [0]eth0:10.0.175.185<0>\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Using network Socket\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO NET/Socket : Using [0]eth0:10.0.175.185<0>\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Using network Socket\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Trees [0] 2/-1/-1->3->0|0->3->2/-1/-1 [1] 2/-1/-1->3->0|0->3->2/-1/-1 [2] -1/-1/-1->3->2|2->3->-1/-1/-1 [3] -1/-1/-1->3->2|2->3->-1/-1/-1 [4] 7/-1/-1->3->1|1->3->7/-1/-1 [5] 1/-1/-1->3->7|7->3->1/-1/-1 [6] 2/-1/-1->3->0|0->3->2/-1/-1 [7] 2/-1/-1->3->0|0->3->2/-1/-1 [8] -1/-1/-1->3->2|2->3->-1/-1/-1 [9] -1/-1/-1->3->2|2->3->-1/-1/-1 [10] 7/-1/-1->3->1|1->3->7/-1/-1 [11] 1/-1/-1->3->7|7->3->1/-1/-1\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Trees [0] -1/-1/-1->4->7|7->4->-1/-1/-1 [1] -1/-1/-1->4->7|7->4->-1/-1/-1 [2] 7/-1/-1->4->0|0->4->7/-1/-1 [3] 7/-1/-1->4->0|0->4->7/-1/-1 [4] 6/-1/-1->4->5|5->4->6/-1/-1 [5] 5/-1/-1->4->6|6->4->5/-1/-1 [6] -1/-1/-1->4->7|7->4->-1/-1/-1 [7] -1/-1/-1->4->7|7->4->-1/-1/-1 [8] 7/-1/-1->4->0|0->4->7/-1/-1 [9] 7/-1/-1->4->0|0->4->7/-1/-1 [10] 6/-1/-1->4->5|5->4->6/-1/-1 [11] 5/-1/-1->4->6|6->4->5/-1/-1\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Trees [0] 6/-1/-1->5->1|1->5->6/-1/-1 [1] 6/-1/-1->5->1|1->5->6/-1/-1 [2] 1/-1/-1->5->6|6->5->1/-1/-1 [3] 1/-1/-1->5->6|6->5->1/-1/-1 [4] 4/-1/-1->5->7|7->5->4/-1/-1 [5] 7/-1/-1->5->4|4->5->7/-1/-1 [6] 6/-1/-1->5->1|1->5->6/-1/-1 [7] 
6/-1/-1->5->1|1->5->6/-1/-1 [8] 1/-1/-1->5->6|6->5->1/-1/-1 [9] 1/-1/-1->5->6|6->5->1/-1/-1 [10] 4/-1/-1->5->7|7->5->4/-1/-1 [11] 7/-1/-1->5->4|4->5->7/-1/-1\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Trees [0] 7/-1/-1->6->5|5->6->7/-1/-1 [1] 7/-1/-1->6->5|5->6->7/-1/-1 [2] 5/-1/-1->6->7|7->6->5/-1/-1 [3] 5/-1/-1->6->7|7->6->5/-1/-1 [4] 2/-1/-1->6->4|4->6->2/-1/-1 [5] 4/-1/-1->6->2|2->6->4/-1/-1 [6] 7/-1/-1->6->5|5->6->7/-1/-1 [7] 7/-1/-1->6->5|5->6->7/-1/-1 [8] 5/-1/-1->6->7|7->6->5/-1/-1 [9] 5/-1/-1->6->7|7->6->5/-1/-1 [10] 2/-1/-1->6->4|4->6->2/-1/-1 [11] 4/-1/-1->6->2|2->6->4/-1/-1\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 00/12 : 0 3 2 1 5 6 7 4\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 01/12 : 0 3 2 1 5 6 7 4\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 02/12 : 0 4 7 6 5 1 2 3\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 03/12 : 0 4 7 6 5 1 2 3\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Trees [0] 4/-1/-1->7->6|6->7->4/-1/-1 [1] 4/-1/-1->7->6|6->7->4/-1/-1 [2] 6/-1/-1->7->4|4->7->6/-1/-1 [3] 6/-1/-1->7->4|4->7->6/-1/-1 [4] 5/-1/-1->7->3|3->7->5/-1/-1 [5] 3/-1/-1->7->5|5->7->3/-1/-1 [6] 4/-1/-1->7->6|6->7->4/-1/-1 [7] 4/-1/-1->7->6|6->7->4/-1/-1 [8] 6/-1/-1->7->4|4->7->6/-1/-1 [9] 6/-1/-1->7->4|4->7->6/-1/-1 [10] 5/-1/-1->7->3|3->7->5/-1/-1 [11] 3/-1/-1->7->5|5->7->3/-1/-1\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Trees [0] 5/-1/-1->1->2|2->1->5/-1/-1 [1] 5/-1/-1->1->2|2->1->5/-1/-1 [2] 2/-1/-1->1->5|5->1->2/-1/-1 [3] 2/-1/-1->1->5|5->1->2/-1/-1 [4] 3/-1/-1->1->0|0->1->3/-1/-1 [5] 
-1/-1/-1->1->3|3->1->-1/-1/-1 [6] 5/-1/-1->1->2|2->1->5/-1/-1 [7] 5/-1/-1->1->2|2->1->5/-1/-1 [8] 2/-1/-1->1->5|5->1->2/-1/-1 [9] 2/-1/-1->1->5|5->1->2/-1/-1 [10] 3/-1/-1->1->0|0->1->3/-1/-1 [11] -1/-1/-1->1->3|3->1->-1/-1/-1\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 04/12 : 0 1 3 7 5 4 6 2\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 05/12 : 0 2 6 4 5 7 3 1\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 06/12 : 0 3 2 1 5 6 7 4\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 07/12 : 0 3 2 1 5 6 7 4\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Trees [0] 1/-1/-1->2->3|3->2->1/-1/-1 [1] 1/-1/-1->2->3|3->2->1/-1/-1 [2] 3/-1/-1->2->1|1->2->3/-1/-1 [3] 3/-1/-1->2->1|1->2->3/-1/-1 [4] -1/-1/-1->2->6|6->2->-1/-1/-1 [5] 6/-1/-1->2->0|0->2->6/-1/-1 [6] 1/-1/-1->2->3|3->2->1/-1/-1 [7] 1/-1/-1->2->3|3->2->1/-1/-1 [8] 3/-1/-1->2->1|1->2->3/-1/-1 [9] 3/-1/-1->2->1|1->2->3/-1/-1 [10] -1/-1/-1->2->6|6->2->-1/-1/-1 [11] 6/-1/-1->2->0|0->2->6/-1/-1\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 08/12 : 0 4 7 6 5 1 2 3\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 09/12 : 0 4 7 6 5 1 2 3\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 10/12 : 0 1 3 7 5 4 6 2\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 11/12 : 0 2 6 4 5 7 3 1\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Trees [0] 3/-1/-1->0->-1|-1->0->3/-1/-1 [1] 3/-1/-1->0->-1|-1->0->3/-1/-1 [2] 4/-1/-1->0->-1|-1->0->4/-1/-1 [3] 4/-1/-1->0->-1|-1->0->4/-1/-1 [4] 1/-1/-1->0->-1|-1->0->1/-1/-1 [5] 2/-1/-1->0->-1|-1->0->2/-1/-1 [6] 3/-1/-1->0->-1|-1->0->3/-1/-1 [7] 3/-1/-1->0->-1|-1->0->3/-1/-1 [8] 
4/-1/-1->0->-1|-1->0->4/-1/-1 [9] 4/-1/-1->0->-1|-1->0->4/-1/-1 [10] 1/-1/-1->0->-1|-1->0->1/-1/-1 [11] 2/-1/-1->0->-1|-1->0->2/-1/-1\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 00 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 00 : 4[1b0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 00 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 00 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 00 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 00 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 00 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 00 : 0[170] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 00 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 00 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 00 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 00 : 6[1d0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 00 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 00 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Trees [0] 2/-1/-1->3->0|0->3->2/-1/-1 [1] 2/-1/-1->3->0|0->3->2/-1/-1 [2] -1/-1/-1->3->2|2->3->-1/-1/-1 [3] -1/-1/-1->3->2|2->3->-1/-1/-1 [4] 7/-1/-1->3->1|1->3->7/-1/-1 [5] 1/-1/-1->3->7|7->3->1/-1/-1 [6] 2/-1/-1->3->0|0->3->2/-1/-1 [7] 
2/-1/-1->3->0|0->3->2/-1/-1 [8] -1/-1/-1->3->2|2->3->-1/-1/-1 [9] -1/-1/-1->3->2|2->3->-1/-1/-1 [10] 7/-1/-1->3->1|1->3->7/-1/-1 [11] 1/-1/-1->3->7|7->3->1/-1/-1\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 00 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Trees [0] -1/-1/-1->4->7|7->4->-1/-1/-1 [1] -1/-1/-1->4->7|7->4->-1/-1/-1 [2] 7/-1/-1->4->0|0->4->7/-1/-1 [3] 7/-1/-1->4->0|0->4->7/-1/-1 [4] 6/-1/-1->4->5|5->4->6/-1/-1 [5] 5/-1/-1->4->6|6->4->5/-1/-1 [6] -1/-1/-1->4->7|7->4->-1/-1/-1 [7] -1/-1/-1->4->7|7->4->-1/-1/-1 [8] 7/-1/-1->4->0|0->4->7/-1/-1 [9] 7/-1/-1->4->0|0->4->7/-1/-1 [10] 6/-1/-1->4->5|5->4->6/-1/-1 [11] 5/-1/-1->4->6|6->4->5/-1/-1\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Trees [0] 6/-1/-1->5->1|1->5->6/-1/-1 [1] 6/-1/-1->5->1|1->5->6/-1/-1 [2] 1/-1/-1->5->6|6->5->1/-1/-1 [3] 1/-1/-1->5->6|6->5->1/-1/-1 [4] 4/-1/-1->5->7|7->5->4/-1/-1 [5] 7/-1/-1->5->4|4->5->7/-1/-1 [6] 6/-1/-1->5->1|1->5->6/-1/-1 [7] 6/-1/-1->5->1|1->5->6/-1/-1 [8] 1/-1/-1->5->6|6->5->1/-1/-1 [9] 1/-1/-1->5->6|6->5->1/-1/-1 [10] 4/-1/-1->5->7|7->5->4/-1/-1 [11] 7/-1/-1->5->4|4->5->7/-1/-1\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Trees [0] 4/-1/-1->7->6|6->7->4/-1/-1 [1] 4/-1/-1->7->6|6->7->4/-1/-1 [2] 6/-1/-1->7->4|4->7->6/-1/-1 [3] 6/-1/-1->7->4|4->7->6/-1/-1 [4] 5/-1/-1->7->3|3->7->5/-1/-1 [5] 3/-1/-1->7->5|5->7->3/-1/-1 [6] 4/-1/-1->7->6|6->7->4/-1/-1 [7] 4/-1/-1->7->6|6->7->4/-1/-1 [8] 6/-1/-1->7->4|4->7->6/-1/-1 [9] 6/-1/-1->7->4|4->7->6/-1/-1 [10] 5/-1/-1->7->3|3->7->5/-1/-1 [11] 3/-1/-1->7->5|5->7->3/-1/-1\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 
[6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Trees [0] 7/-1/-1->6->5|5->6->7/-1/-1 [1] 7/-1/-1->6->5|5->6->7/-1/-1 [2] 5/-1/-1->6->7|7->6->5/-1/-1 [3] 5/-1/-1->6->7|7->6->5/-1/-1 [4] 2/-1/-1->6->4|4->6->2/-1/-1 [5] 4/-1/-1->6->2|2->6->4/-1/-1 [6] 7/-1/-1->6->5|5->6->7/-1/-1 [7] 7/-1/-1->6->5|5->6->7/-1/-1 [8] 5/-1/-1->6->7|7->6->5/-1/-1 [9] 5/-1/-1->6->7|7->6->5/-1/-1 [10] 2/-1/-1->6->4|4->6->2/-1/-1 [11] 4/-1/-1->6->2|2->6->4/-1/-1\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 00/12 : 0 3 2 1 5 6 7 4\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 01/12 : 0 3 2 1 5 6 7 4\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 02/12 : 0 4 7 6 5 1 2 3\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 03/12 : 0 4 7 6 5 1 2 3\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Trees [0] 5/-1/-1->1->2|2->1->5/-1/-1 [1] 5/-1/-1->1->2|2->1->5/-1/-1 [2] 2/-1/-1->1->5|5->1->2/-1/-1 [3] 2/-1/-1->1->5|5->1->2/-1/-1 [4] 3/-1/-1->1->0|0->1->3/-1/-1 [5] -1/-1/-1->1->3|3->1->-1/-1/-1 [6] 5/-1/-1->1->2|2->1->5/-1/-1 [7] 5/-1/-1->1->2|2->1->5/-1/-1 [8] 2/-1/-1->1->5|5->1->2/-1/-1 [9] 2/-1/-1->1->5|5->1->2/-1/-1 [10] 3/-1/-1->1->0|0->1->3/-1/-1 [11] -1/-1/-1->1->3|3->1->-1/-1/-1\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 04/12 : 0 1 3 7 5 4 6 2\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 05/12 : 0 2 6 4 5 7 3 1\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 06/12 : 0 3 2 1 5 6 7 4\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Trees [0] 1/-1/-1->2->3|3->2->1/-1/-1 [1] 1/-1/-1->2->3|3->2->1/-1/-1 [2] 3/-1/-1->2->1|1->2->3/-1/-1 [3] 
3/-1/-1->2->1|1->2->3/-1/-1 [4] -1/-1/-1->2->6|6->2->-1/-1/-1 [5] 6/-1/-1->2->0|0->2->6/-1/-1 [6] 1/-1/-1->2->3|3->2->1/-1/-1 [7] 1/-1/-1->2->3|3->2->1/-1/-1 [8] 3/-1/-1->2->1|1->2->3/-1/-1 [9] 3/-1/-1->2->1|1->2->3/-1/-1 [10] -1/-1/-1->2->6|6->2->-1/-1/-1 [11] 6/-1/-1->2->0|0->2->6/-1/-1\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 07/12 : 0 3 2 1 5 6 7 4\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 08/12 : 0 4 7 6 5 1 2 3\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 09/12 : 0 4 7 6 5 1 2 3\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 10/12 : 0 1 3 7 5 4 6 2\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 11/12 : 0 2 6 4 5 7 3 1\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Trees [0] 3/-1/-1->0->-1|-1->0->3/-1/-1 [1] 3/-1/-1->0->-1|-1->0->3/-1/-1 [2] 4/-1/-1->0->-1|-1->0->4/-1/-1 [3] 4/-1/-1->0->-1|-1->0->4/-1/-1 [4] 1/-1/-1->0->-1|-1->0->1/-1/-1 [5] 2/-1/-1->0->-1|-1->0->2/-1/-1 [6] 3/-1/-1->0->-1|-1->0->3/-1/-1 [7] 3/-1/-1->0->-1|-1->0->3/-1/-1 [8] 4/-1/-1->0->-1|-1->0->4/-1/-1 [9] 4/-1/-1->0->-1|-1->0->4/-1/-1 [10] 1/-1/-1->0->-1|-1->0->1/-1/-1 [11] 2/-1/-1->0->-1|-1->0->2/-1/-1\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 01 : 4[1b0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 00 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 01 : 0[170] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 00 : 4[1b0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 00 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 00 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - 
"\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 00 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 00 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 00 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 00 : 0[170] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 01 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 01 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 01 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 01 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 01 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 01 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 01 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 00 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 01 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 01 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 01 : 6[1d0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 01 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 01 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 00 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 01 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] 
NCCL INFO Channel 00 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 00 : 6[1d0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 00 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 00 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 00 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 02 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 02 : 0[170] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 01 : 4[1b0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 01 : 0[170] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 02 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 02 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 02 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 02 : 6[1d0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 02 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 01 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 02 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 01 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 01 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 01 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 01 : 7[1e0] -> 
4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 01 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 02 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 02 : 4[1b0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 01 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 02 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 02 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 02 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 01 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 01 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 02 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 02 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 01 : 6[1d0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 01 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 01 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 01 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 03 : 0[170] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 03 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 02 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 03 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - 
"\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 02 : 0[170] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 03 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 02 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 03 : 6[1d0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 03 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 02 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 03 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 03 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 02 : 6[1d0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 02 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 02 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 02 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 03 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 03 : 4[1b0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 02 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 02 : 4[1b0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 03 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 03 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 03 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] 
NCCL INFO Channel 03 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 03 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 02 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 02 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 02 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 02 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 02 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 04 : 3[1a0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 04 : 0[170] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 03 : 0[170] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 04 : 4[1b0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 03 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 04 : 5[1c0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 04 : 6[1d0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 04 : 7[1e0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 03 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 04 : 2[190] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 04 : 1[180] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 03 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 03 : 6[1d0] -> 
5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 03 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 03 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 03 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 03 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 03 : 4[1b0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 04 : 4[1b0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 04 : 2[190] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 04 : 3[1a0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 03 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 04 : 5[1c0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 03 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 03 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 03 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 04 : 6[1d0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 04 : 7[1e0] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 03 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 04 : 1[180] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 04 : 3[1a0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 05 : 2[190] -> 6[1d0] via P2P/IPC\u001b[0m\n", - 
"\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 04 : 0[170] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 05 : 4[1b0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 05 : 3[1a0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 04 : 4[1b0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 05 : 0[170] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 04 : 5[1c0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 05 : 5[1c0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 05 : 6[1d0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 04 : 7[1e0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 05 : 7[1e0] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 04 : 1[180] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 04 : 6[1d0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 04 : 2[190] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 05 : 1[180] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 05 : 2[190] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 05 : 1[180] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 05 : 4[1b0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 05 : 3[1a0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 04 : 3[1a0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] 
NCCL INFO Channel 04 : 2[190] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 05 : 5[1c0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 04 : 4[1b0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 05 : 6[1d0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 05 : 7[1e0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 04 : 5[1c0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 04 : 6[1d0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 04 : 7[1e0] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 04 : 1[180] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 06 : 0[170] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 06 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 05 : 2[190] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 06 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 06 : 4[1b0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 06 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 05 : 0[170] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 05 : 3[1a0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 05 : 4[1b0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 06 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 06 : 6[1d0] -> 
7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 05 : 5[1c0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 06 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 05 : 6[1d0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 05 : 1[180] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 05 : 7[1e0] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 06 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 06 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 06 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 06 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 05 : 2[190] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 05 : 1[180] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 06 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 06 : 6[1d0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 06 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 05 : 4[1b0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 05 : 3[1a0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 05 : 5[1c0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 05 : 6[1d0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 05 : 7[1e0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - 
"\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 07 : 4[1b0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 07 : 0[170] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 07 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 06 : 0[170] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 06 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 07 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 07 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 07 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 07 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 07 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 06 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 06 : 4[1b0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 06 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 06 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 06 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 06 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 07 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 07 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 07 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] 
NCCL INFO Channel 07 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 06 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 06 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 07 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 07 : 6[1d0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 07 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 06 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 06 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 06 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 06 : 6[1d0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 06 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 08 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 08 : 0[170] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 08 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 08 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 07 : 4[1b0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 08 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 07 : 0[170] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 08 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 08 : 6[1d0] -> 
5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 07 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 07 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 08 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 07 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 07 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 07 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 07 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 08 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 08 : 4[1b0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 08 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 08 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 07 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 08 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 07 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 08 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 07 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 08 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 07 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 07 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - 
"\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 07 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 07 : 6[1d0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 09 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 09 : 0[170] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 09 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 08 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 09 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 09 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 08 : 0[170] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 09 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 09 : 6[1d0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 08 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 08 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 09 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 08 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 08 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 08 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 08 : 6[1d0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 09 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 
[1] NCCL INFO Channel 09 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 09 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 09 : 4[1b0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 08 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 09 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 09 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 08 : 4[1b0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 09 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 08 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 08 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 08 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 08 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 10 : 3[1a0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 08 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 10 : 0[170] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 10 : 1[180] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 10 : 2[190] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 09 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 10 : 4[1b0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 10 : 5[1c0] 
-> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 10 : 6[1d0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 09 : 0[170] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 10 : 7[1e0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 09 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 09 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 09 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 09 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 10 : 2[190] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 09 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 09 : 6[1d0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 10 : 3[1a0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 10 : 1[180] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 10 : 4[1b0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 10 : 5[1c0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 10 : 6[1d0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 09 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 10 : 7[1e0] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 09 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 09 : 4[1b0] -> 0[170] via P2P/IPC\u001b[0m\n", 
- "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 09 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 09 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 09 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 11 : 2[190] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 09 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 11 : 0[170] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 11 : 1[180] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 11 : 3[1a0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 11 : 4[1b0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 10 : 3[1a0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 11 : 5[1c0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 11 : 6[1d0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 11 : 7[1e0] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 10 : 0[170] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 10 : 1[180] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 10 : 4[1b0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 10 : 2[190] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 10 : 5[1c0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 11 : 1[180] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 
[7] NCCL INFO Channel 10 : 7[1e0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 10 : 6[1d0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 11 : 2[190] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 11 : 4[1b0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 11 : 3[1a0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 11 : 6[1d0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 11 : 5[1c0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 11 : 7[1e0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO 12 coll channels, 16 p2p channels, 2 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO comm 0x55f354790400 rank 1 nranks 8 cudaDev 1 busId 180 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 10 : 2[190] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO 12 coll channels, 16 p2p channels, 2 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 10 : 3[1a0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO comm 0x55ea7902bea0 rank 0 nranks 8 cudaDev 0 busId 170 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 10 : 1[180] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 10 : 4[1b0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO 12 coll channels, 16 p2p channels, 2 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO comm 0x555bd27b30f0 rank 2 nranks 8 cudaDev 2 busId 190 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 
[5] NCCL INFO Channel 10 : 5[1c0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO 12 coll channels, 16 p2p channels, 2 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO 12 coll channels, 16 p2p channels, 2 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO comm 0x55a34a0dc190 rank 4 nranks 8 cudaDev 4 busId 1b0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO comm 0x55c773dbc960 rank 3 nranks 8 cudaDev 3 busId 1a0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO 12 coll channels, 16 p2p channels, 2 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO 12 coll channels, 16 p2p channels, 2 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO comm 0x563ef9733c80 rank 5 nranks 8 cudaDev 5 busId 1c0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 10 : 7[1e0] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO comm 0x557dc4830b40 rank 6 nranks 8 cudaDev 6 busId 1d0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO 12 coll channels, 16 p2p channels, 2 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 10 : 6[1d0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO comm 0x55d228d3ba80 rank 7 nranks 8 cudaDev 7 busId 1e0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 11 : 2[190] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 11 : 0[170] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 11 : 1[180] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 11 : 3[1a0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL 
INFO Channel 11 : 4[1b0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 11 : 5[1c0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 11 : 7[1e0] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 11 : 6[1d0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 11 : 1[180] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 11 : 2[190] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 11 : 3[1a0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 11 : 4[1b0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 11 : 5[1c0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 11 : 7[1e0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO 12 coll channels, 16 p2p channels, 2 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 11 : 6[1d0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO comm 0x56056c79cd10 rank 1 nranks 8 cudaDev 1 busId 180 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO 12 coll channels, 16 p2p channels, 2 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO comm 0x561abb42ba10 rank 0 nranks 8 cudaDev 0 busId 170 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO 12 coll channels, 16 p2p channels, 2 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO comm 0x55dc8aaa1440 rank 3 nranks 8 cudaDev 3 busId 1a0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO 12 coll channels, 16 p2p channels, 2 p2p channels per peer\u001b[0m\n", - 
"\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO 12 coll channels, 16 p2p channels, 2 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO comm 0x560b414a63e0 rank 4 nranks 8 cudaDev 4 busId 1b0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO comm 0x55b7eabe4e10 rank 2 nranks 8 cudaDev 2 busId 190 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO 12 coll channels, 16 p2p channels, 2 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO comm 0x55b7b5efe120 rank 5 nranks 8 cudaDev 5 busId 1c0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO 12 coll channels, 16 p2p channels, 2 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO 12 coll channels, 16 p2p channels, 2 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO comm 0x56491cf25920 rank 7 nranks 8 cudaDev 7 busId 1e0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO comm 0x55f911199e80 rank 6 nranks 8 cudaDev 6 busId 1d0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Trees [0] 4/-1/-1->7->6|6->7->4/-1/-1 [1] 4/-1/-1->7->6|6->7->4/-1/-1\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 00/02 : 0 3 2 1 5 6 7 4 8 11 10 9 13 14 15 12\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 01/02 : 0 3 2 1 5 6 7 4 8 11 10 9 13 14 15 12\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Trees [0] 5/-1/-1->1->2|2->1->5/-1/-1 [1] 5/-1/-1->1->2|2->1->5/-1/-1\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] 
NCCL INFO Trees [0] 1/-1/-1->2->3|3->2->1/-1/-1 [1] 1/-1/-1->2->3|3->2->1/-1/-1\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Trees [0] 2/8/-1->3->0|0->3->2/8/-1 [1] 2/-1/-1->3->0|0->3->2/-1/-1\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Trees [0] 3/-1/-1->0->-1|-1->0->3/-1/-1 [1] 3/-1/-1->0->11|11->0->3/-1/-1\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Trees [0] -1/-1/-1->4->7|7->4->-1/-1/-1 [1] -1/-1/-1->4->7|7->4->-1/-1/-1\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Trees [0] 6/-1/-1->5->1|1->5->6/-1/-1 [1] 6/-1/-1->5->1|1->5->6/-1/-1\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Trees [0] 13/-1/-1->9->10|10->9->13/-1/-1 [1] 13/-1/-1->9->10|10->9->13/-1/-1\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Trees [0] 7/-1/-1->6->5|5->6->7/-1/-1 [1] 7/-1/-1->6->5|5->6->7/-1/-1\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Trees [0] 11/-1/-1->8->3|3->8->11/-1/-1 [1] 11/-1/-1->8->-1|-1->8->11/-1/-1\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Trees [0] 12/-1/-1->15->14|14->15->12/-1/-1 [1] 
12/-1/-1->15->14|14->15->12/-1/-1\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Trees [0] -1/-1/-1->12->15|15->12->-1/-1/-1 [1] -1/-1/-1->12->15|15->12->-1/-1/-1\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Trees [0] 10/-1/-1->11->8|8->11->10/-1/-1 [1] 10/0/-1->11->8|8->11->10/0/-1\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Trees [0] 9/-1/-1->10->11|11->10->9/-1/-1 [1] 9/-1/-1->10->11|11->10->9/-1/-1\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Trees [0] 14/-1/-1->13->9|9->13->14/-1/-1 [1] 14/-1/-1->13->9|9->13->14/-1/-1\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/64\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Trees [0] 15/-1/-1->14->13|13->14->15/-1/-1 [1] 15/-1/-1->14->13|13->14->15/-1/-1\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 00 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 00 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 00 : 13[1c0] -> 14[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 00 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 00 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 00 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 00 : 14[1d0] -> 15[1e0] via 
P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 00 : 11[1a0] -> 10[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 00 : 15[1e0] -> 12[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 00 : 10[190] -> 9[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 00 : 9[180] -> 13[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 00 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 00 : 12[1b0] -> 0[170] [receive] via NET/Socket/0\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO NET/Socket: Using 2 threads and 8 sockets per thread\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 00 : 4[1b0] -> 8[170] [receive] via NET/Socket/0\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO NET/Socket: Using 2 threads and 8 sockets per thread\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 00 : 4[1b0] -> 8[170] [send] via NET/Socket/0\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 00 : 12[1b0] -> 0[170] [send] via NET/Socket/0\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 00 : 0[170] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 00 : 8[170] -> 11[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 00 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 00 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 00 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 00 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 00 : 13[1c0] -> 9[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 
[6] NCCL INFO Channel 00 : 6[1d0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 00 : 14[1d0] -> 13[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 00 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 00 : 10[190] -> 11[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 00 : 15[1e0] -> 14[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 00 : 9[180] -> 10[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 00 : 12[1b0] -> 15[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 00 : 11[1a0] -> 8[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 00 : 8[170] -> 3[1a0] [receive] via NET/Socket/0\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO NET/Socket: Using 2 threads and 8 sockets per thread\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 00 : 8[170] -> 3[1a0] [send] via NET/Socket/0\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 01 : 7[1e0] -> 4[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 00 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 01 : 2[190] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 01 : 1[180] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 01 : 5[1c0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 01 : 6[1d0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 01 : 13[1c0] -> 14[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 01 : 14[1d0] -> 15[1e0] via P2P/IPC\u001b[0m\n", - 
"\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 01 : 10[190] -> 9[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 01 : 15[1e0] -> 12[1b0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 01 : 9[180] -> 13[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 01 : 4[1b0] -> 8[170] [send] via NET/Socket/0\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 01 : 12[1b0] -> 0[170] [send] via NET/Socket/0\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 01 : 11[1a0] -> 10[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 01 : 12[1b0] -> 0[170] [receive] via NET/Socket/0\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO NET/Socket: Using 2 threads and 8 sockets per thread\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO Channel 01 : 7[1e0] -> 6[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO Channel 01 : 1[180] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO Channel 01 : 5[1c0] -> 1[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO Channel 01 : 6[1d0] -> 5[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 01 : 0[170] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 00 : 3[1a0] -> 8[170] [receive] via NET/Socket/0\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO NET/Socket: Using 2 threads and 8 sockets per thread\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 00 : 3[1a0] -> 8[170] [send] via NET/Socket/0\u001b[0m\n", - "\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per peer\u001b[0m\n", - 
"\u001b[34m[1,5]:algo-1:215:215 [5] NCCL INFO comm 0x563efc409e60 rank 5 nranks 16 cudaDev 5 busId 1c0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,6]:algo-1:221:221 [6] NCCL INFO comm 0x557dc7506d20 rank 6 nranks 16 cudaDev 6 busId 1d0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO Channel 01 : 13[1c0] -> 9[180] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO Channel 01 : 14[1d0] -> 13[1c0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO Channel 01 : 15[1e0] -> 14[1d0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO Channel 01 : 9[180] -> 10[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 01 : 3[1a0] -> 2[190] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO Channel 01 : 10[190] -> 11[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO Channel 01 : 12[1b0] -> 15[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,13]:algo-2:227:227 [5] NCCL INFO comm 0x55b7b8bd4300 rank 13 nranks 16 cudaDev 5 busId 1c0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,14]:algo-2:223:223 [6] NCCL INFO comm 0x55f913e70060 rank 14 nranks 16 cudaDev 6 busId 1d0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,12]:algo-2:226:226 [4] NCCL INFO comm 0x560b4417c5c0 rank 12 nranks 16 cudaDev 4 busId 1b0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per 
peer\u001b[0m\n", - "\u001b[34m[1,15]:algo-2:225:225 [7] NCCL INFO comm 0x56491fbfbb00 rank 15 nranks 16 cudaDev 7 busId 1e0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,9]:algo-2:231:231 [1] NCCL INFO comm 0x56056f472ef0 rank 9 nranks 16 cudaDev 1 busId 180 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO Channel 01 : 2[190] -> 3[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO Channel 01 : 3[1a0] -> 0[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,1]:algo-1:214:214 [1] NCCL INFO comm 0x55f3574665e0 rank 1 nranks 16 cudaDev 1 busId 180 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 01 : 4[1b0] -> 8[170] [receive] via NET/Socket/0\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO NET/Socket: Using 2 threads and 8 sockets per thread\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO Channel 01 : 8[170] -> 11[1a0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,2]:algo-1:220:220 [2] NCCL INFO comm 0x555bd54892d0 rank 2 nranks 16 cudaDev 2 busId 190 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,3]:algo-1:222:222 [3] NCCL INFO comm 0x55c776a92b40 rank 3 nranks 16 cudaDev 3 busId 1a0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO Channel 01 : 4[1b0] -> 7[1e0] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,10]:algo-2:228:228 [2] NCCL INFO comm 0x55b7ed8baff0 rank 10 nranks 16 cudaDev 2 busId 190 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO 2 coll 
channels, 2 p2p channels, 1 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,4]:algo-1:219:219 [4] NCCL INFO comm 0x55a34cdb2370 rank 4 nranks 16 cudaDev 4 busId 1b0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,7]:algo-1:218:218 [7] NCCL INFO comm 0x55d22ba11c60 rank 7 nranks 16 cudaDev 7 busId 1e0 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 01 : 0[170] -> 11[1a0] [send] via NET/Socket/0\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 01 : 0[170] -> 11[1a0] [receive] via NET/Socket/0\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO NET/Socket: Using 2 threads and 8 sockets per thread\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 01 : 11[1a0] -> 8[170] via P2P/IPC\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:678 [0] NCCL INFO comm 0x561abe101bf0 rank 8 nranks 16 cudaDev 0 busId 170 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO Channel 01 : 11[1a0] -> 0[170] [send] via NET/Socket/0\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO Channel 01 : 11[1a0] -> 0[170] [receive] via NET/Socket/0\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO NET/Socket: Using 2 threads and 8 sockets per thread\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per peer\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:669 [0] NCCL INFO comm 0x55ea7bd02080 rank 0 nranks 16 cudaDev 0 busId 170 - Init COMPLETE\u001b[0m\n", - "\u001b[34m[1,11]:algo-2:224:224 [3] NCCL INFO comm 0x55dc8d777620 rank 11 nranks 16 cudaDev 3 busId 1a0 - Init 
COMPLETE\u001b[0m\n", - "\u001b[34m[1,0]:Running smdistributed.dataparallel v1.2.0\u001b[0m\n", - "\u001b[34m[1,10]:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\u001b[0m\n", - "\u001b[34m[1,15]:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\u001b[0m\n", - "\u001b[34m[1,13]:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\u001b[0m\n", - "\u001b[34m[1,12]:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\u001b[0m\n", - "\u001b[34m[1,9]:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\u001b[0m\n", - "\u001b[34m[1,14]:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\u001b[0m\n", - "\u001b[34m[1,11]:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\u001b[0m\n", - "\u001b[34m[1,8]:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\u001b[0m\n", - "\u001b[34m[1,5]:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\u001b[0m\n", - "\u001b[34m[1,6]:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\u001b[0m\n", - "\u001b[34m[1,1]:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\u001b[0m\n", - "\u001b[34m[1,0]:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\u001b[0m\n", - "\u001b[34m[1,2]:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\u001b[0m\n", - "\u001b[34m[1,4]:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\u001b[0m\n", - "\u001b[34m[1,7]:Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\u001b[0m\n", - "\u001b[34m[1,3]:Downloading data 
from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\u001b[0m\n", - "\u001b[34m[1,12]:#015 8192/11490434 [..............................] - ETA: 0s[1,15]:#015 8192/11490434 [..............................] - ETA: 0s[1,10]:#015[1,10]: 8192/11490434 [..............................] - ETA: 0s[1,13]:#015[1,13]: 8192/11490434 [..............................][1,13]: - ETA: 0s[1,11]:#015 8192/11490434 [..............................] - ETA: 0s[1,14]:#015 8192/11490434 [..............................][1,14]: - ETA: 0s[1,8]:#015 8192/11490434 [..............................] - ETA: 0s[1,9]:#015[1,9]: 8192/11490434 [..............................] - ETA: 0s[1,4]:#015 8192/11490434 [..............................] - ETA: 0s[1,5]:#015[1,5]: 8192/11490434 [..............................] - ETA: 0s[1,6]:#015 8192/11490434 [..............................] - ETA: 0s[1,7]:#015 8192/11490434 [..............................] - ETA: 0s[1,0]:#015 8192/11490434 [..............................][1,0]: - ETA: 0s[1,3]:#015 8192/11490434 [..............................][1,2]:#015[1,2]: 8192/11490434 [..............................] - ETA: 0s[1,3]: - ETA: 0s[1,1]:#015 8192/11490434 [..............................] - ETA: 0s[1,12]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 1073152/11490434 [=>............................] - ETA: 0s[1,10]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 729088/11490434 [>.............................] - ETA: 0ss[1,13]: 1449984/11490434 [==>...........................] 
- ETA: 0s[1,11]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 1073152/11490434 [=>............................][1,11]: - ETA: 0s[1,14]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 516096/11490434 [>.............................] - ETA: 1s[1,8]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 1146880/11490434 [=>............................] - ETA: 0s[1,9]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 950272/11490434 [=>............................] - ETA: 0s[1,4]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 1826816/11490434 [===>..........................] - ETA: 0s[1,5]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 2883584/11490434 [======>.......................] 
- ETA: 0s[1,6]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 2613248/11490434 [=====>........................] - ETA: 0s[1,7]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 786432/11490434 [=>............................] - ETA: 0s[1,0]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 901120/11490434 [=>............................] - ETA: 0s[1,2]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015[1,2]: 1769472/11490434 [===>..........................][1,2]: - ETA: 0s[1,3]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015[1,3]: 761856/11490434 [>.............................][1,3]: - ETA: 0s[1,1]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 548864/11490434 [>.............................][1,1]: - ETA: 
1s[1,3]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 3833856/11490434 [=========>....................] - ETA: 0s[1,0]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 4202496/11490434 [=========>....................] - ETA: 0s[1,7]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 4202496/11490434 [=========>....................] - ETA: 0s[1,6]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015[1,6]: 4202496/11490434 [=========>....................] - ETA: 0s[1,2]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 4202496/11490434 [=========>....................] 
- ETA: 0s[1,1]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015[1,1]: 4202496/11490434 [=========>....................][1,1]: - ETA: 0s[1,5]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 4202496/11490434 [=========>....................] - ETA: 0s[1,4]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015[1,4]: 4202496/11490434 [=========>....................] - ETA: 0s[1,11]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015[1,11]: 4202496/11490434 [=========>....................] - ETA: 0s[1,15]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 4202496/11490434 [=========>....................] - ETA: 0s[1,12]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 4202496/11490434 [=========>....................] 
- ETA: 0s[1,14]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015[1,14]: 4202496/11490434 [=========>....................] - ETA: 0s[1,9]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015[1,9]: 4202496/11490434 [=========>....................][1,9]: - ETA: 0s[1,3]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 6201344/11490434 [===============>..............] - ETA: 0s[1,13]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 4202496/11490434 [=========>....................] - ETA: 0s[1,10]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 4202496/11490434 [=========>....................] 
- ETA: 0s[1,13]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015[1,13]:11493376/11490434 [==============================] - 0s 0us/step\u001b[0m\n", - "\u001b[34m[1,12]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#01511493376/11490434 [==============================] - 0s 0us/step\u001b[0m\n", - "\u001b[34m[1,7]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 8396800/11490434 [====================>.........] - ETA: 0s[1,6]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015[1,6]: 8396800/11490434 [====================>.........][1,6]: - ETA: 0s[1,4]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015[1,4]:10911744/11490434 [===========================>..] 
- ETA: 0s[1,15]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#01511493376/11490434 [==============================] - 0s 0us/step\u001b[0m\n", - "\u001b[34m[1,10]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#01511493376/11490434 [==============================] - 0s 0us/step\u001b[0m\n", - "\u001b[34m[1,11]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015[1,11]: 6537216/11490434 [================>.............] - ETA: 0s[1,4]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#01511493376/11490434 [==============================] - 0s 0us/step\u001b[0m\n", - "\u001b[34m[1,14]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#01510240000/11490434 [=========================>....] - ETA: 0s[1,9]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 6979584/11490434 [=================>............] 
- ETA: 0s[1,3]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 8396800/11490434 [====================>.........] - ETA: 0s[1,1]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 8396800/11490434 [====================>.........] - ETA: 0ss[1,5]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 8396800/11490434 [====================>.........] - ETA: 0s[1,0]: - ETA: 0s[1,8]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015 4202496/11490434 [=========>....................] 
- ETA: 0s[1,14]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#01511493376/11490434 [==============================] - 0s 0us/step\u001b[0m\n", - "\u001b[34m[1,7]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#01511493376/11490434 [==============================] - 0s 0us/step\u001b[0m\n", - "\u001b[34m[1,5]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#01511493376/11490434 [==============================] - 0s 0us/step\u001b[0m\n", - "\u001b[34m[1,6]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#01511493376/11490434 [==============================] - 0s 0us/step\u001b[0m\n", - "\u001b[34m[1,0]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015[1,0]:11493376/11490434 [==============================] - 0s 0us/step\u001b[0m\n", - "\u001b[34m[1,3]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#01511493376/11490434 
[==============================] - 0s 0us/step\u001b[0m\n", - "\u001b[34m[1,1]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#01511493376/11490434 [==============================] - 0s 0us/step\u001b[0m\n", - "\u001b[34m[1,2]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#01511493376/11490434 [==============================] - 0s 0us/step\u001b[0m\n", - "\u001b[34m[1,8]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#01511493376/11490434 [==============================] - 0s 0us/step\u001b[0m\n", - "\u001b[34m[1,11]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015[1,11]:11493376/11490434 [==============================][1,11]: - 0s 0us/step\u001b[0m\n", - "\u001b[34m[1,9]:#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#01511493376/11490434 [==============================] - 0s 0us/step\u001b[0m\n", - "\u001b[34m[1,15]:[2021-05-29 00:01:14.049 algo-2:225 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[1,8]:[2021-05-29 00:01:14.049 algo-2:678 INFO utils.py:27] 
RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[1,10]:[2021-05-29 00:01:14.049 algo-2:228 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[1,12]:[2021-05-29 00:01:14.049 algo-2:226 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[1,11]:[2021-05-29 00:01:14.049 algo-2:224 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[1,14]:[2021-05-29 00:01:14.049 algo-2:223 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[1,13]:[2021-05-29 00:01:14.049 algo-2:227 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[1,9]:[2021-05-29 00:01:14.049 algo-2:231 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[1,13]:[2021-05-29 00:01:14.128 algo-2:227 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[1,15]:[2021-05-29 00:01:14.128 algo-2:225 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[1,8]:[2021-05-29 00:01:14.128 algo-2:678 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[1,10]:[2021-05-29 00:01:14.128 algo-2:228 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[1,12]:[2021-05-29 00:01:14.128 algo-2:226 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[1,11]:[2021-05-29 00:01:14.128 algo-2:224 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[1,14]:[2021-05-29 00:01:14.128 algo-2:223 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[1,9]:[2021-05-29 00:01:14.128 algo-2:231 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[1,13]:[2021-05-29 00:01:14.129 algo-2:227 INFO json_config.py:91] Creating hook from json_config at 
/opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[1,9]:[2021-05-29 00:01:14.129 algo-2:231 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[1,14]:[2021-05-29 00:01:14.129 algo-2:223 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[1,8]:[2021-05-29 00:01:14.129 algo-2:678 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[1,10]:[2021-05-29 00:01:14.129 algo-2:228 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[1,12]:[2021-05-29 00:01:14.129 algo-2:226 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[1,11]:[2021-05-29 00:01:14.129 algo-2:224 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[1,15]:[2021-05-29 00:01:14.129 algo-2:225 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[1,9]:[2021-05-29 00:01:14.130 algo-2:231 INFO hook.py:199] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[1,13]:[2021-05-29 00:01:14.130 algo-2:227 INFO hook.py:199] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[1,11]:[2021-05-29 00:01:14.130 algo-2:224 INFO hook.py:199] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[1,14]:[2021-05-29 00:01:14.130 algo-2:223 INFO hook.py:199] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[1,10]:[2021-05-29 00:01:14.130 algo-2:228 INFO hook.py:199] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[1,12]:[2021-05-29 00:01:14.130 algo-2:226 INFO hook.py:199] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[1,15]:[2021-05-29 00:01:14.130 algo-2:225 INFO hook.py:199] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[1,8]:[2021-05-29 00:01:14.130 algo-2:678 INFO hook.py:199] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[1,12]:[2021-05-29 00:01:14.133 algo-2:226 INFO hook.py:253] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[1,15]:[2021-05-29 00:01:14.133 algo-2:225 INFO hook.py:253] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[1,8]:[2021-05-29 00:01:14.133 algo-2:678 INFO hook.py:253] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[1,11]:[2021-05-29 00:01:14.133 algo-2:224 INFO hook.py:253] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[1,14]:[2021-05-29 00:01:14.133 algo-2:223 INFO hook.py:253] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[1,15]:[2021-05-29 00:01:14.133 algo-2:225 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[1,9]:[2021-05-29 00:01:14.133 algo-2:231 INFO hook.py:253] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[1,13]:[2021-05-29 00:01:14.133 algo-2:227 INFO hook.py:253] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[1,8]:[2021-05-29 00:01:14.133 algo-2:678 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not 
exist.\u001b[0m\n", - "\u001b[34m[1,12]:[2021-05-29 00:01:14.133 algo-2:226 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[1,11]:[2021-05-29 00:01:14.133 algo-2:224 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[1,10]:[2021-05-29 00:01:14.133 algo-2:228 INFO hook.py:253] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[1,14]:[2021-05-29 00:01:14.133 algo-2:223 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[1,9]:[2021-05-29 00:01:14.133 algo-2:231 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[1,13]:[2021-05-29 00:01:14.133 algo-2:227 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[1,10]:[2021-05-29 00:01:14.133 algo-2:228 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[1,12]:[2021-05-29 00:01:14.133 algo-2:226 INFO hook.py:413] Monitoring the collections: sm_metrics, metrics, losses\u001b[0m\n", - "\u001b[34m[1,15]:[2021-05-29 00:01:14.133 algo-2:225 INFO hook.py:413] Monitoring the collections: metrics, sm_metrics, losses\u001b[0m\n", - "\u001b[34m[1,8]:[2021-05-29 00:01:14.133 algo-2:678 INFO hook.py:413] Monitoring the collections: sm_metrics, metrics, losses\u001b[0m\n", - "\u001b[34m[1,11]:[2021-05-29 00:01:14.133 algo-2:224 INFO hook.py:413] Monitoring the collections: sm_metrics, losses, metrics\u001b[0m\n", - "\u001b[34m[1,13]:[2021-05-29 00:01:14.133 algo-2:227 INFO hook.py:413] Monitoring the collections: losses, sm_metrics, metrics\u001b[0m\n", - "\u001b[34m[1,14]:[2021-05-29 00:01:14.133 algo-2:223 INFO 
hook.py:413] Monitoring the collections: metrics, losses, sm_metrics\u001b[0m\n", - "\u001b[34m[1,9]:[2021-05-29 00:01:14.133 algo-2:231 INFO hook.py:413] Monitoring the collections: metrics, sm_metrics, losses\u001b[0m\n", - "\u001b[34m[1,10]:[2021-05-29 00:01:14.133 algo-2:228 INFO hook.py:413] Monitoring the collections: sm_metrics, metrics, losses\u001b[0m\n", - "\u001b[34m[1,6]:[2021-05-29 00:01:14.146 algo-1:221 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[1,3]:[2021-05-29 00:01:14.146 algo-1:222 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[1,4]:[2021-05-29 00:01:14.146 algo-1:219 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[1,5]:[2021-05-29 00:01:14.146 algo-1:215 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[1,7]:[2021-05-29 00:01:14.146 algo-1:218 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[1,0]:[2021-05-29 00:01:14.146 algo-1:669 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[1,1]:[2021-05-29 00:01:14.146 algo-1:214 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[1,2]:[2021-05-29 00:01:14.146 algo-1:220 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[1,1]:[2021-05-29 00:01:14.235 algo-1:214 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[1,6]:[2021-05-29 00:01:14.235 algo-1:221 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[1,3]:[2021-05-29 00:01:14.235 algo-1:222 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[1,4]:[2021-05-29 00:01:14.235 algo-1:219 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[1,5]:[2021-05-29 00:01:14.235 algo-1:215 INFO profiler_config_parser.py:102] User has disabled 
profiler.\u001b[0m\n", - "\u001b[34m[1,7]:[2021-05-29 00:01:14.235 algo-1:218 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[1,0]:[2021-05-29 00:01:14.235 algo-1:669 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[1,2]:[2021-05-29 00:01:14.235 algo-1:220 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[1,0]:[2021-05-29 00:01:14.236 algo-1:669 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[1,1]:[2021-05-29 00:01:14.236 algo-1:214 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[1,6]:[2021-05-29 00:01:14.236 algo-1:221 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[1,3]:[2021-05-29 00:01:14.236 algo-1:222 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[1,4]:[2021-05-29 00:01:14.236 algo-1:219 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[1,5]:[2021-05-29 00:01:14.236 algo-1:215 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[1,7]:[2021-05-29 00:01:14.236 algo-1:218 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[1,2]:[2021-05-29 00:01:14.236 algo-1:220 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[1,0]:[2021-05-29 00:01:14.237 algo-1:669 INFO hook.py:199] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[1,1]:[2021-05-29 00:01:14.237 algo-1:214 INFO hook.py:199] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[1,6]:[2021-05-29 00:01:14.237 algo-1:221 INFO hook.py:199] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[1,4]:[2021-05-29 00:01:14.237 algo-1:219 INFO hook.py:199] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[1,5]:[2021-05-29 00:01:14.237 algo-1:215 INFO hook.py:199] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[1,2]:[2021-05-29 00:01:14.237 algo-1:220 INFO hook.py:199] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[1,7]:[2021-05-29 00:01:14.237 algo-1:218 INFO hook.py:199] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[1,3]:[2021-05-29 00:01:14.237 algo-1:222 INFO hook.py:199] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[1,1]:[2021-05-29 00:01:14.241 algo-1:214 INFO hook.py:253] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[1,7]:[2021-05-29 00:01:14.241 algo-1:218 INFO hook.py:253] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[1,6]:[2021-05-29 00:01:14.241 algo-1:221 INFO hook.py:253] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[1,4]:[2021-05-29 00:01:14.241 algo-1:219 INFO hook.py:253] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[1,5]:[2021-05-29 00:01:14.241 algo-1:215 INFO hook.py:253] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[1,3]:[2021-05-29 00:01:14.241 algo-1:222 INFO hook.py:253] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[1,4]:[2021-05-29 00:01:14.241 algo-1:219 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[1,0]:[2021-05-29 00:01:14.241 algo-1:669 INFO hook.py:253] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[1,1]:[2021-05-29 00:01:14.241 algo-1:214 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[1,7]:[2021-05-29 00:01:14.241 algo-1:218 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[1,6]:[2021-05-29 00:01:14.241 algo-1:221 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[1,2]:[2021-05-29 00:01:14.241 algo-1:220 INFO hook.py:253] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[1,5]:[2021-05-29 00:01:14.241 algo-1:215 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[1,3]:[2021-05-29 00:01:14.241 algo-1:222 INFO state_store.py:77] The 
checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[1,0]:[2021-05-29 00:01:14.241 algo-1:669 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[1,2]:[2021-05-29 00:01:14.241 algo-1:220 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[1,4]:[2021-05-29 00:01:14.242 algo-1:219 INFO hook.py:413] Monitoring the collections: losses, metrics, sm_metrics\u001b[0m\n", - "\u001b[34m[1,7]:[2021-05-29 00:01:14.242 algo-1:218 INFO hook.py:413] Monitoring the collections: losses, sm_metrics, metrics\u001b[0m\n", - "\u001b[34m[1,6]:[2021-05-29 00:01:14.242 algo-1:221 INFO hook.py:413] Monitoring the collections: sm_metrics, metrics, losses\u001b[0m\n", - "\u001b[34m[1,1]:[2021-05-29 00:01:14.242 algo-1:214 INFO hook.py:413] Monitoring the collections: losses, metrics, sm_metrics\u001b[0m\n", - "\u001b[34m[1,5]:[2021-05-29 00:01:14.242 algo-1:215 INFO hook.py:413] Monitoring the collections: sm_metrics, metrics, losses\u001b[0m\n", - "\u001b[34m[1,0]:[2021-05-29 00:01:14.242 algo-1:669 INFO hook.py:413] Monitoring the collections: metrics, sm_metrics, losses\u001b[0m\n", - "\u001b[34m[1,3]:[2021-05-29 00:01:14.242 algo-1:222 INFO hook.py:413] Monitoring the collections: metrics, sm_metrics, losses\u001b[0m\n", - "\u001b[34m[1,2]:[2021-05-29 00:01:14.242 algo-1:220 INFO hook.py:413] Monitoring the collections: sm_metrics, metrics, losses\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:1465 [0] NCCL INFO Launch mode Parallel\u001b[0m\n", - "\u001b[34m[1,0]:Step #0#011Loss: 2.306592\u001b[0m\n", - "\u001b[34m[1,0]:algo-1:669:1457 [0] NCCL INFO Launch mode Parallel\u001b[0m\n", - "\u001b[34m[1,8]:algo-2:678:1454 [0] NCCL INFO Launch mode Parallel\u001b[0m\n", - "\u001b[34m[1,0]:Step #50#011Loss: 0.269254\u001b[0m\n", - "\u001b[34m[1,0]:Step #100#011Loss: 
0.248688\u001b[0m\n", - "\u001b[34m[1,0]:Step #150#011Loss: 0.144664\u001b[0m\n", - "\u001b[34m[1,0]:Step #200#011Loss: 0.219410\u001b[0m\n", - "\u001b[34m[1,0]:Step #250#011Loss: 0.116688\u001b[0m\n", - "\u001b[34m[1,0]:Step #300#011Loss: 0.104166\u001b[0m\n", - "\u001b[34m[1,0]:Step #350#011Loss: 0.056800\u001b[0m\n", - "\u001b[34m[1,0]:Step #400#011Loss: 0.191574\u001b[0m\n", - "\u001b[34m[1,0]:Step #450#011Loss: 0.105225\u001b[0m\n", - "\u001b[34m[1,0]:Step #500#011Loss: 0.163396\u001b[0m\n", - "\u001b[34m[1,0]:Step #550#011Loss: 0.066595\u001b[0m\n", - "\u001b[34m[1,0]:Step #600#011Loss: 0.100547\u001b[0m\n", - "\u001b[35m2021-05-29 00:01:39,084 sagemaker-training-toolkit INFO Orted process exited\u001b[0m\n", - "\u001b[34mWarning: Permanently added 'algo-2,10.0.175.185' (ECDSA) to the list of known hosts.#015\u001b[0m\n", - "\u001b[34m[1,11]:2021-05-29 00:01:02.688131: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,12]:2021-05-29 00:01:02.688116: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,14]:2021-05-29 00:01:02.688121: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,15]:2021-05-29 00:01:02.688117: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,14]:2021-05-29 00:01:02.688288: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m[1,11]:2021-05-29 00:01:02.688288: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. 
The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m[1,12]:2021-05-29 00:01:02.688288: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m[1,15]:2021-05-29 00:01:02.688288: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m[1,10]:2021-05-29 00:01:02.708510: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,10]:2021-05-29 00:01:02.708701: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m[1,9]:2021-05-29 00:01:02.708740: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,9]:2021-05-29 00:01:02.708899: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m[1,13]:2021-05-29 00:01:02.712251: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,13]:2021-05-29 00:01:02.712431: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. 
The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m[1,15]:2021-05-29 00:01:02.728492: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,14]:2021-05-29 00:01:02.728993: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,12]:2021-05-29 00:01:02.729164: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,11]:2021-05-29 00:01:02.729588: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,10]:2021-05-29 00:01:02.748637: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,9]:2021-05-29 00:01:02.748923: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,13]:2021-05-29 00:01:02.754496: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,1]:2021-05-29 00:01:02.825521: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,4]:2021-05-29 00:01:02.825515: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,5]:2021-05-29 00:01:02.825527: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,7]:2021-05-29 00:01:02.825520: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,5]:2021-05-29 00:01:02.825707: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker 
Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m[1,1]:2021-05-29 00:01:02.825707: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m[1,4]:2021-05-29 00:01:02.825707: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m[1,7]:2021-05-29 00:01:02.825707: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m[1,2]:2021-05-29 00:01:02.846439: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,2]:2021-05-29 00:01:02.846661: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m[1,3]:2021-05-29 00:01:02.851308: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,6]:2021-05-29 00:01:02.851307: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,3]:2021-05-29 00:01:02.851509: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m[1,6]:2021-05-29 00:01:02.851509: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. 
The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m[1,7]:2021-05-29 00:01:02.866095: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,5]:2021-05-29 00:01:02.866095: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,4]:2021-05-29 00:01:02.866619: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,1]:2021-05-29 00:01:02.866746: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,2]:2021-05-29 00:01:02.888026: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,6]:2021-05-29 00:01:02.892022: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,3]:2021-05-29 00:01:02.892022: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,8]:2021-05-29 00:01:03.154460: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,8]:2021-05-29 00:01:03.154627: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. 
The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m[1,8]:2021-05-29 00:01:03.195015: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,0]:2021-05-29 00:01:03.285532: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,0]:2021-05-29 00:01:03.285795: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", - "\u001b[34m[1,0]:2021-05-29 00:01:03.328978: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", - "\u001b[34m[1,0]:2021-05-29 00:01:34.603259: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\u001b[0m\n", - "\u001b[34m[1,0]:INFO:tensorflow:Assets written to: /opt/ml/model/1/assets\u001b[0m\n", - "\u001b[34m[1,0]:INFO:tensorflow:Assets written to: /opt/ml/model/1/assets\n", - "\u001b[0m\n", - "\u001b[34m2021-05-29 00:01:39,083 sagemaker-training-toolkit INFO Reporting training SUCCESS\u001b[0m\n", - "\u001b[35m2021-05-29 00:02:09,114 sagemaker-training-toolkit INFO MPI process finished.\u001b[0m\n", - "\u001b[35m2021-05-29 00:02:09,115 sagemaker_tensorflow_container.training WARNING No model artifact is saved under path /opt/ml/model. 
Your training job will not save any model files to S3.\u001b[0m\n", - "\u001b[35mFor details of how to construct your training script see:\u001b[0m\n", - "\u001b[35mhttps://sagemaker.readthedocs.io/en/stable/using_tf.html#adapting-your-local-tensorflow-script\u001b[0m\n", - "\u001b[35m2021-05-29 00:02:09,115 sagemaker-training-toolkit INFO Reporting training SUCCESS\u001b[0m\n", - "\n", - "2021-05-29 00:02:38 Uploading - Uploading generated training model\n", - "2021-05-29 00:02:38 Completed - Training job completed\n", - "ProfilerReport-1622246073: NoIssuesFound\n", - "Training seconds: 536\n", - "Billable seconds: 536\n" - ] - } - ], + "outputs": [], "source": [ "estimator.fit()" ] @@ -1258,18 +161,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Storing s3://sagemaker-us-west-2-688520471316/tensorflow2-smdataparallel-mnist-2021-05-28-23-54-33-230/output/model.tar.gz as model_data\n", - "Stored 'model_data' (str)\n" - ] - } - ], + "outputs": [], "source": [ "model_data = estimator.model_data\n", "print(\"Storing {} as model_data\".format(model_data))\n", @@ -1278,46 +172,54 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "update_endpoint is a no-op in sagemaker>=2.\n", - "See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "----!" 
- ] - } - ], + "outputs": [], "source": [ "predictor = estimator.deploy(initial_instance_count=1, instance_type=\"ml.m4.xlarge\")" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensorflow2-smdataparallel-mnist-2021-05-29-00-02-50-266\n" - ] - } - ], + "outputs": [], "source": [ "print(predictor.endpoint_name)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import numpy as np\n", + "\n", + "(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data(path=\"/tmp/data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(10):\n", + " data = mnist_images[i].reshape(1, 28, 28, 1)\n", + "\n", + " predict_response = predictor.predict(data)\n", + "\n", + " print(\"========================================\")\n", + " label = mnist_labels[i]\n", + "\n", + " predict_label = np.argmax(predict_response[\"predictions\"])\n", + "\n", + " print(\"label is {}\".format(label))\n", + " print(\"prediction is {}\".format(predict_label))" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1329,19 +231,27 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predictor.delete_endpoint()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { + "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Environment (conda_tensorflow_p36)", + "display_name": "Python 3 (TensorFlow 2.1 Python 3.6 CPU Optimized)", "language": "python", - "name": "conda_tensorflow_p36" + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/tensorflow-2.1-cpu-py36" }, "language_info": { "codemirror_mode": { @@ -1353,7 
+263,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/use-cases/index.rst b/use-cases/index.rst index 06d06d0be8..9ab084bbf6 100644 --- a/use-cases/index.rst +++ b/use-cases/index.rst @@ -33,7 +33,7 @@ E-Commerce Personalization Computer Vision for Medical Imaging --------------------------- +----------------------------------- .. toctree:: :maxdepth: 1 @@ -42,3 +42,12 @@ Computer Vision for Medical Imaging computer_vision/2-metastases-detection-lineage-registry computer_vision/3-metastases-detection-deploy-predict computer_vision/4-metastases-detection-pipeline + + +Pipelines with NLP for Product Rating Prediction +------------------------------------------------ + +.. toctree:: + :maxdepth: 1 + + product_ratings_with_pipelines/pipelines_product_ratings diff --git a/use-cases/product_ratings_with_pipelines/README.md b/use-cases/product_ratings_with_pipelines/README.md new file mode 100644 index 0000000000..fdd87e4b6a --- /dev/null +++ b/use-cases/product_ratings_with_pipelines/README.md @@ -0,0 +1,40 @@ +# Amazon SageMaker Pipelines +## Training and deploying a text classification model using Amazon SageMaker Pipelines + +## Contents +1. [Background](#Background) +2. [Prerequisites](#Prereqs) +3. [Data](#Data) +4. [Approach](#Approach) +5. [Other Resources](#Other-Resources) + +--- + +# Background + +Amazon SageMaker Pipelines makes it easy for data scientists and engineers to build, automate, and scale end-to-end machine learning workflows. Machine learning workflows are complex, requiring iteration and experimentation across each step of the machine learning process, such as exploring and preparing data, experimenting with different algorithms, training and tuning models, and deploying models to production. Developing and managing these workflows can take weeks or months of coding and manually managing workflow dependencies can become complex. 
With Amazon SageMaker Pipelines, data science teams have an easy-to-use continuous integration and continuous delivery (CI/CD) service that simplifies the development and management of machine learning workflows at scale. + +In this notebook we use SageMaker Pipelines to train and deploy a text classification model to predict e-commerce product ratings based on customers’ product reviews. We’ll use BlazingText, a SageMaker built-in algorithm, to minimize the amount of effort required to train and deploy the model. BlazingText provides highly optimized implementations of Word2vec and text classification algorithms. + +# Prereqs + +You will need an AWS account to use this solution. Sign up for an [account](https://aws.amazon.com/) before you proceed. + +You will also need to have permission to use [Amazon SageMaker Studio](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-studio.html). All AWS permissions can be managed through [AWS IAM](https://aws.amazon.com/iam/). Admin users will have the required permissions, but please contact your account's AWS administrator if your user account doesn't have the required permissions. + +# Data + +To train the model, we’ll use a sample of data containing e-commerce reviews and associated product ratings. Our pipeline will start with processing the data for model training and will proceed with model training, evaluation, registry and deployment. The Women’s E-Commerce Clothing Reviews dataset has been made available under a Creative Commons license. A copy of the dataset has been saved in a sample data Amazon S3 bucket. In the first section of the notebook, we’ll walk through how to download the data and get started with building the ML workflow as a SageMaker pipeline. + +# Approach + +Our ML workflow will be built in the following SageMaker pipeline steps: +* Data processing step - in this step we use a scikit-learn processor to process the training data by cleaning up the review text (e.g. 
remove punctuation and convert to lower case), rebalancing the dataset, creating review categories and generating the training, testing and validation datasets +* Model training step - in this step we create a SageMaker estimator and specify model training hyperparameters and the location of training and validation data +* Create model step - in the create model step we pass the model data from the training step +* Deploy model step - the deploy model step uses a scikit-learn processor to deploy the trained model +* Register model step - in the final model step we submit the trained model to the model registry. We can optionally configure this step to require manual approval before submission. + +# Other Resources + +For additional SageMaker Pipelines examples, see [Orchestrating Jobs with Amazon SageMaker Model Building Pipelines](https://sagemaker-examples.readthedocs.io/en/latest/sagemaker-pipelines/tabular/abalone_build_train_deploy/sagemaker-pipelines-preprocess-train-evaluate-batch-transform.html) or the related [GitHub repo](https://github.com/aws/amazon-sagemaker-examples/tree/master/sagemaker-pipelines). 
\ No newline at end of file diff --git a/use-cases/product_ratings_with_pipelines/code/deploy_model.py b/use-cases/product_ratings_with_pipelines/code/deploy_model.py new file mode 100644 index 0000000000..e558253458 --- /dev/null +++ b/use-cases/product_ratings_with_pipelines/code/deploy_model.py @@ -0,0 +1,56 @@ + +import time +from datetime import datetime +import boto3 +import argparse + + +# Parse argument variables passed via the DeployModel processing step +parser = argparse.ArgumentParser() +parser.add_argument('--model-name', type=str) +parser.add_argument('--region', type=str) +parser.add_argument('--endpoint-instance-type', type=str) +parser.add_argument('--endpoint-name', type=str) +args = parser.parse_args() + +region = args.region +boto3.setup_default_session(region_name=region) +sagemaker_boto_client = boto3.client('sagemaker') + +# truncate name per sagameker length requirememnts (63 char max) if necessary +endpoint_config_name = f'{args.endpoint_name}-config-{datetime.now().strftime("%Y%m%d-%H%M%S")}' + +# create new endpoint config file +create_ep_config_response = sagemaker_boto_client.create_endpoint_config( + EndpointConfigName=endpoint_config_name, + ProductionVariants=[{ + 'InstanceType': args.endpoint_instance_type, + 'InitialVariantWeight': 1, + 'InitialInstanceCount': 1, + 'ModelName': args.model_name, + 'VariantName': 'AllTraffic' + }]) + +print("ModelName: {}".format(args.model_name)) + +# create endpoint if model endpoint does not already exist, otherwise update the endpoint +try: + create_endpoint_response = sagemaker_boto_client.create_endpoint( + EndpointName=args.endpoint_name, + EndpointConfigName=endpoint_config_name + ) +except: + create_endpoint_response = sagemaker_boto_client.update_endpoint( + EndpointName=args.endpoint_name, + EndpointConfigName=endpoint_config_name + ) + +endpoint_info = sagemaker_boto_client.describe_endpoint(EndpointName=args.endpoint_name) +endpoint_status = endpoint_info['EndpointStatus'] + +while 
endpoint_status != 'InService': + endpoint_info = sagemaker_boto_client.describe_endpoint(EndpointName=args.endpoint_name) + endpoint_status = endpoint_info['EndpointStatus'] + print('Endpoint status:', endpoint_status) + if endpoint_status != 'InService': + time.sleep(30) diff --git a/use-cases/product_ratings_with_pipelines/code/preprocessing.py b/use-cases/product_ratings_with_pipelines/code/preprocessing.py new file mode 100644 index 0000000000..11f90ee940 --- /dev/null +++ b/use-cases/product_ratings_with_pipelines/code/preprocessing.py @@ -0,0 +1,53 @@ + +import numpy as np +import pandas as pd +import string +from sklearn.utils import resample + +base_dir = "/opt/ml/processing" + +df = pd.read_csv( f"{base_dir}/input/Womens Clothing E-Commerce Reviews.csv") +df = df[df['Review Text'].notna()] # drop rows where Review text is missing + +def process_review(text): + punctuation = string.punctuation + review = text.lower() + review = review.replace("\r\n", " ").replace("\n\n", " ") + translator = str.maketrans("","", punctuation) + review = review.translate(translator) + return review + +# create columns for concat reviews and new labels +df['Complete_Review'] = df['Title'] + ' ' + df['Review Text'] +df = df[df['Complete_Review'].notna()] # drop rows where review text is missing +df['Label'] = df['Rating'].map({1:'negative',2:'negative',3:'none',4:'none',5:'positive'}) +df = df.loc[df['Label'].isin(['negative','positive'])] # only use positive and negative reviews +df['Review'] = df['Complete_Review'].astype(str).apply(process_review) +df['Processed'] = '__label__' + df['Label'].astype(str) + ' ' + df['Review'] + +# create train:test split +train, validation, test = np.split(df, [int(0.7 * len(df)), int(0.85 * len(df))]) + +# deal with unbalanced classes +# only include resampling for training set so no data leakeage for validation sets +positive = train.loc[train['Label']=='positive'] +negative = train.loc[train['Label']=='negative'] + +# oversample the 
minority classes +negative_oversample = resample(negative, replace=True, n_samples=len(positive)) + +# remake training set using balanced class camples +train = pd.concat([positive,negative_oversample]) + +# create Series datasets for BlazingText format +train = train['Processed'] +validation = validation['Processed'] +test = test['Processed'] + +# save datasets +pd.DataFrame(train).to_csv(f"{base_dir}/train/train.csv", header=False, index=False) +pd.DataFrame(validation).to_csv(f"{base_dir}/validation/validation.csv", header=False, index=False) +pd.DataFrame(test).to_csv(f"{base_dir}/test/test.csv", header=False, index=False) + +print(f"Number of reviews in the training dataset: {train.shape[0]}") +print(f"Number of reviews in the validation set: {validation.shape[0]}") diff --git a/use-cases/product_ratings_with_pipelines/pipelines_product_ratings.ipynb b/use-cases/product_ratings_with_pipelines/pipelines_product_ratings.ipynb new file mode 100644 index 0000000000..3caae986c0 --- /dev/null +++ b/use-cases/product_ratings_with_pipelines/pipelines_product_ratings.ipynb @@ -0,0 +1,1001 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training and Deploying a Text Classification model using Amazon SageMaker Pipelines" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Background\n", + "\n", + "[Amazon SageMaker Pipelines](https://aws.amazon.com/sagemaker/pipelines/) makes it easy for data scientists and engineers to build, automate, and scale end-to-end machine learning workflows. Machine learning workflows are complex, requiring iteration and experimentation across each step of the machine learning process, such as exploring and preparing data, experimenting with different algorithms, training and tuning models, and deploying models to production. Developing and managing these workflows can take weeks or months of coding and manually managing workflow dependencies can become complex. 
With Amazon SageMaker Pipelines, data science teams have an easy-to-use continuous integration and continuous delivery (CI/CD) service that simplifies the development and management of machine learning workflows at scale.\n", + "\n", + "In this notebook, we use SageMaker Pipelines to train and deploy a text classification model to predict e-commerce product ratings based on customers’ product reviews. We’ll use BlazingText, one of the SageMaker [built-in algorithms](https://docs.aws.amazon.com/sagemaker/latest/dg/algorithms-choose.html#built-in-algorithms-benefits), to minimize the amount of effort required to train and deploy the model. [BlazingText](https://docs.aws.amazon.com/sagemaker/latest/dg/blazingtext.html) provides highly optimized implementations of Word2vec and text classification algorithms.\n", + "\n", + "To train the model, we’ll use a sample of data containing e-commerce reviews and associated product ratings. This data has been generated by customers that have rated products on a scale between 1 (worst) and 5 (best), and have left a short review. Our pipeline will start with processing the data for model training and will proceed with model training, evaluation, registry and deployment. The [Women’s E-Commerce Clothing Reviews](https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews/) dataset has been made available under a [Creative Commons license](https://creativecommons.org/publicdomain/zero/1.0/). A copy of the dataset has been saved in a sample data Amazon S3 bucket. 
In the first section of the notebook, we’ll walk through how to download the data and get started with building the ML workflow as a SageMaker pipeline.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Contents\n", + "\n", + "- [Project setup](#setup)\n", + " - [Set up SageMaker environment](#environment)\n", + " - [Obtain training data](#obtain-data)\n", + "- [Step 0: Prepare pipeline parameters](#pipeline-parameters)\n", + "- [Step 1: Create dataset and train/test split](#dataset-train-test)\n", + "- [Step 2: Train text classification model using BlazingText](#train-model)\n", + "- [Step 3: Model creation](#create-model)\n", + "- [Step 4: Deploy model](#deploy-model)\n", + "- [Step 5: Register model](#register-model)\n", + "- [Step 6: Create pipeline](#create-pipeline)\n", + "- [Use deployed model to return predictions](#return-predictions)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Project setup\n", + "\n", + "In this section, we'll install some necessary packages, set parameters that we can use in the pipeline and download the training data." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Set up SageMaker environment\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#install necessary packages\n", + "import boto3\n", + "import pandas as pd\n", + "import numpy as np\n", + "import sagemaker\n", + "print(f'SageMaker version: {sagemaker.__version__}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#import necessary execution role so that you can read from S3 buckets\n", + "role = sagemaker.get_execution_role()\n", + "\n", + "#source default session parameters (region, default S3 bucket etc)\n", + "region = boto3.Session().region_name\n", + "sagemaker_session = sagemaker.Session()\n", + "s3_client = boto3.client('s3', region_name=region)\n", + "sagemaker_client = boto3.client(\"sagemaker-runtime\")\n", + "default_bucket = sagemaker_session.default_bucket()\n", + "prefix = 'sagemaker-pipelines-nlp-demo'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Obtain training data\n", + "\n", + "To get started, we'll download the training data from a public S3 bucket and then upload the data to our own S3 bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p data\n", + "!wget https://sagemaker-sample-files.s3.amazonaws.com/datasets/tabular/womens_clothing_ecommerce/Womens_Clothing_E-Commerce_Reviews.csv -O 'data/Womens Clothing E-Commerce Reviews.csv'\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Clothing IDAgeTitleReview TextRatingRecommended INDPositive Feedback CountDivision NameDepartment NameClass Name
076733NaNAbsolutely wonderful - silky and sexy and comf...410InitmatesIntimateIntimates
1108034NaNLove this dress! it's sooo pretty. i happene...514GeneralDressesDresses
2107760Some major design flawsI had such high hopes for this dress and reall...300GeneralDressesDresses
3104950My favorite buy!I love, love, love this jumpsuit. it's fun, fl...510General PetiteBottomsPants
484747Flattering shirtThis shirt is very flattering to all due to th...516GeneralTopsBlouses
\n", + "
" + ], + "text/plain": [ + " Clothing ID Age Title \\\n", + "0 767 33 NaN \n", + "1 1080 34 NaN \n", + "2 1077 60 Some major design flaws \n", + "3 1049 50 My favorite buy! \n", + "4 847 47 Flattering shirt \n", + "\n", + " Review Text Rating Recommended IND \\\n", + "0 Absolutely wonderful - silky and sexy and comf... 4 1 \n", + "1 Love this dress! it's sooo pretty. i happene... 5 1 \n", + "2 I had such high hopes for this dress and reall... 3 0 \n", + "3 I love, love, love this jumpsuit. it's fun, fl... 5 1 \n", + "4 This shirt is very flattering to all due to th... 5 1 \n", + "\n", + " Positive Feedback Count Division Name Department Name Class Name \n", + "0 0 Initmates Intimate Intimates \n", + "1 4 General Dresses Dresses \n", + "2 0 General Dresses Dresses \n", + "3 0 General Petite Bottoms Pants \n", + "4 6 General Tops Blouses " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('data/Womens Clothing E-Commerce Reviews.csv', index_col=0)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A quick look at the distribution of product ratings shows us that the dataset is imbalanced; there are more observations with good product ratings than poor product ratings. This inbalance in the training data set may lead to a tendency to favor the majority classes (high product ratings) and overall poor model accuracy. During the data processing step, we'll mitigate the impact of the imbalanced dataset by:\n", + "\n", + "* Grouping ratings of 1 & 2 in to a larger 'negative review' category\n", + "* Oversampling the 'negative review+' minority class\n", + "* Ignoring neutral reviews (ratings of 3 & 4) to remove reviews that may be difficult to distinguish from positive and negative reviews. 
While this means that we will be unable to classify reviews as neutral, this may enable the model to better flag positive and negative reviews and therefore identify customers that require additional customer service attention or who are particularly likely to purchase additional products.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot = df.groupby('Rating')['Rating'].count().plot(kind='bar', title = 'Count of Product Rating')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#upload the data to your default S3 bucket or another S3 bucket of your choosing\n", + "local_path = \"data/Womens Clothing E-Commerce Reviews.csv\"\n", + "\n", + "base_uri = f\"s3://{default_bucket}/{prefix}/data\"\n", + "input_data_uri = sagemaker.s3.S3Uploader.upload(\n", + " local_path=local_path,\n", + " desired_s3_uri=base_uri,\n", + ")\n", + "print(input_data_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Step 0: Define parameters to parametrize pipeline execution\n", + "\n", + "Using SageMaker Pipelines, we can define the steps to be included in a pipeline but then use parameters to modify that pipeline when we go to execute the pipeline, without having to modify the pipeline definition. 
We'll provide some default parameter values that can be overridden on pipeline execution.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.workflow.parameters import (ParameterInteger, ParameterString)\n", + "\n", + "#specify location of input data\n", + "input_data = ParameterString(\n", + " name=\"InputData\",\n", + " default_value=input_data_uri,\n", + ")\n", + "\n", + "#specify default number of instances for processing step\n", + "processing_instance_count = ParameterInteger(\n", + " name=\"ProcessingInstanceCount\",\n", + " default_value=1\n", + ")\n", + "\n", + "#specify default instance type for processing step\n", + "processing_instance_type = ParameterString(\n", + " name=\"ProcessingInstanceType\",\n", + " default_value=\"ml.m4.xlarge\"\n", + ")\n", + "\n", + "#specify default instance type for training step\n", + "train_instance_type = ParameterString(\n", + " name=\"TrainingInstanceType\",\n", + " default_value=\"ml.m4.xlarge\",\n", + ")\n", + "\n", + "#specify default model approval mode\n", + "model_approval_status = ParameterString(\n", + " name=\"ModelApprovalStatus\",\n", + " default_value=\"Approved\"\n", + ")\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Step 1: Create Dataset and Train/Test Split\n", + "\n", + "In this step, an SKLearn processor is used to prepare the data for model training. This data processing includes converting the review text to lowercase, removing carriage returns and line breaks and removing punctuation. We also group categories and conduct oversampling to reduce training dataset imbalance. 
Additionally, we process the reviews so that the training input data is in the format expected by the BlazingText algorithm (see more information here: https://docs.aws.amazon.com/sagemaker/latest/dg/blazingtext.html#bt-inputoutput) and we split the data in to training, testing and validation datasets.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile code/preprocessing.py\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import string\n", + "from sklearn.utils import resample\n", + "\n", + "base_dir = \"/opt/ml/processing\"\n", + "\n", + "df = pd.read_csv( f\"{base_dir}/input/Womens Clothing E-Commerce Reviews.csv\")\n", + "df = df[df['Review Text'].notna()] # drop rows where Review text is missing\n", + "\n", + "def process_review(text):\n", + " punctuation = string.punctuation\n", + " review = text.lower()\n", + " review = review.replace(\"\\r\\n\", \" \").replace(\"\\n\\n\", \" \")\n", + " translator = str.maketrans(\"\",\"\", punctuation)\n", + " review = review.translate(translator)\n", + " return review\n", + " \n", + "# create columns for concat reviews and new labels\n", + "df['Complete_Review'] = df['Title'] + ' ' + df['Review Text']\n", + "df = df[df['Complete_Review'].notna()] # drop rows where review text is missing\n", + "df['Label'] = df['Rating'].map({1:'negative',2:'negative',3:'none',4:'none',5:'positive'})\n", + "df = df.loc[df['Label'].isin(['negative','positive'])] # only use positive and negative reviews\n", + "df['Review'] = df['Complete_Review'].astype(str).apply(process_review)\n", + "df['Processed'] = '__label__' + df['Label'].astype(str) + ' ' + df['Review'] \n", + "\n", + "# create train:test split\n", + "train, validation, test = np.split(df, [int(0.7 * len(df)), int(0.85 * len(df))])\n", + "\n", + "# deal with 
unbalanced classes\n", + "# only include resampling for training set so no data leakeage for validation sets\n", + "positive = train.loc[train['Label']=='positive']\n", + "negative = train.loc[train['Label']=='negative']\n", + "\n", + "# oversample the minority classes\n", + "negative_oversample = resample(negative, replace=True, n_samples=len(positive))\n", + "\n", + "# remake training set using balanced class camples\n", + "train = pd.concat([positive,negative_oversample])\n", + "\n", + "# create Series datasets for BlazingText format\n", + "train = train['Processed']\n", + "validation = validation['Processed']\n", + "test = test['Processed']\n", + "\n", + "# save datasets\n", + "pd.DataFrame(train).to_csv(f\"{base_dir}/train/train.csv\", header=False, index=False)\n", + "pd.DataFrame(validation).to_csv(f\"{base_dir}/validation/validation.csv\", header=False, index=False)\n", + "pd.DataFrame(test).to_csv(f\"{base_dir}/test/test.csv\", header=False, index=False) \n", + "\n", + "print(f\"Number of reviews in the training dataset: {train.shape[0]}\")\n", + "print(f\"Number of reviews in the validation set: {validation.shape[0]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.sklearn.processing import SKLearnProcessor\n", + "\n", + "framework_version = \"0.23-1\"\n", + "\n", + "sklearn_processor = SKLearnProcessor(\n", + " framework_version=framework_version,\n", + " instance_type=processing_instance_type,\n", + " instance_count=processing_instance_count,\n", + " base_job_name=\"sklearn-nlp-process\",\n", + " role=role,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", + "from sagemaker.workflow.steps import ProcessingStep\n", + "\n", + "s3_client.upload_file(Filename='./code/preprocessing.py', Bucket=default_bucket, 
Key=f'{prefix}/code/preprocessing.py')\n", + "preprocess_script_uri = f's3://{default_bucket}/{prefix}/code/preprocessing.py'\n", + "\n", + "process_step = ProcessingStep(\n", + " name=\"BTDemoProcessStep\",\n", + " processor=sklearn_processor,\n", + " inputs=[\n", + " ProcessingInput(source=input_data, destination=\"/opt/ml/processing/input\"),\n", + " ],\n", + " outputs=[\n", + " ProcessingOutput(output_name=\"train\", source=\"/opt/ml/processing/train\"),\n", + " ProcessingOutput(output_name=\"validation\", source=\"/opt/ml/processing/validation\"),\n", + " ProcessingOutput(output_name=\"test\", source=\"/opt/ml/processing/test\"),\n", + " ],\n", + " code=preprocess_script_uri,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Step 2: Train text classification model using BlazingText" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# set up estimator:\n", + "\n", + "from sagemaker.estimator import Estimator\n", + "\n", + "bt_estimator = Estimator(\n", + " role=role,\n", + " instance_type=train_instance_type,\n", + " instance_count=1,\n", + " image_uri=sagemaker.image_uris.retrieve(\"blazingtext\", region),\n", + " output_path=f's3://{default_bucket}/{prefix}/training_jobs',\n", + " base_job_name='bt-model-estimator',\n", + " input_mode = 'File'\n", + ") \n", + "\n", + "#for more info on hyperparameters, see: https://docs.aws.amazon.com/sagemaker/latest/dg/blazingtext.html\n", + "bt_estimator.set_hyperparameters(mode=\"supervised\",\n", + " epochs=25,\n", + " learning_rate=0.02,\n", + " min_count=2,\n", + " early_stopping=True,\n", + " patience=4,\n", + " min_epochs=10,\n", + " word_ngrams=3\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# set up model training step \n", + "from sagemaker.inputs import TrainingInput\n", + "from sagemaker.workflow.steps import 
TrainingStep\n", + "\n", + "train_step = TrainingStep(\n", + " name='BTDemoTrainStep',\n", + " estimator=bt_estimator,\n", + " inputs={\n", + " 'train': sagemaker.inputs.TrainingInput(\n", + " s3_data=process_step.properties.ProcessingOutputConfig.Outputs['train'].S3Output.S3Uri,\n", + " content_type=\"text/csv\"\n", + " ),\n", + " 'validation': sagemaker.inputs.TrainingInput(\n", + " s3_data=process_step.properties.ProcessingOutputConfig.Outputs['test'].S3Output.S3Uri,\n", + " content_type=\"text/csv\"\n", + " )\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Step 3: Model creation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.workflow.steps import CreateModelStep\n", + "\n", + "model = sagemaker.model.Model(\n", + " name='nlp-blaztext-model',\n", + " image_uri=train_step.properties.AlgorithmSpecification.TrainingImage,\n", + " model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,\n", + " sagemaker_session=sagemaker_session,\n", + " role=role\n", + ")\n", + "\n", + "inputs = sagemaker.inputs.CreateModelInput(\n", + " instance_type=\"ml.m4.xlarge\"\n", + ")\n", + "\n", + "create_model_step = CreateModelStep(\n", + " name=\"BTDemoCreatemodelStep\",\n", + " model=model,\n", + " inputs=inputs\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Step 4: Deploy model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile code/deploy_model.py\n", + "\n", + "import time\n", + "from datetime import datetime\n", + "import boto3\n", + "import argparse\n", + "\n", + "\n", + "# Parse argument variables passed via the DeployModel processing step\n", + "parser = argparse.ArgumentParser()\n", + "parser.add_argument('--model-name', type=str)\n", + "parser.add_argument('--region', type=str)\n", + 
"parser.add_argument('--endpoint-instance-type', type=str)\n", + "parser.add_argument('--endpoint-name', type=str)\n", + "args = parser.parse_args()\n", + "\n", + "region = args.region\n", + "boto3.setup_default_session(region_name=region)\n", + "sagemaker_boto_client = boto3.client('sagemaker')\n", + "\n", + "# truncate name per sagameker length requirememnts (63 char max) if necessary\n", + "endpoint_config_name = f'{args.endpoint_name}-config-{datetime.now().strftime(\"%Y%m%d-%H%M%S\")}'\n", + "\n", + "# create new endpoint config file \n", + "create_ep_config_response = sagemaker_boto_client.create_endpoint_config(\n", + " EndpointConfigName=endpoint_config_name,\n", + " ProductionVariants=[{\n", + " 'InstanceType': args.endpoint_instance_type,\n", + " 'InitialVariantWeight': 1,\n", + " 'InitialInstanceCount': 1,\n", + " 'ModelName': args.model_name,\n", + " 'VariantName': 'AllTraffic'\n", + " }])\n", + "\n", + "print(\"ModelName: {}\".format(args.model_name))\n", + "\n", + "# create endpoint if model endpoint does not already exist, otherwise update the endpoint\n", + "try:\n", + " create_endpoint_response = sagemaker_boto_client.create_endpoint(\n", + " EndpointName=args.endpoint_name,\n", + " EndpointConfigName=endpoint_config_name\n", + " )\n", + "except:\n", + " create_endpoint_response = sagemaker_boto_client.update_endpoint(\n", + " EndpointName=args.endpoint_name,\n", + " EndpointConfigName=endpoint_config_name\n", + " )\n", + "\n", + "endpoint_info = sagemaker_boto_client.describe_endpoint(EndpointName=args.endpoint_name)\n", + "endpoint_status = endpoint_info['EndpointStatus']\n", + "\n", + "while endpoint_status != 'InService':\n", + " endpoint_info = sagemaker_boto_client.describe_endpoint(EndpointName=args.endpoint_name)\n", + " endpoint_status = endpoint_info['EndpointStatus']\n", + " print('Endpoint status:', endpoint_status)\n", + " if endpoint_status != 'InService':\n", + " time.sleep(30)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s3_client.upload_file(Filename='./code/deploy_model.py', Bucket=default_bucket, Key=f'{prefix}/code/deploy_model.py')\n", + "deploy_model_script_uri = f's3://{default_bucket}/{prefix}/code/deploy_model.py'\n", + "pipeline_endpoint_name = 'nlp-blaztext-model-endpoint'\n", + "\n", + "deployment_instance_type = \"ml.m4.xlarge\"\n", + "\n", + "deploy_model_processor = SKLearnProcessor(\n", + " framework_version='0.23-1',\n", + " role=role,\n", + " instance_type='ml.m5.xlarge',\n", + " instance_count=1,\n", + " volume_size_in_gb=60,\n", + " base_job_name='nlp-blaztext-deploy-model',\n", + " sagemaker_session=sagemaker_session)\n", + "\n", + "deploy_step = ProcessingStep(\n", + " name='BTDemoDeployStep',\n", + " processor=deploy_model_processor,\n", + " job_arguments=[\n", + " \"--model-name\", create_model_step.properties.ModelName, \n", + " \"--region\", region,\n", + " \"--endpoint-instance-type\", deployment_instance_type,\n", + " \"--endpoint-name\", pipeline_endpoint_name\n", + " ],\n", + " code=deploy_model_script_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Step 5: Register model\n", + "\n", + "In this step you will use the ParameterString `model_approval_status` defined at the outset of the pipeline code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.workflow.step_collections import RegisterModel\n", + "\n", + "register_step = RegisterModel(\n", + " name=\"BTDemoRegistermodelStep\",\n", + " estimator=bt_estimator,\n", + " model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,\n", + " content_types=[\"text/csv\"],\n", + " response_types=[\"text/csv\"],\n", + " inference_instances=[\"ml.t2.medium\", \"ml.m5.xlarge\"],\n", + " transform_instances=[\"ml.m5.xlarge\"],\n", + " model_package_group_name=prefix,\n", + " approval_status=model_approval_status,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Step 6: Create pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.workflow.pipeline import Pipeline\n", + "\n", + "#run full pipeline\n", + "steps_full = [process_step,\n", + " train_step,\n", + " create_model_step,\n", + " deploy_step,\n", + " register_step]\n", + "\n", + "#run data processing step\n", + "steps_preprocessing = [process_step]\n", + "\n", + "pipeline_name = 'BlazingTextPipeline'\n", + "\n", + "pipeline = Pipeline(\n", + " name=pipeline_name,\n", + " parameters=[\n", + " processing_instance_type,\n", + " processing_instance_count,\n", + " train_instance_type,\n", + " model_approval_status,\n", + " input_data\n", + " ],\n", + " steps=steps_full, #switch to steps_preprocessing if you would like to run only the data processing step\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Examine the JSON pipeline definition:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import json\n", + "\n", + "definition = json.loads(pipeline.definition())\n", + "definition" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "Submit the pipeline definition to the SageMaker Pipelines service to create a pipeline if it doesn't exist, or update the pipeline if it does. The role passed in is used by SageMaker Pipelines to create all the jobs defined in the steps." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.upsert(role_arn=role)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Start a pipeline execution:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "execution = pipeline.start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Describe the pipeline execution status to ensure that it has been created and started successfully:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "execution.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Wait for the execution to finish:" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "execution.wait()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "List the execution steps and their status:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "execution.list_steps()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Use deployed model to return predictions\n", + "\n", + "Now that our model is deployed as an endpoint, we can submit sample product reviews and return predicted product ratings. 
For more information on BlazingText inference, see the [BlazingText algorithm documentation](https://sagemaker-examples.readthedocs.io/en/latest/introduction_to_amazon_algorithms/blazingtext_text_classification_dbpedia/blazingtext_text_classification_dbpedia.html#Hosting-/-Inference)." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "import string \n", + "\n", + "def process_review(text):\n", + " punctuation = string.punctuation\n", + " review = text.lower()\n", + " review = review.replace(\"\\r\\n\", \" \").replace(\"\\n\\n\", \" \")\n", + " translator = str.maketrans(\"\",\"\", punctuation)\n", + " review = review.translate(translator)\n", + " return review" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "sentences = [\"i loved this blouse when i saw it on-line, and the fabric is so soft!\",\n", + " \"love the top, but very small to the size. ordered a medium and had to send back in exchange for x-large\", \n", + " \"horrible! 
this top was scratchy and too small.\"]\n", + "# process the reviews to predict the same as training data\n", + "processed_sentences = [ process_review(sent) for sent in sentences ]\n", + "\n", + "payload = {\"instances\" : processed_sentences}" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "def get_predictions(payload, endpoint_name, client):\n", + " response = client.invoke_endpoint(EndpointName=endpoint_name,\n", + " Body=json.dumps(payload),\n", + " ContentType='application/json')\n", + " predictions = json.loads(response['Body'].read().decode('utf-8'))\n", + " return list(zip(payload['instances'], predictions))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('i loved this blouse when i saw it online and the fabric is so soft',\n", + " {'label': ['__label__positive'], 'prob': [0.9529946446418762]}),\n", + " ('love the top but very small to the size ordered a medium and had to send back in exchange for xlarge',\n", + " {'label': ['__label__negative'], 'prob': [0.5224761962890625]}),\n", + " ('horrible this top was scratchy and too small',\n", + " {'label': ['__label__negative'], 'prob': [0.999985933303833]})]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# return predictions\n", + "get_predictions(payload, pipeline_endpoint_name, sagemaker_client)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Clean up\n", + "\n", + "To avoid incurring unnecessary charges, delete the endpoints and resources that you created while running the notebook:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#sagemaker_boto_client.delete_endpoint(EndpointName=pipeline_endpoint_name)\n", + "#print(f\"Deleting endpoint: {pipeline_endpoint_name}\")" + ] + } + ], + "metadata": { 
+ "instance_type": "ml.m5.large", + "kernelspec": { + "display_name": "Python 3 (Data Science)", + "language": "python", + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}