diff --git a/docs/PAIMode.md b/docs/PAIMode.md index d8470884d4..05da292afa 100644 --- a/docs/PAIMode.md +++ b/docs/PAIMode.md @@ -48,6 +48,7 @@ Compared with LocalMode and [RemoteMachineMode](RemoteMachineMode.md), trial con * Required key. Should be positive number based on your trial program's memory requirement * image * Required key. In pai mode, your trial program will be scheduled by OpenPAI to run in [Docker container](https://www.docker.com/). This key is used to specify the Docker image used to create the container in which your traill will run. + * We have already built a Docker image [msranni/nni](https://hub.docker.com/r/msranni/nni/) on [Docker Hub](https://hub.docker.com/). It contains the NNI Python packages, Node modules and JavaScript artifact files required to start an experiment, and all of NNI's dependencies. The Dockerfile used to build this image can be found [here](../deployment/Dockerfile.build.base). You can either use this image directly in your config file, or build your own image based on it. * dataDir * Optional key. It specifies the HDFS data direcotry for trial to download data. The format should be something like hdfs://{your HDFS host}:9000/{your data directory} * outputDir diff --git a/docs/RELEASE.md b/docs/RELEASE.md index e7ef20c729..61b4c78c72 100644 --- a/docs/RELEASE.md +++ b/docs/RELEASE.md @@ -1,9 +1,9 @@ # Release 0.2.0 - 9/29/2018 ## Major Features - * Support for [OpenPAI](https://github.com/Microsoft/pai) (aka pai) Training Service + * Support [OpenPAI](https://github.com/Microsoft/pai) (aka pai) Training Service (See [here](./PAIMode.md) for instructions about how to submit NNI job in pai mode) * Support training services on pai mode. 
NNI trials will be scheduled to run on OpenPAI cluster * NNI trial's output (including logs and model file) will be copied to OpenPAI HDFS for further debugging and checking - * Support [SMAC](https://www.cs.ubc.ca/~hutter/papers/10-TR-SMAC.pdf) tuner + * Support [SMAC](https://www.cs.ubc.ca/~hutter/papers/10-TR-SMAC.pdf) tuner (See [here](../src/sdk/pynni/nni/README.md) for instructions about how to use SMAC tuner) * [SMAC](https://www.cs.ubc.ca/~hutter/papers/10-TR-SMAC.pdf) is based on Sequential Model-Based Optimization (SMBO). It adapts the most prominent previously used model class (Gaussian stochastic process models) and introduces the model class of random forests to SMBO to handle categorical parameters. The SMAC supported by NNI is a wrapper on [SMAC3](https://github.com/automl/SMAC3) * Support NNI installation on [conda](https://conda.io/docs/index.html) and python virtual environment * Others diff --git a/docs/StartExperiment.md b/docs/StartExperiment.md new file mode 100644 index 0000000000..30f736ff15 --- /dev/null +++ b/docs/StartExperiment.md @@ -0,0 +1,33 @@ +How to start an experiment +=== +## 1.Introduction +There are a few steps to start a new nni experiment; here is the process. + +## 2.Details +### 2.1 Check environment +The first step to start an experiment is to check whether the environment is ready: nnictl will check whether an old experiment is running or the port of the restful server is occupied. +NNICTL will also validate the content of the config yaml file, to ensure the experiment config is in the correct format. + +### 2.2 Start restful server +After checking the environment, nnictl will start a restful server process to manage the nni experiment; the default port is 51188. + +### 2.3 Check restful server +Before the next steps, nnictl will check whether the restful server has started successfully; otherwise the starting process will stop and show an error message. 
+ +### 2.4 Set experiment config +NNICTL needs to set the experiment config before starting an experiment; the experiment config includes the config values in the config yaml file. + +### 2.5 Check experiment config +NNICTL will ensure the request to set config is successfully executed. + +### 2.6 Start Web UI +NNICTL will start a Web UI process to show Web UI information; the default port of Web UI is 8080. + +### 2.7 Check Web UI +If Web UI is not successfully started, nnictl will show a warning message and continue to start the experiment. + +### 2.8 Start Experiment +This is the most important step of starting an nni experiment: nnictl will call the restful server process to set up an experiment. + +### 2.9 Check experiment +After starting the experiment, nnictl will check whether the experiment was created correctly, and show more information about this experiment to users. \ No newline at end of file diff --git a/docs/img/experiment_process.jpg b/docs/img/experiment_process.jpg new file mode 100644 index 0000000000..141e41cad9 Binary files /dev/null and b/docs/img/experiment_process.jpg differ diff --git a/examples/trials/auto-gbdt/config.yml b/examples/trials/auto-gbdt/config.yml index 8a2569d1a8..e6f3b963ac 100644 --- a/examples/trials/auto-gbdt/config.yml +++ b/examples/trials/auto-gbdt/config.yml @@ -3,7 +3,7 @@ experimentName: example_auto-gbdt trialConcurrency: 1 maxExecDuration: 10h maxTrialNum: 10 -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: local searchSpacePath: search_space.json #choice: true, false diff --git a/examples/trials/auto-gbdt/config_pai.yml b/examples/trials/auto-gbdt/config_pai.yml new file mode 100644 index 0000000000..26577cf83a --- /dev/null +++ b/examples/trials/auto-gbdt/config_pai.yml @@ -0,0 +1,36 @@ +authorName: default +experimentName: example_auto-gbdt +trialConcurrency: 1 +maxExecDuration: 10h +maxTrialNum: 10 +#choice: local, remote, pai +trainingServicePlatform: pai +searchSpacePath: search_space.json +#choice: true, 
false +useAnnotation: false +tuner: + #choice: TPE, Random, Anneal, Evolution, + #SMAC (SMAC should be installed through nnictl) + builtinTunerName: TPE + classArgs: + #choice: maximize, minimize + optimize_mode: minimize +trial: + command: python3 main.py + codeDir: . + gpuNum: 0 + cpuNum: 1 + memoryMB: 8196 + #The docker image to run nni job on pai + image: openpai/pai.example.tensorflow + #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' + hdfsDataDir: hdfs://10.10.10.10:9000/username/nni + #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' + hdfsOutputDir: hdfs://10.10.10.10:9000/username/nni +paiConfig: + #The username to login pai + userName: username + #The password to login pai + passWord: password + #The host of restful server of pai + host: 10.10.10.10 \ No newline at end of file diff --git a/examples/trials/ga_squad/config.yml b/examples/trials/ga_squad/config.yml index a0b1480992..c6fec5bcbc 100644 --- a/examples/trials/ga_squad/config.yml +++ b/examples/trials/ga_squad/config.yml @@ -3,7 +3,7 @@ experimentName: example_ga_squad trialConcurrency: 1 maxExecDuration: 1h maxTrialNum: 10 -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: local #choice: true, false useAnnotation: false diff --git a/examples/trials/ga_squad/config_pai.yml b/examples/trials/ga_squad/config_pai.yml new file mode 100644 index 0000000000..56c2d33069 --- /dev/null +++ b/examples/trials/ga_squad/config_pai.yml @@ -0,0 +1,34 @@ +authorName: default +experimentName: example_ga_squad +trialConcurrency: 1 +maxExecDuration: 1h +maxTrialNum: 10 +#choice: local, remote, pai +trainingServicePlatform: pai +#choice: true, false +useAnnotation: false +tuner: + codeDir: ../tuners/ga_customer_tuner + classFileName: customer_tuner.py + className: CustomerTuner + classArgs: + optimize_mode: maximize +trial: + command: python3 trial.py + codeDir: . 
+ gpuNum: 0 + cpuNum: 1 + memoryMB: 8196 + #The docker image to run nni job on pai + image: openpai/pai.example.tensorflow + #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' + hdfsDataDir: hdfs://10.10.10.10:9000/username/nni + #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' + hdfsOutputDir: hdfs://10.10.10.10:9000/username/nni +paiConfig: + #The username to login pai + userName: username + #The password to login pai + passWord: password + #The host of restful server of pai + host: 10.10.10.10 \ No newline at end of file diff --git a/examples/trials/mnist-annotation/config.yml b/examples/trials/mnist-annotation/config.yml index 84c31166a8..b0555ad3a2 100644 --- a/examples/trials/mnist-annotation/config.yml +++ b/examples/trials/mnist-annotation/config.yml @@ -3,7 +3,7 @@ experimentName: example_mnist trialConcurrency: 1 maxExecDuration: 1h maxTrialNum: 10 -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: local #choice: true, false useAnnotation: true diff --git a/examples/trials/mnist-annotation/config_pai.yml b/examples/trials/mnist-annotation/config_pai.yml new file mode 100644 index 0000000000..edb9e62384 --- /dev/null +++ b/examples/trials/mnist-annotation/config_pai.yml @@ -0,0 +1,35 @@ +authorName: default +experimentName: example_mnist +trialConcurrency: 1 +maxExecDuration: 1h +maxTrialNum: 10 +#choice: local, remote, pai +trainingServicePlatform: pai +#choice: true, false +useAnnotation: true +tuner: + #choice: TPE, Random, Anneal, Evolution, + #SMAC (SMAC should be installed through nnictl) + builtinTunerName: TPE + classArgs: + #choice: maximize, minimize + optimize_mode: maximize +trial: + command: python3 mnist.py + codeDir: . 
+ gpuNum: 0 + cpuNum: 1 + memoryMB: 8196 + #The docker image to run nni job on pai + image: openpai/pai.example.tensorflow + #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' + hdfsDataDir: hdfs://10.10.10.10:9000/username/nni + #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' + hdfsOutputDir: hdfs://10.10.10.10:9000/username/nni +paiConfig: + #The username to login pai + userName: username + #The password to login pai + passWord: password + #The host of restful server of pai + host: 10.10.10.10 \ No newline at end of file diff --git a/examples/trials/mnist-batch-tune-keras/config.yml b/examples/trials/mnist-batch-tune-keras/config.yml index 1bb85085c5..e0722f9117 100644 --- a/examples/trials/mnist-batch-tune-keras/config.yml +++ b/examples/trials/mnist-batch-tune-keras/config.yml @@ -3,7 +3,7 @@ experimentName: example_mnist-keras trialConcurrency: 1 maxExecDuration: 1h maxTrialNum: 10 -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: local searchSpacePath: search_space.json #choice: true, false diff --git a/examples/trials/mnist-batch-tune-keras/config_pai.yml b/examples/trials/mnist-batch-tune-keras/config_pai.yml new file mode 100644 index 0000000000..183c220e2d --- /dev/null +++ b/examples/trials/mnist-batch-tune-keras/config_pai.yml @@ -0,0 +1,36 @@ +authorName: default +experimentName: example_mnist-keras +trialConcurrency: 1 +maxExecDuration: 1h +maxTrialNum: 10 +#choice: local, remote, pai +trainingServicePlatform: pai +searchSpacePath: search_space.json +#choice: true, false +useAnnotation: false +tuner: + #choice: TPE, Random, Anneal, Evolution, BatchTuner + #SMAC (SMAC should be installed through nnictl) + builtinTunerName: BatchTuner + classArgs: + #choice: maximize, minimize + optimize_mode: maximize +trial: + command: python3 mnist-keras.py + codeDir: . 
+ gpuNum: 0 + cpuNum: 1 + memoryMB: 8196 + #The docker image to run nni job on pai + image: openpai/pai.example.tensorflow + #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' + hdfsDataDir: hdfs://10.10.10.10:9000/username/nni + #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' + hdfsOutputDir: hdfs://10.10.10.10:9000/username/nni +paiConfig: + #The username to login pai + userName: username + #The password to login pai + passWord: password + #The host of restful server of pai + host: 10.10.10.10 \ No newline at end of file diff --git a/examples/trials/mnist-keras/config.yml b/examples/trials/mnist-keras/config.yml index c1792e2a65..6ea1c2a367 100644 --- a/examples/trials/mnist-keras/config.yml +++ b/examples/trials/mnist-keras/config.yml @@ -3,7 +3,7 @@ experimentName: example_mnist-keras trialConcurrency: 1 maxExecDuration: 1h maxTrialNum: 10 -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: local searchSpacePath: search_space.json #choice: true, false diff --git a/examples/trials/mnist-keras/config_pai.yml b/examples/trials/mnist-keras/config_pai.yml new file mode 100644 index 0000000000..bbf8136144 --- /dev/null +++ b/examples/trials/mnist-keras/config_pai.yml @@ -0,0 +1,36 @@ +authorName: default +experimentName: example_mnist-keras +trialConcurrency: 1 +maxExecDuration: 1h +maxTrialNum: 10 +#choice: local, remote, pai +trainingServicePlatform: pai +searchSpacePath: search_space.json +#choice: true, false +useAnnotation: false +tuner: + #choice: TPE, Random, Anneal, Evolution, + #SMAC (SMAC should be installed through nnictl) + builtinTunerName: TPE + classArgs: + #choice: maximize, minimize + optimize_mode: maximize +trial: + command: python3 mnist-keras.py + codeDir: . 
+ gpuNum: 0 + cpuNum: 1 + memoryMB: 8196 + #The docker image to run nni job on pai + image: openpai/pai.example.tensorflow + #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' + hdfsDataDir: hdfs://10.10.10.10:9000/username/nni + #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' + hdfsOutputDir: hdfs://10.10.10.10:9000/username/nni +paiConfig: + #The username to login pai + userName: username + #The password to login pai + passWord: password + #The host of restful server of pai + host: 10.10.10.10 \ No newline at end of file diff --git a/examples/trials/mnist-smartparam/config.yml b/examples/trials/mnist-smartparam/config.yml index a69e801ed8..912eabb24e 100644 --- a/examples/trials/mnist-smartparam/config.yml +++ b/examples/trials/mnist-smartparam/config.yml @@ -3,7 +3,7 @@ experimentName: example_mnist-smartparam trialConcurrency: 1 maxExecDuration: 1h maxTrialNum: 10 -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: local #choice: true, false useAnnotation: true diff --git a/examples/trials/mnist-smartparam/config_pai.yml b/examples/trials/mnist-smartparam/config_pai.yml new file mode 100644 index 0000000000..4b5a088d11 --- /dev/null +++ b/examples/trials/mnist-smartparam/config_pai.yml @@ -0,0 +1,35 @@ +authorName: default +experimentName: example_mnist-smartparam +trialConcurrency: 1 +maxExecDuration: 1h +maxTrialNum: 10 +#choice: local, remote, pai +trainingServicePlatform: pai +#choice: true, false +useAnnotation: true +tuner: + #choice: TPE, Random, Anneal, Evolution, + #SMAC (SMAC should be installed through nnictl) + builtinTunerName: TPE + classArgs: + #choice: maximize, minimize + optimize_mode: maximize +trial: + command: python3 mnist.py + codeDir: . 
+ gpuNum: 0 + cpuNum: 1 + memoryMB: 8196 + #The docker image to run nni job on pai + image: openpai/pai.example.tensorflow + #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' + hdfsDataDir: hdfs://10.10.10.10:9000/username/nni + #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' + hdfsOutputDir: hdfs://10.10.10.10:9000/username/nni +paiConfig: + #The username to login pai + userName: username + #The password to login pai + passWord: password + #The host of restful server of pai + host: 10.10.10.10 \ No newline at end of file diff --git a/examples/trials/mnist/config.yml b/examples/trials/mnist/config.yml index 5eab536228..2f6141fa45 100644 --- a/examples/trials/mnist/config.yml +++ b/examples/trials/mnist/config.yml @@ -3,7 +3,7 @@ experimentName: example_mnist trialConcurrency: 1 maxExecDuration: 1h maxTrialNum: 10 -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: local searchSpacePath: search_space.json #choice: true, false diff --git a/examples/trials/mnist/config_pai.yml b/examples/trials/mnist/config_pai.yml new file mode 100644 index 0000000000..a20fdce40b --- /dev/null +++ b/examples/trials/mnist/config_pai.yml @@ -0,0 +1,36 @@ +authorName: default +experimentName: example_mnist +trialConcurrency: 1 +maxExecDuration: 1h +maxTrialNum: 10 +#choice: local, remote, pai +trainingServicePlatform: pai +searchSpacePath: search_space.json +#choice: true, false +useAnnotation: false +tuner: + #choice: TPE, Random, Anneal, Evolution, + #SMAC (SMAC should be installed through nnictl) + builtinTunerName: TPE + classArgs: + #choice: maximize, minimize + optimize_mode: maximize +trial: + command: python3 mnist.py + codeDir: . 
+ gpuNum: 0 + cpuNum: 1 + memoryMB: 8196 + #The docker image to run nni job on pai + image: openpai/pai.example.tensorflow + #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' + hdfsDataDir: hdfs://10.10.10.10:9000/username/nni + #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' + hdfsOutputDir: hdfs://10.10.10.10:9000/username/nni +paiConfig: + #The username to login pai + userName: username + #The password to login pai + passWord: password + #The host of restful server of pai + host: 10.10.10.10 \ No newline at end of file diff --git a/examples/trials/pytorch_cifar10/config.yml b/examples/trials/pytorch_cifar10/config.yml index 655dccd95f..1d6f847805 100644 --- a/examples/trials/pytorch_cifar10/config.yml +++ b/examples/trials/pytorch_cifar10/config.yml @@ -3,7 +3,7 @@ experimentName: example_pytorch_cifar10 trialConcurrency: 1 maxExecDuration: 100h maxTrialNum: 10 -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: local searchSpacePath: search_space.json #choice: true, false diff --git a/examples/trials/pytorch_cifar10/config_pai.yml b/examples/trials/pytorch_cifar10/config_pai.yml new file mode 100644 index 0000000000..783285f815 --- /dev/null +++ b/examples/trials/pytorch_cifar10/config_pai.yml @@ -0,0 +1,36 @@ +authorName: default +experimentName: example_pytorch_cifar10 +trialConcurrency: 1 +maxExecDuration: 100h +maxTrialNum: 10 +#choice: local, remote, pai +trainingServicePlatform: pai +searchSpacePath: search_space.json +#choice: true, false +useAnnotation: false +tuner: + #choice: TPE, Random, Anneal, Evolution, + #SMAC (SMAC should be installed through nnictl) + builtinTunerName: TPE + classArgs: + #choice: maximize, minimize + optimize_mode: maximize +trial: + command: python3 main.py + codeDir: . 
+ gpuNum: 1 + cpuNum: 1 + memoryMB: 8196 + #The docker image to run nni job on pai + image: openpai/pai.example.tensorflow + #The hdfs directory to store data on pai, format 'hdfs://host:port/directory' + hdfsDataDir: hdfs://10.10.10.10:9000/username/nni + #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory' + hdfsOutputDir: hdfs://10.10.10.10:9000/username/nni +paiConfig: + #The username to login pai + userName: username + #The password to login pai + passWord: password + #The host of restful server of pai + host: 10.10.10.10 diff --git a/src/webui/src/components/Sessionpro.tsx b/src/webui/src/components/Sessionpro.tsx index 44993ec31f..83b44b3f78 100644 --- a/src/webui/src/components/Sessionpro.tsx +++ b/src/webui/src/components/Sessionpro.tsx @@ -98,10 +98,10 @@ class Sessionpro extends React.Component<{}, SessionState> { let sessionData = res.data; let tunerAsstemp = []; let trialPro = []; - const startExper = new Date(sessionData.startTime).toLocaleString(); + const startExper = new Date(sessionData.startTime).toLocaleString('en-US'); let experEndStr: string; if (sessionData.endTime !== undefined) { - experEndStr = new Date(sessionData.endTime).toLocaleString(); + experEndStr = new Date(sessionData.endTime).toLocaleString('en-US'); } else { experEndStr = 'not over'; } @@ -156,8 +156,8 @@ class Sessionpro extends React.Component<{}, SessionState> { const desJobDetail: Parameters = { parameters: {} }; - const startTime = new Date(tableData[item].startTime).toLocaleString(); - const endTime = new Date(tableData[item].endTime).toLocaleString(); + const startTime = new Date(tableData[item].startTime).toLocaleString('en-US'); + const endTime = new Date(tableData[item].endTime).toLocaleString('en-US'); const duration = (tableData[item].endTime - tableData[item].startTime) / 1000; let acc; if (tableData[item].finalMetricData) { diff --git a/src/webui/src/components/TrialStatus.tsx b/src/webui/src/components/TrialStatus.tsx 
index 2c3d27be97..d0e0980ee2 100644 --- a/src/webui/src/components/TrialStatus.tsx +++ b/src/webui/src/components/TrialStatus.tsx @@ -230,10 +230,10 @@ class TrialStatus extends React.Component<{}, TabState> { ? trialJobs[item].status : ''; const startTime = trialJobs[item].startTime !== undefined - ? new Date(trialJobs[item].startTime).toLocaleString() + ? new Date(trialJobs[item].startTime).toLocaleString('en-US') : ''; const endTime = trialJobs[item].endTime !== undefined - ? new Date(trialJobs[item].endTime).toLocaleString() + ? new Date(trialJobs[item].endTime).toLocaleString('en-US') : ''; if (trialJobs[item].hyperParameters !== undefined) { desc.parameters = JSON.parse(trialJobs[item].hyperParameters).parameters; @@ -394,7 +394,7 @@ class TrialStatus extends React.Component<{}, TabState> { dataIndex: 'start', key: 'start', width: '15%', - sorter: (a: TableObj, b: TableObj): number => a.start.localeCompare(b.start) + sorter: (a: TableObj, b: TableObj): number => (Date.parse(a.start) - Date.parse(b.start)) }, { title: 'End', dataIndex: 'end', diff --git a/tools/nnicmd/common_utils.py b/tools/nnicmd/common_utils.py index bb2a4f236c..05afed1e28 100644 --- a/tools/nnicmd/common_utils.py +++ b/tools/nnicmd/common_utils.py @@ -21,7 +21,7 @@ import json import yaml import psutil -from .constants import ERROR_INFO, NORMAL_INFO +from .constants import ERROR_INFO, NORMAL_INFO, WARNING_INFO, COLOR_RED_FORMAT, COLOR_YELLOW_FORMAT def get_yml_content(file_path): '''Load yaml file content''' @@ -43,12 +43,16 @@ def get_json_content(file_path): def print_error(content): '''Print error information to screen''' - print(ERROR_INFO % content) + print(COLOR_RED_FORMAT % (ERROR_INFO % content)) def print_normal(content): '''Print error information to screen''' print(NORMAL_INFO % content) +def print_warning(content): + '''Print warning information to screen''' + print(COLOR_YELLOW_FORMAT % (WARNING_INFO % content)) + def detect_process(pid): '''Detect if a process is alive''' 
try: diff --git a/tools/nnicmd/constants.py b/tools/nnicmd/constants.py index b03b1bdcbe..14467f02ed 100644 --- a/tools/nnicmd/constants.py +++ b/tools/nnicmd/constants.py @@ -34,22 +34,37 @@ STDERR_FULL_PATH = os.path.join(LOG_DIR, 'stderr') -ERROR_INFO = 'Error: %s' +ERROR_INFO = 'ERROR: %s' -NORMAL_INFO = 'Info: %s' +NORMAL_INFO = 'INFO: %s' -WARNING_INFO = 'Waining: %s' +WARNING_INFO = 'WARNING: %s' -EXPERIMENT_SUCCESS_INFO = 'Start experiment success! The experiment id is %s, and the restful server post is %s.\n' \ - 'You can use these commands to get more information about this experiment:\n' \ +EXPERIMENT_SUCCESS_INFO = '\033[1;32;32mSuccessfully started experiment!\n\033[0m' \ + '-----------------------------------------------------------------------\n' \ + 'The experiment id is %s\n'\ + 'The restful server post is %s\n' \ + 'The Web UI urls are: %s\n' \ + '-----------------------------------------------------------------------\n\n' \ + 'You can use these commands to get more information about the experiment\n' \ + '-----------------------------------------------------------------------\n' \ ' commands description\n' \ '1. nnictl experiment show show the information of experiments\n' \ '2. nnictl trial ls list all of trial jobs\n' \ - '3. nnictl stop stop a experiment\n' \ - '4. nnictl trial kill kill a trial job by id\n' \ - '5. nnictl --help get help information about nnictl\n' \ - '6. nnictl webui url get the url of web ui' + '3. nnictl log stderr show stderr log content\n' \ + '4. nnictl log stdout show stdout log content\n' \ + '5. nnictl stop stop a experiment\n' \ + '6. nnictl trial kill kill a trial job by id\n' \ + '7. nnictl webui url get the url of web ui\n' \ + '8. 
nnictl --help get help information about nnictl\n' \ + '-----------------------------------------------------------------------\n' \ PACKAGE_REQUIREMENTS = { 'SMAC': 'smac_tuner' } + +COLOR_RED_FORMAT = '\033[1;31;31m%s\033[0m' + +COLOR_GREEN_FORMAT = '\033[1;32;32m%s\033[0m' + +COLOR_YELLOW_FORMAT = '\033[1;33;33m%s\033[0m' \ No newline at end of file diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index 6570a75eee..b223551bea 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -30,13 +30,13 @@ from .rest_utils import rest_put, rest_post, check_rest_server, check_rest_server_quick, check_response from .url_utils import cluster_metadata_url, experiment_url from .config_utils import Config -from .common_utils import get_yml_content, get_json_content, print_error, print_normal, detect_process -from .constants import EXPERIMENT_SUCCESS_INFO, STDOUT_FULL_PATH, STDERR_FULL_PATH, LOG_DIR, REST_PORT, ERROR_INFO, NORMAL_INFO +from .common_utils import get_yml_content, get_json_content, print_error, print_normal, print_warning, detect_process +from .constants import * from .webui_utils import start_web_ui, check_web_ui def start_rest_server(port, platform, mode, experiment_id=None): '''Run nni manager process''' - print_normal('Checking experiment...') + print_normal('Checking environment...') nni_config = Config() rest_port = nni_config.get_config('restServerPort') running, _ = check_rest_server_quick(rest_port) @@ -191,6 +191,8 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No # Deal with annotation if experiment_config.get('useAnnotation'): path = os.path.join(tempfile.gettempdir(), 'nni', 'annotation') + if not os.path.isdir(path): + os.makedirs(path) path = tempfile.mkdtemp(dir=path) code_dir = expand_annotations(experiment_config['trial']['codeDir'], path) experiment_config['trial']['codeDir'] = code_dir @@ -204,10 +206,9 @@ def launch_experiment(args, experiment_config, mode, webuiport, 
experiment_id=No experiment_config['searchSpace'] = json.dumps('') # check rest server - print_normal('Checking restful server...') running, _ = check_rest_server(REST_PORT) if running: - print_normal('Restful server start success!') + print_normal('Successfully started Restful server!') else: print_error('Restful server start failed!') try: @@ -236,7 +237,7 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No if experiment_config['trainingServicePlatform'] == 'local': print_normal('Setting local config...') if set_local_config(experiment_config, REST_PORT): - print_normal('Success!') + print_normal('Successfully set local config!') else: print_error('Failed!') try: @@ -251,7 +252,7 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No print_normal('Setting pai config...') config_result, err_msg = set_pai_config(experiment_config, REST_PORT) if config_result: - print_normal('Success!') + print_normal('Successfully set pai config!') else: if err_msg: print_error('Failed! 
Error is: {}'.format(err_msg)) @@ -259,8 +260,19 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No cmds = ['pkill', '-P', str(rest_process.pid)] call(cmds) except Exception: - raise Exception(ERROR_INFO % 'Rest server stopped!') + raise Exception(ERROR_INFO % 'Restful server stopped!') exit(0) + + #start webui + if check_web_ui(): + print_warning('{0} {1}'.format(' '.join(nni_config.get_config('webuiUrl')),'is being used, please stop it first!')) + print_normal('You can use \'nnictl webui stop\' to stop old Web UI process...') + else: + print_normal('Starting Web UI...') + webui_process = start_web_ui(webuiport) + if webui_process: + nni_config.set_config('webuiPid', webui_process.pid) + print_normal('Successfully started Web UI!') # start a new experiment print_normal('Starting experiment...') @@ -274,25 +286,12 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No try: cmds = ['pkill', '-P', str(rest_process.pid)] call(cmds) + cmds = ['pkill', '-P', str(webui_process.pid)] + call(cmds) except Exception: - raise Exception(ERROR_INFO % 'Rest server stopped!') + raise Exception(ERROR_INFO % 'Restful server stopped!') exit(0) - - #start webui - print_normal('Checking web ui...') - if check_web_ui(): - print_error('{0} {1}'.format(' '.join(nni_config.get_config('webuiUrl')),'is being used, please stop it first!')) - print_normal('You can use \'nnictl webui stop\' to stop old web ui process...') - else: - print_normal('Starting web ui...') - webui_process = start_web_ui(webuiport) - if webui_process: - nni_config.set_config('webuiPid', webui_process.pid) - print_normal('Starting web ui success!') - print_normal('{0} {1}'.format('Web UI url:', ' '.join(nni_config.get_config('webuiUrl')))) - - print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, REST_PORT)) - + print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, REST_PORT, ' '.join(nni_config.get_config('webuiUrl')))) def resume_experiment(args): 
'''resume an experiment''' diff --git a/tools/nnicmd/nnictl_utils.py b/tools/nnicmd/nnictl_utils.py index 2b7628ec67..d071741f5b 100644 --- a/tools/nnicmd/nnictl_utils.py +++ b/tools/nnicmd/nnictl_utils.py @@ -64,17 +64,20 @@ def stop_experiment(args): stop_web_ui() return running, _ = check_rest_server_quick(rest_port) + stop_rest_result = True if running: response = rest_delete(experiment_url(rest_port), 20) if not response or not check_response(response): print_error('Stop experiment failed!') + stop_rest_result = False #sleep to wait rest handler done time.sleep(3) rest_pid = nni_config.get_config('restServerPid') cmds = ['pkill', '-P', str(rest_pid)] call(cmds) stop_web_ui() - print_normal('Stop experiment success!') + if stop_rest_result: + print_normal('Stop experiment success!') def trial_ls(args): '''List trial''' diff --git a/tools/trial_tool/trial_keeper.py b/tools/trial_tool/trial_keeper.py index f53e5d6774..675a0566ac 100644 --- a/tools/trial_tool/trial_keeper.py +++ b/tools/trial_tool/trial_keeper.py @@ -45,7 +45,6 @@ def main_loop(args): # Notice: We don't appoint env, which means subprocess wil inherit current environment and that is expected behavior process = Popen(args.trial_command, shell = True, stdout = stdout_file, stderr = stderr_file) print('Subprocess pid is {}'.format(process.pid)) - print('Current cwd is {}'.format(os.getcwd())) while True: retCode = process.poll() ## Read experiment metrics, to avoid missing metrics @@ -55,15 +54,15 @@ def main_loop(args): print('subprocess terminated. Exit code is {}. 
Quit'.format(retCode)) #copy local directory to hdfs nni_local_output_dir = os.environ['NNI_OUTPUT_DIR'] - hdfs_client = HdfsClient(hosts='{0}:{1}'.format(args.pai_hdfs_host, '50070'), user_name=args.pai_user_name) - print(nni_local_output_dir, args.pai_hdfs_output_dir) + hdfs_client = HdfsClient(hosts='{0}:{1}'.format(args.pai_hdfs_host, '50070'), user_name=args.pai_user_name, timeout=5) try: if copyDirectoryToHdfs(nni_local_output_dir, args.pai_hdfs_output_dir, hdfs_client): - print('copy directory success!') + print('copy directory from {0} to {1} success!'.format(nni_local_output_dir, args.pai_hdfs_output_dir)) else: - print('copy directory failed!') + print('copy directory from {0} to {1} failed!'.format(nni_local_output_dir, args.pai_hdfs_output_dir)) except Exception as exception: - print(exception) + print('HDFS copy directory got exception') + raise exception ## Exit as the retCode of subprocess(trial) exit(retCode) @@ -91,7 +90,10 @@ def trial_keeper_help_info(*args): try: main_loop(args) - except: - print('Exiting by user request') + except SystemExit as se: + print('NNI trial keeper exit with code {}'.format(se.code)) + sys.exit(se.code) + except Exception as e: + print('Exit trial keeper with code 1 because Exception: {} is catched'.format(str(e))) sys.exit(1)