
Commit

Integrate a0a3a82 into master
Project Philly authored and CNTK Team committed Nov 1, 2017
2 parents ba5108c + a0a3a82 commit 1312bf8
Showing 3 changed files with 35 additions and 3 deletions.
21 changes: 18 additions & 3 deletions Examples/Image/Detection/FastRCNN/BrainScript/A2_RunWithPyModel.py
@@ -17,6 +17,7 @@
 import PARAMETERS
 import numpy as np
 import os, sys
+from cntk import distributed

 ###############################################################
 ###############################################################
@@ -44,6 +45,9 @@
 mb_size = p.cntk_mb_size
 max_epochs = p.cntk_max_epochs
 momentum_time_constant = p.cntk_momentum_time_constant
+distributed_flg = p.distributed_flg
+num_quantization_bits = p.num_quantization_bits
+warm_up = p.warm_up

 # model specific variables (only AlexNet for now)
 base_model = "AlexNet"
@@ -153,9 +157,14 @@ def train_fast_rcnn(debug_output=False, model_path=model_file):
     lr_schedule = learning_rate_schedule(lr_per_sample, unit=UnitType.sample)
     mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)

-    # Instantiate the trainer object
+    # Instantiate the trainer object, starting from the default (non-distributed) learner
     learner = momentum_sgd(frcn_output.parameters, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight)
-    progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
+    # Wrap it for distributed learning; the wrapper is also compatible with single-worker training
+    learner = distributed.data_parallel_distributed_learner(
+        learner = learner,
+        num_quantization_bits = num_quantization_bits,  # 32 by default, i.e. non-quantized gradient aggregation
+        distributed_after = warm_up)                    # 0 by default, i.e. no warm start
+    progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs, rank=distributed.Communicator.rank())
     trainer = Trainer(frcn_output, (ce, pe), learner, progress_printer)

     # Get minibatches of images and perform model training
@@ -164,14 +173,20 @@ def train_fast_rcnn(debug_output=False, model_path=model_file):
     for epoch in range(max_epochs):       # loop over epochs
         sample_count = 0
         while sample_count < epoch_size:  # loop over minibatches in the epoch
-            data = minibatch_source.next_minibatch(min(mb_size, epoch_size-sample_count), input_map=input_map)
+            data = minibatch_source.next_minibatch(min(mb_size * C.Communicator.num_workers(), epoch_size-sample_count),
+                                                   input_map=input_map,
+                                                   num_data_partitions=C.Communicator.num_workers(),
+                                                   partition_index=C.Communicator.rank())
             trainer.train_minibatch(data)                            # update model with it
             sample_count += trainer.previous_minibatch_sample_count  # count samples processed so far

         trainer.summarize_training_progress()
         if debug_output:
             frcn_output.save(os.path.join(abs_path, "Output", "frcn_py_%s.model" % (epoch+1)))

+    if distributed_flg:
+        distributed.Communicator.finalize()
+
     return frcn_output

 # Evaluate a Fast R-CNN model
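For context, here is a condensed, self-contained sketch of the data-parallel pattern applied above: wrap the learner, partition each minibatch across workers, and finalize MPI at the end. The helper names (`make_distributed`, `train_loop`) and their argument lists are placeholders for illustration, not part of the example scripts.

```python
# A minimal sketch (not part of this commit) of the data-parallel training
# pattern used in A2_RunWithPyModel.py above.
from cntk import distributed

def make_distributed(learner, num_quantization_bits=32, warm_up=0):
    # Wrap an ordinary learner; with a single worker it behaves like the original.
    return distributed.data_parallel_distributed_learner(
        learner=learner,
        num_quantization_bits=num_quantization_bits,  # 32 = no quantization; 1 = 1-bit SGD
        distributed_after=warm_up)                    # samples to train locally before parallelizing

def train_loop(trainer, minibatch_source, input_map, mb_size, epoch_size, max_epochs):
    workers = distributed.Communicator.num_workers()
    rank = distributed.Communicator.rank()
    for epoch in range(max_epochs):
        sample_count = 0
        while sample_count < epoch_size:
            # Request a global minibatch that is 'workers' times larger and read
            # only this worker's partition, so the per-worker size stays mb_size.
            data = minibatch_source.next_minibatch(
                min(mb_size * workers, epoch_size - sample_count),
                input_map=input_map,
                num_data_partitions=workers,
                partition_index=rank)
            trainer.train_minibatch(data)   # gradients are aggregated across workers
            sample_count += trainer.previous_minibatch_sample_count
        trainer.summarize_training_progress()
    distributed.Communicator.finalize()     # must run on every worker before exit
```

Launched as `mpiexec -n <workers> python <script>.py`, every MPI process runs the same loop on its own device while the distributed learner aggregates gradients between them.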
5 changes: 5 additions & 0 deletions Examples/Image/Detection/FastRCNN/BrainScript/PARAMETERS.py
@@ -56,6 +56,11 @@ def __init__(self, datasetName):
         self.cntk_max_epochs = -1              # set per data set below
         self.cntk_momentum_time_constant = -1  # set per data set below

+        # for the distributed learner
+        self.distributed_flg = False           # set to 'True' to train with the distributed learner
+        self.num_quantization_bits = 32        # gradient quantization for the distributed learner (32 = no quantization)
+        self.warm_up = 0                       # number of samples to train single-worker before going distributed

         ############################
         # project-specific parameters
         ############################
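As a purely illustrative edit (these values are not the defaults above, and 1-bit quantization requires a CNTK build with 1-bit SGD support), enabling distributed learning amounts to changing these lines in `__init__`:

```python
        # illustrative settings for a distributed run -- not the shipped defaults
        self.distributed_flg = True          # use the distributed learner
        self.num_quantization_bits = 1       # 1-bit SGD gradient quantization
        self.warm_up = 50000                 # samples trained on a single worker before parallelizing
```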
12 changes: 12 additions & 0 deletions Examples/Image/Detection/FastRCNN/BrainScript/README.md
@@ -104,6 +104,18 @@ This python code will start training Fast R-CNN using the [fastrcnn.cntk](./fastrcnn.cntk)

 If you carefully examine the [fastrcnn.cntk](./fastrcnn.cntk) file, you will notice that we load the pre-trained AlexNet model, clone the network up to the `conv5_y` layer, freeze all lower-layer parameters, and then add pooling and dense layers on top with trainable parameters. The training will run for 17 epochs, reaching a training error of around `1.05%`. The script will also write the network output for the entire train and test dataset.

+### Running Fast R-CNN distributed training
+
+For distributed training, set `distributed_flg` to `True` in [PARAMETERS.py](./PARAMETERS.py).
+`python A2_RunWithPyModel.py` will then train with a distributed learner in a multi-GPU environment.
+Note: this example requires a multi-GPU machine to benefit from distribution.
+
+Simple aggregation on a 2-GPU machine:
+`mpiexec -n 2 python A2_RunWithPyModel.py`
+
+Also check the two parameters `num_quantization_bits` and `warm_up` in [PARAMETERS.py](./PARAMETERS.py) when training distributed.
+Here is a [quick reference](https://docs.microsoft.com/en-us/cognitive-toolkit/Multiple-GPUs-and-machines#2-configuring-parallel-training-in-cntk-in-python) for distributed learning with Python.
+
 ### Evaluate trained model

 Once the model has been trained for detection, you may run:
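Before starting a long run, it can be worth confirming that `mpiexec` really launches the expected number of CNTK workers. A minimal sanity-check sketch (not part of the example; the file name is arbitrary, and it assumes the same `cntk.distributed` API used by `A2_RunWithPyModel.py`):

```python
# check_workers.py -- run with: mpiexec -n 2 python check_workers.py
from cntk import distributed

print("worker %d of %d" % (distributed.Communicator.rank(),
                           distributed.Communicator.num_workers()))
distributed.Communicator.finalize()  # every worker must finalize before exiting
```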
