
Commit

Integrate a0a3a82 into master
Project Philly authored and CNTK Team committed Nov 1, 2017
2 parents ba5108c + a0a3a82 commit 1312bf8
Showing 3 changed files with 35 additions and 3 deletions.
21 changes: 18 additions & 3 deletions Examples/Image/Detection/FastRCNN/BrainScript/A2_RunWithPyModel.py
@@ -17,6 +17,7 @@
 import PARAMETERS
 import numpy as np
 import os, sys
+from cntk import distributed

 ###############################################################
 ###############################################################
@@ -44,6 +45,9 @@
 mb_size = p.cntk_mb_size
 max_epochs = p.cntk_max_epochs
 momentum_time_constant = p.cntk_momentum_time_constant
+distributed_flg = p.distributed_flg
+num_quantization_bits = p.num_quantization_bits
+warm_up = p.warm_up

 # model specific variables (only AlexNet for now)
 base_model = "AlexNet"
@@ -153,9 +157,14 @@ def train_fast_rcnn(debug_output=False, model_path=model_file):
     lr_schedule = learning_rate_schedule(lr_per_sample, unit=UnitType.sample)
     mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)

-    # Instantiate the trainer object
+    # Instantiate the trainer object, starting from the default (non-distributed) learner
     learner = momentum_sgd(frcn_output.parameters, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight)
-    progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
+    # Wrap it for distributed learning; the wrapper is also compatible with single-worker training
+    learner = distributed.data_parallel_distributed_learner(
+        learner = learner,
+        num_quantization_bits = num_quantization_bits,  # 32 by default, i.e. non-quantized gradient aggregation
+        distributed_after = warm_up)                    # 0 by default, i.e. no warm start
+    progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs, rank=distributed.Communicator.rank())
     trainer = Trainer(frcn_output, (ce, pe), learner, progress_printer)

     # Get minibatches of images and perform model training
@@ -164,14 +173,20 @@ def train_fast_rcnn(debug_output=False, model_path=model_file):
     for epoch in range(max_epochs):       # loop over epochs
         sample_count = 0
         while sample_count < epoch_size:  # loop over minibatches in the epoch
-            data = minibatch_source.next_minibatch(min(mb_size, epoch_size-sample_count), input_map=input_map)
+            data = minibatch_source.next_minibatch(min(mb_size * C.Communicator.num_workers(), epoch_size-sample_count),
+                                                   input_map=input_map,
+                                                   num_data_partitions=C.Communicator.num_workers(),
+                                                   partition_index=C.Communicator.rank())
             trainer.train_minibatch(data)                            # update model with it
             sample_count += trainer.previous_minibatch_sample_count  # count samples processed so far

         trainer.summarize_training_progress()
         if debug_output:
             frcn_output.save(os.path.join(abs_path, "Output", "frcn_py_%s.model" % (epoch+1)))

+    if distributed_flg:
+        distributed.Communicator.finalize()
+
     return frcn_output

 # Evaluate a Fast R-CNN model
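For context, here is a condensed, self-contained sketch of the data-parallel pattern applied above: wrap the learner, partition each minibatch across workers, and finalize MPI at the end. The helper names (`make_distributed`, `train_loop`) and their argument lists are placeholders for illustration, not part of the example scripts.

```python
# A minimal sketch (not part of this commit) of the data-parallel training
# pattern used in A2_RunWithPyModel.py above.
from cntk import distributed

def make_distributed(learner, num_quantization_bits=32, warm_up=0):
    # Wrap an ordinary learner; with a single worker it behaves like the original.
    return distributed.data_parallel_distributed_learner(
        learner=learner,
        num_quantization_bits=num_quantization_bits,  # 32 = no quantization; 1 = 1-bit SGD
        distributed_after=warm_up)                    # samples to train locally before parallelizing

def train_loop(trainer, minibatch_source, input_map, mb_size, epoch_size, max_epochs):
    workers = distributed.Communicator.num_workers()
    rank = distributed.Communicator.rank()
    for epoch in range(max_epochs):
        sample_count = 0
        while sample_count < epoch_size:
            # Request a global minibatch that is 'workers' times larger and read
            # only this worker's partition, so the per-worker size stays mb_size.
            data = minibatch_source.next_minibatch(
                min(mb_size * workers, epoch_size - sample_count),
                input_map=input_map,
                num_data_partitions=workers,
                partition_index=rank)
            trainer.train_minibatch(data)   # gradients are aggregated across workers
            sample_count += trainer.previous_minibatch_sample_count
        trainer.summarize_training_progress()
    distributed.Communicator.finalize()     # must run on every worker before exit
```

Launched as `mpiexec -n <workers> python <script>.py`, every MPI process runs the same loop on its own device while the distributed learner aggregates gradients between them.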
5 changes: 5 additions & 0 deletions Examples/Image/Detection/FastRCNN/BrainScript/PARAMETERS.py
@@ -56,6 +56,11 @@ def __init__(self, datasetName):
         self.cntk_max_epochs = -1              # set per data set below
         self.cntk_momentum_time_constant = -1  # set per data set below

+        # for the distributed learner
+        self.distributed_flg = False           # set to 'True' to train with the distributed learner
+        self.num_quantization_bits = 32        # gradient quantization for the distributed learner (32 = no quantization)
+        self.warm_up = 0                       # number of samples to train single-worker before going distributed

         ############################
         # project-specific parameters
         ############################
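As a purely illustrative edit (these values are not the defaults above, and 1-bit quantization requires a CNTK build with 1-bit SGD support), enabling distributed learning amounts to changing these lines in `__init__`:

```python
        # illustrative settings for a distributed run -- not the shipped defaults
        self.distributed_flg = True          # use the distributed learner
        self.num_quantization_bits = 1       # 1-bit SGD gradient quantization
        self.warm_up = 50000                 # samples trained on a single worker before parallelizing
```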
12 changes: 12 additions & 0 deletions Examples/Image/Detection/FastRCNN/BrainScript/README.md
@@ -104,6 +104,18 @@ This python code will start training Fast R-CNN using the [fastrcnn.cntk](./fastrcnn.cntk)

 If you carefully examine the [fastrcnn.cntk](./fastrcnn.cntk) file, you will notice that we load the pre-trained AlexNet model, clone the network up to the `conv5_y` layer, freeze all lower-layer parameters, and then add pooling and dense layers on top with trainable parameters. The training will run for 17 epochs, reaching a training error of around `1.05%`. The script will also write the network output for the entire train and test dataset.

+### Running Fast R-CNN distributed training
+
+For distributed training, set `distributed_flg` to `True` in [PARAMETERS.py](./PARAMETERS.py).
+`python A2_RunWithPyModel.py` will then train with a distributed learner in a multi-GPU environment.
+Note: this example requires a multi-GPU machine to benefit from distribution.
+
+Simple aggregation on a 2-GPU machine:
+`mpiexec -n 2 python A2_RunWithPyModel.py`
+
+Also check the two parameters `num_quantization_bits` and `warm_up` in [PARAMETERS.py](./PARAMETERS.py) when training distributed.
+Here is a [quick reference](https://docs.microsoft.com/en-us/cognitive-toolkit/Multiple-GPUs-and-machines#2-configuring-parallel-training-in-cntk-in-python) for distributed learning with Python.
+
 ### Evaluate trained model

 Once the model has been trained for detection, you may run:
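Before starting a long run, it can be worth confirming that `mpiexec` really launches the expected number of CNTK workers. A minimal sanity-check sketch (not part of the example; the file name is arbitrary, and it assumes the same `cntk.distributed` API used by `A2_RunWithPyModel.py`):

```python
# check_workers.py -- run with: mpiexec -n 2 python check_workers.py
from cntk import distributed

print("worker %d of %d" % (distributed.Communicator.rank(),
                           distributed.Communicator.num_workers()))
distributed.Communicator.finalize()  # every worker must finalize before exiting
```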
