
Add model-parallel MNIST example #98

Merged
merged 53 commits into from
Aug 4, 2017
Changes from 44 commits
53 commits
2097638
Model-parallel example.
levelfour Jul 10, 2017
ccaebe1
Use empty dataset.
levelfour Jul 14, 2017
1e74c5d
MultiNodeChain
levelfour Jul 14, 2017
f751651
Update docstring.
levelfour Jul 14, 2017
7cc8cb9
Rename train_mnist.py to train_mnist_data_parallel.py.
levelfour Jul 14, 2017
d2a24f0
Merge branch 'master' into model-parallel-mnist
levelfour Jul 14, 2017
d948187
Check if comm is ChainerMN communicator.
levelfour Jul 14, 2017
1c862a9
Add MultiNodeChainGroup.
levelfour Jul 14, 2017
7fcff10
Add tests for recv_retain and get_empty_dataset.
levelfour Jul 14, 2017
ff73511
Add test for MultiNodeChain.
levelfour Jul 14, 2017
1501b9d
Fix PEP8.
levelfour Jul 14, 2017
774aac2
Revise the design of MultiNodeChainGroup.
levelfour Jul 21, 2017
360d1af
Merge.
levelfour Jul 21, 2017
b221fb6
Extend MultiNodeChainGroup for reversing send and recv.
levelfour Jul 21, 2017
36d6df3
Branching send & recv.
levelfour Jul 21, 2017
b3e4802
Do not need division.
levelfour Jul 25, 2017
997f4ff
Refactoring.
levelfour Jul 25, 2017
e320ae8
Add test for branching model.
levelfour Jul 28, 2017
69beef0
Typo.
levelfour Jul 28, 2017
31d4abb
Update docs.
levelfour Jul 28, 2017
f34e3e5
Assertion.
levelfour Jul 28, 2017
3c5b957
Increase processes for Travis test.
levelfour Jul 28, 2017
a9a0bb3
Fix names.
levelfour Jul 29, 2017
66989b5
Fix for PEP8.
levelfour Jul 29, 2017
516d7ff
Fix for PEP8.
levelfour Jul 29, 2017
bc8ffe5
Fix multiple recv.
levelfour Jul 30, 2017
7904d23
Rename: merge -> pseudo_connect.
levelfour Jul 30, 2017
c01d22f
Fix test_branching_model to respond to multiple recv.
levelfour Jul 30, 2017
74e0ecb
Fix for PEP8.
levelfour Jul 30, 2017
94eb451
Deprecated argument backward_pointer for send.
levelfour Aug 2, 2017
a5599d5
Revert renaming.
levelfour Aug 2, 2017
d2a882f
README for MNIST example
levelfour Aug 2, 2017
ac27f1e
Merge remote-tracking branch 'upstream/master' into model-parallel-mnist
levelfour Aug 2, 2017
a64e5ba
Rename.
levelfour Aug 2, 2017
0a0b26a
Rename.
levelfour Aug 2, 2017
ecb4538
Merge remote-tracking branch 'upstream/master' into model-parallel-mnist
levelfour Aug 2, 2017
20814e5
Trivial fix.
levelfour Aug 2, 2017
4e35f6a
Make PseudoConnect takes variables as arguments.
levelfour Aug 2, 2017
8f04184
Rename: backward_pointer -> delegate_variable.
levelfour Aug 2, 2017
9215d81
Fix for PEP8.
levelfour Aug 2, 2017
c521364
Move pseudo_connect to a new file.
levelfour Aug 4, 2017
6b0dd6d
Add test for PseudoConnect.
levelfour Aug 4, 2017
f79059e
Fix TestPseudoConnect.
levelfour Aug 4, 2017
f4633a8
Fix TestPseudoConnect.
levelfour Aug 4, 2017
56d2375
Expose pseudo_connect.
levelfour Aug 4, 2017
20c09f8
Fix empty dataset.
levelfour Aug 4, 2017
35cd1d2
Fix for PEP8.
levelfour Aug 4, 2017
b4cb4a8
Add docs for pseudo_connect.
levelfour Aug 4, 2017
f6baf69
Fix a little bit.
levelfour Aug 4, 2017
b4ccdcd
Fix test_empty_dataset.
levelfour Aug 4, 2017
5a9cdd2
Add a little bit in docs of pseudo_connect.
levelfour Aug 4, 2017
592fe95
Fix comments.:
levelfour Aug 4, 2017
cd0d7d8
Fix.
levelfour Aug 4, 2017
2 changes: 1 addition & 1 deletion .travis.yml
@@ -52,7 +52,7 @@ script:
- flake8 --config=.flake8.cython .
- autopep8 -r . --global-config .pep8 | tee check_autopep8
- test ! -s check_autopep8
- for NP in 1 2; do PYTHONWARNINGS='ignore::FutureWarning,module::DeprecationWarning' mpiexec -n ${NP} nosetests -v -a '!nccl,!gpu'; done
- for NP in 1 2 3; do PYTHONWARNINGS='ignore::FutureWarning,module::DeprecationWarning' mpiexec -n ${NP} nosetests -v -a '!nccl,!gpu'; done
# - cd tests
# - PYTHONWARNINGS='ignore::FutureWarning,module::DeprecationWarning' nosetests -a '!gpu,!slow' --with-doctest chainer_tests
- if [[ $TRAVIS_OS_NAME == "linux" ]]; then
1 change: 1 addition & 0 deletions chainermn/__init__.py
@@ -2,6 +2,7 @@

from chainermn.communicators import create_communicator # NOQA
from chainermn.dataset import scatter_dataset # NOQA
from chainermn.link import MultiNodeChainList # NOQA
from chainermn.multi_node_evaluator import create_multi_node_evaluator # NOQA
from chainermn.multi_node_optimizer import create_multi_node_optimizer # NOQA

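The newly exported ``MultiNodeChainList`` is the user-facing entry point for building model-parallel models. As a rough illustration (not taken from this diff; the layer sizes, rank assignments, and the ``add_link(link, rank_in=..., rank_out=...)`` interface are assumed from ChainerMN's model-parallel documentation), a two-stage pipeline might be declared like this:

```python
import chainer.links as L
import chainermn


class Stage0(chainermn.MultiNodeChainList):
    # First pipeline stage: consumes the real minibatch and sends its
    # activation to rank 1 (rank_out=1); it receives nothing (rank_in=None).
    def __init__(self, comm):
        super(Stage0, self).__init__(comm=comm)
        self.add_link(L.Linear(784, 100), rank_in=None, rank_out=1)


class Stage1(chainermn.MultiNodeChainList):
    # Second stage: receives the activation from rank 0 (rank_in=0) and
    # produces the final output locally (rank_out=None).
    def __init__(self, comm):
        super(Stage1, self).__init__(comm=comm)
        self.add_link(L.Linear(100, 10), rank_in=0, rank_out=None)


comm = chainermn.create_communicator()
model = Stage0(comm) if comm.rank == 0 else Stage1(comm)
```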
1 change: 1 addition & 0 deletions chainermn/datasets/__init__.py
@@ -0,0 +1 @@
from chainermn.datasets.empty_dataset import create_empty_dataset # NOQA
21 changes: 21 additions & 0 deletions chainermn/datasets/empty_dataset.py
@@ -0,0 +1,21 @@
import chainer


def create_empty_dataset(dataset):
"""Creates an empty dataset for models with no inputs and outputs.

This function generates an empty dataset, i.e., ``__getitem__()`` only
returns an empty tuple. The resulting dataset is compatible in length
with the original one. Such datasets are used for models which neither
take any input nor return any output. We expect models whose ``forward()``
starts with ``chainermn.functions.recv()`` and ends with
``chainermn.functions.send()``.

Args:
dataset(chainer.datasets.TupleDataset): Dataset to convert.
Contributor
dataset does not need to be TupleDataset. Chainer accepts many kinds of datasets.

Returns:
~chainer.datasets.TransformDataset:
A dataset whose examples are all empty, with the same length as the original one.
"""
return chainer.datasets.TransformDataset(dataset, lambda data: ())
Contributor
Probably just [()] * len(dataset) is enough? (TransformDataset preserves dataset, so it will consume unnecessary memory.)
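As a hedged sketch of how this helper is meant to be used in the model-parallel MNIST example (the iterator setup and rank layout below are illustrative assumptions, not code from this PR):

```python
import chainer
import chainermn
from chainermn.datasets import create_empty_dataset

comm = chainermn.create_communicator()
train, _ = chainer.datasets.get_mnist()

if comm.rank == 0:
    # Rank 0 feeds real images into the first model component.
    train_iter = chainer.iterators.SerialIterator(train, batch_size=100)
else:
    # Other ranks take no direct input: their forward() starts with
    # chainermn.functions.recv(). They still need an iterator of the same
    # length to drive the training loop, hence the empty dataset.
    train = create_empty_dataset(train)
    train_iter = chainer.iterators.SerialIterator(train, batch_size=100)
```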

74 changes: 56 additions & 18 deletions chainermn/functions/point_to_point_communication.py
@@ -14,15 +14,23 @@ def __init__(self, comm, peer_rank, peer_tag):

def forward(self, inputs):
xp = cuda.get_array_module(*inputs)
x, = inputs
# Note: inputs[1] might contain delegate_variable.
x = inputs[0]
self.comm.send(x, self.peer_rank, self.peer_tag)
return xp.array([]),
# Return an empty variable, which serves as "delegate_variable."
return xp.array([], dtype=xp.float32),

def backward(self, inputs, grad_outputs):
xp = cuda.get_array_module(*inputs)
with cuda.get_device_from_array(*inputs):
gy = self.comm.recv(self.peer_rank, self.peer_tag)
return xp.array(gy),
if len(inputs) > 1:
# Dummy grad for delegate_variable.
# This grad will not be used, only for silencing type checker.
grad_delegate_variable = inputs[1]
return xp.array(gy), grad_delegate_variable
else:
return xp.array(gy),


class Recv(chainer.Function):
@@ -38,17 +46,25 @@ def __call__(self, *inputs):
def __call__(self, *inputs):
xp = cuda.get_array_module(*inputs)

if chainer.__version__.startswith('1.'):
# For backward compatibility.
dummy_var = chainer.Variable(xp.array([]), volatile='auto')
else:
# This variable is necessary to backprop correctly in Chainer v2.
# This trick relies on the fact chainer.Variable.requires_grad is
# True by default at Chainer v2.0.0.
dummy_var = chainer.Variable(xp.array([]))
if inputs == ():
# Expected to be invoked without any args in usual case.
if chainer.__version__.startswith('1.'):
# For backward compatibility.
dummy_var = chainer.Variable(
xp.array([], dtype=xp.float32),
volatile='auto')
else:
# This variable is necessary to backprop correctly
# in Chainer v2. This trick relies on the fact
# chainer.Variable.requires_grad is True by default
# in Chainer v2.0.0.
dummy_var = chainer.Variable(xp.array([], dtype=xp.float32))

return super(Recv, self).__call__(dummy_var)

ret = super(Recv, self).__call__(dummy_var)
return ret
else:
# Used for retaining computational graph.
return super(Recv, self).__call__(*inputs)

def forward(self, inputs):
x = self.comm.recv(self.peer_rank, self.peer_tag)
@@ -61,7 +77,7 @@ def backward(self, inputs, grad_outputs):
xp = cuda.get_array_module(*inputs)
gw, = grad_outputs
self.comm.send(gw, self.peer_rank, self.peer_tag)
dummy_var = xp.array([[]])
dummy_var = xp.array([[]], dtype=xp.float32)
return dummy_var


@@ -83,23 +99,33 @@ def send(x, communicator, rank, tag=0):
Returns:
~chainer.Variable:
A dummy variable with no actual data, only holding the
computational graph. If ``backward()`` is invoked by this dummy
variable, it will try to receive gradients from the target process.
computational graph. We call this ``delegate_variable``.
If ``backward()`` is invoked by delegate_variable,
it will try to receive gradients from the target process.

"""
chainer.utils.experimental('chainermn.functions.send')
return Send(communicator, peer_rank=rank, peer_tag=tag)(x)


def recv(communicator, rank, tag=0, device=-1):
def recv(communicator, rank, delegate_variable=None, tag=0, device=-1):
"""Receive elements from target process.

This function returns data received from target process. If ``backward()``
is invoked, it will try to send gradients to the target process.

.. note::
If you define a non-connected computational graph on one machine,
you have to use ``delegate_variable`` to specify the output of
the previous computational graph component.
Otherwise ``backward()`` does not work well.

Member
machine -> process

Args:
communicator (chainer.communicators.CommunicatorBase):
ChainerMN communicator.
rank (int): Target process specifier.
delegate_variable (chainer.Variable):
Pointer to the other non-connected component.
tag (int): Optional message ID (MPI feature).
device (int): Target device specifier.

@@ -109,4 +135,16 @@ def recv(communicator, rank, tag=0, device=-1):
by this variable, it will send gradients to the target process.

"""
return Recv(communicator, peer_rank=rank, peer_tag=tag, device=device)()
chainer.utils.experimental('chainermn.functions.recv')
if delegate_variable is None:
return Recv(
communicator,
peer_rank=rank,
peer_tag=tag,
device=device)()
else:
return Recv(
communicator,
peer_rank=rank,
peer_tag=tag,
device=device)(delegate_variable)
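To make the new ``delegate_variable`` argument concrete, here is a sketch of the call pattern on an intermediate process (the ranks, shapes, and surrounding model are illustrative assumptions; matching ``send``/``recv`` calls must exist on the peer processes for this to actually run):

```python
import chainer.functions as F
import chainer.links as L
import chainermn
import chainermn.functions

comm = chainermn.create_communicator()
layer = L.Linear(100, 100)  # stand-in for this process's model component

# Receive an activation from rank 0, transform it, and send it on to rank 2.
h = chainermn.functions.recv(comm, rank=0)
h = F.relu(layer(h))
delegate = chainermn.functions.send(h, comm, rank=2)

# A later recv() on the same process would otherwise start a second,
# disconnected graph. Passing the delegate variable chains the two
# components so backward() traverses them in the right order instead of
# deadlocking.
h2 = chainermn.functions.recv(comm, rank=0, delegate_variable=delegate)
```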
47 changes: 47 additions & 0 deletions chainermn/functions/pseudo_connect.py
@@ -0,0 +1,47 @@
import chainer
from chainer import cuda
import chainer.utils


class PseudoConnect(chainer.Function):
"""Connect a variable with delegating variable."""
Member
"Connect two variables with a delegating variable"
or
"Connect a variable to a delegating variable" ?


def forward(self, inputs):
# delegate_variable = inputs[0]
actual_variables = inputs[1:]
return actual_variables

def backward(self, inputs, grad_outputs):
delegate_variable = inputs[0]
# actual_variables = inputs[1:]
xp = cuda.get_array_module(*inputs)

# delegate_variable does not need backward gradients; instead we send
# back dummy grads to keep the shapes of grads consistent.
grad_delegate_variable = xp.zeros_like(delegate_variable)

# grad_outputs corresponds to grads of actual_variables.
return tuple([grad_delegate_variable] + list(grad_outputs))


def pseudo_connect(delegate_variable, *actual_variables):
"""Connect independent connected graph component.

In model-parallel framework, models sometimes have many non-connected
components. When some additional components follow model outputs,
outputs of the last component must be merged with model outputs.
Otherwise backprop does not work well, got stuck into dead lock.

Args:
delegate_variable (chainer.Variable):
Pointer to the previous non-connected graph component.
actual_variables (tuple of chainer.Variable):
Actual values which ``delegate_variable`` imitates.

Returns:
~chainer.Variable:
A variable with the given values combined with the delegating variable.
"""
chainer.utils\
.experimental('chainermn.functions.pseudo_connect.pseudo_connect')
return PseudoConnect()(delegate_variable, *actual_variables)
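A minimal sketch of the intended usage, assuming a process whose component both sends an intermediate activation to another rank and computes a local output (the inputs, target, and rank below are hypothetical placeholders, not code from this PR):

```python
import numpy as np
import chainer
import chainer.functions as F
import chainermn
import chainermn.functions

comm = chainermn.create_communicator()
h_mid = chainer.Variable(np.zeros((1, 10), dtype=np.float32))  # placeholder activation
h_out = chainer.Variable(np.zeros((1, 10), dtype=np.float32))  # placeholder activation
t = np.zeros((1,), dtype=np.int32)                             # placeholder labels

# send() returns a delegate variable that owns the "send" half of the graph.
delegate = chainermn.functions.send(h_mid, comm, rank=1)
y_local = F.relu(h_out)

# Merge the delegate variable into the local output so a single backward()
# call covers both graph components; otherwise the peer never receives its
# gradients and backprop can deadlock.
y = chainermn.functions.pseudo_connect(delegate, y_local)
loss = F.softmax_cross_entropy(y, t)
loss.backward()
```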