diff --git a/examples/mscclang/allgather_a100_pcie.py b/examples/mscclang/allgather_a100_pcie.py
new file mode 100644
index 0000000..06d9ae3
--- /dev/null
+++ b/examples/mscclang/allgather_a100_pcie.py
@@ -0,0 +1,56 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import argparse
+from msccl.language import *
+from msccl.topologies import *
+from msccl.language.collectives import AllGather
+
+# Allpairs allgather for A100
+def allgather_allpairs(gpus, instances, protocol):
+    """Build an AllGather MSCCLProgram on a fully connected topology, then call XML() and Check().
+
+    gpus: number of ranks; also used as the collective size.
+    instances: forwarded to MSCCLProgram as its instance count.
+    protocol: forwarded to MSCCLProgram as the protocol keyword.
+
+    NOTE(review): step 3 forwards output slot (r+2) % size, while step 2
+    fills slot (r-2) % size on that rank; the two coincide only when
+    size divides 4 -- this looks written for 4 GPUs, confirm before reuse.
+    """
+    size = gpus
+    chunksperloop = 1
+    topology = fully_connected(gpus)
+    collective = AllGather(size, chunksperloop, True)
+
+    with MSCCLProgram("allgather_hierarchical", topology, collective, instances, protocol=protocol,
+        interleaved_replication=True, dependence_nop=True):
+        # chnk selects rank parity: even ranks on the first pass, odd ranks on the second.
+        for chnk in range(2):
+            # Step 1: copy the rank's own chunk to its pair partner (r+1 for even r, r-1 for odd r).
+            for r in range(size):
+                if ((r % 2) == chnk):
+                    c = chunk(r, Buffer.input, 0)
+                    c.copy(r + 1 - 2 * chnk, Buffer.output, r)
+            # Step 2: copy the rank's own chunk to rank (r+2) % size.
+            for r in range(size):
+                if ((r % 2) == chnk):
+                    c = chunk(r, Buffer.input, 0)
+                    c.copy((r+2) % size, Buffer.output, r)
+            # Step 3: forward output slot (r+2) % size to the pair partner.
+            for r in range(size):
+                if ((r % 2) == chnk):
+                    c = chunk(r, Buffer.output, (r+2) % size)
+                    c.copy(r + 1 - 2 * chnk, Buffer.output, (r+2) % size)
+
+        XML()
+        Check()
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('num_gpus', type=int, help ='number of gpus')
+parser.add_argument('instances', type=int, help='number of instances')
+parser.add_argument('--protocol', type=str, default='LL128', choices=['Simple', 'LL', 'LL128'], help ='NCCL protocol. Default: LL128')
+args = parser.parse_args()
+
+allgather_allpairs(args.num_gpus, args.instances, args.protocol)
\ No newline at end of file
diff --git a/examples/mscclang/allreduce_a100_ncv4.py b/examples/mscclang/allreduce_a100_ncv4.py
index 64cbf1d..c3f8a06 100755
--- a/examples/mscclang/allreduce_a100_ncv4.py
+++ b/examples/mscclang/allreduce_a100_ncv4.py
@@ -12,27 +12,27 @@ def allreduce_allpairs(gpus, instances, protocol):
     topology = fully_connected(size)
     collective = AllReduce(size, chunksperloop, True)
     with MSCCLProgram("allreduce_ncv4", topology, collective, instances, protocol=protocol,
-        interleaved_replication=False, dependence_nop=True):
+        interleaved_replication=False, threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True):
         for chnk in range(chunksperloop):
             for r in range(size):
                 if ((r % 2) == chnk):
                     c = chunk(r, Buffer.input, chnk)
-                    c.reduce(chunk(r + 1 - 2 * chnk, Buffer.input, chnk))
+                    c.reduce(chunk(r + 1 - 2 * chnk, Buffer.input, chnk), sendtb=0, recvtb=0, ch=0)
 
             for r in range(size):
                 if ((r % 2) == chnk):
                     c = chunk(r, Buffer.input, chnk)
-                    c.copy((r+2) % size, 'scratch', chnk)
+                    c.copy((r+2) % size, 'scratch', chnk, sendtb=1, recvtb=1, ch=0)
 
             for r in range(size):
                 if ((r % 2) == chnk):
                     c = chunk(r, Buffer.input, chnk)
-                    c.reduce(chunk(r, 'scratch', chnk))
+                    c.reduce(chunk(r, 'scratch', chnk), sendtb=1, recvtb=1, ch=0)
 
             for r in range(size):
                 if ((r % 2) == chnk):
                     c = chunk(r, Buffer.input, chnk)
-                    c.copy(r + 1 - 2 * chnk, Buffer.input, chnk)
+                    c.copy(r + 1 - 2 * chnk, Buffer.input, chnk, sendtb=2, recvtb=2, ch=1)
 
         XML()
         Check()