Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into uct/md/ib/support-m…
Browse files Browse the repository at this point in the history
…t-ksm-reg-for-indirect-keys
  • Loading branch information
ivankochin committed Dec 26, 2023
2 parents b1423cf + b44cd45 commit c4259ab
Show file tree
Hide file tree
Showing 130 changed files with 2,132 additions and 1,253 deletions.
3 changes: 2 additions & 1 deletion .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ ForEachMacros: ['_UCS_BITMAP_FOR_EACH_WORD',
'UCX_PERF_TEST_FOREACH',
'ucs_lru_for_each']
StatementMacros : []
TypenameMacros: ['khash_t', 'ucs_array_t']
TypenameMacros: ['khash_t',
'ucs_array_s']
WhitespaceSensitiveMacros: []

# CPP
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ depcomp
contrib/rpmdef.sh
contrib/ucx
bindings/java/pom.xml
bindings/java/src/main/native/tmp-settings.xml
src/ucs/ucs_stats_parser
test/gtest/gtest
build-*
Expand Down
2 changes: 1 addition & 1 deletion bindings/go/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ run-perftest:
cd $(abs_top_srcdir)/bindings/go/src/examples/perftest ;\
LD_LIBRARY_PATH=$(UCX_SOPATH):${LD_LIBRARY_PATH} ${GOTMPDIR}/goperftest ${ARGS}

install-exec-hook:
install-exec-hook: goperftest
$(INSTALL) ${GOTMPDIR}/goperftest $(DESTDIR)$(bindir)

all: goperftest build
Expand Down
11 changes: 6 additions & 5 deletions bindings/go/tests/endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"testing"
. "ucx"
"unsafe"
. "cuda"
)

type TestEntity struct {
Expand Down Expand Up @@ -74,6 +75,9 @@ func get_mem_types() []memTypePair {
memTypePairs := []memTypePair{memTypePair{UCS_MEMORY_TYPE_HOST, UCS_MEMORY_TYPE_HOST}}

if IsMemTypeSupported(UCS_MEMORY_TYPE_CUDA, memTypeMask) {
if err := CudaSetDevice(); err != nil {
fmt.Errorf("%v", err)
}
memTypePairs = append(memTypePairs, memTypePair{UCS_MEMORY_TYPE_HOST, UCS_MEMORY_TYPE_CUDA})
memTypePairs = append(memTypePairs, memTypePair{UCS_MEMORY_TYPE_CUDA, UCS_MEMORY_TYPE_HOST})
memTypePairs = append(memTypePairs, memTypePair{UCS_MEMORY_TYPE_CUDA, UCS_MEMORY_TYPE_CUDA})
Expand Down Expand Up @@ -227,10 +231,6 @@ func TestUcpEpAm(t *testing.T) {
return UCS_OK
})

// To notify progress thread to exit
quit := make(chan bool)
go progressThread(quit, receiver.worker)

headerMem := CBytes([]byte(sendData))
sendChan := make(chan bool, 1)
sender.worker.SetAmRecvHandler(3, UCP_AM_FLAG_WHOLE_MSG, func(header unsafe.Pointer, headerSize uint64,
Expand Down Expand Up @@ -258,6 +258,7 @@ func TestUcpEpAm(t *testing.T) {
break senderProgress
default:
sender.worker.Progress()
receiver.worker.Progress()
}
}

Expand All @@ -267,6 +268,7 @@ func TestUcpEpAm(t *testing.T) {
for req := range requests {
for req.GetStatus() == UCS_INPROGRESS {
sender.worker.Progress()
receiver.worker.Progress()
}
req.Close()
}
Expand All @@ -285,7 +287,6 @@ func TestUcpEpAm(t *testing.T) {
}

amData.Close()
quit <- true

sender.Close()
receiver.Close()
Expand Down
20 changes: 0 additions & 20 deletions bindings/go/tests/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ package goucxtests

import (
. "ucx"
. "cuda"
"unsafe"
)

Expand All @@ -16,12 +15,6 @@ func memoryAllocate(entity *TestEntity, size uint64, memoryType UcsMemoryType) u
mmapParams := &UcpMmapParams{}
mmapParams.Allocate().SetLength(size).SetMemoryType(memoryType)

if memoryType == UCS_MEMORY_TYPE_CUDA {
if err := CudaSetDevice(); err != nil {
entity.t.Fatalf("%v", err)
}
}

result, err := entity.context.MemMap(mmapParams)
if err != nil {
entity.t.Fatalf("Failed to allocate memory %v", err)
Expand Down Expand Up @@ -82,16 +75,3 @@ func memoryGet(entity *TestEntity) []byte {
return GoBytes(recvMem, memAttr.Length)
}
}

// progressThread keeps calling worker.Progress() in a busy loop until a value
// is received on quit; it then closes quit and returns.
func progressThread(quit chan bool, worker *UcpWorker) {
	for {
		// Non-blocking poll of the quit channel before each progress call.
		select {
		case <-quit:
			close(quit)
			return
		default:
		}

		worker.Progress()
	}
}
6 changes: 6 additions & 0 deletions bindings/java/pom.xml.in
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,12 @@
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>ai.rapids</groupId>
<artifactId>cudf</artifactId>
<version>23.10.0</version>
<scope>test</scope>
</dependency>
</dependencies>

<build>
Expand Down
10 changes: 9 additions & 1 deletion bindings/java/src/main/native/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -103,22 +103,30 @@ set-version:
repack-jar: $(jarfile)
$(RM) -r $(java_build_dir)/repack_dir
unzip -o $(jarfile) -d $(java_build_dir)/repack_dir
\cp $(topdir)/libjucx_*.so/libjucx_*.so $(java_build_dir)/repack_dir
\cp -v $(topdir)/libjucx_*.so/libjucx_*.so $(topdir)/bindings/java/resources
\cp -v $(topdir)/libjucx_*.so/libjucx_*.so $(java_build_dir)/repack_dir
jar -cf $(jarfile) -C $(java_build_dir)/repack_dir .
jar tf $(jarfile)

multi-arch:
@make set-version JUCX_VERSION=${JUCX_VERSION}
@make repack-jar

# Verify that the repacked jar contains the native library for each supported
# architecture. The pipeline is run directly in the recipe so that the exit
# status of `grep -q` decides success: wrapping it in $(shell ...) would expand
# to an empty string (grep -q prints nothing) and `test` with no operands
# always exits non-zero.
check-jar:
	@jar tf $(jarfile) | grep -q libjucx_amd64.so
	@jar tf $(jarfile) | grep -q libjucx_aarch64.so

# Publish JUCX jar to maven central
publish-snapshot:
@make set-version JUCX_VERSION=@VERSION@-SNAPSHOT
@make repack-jar
@make check-jar
@make publish

publish-release:
@make set-version JUCX_VERSION=${JUCX_VERSION}
@make repack-jar
@make check-jar
@make publish

publish:
Expand Down
14 changes: 14 additions & 0 deletions bindings/java/src/test/java/org/openucx/jucx/UcpEndpointTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.junit.runner.RunWith;
import org.openucx.jucx.ucp.*;
import org.openucx.jucx.ucs.UcsConstants;
import ai.rapids.cudf.Cuda;

import java.net.InetSocketAddress;
import java.nio.ByteBuffer;
Expand Down Expand Up @@ -118,6 +119,8 @@ public void testGetNB(int memType) throws Exception {
.setName("testGetNB").setUcpAddress(worker2.getAddress());
UcpEndpoint endpoint = worker1.newEndpoint(epParams);

cudaSetDevice(memType);

// Allocate 2 source and 2 destination buffers, to perform 2 RDMA Read operations
MemoryBlock src1 = allocateMemory(context2, worker2, memType, UcpMemoryTest.MEM_SIZE);
MemoryBlock src2 = allocateMemory(context2, worker2, memType, UcpMemoryTest.MEM_SIZE);
Expand Down Expand Up @@ -212,6 +215,8 @@ public void testSendRecv(int memType) throws Exception {
UcpWorker worker1 = context1.newWorker(rdmaWorkerParams);
UcpWorker worker2 = context2.newWorker(rdmaWorkerParams);

cudaSetDevice(memType);

MemoryBlock src1 = allocateMemory(context1, worker1, memType, UcpMemoryTest.MEM_SIZE);
MemoryBlock src2 = allocateMemory(context1, worker1, memType, UcpMemoryTest.MEM_SIZE);

Expand Down Expand Up @@ -691,6 +696,8 @@ public void testActiveMessages(int memType) throws Exception {

header.rewind();

cudaSetDevice(memType);

MemoryBlock sendData = allocateMemory(context2, worker2, memType, dataSize);
sendData.setData(dataString);

Expand Down Expand Up @@ -813,4 +820,11 @@ public void onSuccess(UcpRequest request) {
closeResources();
cachedEp.clear();
}

/**
 * Selects CUDA device 0 and synchronizes it when the requested memory type
 * is CUDA; does nothing for any other memory type.
 *
 * @param memType memory type constant, compared against
 *                {@code UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_CUDA}
 */
private void cudaSetDevice(int memType) {
    if (memType != UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_CUDA) {
        return;
    }

    Cuda.setDevice(0);
    Cuda.deviceSynchronize();
}
}
73 changes: 67 additions & 6 deletions buildlib/azure-pipelines-perf.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ pr:
variables:
- name: WorkDir
value: /hpc/scrap/azure/$(Build.DefinitionName)/$(Build.BuildId)-$(Build.BuildNumber)
- name: warning
value: Attention! CI testing is in progress. Please refrain from any manual action on this machine.
- name: threshold
value: 5

Expand All @@ -24,11 +26,11 @@ resources:
stages:
- stage: Prepare
jobs:
- job: Prepare
- job: PrepareWorkdir
pool:
name: MLNX
demands:
- ucx_perf
- ucx_perf_master

steps:
- checkout: self
Expand All @@ -45,6 +47,39 @@ stages:
mv $(Build.SourcesDirectory)/* $(WorkDir)
displayName: Prepare WorkDir with code
- job: UserHandling
strategy:
matrix:
node1:
name: vulcan03
node2:
name: vulcan04
pool:
name: MLNX
demands:
- ucx_perf

steps:
- checkout: none
- bash: |
set -xeE
# Send a warning
sudo usermod -aG tty `whoami` # add user to the tty group
sudo chmod g+rw /dev/pts/* # set permissions for terminals
wall 'CI test is starting. Connections will terminate soon!'
sleep 30
# Disconnect SSH users
pgrep -f 'sshd:' | xargs -r sudo kill -HUP || true
# Add nagging reminder
crontab <<EOL
* * * * * /usr/bin/mesg y
* * * * * sudo chmod g+rw /dev/pts/*
* * * * * wall "$(warning)"
EOL
displayName: Warn and disconnect users
- stage: Performance
dependsOn: Prepare
Expand All @@ -55,7 +90,7 @@ stages:
pool:
name: MLNX
demands:
- ucx_perf
- ucx_perf_master

steps:
- checkout: none
Expand Down Expand Up @@ -96,7 +131,9 @@ stages:
parameters:
Name: Perf-test-single-node

- script: $(WorkDir)/ucx/buildlib/tools/perf_results.py $(WorkDir)/results-Perf\*.txt $(threshold)
- bash: |
$(WorkDir)/ucx/buildlib/tools/perf_results.py $(WorkDir)/results-Perf\*.txt $(threshold)
echo "Manual repro environment: $(WorkDir)"
displayName: Results analyzer
workingDirectory: $(WorkDir)
Expand All @@ -116,7 +153,7 @@ stages:
dependsOn: Performance
condition: always()
jobs:
- job: Cleanup
- job: CleanupWorkDir
displayName: Cleanup WorkDir
pool:
name: MLNX
Expand All @@ -125,5 +162,29 @@ stages:
clean: true
- bash: |
set -x
rm -rf $(WorkDir)
echo 'Retain 3 latest workdirs for manual repro; clean the rest'
wrk_dirs="/hpc/scrap/azure/$(Build.DefinitionName)/"
# List entries newest-first, one name per line, and skip the first three.
# Plain `ls -t` is used instead of `ls -lt`: the long format prepends a
# "total N" line, which made `tail -n +4` keep only the two newest workdirs.
for dir in $(ls -t "$wrk_dirs" | tail -n +4); do
  echo "Removing old workdir: ${wrk_dirs}${dir}"
  rm -rf "${wrk_dirs}${dir}"
done
displayName: Cleanup WorkDir
- job: RemoveWarning
strategy:
matrix:
node1:
name: vulcan03
node2:
name: vulcan04
pool:
name: MLNX
demands:
- ucx_perf

steps:
- checkout: none
- bash: |
set -x
crontab -r || true
displayName: remove warning
2 changes: 2 additions & 0 deletions buildlib/azure-pipelines-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -121,3 +121,5 @@ stages:
demands: ucx-arm64

- template: jucx/jucx-publish.yml
parameters:
target: publish-release
22 changes: 20 additions & 2 deletions buildlib/azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,16 @@ trigger:
- master
- v*.*.x

variables:
DOCKER_OPT_VOLUMES: -v /hpc/local:/hpc/local

resources:
containers:
- container: centos7_cuda11
image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos7-mofed5-cuda11:2
- container: centos7_cuda11_x86_64
image: rdmz-harbor.rdmz.labs.mlnx/ucx/x86_64/centos7-mofed5-cuda11:3
options: $(DOCKER_OPT_VOLUMES)
- container: centos8_cuda11_aarch64
image: rdmz-harbor.rdmz.labs.mlnx/ucx/aarch64/centos8-mofed5-cuda11:3

stages:
- stage: Prepare
Expand All @@ -34,6 +40,18 @@ stages:
dependsOn: Prepare
condition: eq(dependencies.Prepare.outputs['Check.Result.Launch'], 'True')
jobs:
- template: jucx/jucx-build.yml
parameters:
arch: amd64
container: centos7_cuda11_x86_64
demands: ucx_docker

- template: jucx/jucx-build.yml
parameters:
arch: aarch64
container: centos8_cuda11_aarch64
demands: ucx-arm64

- template: jucx/jucx-publish.yml
parameters:
${{ if eq(variables['Build.Reason'], 'IndividualCI') }}:
Expand Down
Loading

0 comments on commit c4259ab

Please sign in to comment.