diff --git a/docs/fundamentals/flow/topologies.md b/docs/fundamentals/flow/topologies.md index 3eaa6d7cda626..fcead6121beea 100644 --- a/docs/fundamentals/flow/topologies.md +++ b/docs/fundamentals/flow/topologies.md @@ -147,7 +147,7 @@ You can restrict the visible devices in round-robin assignment using `CUDA_VISIB | 0 | 4 | -You can restrict the visible devices in round-robin assignment by assigning a list of devices ids `CUDA_VISIBLE_DEVICES=RR1,3`. This creates the following assignment: +You can restrict the visible devices in round-robin assignment by assigning the list of device IDs to `CUDA_VISIBLE_DEVICES=RR1,3`. This creates the following assignment: | GPU device | Replica ID | |------------|------------| @@ -157,6 +157,16 @@ You can restrict the visible devices in round-robin assignment by assigning a li | 3 | 3 | | 1 | 4 | +You can also refer to GPUs by their UUID. For instance, you can assign a list of device UUIDs: `CUDA_VISIBLE_DEVICES=RRGPU-0aaaaaaa-74d2-7297-d557-12771b6a79d5,GPU-0bbbbbbb-74d2-7297-d557-12771b6a79d5,GPU-0ccccccc-74d2-7297-d557-12771b6a79d5,GPU-0ddddddd-74d2-7297-d557-12771b6a79d5`. +See the [CUDA documentation](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) for the accepted formats for specifying CUDA devices by UUID. 
+ +| GPU device | Replica ID | +|------------|------------| +| GPU-0aaaaaaa-74d2-7297-d557-12771b6a79d5 | 0 | +| GPU-0bbbbbbb-74d2-7297-d557-12771b6a79d5 | 1 | +| GPU-0ccccccc-74d2-7297-d557-12771b6a79d5 | 2 | +| GPU-0ddddddd-74d2-7297-d557-12771b6a79d5 | 3 | +| GPU-0aaaaaaa-74d2-7297-d557-12771b6a79d5 | 4 | ## Distributed replicas diff --git a/jina/orchestrate/deployments/__init__.py b/jina/orchestrate/deployments/__init__.py index 298945c841157..cd5eb67219b8f 100644 --- a/jina/orchestrate/deployments/__init__.py +++ b/jina/orchestrate/deployments/__init__.py @@ -732,7 +732,7 @@ def _parse_devices(value: str, num_devices: int): :return: slice """ - all_devices = range(num_devices) + use_uuids = False if re.match(WRAPPED_SLICE_BASE, value): value = value[1:-1] @@ -742,13 +742,28 @@ def _parse_devices(value: str, num_devices: int): parts = value.split(':') if len(parts) == 1: - # slice(stop) + try: + int(parts[0]) + except: + use_uuids = True + if use_uuids: + return parts parts = [parts[0], str(int(parts[0]) + 1)] - # else: slice(start, stop[, step]) else: - return [int(p) for p in parts] + # try to detect if parts are not numbers + try: + int(parts[0]) + except: + use_uuids = True + + if not use_uuids: + return [int(p) for p in parts] + else: + return parts else: parts = [] + + all_devices = range(num_devices) return all_devices[slice(*[int(p) if p else None for p in parts])] @staticmethod @@ -776,10 +791,11 @@ def _roundrobin_cuda_device(device_str: str, replicas: int): selected_devices = [] if device_str[2:]: - for device_num in Deployment._parse_devices( + + for device in Deployment._parse_devices( device_str[2:], num_devices ): - selected_devices.append(device_num) + selected_devices.append(device) else: selected_devices = range(num_devices) _c = cycle(selected_devices) diff --git a/jina/serve/stream/__init__.py b/jina/serve/stream/__init__.py index 1b4ca18bb2cf1..c5b4f3562d353 100644 --- a/jina/serve/stream/__init__.py +++ b/jina/serve/stream/__init__.py @@ 
-1,4 +1,3 @@ -import argparse import asyncio from typing import ( TYPE_CHECKING, diff --git a/tests/unit/orchestrate/deployments/test_cuda_assignment.py b/tests/unit/orchestrate/deployments/test_cuda_assignment.py index af384f18aee77..55b72e2f9dd34 100644 --- a/tests/unit/orchestrate/deployments/test_cuda_assignment.py +++ b/tests/unit/orchestrate/deployments/test_cuda_assignment.py @@ -27,9 +27,13 @@ def cuda_total_devices(request): ['RR1:', 5, {0: 1, 1: 2, 2: 1, 3: 2, 4: 1}, 3], ['RR0:2', 5, {0: 0, 1: 1, 2: 0, 3: 1, 4: 0}, 3], ['RR1:2', 2, {0: 1, 1: 1}, 3], + ['RR2', 2, {0: 2, 1: 2}, 3], + ['RRUUID1', 2, {0: 'UUID1', 1: 'UUID1'}, 3], ['RR1:2', 1, {0: 1}, 3], ['RR0,2,3', 3, {0: 0, 1: 2, 2: 3}, 4], ['RR0,2,3', 5, {0: 0, 1: 2, 2: 3, 3: 0, 4: 2}, 4], + ['RRUUID1,UUID2,UUID3', 5, {0: 'UUID1', 1: 'UUID2', 2: 'UUID3', 3: 'UUID1', 4: 'UUID2'}, 4], + ['RRGPU-0aaaaaaa-74d2-7297-d557-12771b6a79d5,GPU-0bbbbbbb-74d2-7297-d557-12771b6a79d5,GPU-0ccccccc-74d2-7297-d557-12771b6a79d5,GPU-0ddddddd-74d2-7297-d557-12771b6a79d5', 5, {0: 'GPU-0aaaaaaa-74d2-7297-d557-12771b6a79d5', 1: 'GPU-0bbbbbbb-74d2-7297-d557-12771b6a79d5', 2: 'GPU-0ccccccc-74d2-7297-d557-12771b6a79d5', 3: 'GPU-0ddddddd-74d2-7297-d557-12771b6a79d5', 4: 'GPU-0aaaaaaa-74d2-7297-d557-12771b6a79d5'}, 4], ], indirect=['cuda_total_devices'] ) def test_cuda_assignment(device_str, replicas, expected, cuda_total_devices):